diff --git a/imperative/python/megengine/amp/autocast.py b/imperative/python/megengine/amp/autocast.py index eeb23620..01b98e8f 100644 --- a/imperative/python/megengine/amp/autocast.py +++ b/imperative/python/megengine/amp/autocast.py @@ -11,38 +11,37 @@ from ..core.tensor import amp class autocast: - r""" - A class to control autocast mode for amp as a context manager or a decorator. + r"""A class to control autocast mode for amp as a context manager or a decorator. - :param enabled: Whether autocast mode is enabled. - :param low_prec_dtype: Set amp autocast mode's lower precision dtype. It will change - the target dtype in tensor casting for better speed and memory. Default: float16. - :param high_prec_dtype: Set amp autocast mode's higher precision dtype. It will - change the target dtype in tensor casting for better precision. Default: float32. + Args: + enabled: Whether autocast mode is enabled. + low_prec_dtype: Set amp autocast mode's lower precision dtype. It will change + the target dtype in tensor casting for better speed and memory. Default: float16. + high_prec_dtype: Set amp autocast mode's higher precision dtype. It will + change the target dtype in tensor casting for better precision. Default: float32. Examples: + .. code-block:: - .. code-block:: + # used as decorator + @autocast() + def train_step(image, label): + with gm: + logits = model(image) + loss = F.nn.cross_entropy(logits, label) + gm.backward(loss) + opt.step().clear_grad() + return loss - # used as decorator - @autocast() - def train_step(image, label): - with gm: - logits = model(image) - loss = F.nn.cross_entropy(logits, label) - gm.backward(loss) - opt.step().clear_grad() - return loss - - # used as context manager - def train_step(image, label): - with autocast(): - with gm: - logits = model(image) - loss = F.nn.cross_entropy(logits, label) - gm.backward(loss) - opt.step().clear_grad() - return loss + # used as context manager + def train_step(image, label): + with autocast(): + with gm: + logits = model(image) + loss = F.nn.cross_entropy(logits, label) + gm.backward(loss) + opt.step().clear_grad() + return loss """ def __init__( diff --git a/imperative/python/megengine/amp/grad_scaler.py b/imperative/python/megengine/amp/grad_scaler.py index 2af68a3f..b23103b0 100644 --- a/imperative/python/megengine/amp/grad_scaler.py +++ b/imperative/python/megengine/amp/grad_scaler.py @@ -16,50 +16,51 @@ from ..tensor import Tensor class GradScaler: - r""" - A helper class that performs grad scaling to prevent from data overflow in + r"""A helper class that performs grad scaling to prevent from data overflow in :class:`~.autocast` mode. - :param init_scale: Initial scale factor. - :param growth_factor: Factor that the scale is multiplied by in actual - :meth:`update` stage. If growth_factor is 0, scale_factor will not update. - :param backoff_factor: Factor that the scale is multiplied by when encountering - overflow grad. - :param growth_interval: The interval between two scale update stages. - - Example:: - - gm = GradManager() - opt = ... - scaler = GradScaler() - - gm.attach(model.parameters()) - - @autocast() - def train_step(image, label): - with gm: - logits = model(image) - loss = F.nn.cross_entropy(logits, label) - scaler.backward(gm, loss) - opt.step().clear_grad() - return loss - - If need more flexible usage, could split ``scaler.backward`` into three lines: - - .. 
code-block:: - - @autocast() - def train_step(image, label): - with gm: - logits = model(image) - loss = F.nn.cross_entropy(logits, label) - gm.backward(loss, dy=megengine.tensor(scaler.scale_factor)) - scaler.unscale(gm.attached_tensors()) - scaler.update() - opt.step().clear_grad() - return loss - - This is useful when need to accumulate grads for multi batches. + Args: + init_scale: Initial scale factor. + growth_factor: Factor that the scale is multiplied by in actual + :meth:`update` stage. If growth_factor is 0, scale_factor will not update. + backoff_factor: Factor that the scale is multiplied by when encountering + overflow grad. + growth_interval: The interval between two scale update stages. + + Example: + .. code-block:: + + gm = GradManager() + opt = ... + scaler = GradScaler() + + gm.attach(model.parameters()) + + @autocast() + def train_step(image, label): + with gm: + logits = model(image) + loss = F.nn.cross_entropy(logits, label) + scaler.backward(gm, loss) + opt.step().clear_grad() + return loss + + If more flexible usage is needed, ``scaler.backward`` can be split into three lines: + + .. code-block:: + + @autocast() + def train_step(image, label): + with gm: + logits = model(image) + loss = F.nn.cross_entropy(logits, label) + gm.backward(loss, dy=megengine.tensor(scaler.scale_factor)) + scaler.unscale(gm.attached_tensors()) + scaler.update() + opt.step().clear_grad() + return loss + + This is useful when grads need to be accumulated over multiple batches. """ def __init__( @@ -86,18 +87,18 @@ class GradScaler: unscale_grad: bool = True, update_scale: bool = "if_unscale_grad" ): - r""" - A wrapper of GradManager's :meth:`~.GradManager.backward`, used to scale + r"""A wrapper of GradManager's :meth:`~.GradManager.backward`, used to scale ``y``'s grad and unscale parameters' grads. - :param gm: The to be wrapped GradManager. - :param y: Same as GradManager backward's ``y``. - :param dy: Same as GradManager backward's ``dy``. Will be multiplied - by ``scale_factor``. - :param unscale_grad: Whether do :meth:`unscale` at the same time. Could be - ``False`` if needs to accumulate grads. - :param update_scale: Same as :meth:`unscale`'s ``update``. Will be ignored - if ``unscale_grad`` is ``False``. + Args: + gm: The GradManager to be wrapped. + y: Same as GradManager backward's ``y``. + dy: Same as GradManager backward's ``dy``. Will be multiplied + by ``scale_factor``. + unscale_grad: Whether to do :meth:`unscale` at the same time. Could be + ``False`` if grads need to be accumulated. + update_scale: Same as :meth:`unscale`'s ``update``. Will be ignored + if ``unscale_grad`` is ``False``. """ # These checks should be consistent with GradManager's if y is None: @@ -121,11 +122,11 @@ class GradScaler: self.update() def unscale(self, grad_tensors: Iterable[Tensor]): - r""" - Unscale all ``grad_tensors``'s grad. + r"""Unscale all ``grad_tensors``'s grad. - :param grad_tensors: Tensors needed to unscale grads. Should be all tensors - that are affected by ``target`` tensor in GradManager's backward. + Args: + grad_tensors: Tensors needed to unscale grads. Should be all tensors + that are affected by ``target`` tensor in GradManager's backward. """ # use float64 for better precision inv_scale = Tensor(1.0 / self.scale_factor) @@ -151,7 +152,8 @@ class GradScaler: def update(self, new_scale: float = None): r"""Update the scale factor according to whether encountered overflow grad. 
- If ``new_scale`` is provided, internal update mechanism will be ignored.""" + If ``new_scale`` is provided, internal update mechanism will be ignored. + """ if self.growth_interval == 0: return diff --git a/imperative/python/megengine/autodiff/grad_manager.py b/imperative/python/megengine/autodiff/grad_manager.py index 923f1f40..a754d8ef 100644 --- a/imperative/python/megengine/autodiff/grad_manager.py +++ b/imperative/python/megengine/autodiff/grad_manager.py @@ -32,8 +32,7 @@ _global_priority = 0 class GradManager: - r""" - GradManager computes gradients or more generally, vector-Jacobian product, by reverse mode + r"""GradManager computes gradients or more generally, vector-Jacobian product, by reverse mode automatic differentiation (a.k.a. back propagation). Reverse mode autodiff normally reuses many intermediate tensors for best computation efficiency. @@ -120,7 +119,6 @@ class GradManager: gm = GradManager() gm.attach(model.parameters(), callback=dist.make_allreduce_cb("MEAN")) - """ def __init__(self): @@ -136,8 +134,7 @@ class GradManager: return [spec.tensor() for spec in self._attach_specs.values()] def attach(self, tensors: Iterable[Tensor], callbacks=None): - r""" - Instruct GradManager to track operations on tensors, so that gradients with respect + r"""Instruct GradManager to track operations on tensors, so that gradients with respect to those tensors could be evaluated later. :meth:`attach` also accepts a list of callbacks, which will be called with the tensor and @@ -188,8 +185,9 @@ class GradManager: multiple uses of a GradManager, which is unrelated to whether resources is timely released within a single use. - :param tensors: tensor or list of tensors to track - :param callbacks: callback or list of callbacks + Args: + tensors: tensor or list of tensors to track + callbacks: callback or list of callbacks """ if callbacks is None: callbacks = [] @@ -234,8 +232,7 @@ class GradManager: y: Union[Tensor, List[Tensor]] = None, dy: Union[Tensor, List[Tensor]] = None, ): - r""" - Compute gradients (or vector-Jacobian product) for all attached tensors, accumulate to + r"""Compute gradients (or vector-Jacobian product) for all attached tensors, accumulate to corresponding .grad attribute, and release resources along the way. :meth:`backward` computes the vector-Jacobian product :math:`dx_j = \sum_{i} dy_i J_{ij}` @@ -257,8 +254,9 @@ class GradManager: process of this call. When the call successfully finishes, the GradManager will be put back to an inactive state. - :param y: tensor or list of tensors - :param dy: tensor or list of tensors. Defaults to 1 if y is scalar + Args: + y: tensor or list of tensors + dy: tensor or list of tensors. Defaults to 1 if y is scalar """ push_scope("backward") set_option("record_computing_path", 0) @@ -310,8 +308,7 @@ class GradManager: pop_scope("backward") def record(self): - r""" - Start recording operations + r"""Start recording operations After this call, you will be able to call :meth:`backward`. """ @@ -342,8 +339,7 @@ class GradManager: self._grad.wrt(tensor, callback=callback) def release(self): - r""" - Stop recording operations and release resources kept for gradient computation + r"""Stop recording operations and release resources kept for gradient computation After this call, you will not be able to call :meth:`backward`. 
""" diff --git a/imperative/python/megengine/core/_trace_option.py b/imperative/python/megengine/core/_trace_option.py index b580d346..40c002c4 100644 --- a/imperative/python/megengine/core/_trace_option.py +++ b/imperative/python/megengine/core/_trace_option.py @@ -15,16 +15,12 @@ if os.environ.get("MEGENGINE_USE_SYMBOLIC_SHAPE"): def use_symbolic_shape() -> bool: - """ - Returns whether tensor.shape returns a tensor instead of a tuple - - """ + r"""Returns whether tensor.shape returns a tensor instead of a tuple""" return _use_symbolic_shape def set_symbolic_shape(option: bool): - """ Sets whether tensor.shape returns a tensor instead of a tuple - """ + r"""Sets whether tensor.shape returns a tensor instead of a tuple""" global _use_symbolic_shape _org = _use_symbolic_shape _use_symbolic_shape = option diff --git a/imperative/python/megengine/core/autodiff/grad.py b/imperative/python/megengine/core/autodiff/grad.py index 805e0437..0a64733e 100644 --- a/imperative/python/megengine/core/autodiff/grad.py +++ b/imperative/python/megengine/core/autodiff/grad.py @@ -88,67 +88,56 @@ class Grad: class Function(ops.PyOpBase): - """ - Defines a block of operations with customizable differentiation. - + r"""Defines a block of operations with customizable differentiation. + The computation should be defined in ``forward`` method, with gradient computation defined in ``backward`` method. - + Each instance of ``Function`` should be used only once during forwardding. - + Examples: - - .. code-block:: - - class Sigmoid(Function): - def forward(self, x): - y = 1 / (1 + F.exp(-x)) - self.y = y - return y - - def backward(self, dy): - y = self.y - return dy * y * (1-y) - + + .. code-block:: + + class Sigmoid(Function): + def forward(self, x): + y = 1 / (1 + F.exp(-x)) + self.y = y + return y + + def backward(self, dy): + y = self.y """ def forward(self, *args, **kwargs): - """ - Applies operations to ``inputs`` and returns results. It must be overriden by all subclasses. - - :param input: input tensors. - :return: a tuple of Tensor or a single Tensor. - - .. note:: - - This method should return a tuple of Tensor or a single Tensor representing the output - of the function. - - .. note:: - - positional arguments should all be Tensor - + r"""Applies operations to ``inputs`` and returns results. It must be overriden by all subclasses. + + Args: + input: input tensors. + + Returns: + a tuple of Tensor or a single Tensor. + + Note: + * This method should return a tuple of Tensor or a single Tensor representing the output + of the function. + * positional arguments should all be Tensor """ raise NotImplementedError def backward(self, *output_grads): - """ - Compute the gradient of the forward function. It must be overriden by all subclasses. - - :param output_grads: gradients of outputs that are returned by :meth:`forward`. - - .. note:: - - In case when some tensors of outputs are not related to loss function, the corresponding - values in ``output_grads`` would be ``None``. - - .. note:: - - This method should return a tuple which containing the gradients of all inputs, in the same order - as the ``inputs`` argument of :meth:`forward` . A ``Tensor`` could be returned - instead if there is only one input. If users want to stop the propagation of some gradients, - the corresponding returned values should be set ``None`` . - + r"""Compute the gradient of the forward function. It must be overriden by all subclasses. + + Args: + output_grads: gradients of outputs that are returned by :meth:`forward`. 
+ + Note: + * In case when some tensors of outputs are not related to loss function, the corresponding + values in ``output_grads`` would be ``None``. + * This method should return a tuple which containing the gradients of all inputs, in the same order + as the ``inputs`` argument of :meth:`forward` . A ``Tensor`` could be returned + instead if there is only one input. If users want to stop the propagation of some gradients, + the corresponding returned values should be set ``None`` . """ raise NotImplementedError diff --git a/imperative/python/megengine/core/tensor/amp.py b/imperative/python/megengine/core/tensor/amp.py index 9d54927b..cb6ae5b0 100644 --- a/imperative/python/megengine/core/tensor/amp.py +++ b/imperative/python/megengine/core/tensor/amp.py @@ -12,16 +12,14 @@ _low_prec_dtype = "float16" @property def enabled(mod): - r""" - Get or set amp autocast mode enabled or not. - + r"""Get or set amp autocast mode enabled or not. + Examples: + + .. code-block:: - .. code-block:: - - import megengine as mge - mge.amp.enabled = True - + import megengine as mge + mge.amp.enabled = True """ return _enabled @@ -34,17 +32,15 @@ def enabled(mod, enabled: bool): @property def high_prec_dtype(mod): - r""" - Get or set amp autocast mode's higher precision dtype. It will change the + r"""Get or set amp autocast mode's higher precision dtype. It will change the target dtype in tensor casting for better precision. Default: float32. - + Examples: + + .. code-block:: - .. code-block:: - - import megengine as mge - mge.amp.high_prec_dtype = "float32" - + import megengine as mge + mge.amp.high_prec_dtype = "float32" """ return _high_prec_dtype @@ -57,17 +53,15 @@ def high_prec_dtype(mod, dtype: str): @property def low_prec_dtype(mod): - r""" - Get or set amp autocast mode's lower precision dtype. It will change the + r"""Get or set amp autocast mode's lower precision dtype. It will change the target dtype in tensor casting for better speed and memory. Default: float16. - + Examples: + + .. code-block:: - .. code-block:: - - import megengine as mge - mge.amp.low_prec_dtype = "float16" - + import megengine as mge + mge.amp.low_prec_dtype = "float16" """ return _low_prec_dtype diff --git a/imperative/python/megengine/core/tensor/array_method.py b/imperative/python/megengine/core/tensor/array_method.py index 1fbd4b87..4fa9af21 100644 --- a/imperative/python/megengine/core/tensor/array_method.py +++ b/imperative/python/megengine/core/tensor/array_method.py @@ -389,9 +389,7 @@ class ArrayMethodMixin(abc.ABC): @property def ndim(self): - r""" - Returns the number of dimensions of self :class:`~.Tensor`. - """ + r"""Returns the number of dimensions of self :class:`~.Tensor`.""" shape = self._tuple_shape if shape is None: raise ValueError("unkown ndim") @@ -399,8 +397,7 @@ class ArrayMethodMixin(abc.ABC): @property def size(self): - r""" - Returns the size of the self :class:`~.Tensor`. + r"""Returns the size of the self :class:`~.Tensor`. The returned value is a subclass of :class:`tuple`. """ shape = self.shape @@ -410,14 +407,11 @@ class ArrayMethodMixin(abc.ABC): @property def T(self): - r""" - alias of :attr:`~.Tensor.transpose`. - """ + r"""alias of :attr:`~.Tensor.transpose`.""" return self.transpose() def item(self, *args): - r""" - Returns the value of this :class:`~.Tensor` as a standard Python :class:`numbers.Number`. + r"""Returns the value of this :class:`~.Tensor` as a standard Python :class:`numbers.Number`. This only works for tensors with one element. For other cases, see :meth:`~.tolist`. 
""" if not args: @@ -427,8 +421,7 @@ class ArrayMethodMixin(abc.ABC): return self[args].item() def tolist(self): - r""" - Returns the tensor as a (nested) list. + r"""Returns the tensor as a (nested) list. For scalars, a standard Python number is returned, just like with :meth:`~.item`. Tensors are automatically moved to the CPU first if necessary. @@ -437,16 +430,13 @@ class ArrayMethodMixin(abc.ABC): return self.numpy().tolist() def astype(self, dtype): - r""" - Returns a :class:`Tensor` with the same data and number of elements + r"""Returns a :class:`Tensor` with the same data and number of elements with the specified :attr:`~.Tensor.dtype`. """ return astype(self, dtype) def reshape(self, *args): - r""" - See :func:`~.reshape`. - """ + r"""See :func:`~.reshape`.""" return _reshape(self, _expand_args(args)) # FIXME: remove this method @@ -454,9 +444,7 @@ class ArrayMethodMixin(abc.ABC): return _broadcast(self, _expand_args(args)) def transpose(self, *args): - r""" - See :func:`~.transpose`. - """ + r"""See :func:`~.transpose`.""" if self.ndim == 0: assert ( len(args) == 0 @@ -469,172 +457,170 @@ class ArrayMethodMixin(abc.ABC): return _transpose(self, _expand_args(args)) def flatten(self): - r""" - See :func:`~.flatten`. - """ + r"""See :func:`~.flatten`.""" return self.reshape(-1) def sum(self, axis=None, keepdims: bool = False): - r""" - Returns the sum of each row of the input tensor in the given dimension ``axis``. + r"""Returns the sum of each row of the input tensor in the given dimension ``axis``. If ``axis`` is a list of axises, reduce over all of them. If ``keepdims`` is ``True``, the shape of output tensor is the same as the input tensor, except in the dimension(s) ``axis`` where it is of size 1. Otherwise, ``axis`` is squeezed (see :func:`~.squeeze`). - :param axis: the dimension or dimensions to reduce. - :param keepdims: whether the output tensor has ndim retained or not. - :return: output tensor. + Args: + axis: the dimension or dimensions to reduce. + keepdims: whether the output tensor has ndim retained or not. - Examples: - - .. testcode:: + Returns: + output tensor. - from megengine import tensor - a = tensor([False, True, True, False]) - b = tensor([1.0, 2.0, 3.0, 4.0]) - print(a.sum().numpy()) - print(b.sum().numpy()) + Examples: + .. testcode:: - Outputs: + from megengine import tensor + a = tensor([False, True, True, False]) + b = tensor([1.0, 2.0, 3.0, 4.0]) + print(a.sum().numpy()) + print(b.sum().numpy()) - .. testoutput:: + Outputs: - 2 - 10.0 + .. testoutput:: + 2 + 10.0 """ return _reduce("sum")(self, axis, keepdims) def prod(self, axis=None, keepdims: bool = False): - r""" - Returns the product of each row of the input tensor in the given dimension ``axis``. + r"""Returns the product of each row of the input tensor in the given dimension ``axis``. If ``axis`` is a list of axises, reduce over all of them. If ``keepdims`` is ``True``, the shape of output tensor is the same as the input tensor, except in the dimension(s) ``axis`` where it is of size 1. Otherwise, ``axis`` is squeezed (see :func:`~.squeeze`). - :param axis: the dimension or dimensions to reduce. - :param keepdims: whether the output tensor has ndim retained or not. - :return: output tensor. - - Examples: + Args: + axis: the dimension or dimensions to reduce. + keepdims: whether the output tensor has ndim retained or not. - .. testcode:: + Returns: + output tensor. 
- from megengine import tensor - a = tensor([False, True, True, False]) - b = tensor([1.0, 2.0, 3.0, 4.0]) - print(a.prod().numpy()) - print(b.prod().numpy()) + Examples: + .. testcode:: - Outputs: + from megengine import tensor + a = tensor([False, True, True, False]) + b = tensor([1.0, 2.0, 3.0, 4.0]) + print(a.prod().numpy()) + print(b.prod().numpy()) - .. testoutput:: + Outputs: - 0 - 24.0 + .. testoutput:: + 0 + 24.0 """ return _reduce("product")(self, axis, keepdims) def min(self, axis=None, keepdims: bool = False): - r""" - Returns the min value of each row of the input tensor in the given dimension ``axis``. + r"""Returns the min value of each row of the input tensor in the given dimension ``axis``. If ``axis`` is a list of axises, reduce over all of them. If ``keepdims`` is ``True``, the shape of output tensor is the same as the input tensor, except in the dimension(s) ``axis`` where it is of size 1. Otherwise, ``axis`` is squeezed (see :func:`~.squeeze`). - :param axis: the dimension or dimensions to reduce. - :param keepdims: whether the output tensor has ndim retained or not. - :return: output tensor. - - Examples: + Args: + axis: the dimension or dimensions to reduce. + keepdims: whether the output tensor has ndim retained or not. - .. testcode:: + Returns: + output tensor. - from megengine import tensor - a = tensor([False, True, True, False]) - b = tensor([1.0, 2.0, 3.0, 4.0]) - print(a.min().numpy()) - print(b.min().numpy()) + Examples: + .. testcode:: - Outputs: + from megengine import tensor + a = tensor([False, True, True, False]) + b = tensor([1.0, 2.0, 3.0, 4.0]) + print(a.min().numpy()) + print(b.min().numpy()) - .. testoutput:: + Outputs: - False - 1.0 + .. testoutput:: + False + 1.0 """ return _reduce("min")(self, axis, keepdims) def max(self, axis=None, keepdims: bool = False): - r""" - Returns the max value of each row of the input tensor in the given dimension ``axis``. + r"""Returns the max value of each row of the input tensor in the given dimension ``axis``. If ``axis`` is a list of axises, reduce over all of them. If ``keepdims`` is ``True``, the shape of output tensor is the same as the input tensor, except in the dimension(s) ``axis`` where it is of size 1. Otherwise, ``axis`` is squeezed (see :func:`~.squeeze`). - :param axis: the dimension or dimensions to reduce. - :param keepdims: whether the output tensor has ndim retained or not. - :return: output tensor. - - Examples: + Args: + axis: the dimension or dimensions to reduce. + keepdims: whether the output tensor has ndim retained or not. - .. testcode:: + Returns: + output tensor. - from megengine import tensor - a = tensor([False, True, True, False]) - b = tensor([1.0, 2.0, 3.0, 4.0]) - print(a.max().numpy()) - print(b.max().numpy()) + Examples: + .. testcode:: - Outputs: + from megengine import tensor + a = tensor([False, True, True, False]) + b = tensor([1.0, 2.0, 3.0, 4.0]) + print(a.max().numpy()) + print(b.max().numpy()) - .. testoutput:: + Outputs: - True - 4.0 + .. testoutput:: + True + 4.0 """ return _reduce("max")(self, axis, keepdims) def mean(self, axis=None, keepdims: bool = False): - r""" - Returns the mean value of each row of the input tensor in the given dimension ``axis``. + r"""Returns the mean value of each row of the input tensor in the given dimension ``axis``. If ``axis`` is a list of axises, reduce over all of them. If ``keepdims`` is ``True``, the shape of output tensor is the same as the input tensor, except in the dimension(s) ``axis`` where it is of size 1. 
Otherwise, ``axis`` is squeezed (see :func:`~.squeeze`). - :param axis: the dimension or dimensions to reduce. - :param keepdims: whether the output tensor has ndim retained or not. - :return: output tensor. + Args: + axis: the dimension or dimensions to reduce. + keepdims: whether the output tensor has ndim retained or not. - Examples: + Returns: + output tensor. - .. testcode:: - - from megengine import tensor - a = tensor([False, True, True, False]) - b = tensor([1.0, 2.0, 3.0, 4.0]) - print(a.mean().numpy()) - print(b.mean().numpy()) + Examples: + .. testcode:: - Outputs: + from megengine import tensor + a = tensor([False, True, True, False]) + b = tensor([1.0, 2.0, 3.0, 4.0]) + print(a.mean().numpy()) + print(b.mean().numpy()) - .. testoutput:: + Outputs: - 0.5 - 2.5 + .. testoutput:: + 0.5 + 2.5 """ return _reduce("mean")(self, axis, keepdims) diff --git a/imperative/python/megengine/core/tensor/dtype.py b/imperative/python/megengine/core/tensor/dtype.py index 408b9ded..248cbdaf 100644 --- a/imperative/python/megengine/core/tensor/dtype.py +++ b/imperative/python/megengine/core/tensor/dtype.py @@ -47,17 +47,17 @@ class QuantDtypeMeta( ["name", "cname", "np_dtype_str", "qmin", "qmax", "is_unsigned"], ) ): - r""" - Store metadata for quantize dtype. Could be used to create custom quant dtype + r"""Store metadata for quantize dtype. Could be used to create custom quant dtype for QAT when the network don't need to be converted for inference, but only to export network metadata for third-party platform inference. - :param name: a unique name string. - :param cname: used in :func:`~.create_quantized_dtype` for model dump and inference. - :param np_dtype_str: used in :func:`~.create_quantized_dtype` to generate ``np.dtype``. - :param qmin: a int number indicating quant dtype's lowerbound. - :param qmax: a int number indicating quant dtype's upperbound. - :param is_unsigned: a helper value that could be inference from np_dtype_str. + Args: + name: a unique name string. + cname: used in :func:`~.create_quantized_dtype` for model dump and inference. + np_dtype_str: used in :func:`~.create_quantized_dtype` to generate ``np.dtype``. + qmin: a int number indicating quant dtype's lowerbound. + qmax: a int number indicating quant dtype's upperbound. + is_unsigned: a helper value that could be inference from np_dtype_str. """ def __new__( @@ -77,7 +77,7 @@ class QuantDtypeMeta( return self def __deepcopy__(self, _): - """ + r""" Ignore deepcopy so that a dtype meta can be treated as singleton, for more strict check in :meth:`~.FakeQuantize.fake_quant_forward`. """ @@ -113,17 +113,17 @@ def _check_zero_point(zp: int, dtype_meta: QuantDtypeMeta): def create_quantized_dtype( dtype_meta: QuantDtypeMeta, scale: float, zp: Union[int, None] ): - r""" - Get quantized dtype with metadata attribute according to _metadata_dict. - + r"""Get quantized dtype with metadata attribute according to _metadata_dict. + Note that unsigned dtype must have ``zero_point`` and signed dtype must not have ``zero_point``, to be consitent with tensor generated by calling compiled function from `CompGraph.compile(inputs, outspec)`. - :param dtype_meta: a QuantDtypeMeta indicating which dtype to return. the - ``cname`` attribute cannot be ``None``. - :param scale: a number for scale to store in dtype's metadata - :param zp: a number for zero_point to store in dtype's metadata + Args: + dtype_meta: a QuantDtypeMeta indicating which dtype to return. the + ``cname`` attribute cannot be ``None``. 
+ scale: a number for scale to store in dtype's metadata + zp: a number for zero_point to store in dtype's metadata """ if dtype_meta.cname is None: raise ValueError("dtype {} without cname attr is not supported.") @@ -152,8 +152,7 @@ def create_quantized_dtype( def quint8(scale, zero_point): - """ - Consturct a quantized unsigned int8 data type with ``scale`` (float) and + r"""Consturct a quantized unsigned int8 data type with ``scale`` (float) and ``zero_point`` (uint8). The real value represented by a quint8 data type is float_val = scale * (uint8_val - zero_point) """ @@ -161,24 +160,21 @@ def quint8(scale, zero_point): def qint8(scale): - """ - Construct a quantized int8 data type with ``scale`` (float). The real value + r"""Construct a quantized int8 data type with ``scale`` (float). The real value represented by a qint8 data type is float_val = scale * int8_val """ return create_quantized_dtype(_builtin_quant_dtypes["qint8"], scale, None) def qint32(scale): - """ - Construct a quantized int32 data type with ``scale`` (float). The real value + r"""Construct a quantized int32 data type with ``scale`` (float). The real value represented by a qint32 data type is float_val = scale * int32_val """ return create_quantized_dtype(_builtin_quant_dtypes["qint32"], scale, None) def quint4(scale, zero_point): - """ - Consturct a quantized unsigned int4 data type with ``scale`` (float) and + r"""Consturct a quantized unsigned int4 data type with ``scale`` (float) and ``zero_point`` (uint8). The real value represented by a quint4 data type is float_val = scale * (uint4_val - zero_point) """ @@ -186,8 +182,7 @@ def quint4(scale, zero_point): def qint4(scale): - """ - Construct a quantized int4 data type with ``scale`` (float). The real value + r"""Construct a quantized int4 data type with ``scale`` (float). The real value represented by a qint4 data type is float_val = scale * int4_val """ return create_quantized_dtype(_builtin_quant_dtypes["qint4"], scale, None) @@ -244,95 +239,95 @@ def _convert_from_quantized_dtype(arr: np.ndarray, dtype_meta: QuantDtypeMeta): def convert_to_quint8(arr: np.ndarray, q: np.dtype): - """ - Quantize a float NumPy ndarray into a quint8 one with specified params. + r"""Quantize a float NumPy ndarray into a quint8 one with specified params. - :param arr: Input ndarray. - :param q: Target data type, should be a quint8. + Args: + arr: Input ndarray. + q: Target data type, should be a quint8. """ return _convert_to_quantized_dtype(arr, q, _builtin_quant_dtypes["quint8"]) def convert_from_quint8(arr: np.ndarray): - """ - Dequantize a quint8 NumPy ndarray into a float one. + r"""Dequantize a quint8 NumPy ndarray into a float one. - :param arr: Input ndarray. + Args: + arr: Input ndarray. """ return _convert_from_quantized_dtype(arr, _builtin_quant_dtypes["quint8"]) def convert_to_qint8(arr: np.ndarray, q: np.dtype): - """ - Quantize a float NumPy ndarray into a qint8 one with specified params. + r"""Quantize a float NumPy ndarray into a qint8 one with specified params. - :param arr: Input ndarray. - :param q: Target data type, should be a qint8. + Args: + arr: Input ndarray. + q: Target data type, should be a qint8. """ return _convert_to_quantized_dtype(arr, q, _builtin_quant_dtypes["qint8"]) def convert_from_qint8(arr: np.ndarray): - """ - Dequantize a qint8 NumPy ndarray into a float one. + r"""Dequantize a qint8 NumPy ndarray into a float one. - :param arr: Input ndarray. + Args: + arr: Input ndarray. 
""" return _convert_from_quantized_dtype(arr, _builtin_quant_dtypes["qint8"]) def convert_to_qint32(arr: np.ndarray, q: np.dtype): - """ - Quantize a float NumPy ndarray into a qint32 one with specified params. + r"""Quantize a float NumPy ndarray into a qint32 one with specified params. - :param arr: Input ndarray. - :param q: Target data type, should be a qint8. + Args: + arr: Input ndarray. + q: Target data type, should be a qint8. """ return _convert_to_quantized_dtype(arr, q, _builtin_quant_dtypes["qint32"]) def convert_from_qint32(arr): - """ - Dequantize a qint32 NumPy ndarray into a float one. + r"""Dequantize a qint32 NumPy ndarray into a float one. - :param arr: Input ndarray. + Args: + arr: Input ndarray. """ return _convert_from_quantized_dtype(arr, _builtin_quant_dtypes["qint32"]) def convert_to_quint4(arr: np.ndarray, q: np.dtype): - """ - Quantize a float NumPy ndarray into a quint4 one with specified params. + r"""Quantize a float NumPy ndarray into a quint4 one with specified params. - :param arr: Input ndarray. - :param q: Target data type, should be a quint4. + Args: + arr: Input ndarray. + q: Target data type, should be a quint4. """ return _convert_to_quantized_dtype(arr, q, _builtin_quant_dtypes["quint4"]) def convert_from_quint4(arr: np.ndarray): - """ - Dequantize a quint4 NumPy ndarray into a float one. + r"""Dequantize a quint4 NumPy ndarray into a float one. - :param arr: Input ndarray. + Args: + arr: Input ndarray. """ return _convert_from_quantized_dtype(arr, _builtin_quant_dtypes["quint4"]) def convert_to_qint4(arr: np.ndarray, q: np.dtype): - """ - Quantize a float NumPy ndarray into a qint4 one with specified params. + r"""Quantize a float NumPy ndarray into a qint4 one with specified params. - :param arr: Input ndarray. - :param q: Target data type, should be a qint4. + Args: + arr: Input ndarray. + q: Target data type, should be a qint4. """ return _convert_to_quantized_dtype(arr, q, _builtin_quant_dtypes["qint4"]) def convert_from_qint4(arr: np.ndarray): - """ - Dequantize a qint4 NumPy ndarray into a float one. + r"""Dequantize a qint4 NumPy ndarray into a float one. - :param arr: Input ndarray. + Args: + arr: Input ndarray. """ return _convert_from_quantized_dtype(arr, _builtin_quant_dtypes["qint4"]) diff --git a/imperative/python/megengine/core/tensor/megbrain_graph.py b/imperative/python/megengine/core/tensor/megbrain_graph.py index 02efb4b4..ce198816 100644 --- a/imperative/python/megengine/core/tensor/megbrain_graph.py +++ b/imperative/python/megengine/core/tensor/megbrain_graph.py @@ -24,11 +24,11 @@ from .core import TensorBase def set_priority_to_id(dest_vars): - """ - For all oprs in the subgraph constructed by dest_vars, + r"""For all oprs in the subgraph constructed by dest_vars, sets its priority to id if its original priority is zero. - - :param dest_vars: target vars representing the graph. + + Args: + dest_vars: target vars representing the graph. """ dest_vec = [] for i in dest_vars: @@ -220,54 +220,50 @@ class OpNode: def optimize_for_inference(dest_vars, **kwargs): - r""" - Applies optimize_for_inference pass for computing graph. - - :param dest_vars: list of output vars in the computing graph - - :Keyword Arguments: - - * enable_io16xc32 -- - whether to use float16 for I/O between oprs and use - float32 as internal computation precision. Note the output var would be - changed to float16. - * enable_ioc16 -- - whether to use float16 for both I/O and computation - precision. - - * enable_hwcd4 -- - whether to use NHWCD4 data layout. 
This is faster on some - OpenCL backend. - * enable_nchw88 -- - whether to use NCHW88 data layout, currently - used in X86 AVX backend. - * enable_nchw44 -- - whether to use NCHW44 data layout, currently - used in arm backend. - * enable_nchw44_dot -- - whether to use NCHW44_dot data layout, currently - used in armv8.2+dotprod backend. - * enable_nchw4 -- - whether to use NCHW4 data layout, currently - used in nvidia backend(based on cudnn). - * enable_nchw32 -- - whether to use NCHW32 data layout, currently - used in nvidia backend with tensorcore(based on cudnn). - * enable_chwn4 -- - whether to use CHWN4 data layout, currently - used in nvidia backend with tensorcore. - * enable_nchw64 -- - whether to use NCHW64 data layout, used for fast int4 - support on Nvidia GPU. - - * enable_fuse_conv_bias_nonlinearity: whether to fuse conv+bias+nonlinearty - into one opr. - * enable_fuse_conv_bias_with_z: whether to fuse conv_bias with z - input for inference on nvidia backend(this optimization pass will - result in mismatch of the precision of output of training and - inference) - * enable_fuse_preprocess: whether to fuse astype\pad channel\dimshuffle and - etc opr from h2d opr. + r"""Applies optimize_for_inference pass for computing graph. + + Args: + dest_vars: list of output vars in the computing graph + + Keyword Arguments: + + * enable_io16xc32 -- + whether to use float16 for I/O between oprs and use + float32 as internal computation precision. Note the output var would be + changed to float16. + * enable_ioc16 -- + whether to use float16 for both I/O and computation + precision. + * enable_hwcd4 -- + whether to use NHWCD4 data layout. This is faster on some + OpenCL backend. + * enable_nchw88 -- + whether to use NCHW88 data layout, currently + used in X86 AVX backend. + * enable_nchw44 -- + whether to use NCHW44 data layout, currently + used in arm backend. + * enable_nchw44_dot -- + whether to use NCHW44_dot data layout, currently + used in armv8.2+dotprod backend. + * enable_nchw4 -- + whether to use NCHW4 data layout, currently + used in nvidia backend(based on cudnn). + * enable_nchw32 -- + whether to use NCHW32 data layout, currently + used in nvidia backend with tensorcore(based on cudnn). + * enable_chwn4 -- + whether to use CHWN4 data layout, currently + used in nvidia backend with tensorcore. + * enable_nchw64 -- + whether to use NCHW64 data layout, used for fast int4 + support on Nvidia GPU. + * enable_fuse_conv_bias_nonlinearity: whether to fuse conv+bias+nonlinearty + into one opr. + * enable_fuse_conv_bias_with_z: whether to fuse conv_bias with z + input for inference on nvidia backend(this optimization pass will + result in mismatch of the precision of output of training and + inference) """ inference_options = GraphOptimizeOptions() inference_optimize_layout_transform_map = { @@ -305,11 +301,13 @@ def optimize_for_inference(dest_vars, **kwargs): def deserialize_infer_option(x: int) -> Dict[str, bool]: - r""" - Deserailize optimize options generated by ``imperative_rt.GraphOptimizeOptions``. + r"""Deserailize optimize options generated by ``imperative_rt.GraphOptimizeOptions``. - :param x: inference options represented by int. - :return: inference options represented by dict. + Args: + x: inference options represented by int. + + Returns: + inference options represented by dict. 
""" inference_options = GraphOptimizeOptions.deserialize(x) @@ -346,13 +344,12 @@ def deserialize_infer_option(x: int) -> Dict[str, bool]: def modify_opr_algo_strategy_inplace(dest_vars, strategy: str): - """ - C++ graph version of :func:`~.set_execution_strategy`. Used to inplacely modify + r"""C++ graph version of :func:`~.set_execution_strategy`. Used to inplacely modify dumped graph's fast-run strategy. - :param dest_vars: list of output vars in the computing graph. - :param strategy: fast-run algorithms strategy. - + Args: + dest_vars: list of output vars in the computing graph. + strategy: fast-run algorithms strategy. """ dest_vars = _unwrap(dest_vars) _imperative_rt.modify_opr_algo_strategy_inplace(dest_vars, strategy) @@ -383,39 +380,40 @@ def dump_graph( append_json=False, metadata=None ) -> Tuple[bytes, CompGraphDumpResult]: - """ - serialize the computing graph of `output_vars` and get byte result. - - :param output_vars: output variables which are the graph's end point. - - .. note:: - - The underlying C++ API only accepts a var list. If a dict is given, - the vars would be renamed to the given names. - - :param keep_var_name: level for keeping variable names: - - * 0: none of the names are kept - * 1: (default)keep names of output vars - * 2: keep names of all (output and internal) vars - :param keep_opr_name: whether to keep operator names. - :param keep_param_name: whether to keep param names, so param values can be - easily manipulated after loading model - :param keep_opr_priority: whether to keep priority setting for operators - :param strip_info_file: a string for path or a file handler. if is not None, - then the dump information for code strip would be written to ``strip_info_file`` - :param append_json: will be check when `strip_info_file` is not None. if set - true, the information for code strip will be append to strip_info_file. - if set false, will rewrite strip_info_file - :return: dump result as byte string, and an instance of namedtuple + r"""serialize the computing graph of `output_vars` and get byte result. + + Args: + output_vars: output variables which are the graph's end point. + keep_var_name: level for keeping variable names: + + * 0: none of the names are kept + * 1: (default)keep names of output vars + * 2: keep names of all (output and internal) vars + + keep_opr_name: whether to keep operator names. + keep_param_name: whether to keep param names, so param values can be + easily manipulated after loading model + keep_opr_priority: whether to keep priority setting for operators + strip_info_file: a string for path or a file handler. if is not None, + then the dump information for code strip would be written to ``strip_info_file`` + append_json: will be check when `strip_info_file` is not None. if set + true, the information for code strip will be append to strip_info_file. + if set false, will rewrite strip_info_file + + Note: + The underlying C++ API only accepts a var list. If a dict is given, + the vars would be renamed to the given names. 
+ + Returns: + dump result as byte string, and an instance of namedtuple :class:`CompGraphDumpResult`, whose fields are: - * ``nr_opr`` number of operators dumped - * ``tot_bytes`` total bytes for the whole graph - * ``tensor_value_bytes`` bytes consumed for dumping tensor values - * ``inputs`` names of input tensors - * ``params`` list of names of dumped params - * ``outputs`` names of output vars + * ``nr_opr`` number of operators dumped + * ``tot_bytes`` total bytes for the whole graph + * ``tensor_value_bytes`` bytes consumed for dumping tensor values + * ``inputs`` names of input tensors + * ``params`` list of names of dumped params + * ``outputs`` names of output vars """ if isinstance(output_vars, dict): used_vars = set() @@ -483,17 +481,19 @@ CompGraphLoadResult = collections.namedtuple( def load_graph(fpath) -> CompGraphLoadResult: - """ - Load a serialized computing graph from file. + r"""Load a serialized computing graph from file. + + Args: + fpath: Path or Handle of the input file - :param fpath: Path or Handle of the input file - :return: An instance of namedtuple :class:`CompGraphLoadResult`, + Returns: + An instance of namedtuple :class:`CompGraphLoadResult`, whose fields are: - * ``graph`` loaded CompGraph - * ``output_vars_dict`` A Python dict, mapping name to output SymbolVar - * ``output_vars_list`` A Python list, containing output vars in the - order passed to serialize_comp_graph_to_file + * ``graph`` loaded CompGraph + * ``output_vars_dict`` A Python dict, mapping name to output SymbolVar + * ``output_vars_list`` A Python list, containing output vars in the + order passed to serialize_comp_graph_to_file """ output_vars_map = [] output_vars_list = [] diff --git a/imperative/python/megengine/core/tensor/utils.py b/imperative/python/megengine/core/tensor/utils.py index 9c734a99..0b2b2104 100644 --- a/imperative/python/megengine/core/tensor/utils.py +++ b/imperative/python/megengine/core/tensor/utils.py @@ -24,12 +24,12 @@ _enable_convert_inputs = True def get_convert_inputs(): - """ get the curerent state of `_enable_convert_inputs` """ + r"""get the curerent state of `_enable_convert_inputs`""" return _enable_convert_inputs def set_convert_inputs(flag): - """ This function is a temporary workaround for reducing the overhead of operator + r"""This function is a temporary workaround for reducing the overhead of operator invocations. The function `convert_inputs` is disabled if the global state `_enable_convert_inputs` is set to `False`, otherwise enabled. This function is for internal use only, and should be removed when the tensor-like system is refactored. @@ -137,11 +137,11 @@ def setscalar(x): def astensor1d(x, *reference, dtype=None, device=None): - """ - Convert something to 1D tensor. Support following types - * sequence of scalar literal / tensor - * numpy array - * tensor (returned as is, regardless of dtype and device) + """Convert something to 1D tensor. Support following types + + * sequence of scalar literal / tensor + * numpy array + * tensor (returned as is, regardless of dtype and device) """ try: ndim = x.ndim diff --git a/imperative/python/megengine/data/collator.py b/imperative/python/megengine/data/collator.py index 113507f3..f7fded9f 100644 --- a/imperative/python/megengine/data/collator.py +++ b/imperative/python/megengine/data/collator.py @@ -33,16 +33,11 @@ default_collate_err_msg_format = ( class Collator: - r""" - Used for merging a list of samples to form a mini-batch of Tensor(s). Used when using batched loading from a dataset. 
+ r"""Used for merging a list of samples to form a mini-batch of Tensor(s). Used when using batched loading from a dataset. Modified from https://github.com/pytorch/pytorch/blob/master/torch/utils/data/_utils/collate.py """ def apply(self, inputs): - """ - :param inputs: sequence_N(tuple(CHW, C, CK)). - :return: tuple(NCHW, NC, NCK). - """ elem = inputs[0] elem_type = type(elem) if ( diff --git a/imperative/python/megengine/data/dataloader.py b/imperative/python/megengine/data/dataloader.py index a9c226ea..69a835cc 100644 --- a/imperative/python/megengine/data/dataloader.py +++ b/imperative/python/megengine/data/dataloader.py @@ -44,28 +44,28 @@ def raise_timeout_error(): class DataLoader: r"""Provides a convenient way to iterate on a given dataset. - + DataLoader combines a dataset with :class:`~.Sampler`, :class:`~.Transform` and :class:`~.Collator`, make it flexible to get minibatch continually from a dataset. - :param dataset: dataset from which to load the minibatch. - :param sampler: defines the strategy to sample data from the dataset. - :param transform: defined the transforming strategy for a sampled batch. - Default: None - :param collator: defined the merging strategy for a transformed batch. - Default: None - :param num_workers: the number of sub-process to load, transform and collate - the batch. ``0`` means using single-process. Default: 0 - :param timeout: if positive, means the timeout value(second) for collecting a - batch from workers. Default: 0 - :param timeout_event: callback function triggered by timeout, default to raise - runtime error. - :param divide: define the paralleling strategy in multi-processing mode. - ``True`` means one batch is divided into :attr:`num_workers` pieces, and - the workers will process these pieces parallelly. ``False`` means - different sub-process will process different batch. Default: False - + Args: + dataset: dataset from which to load the minibatch. + sampler: defines the strategy to sample data from the dataset. + transform: defined the transforming strategy for a sampled batch. + Default: None + collator: defined the merging strategy for a transformed batch. + Default: None + num_workers: the number of sub-process to load, transform and collate + the batch. ``0`` means using single-process. Default: 0 + timeout: if positive, means the timeout value(second) for collecting a + batch from workers. Default: 0 + timeout_event: callback function triggered by timeout, default to raise + runtime error. + divide: define the paralleling strategy in multi-processing mode. + ``True`` means one batch is divided into :attr:`num_workers` pieces, and + the workers will process these pieces parallelly. ``False`` means + different sub-process will process different batch. Default: False """ __initialized = False diff --git a/imperative/python/megengine/data/dataset/meta_dataset.py b/imperative/python/megengine/data/dataset/meta_dataset.py index cf4567fd..9bf6311e 100644 --- a/imperative/python/megengine/data/dataset/meta_dataset.py +++ b/imperative/python/megengine/data/dataset/meta_dataset.py @@ -11,8 +11,7 @@ from typing import Tuple class Dataset(ABC): - r""" - An abstract base class for all datasets. + r"""An abstract base class for all datasets. __getitem__ and __len__ method are aditionally needed. """ @@ -31,8 +30,7 @@ class Dataset(ABC): class StreamDataset(Dataset): - r""" - An abstract class for stream data. + r"""An abstract class for stream data. __iter__ method is aditionally needed. 
""" @@ -53,10 +51,9 @@ class StreamDataset(Dataset): class ArrayDataset(Dataset): - r""" - ArrayDataset is a dataset for numpy array data. + r"""ArrayDataset is a dataset for numpy array data. - One or more numpy arrays are needed to initiate the dataset. + One or more numpy arrays are needed to initiate the dataset. And the dimensions represented sample number are expected to be the same. """ diff --git a/imperative/python/megengine/data/dataset/vision/cifar.py b/imperative/python/megengine/data/dataset/vision/cifar.py index 81bc1433..16e68a22 100644 --- a/imperative/python/megengine/data/dataset/vision/cifar.py +++ b/imperative/python/megengine/data/dataset/vision/cifar.py @@ -21,8 +21,7 @@ logger = get_logger(__name__) class CIFAR10(VisionDataset): - r""" :class:`~.Dataset` for CIFAR10 meta data. - """ + r""":class:`~.Dataset` for CIFAR10 meta data.""" url_path = "http://www.cs.utoronto.ca/~kriz/" raw_file_name = "cifar-10-python.tar.gz" @@ -138,8 +137,7 @@ class CIFAR10(VisionDataset): class CIFAR100(CIFAR10): - r""" :class:`~.Dataset` for CIFAR100 meta data. - """ + r""":class:`~.Dataset` for CIFAR100 meta data.""" url_path = "http://www.cs.utoronto.ca/~kriz/" raw_file_name = "cifar-100-python.tar.gz" diff --git a/imperative/python/megengine/data/dataset/vision/cityscapes.py b/imperative/python/megengine/data/dataset/vision/cityscapes.py index 2602e1ca..a85efb46 100644 --- a/imperative/python/megengine/data/dataset/vision/cityscapes.py +++ b/imperative/python/megengine/data/dataset/vision/cityscapes.py @@ -23,9 +23,7 @@ from .meta_vision import VisionDataset class Cityscapes(VisionDataset): - r""" - `Cityscapes `_ Dataset. - """ + r"""`Cityscapes `_ Dataset.""" supported_order = ( "image", diff --git a/imperative/python/megengine/data/dataset/vision/coco.py b/imperative/python/megengine/data/dataset/vision/coco.py index 4cc13334..f4a31102 100644 --- a/imperative/python/megengine/data/dataset/vision/coco.py +++ b/imperative/python/megengine/data/dataset/vision/coco.py @@ -46,9 +46,7 @@ def has_valid_annotation(anno, order): class COCO(VisionDataset): - r""" - `MS COCO `_ Dataset. - """ + r"""`MS COCO `_ Dataset.""" supported_order = ( "image", diff --git a/imperative/python/megengine/data/dataset/vision/folder.py b/imperative/python/megengine/data/dataset/vision/folder.py index 1ecafb80..fc9705e5 100644 --- a/imperative/python/megengine/data/dataset/vision/folder.py +++ b/imperative/python/megengine/data/dataset/vision/folder.py @@ -26,22 +26,21 @@ from .utils import is_img class ImageFolder(VisionDataset): - r""" - ImageFolder is a class for loading image data and labels from a organized folder. - + r"""ImageFolder is a class for loading image data and labels from a organized folder. + The folder is expected to be organized as followed: root/cls/xxx.img_ext - + Labels are indices of sorted classes in the root directory. - :param root: root directory of an image folder. - :param loader: a function used to load image from path, - if ``None``, default function that loads - images with PIL will be called. - :param check_valid_func: a function used to check if files in folder are - expected image files, if ``None``, default function - that checks file extensions will be called. - :param class_name: if ``True``, return class name instead of class index. - + Args: + root: root directory of an image folder. + loader: a function used to load image from path, + if ``None``, default function that loads + images with PIL will be called. 
+ check_valid_func: a function used to check if files in folder are + expected image files, if ``None``, default function + that checks file extensions will be called. + class_name: if ``True``, return class name instead of class index. """ def __init__(self, root: str, check_valid_func=None, class_name: bool = False): diff --git a/imperative/python/megengine/data/dataset/vision/imagenet.py b/imperative/python/megengine/data/dataset/vision/imagenet.py index a5371eeb..b8315e6a 100644 --- a/imperative/python/megengine/data/dataset/vision/imagenet.py +++ b/imperative/python/megengine/data/dataset/vision/imagenet.py @@ -30,11 +30,10 @@ logger = get_logger(__name__) class ImageNet(ImageFolder): - r""" - Load ImageNet from raw files or folder. Expected folder looks like: - - .. code-block:: bash - + r"""Load ImageNet from raw files or folder. Expected folder looks like: + + .. code-block:: shell + ${root}/ | [REQUIRED TAR FILES] |- ILSVRC2012_img_train.tar @@ -45,22 +44,8 @@ class ImageNet(ImageFolder): |- val/cls/xxx.${img_ext} |- ILSVRC2012_devkit_t12/data/meta.mat |- ILSVRC2012_devkit_t12/data/ILSVRC2012_validation_ground_truth.txt - + If the image folders don't exist, raw tar files are required to get extracted and processed. - """ - - raw_file_meta = { - "train": ("ILSVRC2012_img_train.tar", "1d675b47d978889d74fa0da5fadfb00e"), - "val": ("ILSVRC2012_img_val.tar", "29b22e2961454d5413ddabcf34fc5622"), - "devkit": ("ILSVRC2012_devkit_t12.tar.gz", "fa75699e90414af021442c21a62c3abf"), - } # ImageNet raw files - default_train_dir = "train" - default_val_dir = "val" - default_devkit_dir = "ILSVRC2012_devkit_t12" - - def __init__(self, root: str = None, train: bool = True, **kwargs): - r""" - Initialization: * if ``root`` contains ``self.target_folder`` depending on ``train``: @@ -77,10 +62,22 @@ class ImageNet(ImageFolder): * raise error. - :param root: root directory of imagenet data, if root is ``None``, use default_dataset_root. - :param train: if ``True``, load the train split, otherwise load the validation split. - """ + Args: + root: root directory of imagenet data, if root is ``None``, use default_dataset_root. + train: if ``True``, load the train split, otherwise load the validation split. + + """ + raw_file_meta = { + "train": ("ILSVRC2012_img_train.tar", "1d675b47d978889d74fa0da5fadfb00e"), + "val": ("ILSVRC2012_img_val.tar", "29b22e2961454d5413ddabcf34fc5622"), + "devkit": ("ILSVRC2012_devkit_t12.tar.gz", "fa75699e90414af021442c21a62c3abf"), + } # ImageNet raw files + default_train_dir = "train" + default_val_dir = "val" + default_devkit_dir = "ILSVRC2012_devkit_t12" + + def __init__(self, root: str = None, train: bool = True, **kwargs): # process the root path if root is None: self.root = self._default_root diff --git a/imperative/python/megengine/data/dataset/vision/mnist.py b/imperative/python/megengine/data/dataset/vision/mnist.py index 1e62f107..efa81628 100644 --- a/imperative/python/megengine/data/dataset/vision/mnist.py +++ b/imperative/python/megengine/data/dataset/vision/mnist.py @@ -22,8 +22,7 @@ logger = get_logger(__name__) class MNIST(VisionDataset): - r""" :class:`~.Dataset` for MNIST meta data. 
- """ + r""":class:`~.Dataset` for MNIST meta data.""" url_path = "http://yann.lecun.com/exdb/mnist/" """ diff --git a/imperative/python/megengine/data/dataset/vision/objects365.py b/imperative/python/megengine/data/dataset/vision/objects365.py index 12643ec5..e9e9923f 100644 --- a/imperative/python/megengine/data/dataset/vision/objects365.py +++ b/imperative/python/megengine/data/dataset/vision/objects365.py @@ -23,9 +23,7 @@ from .meta_vision import VisionDataset class Objects365(VisionDataset): - r""" - `Objects365 `_ Dataset. - """ + r"""`Objects365 `_ Dataset.""" supported_order = ( "image", diff --git a/imperative/python/megengine/data/dataset/vision/voc.py b/imperative/python/megengine/data/dataset/vision/voc.py index 7d1eb744..8c85496b 100644 --- a/imperative/python/megengine/data/dataset/vision/voc.py +++ b/imperative/python/megengine/data/dataset/vision/voc.py @@ -24,9 +24,7 @@ from .meta_vision import VisionDataset class PascalVOC(VisionDataset): - r""" - `Pascal VOC `_ Dataset. - """ + r"""`Pascal VOC `_ Dataset.""" supported_order = ( "image", diff --git a/imperative/python/megengine/data/sampler.py b/imperative/python/megengine/data/sampler.py index 7cec2873..45fda1fd 100644 --- a/imperative/python/megengine/data/sampler.py +++ b/imperative/python/megengine/data/sampler.py @@ -17,9 +17,7 @@ import megengine.distributed as dist class Sampler(ABC): - r""" - An abstract base class for all Sampler - """ + r"""An abstract base class for all Sampler""" @abstractmethod def __init__(self): @@ -27,19 +25,19 @@ class Sampler(ABC): class MapSampler(Sampler): - r""" - Sampler for map dataset. - - :param dataset: dataset to sample from. - :param batch_size: batch size for batch method. - :param drop_last: set ``True`` to drop the last incomplete batch, - if the dataset size is not divisible by the batch size. If ``False`` and - the size of dataset is not divisible by the batch_size, then the last batch will - be smaller. Default: False - :param num_samples: number of samples assigned to one rank. - :param world_size: number of ranks. - :param rank: rank id, non-negative interger within 0 and ``world_size``. - :param seed: seed for random operators. + r"""Sampler for map dataset. + + Args: + dataset: dataset to sample from. + batch_size: batch size for batch method. + drop_last: set ``True`` to drop the last incomplete batch, + if the dataset size is not divisible by the batch size. If ``False`` and + the size of dataset is not divisible by the batch_size, then the last batch will + be smaller. Default: False + num_samples: number of samples assigned to one rank. + world_size: number of ranks. + rank: rank id, non-negative interger within 0 and ``world_size``. + seed: seed for random operators. """ def __init__( @@ -106,14 +104,11 @@ class MapSampler(Sampler): return int(math.ceil(self.num_samples / self.batch_size)) def sample(self): - """ - Return a list contains all sample indices. - """ + r"""Return a list contains all sample indices.""" raise NotImplementedError def scatter(self, indices) -> List: - r""" - Scatter method is used for splitting indices into subset, each subset + r"""Scatter method is used for splitting indices into subset, each subset will be assigned to a rank. Indices are evenly splitted by default. If customized indices assignment method is needed, please rewrite this method. """ @@ -130,9 +125,7 @@ class MapSampler(Sampler): return indices def batch(self) -> Iterator[List[Any]]: - r""" - Batch method provides a batch indices generator. 
- """ + r"""Batch method provides a batch indices generator.""" indices = list(self.sample()) # user might pass the world_size parameter without dist, @@ -150,18 +143,15 @@ class MapSampler(Sampler): class StreamSampler(Sampler): - r""" - Sampler for stream dataset. - - .. warning:: + r"""Sampler for stream dataset. + Warning: In the case of multiple machines, sampler should ensure that each worker gets different data. But this class cannot do it yet, please build your own dataset and sampler to achieve this goal. Usually, :meth:`~.StreamDataset.__iter__` can return different iterator by ``rank = dist.get_rank()``. So that they will get different data. - """ def __init__(self, batch_size=1): @@ -175,18 +165,18 @@ class StreamSampler(Sampler): class SequentialSampler(MapSampler): - r""" - Sample elements sequentially. - - :param dataset: dataset to sample from. - :param batch_size: batch size for batch method. - :param drop_last: set ``True`` to drop the last incomplete batch, - if the dataset size is not divisible by the batch size. If ``False`` and - the size of dataset is not divisible by the batch_size, then the last batch will - be smaller. Default: False - :param indices: indice of samples. - :param world_size: number of ranks. - :param rank: rank id, non-negative interger within 0 and ``world_size``. + r"""Sample elements sequentially. + + Args: + dataset: dataset to sample from. + batch_size: batch size for batch method. + drop_last: set ``True`` to drop the last incomplete batch, + if the dataset size is not divisible by the batch size. If ``False`` and + the size of dataset is not divisible by the batch_size, then the last batch will + be smaller. Default: False + indices: indice of samples. + world_size: number of ranks. + rank: rank id, non-negative interger within 0 and ``world_size``. """ def __init__( @@ -207,9 +197,7 @@ class SequentialSampler(MapSampler): self.indices = indices def sample(self) -> Iterator[Any]: - r""" - Return a generator. - """ + r"""Return a generator.""" if self.indices is None: return iter(range(len(self.dataset))) else: @@ -217,19 +205,19 @@ class SequentialSampler(MapSampler): class RandomSampler(MapSampler): - r""" - Sample elements randomly without replacement. - - :param dataset: dataset to sample from. - :param batch_size: batch size for batch method. - :param drop_last: set ``True`` to drop the last incomplete batch, - if the dataset size is not divisible by the batch size. If ``False`` and - the size of dataset is not divisible by the batch_size, then the last batch will - be smaller. Default: False - :param indices: indice of samples. - :param world_size: number of ranks. - :param rank: rank id, non-negative interger within 0 and ``world_size``. - :param seed: seed for random operators. + r"""Sample elements randomly without replacement. + + Args: + dataset: dataset to sample from. + batch_size: batch size for batch method. + drop_last: set ``True`` to drop the last incomplete batch, + if the dataset size is not divisible by the batch size. If ``False`` and + the size of dataset is not divisible by the batch_size, then the last batch will + be smaller. Default: False + indices: indice of samples. + world_size: number of ranks. + rank: rank id, non-negative interger within 0 and ``world_size``. + seed: seed for random operators. """ def __init__( @@ -258,20 +246,20 @@ class RandomSampler(MapSampler): class ReplacementSampler(MapSampler): - r""" - Sample elements randomly with replacement. - - :param dataset: dataset to sample from. 
-    :param batch_size: batch size for batch method.
-    :param drop_last: set ``True`` to drop the last incomplete batch,
-        if the dataset size is not divisible by the batch size. If ``False`` and
-        the size of dataset is not divisible by the batch_size, then the last batch will
-        be smaller. Default: False
-    :param num_samples: number of samples assigned to one rank.
-    :param weights: weights for sampling indices, it could be unnormalized weights.
-    :param world_size: number of ranks.
-    :param rank: rank id, non-negative interger within 0 and ``world_size``.
-    :param seed: seed for random operators.
+    r"""Sample elements randomly with replacement.
+
+    Args:
+        dataset: dataset to sample from.
+        batch_size: batch size for batch method.
+        drop_last: set ``True`` to drop the last incomplete batch,
+            if the dataset size is not divisible by the batch size. If ``False`` and
+            the size of dataset is not divisible by the batch_size, then the last batch will
+            be smaller. Default: False
+        num_samples: number of samples assigned to one rank.
+        weights: weights for sampling indices, it could be unnormalized weights.
+        world_size: number of ranks.
+        rank: rank id, non-negative integer within 0 and ``world_size``.
+        seed: seed for random operators.
     """
 
     def __init__(
diff --git a/imperative/python/megengine/data/tools/_queue.py b/imperative/python/megengine/data/tools/_queue.py
index 9acd8396..d6d598dd 100644
--- a/imperative/python/megengine/data/tools/_queue.py
+++ b/imperative/python/megengine/data/tools/_queue.py
@@ -59,15 +59,13 @@ class _PlasmaStoreManager:
 
 class PlasmaShmQueue:
     def __init__(self, maxsize: int = 0):
-        r"""
-        Use pyarrow in-memory plasma store to implement shared memory queue.
-
+        r"""Use pyarrow in-memory plasma store to implement shared memory queue.
         Compared to native `multiprocess.Queue`, `PlasmaShmQueue` avoid pickle/unpickle
         and communication overhead, leading to better performance in multi-process
         application.
 
-        :type maxsize: int
-        :param maxsize: maximum size of the queue, `None` means no limit. (default: ``None``)
+        Args:
+            maxsize: maximum size of the queue, `None` means no limit. (default: ``None``)
         """
 
         # Lazy start the plasma store manager
diff --git a/imperative/python/megengine/data/transform/meta_transform.py b/imperative/python/megengine/data/transform/meta_transform.py
index 3549692b..3892a1a4 100644
--- a/imperative/python/megengine/data/transform/meta_transform.py
+++ b/imperative/python/megengine/data/transform/meta_transform.py
@@ -11,9 +11,7 @@ from typing import Sequence, Tuple
 
 
 class Transform(ABC):
-    """
-    Rewrite apply method in subclass.
-    """
+    r"""Rewrite apply method in subclass."""
 
     def apply_batch(self, inputs: Sequence[Tuple]):
         return tuple(self.apply(input) for input in inputs)
diff --git a/imperative/python/megengine/data/transform/vision/functional.py b/imperative/python/megengine/data/transform/vision/functional.py
index 0116a055..089113e7 100644
--- a/imperative/python/megengine/data/transform/vision/functional.py
+++ b/imperative/python/megengine/data/transform/vision/functional.py
@@ -15,7 +15,7 @@ import numpy as np
 
 
 def wrap_keepdims(func):
-    """Wraper to keep the dimension of input images unchanged."""
+    r"""Wrapper to keep the dimension of input images unchanged."""
 
     @functools.wraps(func)
     def wrapper(image, *args, **kwargs):
@@ -33,41 +33,47 @@ def wrap_keepdims(func):
 
 
 @wrap_keepdims
 def to_gray(image):
-    r"""
-    Change BGR format image's color space to gray.
+    r"""Change BGR format image's color space to gray.
- :param image: input BGR format image, with `(H, W, C)` shape. - :return: gray format image, with `(H, W, C)` shape. + Args: + image: input BGR format image, with `(H, W, C)` shape. + + Returns: + gray format image, with `(H, W, C)` shape. """ return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) @wrap_keepdims def to_bgr(image): - r""" - Change gray format image's color space to BGR. + r"""Change gray format image's color space to BGR. + + Args: + image: input Gray format image, with `(H, W, C)` shape. - :param image: input Gray format image, with `(H, W, C)` shape. - :return: BGR format image, with `(H, W, C)` shape. + Returns: + BGR format image, with `(H, W, C)` shape. """ return cv2.cvtColor(image, cv2.COLOR_GRAY2BGR) @wrap_keepdims def pad(input, size, value): - r""" - Pad input data with *value* and given *size*. - - :param input: input data, with `(H, W, C)` shape. - :param size: padding size of input data, it could be integer or sequence. - If it is an integer, the input data will be padded in four directions. - If it is a sequence contains two integer, the bottom and right side - of input data will be padded. - If it is a sequence contains four integer, the top, bottom, left, right - side of input data will be padded with given size. - :param value: padding value of data, could be a sequence of int or float. - If it is float value, the dtype of image will be casted to float32 also. - :return: padded image. + r"""Pad input data with *value* and given *size*. + + Args: + input: input data, with `(H, W, C)` shape. + size: padding size of input data, it could be integer or sequence. + If it is an integer, the input data will be padded in four directions. + If it is a sequence contains two integer, the bottom and right side + of input data will be padded. + If it is a sequence contains four integer, the top, bottom, left, right + side of input data will be padded with given size. + value: padding value of data, could be a sequence of int or float. + If it is float value, the dtype of image will be casted to float32 also. + + Returns: + padded image. """ if isinstance(size, int): size = (size, size, size, size) @@ -80,32 +86,33 @@ def pad(input, size, value): @wrap_keepdims def flip(image, flipCode): - r""" - Accordding to the flipCode (the type of flip), flip the input image. - - :param image: input image, with `(H, W, C)` shape. - :param flipCode: code that indicates the type of flip. + r"""Accordding to the flipCode (the type of flip), flip the input image. - * 1 : Flip horizontally + Args: + image: input image, with `(H, W, C)` shape. + flipCode: code that indicates the type of flip. - * 0 : Flip vertically + * 1 : Flip horizontally + * 0 : Flip vertically + * -1: Flip horizontally and vertically - * -1: Flip horizontally and vertically - - :return: BGR format image, with `(H, W, C)` shape. + Returns: + BGR format image, with `(H, W, C)` shape. """ return cv2.flip(image, flipCode=flipCode) @wrap_keepdims def resize(input, size, interpolation=cv2.INTER_LINEAR): - r""" - Resize the input data to given size. + r"""Resize the input data to given size. + + Args: + input: input data, could be image or masks, with `(H, W, C)` shape. + size: target size of input data, with (height, width) shape. + interpolation: interpolation method. - :param input: input data, could be image or masks, with `(H, W, C)` shape. - :param size: target size of input data, with (height, width) shape. - :param interpolation: interpolation method. - :return: resized data, with `(H, W, C)` shape. 
+ Returns: + resized data, with `(H, W, C)` shape. """ if len(size) != 2: raise ValueError("resize needs (h, w), but got {}".format(size)) diff --git a/imperative/python/megengine/data/transform/vision/transform.py b/imperative/python/megengine/data/transform/vision/transform.py index d567669c..2af7bfce 100644 --- a/imperative/python/megengine/data/transform/vision/transform.py +++ b/imperative/python/megengine/data/transform/vision/transform.py @@ -42,36 +42,36 @@ __all__ = [ class VisionTransform(Transform): - r""" - Base class of all transforms used in computer vision. + r"""Base class of all transforms used in computer vision. Calling logic: apply_batch() -> apply() -> _apply_image() and other _apply_*() method. If you want to implement a self-defined transform method for image, rewrite _apply_image method in subclass. - :param order: input type order. Input is a tuple containing different structures, - order is used to specify the order of structures. For example, if your input - is (image, boxes) type, then the ``order`` should be ("image", "boxes"). - Current available strings and data type are describe below: - - * "image": input image, with shape of `(H, W, C)`. - * "coords": coordinates, with shape of `(N, 2)`. - * "boxes": bounding boxes, with shape of `(N, 4)`, "xyxy" format, - the 1st "xy" represents top left point of a box, - the 2nd "xy" represents right bottom point. - * "mask": map used for segmentation, with shape of `(H, W, 1)`. - * "keypoints": keypoints with shape of `(N, K, 3)`, N for number of instances, - and K for number of keypoints in one instance. The first two dimensions - of last axis is coordinate of keypoints and the the 3rd dimension is - the label of keypoints. - * "polygons": a sequence containing numpy arrays, its length is the number of instances. - Each numpy array represents polygon coordinate of one instance. - * "category": categories for some data type. For example, "image_category" - means category of the input image and "boxes_category" means categories of - bounding boxes. - * "info": information for images such as image shapes and image path. - - You can also customize your data types only if you implement the corresponding - _apply_*() methods, otherwise ``NotImplementedError`` will be raised. + Args: + order: input type order. Input is a tuple containing different structures, + order is used to specify the order of structures. For example, if your input + is (image, boxes) type, then the ``order`` should be ("image", "boxes"). + Current available strings and data type are describe below: + + * "image": input image, with shape of `(H, W, C)`. + * "coords": coordinates, with shape of `(N, 2)`. + * "boxes": bounding boxes, with shape of `(N, 4)`, "xyxy" format, + the 1st "xy" represents top left point of a box, + the 2nd "xy" represents right bottom point. + * "mask": map used for segmentation, with shape of `(H, W, 1)`. + * "keypoints": keypoints with shape of `(N, K, 3)`, N for number of instances, + and K for number of keypoints in one instance. The first two dimensions + of last axis is coordinate of keypoints and the the 3rd dimension is + the label of keypoints. + * "polygons": a sequence containing numpy arrays, its length is the number of instances. + Each numpy array represents polygon coordinate of one instance. + * "category": categories for some data type. For example, "image_category" + means category of the input image and "boxes_category" means categories of + bounding boxes. 
+ * "info": information for images such as image shapes and image path. + + You can also customize your data types only if you implement the corresponding + _apply_*() methods, otherwise ``NotImplementedError`` will be raised. """ def __init__(self, order=None): @@ -154,13 +154,13 @@ class VisionTransform(Transform): class ToMode(VisionTransform): - r""" - Change input data to a target mode. + r"""Change input data to a target mode. For example, most transforms use HWC mode image, while the neural network might use CHW mode input tensor. - :param mode: output mode of input. Default: "CHW" - :param order: the same with :class:`VisionTransform` + Args: + mode: output mode of input. Default: "CHW" + order: the same with :class:`VisionTransform` """ def __init__(self, mode="CHW", *, order=None): @@ -183,32 +183,31 @@ class ToMode(VisionTransform): class Compose(VisionTransform): - r""" - Composes several transforms together. - - :param transforms: list of :class:`VisionTransform` to compose. - :param batch_compose: whether use shuffle_indices for batch data or not. - If True, use original input sequence. - Otherwise, the shuffle_indices will be used for transforms. - :param shuffle_indices: indices used for random shuffle, start at 1. - For example, if shuffle_indices is [(1, 3), (2, 4)], then the 1st and 3rd transform - will be random shuffled, the 2nd and 4th transform will also be shuffled. - :param order: the same with :class:`VisionTransform` - + r"""Composes several transforms together. + + Args: + transforms: list of :class:`VisionTransform` to compose. + batch_compose: whether use shuffle_indices for batch data or not. + If True, use original input sequence. + Otherwise, the shuffle_indices will be used for transforms. + shuffle_indices: indices used for random shuffle, start at 1. + For example, if shuffle_indices is [(1, 3), (2, 4)], then the 1st and 3rd transform + will be random shuffled, the 2nd and 4th transform will also be shuffled. + order: the same with :class:`VisionTransform` + Examples: - - .. testcode:: - - from megengine.data.transform import RandomHorizontalFlip, RandomVerticalFlip, CenterCrop, ToMode, Compose - - transform_func = Compose([ - RandomHorizontalFlip(), - RandomVerticalFlip(), - CenterCrop(100), - ToMode("CHW"), - ], - shuffle_indices=[(1, 2, 3)] - ) + .. testcode:: + + from megengine.data.transform import RandomHorizontalFlip, RandomVerticalFlip, CenterCrop, ToMode, Compose + + transform_func = Compose([ + RandomHorizontalFlip(), + RandomVerticalFlip(), + CenterCrop(100), + ToMode("CHW"), + ], + shuffle_indices=[(1, 2, 3)] + ) """ def __init__( @@ -260,13 +259,13 @@ class Compose(VisionTransform): class TorchTransformCompose(VisionTransform): - r""" - Compose class used for transforms in torchvision, only support PIL image, + r"""Compose class used for transforms in torchvision, only support PIL image, some transforms with tensor in torchvision are not supported, such as Normalize and ToTensor in torchvision. - :param transforms: the same with ``Compose``. - :param order: the same with :class:`VisionTransform`. + Args: + transforms: the same with ``Compose``. + order: the same with :class:`VisionTransform`. """ def __init__(self, transforms, *, order=None): @@ -302,19 +301,19 @@ class TorchTransformCompose(VisionTransform): class Pad(VisionTransform): - r""" - Pad the input data. - - :param size: padding size of input image, it could be integer or sequence. - If it is an integer, the input image will be padded in four directions. 
- If it is a sequence containing two integers, the bottom and right side - of image will be padded. - If it is a sequence containing four integers, the top, bottom, left, right - side of image will be padded with given size. - :param value: padding value of image, could be a sequence of int or float. - if it is float value, the dtype of image will be casted to float32 also. - :param mask_value: padding value of segmentation map. - :param order: the same with :class:`VisionTransform`. + r"""Pad the input data. + + Args: + size: padding size of input image, it could be integer or sequence. + If it is an integer, the input image will be padded in four directions. + If it is a sequence containing two integers, the bottom and right side + of image will be padded. + If it is a sequence containing four integers, the top, bottom, left, right + side of image will be padded with given size. + value: padding value of image, could be a sequence of int or float. + if it is float value, the dtype of image will be casted to float32 also. + mask_value: padding value of segmentation map. + order: the same with :class:`VisionTransform`. """ def __init__(self, size=0, value=0, mask_value=0, *, order=None): @@ -350,18 +349,18 @@ class Pad(VisionTransform): class Resize(VisionTransform): - r""" - Resize the input data. - - :param output_size: target size of image, with (height, width) shape. - :param interpolation: interpolation method. All methods are listed below: - - * cv2.INTER_NEAREST – a nearest-neighbor interpolation. - * cv2.INTER_LINEAR – a bilinear interpolation (used by default). - * cv2.INTER_AREA – resampling using pixel area relation. - * cv2.INTER_CUBIC – a bicubic interpolation over 4×4 pixel neighborhood. - * cv2.INTER_LANCZOS4 – a Lanczos interpolation over 8×8 pixel neighborhood. - :param order: the same with :class:`VisionTransform`. + r"""Resize the input data. + + Args: + output_size: target size of image, with (height, width) shape. + interpolation: interpolation method. All methods are listed below: + + * cv2.INTER_NEAREST – a nearest-neighbor interpolation. + * cv2.INTER_LINEAR – a bilinear interpolation (used by default). + * cv2.INTER_AREA – resampling using pixel area relation. + * cv2.INTER_CUBIC – a bicubic interpolation over 4×4 pixel neighborhood. + * cv2.INTER_LANCZOS4 – a Lanczos interpolation over 8×8 pixel neighborhood. + order: the same with :class:`VisionTransform`. """ def __init__(self, output_size, interpolation=cv2.INTER_LINEAR, *, order=None): @@ -410,9 +409,7 @@ class Resize(VisionTransform): class ShortestEdgeResize(VisionTransform): - r""" - Resize the input data with specified shortset edge. - """ + r"""Resize the input data with specified shortset edge.""" def __init__( self, @@ -481,11 +478,11 @@ class ShortestEdgeResize(VisionTransform): class RandomResize(VisionTransform): - r""" - Resize the input data randomly. + r"""Resize the input data randomly. - :param scale_range: range of scaling. - :param order: the same with :class:`VisionTransform`. + Args: + scale_range: range of scaling. + order: the same with :class:`VisionTransform`. """ def __init__(self, scale_range, interpolation=cv2.INTER_LINEAR, *, order=None): @@ -526,15 +523,15 @@ class RandomResize(VisionTransform): class RandomCrop(VisionTransform): - r""" - Crop the input data randomly. Before applying the crop transform, + r"""Crop the input data randomly. Before applying the crop transform, pad the image first. 
    If target size is still bigger than the size of padded image,
    pad the image size to target size.
 
-    :param output_size: target size of output image, with (height, width) shape.
-    :param padding_size: the same with `size` in ``Pad``.
-    :param padding_value: the same with `value` in ``Pad``.
-    :param order: the same with :class:`VisionTransform`.
+    Args:
+        output_size: target size of output image, with (height, width) shape.
+        padding_size: the same with `size` in ``Pad``.
+        padding_value: the same with `value` in ``Pad``.
+        order: the same with :class:`VisionTransform`.
     """
 
     def __init__(
@@ -584,16 +581,16 @@ class RandomCrop(VisionTransform):
 
 
 class RandomResizedCrop(VisionTransform):
-    r"""
-    Crop the input data to random size and aspect ratio.
+    r"""Crop the input data to random size and aspect ratio.
     A crop of random size (default: of 0.08 to 1.0) of the original size and a random
     aspect ratio (default: of 3/4 to 1.33) of the original aspect ratio is made.
     After applying crop transfrom, the input data will be resized to given size.
 
-    :param output_size: target size of output image, with (height, width) shape.
-    :param scale_range: range of size of the origin size cropped. Default: (0.08, 1.0)
-    :param ratio_range: range of aspect ratio of the origin aspect ratio cropped. Default: (0.75, 1.33)
-    :param order: the same with :class:`VisionTransform`.
+    Args:
+        output_size: target size of output image, with (height, width) shape.
+        scale_range: range of size of the origin size cropped. Default: (0.08, 1.0)
+        ratio_range: range of aspect ratio of the origin aspect ratio cropped. Default: (0.75, 1.33)
+        order: the same with :class:`VisionTransform`.
     """
 
     def __init__(
@@ -674,11 +671,11 @@ class RandomResizedCrop(VisionTransform):
 
 
 class CenterCrop(VisionTransform):
-    r"""
-    Crops the given the input data at the center.
+    r"""Crops the given input data at the center.
 
-    :param output_size: target size of output image, with (height, width) shape.
-    :param order: the same with :class:`VisionTransform`.
+    Args:
+        output_size: target size of output image, with (height, width) shape.
+        order: the same with :class:`VisionTransform`.
     """
 
     def __init__(self, output_size, *, order=None):
@@ -718,11 +715,11 @@ class CenterCrop(VisionTransform):
 
 
 class RandomHorizontalFlip(VisionTransform):
-    r"""
-    Horizontally flip the input data randomly with a given probability.
+    r"""Horizontally flip the input data randomly with a given probability.
 
-    :param p: probability of the input data being flipped. Default: 0.5
-    :param order: the same with :class:`VisionTransform`.
+    Args:
+        prob: probability of the input data being flipped. Default: 0.5
+        order: the same with :class:`VisionTransform`.
     """
 
     def __init__(self, prob: float = 0.5, *, order=None):
@@ -751,11 +748,11 @@ class RandomHorizontalFlip(VisionTransform):
 
 
 class RandomVerticalFlip(VisionTransform):
-    r"""
-    Vertically flip the input data randomly with a given probability.
+    r"""Vertically flip the input data randomly with a given probability.
 
-    :param p: probability of the input data being flipped. Default: 0.5
-    :param order: the same with :class:`VisionTransform`.
+    Args:
+        prob: probability of the input data being flipped. Default: 0.5
+        order: the same with :class:`VisionTransform`.
     """
 
     def __init__(self, prob: float = 0.5, *, order=None):
@@ -784,15 +781,15 @@ class RandomVerticalFlip(VisionTransform):
 
 
 class Normalize(VisionTransform):
-    r"""
-    Normalize the input data with mean and standard deviation.
+ r"""Normalize the input data with mean and standard deviation. Given mean: ``(M1,...,Mn)`` and std: ``(S1,..,Sn)`` for ``n`` channels, this transform will normalize each channel of the input data. ``output[channel] = (input[channel] - mean[channel]) / std[channel]`` - :param mean: sequence of means for each channel. - :param std: sequence of standard deviations for each channel. - :param order: the same with :class:`VisionTransform`. + Args: + mean: sequence of means for each channel. + std: sequence of standard deviations for each channel. + order: the same with :class:`VisionTransform`. """ def __init__(self, mean=0.0, std=1.0, *, order=None): @@ -811,13 +808,13 @@ class Normalize(VisionTransform): class GaussianNoise(VisionTransform): - r""" - Add random gaussian noise to the input data. + r"""Add random gaussian noise to the input data. Gaussian noise is generated with given mean and std. - :param mean: Gaussian mean used to generate noise. - :param std: Gaussian standard deviation used to generate noise. - :param order: the same with :class:`VisionTransform` + Args: + mean: Gaussian mean used to generate noise. + std: Gaussian standard deviation used to generate noise. + order: the same with :class:`VisionTransform` """ def __init__(self, mean=0.0, std=1.0, *, order=None): @@ -839,12 +836,12 @@ class GaussianNoise(VisionTransform): class BrightnessTransform(VisionTransform): - r""" - Adjust brightness of the input data. + r"""Adjust brightness of the input data. - :param value: how much to adjust the brightness. Can be any - non negative number. 0 gives the original image. - :param order: the same with :class:`VisionTransform`. + Args: + value: how much to adjust the brightness. Can be any + non negative number. 0 gives the original image. + order: the same with :class:`VisionTransform`. """ def __init__(self, value, *, order=None): @@ -871,12 +868,12 @@ class BrightnessTransform(VisionTransform): class ContrastTransform(VisionTransform): - r""" - Adjust contrast of the input data. + r"""Adjust contrast of the input data. - :param value: how much to adjust the contrast. Can be any - non negative number. 0 gives the original image. - :param order: the same with :class:`VisionTransform`. + Args: + value: how much to adjust the contrast. Can be any + non negative number. 0 gives the original image. + order: the same with :class:`VisionTransform`. """ def __init__(self, value, *, order=None): @@ -903,12 +900,12 @@ class ContrastTransform(VisionTransform): class SaturationTransform(VisionTransform): - r""" - Adjust saturation of the input data. + r"""Adjust saturation of the input data. - :param value: how much to adjust the saturation. Can be any - non negative number. 0 gives the original image. - :param order: the same with :class:`VisionTransform`. + Args: + value: how much to adjust the saturation. Can be any + non negative number. 0 gives the original image. + order: the same with :class:`VisionTransform`. """ def __init__(self, value, *, order=None): @@ -935,12 +932,12 @@ class SaturationTransform(VisionTransform): class HueTransform(VisionTransform): - r""" - Adjust hue of the input data. + r"""Adjust hue of the input data. - :param value: how much to adjust the hue. Can be any number - between 0 and 0.5, 0 gives the original image. - :param order: the same with :class:`VisionTransform`. + Args: + value: how much to adjust the hue. Can be any number + between 0 and 0.5, 0 gives the original image. + order: the same with :class:`VisionTransform`. 
""" def __init__(self, value, *, order=None): @@ -974,22 +971,22 @@ class HueTransform(VisionTransform): class ColorJitter(VisionTransform): - r""" - Randomly change the brightness, contrast, saturation and hue of an image. - - :param brightness: how much to jitter brightness. - Chosen uniformly from [max(0, 1 - brightness), 1 + brightness] - or the given [min, max]. Should be non negative numbers. - :param contrast: how much to jitter contrast. - Chosen uniformly from [max(0, 1 - contrast), 1 + contrast] - or the given [min, max]. Should be non negative numbers. - :param saturation: how much to jitter saturation. - Chosen uniformly from [max(0, 1 - saturation), 1 + saturation] - or the given [min, max]. Should be non negative numbers. - :param hue: how much to jitter hue. - Chosen uniformly from [-hue, hue] or the given [min, max]. - Should have 0<= hue <= 0.5 or -0.5 <= min <= max <= 0.5. - :param order: the same with :class:`VisionTransform`. + r"""Randomly change the brightness, contrast, saturation and hue of an image. + + Args: + brightness: how much to jitter brightness. + Chosen uniformly from [max(0, 1 - brightness), 1 + brightness] + or the given [min, max]. Should be non negative numbers. + contrast: how much to jitter contrast. + Chosen uniformly from [max(0, 1 - contrast), 1 + contrast] + or the given [min, max]. Should be non negative numbers. + saturation: how much to jitter saturation. + Chosen uniformly from [max(0, 1 - saturation), 1 + saturation] + or the given [min, max]. Should be non negative numbers. + hue: how much to jitter hue. + Chosen uniformly from [-hue, hue] or the given [min, max]. + Should have 0<= hue <= 0.5 or -0.5 <= min <= max <= 0.5. + order: the same with :class:`VisionTransform`. """ def __init__(self, brightness=0, contrast=0, saturation=0, hue=0, *, order=None): @@ -1014,11 +1011,10 @@ class ColorJitter(VisionTransform): class Lighting(VisionTransform): - r""" - Apply AlexNet-Style "lighting" augmentation to input data. - + r"""Apply AlexNet-Style "lighting" augmentation to input data. + Input images are assumed to have 'RGB' channel order. - + The degree of color jittering is randomly sampled via a normal distribution, with standard deviation given by the scale parameter. """ diff --git a/imperative/python/megengine/device.py b/imperative/python/megengine/device.py index 8a31754a..06a93313 100644 --- a/imperative/python/megengine/device.py +++ b/imperative/python/megengine/device.py @@ -54,10 +54,10 @@ _device_type_set = {"cpu", "gpu", "xpu", "rocm"} def get_device_count(device_type: str) -> int: - """ - Gets number of devices installed on this system. + r"""Gets number of devices installed on this system. - :param device_type: device type, one of 'gpu' or 'cpu' + Args: + device_type: device type, one of 'gpu' or 'cpu' """ assert device_type in _device_type_set, "device must be one of {}".format( _device_type_set @@ -67,73 +67,59 @@ def get_device_count(device_type: str) -> int: def is_cuda_available() -> bool: - """ - Returns whether cuda device is available on this system. - - """ + r"""Returns whether cuda device is available on this system.""" t = _str2device_type("gpu") return CompNode._get_device_count(t, False) > 0 def is_cambricon_available() -> bool: - """ - Returns whether cambricon device is available on this system. 
- - """ + r"""Returns whether cambricon device is available on this system.""" t = _str2device_type("cambricon") return CompNode._get_device_count(t, False) > 0 def is_atlas_available() -> bool: - """ - Returns whether atlas device is available on this system. - - """ + r"""Returns whether atlas device is available on this system.""" t = _str2device_type("atlas") return CompNode._get_device_count(t, False) > 0 def is_rocm_available() -> bool: - """Returns whether rocm device is available on this system. - - """ + r"""Returns whether rocm device is available on this system.""" t = _str2device_type("rocm") return CompNode._get_device_count(t, False) > 0 def set_default_device(device: str = "xpux"): - r""" - Sets default computing node. - - :param device: default device type. The type can be 'cpu0', 'cpu1', etc., - or 'gpu0', 'gpu1', etc., to specify the particular cpu or gpu to use. - 'cpux' and 'gpux' can also be used to specify any number of cpu or gpu devices. - - 'multithread' device type is avaliable when inference, which implements - multi-threading parallelism at the operator level. For example, - 'multithread4' will compute with 4 threads. - - The default value is 'xpux' to specify any device available. The priority of using gpu is higher when both gpu and cpu are available. - - It can also be set by environment variable `MGE_DEFAULT_DEVICE`. + r"""Sets default computing node. + + Args: + device: default device type. + + Note: + * The type can be 'cpu0', 'cpu1', etc., or 'gpu0', 'gpu1', etc., + to specify the particular CPU or GPU to use. + * 'cpux' and 'gpux' can also be used to specify any number of CPU or GPU devices. + * The default value is 'xpux' to specify any device available. + * The priority of using GPU is higher when both GPU and CPU are available. + * 'multithread' device type is avaliable when inference, + which implements multi-threading parallelism at the operator level. + For example, 'multithread4' will compute with 4 threads. + * It can also be set by environment variable ``MGE_DEFAULT_DEVICE``. """ assert _valid_device(device), "Invalid device name {}".format(device) CompNode._set_default_device(device) def get_default_device() -> str: - r""" - Gets default computing node. - + r"""Gets default computing node. It returns the value set by :func:`~.set_default_device`. """ return CompNode._get_default_device() def get_mem_status_bytes(device: Optional[str] = None): - r""" - Get total and free memory on the computing device in bytes. - """ + r"""Get total and free memory on the computing device in bytes.""" if device is None: device = get_default_device() tot, free = CompNode(device).get_mem_status_bytes @@ -150,15 +136,17 @@ def set_prealloc_config( growth_factor=2.0, device_type=DeviceType.CUDA, ): - """ - Specifies how to pre-allocate from raw device allocator. - - :param alignment: specifies the alignment in bytes. - :param min_req: min request size in bytes. - :param max_overhead: max overhead above required size in bytes. - :param growth_factor: `request size / cur allocated` - :param device_type: the device type - + r"""Specifies how to pre-allocate from raw device allocator. + + Args: + alignment: specifies the alignment in bytes. + min_req: min request size in bytes. + max_overhead: max overhead above required size in bytes. 
+ growth_factor: request size / cur allocated` + device_type: the device type + alignment: int: + min_req: int: + max_overhead: int: """ assert alignment > 0 assert min_req > 0 diff --git a/imperative/python/megengine/distributed/__init__.py b/imperative/python/megengine/distributed/__init__.py index 3ac321c3..4b3a8847 100644 --- a/imperative/python/megengine/distributed/__init__.py +++ b/imperative/python/megengine/distributed/__init__.py @@ -31,17 +31,15 @@ from .server import Client, Server @mproperty def backend(mod): - r""" - Get or set backend of collective communication. + r"""Get or set backend of collective communication. Available backends are ['nccl', 'shm', 'rccl'] Examples: - .. code-block:: - - import megengine.distributed as dist - dist.backend = "nccl" + .. code-block:: + import megengine.distributed as dist + dist.backend = "nccl" """ assert group._sd, "please call init_process_group first" return group._sd.backend diff --git a/imperative/python/megengine/distributed/functional.py b/imperative/python/megengine/distributed/functional.py index e0b28fc7..d1c67271 100644 --- a/imperative/python/megengine/distributed/functional.py +++ b/imperative/python/megengine/distributed/functional.py @@ -50,7 +50,7 @@ def _backend(): def collective_comm(inp, mode, group, device): - """Helper function for applying collective communication functions.""" + r"""Helper function for applying collective communication functions.""" assert isinstance(group, Group) if group is None: return inp @@ -158,8 +158,7 @@ class _ReduceSum(Function): def reduce_sum( inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None, ) -> Tensor: - r""" - Reduce tensor data across the specified group by sum. + r"""Reduce tensor data across the specified group by sum. Only root process will receive the final result. Args: @@ -176,22 +175,20 @@ def reduce_sum( Reduced tensor if in root process, None in other processes. Examples: - - .. code-block:: - - input = Tensor([rank]) - # Rank 0 # input: Tensor([0]) - # Rank 1 # input: Tensor([1]) - output = reduce_sum(input) - # Rank 0 # output: Tensor([1]) - # Rank 1 # output: None - - input = Tensor([rank]) - group = Group([1, 0]) # first rank is root - output = reduce_sum(input, group) - # Rank 0 # output: None - # Rank 1 # output: Tensor([1]) - + .. code-block:: + + input = Tensor([rank]) + # Rank 0 # input: Tensor([0]) + # Rank 1 # input: Tensor([1]) + output = reduce_sum(input) + # Rank 0 # output: Tensor([1]) + # Rank 1 # output: None + + input = Tensor([rank]) + group = Group([1, 0]) # first rank is root + output = reduce_sum(input, group) + # Rank 0 # output: None + # Rank 1 # output: Tensor([1]) """ op = _ReduceSum(group, device) (out,) = apply(op, inp) @@ -222,8 +219,7 @@ class _Broadcast(Function): def broadcast( inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None, ) -> Tensor: - r""" - Broadcast tensor data from root process to others. + r"""Broadcast tensor data from root process to others. Args: inp: Input tensor. @@ -240,21 +236,20 @@ def broadcast( Examples: - .. code-block:: - - input = Tensor([rank]) - # Rank 0 # input: Tensor([0]) - # Rank 1 # input: Tensor([1]) - output = broadcast(input) - # Rank 0 # output: Tensor([0]) - # Rank 1 # output: Tensor([0]) + .. 
code-block:: - input = Tensor([rank]) - group = Group([1, 0]) # first rank is root - output = broadcast(input, group) - # Rank 0 # output: Tensor([1]) - # Rank 1 # output: Tensor([1]) + input = Tensor([rank]) + # Rank 0 # input: Tensor([0]) + # Rank 1 # input: Tensor([1]) + output = broadcast(input) + # Rank 0 # output: Tensor([0]) + # Rank 1 # output: Tensor([0]) + input = Tensor([rank]) + group = Group([1, 0]) # first rank is root + output = broadcast(input, group) + # Rank 0 # output: Tensor([1]) + # Rank 1 # output: Tensor([1]) """ shape, dtype = _bcast_shape_dtype(group, inp) if group.rank != 0: @@ -278,8 +273,7 @@ def _bcast_param( def all_gather( inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None, axis=0, ) -> Tensor: - r""" - Gather tensors across the specified group and concat them at first dimension. + r"""Gather tensors across the specified group and concat them at first dimension. Args: inp: Input tensor. @@ -298,21 +292,20 @@ def all_gather( Examples: - .. code-block:: - - input = Tensor([rank]) - # Rank 0 # input: Tensor([0]) - # Rank 1 # input: Tensor([1]) - output = all_gather(input) - # Rank 0 # output: Tensor([0 1]) - # Rank 1 # output: Tensor([0 1]) + .. code-block:: - input = Tensor([rank]) - group = Group([1, 0]) - output = all_gather(input, group) - # Rank 0 # output: Tensor([1 0]) - # Rank 1 # output: Tensor([1 0]) + input = Tensor([rank]) + # Rank 0 # input: Tensor([0]) + # Rank 1 # input: Tensor([1]) + output = all_gather(input) + # Rank 0 # output: Tensor([0 1]) + # Rank 1 # output: Tensor([0 1]) + input = Tensor([rank]) + group = Group([1, 0]) + output = all_gather(input, group) + # Rank 0 # output: Tensor([1 0]) + # Rank 1 # output: Tensor([1 0]) """ mode = CollectiveComm.Mode.ALL_GATHER out = collective_comm(inp, mode, group, device) @@ -338,8 +331,7 @@ def all_gather( def reduce_scatter_sum( inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None, axis=0 ) -> Tensor: - r""" - Reduce tensors across the specified group by sum and split them at first dimension. + r"""Reduce tensors across the specified group by sum and split them at first dimension. Args: inp: Input tensor. @@ -358,21 +350,20 @@ def reduce_scatter_sum( Examples: - .. code-block:: - - input = Tensor([0 1]) - # Rank 0 # input: Tensor([0 1]) - # Rank 1 # input: Tensor([0 1]) - output = reduce_scatter_sum(input) - # Rank 0 # output: Tensor([0]) - # Rank 1 # output: Tensor([2]) + .. code-block:: - input = Tensor([0 1]) - group = Group([1, 0]) - output = reduce_scatter_sum(input, group) - # Rank 0 # output: Tensor([2]) - # Rank 1 # output: Tensor([0]) + input = Tensor([0 1]) + # Rank 0 # input: Tensor([0 1]) + # Rank 1 # input: Tensor([0 1]) + output = reduce_scatter_sum(input) + # Rank 0 # output: Tensor([0]) + # Rank 1 # output: Tensor([2]) + input = Tensor([0 1]) + group = Group([1, 0]) + output = reduce_scatter_sum(input, group) + # Rank 0 # output: Tensor([2]) + # Rank 1 # output: Tensor([0]) """ group_size = group.size if group is not None else 1 assert ( @@ -398,8 +389,7 @@ def reduce_scatter_sum( def all_reduce_sum( inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None, ) -> Tensor: - r""" - Reduce tensors across the specified group by sum. + r"""Reduce tensors across the specified group by sum. Args: inp: Input tensor. @@ -416,15 +406,14 @@ def all_reduce_sum( Examples: - .. 
code-block:: - - input = Tensor(rank) - # Rank 0 # input: Tensor(0) - # Rank 1 # input: Tensor(1) - output = all_reduce_sum(input) - # Rank 0 # output: Tensor(1) - # Rank 1 # output: Tensor(1) + .. code-block:: + input = Tensor(rank) + # Rank 0 # input: Tensor(0) + # Rank 1 # input: Tensor(1) + output = all_reduce_sum(input) + # Rank 0 # output: Tensor(1) + # Rank 1 # output: Tensor(1) """ mode = CollectiveComm.Mode.ALL_REDUCE_SUM return collective_comm(inp, mode, group, device) @@ -433,8 +422,7 @@ def all_reduce_sum( def all_reduce_max( inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None, ) -> Tensor: - r""" - Reduce tensors across the specified group by max. + r"""Reduce tensors across the specified group by max. Args: inp: Input tensor. @@ -451,15 +439,14 @@ def all_reduce_max( Examples: - .. code-block:: - - input = Tensor(rank) - # Rank 0 # input: Tensor(0) - # Rank 1 # input: Tensor(1) - output = all_reduce_max(input) - # Rank 0 # output: Tensor(1) - # Rank 1 # output: Tensor(1) + .. code-block:: + input = Tensor(rank) + # Rank 0 # input: Tensor(0) + # Rank 1 # input: Tensor(1) + output = all_reduce_max(input) + # Rank 0 # output: Tensor(1) + # Rank 1 # output: Tensor(1) """ mode = CollectiveComm.Mode.ALL_REDUCE_MAX return collective_comm(inp, mode, group, device) @@ -468,8 +455,7 @@ def all_reduce_max( def all_reduce_min( inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None, ) -> Tensor: - r""" - Reduce tensors across the specified group by min. + r"""Reduce tensors across the specified group by min. Args: inp: Input tensor. @@ -486,15 +472,14 @@ def all_reduce_min( Examples: - .. code-block:: - - input = Tensor(rank) - # Rank 0 # input: Tensor(0) - # Rank 1 # input: Tensor(1) - output = all_reduce_min(input) - # Rank 0 # output: Tensor(0) - # Rank 1 # output: Tensor(0) + .. code-block:: + input = Tensor(rank) + # Rank 0 # input: Tensor(0) + # Rank 1 # input: Tensor(1) + output = all_reduce_min(input) + # Rank 0 # output: Tensor(0) + # Rank 1 # output: Tensor(0) """ mode = CollectiveComm.Mode.ALL_REDUCE_MIN return collective_comm(inp, mode, group, device) @@ -520,8 +505,7 @@ class _Gather(Function): def gather( inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None, axis=0, ) -> Tensor: - r""" - Gather tensors across the specified group. + r"""Gather tensors across the specified group. Only root process will receive the final result. Args: @@ -534,27 +518,23 @@ def gather( Specify "gpu0:1" to execute this operator on diffrent cuda stream, 1 is stream id, and default stream id is 0. axis: The concat axis for collective_comm result - The default axis is 0 - Returns: - Result tensor if in root process, None if in other process Examples: - .. code-block:: - - input = Tensor([rank]) - # Rank 0 # input: Tensor([0]) - # Rank 1 # input: Tensor([1]) - output = gather(input) - # Rank 0 # output: Tensor([0 1]) - # Rank 1 # output: None + .. 
code-block:: - input = Tensor([rank]) - group = Group([1, 0]) # first rank is root - output = gather(input, group) - # Rank 0 # output: None - # Rank 1 # output: Tensor([1 0]) + input = Tensor([rank]) + # Rank 0 # input: Tensor([0]) + # Rank 1 # input: Tensor([1]) + output = gather(input) + # Rank 0 # output: Tensor([0 1]) + # Rank 1 # output: None + input = Tensor([rank]) + group = Group([1, 0]) # first rank is root + output = gather(input, group) + # Rank 0 # output: None + # Rank 1 # output: Tensor([1 0]) """ assert ( axis < inp.ndim @@ -607,8 +587,7 @@ class _Scatter(Function): def scatter( inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None, axis=0, ) -> Tensor: - r""" - Split tensor in root process at first dimension. + r"""Split tensor in root process at first dimension. Args: inp: Input tensor. @@ -627,21 +606,20 @@ def scatter( Examples: - .. code-block:: - - input = Tensor([0 1]) + rank*2 - # Rank 0 # input: Tensor([0 1]) - # Rank 1 # input: Tensor([2 3]) - output = scatter(input) - # Rank 0 # output: Tensor([0]) - # Rank 1 # output: Tensor([1]) + .. code-block:: - input = Tensor([0 1]) + rank*2 - group = Group([1, 0]) # first rank is root - output = scatter(input, group) - # Rank 0 # output: Tensor([3]) - # Rank 1 # output: Tensor([2]) + input = Tensor([0 1]) + rank*2 + # Rank 0 # input: Tensor([0 1]) + # Rank 1 # input: Tensor([2 3]) + output = scatter(input) + # Rank 0 # output: Tensor([0]) + # Rank 1 # output: Tensor([1]) + input = Tensor([0 1]) + rank*2 + group = Group([1, 0]) # first rank is root + output = scatter(input, group) + # Rank 0 # output: Tensor([3]) + # Rank 1 # output: Tensor([2]) """ shape, dtype = _bcast_shape_dtype(group, inp) if group.rank != 0: @@ -680,8 +658,7 @@ def all_to_all( split_axis: int = 0, concat_axis: int = 0, ) -> Tensor: - r""" - Each process scatter input tensor to all processes and return gathered tensor. + r"""Each process scatter input tensor to all processes and return gathered tensor. Args: inp: Input tensor. @@ -694,29 +671,26 @@ def all_to_all( 1 is stream id, and default stream id is 0. split_axis: The axis that collectivecomm will split data the default axis is 0 - split_axis: The axis that collectivecomm will concat data - the default axis is 0 Returns: Result tensor. Examples: - .. code-block:: - - input = Tensor([0 1]) + rank*2 - # Rank 0 # input: Tensor([0 1]) - # Rank 1 # input: Tensor([2 3]) - output = all_to_all(input) - # Rank 0 # output: Tensor([0 2]) - # Rank 1 # output: Tensor([1 3]) + .. code-block:: - input = Tensor([0 1]) + rank*2 - group = Group([1, 0]) - output = all_to_all(input, group) - # Rank 0 # output: Tensor([0 3]) - # Rank 1 # output: Tensor([2 1]) + input = Tensor([0 1]) + rank*2 + # Rank 0 # input: Tensor([0 1]) + # Rank 1 # input: Tensor([2 3]) + output = all_to_all(input) + # Rank 0 # output: Tensor([0 2]) + # Rank 1 # output: Tensor([1 3]) + input = Tensor([0 1]) + rank*2 + group = Group([1, 0]) + output = all_to_all(input, group) + # Rank 0 # output: Tensor([0 3]) + # Rank 1 # output: Tensor([2 1]) """ group_size = group.size if group is not None else 1 assert ( @@ -805,8 +779,7 @@ class _RemoteRecv(Function): def remote_send(inp: Tensor, dest_rank: int): - r""" - Send tensor to another process. + r"""Send tensor to another process. Args: inp: Tensor to send. @@ -816,17 +789,15 @@ def remote_send(inp: Tensor, dest_rank: int): None. Examples: - - .. 
code-block:: - - if rank == 0: - data = mge.tensor(1) - # Tensor(1) - F.distributed.remote_send(data, 1) # return None - else: - data = F.distributed.remote_recv(0) - # Tensor(1) - + .. code-block:: + + if rank == 0: + data = mge.tensor(1) + # Tensor(1) + F.distributed.remote_send(data, 1) # return None + else: + data = F.distributed.remote_recv(0) + # Tensor(1) """ group = _SendRecvGroup(get_rank(), dest_rank) _bcast_shape_dtype(group, inp) @@ -844,8 +815,7 @@ def remote_send(inp: Tensor, dest_rank: int): def remote_recv(src_rank: int, device: Optional[str] = None, inp=None) -> Tensor: - r""" - Receive a tensor from another process. + r"""Receive a tensor from another process. Args: src_rank: Rank of source process. @@ -862,14 +832,13 @@ def remote_recv(src_rank: int, device: Optional[str] = None, inp=None) -> Tensor .. code-block:: - if rank == 0: - data = mge.tensor(1) - # Tensor(1) - F.distributed.remote_send(data, 1) # return None - else: - data = F.distributed.remote_recv(0) - # Tensor(1) - + if rank == 0: + data = mge.tensor(1) + # Tensor(1) + F.distributed.remote_send(data, 1) # return None + else: + data = F.distributed.remote_recv(0) + # Tensor(1) """ group = _SendRecvGroup(src_rank, get_rank()) shape, dtype = _bcast_shape_dtype(group, None) diff --git a/imperative/python/megengine/distributed/group.py b/imperative/python/megengine/distributed/group.py index 63017507..b9b43783 100644 --- a/imperative/python/megengine/distributed/group.py +++ b/imperative/python/megengine/distributed/group.py @@ -36,15 +36,13 @@ _sd = None class Group: - r""" - Include ranked nodes running collective communication (See :mod:`~.functional.distributed`). + r"""Include ranked nodes running collective communication (See :mod:`~.functional.distributed`). - By default collectives operate on the default group (also called ``WORLD``) - and require all processes to enter the distributed function call. + By default collectives operate on the default group (also called ``WORLD``) + and require all processes to enter the distributed function call. - :param proc_ranks: rank list of the group, the first one is root rank. - - + Args: + proc_ranks: rank list of the group, the first one is root rank. """ def __init__(self, proc_ranks): @@ -116,15 +114,15 @@ def init_process_group( backend: Optional[str] = "auto", device_type: str = "xpu", ) -> None: - """ - Initialize the distributed process group and specify the device used in the current process - - :param master_ip: ip address of the master node. - :param port: port available for all processes to communicate. - :param world_size: total number of processes participating in the job. - :param rank: rank of the current process. - :param device: the GPU device id to bind this process to. - :param backend: communicator backend, currently support 'nccl' and 'shm'. + r"""Initialize the distributed process group and specify the device used in the current process + + Args: + master_ip: ip address of the master node. + port: port available for all processes to communicate. + world_size: total number of processes participating in the job. + rank: rank of the current process. + device: the GPU device id to bind this process to. + backend: communicator backend, currently support 'nccl' and 'shm'. 
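+
+    Examples:
+        .. code-block::
+
+            # Usage sketch with illustrative values; the ``launcher`` decorator
+            # normally calls this for every worker process.
+            import megengine.distributed as dist
+
+            dist.init_process_group(
+                master_ip="localhost",
+                port=23333,
+                world_size=2,
+                rank=rank,    # 0 or 1, depending on the current process
+                device=rank,  # bind this process to GPU ``rank``
+            )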
""" physical_device_type = what_is_xpu() if device_type == "xpu" else device_type if not isinstance(master_ip, str): @@ -180,10 +178,10 @@ def _set_machine_ranks(ranks) -> None: @contextmanager def override_backend(new_backend: str): - """ - Override distributed backend + r"""Override distributed backend - :param new_backend: communicator backend set in this context. + Args: + new_backend: communicator backend set in this context. """ global _sd assert _sd, "please call init_process_group first" @@ -196,51 +194,51 @@ def override_backend(new_backend: str): def is_distributed() -> bool: - """Return True if the distributed process group has been initialized.""" + r"""Return True if the distributed process group has been initialized.""" return _sd is not None def get_rank() -> int: - """Get the rank of the current process.""" + r"""Get the rank of the current process.""" return _sd.proc_rank if _sd is not None else 0 def get_world_size() -> int: - """Get the total number of processes participating in the job.""" + r"""Get the total number of processes participating in the job.""" return _sd.world_size if _sd is not None else 1 def get_backend() -> str: - """Get the backend str.""" + r"""Get the backend str.""" assert _sd is not None, "please call init_process_group first" return _sd.backend if _sd is not None else None def get_py_server_addr() -> Tuple[str, int]: - """Get master_ip and port of python XML RPC server.""" + r"""Get master_ip and port of python XML RPC server.""" assert _sd is not None, "please call init_process_group first" return _sd.master_ip, _sd.py_server_port def get_mm_server_addr() -> Tuple[str, int]: - """Get master_ip and port of C++ mm_server.""" + r"""Get master_ip and port of C++ mm_server.""" assert _sd is not None, "please call init_process_group first" return _sd.master_ip, _sd.mm_server_port def get_client() -> Client: - """Get client of python XML RPC server.""" + r"""Get client of python XML RPC server.""" assert _sd is not None, "please call init_process_group first" return _sd.client def new_group(proc_ranks: List[int]) -> Group: - """Build a subgroup containing certain ranks.""" + r"""Build a subgroup containing certain ranks.""" return Group(proc_ranks) def group_barrier(group: Group = WORLD) -> None: - """Block until all ranks in the group reach this barrier.""" + r"""Block until all ranks in the group reach this barrier.""" # if running with single node, skip it if _sd is None: return diff --git a/imperative/python/megengine/distributed/helper.py b/imperative/python/megengine/distributed/helper.py index 43662363..9998a63c 100644 --- a/imperative/python/megengine/distributed/helper.py +++ b/imperative/python/megengine/distributed/helper.py @@ -28,39 +28,40 @@ from .group import WORLD, Group, group_barrier, is_distributed, override_backend def param_pack_split(inp: Tensor, offsets: list, shapes: list): - r""" - Returns split tensor to tensor list as offsets and shapes described, - only used for ``parampack``. + r"""Returns split tensor to tensor list as offsets and shapes described, + only used for ``parampack``. - :param inp: input tensor. - :param offsets: offsets of outputs, length of `2 * n`, + Args: + inp: input tensor. + offsets: offsets of outputs, length of `2 * n`, while n is tensor nums you want to split, format `[begin0, end0, begin1, end1]`. - :param shapes: tensor shapes of outputs. - :return: splitted tensors. + shapes: tensor shapes of outputs. - Examples: + Returns: + splitted tensors. - .. 
testcode:: + Examples: - import numpy as np - from megengine import tensor - from megengine.distributed.helper import param_pack_split + .. testcode:: - a = tensor(np.ones((10,), np.int32)) - b, c = param_pack_split(a, [0, 1, 1, 10], [(1,), (3, 3)]) - print(b.numpy()) - print(c.numpy()) + import numpy as np + from megengine import tensor + from megengine.distributed.helper import param_pack_split - Outputs: + a = tensor(np.ones((10,), np.int32)) + b, c = param_pack_split(a, [0, 1, 1, 10], [(1,), (3, 3)]) + print(b.numpy()) + print(c.numpy()) - .. testoutput:: + Outputs: - [1] - [[1 1 1] - [1 1 1] - [1 1 1]] + .. testoutput:: + [1] + [[1 1 1] + [1 1 1] + [1 1 1]] """ op = ParamPackSplit() op.offsets = offsets @@ -73,36 +74,37 @@ def param_pack_split(inp: Tensor, offsets: list, shapes: list): def param_pack_concat(inps: list, offsets: Tensor, offsets_val: list): - r""" - Returns concated tensor, only used for ``parampack``. + r"""Returns concated tensor, only used for ``parampack``. - :param inps: input tensors. - :param offsets: device value of offsets. - :param offsets_val: offsets of inputs, length of `2 * n`, + Args: + inps: input tensors. + offsets: device value of offsets. + offsets_val: offsets of inputs, length of `2 * n`, format `[begin0, end0, begin1, end1]`. - :return: concated tensor. - Examples: + Returns: + concated tensor. - .. testcode:: + Examples: - import numpy as np - from megengine import tensor - from megengine.distributed.helper import param_pack_concat + .. testcode:: - a = tensor(np.ones((1,), np.int32)) - b = tensor(np.ones((3, 3), np.int32)) - offsets_val = [0, 1, 1, 10] - offsets = tensor(offsets_val, np.int32) - c = param_pack_concat([a, b], offsets, offsets_val) - print(c.numpy()) + import numpy as np + from megengine import tensor + from megengine.distributed.helper import param_pack_concat - Outputs: + a = tensor(np.ones((1,), np.int32)) + b = tensor(np.ones((3, 3), np.int32)) + offsets_val = [0, 1, 1, 10] + offsets = tensor(offsets_val, np.int32) + c = param_pack_concat([a, b], offsets, offsets_val) + print(c.numpy()) - .. testoutput:: + Outputs: - [1 1 1 1 1 1 1 1 1 1] + .. testoutput:: + [1 1 1 1 1 1 1 1 1 1] """ op = ParamPackConcat() op.offsets = offsets_val @@ -165,9 +167,9 @@ class TensorFuture(Future): def synchronized(func: Callable): + r"""Decorator. Decorated function will synchronize when finished. + Specifically, we use this to prevent data race during hub.load """ - Decorator. Decorated function will synchronize when finished. - Specifically, we use this to prevent data race during hub.load""" @functools.wraps(func) def wrapper(*args, **kwargs): @@ -199,23 +201,23 @@ get_device_count_by_fork = deprecated_func( def bcast_list_(inps: list, group: Group = WORLD): - """ - Broadcast tensors between given group. + r"""Broadcast tensors between given group. - :param inps: input tensors. - :param group: communication group. + Args: + inps: input tensors. + group: communication group. """ for inp in inps: inp._reset(_bcast_param(inp, group)) class AllreduceCallback: - """ - Allreduce Callback with tensor fusion optimization. + r"""Allreduce Callback with tensor fusion optimization. - :param reduce_method: the method to reduce gradiants. - :param group: communication group. - :param backend: override distributed backend in allreduce + Args: + reduce_method: the method to reduce gradiants. + group: communication group. 
+ backend: override distributed backend in allreduce """ def __init__(self, reduce_method: str, group: Group = WORLD, backend: str = None): diff --git a/imperative/python/megengine/distributed/launcher.py b/imperative/python/megengine/distributed/launcher.py index 531962ee..4deca430 100644 --- a/imperative/python/megengine/distributed/launcher.py +++ b/imperative/python/megengine/distributed/launcher.py @@ -39,7 +39,7 @@ def _run_wrapped( queue: mp.Queue, machine_ranks: list, ): - """Init distributed process group and run wrapped function.""" + r"""Init distributed process group and run wrapped function.""" _check_device_initialized(device_type, dev) init_process_group( master_ip=master_ip, @@ -64,15 +64,16 @@ def _run_wrapped( class launcher: - """Decorator for launching multiple processes in single-machine multi-gpu training. - - :param func: the function you want to launch in distributed mode. - :param n_gpus: how many devices each node. - :param world_size: how many devices totally. - :param rank_start: start number for rank. - :param master_ip: ip address for master node (where the rank 0 is). - :param port: server port for distributed server. - :param backend: set default collective communication backend. + r"""Decorator for launching multiple processes in single-machine multi-gpu training. + + Args: + func: the function you want to launch in distributed mode. + n_gpus: how many devices each node. + world_size: how many devices totally. + rank_start: start number for rank. + master_ip: ip address for master node (where the rank 0 is). + port: server port for distributed server. + backend: set default collective communication backend. """ def __new__(cls, *args, **kwargs): diff --git a/imperative/python/megengine/distributed/server.py b/imperative/python/megengine/distributed/server.py index 8b3b569e..90a16413 100644 --- a/imperative/python/megengine/distributed/server.py +++ b/imperative/python/megengine/distributed/server.py @@ -20,11 +20,11 @@ from ..utils.future import Future class Methods: - """ - Distributed Server Method. + r"""Distributed Server Method. Used for exchange information between distributed nodes. - :param mm_server_port: multiple machine rpc server port. + Args: + mm_server_port: multiple machine rpc server port. """ def __init__(self, mm_server_port): @@ -39,19 +39,19 @@ class Methods: self.bcast_dict = {} def connect(self): - """Method for checking connection success.""" + r"""Method for checking connection success.""" return True def get_mm_server_port(self): - """Get multiple machine rpc server port.""" + r"""Get multiple machine rpc server port.""" return self.mm_server_port def set_is_grad(self, key, is_grad): - """ - Mark send/recv need gradiants by key. + r"""Mark send/recv need gradiants by key. - :param key: key to match send/recv op. - :param is_grad: whether this op need grad. + Args: + key: key to match send/recv op. + is_grad: whether this op need grad. """ with self.lock: future = self.dict_is_grad[key] @@ -59,10 +59,10 @@ class Methods: return True def check_is_grad(self, key): - """ - Check whether send/recv need gradiants. + r"""Check whether send/recv need gradiants. - :param key: key to match send/recv op. + Args: + key: key to match send/recv op. """ with self.lock: future = self.dict_is_grad[key] @@ -72,11 +72,11 @@ class Methods: return ret def set_remote_tracer(self, key, tracer_set): - """ - Set tracer dict for tracing send/recv op. + r"""Set tracer dict for tracing send/recv op. - :param key: key to match send/recv op. 
- :param tracer_set: valid tracer set. + Args: + key: key to match send/recv op. + tracer_set: valid tracer set. """ with self.lock: future = self.dict_remote_tracer[key] @@ -84,10 +84,10 @@ class Methods: return True def check_remote_tracer(self, key): - """ - Get tracer dict for send/recv op. + r"""Get tracer dict for send/recv op. - :param key: key to match send/recv op. + Args: + key: key to match send/recv op. """ with self.lock: future = self.dict_remote_tracer[key] @@ -97,11 +97,11 @@ class Methods: return ret def group_barrier(self, key, size): - """ - A barrier wait for all group member. + r"""A barrier wait for all group member. - :param key: group key to match each other. - :param size: group size. + Args: + key: group key to match each other. + size: group size. """ with self.lock: self.dict_barrier_counter[key] += 1 @@ -116,14 +116,14 @@ class Methods: return True def user_set(self, key, val): - """Set user defined key-value pairs across processes.""" + r"""Set user defined key-value pairs across processes.""" with self.lock: future = self.user_dict[key] future.set(val) return True def user_get(self, key): - """Get user defined key-value pairs across processes.""" + r"""Get user defined key-value pairs across processes.""" with self.lock: future = self.user_dict[key] return future.get() @@ -161,12 +161,12 @@ class ThreadXMLRPCServer(ThreadingMixIn, SimpleXMLRPCServer): def _start_server(py_server_port, queue): - """ - Start python distributed server and multiple machine server. + r"""Start python distributed server and multiple machine server. - :param py_server_port: python server port. - :param mm_server_port: multiple machine server port. - :param queue: server port will put in this queue, puts exception when process fails. + Args: + py_server_port: python server port. + mm_server_port: multiple machine server port. + queue: server port will put in this queue, puts exception when process fails. """ try: mm_server_port = create_mm_server("0.0.0.0", 0) @@ -182,11 +182,11 @@ def _start_server(py_server_port, queue): class Server: - """ - Distributed Server for distributed training. + r"""Distributed Server for distributed training. Should be running at master node. - :param port: python server port. + Args: + port: python server port. """ def __init__(self, port=0): @@ -204,11 +204,11 @@ class Server: class Client: - """ - Distributed Client for distributed training. + r"""Distributed Client for distributed training. - :param master_ip: ip address of master node. - :param port: port of server at master node. + Args: + master_ip: ip address of master node. + port: port of server at master node. """ def __init__(self, master_ip, port): @@ -218,7 +218,7 @@ class Client: self.bcast_dict = defaultdict(lambda: 0) def connect(self): - """Check connection success.""" + r"""Check connection success.""" while True: try: self.proxy = ServerProxy( @@ -230,62 +230,62 @@ class Client: time.sleep(1) def get_mm_server_port(self): - """Get multiple machine server port.""" + r"""Get multiple machine server port.""" return self.proxy.get_mm_server_port() def set_is_grad(self, key, is_grad): - """ - Mark send/recv need gradiants by key. + r"""Mark send/recv need gradiants by key. - :param key: key to match send/recv op. - :param is_grad: whether this op need grad. + Args: + key: key to match send/recv op. + is_grad: whether this op need grad. """ self.proxy.set_is_grad(key, is_grad) def check_is_grad(self, key): - """ - Check whether send/recv need gradiants. 
+ r"""Check whether send/recv need gradiants. - :param key: key to match send/recv op. + Args: + key: key to match send/recv op. """ return self.proxy.check_is_grad(key) def set_remote_tracer(self, key, tracer_set): - """ - Set tracer dict for tracing send/recv op. + r"""Set tracer dict for tracing send/recv op. - :param key: key to match send/recv op. - :param tracer_set: valid tracer set. + Args: + key: key to match send/recv op. + tracer_set: valid tracer set. """ self.proxy.set_remote_tracer(key, tracer_set) def check_remote_tracer(self, key): - """ - Get tracer dict for send/recv op. + r"""Get tracer dict for send/recv op. - :param key: key to match send/recv op. + Args: + key: key to match send/recv op. """ return self.proxy.check_remote_tracer(key) def group_barrier(self, key, size): - """ - A barrier wait for all group member. + r"""A barrier wait for all group member. - :param key: group key to match each other. - :param size: group size. + Args: + key: group key to match each other. + size: group size. """ self.proxy.group_barrier(key, size) def user_set(self, key, val): - """Set user defined key-value pairs across processes.""" + r"""Set user defined key-value pairs across processes.""" return self.proxy.user_set(key, val) def user_get(self, key): - """Get user defined key-value pairs across processes.""" + r"""Get user defined key-value pairs across processes.""" return self.proxy.user_get(key) def user_pop(self, key): - """Get user defined key-value pairs and delete the resources when the get is done""" + r"""Get user defined key-value pairs and delete the resources when the get is done""" return self.proxy.user_pop(key) def bcast_val(self, val, key, size): diff --git a/imperative/python/megengine/dtr/dtr.py b/imperative/python/megengine/dtr/dtr.py index 10435fff..5f043d3c 100644 --- a/imperative/python/megengine/dtr/dtr.py +++ b/imperative/python/megengine/dtr/dtr.py @@ -30,24 +30,20 @@ def _str2bytes(text: str) -> int: @property def eviction_threshold(mod): - r""" - Get or set the eviction threshold in bytes. It can also be set to a string, + r"""Get or set the eviction threshold in bytes. It can also be set to a string, whose formatting supports byte(B), kilobyte(KB), megabyte(MB) and gigabyte(GB) units. - - .. note:: - + + Note: When GPU memory usage exceeds this value, DTR will heuristically select and evict resident tensors until the amount of used memory falls below this threshold. - + Examples: + .. code-block:: - .. code-block:: - - import megengine as mge - mge.dtr.eviction_threshold = "2GB" - + import megengine as mge + mge.dtr.eviction_threshold = "2GB" """ return _eviction_threshold @@ -66,24 +62,21 @@ def eviction_threshold(mod, value: Union[int, str]): @property def evictee_minimum_size(mod): - r""" - Get or set the memory threshold of tensors in bytes. It can also be set to a + r"""Get or set the memory threshold of tensors in bytes. It can also be set to a string, whose formatting supports byte(B), kilobyte(KB), megabyte(MB) and gigabyte(GB) units. - - .. note:: - + + Note: Only tensors whose size exceeds this threshold will be added to the candidate set. A tensor that is not added to the candidate set will never be evicted during its lifetime. - + Examples: + + .. code-block:: - .. 
code-block:: - - import megengine as mge - mge.dtr.evictee_minimum_size = "2MB" - + import megengine as mge + mge.dtr.evictee_minimum_size = "2MB" """ return _evictee_minimum_size @@ -102,19 +95,16 @@ def evictee_minimum_size(mod, value: Union[int, str]): @property def enable_sqrt_sampling(mod): - r""" - Get or set whether sqrt sampling is allowed. Sqrt sampling means that given + r"""Get or set whether sqrt sampling is allowed. Sqrt sampling means that given the size of the candidate set is N, only enumerate sqrt(N) tensors. When the number of tensors is very high, enabling this optimization will speed up the training. + + Examples: + .. code-block:: - Examples: - - .. code-block:: - - import megengine as mge - mge.dtr.enable_sqrt_sampling = True - + import megengine as mge + mge.dtr.enable_sqrt_sampling = True """ return _enable_sqrt_sampling @@ -127,9 +117,7 @@ def enable_sqrt_sampling(mod, value: bool): def enable(): - r""" - Enable to record computing path of tensors and to perform DTR policy. - """ + r"""Enable to record computing path of tensors and to perform DTR policy.""" _set_defrag(True) _set_option("enable_dtr_auto_drop", 1) _set_option("enable_drop", 1) @@ -138,9 +126,7 @@ def enable(): def disable(): - r""" - Stop recording computing path of tensors and performing DTR policy. - """ + r"""Stop recording computing path of tensors and performing DTR policy.""" _set_defrag(False) _set_option("enable_dtr_auto_drop", 0) _set_option("enable_drop", 0) diff --git a/imperative/python/megengine/functional/debug_param.py b/imperative/python/megengine/functional/debug_param.py index 2bdf9935..43d00596 100644 --- a/imperative/python/megengine/functional/debug_param.py +++ b/imperative/python/megengine/functional/debug_param.py @@ -23,8 +23,7 @@ if os.getenv("MEGENGINE_CONV_EXECUTION_STRATEGY") != None: def get_execution_strategy() -> Strategy: - """ - Returns the execution strategy of :class:`~module..Conv2d` and :func:`~.matmul` + r"""Returns the execution strategy of :class:`~module..Conv2d` and :func:`~.matmul` See :func:`~.set_execution_strategy` for possible return values """ @@ -32,31 +31,32 @@ def get_execution_strategy() -> Strategy: def set_execution_strategy(option): - """ - Sets the execution strategy of :class:`~module.Conv2d` and :func:`~.matmul` + r"""Sets the execution strategy of :class:`~module.Conv2d` and :func:`~.matmul` + + Args: + option: Decides how :class:`~.module.Conv2d`and :func:`~.matmul` algorithms are chosen. + Available value Strategy - :param option: Decides how :class:`~module.Conv2d`and :func:`~.matmul` algorithms are chosen. - Available value Strategy - * HEURISTIC uses heuristic to choose the fastest algorithm. - * PROFILE runs possible algorithms on real device to find the best one. - * REPRODUCIBLE uses the algorithms that is reproducible. - * OPTIMIZED uses the algorithms that is optimized. + * HEURISTIC uses heuristic to choose the fastest algorithm. + * PROFILE runs possible algorithms on real device to find the best one. + * REPRODUCIBLE uses the algorithms that is reproducible. + * OPTIMIZED uses the algorithms that is optimized. - The default strategy is HEURISTIC, this options can be combined to - form a combination option, e.g. PROFILE | REPRODUCIBLE - can combined a option that uses the fastest of profiling result that is also reproducible. + The default strategy is HEURISTIC, this options can be combined to + form a combination option, e.g. 
PROFILE | REPRODUCIBLE + can combined a option that uses the fastest of profiling result that is also reproducible. - Available values string: + Available values string: - * 'HEURISTIC' uses heuristic to choose the fastest algorithm. - * 'PROFILE' runs possible algorithms on real device to find the best one. - * 'PROFILE_HEURISTIC' uses profiling result and heuristic to choose the fastest algorithm. - * 'PROFILE_REPRODUCIBLE' uses the fastest of profiling result that is also reproducible. - * 'HEURISTIC_REPRODUCIBLE' uses heuristic to choose the fastest algorithm that is also reproducible. + * 'HEURISTIC' uses heuristic to choose the fastest algorithm. + * 'PROFILE' runs possible algorithms on real device to find the best one. + * 'PROFILE_HEURISTIC' uses profiling result and heuristic to choose the fastest algorithm. + * 'PROFILE_REPRODUCIBLE' uses the fastest of profiling result that is also reproducible. + * 'HEURISTIC_REPRODUCIBLE' uses heuristic to choose the fastest algorithm that is also reproducible. - The default strategy is 'HEURISTIC'. + The default strategy is 'HEURISTIC'. - It can also be set through the environment variable 'MEGENGINE_EXECUTION_STRATEGY'. + It can also be set through the environment variable 'MEGENGINE_EXECUTION_STRATEGY'. """ valid_string_option = { "REPRODUCIBLE": Strategy.REPRODUCIBLE, diff --git a/imperative/python/megengine/functional/elemwise.py b/imperative/python/megengine/functional/elemwise.py index 2b9bd610..cd584139 100644 --- a/imperative/python/megengine/functional/elemwise.py +++ b/imperative/python/megengine/functional/elemwise.py @@ -78,182 +78,163 @@ def _elemwise_multi_type(*args, mode, **kwargs): def add(x, y): - """ - Element-wise `addition`. - At least one operand should be tensor. - - Same for sub/mul/div/floor_div/pow/mod/atan2/equal/not_equal/less/less_equal/greater/greater_equal/maximum/minmium. - - :param x: input tensor. - :return: computed tensor. + r"""Element-wise `addition`. Examples: - .. testcode:: + .. testcode:: - import numpy as np - from megengine import tensor - import megengine.functional as F + import numpy as np + from megengine import tensor + import megengine.functional as F - x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) - y = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) - out = F.add(x, y) - print(out.numpy()) + x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) + y = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) + out = F.add(x, y) + print(out.numpy()) - Outputs: + Outputs: - .. testoutput:: - - [[ 0. 2. 4.] - [ 6. 8. 10.]] + .. testoutput:: + [[ 0. 2. 4.] + [ 6. 8. 
10.]] """ return _elwise(x, y, mode=Elemwise.Mode.ADD) def sub(x, y): - """Element-wise `subtraction`.""" + r"""Element-wise `subtraction`.""" return _elwise(x, y, mode=Elemwise.Mode.SUB) def mul(x, y): - """Element-wise `multiplication`.""" + r"""Element-wise `multiplication`.""" return _elwise(x, y, mode=Elemwise.Mode.MUL) def div(x, y): - """Element-wise `(x / y)`.""" + r"""Element-wise `(x / y)`.""" return _elwise(x, y, mode=Elemwise.Mode.TRUE_DIV) def floor_div(x, y): - """Element-wise `floor(x / y)`.""" + r"""Element-wise `floor(x / y)`.""" return _elwise(x, y, mode=Elemwise.Mode.FLOOR_DIV) def neg(x): - """Element-wise `negation`.""" + r"""Element-wise `negation`.""" return _elwise(x, mode=Elemwise.Mode.NEGATE) def pow(x, y): - """Element-wise `power`.""" + r"""Element-wise `power`.""" return _elwise(x, y, mode=Elemwise.Mode.POW) def mod(x, y): - """Element-wise `remainder of division`.""" + r"""Element-wise `remainder of division`.""" return _elwise(x, y, mode=Elemwise.Mode.MOD) def abs(x): - """Element-wise `absolute value`.""" + r"""Element-wise `absolute value`.""" return _elwise(x, mode=Elemwise.Mode.ABS) def exp(x): - """Element-wise `exponential`.""" + r"""Element-wise `exponential`.""" return _elwise(x, mode=Elemwise.Mode.EXP) def expm1(x): - """Element-wise `exp(x)-1`.""" + r"""Element-wise `exp(x)-1`.""" return _elwise(x, mode=Elemwise.Mode.EXPM1) def log(x): - """Element-wise `logarithm (base e)`.""" + r"""Element-wise `logarithm (base e)`.""" return _elwise(x, mode=Elemwise.Mode.LOG) def log1p(x): - """Element-wise `log(x+1) (base e)`.""" + r"""Element-wise `log(x+1) (base e)`.""" return _elwise(x, mode=Elemwise.Mode.LOG1P) def sqrt(x: Tensor) -> Tensor: - """ - Element-wise `sqrt`. - Returns ``NaN`` for negative input value. - - :param x: input tensor. - :return: computed tensor. + r"""Element-wise `sqrt`. Examples: - .. testcode:: + .. testcode:: - import numpy as np - from megengine import tensor - import megengine.functional as F + import numpy as np + from megengine import tensor + import megengine.functional as F - x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) - out = F.sqrt(x) - print(out.numpy().round(decimals=4)) + x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) + out = F.sqrt(x) + print(out.numpy().round(decimals=4)) - Outputs: + Outputs: - .. testoutput:: - - [[0. 1. 1.4142] - [1.7321 2. 2.2361]] + .. testoutput:: + [[0. 1. 1.4142] + [1.7321 2. 2.2361]] """ return x ** 0.5 def square(x: Tensor) -> Tensor: - """ - Returns a new tensor with the square of the elements of input tensor. - - :param inp: input tensor. - :return: computed tensor. + r"""Element-wise `square`. Examples: - .. testcode:: + .. testcode:: - import numpy as np - import megengine as mge - import megengine.functional as F + import numpy as np + import megengine as mge + import megengine.functional as F - data = mge.tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) - out = F.square(data) - print(out.numpy().round(decimals=4)) + data = mge.tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) + out = F.square(data) + print(out.numpy().round(decimals=4)) - Outputs: + Outputs: - .. testoutput:: - - [[ 0. 1. 4.] - [ 9. 16. 25.]] + .. testoutput:: + [[ 0. 1. 4.] + [ 9. 16. 
25.]] """ return x ** 2 def round(x): - """Element-wise `rounding to int`.""" + r"""Element-wise `rounding to int`.""" return _elwise(x, mode=Elemwise.Mode.ROUND) def ceil(x): - """Element-wise `ceiling`.""" + r"""Element-wise `ceiling`.""" return _elwise(x, mode=Elemwise.Mode.CEIL) def floor(x): - """Element-wise `floor`.""" + r"""Element-wise `floor`.""" return _elwise(x, mode=Elemwise.Mode.FLOOR) def maximum(x, y): - """Element-wise `maximum of array elements`.""" + r"""Element-wise `maximum of array elements`.""" return _elwise(x, y, mode=Elemwise.Mode.MAX) def minimum(x, y): - """Element-wise `minimum of array elements`.""" + r"""Element-wise `minimum of array elements`.""" return _elwise(x, y, mode=Elemwise.Mode.MIN) @@ -261,62 +242,57 @@ def minimum(x, y): def cos(x): - """ - Element-wise `cosine`. - - :param x: input tensor. - :return: computed tensor. + r"""Element-wise `cosine`. Examples: - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) - out = F.cos(x) - print(out.numpy().round(decimals=4)) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) + out = F.cos(x) + print(out.numpy().round(decimals=4)) - .. testoutput:: + Outputs: - [[ 1. 0.5403 -0.4161] - [-0.99 -0.6536 0.2837]] + .. testoutput:: + [[ 1. 0.5403 -0.4161] + [-0.99 -0.6536 0.2837]] """ return _elwise(x, mode=Elemwise.Mode.COS) def sin(x): - """Element-wise `sine`.""" + r"""Element-wise `sine`.""" return _elwise(x, mode=Elemwise.Mode.SIN) def tan(x): - """Element-wise `tangent`.""" + r"""Element-wise `tangent`.""" return sin(x) / cos(x) def acos(x): - """Element-wise `inverse cosine`.""" + r"""Element-wise `inverse cosine`.""" return _elwise(x, mode=Elemwise.Mode.ACOS) def asin(x): - """Element-wise `inverse sine`.""" + r"""Element-wise `inverse sine`.""" return _elwise(x, mode=Elemwise.Mode.ASIN) def atan(x): - """Element-wise `inverse tangent`.""" + r"""Element-wise `inverse tangent`.""" return _elwise(x, 1, mode=Elemwise.Mode.ATAN2) def atan2(y, x): - """Element-wise `2-argument arctangent`.""" + r"""Element-wise `2-argument arctangent`.""" return _elwise(y, x, mode=Elemwise.Mode.ATAN2) @@ -355,38 +331,33 @@ def atanh(x): def left_shift(x, y): - """ - Element-wise `bitwise binary: x << y`. + r"""Element-wise `bitwise binary: x << y`. - :param x: input tensor, should be int. - :param y: how many bits to be left-shifted. - :return: computed tensor. + Examples: - Examples: - - .. testcode:: + .. testcode:: - import numpy as np - from megengine import tensor - import megengine.functional as F + import numpy as np + from megengine import tensor + import megengine.functional as F - x = tensor(np.arange(0, 6, dtype=np.int32).reshape(2, 3)) - out = F.left_shift(x, 2) - print(out.numpy()) + x = tensor(np.arange(0, 6, dtype=np.int32).reshape(2, 3)) + out = F.left_shift(x, 2) + print(out.numpy()) - Outputs: + Outputs: - .. testoutput:: + .. 
testoutput:: - [[ 0 4 8] - [12 16 20]] + [[ 0 4 8] + [12 16 20]] """ return _elwise(x, y, mode=Elemwise.Mode.SHL) def right_shift(x, y): - """Element-wise `bitwise binary: x >> y`.""" + r"""Element-wise `bitwise binary: x >> y`.""" return _elwise(x, y, mode=Elemwise.Mode.SHR) @@ -394,22 +365,22 @@ def right_shift(x, y): def logical_and(x, y): - """Element-wise `logical and: x && y`.""" + r"""Element-wise `logical and: x && y`.""" return _elwise(x, y, mode=Elemwise.Mode.AND) def logical_not(x): - """Element-wise `logical not: ~x`.""" + r"""Element-wise `logical not: ~x`.""" return _elwise(x, mode=Elemwise.Mode.NOT) def logical_or(x, y): - """Element-wise `logical or: x || y`.""" + r"""Element-wise `logical or: x || y`.""" return _elwise(x, y, mode=Elemwise.Mode.OR) def logical_xor(x, y): - """Element-wise `logical xor: x ^ y`.""" + r"""Element-wise `logical xor: x ^ y`.""" return _elwise(x, y, mode=Elemwise.Mode.XOR) @@ -417,59 +388,53 @@ def logical_xor(x, y): def equal(x, y): - """ - Element-wise `(x == y)`. - - :param x: input tensor 1. - :param y: input tensor 2. - :return: computed tensor. + r"""Element-wise `(x == y)`. Examples: - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) - y = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) - out = F.equal(x, y) - print(out.numpy()) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) + y = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) + out = F.equal(x, y) + print(out.numpy()) - .. testoutput:: + Outputs: - [[1. 1. 1.] - [1. 1. 1.]] + .. testoutput:: + [[1. 1. 1.] + [1. 1. 1.]] """ return _elwise(x, y, mode=Elemwise.Mode.EQ) def not_equal(x, y): - """Element-wise `(x != y)`.""" + r"""Element-wise `(x != y)`.""" return x != y def less(x, y): - """Element-wise `(x < y)`.""" + r"""Element-wise `(x < y)`.""" return _elwise(x, y, mode=Elemwise.Mode.LT) def less_equal(x, y): - """Element-wise `(x <= y)`.""" + r"""Element-wise `(x <= y)`.""" return _elwise(x, y, mode=Elemwise.Mode.LEQ) def greater(x, y): - """Element-wise `(x > y)`.""" + r"""Element-wise `(x > y)`.""" return _elwise(y, x, mode=Elemwise.Mode.LT) def greater_equal(x, y): - """Element-wise `(x >= y)`.""" + r"""Element-wise `(x >= y)`.""" return _elwise(y, x, mode=Elemwise.Mode.LEQ) @@ -477,43 +442,45 @@ def greater_equal(x, y): def clip(x: Tensor, lower=None, upper=None) -> Tensor: - r""" - Clamps all elements in input tensor into the range `[` :attr:`lower`, :attr:`upper` `]` and returns + r"""Clamps all elements in input tensor into the range ``[ lower, upper ]`` and returns a resulting tensor: .. math:: + y_i = \begin{cases} \text{lower} & \text{if } x_i < \text{lower} \\ x_i & \text{if } \text{lower} \leq x_i \leq \text{upper} \\ \text{upper} & \text{if } x_i > \text{upper} \end{cases} - :param x: input tensor. - :param lower: lower-bound of the range to be clamped to. - :param upper: upper-bound of the range to be clamped to. - :return: output clamped tensor. + Args: + x: input tensor. + lower: lower-bound of the range to be clamped to. + upper: upper-bound of the range to be clamped to. - Examples: + Returns: + output clamped tensor. - .. testcode:: + Examples: - import numpy as np - from megengine import tensor - import megengine.functional as F + .. 
testcode:: - a = tensor(np.arange(5).astype(np.int32)) - print(F.clip(a, 2, 4).numpy()) - print(F.clip(a, lower=3).numpy()) - print(F.clip(a, upper=3).numpy()) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + a = tensor(np.arange(5).astype(np.int32)) + print(F.clip(a, 2, 4).numpy()) + print(F.clip(a, lower=3).numpy()) + print(F.clip(a, upper=3).numpy()) - .. testoutput:: + Outputs: - [2 2 2 3 4] - [3 3 3 3 4] - [0 1 2 3 3] + .. testoutput:: + [2 2 2 3 4] + [3 3 3 3 4] + [0 1 2 3 3] """ assert ( lower is not None or upper is not None diff --git a/imperative/python/megengine/functional/external.py b/imperative/python/megengine/functional/external.py index 8fba2e0c..44fffc27 100644 --- a/imperative/python/megengine/functional/external.py +++ b/imperative/python/megengine/functional/external.py @@ -23,14 +23,14 @@ def tensorrt_runtime_opr(inputs, *, data: bytes = None): def cambricon_runtime_opr(inputs, data, symbol, tensor_dim_mutable): - r""" - Load a serialized Cambricon model as a runtime operator in MegEngine. - - :param inputs: list of input tensors. - :param data: the serialized Cambricon model. - :param symbol: name of the function in Cambricon model. - :param tensor_dim_mutable: whether the input tensors' shapes are mutable - in ``cnrtModel_t``. + r"""Load a serialized Cambricon model as a runtime operator in MegEngine. + + Args: + inputs: list of input tensors. + data: the serialized Cambricon model. + symbol: name of the function in Cambricon model. + tensor_dim_mutable: whether the input tensors' shapes are mutable + in ``cnrtModel_t``. """ op = builtin.CambriconRuntime(data, len(data), symbol, tensor_dim_mutable) @@ -38,11 +38,11 @@ def cambricon_runtime_opr(inputs, data, symbol, tensor_dim_mutable): def atlas_runtime_opr(inputs, data): - r""" - Load a serialized Atlas model as a runtime operator in MegEngine. + r"""Load a serialized Atlas model as a runtime operator in MegEngine. - :param inputs: list of input tensors. - :param data: the serialized Atlas model. + Args: + inputs: list of input tensors. + data: the serialized Atlas model. """ op = builtin.AtlasRuntime(data, len(data)) diff --git a/imperative/python/megengine/functional/loss.py b/imperative/python/megengine/functional/loss.py index d62e240f..655c5d0b 100644 --- a/imperative/python/megengine/functional/loss.py +++ b/imperative/python/megengine/functional/loss.py @@ -26,9 +26,7 @@ __all__ = [ def _reduce_output(loss_fn): - r""" - Wrapper to apply canonical reductions to loss outputs. - """ + r"""Wrapper to apply canonical reductions to loss outputs.""" @functools.wraps(loss_fn) def reduced_loss_fn(*args, reduction="mean", **kwargs): @@ -45,13 +43,14 @@ def _reduce_output(loss_fn): @_reduce_output def l1_loss(pred: Tensor, label: Tensor, reduction: str = "mean") -> Tensor: - r""" - Calculates the mean absolute error (MAE) between + r"""Calculates the mean absolute error (MAE) between each element in the pred :math:`x` and label :math:`y`. The mean absolute error can be described as: - .. math:: \ell(x,y) = mean\left(L \right) + .. math:: + + \ell(x,y) = mean\left(L \right) where @@ -63,30 +62,32 @@ def l1_loss(pred: Tensor, label: Tensor, reduction: str = "mean") -> Tensor: :math:`x` and :math:`y` are tensors of arbitrary shapes with a total of :math:`N` elements each. :math:`N` is the batch size. - :param pred: predicted result from model. - :param label: ground truth to compare. - :param reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. 
Default: 'mean' - :return: loss value. + Args: + pred: predicted result from model. + label: ground truth to compare. + reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean' - Examples: + Returns: + loss value. - .. testcode:: + Examples: - import numpy as np - import megengine as mge - import megengine.functional as F + .. testcode:: - ipt = mge.tensor(np.array([3, 3, 3, 3]).astype(np.float32)) - tgt = mge.tensor(np.array([2, 8, 6, 1]).astype(np.float32)) - loss = F.nn.l1_loss(ipt, tgt) - print(loss.numpy()) + import numpy as np + import megengine as mge + import megengine.functional as F - Outputs: + ipt = mge.tensor(np.array([3, 3, 3, 3]).astype(np.float32)) + tgt = mge.tensor(np.array([2, 8, 6, 1]).astype(np.float32)) + loss = F.nn.l1_loss(ipt, tgt) + print(loss.numpy()) - .. testoutput:: + Outputs: - 2.75 + .. testoutput:: + 2.75 """ diff = pred - label return abs(diff) @@ -94,53 +95,56 @@ def l1_loss(pred: Tensor, label: Tensor, reduction: str = "mean") -> Tensor: @_reduce_output def square_loss(pred: Tensor, label: Tensor, reduction: str = "mean") -> Tensor: - r""" - Calculates the mean squared error (squared L2 norm) between + r"""Calculates the mean squared error (squared L2 norm) between each element in the pred :math:`x` and label :math:`y`. The mean squared error can be described as: - .. math:: \ell(x, y) = mean\left( L \right) + .. math:: + + \ell(x, y) = mean\left( L \right) where .. math:: - L = \{l_1,\dots,l_N\}, \quad - l_n = \left( x_n - y_n \right)^2, + L = \{l_1,\dots,l_N\}, \quad + l_n = \left( x_n - y_n \right)^2, :math:`x` and :math:`y` are tensors of arbitrary shapes with a total of :math:`N` elements each. :math:`N` is the batch size. - :param pred: predicted result from model. - :param label: ground truth to compare. - :param reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean' - :return: loss value. + Args: + pred: predicted result from model. + label: ground truth to compare. + reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean' + + Returns: + loss value. Shape: - - pred: :math:`(N, *)` where :math:`*` means any number of additional - dimensions. - - label: :math:`(N, *)`. Same shape as ``pred``. + * pred: :math:`(N, *)` where :math:`*` means any number of additional + dimensions. + * label: :math:`(N, *)`. Same shape as ``pred``. Examples: - .. testcode:: - - import numpy as np - import megengine as mge - import megengine.functional as F + .. testcode:: - ipt = mge.tensor(np.array([3, 3, 3, 3]).astype(np.float32)) - tgt = mge.tensor(np.array([2, 8, 6, 1]).astype(np.float32)) - loss = F.nn.square_loss(ipt, tgt) - print(loss.numpy()) + import numpy as np + import megengine as mge + import megengine.functional as F - Outputs: + ipt = mge.tensor(np.array([3, 3, 3, 3]).astype(np.float32)) + tgt = mge.tensor(np.array([2, 8, 6, 1]).astype(np.float32)) + loss = F.nn.square_loss(ipt, tgt) + print(loss.numpy()) - .. testoutput:: + Outputs: - 9.75 + .. testoutput:: + 9.75 """ diff = pred - label return diff ** 2 @@ -155,8 +159,7 @@ def cross_entropy( label_smooth: float = 0, reduction: str = "mean", ) -> Tensor: - r""" - Computes the multi-class cross entropy loss (using logits by default). + r"""Computes the multi-class cross entropy loss (using logits by default). By default(``with_logitis`` is True), ``pred`` is assumed to be logits, class probabilities are given by softmax. 
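
A minimal sketch of the two call paths described above for ``F.nn.cross_entropy``: the default logits path, and the equivalent ``with_logits=False`` path where class probabilities are computed by hand. It only uses APIs that appear elsewhere in this diff (``F.exp``, ``F.sum``, ``mge.tensor``), so treat it as an illustration rather than part of the patched docstrings.

.. code-block::

    import numpy as np
    import megengine as mge
    import megengine.functional as F

    logits = mge.tensor(np.zeros((1, 2), dtype=np.float32))  # two classes, uniform scores
    label = mge.tensor(np.ones((1,), dtype=np.int32))

    # default path: pass logits, softmax is applied internally
    loss_from_logits = F.nn.cross_entropy(logits, label)

    # equivalent path: pass probabilities and disable the internal softmax
    probs = F.exp(logits) / F.sum(F.exp(logits), axis=1, keepdims=True)
    loss_from_probs = F.nn.cross_entropy(probs, label, with_logits=False)

    print(loss_from_logits.numpy().round(4), loss_from_probs.numpy().round(4))

Both calls print ``0.6931`` (``-log(0.5)`` for two equally likely classes), matching the value in the function's own example.
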
@@ -170,35 +173,37 @@ def cross_entropy( where :math:`y^{LS}` and :math:`y` are new label distribution and origin label distribution respectively. k is the index of label distribution. :math:`\alpha` is ``label_smooth`` and :math:`K` is the number of classes. - :param pred: input tensor representing the predicted probability. - :param label: input tensor representing the classification label. - :param axis: an axis along which softmax will be applied. Default: 1 - :param with_logits: whether to apply softmax first. Default: True - :param label_smooth: a label smoothing of parameter that can re-distribute target distribution. Default: 0 - :param reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean' - :return: loss value. + Args: + pred: input tensor representing the predicted probability. + label: input tensor representing the classification label. + axis: an axis along which softmax will be applied. Default: 1 + with_logits: whether to apply softmax first. Default: True + label_smooth: a label smoothing of parameter that can re-distribute target distribution. Default: 0 + reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean' - Examples: + Returns: + loss value. - .. testcode:: + Examples: - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - data_shape = (1, 2) - label_shape = (1, ) - pred = tensor(np.array([0, 0], dtype=np.float32).reshape(data_shape)) - label = tensor(np.ones(label_shape, dtype=np.int32)) - loss = F.nn.cross_entropy(pred, label) - print(loss.numpy().round(decimals=4)) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + data_shape = (1, 2) + label_shape = (1, ) + pred = tensor(np.array([0, 0], dtype=np.float32).reshape(data_shape)) + label = tensor(np.ones(label_shape, dtype=np.int32)) + loss = F.nn.cross_entropy(pred, label) + print(loss.numpy().round(decimals=4)) - .. testoutput:: + Outputs: - 0.6931 + .. testoutput:: + 0.6931 """ n0 = pred.ndim n1 = label.ndim @@ -226,37 +231,38 @@ def cross_entropy( def binary_cross_entropy( pred: Tensor, label: Tensor, with_logits: bool = True, reduction: str = "mean", ) -> Tensor: - r""" - Computes the binary cross entropy loss (using logits by default). + r"""Computes the binary cross entropy loss (using logits by default). By default(``with_logitis`` is True), ``pred`` is assumed to be logits, class probabilities are given by sigmoid. - :param pred: `(N, *)`, where `*` means any number of additional dimensions. - :param label: `(N, *)`, same shape as the input. - :param with_logits: bool, whether to apply sigmoid first. Default: True - :param reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean' - :return: loss value. + Args: + pred: `(N, *)`, where `*` means any number of additional dimensions. + label: `(N, *)`, same shape as the input. + with_logits: bool, whether to apply sigmoid first. Default: True + reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean' - Examples: + Returns: + loss value. - .. testcode:: + Examples: - import numpy as np - from megengine import tensor - import megengine.functional as F + .. 
testcode:: - pred = tensor(np.array([0, 0], dtype=np.float32).reshape(1, 2)) - label = tensor(np.ones((1, 2), dtype=np.float32)) - loss = F.nn.binary_cross_entropy(pred, label) - print(loss.numpy().round(decimals=4)) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + pred = tensor(np.array([0, 0], dtype=np.float32).reshape(1, 2)) + label = tensor(np.ones((1, 2), dtype=np.float32)) + loss = F.nn.binary_cross_entropy(pred, label) + print(loss.numpy().round(decimals=4)) - .. testoutput:: + Outputs: - 0.6931 + .. testoutput:: + 0.6931 """ if not with_logits: return -(label * log(pred) + (1 - label) * log(1 - pred)) @@ -269,37 +275,38 @@ def binary_cross_entropy( def hinge_loss( pred: Tensor, label: Tensor, norm: str = "L1", reduction: str = "mean" ) -> Tensor: - r""" - Caculates the hinge loss which is often used in SVM. + r"""Caculates the hinge loss which is often used in SVM. The hinge loss can be described as: .. math:: loss(x, y) = \frac{1}{N}\sum_i\sum_j(max(0, 1 - x_{ij}*y_{ij})) - :param pred: input tensor representing the predicted probability, shape is `(N, C)`. - :param label: input tensor representing the binary classification label, shape is `(N, C)`. - :param norm: specify the norm to caculate the loss, should be "L1" or "L2". - :param reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean' - :return: loss value. + Args: + pred: input tensor representing the predicted probability, shape is `(N, C)`. + label: input tensor representing the binary classification label, shape is `(N, C)`. + norm: specify the norm to caculate the loss, should be "L1" or "L2". + reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean' - Examples: + Returns: + loss value. - .. testcode:: + Examples: - from megengine import tensor - import megengine.functional as F + .. testcode:: - pred = tensor([[0.5, -0.5, 0.1], [-0.6, 0.7, 0.8]], dtype="float32") - label = tensor([[1, -1, -1], [-1, 1, 1]], dtype="float32") - loss = F.nn.hinge_loss(pred, label) - print(loss.numpy()) + from megengine import tensor + import megengine.functional as F - Outputs: + pred = tensor([[0.5, -0.5, 0.1], [-0.6, 0.7, 0.8]], dtype="float32") + label = tensor([[1, -1, -1], [-1, 1, 1]], dtype="float32") + loss = F.nn.hinge_loss(pred, label) + print(loss.numpy()) - .. testoutput:: + Outputs: - 1.5 + .. testoutput:: + 1.5 """ norm = norm.upper() assert norm in ["L1", "L2"], "norm must be L1 or L2" diff --git a/imperative/python/megengine/functional/math.py b/imperative/python/megengine/functional/math.py index 12614e4f..70e316ea 100644 --- a/imperative/python/megengine/functional/math.py +++ b/imperative/python/megengine/functional/math.py @@ -51,82 +51,85 @@ __all__ = [ def isnan(inp: Tensor) -> Tensor: - r""" - Returns a new tensor representing if each element is ``NaN`` or not. + r"""Returns a new tensor representing if each element is ``NaN`` or not. - :param inp: input tensor. - :return: result tensor. + Args: + inp: input tensor. - Examples: + Returns: + result tensor. - .. testcode:: + Examples: - from megengine import tensor - import megengine.functional as F + .. testcode:: - x = tensor([1, float("nan"), 0]) - print(F.isnan(x).numpy()) + from megengine import tensor + import megengine.functional as F - Outputs: + x = tensor([1, float("nan"), 0]) + print(F.isnan(x).numpy()) - .. testoutput:: + Outputs: - [False True False] + .. 
testoutput:: + [False True False] """ return inp != inp def isinf(inp: Tensor) -> Tensor: - r""" - Returns a new tensor representing if each element is ``Inf`` or not. + r"""Returns a new tensor representing if each element is ``Inf`` or not. - :param inp: input tensor. - :return: result tensor. + Args: + inp: input tensor. - Examples: + Returns: + result tensor. - .. testcode:: + Examples: - from megengine import tensor - import megengine.functional as F + .. testcode:: - x = tensor([1, float("inf"), 0]) - print(F.isinf(x).numpy()) + from megengine import tensor + import megengine.functional as F - Outputs: + x = tensor([1, float("inf"), 0]) + print(F.isinf(x).numpy()) - .. testoutput:: + Outputs: - [False True False] + .. testoutput:: + [False True False] """ return abs(inp).astype("float32") == float("inf") def sign(inp: Tensor): - r""" - Returns a new tensor representing the sign of each element in input tensor. + r"""Returns a new tensor representing the sign of each element in input tensor. - :param: input tensor. - :return: the sign of input tensor. + Args: + inp: Tensor: - Examples: + Returns: + the sign of input tensor. - .. testcode:: + Examples: - from megengine import tensor - import megengine.functional as F + .. testcode:: - x = tensor([1, -1, 0]) - print(F.sign(x).numpy()) + from megengine import tensor + import megengine.functional as F - Outputs: + x = tensor([1, -1, 0]) + print(F.sign(x).numpy()) - .. testoutput:: + Outputs: - [ 1 -1 0] + .. testoutput:: + [ 1 -1 0] """ return (inp > 0).astype(inp.dtype) - (inp < 0).astype(inp.dtype) @@ -136,35 +139,36 @@ def sum( axis: Optional[Union[int, Sequence[int]]] = None, keepdims: bool = False, ) -> Tensor: - r""" - Returns the sum of input tensor along given axis. If axis is a list of dimensions, + r"""Returns the sum of input tensor along given axis. If axis is a list of dimensions, reduce over all of them. - :param inp: input tensor. - :param axis: dimension to reduce. If None, all dimensions will be reduced. - Default: None - :param keepdims: whether the output tensor has axis retained or not. - Default: False - :return: output tensor. + Args: + inp: input tensor. + axis: dimension to reduce. If None, all dimensions will be reduced. + Default: None + keepdims: whether the output tensor has axis retained or not. + Default: False - Examples: + Returns: + output tensor. - .. testcode:: + Examples: - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2, 3)) - out = F.sum(x) - print(out.numpy()) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2, 3)) + out = F.sum(x) + print(out.numpy()) - .. testoutput:: + Outputs: - 21 + .. testoutput:: + 21 """ return inp.sum(axis=axis, keepdims=keepdims) @@ -172,33 +176,34 @@ def sum( def prod( inp: Tensor, axis: Optional[Union[int, Sequence[int]]] = None, keepdims=False ) -> Tensor: - r""" - Returns the product of input tensor along given axis. If axis is a list of dimensions, + r"""Returns the product of input tensor along given axis. If axis is a list of dimensions, reduce over all of them. - :param inp: input tensor. - :param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None - :param keepdims: whether the output tensor has axis retained or not. Default: False - :return: output tensor. + Args: + inp: input tensor. + axis: dimension to reduce. 
If None, all dimensions will be reduced. Default: None + keepdims: whether the output tensor has axis retained or not. Default: False - Examples: + Returns: + output tensor. - .. testcode:: + Examples: - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2, 3)) - out = F.prod(x) - print(out.numpy()) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2, 3)) + out = F.prod(x) + print(out.numpy()) - .. testoutput:: + Outputs: - 720 + .. testoutput:: + 720 """ return inp.prod(axis=axis, keepdims=keepdims) @@ -208,34 +213,35 @@ def mean( axis: Optional[Union[int, Sequence[int]]] = None, keepdims: bool = False, ) -> Tensor: - """ - Returns the mean value of input tensor along + r"""Returns the mean value of input tensor along given axis. If axis is a list of dimensions, reduce over all of them. - :param inp: input tensor. - :param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None - :param keepdims: whether the output tensor has axis retained or not. Default: False - :return: output tensor. + Args: + inp: input tensor. + axis: dimension to reduce. If None, all dimensions will be reduced. Default: None + keepdims: whether the output tensor has axis retained or not. Default: False - Examples: + Returns: + output tensor. - .. testcode:: + Examples: - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2, 3)) - out = F.mean(x) - print(out.numpy()) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2, 3)) + out = F.mean(x) + print(out.numpy()) - .. testoutput:: + Outputs: - 3.5 + .. testoutput:: + 3.5 """ return inp.mean(axis=axis, keepdims=keepdims) @@ -245,33 +251,35 @@ def var( axis: Optional[Union[int, Sequence[int]]] = None, keepdims: bool = False, ) -> Tensor: - """ - Returns the variance value of input tensor along + r"""Returns the variance value of input tensor along given axis. If axis is a list of dimensions, reduce over all of them. - :param inp: input tensor. - :param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None - :param keepdims: whether the output tensor has axis retained or not. Default: False - :return: output tensor. + Args: + inp: input tensor. + axis: dimension to reduce. If None, all dimensions will be reduced. Default: None + keepdims: whether the output tensor has axis retained or not. Default: False + + Returns: + output tensor. Examples: - .. testcode:: + .. testcode:: - import numpy as np - from megengine import tensor - import megengine.functional as F + import numpy as np + from megengine import tensor + import megengine.functional as F - data = tensor(np.arange(1, 7, dtype=np.float32).reshape(2, 3)) - out = F.var(data) - print(out.numpy().round(decimals=4)) + data = tensor(np.arange(1, 7, dtype=np.float32).reshape(2, 3)) + out = F.var(data) + print(out.numpy().round(decimals=4)) - Outputs: + Outputs: - .. testoutput:: + .. 
testoutput:: - 2.9167 + 2.9167 """ if axis is None: m = mean(inp, axis=axis, keepdims=False) @@ -286,33 +294,35 @@ def std( axis: Optional[Union[int, Sequence[int]]] = None, keepdims: bool = False, ) -> Tensor: - """ - Returns the standard deviation of input tensor along + r"""Returns the standard deviation of input tensor along given axis. If axis is a list of dimensions, reduce over all of them. - :param inp: input tensor. - :param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None - :param keepdims: whether the output tensor has axis retained or not. Default: False - :return: output tensor. + Args: + inp: input tensor. + axis: dimension to reduce. If None, all dimensions will be reduced. Default: None + keepdims: whether the output tensor has axis retained or not. Default: False + + Returns: + output tensor. Examples: - .. testcode:: + .. testcode:: - import numpy as np - from megengine import tensor - import megengine.functional as F + import numpy as np + from megengine import tensor + import megengine.functional as F - data = tensor(np.arange(1, 7, dtype=np.float32).reshape(2, 3)) - out = F.std(data, axis=1) - print(out.numpy().round(decimals=4)) + data = tensor(np.arange(1, 7, dtype=np.float32).reshape(2, 3)) + out = F.std(data, axis=1) + print(out.numpy().round(decimals=4)) - Outputs: + Outputs: - .. testoutput:: + .. testoutput:: - [0.8165 0.8165] + [0.8165 0.8165] """ return var(inp, axis=axis, keepdims=keepdims) ** 0.5 @@ -322,34 +332,35 @@ def min( axis: Optional[Union[int, Sequence[int]]] = None, keepdims: bool = False, ) -> Tensor: - r""" - Returns the min value of input tensor along + r"""Returns the min value of input tensor along given axis. If axis is a list of dimensions, reduce over all of them. - :param inp: input tensor. - :param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None - :param keepdims: whether the output tensor has axis retained or not. Default: False - :return: output tensor. + Args: + inp: input tensor. + axis: dimension to reduce. If None, all dimensions will be reduced. Default: None + keepdims: whether the output tensor has axis retained or not. Default: False - Examples: + Returns: + output tensor. - .. testcode:: + Examples: - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3)) - out = F.min(x) - print(out.numpy()) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3)) + out = F.min(x) + print(out.numpy()) - .. testoutput:: + Outputs: - 1 + .. testoutput:: + 1 """ return inp.min(axis=axis, keepdims=keepdims) @@ -359,34 +370,35 @@ def max( axis: Optional[Union[int, Sequence[int]]] = None, keepdims: bool = False, ) -> Tensor: - r""" - Returns the max value of the input tensor along + r"""Returns the max value of the input tensor along given axis. If axis is a list of dimensions, reduce over all of them. - :param inp: input tensor. - :param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None - :param keepdims: whether the output tensor has axis retained or not. Default: False - :return: output tensor. + Args: + inp: input tensor. + axis: dimension to reduce. If None, all dimensions will be reduced. Default: None + keepdims: whether the output tensor has axis retained or not. Default: False - Examples: + Returns: + output tensor. - .. 
testcode:: + Examples: - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3)) - out = F.max(x) - print(out.numpy()) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3)) + out = F.max(x) + print(out.numpy()) - .. testoutput:: + Outputs: - 6 + .. testoutput:: + 6 """ return inp.max(axis=axis, keepdims=keepdims) @@ -394,34 +406,35 @@ def max( def norm( inp: Tensor, ord: float = None, axis: int = None, keepdims=False, ): - """ - Calculates ``p``-norm of input tensor along + r"""Calculates ``p``-norm of input tensor along given axis. - :param inp: input tensor. - :param ord: power of value applied to inp. Default: 2 - :param axis: dimension to reduce. If None, input must be a vector. Default: None - :param keepdims: whether the output tensor has axis retained or not. Default: False - :return: output tensor. + Args: + inp: input tensor. + ord: power of value applied to inp. Default: 2 + axis: dimension to reduce. If None, input must be a vector. Default: None + keepdims: whether the output tensor has axis retained or not. Default: False - Examples: + Returns: + output tensor. - .. testcode:: + Examples: - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - x = tensor(np.arange(-3, 3, dtype=np.float32)) - out = F.norm(x) - print(out.numpy().round(decimals=4)) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + x = tensor(np.arange(-3, 3, dtype=np.float32)) + out = F.norm(x) + print(out.numpy().round(decimals=4)) - .. testoutput:: + Outputs: - 4.3589 + .. testoutput:: + 4.3589 """ if axis is None: if inp.ndim != 1: @@ -442,34 +455,35 @@ def argmin( axis: Optional[Union[int, Sequence[int]]] = None, keepdims: bool = False, ) -> Tensor: - r""" - Returns the indices of the minimum values along + r"""Returns the indices of the minimum values along given axis. If axis is a list of dimensions, reduce over all of them. - :param inp: input tensor. - :param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None - :param keepdims: whether the output tensor has axis retained or not. Default: False - :return: output tensor. + Args: + inp: input tensor. + axis: dimension to reduce. If None, all dimensions will be reduced. Default: None + keepdims: whether the output tensor has axis retained or not. Default: False - Examples: + Returns: + output tensor. - .. testcode:: + Examples: - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3)) - out = F.argmin(x) - print(out.numpy()) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3)) + out = F.argmin(x) + print(out.numpy()) - .. testoutput:: + Outputs: - 0 + .. testoutput:: + 0 """ if axis is None: assert not keepdims, "can not set axis=None and keepdims=True" @@ -500,34 +514,35 @@ def argmax( axis: Optional[Union[int, Sequence[int]]] = None, keepdims: bool = False, ) -> Tensor: - r""" - Returns the indices of the maximum values along + r"""Returns the indices of the maximum values along given axis. If axis is a list of dimensions, reduce over all of them. - :param inp: input tensor. - :param axis: dimension to reduce. 
If None, all dimensions will be reduced. Default: None
-    :param keepdims: whether the output tensor has axis retained or not. Default: False
-    :return: output tensor.
+    Args:
+        inp: input tensor.
+        axis: dimension to reduce. If None, all dimensions will be reduced. Default: None
+        keepdims: whether the output tensor has axis retained or not. Default: False
 
-    Examples:
+    Returns:
+        output tensor.
 
-    .. testcode::
+    Examples:
 
-        import numpy as np
-        from megengine import tensor
-        import megengine.functional as F
+        .. testcode::
 
-        x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3))
-        out = F.argmax(x)
-        print(out.numpy())
+            import numpy as np
+            from megengine import tensor
+            import megengine.functional as F
 
-    Outputs:
+            x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3))
+            out = F.argmax(x)
+            print(out.numpy())
 
-    .. testoutput::
+        Outputs:
 
-        5
+        .. testoutput::
 
+            5
     """
     if axis is None:
         assert not keepdims, "can not set axis=None and keepdims=True"
@@ -556,8 +571,7 @@ def argmax(
 def normalize(
     inp: Tensor, ord: float = None, axis: int = None, eps: float = 1e-12,
 ) -> Tensor:
-    r"""
-    Performs :math:`L_p` normalization of input tensor along
+    r"""Performs :math:`L_p` normalization of input tensor along
     given axis.
 
     For a tensor of shape :math:`(n_0, ..., n_{dim}, ..., n_k)`, each
@@ -566,11 +580,14 @@ def normalize(
     .. math::
         v = \frac{v}{\max(\lVert v \rVert_p, \epsilon)}.
 
-    :param inp: input tensor.
-    :param ord: power of value applied to input tensor. Default: 2
-    :param axis: dimension to reduce.If None, input must be a vector. Default: None
-    :param eps: a small value to avoid division by zero. Default: 1e-12
-    :return: normalized output tensor.
+    Args:
+        inp: input tensor.
+        ord: power of value applied to input tensor. Default: 2
+        axis: dimension to reduce. If None, input must be a vector. Default: None
+        eps: a small value to avoid division by zero. Default: 1e-12
+
+    Returns:
+        normalized output tensor.
     """
     if axis is None:
         return inp / clip(norm(inp, ord, axis), lower=eps)
@@ -579,31 +596,34 @@ def normalize(
 
 
 def argsort(inp: Tensor, descending: bool = False) -> Tensor:
-    r"""
-    Returns the indices that would sort the input tensor.
+    r"""Returns the indices that would sort the input tensor.
 
-    :param inp: input tensor. If it's 2d, the result would be array of indices show how to sort each row in the input tensor.
-    :param descending: sort in descending order, where the largest comes first. Default: False
-    :return: indices of int32 indicates how to sort the input.
+    Args:
+        inp: input tensor. If it's 2d, the result would be an array of indices showing how to sort each row of the input tensor.
+        descending: sort in descending order, where the largest comes first. Default: False
 
-    Examples:
+    Returns:
+        int32 indices indicating how to sort the input.
 
-    .. testcode::
+    Examples:
 
-        import numpy as np
-        from megengine import tensor
-        import megengine.functional as F
+        .. testcode::
 
-        x = tensor(np.array([1,2], dtype=np.float32))
-        indices = F.argsort(x)
-        print(indices.numpy())
+            import numpy as np
+            from megengine import tensor
+            import megengine.functional as F
 
-    Outputs:
+            x = tensor(np.array([1,2], dtype=np.float32))
+            indices = F.argsort(x)
+            print(indices.numpy())
 
-    .. testoutput::
+        Outputs:
 
-        [0 1]
+        .. 
testoutput:: + [0 1] """ assert len(inp.shape) <= 2, "Input should be 1d or 2d" if descending: @@ -621,31 +641,32 @@ def argsort(inp: Tensor, descending: bool = False) -> Tensor: def sort(inp: Tensor, descending: bool = False) -> Tuple[Tensor, Tensor]: - r""" - Returns sorted tensor and the indices would sort the input tensor. + r"""Returns sorted tensor and the indices would sort the input tensor. - :param inp: input tensor. If it's 2d, the result would be sorted by row. - :param descending: sort in descending order, where the largest comes first. Default: False - :return: tuple of two tensors `(sorted_tensor, indices_of_int32)`. + Args: + inp: input tensor. If it's 2d, the result would be sorted by row. + descending: sort in descending order, where the largest comes first. Default: False - Examples: + Returns: + tuple of two tensors `(sorted_tensor, indices_of_int32)`. - .. testcode:: + Examples: - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - x = tensor(np.array([1,2], dtype=np.float32)) - out, indices = F.sort(x) - print(out.numpy()) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + x = tensor(np.array([1,2], dtype=np.float32)) + out, indices = F.sort(x) + print(out.numpy()) - .. testoutput:: + Outputs: - [1. 2.] + .. testoutput:: + [1. 2.] """ assert len(inp.shape) <= 2, "Input should be 1d or 2d" if descending: @@ -669,34 +690,35 @@ def topk( kth_only: bool = False, no_sort: bool = False, ) -> Tuple[Tensor, Tensor]: - r""" - Selects the ``Top-K`` (by default) smallest elements of 2d matrix by row. + r"""Selects the ``Top-K`` (by default) smallest elements of 2d matrix by row. - :param inp: input tensor. If input tensor is 2d, each row will be sorted. - :param k: number of elements needed. - :param descending: if True, return the largest elements instead. Default: False - :param kth_only: if True, only the k-th element will be returned. Default: False - :param no_sort: if True, the returned elements can be unordered. Default: False - :return: tuple of two tensors ``(topk_tensor, indices_of_int32)`` + Args: + inp: input tensor. If input tensor is 2d, each row will be sorted. + k: number of elements needed. + descending: if True, return the largest elements instead. Default: False + kth_only: if True, only the k-th element will be returned. Default: False + no_sort: if True, the returned elements can be unordered. Default: False - Examples: + Returns: + tuple of two tensors ``(topk_tensor, indices_of_int32)`` - .. testcode:: + Examples: - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - x = tensor(np.array([2, 4, 6, 8, 7, 5, 3, 1], dtype=np.float32)) - top, indices = F.topk(x, 5) - print(top.numpy(), indices.numpy()) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + x = tensor(np.array([2, 4, 6, 8, 7, 5, 3, 1], dtype=np.float32)) + top, indices = F.topk(x, 5) + print(top.numpy(), indices.numpy()) - .. testoutput:: + Outputs: - [1. 2. 3. 4. 5.] [7 0 6 1 5] + .. testoutput:: + [1. 2. 3. 4. 5.] [7 0 6 1 5] """ if descending: k = -k @@ -736,31 +758,33 @@ def topk( def matinv(inp: Tensor) -> Tensor: - """ - Computes the inverse of a batch of matrices; input must has shape [..., n, n]. + r"""Computes the inverse of a batch of matrices; input must has shape [..., n, n]. - :param inp: input tensor. - :return: output tensor. + Args: + inp: input tensor. 
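The reworked ``normalize`` docstring earlier in this hunk gains a Returns block but still has no example; a minimal sketch of per-row L2 normalization (illustrative only, with the expected values noted in comments rather than as doctest output, and assuming ``normalize`` stays exported as ``F.normalize``):

.. code-block::

    import numpy as np
    from megengine import tensor
    import megengine.functional as F

    x = tensor(np.array([[3.0, 4.0], [6.0, 8.0]], dtype=np.float32))
    # each row is divided by max(||row||_2, eps)
    y = F.normalize(x, ord=2, axis=1)
    print(y.numpy())  # both rows come out as [0.6 0.8]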
- Examples: + Returns: + output tensor. - .. testcode:: - import numpy as np - from megengine import tensor - import megengine.functional as F + Examples: - data = tensor([[1.0, 0.0], [1.0, 1.0]]) - out = F.matinv(data) - print(out.numpy()) + .. testcode:: - Outputs: + import numpy as np + from megengine import tensor + import megengine.functional as F - .. testoutput:: + data = tensor([[1.0, 0.0], [1.0, 1.0]]) + out = F.matinv(data) + print(out.numpy()) - [[ 1. 0.] - [-1. 1.]] + Outputs: + .. testoutput:: + + [[ 1. 0.] + [-1. 1.]] """ (result,) = apply(builtin.MatrixInverse(), inp) @@ -998,46 +1022,47 @@ def matmul( compute_mode="default", format="default", ) -> Tensor: - """ - Performs a matrix multiplication of the matrices ``inp1`` and ``inp2``. + r"""Performs a matrix multiplication of the matrices ``inp1`` and ``inp2``. With different inputs dim, this function behaves differently: - - Both 1-D tensor, simply forward to ``dot``. - - Both 2-D tensor, normal matrix multiplication. - - If one input tensor is 1-D, matrix vector multiplication. - - If at least one tensor are 3-dimensional or >3-dimensional, the other tensor should have dim >= 2, - the batched matrix-matrix is returned, and the tensor with smaller dimension will be broadcasted. - For example: + * Both 1-D tensor, simply forward to ``dot``. + * Both 2-D tensor, normal matrix multiplication. + * If one input tensor is 1-D, matrix vector multiplication. + * If at least one tensor are 3-dimensional or >3-dimensional, the other tensor should have dim >= 2, + the batched matrix-matrix is returned, and the tensor with smaller dimension will be broadcasted. + For example: - - inp1: `(n, k, m)`, inp2: `(n, m, p)`, return: `(n, k, p)` - - inp1: `(n, k, m)`, inp2: `(m, p)`, return: `(n, k, p)` - - inp1: `(n, j, k, m)`, inp2: `(n, j, m, p)`, return: `(n, j, k, p)` + * inp1: `(n, k, m)`, inp2: `(n, m, p)`, return: `(n, k, p)` + * inp1: `(n, k, m)`, inp2: `(m, p)`, return: `(n, k, p)` + * inp1: `(n, j, k, m)`, inp2: `(n, j, m, p)`, return: `(n, j, k, p)` - :param inp1: first matrix to be multiplied. - :param inp2: second matrix to be multiplied. - :return: output tensor. + Args: + inp1: first matrix to be multiplied. + inp2: second matrix to be multiplied. - Examples: + Returns: + output tensor. - .. testcode:: + Examples: - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - data1 = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) - data2 = tensor(np.arange(0, 6, dtype=np.float32).reshape(3, 2)) - out = F.matmul(data1, data2) - print(out.numpy()) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + data1 = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) + data2 = tensor(np.arange(0, 6, dtype=np.float32).reshape(3, 2)) + out = F.matmul(data1, data2) + print(out.numpy()) - .. testoutput:: + Outputs: - [[10. 13.] - [28. 40.]] + .. testoutput:: + [[10. 13.] + [28. 40.]] """ if amp._enabled: compute_mode = "float32" @@ -1085,34 +1110,35 @@ def matmul( def dot(inp1: Tensor, inp2: Tensor) -> Tensor: - """ - Computes dot-product of two vectors ``inp1`` and ``inp2``. + r"""Computes dot-product of two vectors ``inp1`` and ``inp2``. inputs must be 1-dimensional or scalar. A scalar input is automatically broadcasted. Refer to :func:`~.matmul` for more general usage. - :param inp1: first vector. - :param inp2: second vector. - :return: output value. + Args: + inp1: first vector. + inp2: second vector. 
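The broadcasting bullets added to ``matmul`` above are only exercised by a plain 2-D example; a short sketch of the batched case, using the shapes the docstring names (``(n, k, m)`` with ``(m, p)`` giving ``(n, k, p)``):

.. code-block::

    import numpy as np
    from megengine import tensor
    import megengine.functional as F

    a = tensor(np.ones((4, 2, 3), dtype=np.float32))  # (n, k, m)
    b = tensor(np.ones((3, 5), dtype=np.float32))     # (m, p), broadcast over n
    out = F.matmul(a, b)
    print(out.shape)  # (4, 2, 5)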
- Examples: + Returns: + output value. - .. testcode:: + Examples: - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - data1 = tensor(np.arange(0, 6, dtype=np.float32)) - data2 = tensor(np.arange(0, 6, dtype=np.float32)) - out = F.dot(data1, data2) - print(out.numpy()) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + data1 = tensor(np.arange(0, 6, dtype=np.float32)) + data2 = tensor(np.arange(0, 6, dtype=np.float32)) + out = F.dot(data1, data2) + print(out.numpy()) - .. testoutput:: + Outputs: - 55. + .. testoutput:: + 55. """ op = builtin.Dot() assert ( @@ -1124,30 +1150,31 @@ def dot(inp1: Tensor, inp2: Tensor) -> Tensor: def svd(inp: Tensor, full_matrices=False, compute_uv=True) -> Tensor: - """ - Computes the singular value decompositions of input matrix. + r"""Computes the singular value decompositions of input matrix. - :param inp: input matrix, must has shape `[..., M, N]`. - :return: output matrices, `(U, sigma, V)`. + Args: + inp: input matrix, must has shape `[..., M, N]`. - Examples: + Returns: + output matrices, `(U, sigma, V)`. - .. testcode:: + Examples: - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2,3)) - _, y, _ = F.svd(x) - print(y.numpy().round(decimals=3)) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2,3)) + _, y, _ = F.svd(x) + print(y.numpy().round(decimals=3)) - .. testoutput:: + Outputs: - [7.348 1. ] + .. testoutput:: + [7.348 1. ] """ op = builtin.SVD(full_matrices=full_matrices, compute_uv=compute_uv) U, sigma, V = apply(op, inp) @@ -1155,11 +1182,13 @@ def svd(inp: Tensor, full_matrices=False, compute_uv=True) -> Tensor: def _has_inf(inp: Tensor) -> Tensor: - """ - Check whether input contains infinite value. + r"""Check whether input contains infinite value. + + Args: + inp: a tensor to be checked. - :param inp: a tensor to be checked. - :return: a int32 scalar tensor, 0 for False and 1 for True. + Returns: + a int32 scalar tensor, 0 for False and 1 for True. """ op = builtin.CheckHasInf() (oup,) = apply(op, inp.reshape(-1).astype("float32")) diff --git a/imperative/python/megengine/functional/metric.py b/imperative/python/megengine/functional/metric.py index 6eb2ccdc..77dd63af 100644 --- a/imperative/python/megengine/functional/metric.py +++ b/imperative/python/megengine/functional/metric.py @@ -19,33 +19,16 @@ from .tensor import broadcast_to, transpose def topk_accuracy( logits: Tensor, target: Tensor, topk: Union[int, Iterable[int]] = 1 ) -> Union[Tensor, Iterable[Tensor]]: - r""" - Calculates the classification accuracy given predicted logits and ground-truth labels. + r"""Calculates the classification accuracy given predicted logits and ground-truth labels. - :param logits: model predictions of shape `[batch_size, num_classes]`, - representing the probability (likelyhood) of each class. - :param target: ground-truth labels, 1d tensor of int32. - :param topk: specifies the topk values, could be an int or tuple of ints. Default: 1 - :return: tensor(s) of classification accuracy between 0.0 and 1.0. + Args: + logits: model predictions of shape `[batch_size, num_classes]`, + representing the probability (likelyhood) of each class. + target: ground-truth labels, 1d tensor of int32. 
+ topk: specifies the topk values, could be an int or tuple of ints. Default: 1 - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - - logits = tensor(np.arange(80, dtype=np.int32).reshape(8,10)) - target = tensor(np.arange(8, dtype=np.int32)) - top1, top5 = F.metric.topk_accuracy(logits, target, (1, 5)) - print(top1.numpy(), top5.numpy()) - - Outputs: - - .. testoutput:: - - 0.0 0.375 + Returns: + tensor(s) of classification accuracy between 0.0 and 1.0. """ if isinstance(topk, int): topk = (topk,) diff --git a/imperative/python/megengine/functional/nn.py b/imperative/python/megengine/functional/nn.py index b5f4dbde..9d2e9634 100644 --- a/imperative/python/megengine/functional/nn.py +++ b/imperative/python/megengine/functional/nn.py @@ -103,15 +103,14 @@ def expand_hw(x): def linear( inp: Tensor, weight: Tensor, bias: Optional[Tensor] = None, compute_mode="default", ) -> Tensor: - """ - Applies a linear transformation to the input tensor. + r"""Applies a linear transformation to the input tensor. Refer to :class:`~.module.linear.Linear` for more information. - :param inp: input tensor with shape `(N, in_features)`. - :param weight: weight with shape `(out_features, in_features)`. - :param bias: bias with shape `(out_features,)`. - Default: None + Args: + inp: input tensor with shape `(N, in_features)`. + weight: weight with shape `(out_features, in_features)`. + bias: bias with shape `(out_features,)`. Default: None """ ret = matmul(inp, weight, transpose_b=True, compute_mode=compute_mode) if bias is not None: @@ -132,32 +131,29 @@ def conv1d( conv_mode="cross_correlation", compute_mode="default", ) -> Tensor: - """1D convolution operation. + r"""1D convolution operation. Refer to :class:`~.Conv1d` for more information. - :param inp: The feature map of the convolution operation - :param weight: The convolution kernel. - :param bias: The bias added to the result of convolution (if given) - :param stride: Stride of the 1D convolution operation. Default: 1 - :param padding: Size of the paddings added to the input on both sides of its - spatial dimensions. Only zero-padding is supported. Default: 0 - :param dilation: Dilation of the 1D convolution operation. Default: 1 - :param groups: number of groups to divide input and output channels into, - so as to perform a "grouped convolution". When ``groups`` is not 1, - ``in_channels`` and ``out_channels`` must be divisible by ``groups``, - and the shape of weight should be ``(groups, out_channel // groups, - in_channels // groups, kernel_size)``. Default: 1 - :type conv_mode: string or :class:`mgb.opr_param_defs.Convolution.Mode` - :param conv_mode: Supports 'cross_correlation'. Default: - 'cross_correlation'. - :type compute_mode: string or - :class:`mgb.opr_param_defs.Convolution.ComputeMode` - :param compute_mode: When set to 'default', no special requirements will be - placed on the precision of intermediate results. When set to 'float32', - float32 would be used for accumulator and intermediate result, but only - effective when input and output are of float16 dtype. - + Args: + inp: The feature map of the convolution operation + weight: The convolution kernel. + bias: The bias added to the result of convolution (if given) + stride: Stride of the 1D convolution operation. Default: 1 + padding: Size of the paddings added to the input on both sides of its + spatial dimensions. Only zero-padding is supported. Default: 0 + dilation: Dilation of the 1D convolution operation. 
Default: 1 + groups: number of groups to divide input and output channels into, + so as to perform a "grouped convolution". When ``groups`` is not 1, + ``in_channels`` and ``out_channels`` must be divisible by ``groups``, + and the shape of weight should be ``(groups, out_channel // groups, + in_channels // groups, kernel_size)``. Default: 1 + conv_mode: Supports 'cross_correlation'. Default: + 'cross_correlation'. + compute_mode: When set to 'default', no special requirements will be + placed on the precision of intermediate results. When set to 'float32', + float32 would be used for accumulator and intermediate result, but only + effective when input and output are of float16 dtype. """ assert ( conv_mode.lower() == "cross_correlation" @@ -217,33 +213,31 @@ def conv2d( conv_mode="cross_correlation", compute_mode="default", ) -> Tensor: - """ - 2D convolution operation. + r"""2D convolution operation. Refer to :class:`~.module.Conv2d` for more information. - :param inp: feature map of the convolution operation. - :param weight: convolution kernel. - :param bias: bias added to the result of convolution (if given). - :param stride: stride of the 2D convolution operation. Default: 1 - :param padding: size of the paddings added to the input on both sides of its - spatial dimensions. Only zero-padding is supported. Default: 0 - :param dilation: dilation of the 2D convolution operation. Default: 1 - :param groups: number of groups into which the input and output channels are divided, - so as to perform a ``grouped convolution``. When ``groups`` is not 1, - ``in_channels`` and ``out_channels`` must be divisible by ``groups``, - and the shape of weight should be ``(groups, out_channel // groups, - in_channels // groups, height, width)``. Default: 1 - :type conv_mode: string or :class:`Convolution.Mode` - :param conv_mode: supports "cross_correlation". Default: - "cross_correlation" - :type compute_mode: string or - :class:`Convolution.ComputeMode` - :param compute_mode: when set to "default", no special requirements will be - placed on the precision of intermediate results. When set to "float32", - "float32" would be used for accumulator and intermediate result, but only - effective when input and output are of float16 dtype. - :return: output tensor. + Args: + inp: feature map of the convolution operation. + weight: convolution kernel. + bias: bias added to the result of convolution (if given). + stride: stride of the 2D convolution operation. Default: 1 + padding: size of the paddings added to the input on both sides of its + spatial dimensions. Only zero-padding is supported. Default: 0 + dilation: dilation of the 2D convolution operation. Default: 1 + groups: number of groups into which the input and output channels are divided, + so as to perform a ``grouped convolution``. When ``groups`` is not 1, + ``in_channels`` and ``out_channels`` must be divisible by ``groups``, + and the shape of weight should be ``(groups, out_channel // groups, + in_channels // groups, height, width)``. Default: 1 + conv_mode: supports "cross_correlation". Default: "cross_correlation" + compute_mode: when set to "default", no special requirements will be + placed on the precision of intermediate results. When set to "float32", + "float32" would be used for accumulator and intermediate result, but only + effective when input and output are of float16 dtype. + + Returns: + output tensor. 
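Neither ``conv1d`` nor ``conv2d`` gains an example in this docstring pass; a minimal NCHW sketch, assuming ``conv2d`` is re-exported at the ``megengine.functional`` top level like the other operators in this module:

.. code-block::

    import numpy as np
    from megengine import tensor
    import megengine.functional as F

    inp = tensor(np.random.randn(1, 3, 8, 8).astype(np.float32))      # (N, C, H, W)
    weight = tensor(np.random.randn(16, 3, 3, 3).astype(np.float32))  # (out_ch, in_ch, kh, kw)
    out = F.conv2d(inp, weight, stride=1, padding=1)
    print(out.shape)  # (1, 16, 8, 8)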
""" assert ( conv_mode.lower() == "cross_correlation" @@ -292,26 +286,27 @@ def conv3d( groups: int = 1, conv_mode: str = "cross_correlation", ) -> Tensor: - """ - 3D convolution operation. + r"""3D convolution operation. Refer to :class:`~.Conv3d` for more information. - :param inp: feature map of the convolution operation. - :param weight: convolution kernel. - :param bias: bias added to the result of convolution (if given). - :param stride: stride of the 3D convolution operation. Default: 1 - :param padding: size of the paddings added to the input on both sides of its - spatial dimensions. Only zero-padding is supported. Default: 0 - :param dilation: dilation of the 3D convolution operation. Default: 1 - :param groups: number of groups into which the input and output channels are divided, - so as to perform a ``grouped convolution``. When ``groups`` is not 1, - ``in_channels`` and ``out_channels`` must be divisible by ``groups``, - and the shape of weight should be ``(groups, out_channel // groups, - in_channels // groups, depth, height, width)``. Default: 1 - :param conv_mode: supports "cross_correlation". Default: - "cross_correlation" - :return: output tensor. + Args: + inp: feature map of the convolution operation. + weight: convolution kernel. + bias: bias added to the result of convolution (if given). + stride: stride of the 3D convolution operation. Default: 1 + padding: size of the paddings added to the input on both sides of its + spatial dimensions. Only zero-padding is supported. Default: 0 + dilation: dilation of the 3D convolution operation. Default: 1 + groups: number of groups into which the input and output channels are divided, + so as to perform a ``grouped convolution``. When ``groups`` is not 1, + ``in_channels`` and ``out_channels`` must be divisible by ``groups``, + and the shape of weight should be ``(groups, out_channel // groups, + in_channels // groups, depth, height, width)``. Default: 1 + conv_mode: supports "cross_correlation". Default: "cross_correlation" + + Returns: + output tensor. """ assert conv_mode.lower() == "cross_correlation" @@ -359,33 +354,31 @@ def conv_transpose2d( conv_mode="cross_correlation", compute_mode="default", ) -> Tensor: - """ - 2D transposed convolution operation. + r"""2D transposed convolution operation. Refer to :class:`~.ConvTranspose2d` for more information. - :param inp: feature map of the convolution operation. - :param weight: convolution kernel. - :param bias: bias added to the result of convolution (if given). - :param stride: stride of the 2D convolution operation. Default: 1 - :param padding: size of the paddings added to the input on both sides of its - spatial dimensions. Only zero-padding is supported. Default: 0 - :param dilation: dilation of the 2D convolution operation. Default: 1 - :param groups: number of groups into which the input and output channels are divided, - so as to perform a ``grouped convolution``. When ``groups`` is not 1, - ``in_channels`` and ``out_channels`` must be divisible by groups, - and the shape of weight should be ``(groups, in_channels // groups, - out_channels // groups, height, width)``. Default: 1 - :type conv_mode: string or :class:`Convolution.Mode` - :param conv_mode: supports "cross_correlation". Default: - "cross_correlation" - :type compute_mode: string or - :class:`Convolution.ComputeMode` - :param compute_mode: when set to "default", no special requirements will be - placed on the precision of intermediate results. 
When set to "float32", - "float32" would be used for accumulator and intermediate result, but only - effective when input and output are of float16 dtype. - :return: output tensor. + Args: + inp: feature map of the convolution operation. + weight: convolution kernel. + bias: bias added to the result of convolution (if given). + stride: stride of the 2D convolution operation. Default: 1 + padding: size of the paddings added to the input on both sides of its + spatial dimensions. Only zero-padding is supported. Default: 0 + dilation: dilation of the 2D convolution operation. Default: 1 + groups: number of groups into which the input and output channels are divided, + so as to perform a ``grouped convolution``. When ``groups`` is not 1, + ``in_channels`` and ``out_channels`` must be divisible by groups, + and the shape of weight should be ``(groups, in_channels // groups, + out_channels // groups, height, width)``. Default: 1 + conv_mode: supports "cross_correlation". Default: "cross_correlation" + compute_mode: when set to "default", no special requirements will be + placed on the precision of intermediate results. When set to "float32", + "float32" would be used for accumulator and intermediate result, but only + effective when input and output are of float16 dtype. + + Returns: + output tensor. """ assert ( conv_mode.lower() == "cross_correlation" @@ -437,33 +430,31 @@ def deformable_conv2d( conv_mode="cross_correlation", compute_mode="default", ) -> Tensor: - """ - Deformable Convolution. - - :param inp: input feature map. - :param weight: convolution kernel. - :param offset: input offset to kernel, channel of this tensor should match the deformable settings. - :param mask: input mask to kernel, channel of this tensor should match the deformable settings. - :param bias: bias added to the result of convolution (if given). - :param stride: stride of the 2D convolution operation. Default: 1 - :param padding: size of the paddings added to the input on both sides of its - spatial dimensions. Only zero-padding is supported. Default: 0 - :param dilation: dilation of the 2D convolution operation. Default: 1 - :param groups: number of groups into which the input and output channels are divided, - so as to perform a ``grouped convolution``. When ``groups`` is not 1, - ``in_channels`` and ``out_channels`` must be divisible by groups, - and the shape of weight should be ``(groups, out_channel // groups, - in_channels // groups, height, width)``. Default: 1 - :type conv_mode: string or :class:`Convolution.Mode` - :param conv_mode: supports "cross_correlation". Default: - "cross_correlation" - :type compute_mode: string or - :class:`Convolution.ComputeMode` - :param compute_mode: when set to "default", no special requirements will be - placed on the precision of intermediate results. When set to "float32", - "float32" would be used for accumulator and intermediate result, but only - effective when input and output are of float16 dtype. - :return: output tensor. + r"""Deformable Convolution. + + Args: + inp: input feature map. + weight: convolution kernel. + offset: input offset to kernel, channel of this tensor should match the deformable settings. + mask: input mask to kernel, channel of this tensor should match the deformable settings. + bias: bias added to the result of convolution (if given). + stride: stride of the 2D convolution operation. Default: 1 + padding: size of the paddings added to the input on both sides of its + spatial dimensions. Only zero-padding is supported. 
Default: 0 + dilation: dilation of the 2D convolution operation. Default: 1 + groups: number of groups into which the input and output channels are divided, + so as to perform a ``grouped convolution``. When ``groups`` is not 1, + ``in_channels`` and ``out_channels`` must be divisible by groups, + and the shape of weight should be ``(groups, out_channel // groups, + in_channels // groups, height, width)``. Default: 1 + conv_mode: supports "cross_correlation". Default: "cross_correlation" + compute_mode: when set to "default", no special requirements will be + placed on the precision of intermediate results. When set to "float32", + "float32" would be used for accumulator and intermediate result, but only + effective when input and output are of float16 dtype. + + Returns: + output tensor. """ assert ( conv_mode.lower() == "cross_correlation" @@ -508,7 +499,7 @@ def local_conv2d( dilation: Union[int, Tuple[int, int]] = 1, conv_mode="cross_correlation", ): - """Applies spatial 2D convolution over an groupped channeled image with untied kernels.""" + r"""Applies spatial 2D convolution over an groupped channeled image with untied kernels.""" assert ( conv_mode.lower() == "cross_correlation" or conv_mode.name == "CROSS_CORRELATION" @@ -548,21 +539,23 @@ def conv_transpose3d( padding: Union[int, Tuple[int, int, int]] = 0, dilation: Union[int, Tuple[int, int, int]] = 1, ) -> Tensor: - """ - 3D transposed convolution operation. Only support the case that groups = 1 + r"""3D transposed convolution operation. Only support the case that groups = 1 and conv_mode = "cross_correlation". Refer to :class:`~.ConvTranspose3d` for more information. - :param inp: feature map of the convolution operation. - :param weight: convolution kernel. - weight usually has shape ``(in_channels, out_channels, depth, height, width)``. - :param bias: bias added to the result of convolution (if given). - :param stride: stride of the 3D convolution operation. Default: 1 - :param padding: size of the paddings added to the input on all sides of its - spatial dimensions. Only zero-padding is supported. Default: 0 - :param dilation: dilation of the 3D convolution operation. Default: 1 - :return: output tensor. + Args: + inp: feature map of the convolution operation. + weight: convolution kernel. + weight usually has shape ``(in_channels, out_channels, depth, height, width)``. + bias: bias added to the result of convolution (if given). + stride: stride of the 3D convolution operation. Default: 1 + padding: size of the paddings added to the input on all sides of its + spatial dimensions. Only zero-padding is supported. Default: 0 + dilation: dilation of the 3D convolution operation. Default: 1 + + Returns: + output tensor. """ D, H, W = 0, 1, 2 pad = _triple(padding) @@ -599,17 +592,19 @@ def max_pool2d( stride: Optional[Union[int, Tuple[int, int]]] = None, padding: Union[int, Tuple[int, int]] = 0, ) -> Tensor: - """ - Applies a 2D max pooling over an input tensor. + r"""Applies a 2D max pooling over an input tensor. Refer to :class:`~.MaxPool2d` for more information. - :param inp: input tensor. - :param kernel_size: size of the window. - :param stride: stride of the window. If not provided, its value is set to kernel_size. - Default: None - :param padding: implicit zero padding added on both sides. Default: 0 - :return: output tensor. + Args: + inp: input tensor. + kernel_size: size of the window. + stride: stride of the window. If not provided, its value is set to kernel_size. 
+ Default: None + padding: implicit zero padding added on both sides. Default: 0 + + Returns: + output tensor. """ if stride is None: stride = kernel_size @@ -637,19 +632,21 @@ def avg_pool2d( padding: Union[int, Tuple[int, int]] = 0, mode: str = "average_count_exclude_padding", ) -> Tensor: - """ - Applies 2D average pooling over an input tensor. + r"""Applies 2D average pooling over an input tensor. Refer to :class:`~.AvgPool2d` for more information. - :param inp: input tensor. - :param kernel_size: size of the window. - :param stride: stride of the window. If not provided, its value is set to ``kernel_size``. - Default: None - :param padding: implicit zero padding added on both sides. Default: 0 - :param mode: whether to count padding values, set to "average" will do counting. - Default: "average_count_exclude_padding" - :return: output tensor. + Args: + inp: input tensor. + kernel_size: size of the window. + stride: stride of the window. If not provided, its value is set to ``kernel_size``. + Default: None + padding: implicit zero padding added on both sides. Default: 0 + mode: whether to count padding values, set to "average" will do counting. + Default: "average_count_exclude_padding" + + Returns: + output tensor. """ if stride is None: stride = kernel_size @@ -673,14 +670,16 @@ def avg_pool2d( def adaptive_max_pool2d( inp: Tensor, oshp: Union[Tuple[int, int], int, Tensor], ) -> Tensor: - """ - Applies a 2D max adaptive pooling over an input. + r"""Applies a 2D max adaptive pooling over an input. Refer to :class:`~.MaxAdaptivePool2d` for more information. - :param inp: input tensor. - :param oshp: `(OH, OW)` size of the output shape. - :return: output tensor. + Args: + inp: input tensor. + oshp: OH, OW)` size of the output shape. + + Returns: + output tensor. """ if isinstance(oshp, int): oshp = (oshp, oshp) @@ -694,14 +693,16 @@ def adaptive_max_pool2d( def adaptive_avg_pool2d( inp: Tensor, oshp: Union[Tuple[int, int], int, Tensor], ) -> Tensor: - """ - Applies a 2D average adaptive pooling over an input. + r"""Applies a 2D average adaptive pooling over an input. Refer to :class:`~.AvgAdaptivePool2d` for more information. - :param inp: input tensor. - :param oshp: `(OH, OW)` size of the output shape. - :return: output tensor. + Args: + inp: input tensor. + oshp: OH, OW)` size of the output shape. + + Returns: + output tensor. """ if isinstance(oshp, int): oshp = (oshp, oshp) @@ -724,19 +725,19 @@ def deformable_psroi_pooling( spatial_scale: float, trans_std: float = 0.1, ): - """ - Deformable PSROI(Position Sensitive Region of Interest) Pooling. - - :param inp: input feature map. - :param rois: the rois for feature pooling. - :param trans: input offset to psroi_pooling. - :param no_trans: check the phase of DeformablePSROIPooling. False to the - 1st phase, True to the 2nd phase. - :param part_size: part size. - :param sample_per_part: sample points of each part. - :param pooled_shape: kernel shape of convolution. - :param spatial_scale: the spatial_scale w.r.t input image. - :param trans_std: multiplier used in 2nd phase. + r"""Deformable PSROI(Position Sensitive Region of Interest) Pooling. + + Args: + inp: input feature map. + rois: the rois for feature pooling. + trans: input offset to psroi_pooling. + no_trans: check the phase of DeformablePSROIPooling. False to the + 1st phase, True to the 2nd phase. + part_size: part size. + sample_per_part: sample points of each part. + pooled_shape: kernel shape of convolution. + spatial_scale: the spatial_scale w.r.t input image. 
+ trans_std: multiplier used in 2nd phase. """ op = builtin.DeformablePSROIPooling( no_trans=no_trans, @@ -752,55 +753,50 @@ def deformable_psroi_pooling( def hswish(x): - """ - Element-wise `x * relu6(x + 3) / 6`. - - :param x: input tensor. - :return: computed tensor. + r"""Element-wise `x * relu6(x + 3) / 6`. Example: - .. testcode:: + .. testcode:: - import numpy as np - from megengine import tensor - import megengine.functional as F + import numpy as np + from megengine import tensor + import megengine.functional as F - x = tensor(np.arange(5).astype(np.float32)) - out = F.hswish(x) - print(out.numpy().round(decimals=4)) + x = tensor(np.arange(5).astype(np.float32)) + out = F.hswish(x) + print(out.numpy().round(decimals=4)) - .. testoutput:: + .. testoutput:: - [0. 0.6667 1.6667 3. 4. ] + [0. 0.6667 1.6667 3. 4. ] """ return _elwise(x, mode=Elemwise.Mode.H_SWISH) def sigmoid(x): - """Element-wise `1 / ( 1 + exp( -x ) )`.""" + r"""Element-wise `1 / ( 1 + exp( -x ) )`.""" return _elwise(x, mode=Elemwise.Mode.SIGMOID) def hsigmoid(x): - """Element-wise `relu6(x + 3) / 6`.""" + r"""Element-wise `relu6(x + 3) / 6`.""" return relu6(x + 3) / 6 def relu(x): - """Element-wise `max(x, 0)`.""" + r"""Element-wise `max(x, 0)`.""" return _elwise(x, mode=Elemwise.Mode.RELU) def relu6(x): - """Element-wise `min(max(x, 0), 6)`.""" + r"""Element-wise `min(max(x, 0), 6)`.""" return minimum(maximum(x, 0), 6) def prelu(inp: Tensor, weight: Tensor) -> Tensor: - r""" - Applies the element-wise PReLU function. + r"""Elememt-wise PReLU function. Refer to :class:`~.PReLU` for more information. """ @@ -808,8 +804,7 @@ def prelu(inp: Tensor, weight: Tensor) -> Tensor: def leaky_relu(inp: Tensor, negative_slope: float = 0.01) -> Tensor: - r""" - Applies the element-wise leaky_relu function + r"""Element-wose LeakyReLU function Refer to :class:`~.LeakyReLU` for more information. """ @@ -817,15 +812,12 @@ def leaky_relu(inp: Tensor, negative_slope: float = 0.01) -> Tensor: def silu(x): - r""" - Applies the element-wise Sigmoid Linear Unit function, i.e. `x * sigmoid(x)`. - """ + r"""Applies the element-wise Sigmoid Linear Unit function, i.e. `x * sigmoid(x)`.""" return _elwise(x, mode=Elemwise.Mode.SILU) def gelu(x): - r""" - Applies the element-wise function: + r"""Applies the element-wise function: .. math:: \text{gelu}(x) = x\Phi(x) @@ -836,8 +828,7 @@ def gelu(x): def softplus(inp: Tensor) -> Tensor: - r""" - Applies the element-wise function: + r"""Applies the element-wise function: .. math:: \text{softplus}(x) = \log(1 + \exp(x)) @@ -851,33 +842,29 @@ def softplus(inp: Tensor) -> Tensor: = \log(1 + \exp(-\text{abs}(x))) + \max(x, 0) = \log1p(\exp(-\text{abs}(x))) + \text{relu}(x) - :param inp: input tensor. + Examples: - Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - x = tensor(np.arange(-3, 3, dtype=np.float32)) - y = F.softplus(x) - print(y.numpy().round(decimals=4)) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + x = tensor(np.arange(-3, 3, dtype=np.float32)) + y = F.softplus(x) + print(y.numpy().round(decimals=4)) - .. testoutput:: + Outputs: - [0.0486 0.1269 0.3133 0.6931 1.3133 2.1269] + .. 
testoutput:: + [0.0486 0.1269 0.3133 0.6931 1.3133 2.1269] """ return log1p(exp(-abs(inp))) + relu(inp) def logsoftmax(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor: - r""" - Applies the :math:`\log(\text{softmax}(x))` function to an n-dimensional + r"""Applies the :math:`\log(\text{softmax}(x))` function to an n-dimensional input tensor. The :math:`\text{logsoftmax}(x)` formulation can be simplified as: .. math:: @@ -891,35 +878,30 @@ def logsoftmax(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor: = x - \log (\sum_{i}(\exp (x_{i}))) = x - \text{logsumexp}(x) - :param inp: input tensor. - :param axis: axis along which :math:`\text{logsoftmax}(x)` will be applied. - Examples: - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - x = tensor(np.arange(-5, 5, dtype=np.float32)).reshape(2,5) - y = F.logsoftmax(x, axis=1) - print(y.numpy().round(decimals=4)) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + x = tensor(np.arange(-5, 5, dtype=np.float32)).reshape(2,5) + y = F.logsoftmax(x, axis=1) + print(y.numpy().round(decimals=4)) - .. testoutput:: + Outputs: - [[-4.4519 -3.4519 -2.4519 -1.4519 -0.4519] - [-4.4519 -3.4519 -2.4519 -1.4519 -0.4519]] + .. testoutput:: + [[-4.4519 -3.4519 -2.4519 -1.4519 -0.4519] + [-4.4519 -3.4519 -2.4519 -1.4519 -0.4519]] """ return inp - logsumexp(inp, axis, keepdims=True) def logsigmoid(inp: Tensor) -> Tensor: - r""" - Applies the element-wise function: + r"""Applies the element-wise function: .. math:: \text{logsigmoid}(x) = \log(\frac{ 1 }{ 1 + \exp(-x)}) @@ -927,27 +909,24 @@ def logsigmoid(inp: Tensor) -> Tensor: = - \log(1 + \exp(-x)) = - \text{softplus}(-x) - :param inp: input tensor. - Examples: - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - x = tensor(np.arange(-5, 5, dtype=np.float32)) - y = F.logsigmoid(x) - print(y.numpy().round(decimals=4)) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + x = tensor(np.arange(-5, 5, dtype=np.float32)) + y = F.logsigmoid(x) + print(y.numpy().round(decimals=4)) - .. testoutput:: + Outputs: - [-5.0067 -4.0182 -3.0486 -2.1269 -1.3133 -0.6931 -0.3133 -0.1269 -0.0486 - -0.0181] + .. testoutput:: + [-5.0067 -4.0182 -3.0486 -2.1269 -1.3133 -0.6931 -0.3133 -0.1269 -0.0486 + -0.0181] """ return -softplus(-inp) @@ -955,8 +934,7 @@ def logsigmoid(inp: Tensor) -> Tensor: def logsumexp( inp: Tensor, axis: Union[int, Sequence[int]], keepdims: bool = False ) -> Tensor: - r""" - Calculates the logarithm of the inputs' exponential sum along the given :attr:`axis`. + r"""Calculates the logarithm of the inputs' exponential sum along the given :attr:`axis`. .. math:: @@ -974,28 +952,23 @@ def logsumexp( .. math:: b = \max(x_j) - :param inp: input tensor. - :param axis: axis over which the sum is taken. It could be single axis or list of axes. - :param keepdims: whether to retain :attr:`axis` or not for the output tensor. - Examples: - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F + .. 
testcode:: - x = tensor(np.arange(-5, 5, dtype=np.float32)).reshape(2,5) - y = F.logsumexp(x, axis=1, keepdims=False) - print(y.numpy().round(decimals=4)) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + x = tensor(np.arange(-5, 5, dtype=np.float32)).reshape(2,5) + y = F.logsumexp(x, axis=1, keepdims=False) + print(y.numpy().round(decimals=4)) - .. testoutput:: + Outputs: - [-0.5481 4.4519] + .. testoutput:: + [-0.5481 4.4519] """ max_value = max(inp.detach(), axis, keepdims=True) if keepdims: @@ -1013,8 +986,7 @@ def _get_softmax_axis(ndim: int) -> int: def softmax(inp: Tensor, axis: Optional[int] = None) -> Tensor: - r""" - Applies a :math:`\text{softmax}(x)` function. :math:`\text{softmax}(x)` is defined as: + r"""Applies a :math:`\text{softmax}(x)` function. :math:`\text{softmax}(x)` is defined as: .. math:: \text{softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)} @@ -1022,31 +994,26 @@ def softmax(inp: Tensor, axis: Optional[int] = None) -> Tensor: It is applied to all elements along axis, and rescales elements so that they stay in the range `[0, 1]` and sum to 1. - See :class:`~megengine.module.activation.Softmax` for more details. - - :param inp: input tensor. - :param axis: an axis along which :math:`\text{softmax}(x)` will be applied. By default, - :math:`\text{softmax}(x)` will apply along the highest ranked axis. + See :class:`~.module.Softmax` for more details. Examples: - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - x = tensor(np.arange(-5, 5, dtype=np.float32)).reshape(2,5) - out = F.softmax(x) - print(out.numpy().round(decimals=4)) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + x = tensor(np.arange(-5, 5, dtype=np.float32)).reshape(2,5) + out = F.softmax(x) + print(out.numpy().round(decimals=4)) - .. testoutput:: + Outputs: - [[0.0117 0.0317 0.0861 0.2341 0.6364] - [0.0117 0.0317 0.0861 0.2341 0.6364]] + .. testoutput:: + [[0.0117 0.0317 0.0861 0.2341 0.6364] + [0.0117 0.0317 0.0861 0.2341 0.6364]] """ if axis is None: axis = _get_softmax_axis(len(inp.shape)) @@ -1069,28 +1036,25 @@ def batch_norm( inplace: bool = True, compute_mode="default" ): - r""" - Applies batch normalization to the input. + r"""Applies batch normalization to the input. Refer to :class:`~.BatchNorm2d` and :class:`~.BatchNorm1d` for more information. - :param inp: input tensor. - :param running_mean: tensor to store running mean. - :param running_var: tensor to store running variance. - :param weight: scaling tensor in the learnable affine parameters. - See :math:`\gamma` in :class:`~.BatchNorm2d`. - :param bias: bias tensor in the learnable affine parameters. - See :math:`\beta` in :class:`~.BatchNorm2d`. - :param training: a boolean value to indicate whether batch norm is performed - in training mode. Default: False - :param momentum: value used for the ``running_mean`` and ``running_var`` - computation. - Default: 0.9 - :param eps: a value added to the denominator for numerical stability. - Default: 1e-5 - :param inplace: whether to update ``running_mean`` and ``running_var`` inplace or return new tensors - Default: True - :return: output tensor. + Args: + inp: input tensor. + running_mean: tensor to store running mean. + running_var: tensor to store running variance. + weight: scaling tensor in the learnable affine parameters. + See :math:`\gamma` in :class:`~.BatchNorm2d`. 
+ bias: bias tensor in the learnable affine parameters. + See :math:`\beta` in :class:`~.BatchNorm2d`. + training: a boolean value to indicate whether batch norm is performed + in training mode. Default: False + momentum: value used for the ``running_mean`` and ``running_var`` + computation. Default: 0.9 + eps: a value added to the denominator for numerical stability. Default: 1e-5 + inplace: whether to update ``running_mean`` and ``running_var`` + inplace or return new tensors. Default: True """ if inp.ndim != 4: raise NotImplementedError("batch_norm for ndim != 4") @@ -1282,30 +1246,28 @@ def sync_batch_norm( eps_mode="additive", group=WORLD, ) -> Tensor: - r""" - Applies synchronized batch normalization to the input. + r"""Applies synchronized batch normalization to the input. Refer to :class:`~.BatchNorm2d` and :class:`~.BatchNorm1d` for more information. - :param inp: input tensor. - :param running_mean: tensor to store running mean. - :param running_var: tensor to store running variance. - :param weight: scaling tensor in the learnable affine parameters. - See :math:`\gamma` in :class:`~.BatchNorm2d`. - :param bias: bias tensor in the learnable affine parameters. - See :math:`\beta` in :class:`~.BatchNorm2d`. - :param training: a boolean value to indicate whether batch norm is performed - in traning mode. Default: False - :param momentum: value used for the ``running_mean`` and ``running_var`` - computation. - Default: 0.9 - :param eps: a value added to the denominator for numerical stability. - Default: 1e-5 - :param eps_mode: mode of calculation for eps, "max" or "additive". - Default: "additive" - :param group: communication group, caculate mean and variance between this group. - Default: :obj:`~megengine.distributed.WORLD` - :return: output tensor. + Args: + inp: input tensor. + running_mean: tensor to store running mean. + running_var: tensor to store running variance. + weight: scaling tensor in the learnable affine parameters. + See :math:`\gamma` in :class:`~.BatchNorm2d`. + bias: bias tensor in the learnable affine parameters. + See :math:`\beta` in :class:`~.BatchNorm2d`. + training: a boolean value to indicate whether batch norm is performed + in traning mode. Default: False + momentum: value used for the ``running_mean`` and ``running_var`` + computation. Default: 0.9 + eps: a value added to the denominator for numerical stability. + Default: 1e-5 + eps_mode: mode of calculation for eps, "max" or "additive". + Default: "additive" + group: communication group, caculate mean and variance between this group. + Default: :obj:`~megengine.distributed.WORLD` """ assert eps_mode.lower() in {"max", "additive"}, "unknown eps_mode: {}".format( eps_mode @@ -1392,40 +1354,40 @@ def sync_batch_norm( def dropout(inp: Tensor, drop_prob: float, training: bool = True) -> Tensor: - """ - Returns a new tensor where each of the elements are randomly set to zero + r"""Returns a new tensor where each of the elements are randomly set to zero with probability P = ``drop_prob``. Optionally rescale the output tensor if ``training`` is True. - :param inp: input tensor. - :param drop_prob: probability to drop (set to zero) a single element. - :param training: the default behavior of ``dropout`` during training is to rescale the output, - then it can be replaced by an :class:`~.Identity` during inference. Default: True - :return: the output tensor + Args: + inp: input tensor. + drop_prob: probability to drop (set to zero) a single element. 
+ training: the default behavior of ``dropout`` during training is to rescale the output, + then it can be replaced by an :class:`~.Identity` during inference. Default: True + Returns: + the ouput tensor Examples: - .. testcode:: + .. testcode:: - import numpy as np - from megengine import tensor - import megengine.functional as F + import numpy as np + from megengine import tensor + import megengine.functional as F - # test training mode - data = tensor(np.ones(10000000, dtype=np.float32)) - out = F.nn.dropout(data, 1.0 / 3.0, training=True) - assert not out.numpy().all() + # test training mode + data = tensor(np.ones(10000000, dtype=np.float32)) + out = F.nn.dropout(data, 1.0 / 3.0, training=True) + assert not out.numpy().all() - # test eval mode - out = F.nn.dropout(data, 1.0 / 3.0, training=False) - assert out.numpy().all() + # test eval mode + out = F.nn.dropout(data, 1.0 / 3.0, training=False) + assert out.numpy().all() - Outputs: + Outputs: - .. testoutput:: - :options: +SKIP - - [1.5 1.5 0. 1.5 1.5 1.5 1.5 1.5 1.5 1.5] + .. testoutput:: + :options: +SKIP + [1.5 1.5 0. 1.5 1.5 1.5 1.5 1.5 1.5 1.5] """ assert 0 <= drop_prob < 1 if not training or drop_prob == 0: @@ -1441,33 +1403,31 @@ def dropout(inp: Tensor, drop_prob: float, training: bool = True) -> Tensor: def one_hot(inp: Tensor, num_classes: int) -> Tensor: - r""" - Performs one-hot encoding for the input tensor. + r"""Performs one-hot encoding for the input tensor. - :param inp: input tensor. - :param num_classes: number of classes denotes the last dimension of the output tensor. - :return: output tensor. + Args: + inp: input tensor. + num_classes: number of classes denotes the last dimension of the output tensor. Examples: - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - x = tensor(np.arange(1, 4, dtype=np.int32)) - out = F.one_hot(x, num_classes=4) - print(out.numpy()) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + x = tensor(np.arange(1, 4, dtype=np.int32)) + out = F.one_hot(x, num_classes=4) + print(out.numpy()) - .. testoutput:: + Outputs: - [[0 1 0 0] - [0 0 1 0] - [0 0 0 1]] + .. testoutput:: + [[0 1 0 0] + [0 0 1 0] + [0 0 0 1]] """ zeros_tensor = zeros(list(inp.shape) + [num_classes], inp.dtype, inp.device) ones_tensor = ones(list(inp.shape) + [1], inp.dtype, inp.device) @@ -1484,17 +1444,16 @@ def embedding( max_norm: Optional[float] = None, norm_type: Optional[float] = None, ): - """ - Applies lookup table for embedding. + r"""Applies lookup table for embedding. - :param inp: tensor with indices. - :param weight: learnable weights which embeds from. - :param padding_idx: should be set to None, not supported now. - :param max_norm: should be set to None, not supported now. - :param norm_type: should be set to None, not supported now. - :return: output tensor. + Args: + inp: tensor with indices. + weight: learnable weights which embeds from. + padding_idx: should be set to None, not supported now. + max_norm: should be set to None, not supported now. + norm_type: should be set to None, not supported now. - Refer to :class:`~.Embedding` for more information. + Refer to :class:`~.module.Embedding` for more information. """ if padding_idx is not None: raise ValueError("Not support padding_idx Now!") @@ -1508,33 +1467,31 @@ def embedding( def indexing_one_hot( src: Tensor, index: Tensor, axis: int = 1, keepdims=False ) -> Tensor: - r""" - One-hot indexing for some axes. 
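``embedding`` above ends up with no example; a small lookup sketch (the weight matrix is purely illustrative, and it assumes the function is reachable as ``F.embedding``):

.. code-block::

    import numpy as np
    from megengine import tensor
    import megengine.functional as F

    weight = tensor(np.arange(12, dtype=np.float32).reshape(4, 3))  # 4 ids, embedding dim 3
    ids = tensor(np.array([0, 2], dtype=np.int32))
    out = F.embedding(ids, weight)
    print(out.numpy())  # rows 0 and 2 of the weight matrix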
+ r"""One-hot indexing for some axes. - :param src: input tensor. - :param index: index tensor. - :param axis: axis on src for which values in index index. Default: 1 - :param keepdims: whether not to remove the axis in result. Default: False - :return: output tensor. + Args: + src: input tensor. + index: index tensor. + axis: axis on src for which values in index index. Default: 1 + keepdims: whether not to remove the axis in result. Default: False Examples: - .. testcode:: - - import megengine.functional as F - from megengine import tensor + .. testcode:: - src = tensor([[1.0, 2.0]]) - index = tensor([0]) - val = F.indexing_one_hot(src, index) - print(val.numpy()) + import megengine.functional as F + from megengine import tensor - Outputs: + src = tensor([[1.0, 2.0]]) + index = tensor([0]) + val = F.indexing_one_hot(src, index) + print(val.numpy()) - .. testoutput:: + Outputs: - [1.] + .. testoutput:: + [1.] """ assert isinstance(src, Tensor), "src must be of Tensor type" op = builtin.IndexingOneHot(axis=axis) @@ -1552,17 +1509,16 @@ def sliding_window( stride: Union[int, Tuple[int, int]] = 1, dilation: Union[int, Tuple[int, int]] = 1, ) -> Tensor: - """ - Extracts sliding local blocks from a batched input tensor. + r"""Extracts sliding local blocks from a batched input tensor. Refer to :class:`~.SlidingWindow` for more information. - :param inp: input tensor. - :param kernel_size: size of the window. - :param padding: implicit zero padding added on both sides of input. Default: 0 - :param stride: stride of the window. Default: 1 - :param dilation: dilation of the window. Default: 1 - :return: output tensor. + Args: + inp: input tensor. + kernel_size: size of the window. + padding: implicit zero padding added on both sides of input. Default: 0 + stride: stride of the window. Default: 1 + dilation: dilation of the window. Default: 1 """ padding_h, padding_w = _pair(padding) stride_h, stride_w = _pair_nonzero(stride) @@ -1591,18 +1547,17 @@ def sliding_window_transpose( stride: Union[int, Tuple[int, int]] = 1, dilation: Union[int, Tuple[int, int]] = 1, ) -> Tensor: - """ - Sum over the sliding windows on the corresponding input location. + r"""Sum over the sliding windows on the corresponding input location. Refer to :class:`~.SlidingWindowTranspose` for more information. - :param inp: input tensor. - :param output_size: shape of output tensor. - :param kernel_size: size of the window. - :param padding: implicit zero padding added on both sides of input. Default: 0 - :param stride: stride of the window. Default: 1 - :param dilation: dilation of the window. Default: 1 - :return: output tensor. + Args: + inp: input tensor. + output_size: shape of output tensor. + kernel_size: size of the window. + padding: implicit zero padding added on both sides of input. Default: 0 + stride: stride of the window. Default: 1 + dilation: dilation of the window. Default: 1 """ output_h, output_w = _pair_nonzero(output_size) padding_h, padding_w = _pair(padding) diff --git a/imperative/python/megengine/functional/quantized.py b/imperative/python/megengine/functional/quantized.py index 16975c7a..d004b762 100644 --- a/imperative/python/megengine/functional/quantized.py +++ b/imperative/python/megengine/functional/quantized.py @@ -28,32 +28,28 @@ def conv_bias_activation( conv_mode="cross_correlation", compute_mode="default", ) -> Tensor: - """ - Convolution bias with activation operation, only for inference. - - :param inp: feature map of the convolution operation. - :param weight: convolution kernel. 
- :param bias: bias added to the result of convolution - :param stride: stride of the 2D convolution operation. Default: 1 - :param padding: size of the paddings added to the input on both sides - of its spatial dimensions. Only zero-padding is supported. Default: 0 - :param dilation: dilation of the 2D convolution operation. Default: 1 - :param groups: number of groups into which the input and output channels are divided, - so as to perform a "grouped convolution". When ``groups`` is not 1, - ``in_channels`` and ``out_channels`` must be divisible by ``groups``, - and the shape of weight should be `(groups, out_channel // groups, - in_channels // groups, height, width)`. - :type conv_mode: string or :class:`Convolution.Mode`. - :param conv_mode: supports 'cross_correlation' or 'convolution'. Default: - 'cross_correlation' - :param dtype: support for ``np.dtype``, Default: np.int8 - :type compute_mode: string or - :class:`Convolution.ComputeMode`. - :param compute_mode: when set to "default", no special requirements will be - placed on the precision of intermediate results. When set to "float32", - "float32" would be used for accumulator and intermediate result, - but only effective when input and output are of float16 dtype. + r"""Convolution bias with activation operation, only for inference. + Args: + inp: feature map of the convolution operation. + weight: convolution kernel. + bias: bias added to the result of convolution + stride: stride of the 2D convolution operation. Default: 1 + padding: size of the paddings added to the input on both sides + of its spatial dimensions. Only zero-padding is supported. Default: 0 + dilation: dilation of the 2D convolution operation. Default: 1 + groups: number of groups into which the input and output channels are divided, + so as to perform a "grouped convolution". When ``groups`` is not 1, + ``in_channels`` and ``out_channels`` must be divisible by ``groups``, + and the shape of weight should be `(groups, out_channel // groups, + in_channels // groups, height, width)`. + conv_mode: supports 'cross_correlation' or 'convolution'. Default: + 'cross_correlation' + dtype: support for ``np.dtype``, Default: np.int8 + compute_mode: when set to "default", no special requirements will be + placed on the precision of intermediate results. When set to "float32", + "float32" would be used for accumulator and intermediate result, + but only effective when input and output are of float16 dtype. """ ph, pw = _pair(padding) sh, sw = _pair_nonzero(stride) @@ -91,32 +87,28 @@ def batch_conv_bias_activation( conv_mode="cross_correlation", compute_mode="default", ) -> Tensor: - """ - Batch convolution bias with activation operation, only for inference. - - :param inp: feature map of the convolution operation. - :param weight: convolution kernel in batched way. - :param bias: bias added to the result of convolution - :param stride: stride of the 2D convolution operation. Default: 1 - :param padding: size of the paddings added to the input on both sides - of its spatial dimensions. Only zero-padding is supported. Default: 0 - :param dilation: dilation of the 2D convolution operation. Default: 1 - :param groups: number of groups into which the input and output channels are divided, - so as to perform a "grouped convolution". When ``groups`` is not 1, - ``in_channels`` and ``out_channels`` must be divisible by ``groups``, - and the shape of weight should be `(groups, out_channel // groups, - in_channels // groups, height, width)`. 
- :type conv_mode: string or :class:`Convolution.Mode`. - :param conv_mode: supports 'cross_correlation' or 'convolution'. Default: - 'cross_correlation' - :param dtype: support for ``np.dtype``, Default: np.int8 - :type compute_mode: string or - :class:`Convolution.ComputeMode`. - :param compute_mode: when set to "default", no special requirements will be - placed on the precision of intermediate results. When set to "float32", - "float32" would be used for accumulator and intermediate result, - but only effective when input and output are of float16 dtype. + r"""Batch convolution bias with activation operation, only for inference. + Args: + inp: feature map of the convolution operation. + weight: convolution kernel in batched way. + bias: bias added to the result of convolution + stride: stride of the 2D convolution operation. Default: 1 + padding: size of the paddings added to the input on both sides + of its spatial dimensions. Only zero-padding is supported. Default: 0 + dilation: dilation of the 2D convolution operation. Default: 1 + groups: number of groups into which the input and output channels are divided, + so as to perform a "grouped convolution". When ``groups`` is not 1, + ``in_channels`` and ``out_channels`` must be divisible by ``groups``, + and the shape of weight should be `(groups, out_channel // groups, + in_channels // groups, height, width)`. + conv_mode: supports 'cross_correlation' or 'convolution'. Default: + 'cross_correlation' + dtype: support for ``np.dtype``, Default: np.int8 + compute_mode: when set to "default", no special requirements will be + placed on the precision of intermediate results. When set to "float32", + "float32" would be used for accumulator and intermediate result, + but only effective when input and output are of float16 dtype. """ ph, pw = _pair(padding) sh, sw = _pair_nonzero(stride) diff --git a/imperative/python/megengine/functional/tensor.py b/imperative/python/megengine/functional/tensor.py index 37fb6e9d..cc60542d 100755 --- a/imperative/python/megengine/functional/tensor.py +++ b/imperative/python/megengine/functional/tensor.py @@ -54,33 +54,35 @@ __all__ = [ def eye(N, M=None, *, dtype="float32", device: Optional[CompNode] = None) -> Tensor: - """ - Returns a 2D tensor with ones on the diagonal and zeros elsewhere. + r"""Returns a 2D tensor with ones on the diagonal and zeros elsewhere. - :param shape: expected shape of output tensor. - :param dtype: data type. Default: None - :param device: compute node of the matrix. Default: None - :return: eye matrix. + Args: + shape: a list, tuple or integer defining the shape of the output tensor. + dtype: the desired data type of the output tensor. Default: ``float32``. + device: the desired device of the output tensor. Default: if ``None``, + use the default device (see :func:`~.megengine.get_default_device`). - Examples: + Returns: + eye matrix. - .. testcode:: + Examples: - import numpy as np - import megengine.functional as F + .. testcode:: - out = F.eye(4, 6, dtype=np.float32) - print(out.numpy()) + import numpy as np + import megengine.functional as F - Outputs: + out = F.eye(4, 6, dtype=np.float32) + print(out.numpy()) - .. testoutput:: + Outputs: - [[1. 0. 0. 0. 0. 0.] - [0. 1. 0. 0. 0. 0.] - [0. 0. 1. 0. 0. 0.] - [0. 0. 0. 1. 0. 0.]] + .. testoutput:: + [[1. 0. 0. 0. 0. 0.] + [0. 1. 0. 0. 0. 0.] + [0. 0. 1. 0. 0. 0.] + [0. 0. 0. 1. 0. 
0.]] """ if M is not None: if isinstance(N, Tensor) or isinstance(M, Tensor): @@ -97,32 +99,34 @@ def eye(N, M=None, *, dtype="float32", device: Optional[CompNode] = None) -> Ten def full(shape, value, dtype="float32", device=None) -> Tensor: - r""" - Creates a tensor of shape ``shape`` filled with ``value``. + r"""Creates a tensor of shape ``shape`` filled with ``value``. - :param shape: a list, tuple or integer defining the shape of the output tensor. - :param value: the value to fill the output tensor with. - :param dtype: the desired data type of the output tensor. Default: ``float32``. - :param device: the desired device of the output tensor. Default: if ``None``, use the default device(see ``megengine.get_default_device()``). - :return: output tensor. + Args: + shape: a list, tuple or integer defining the shape of the output tensor. + value: the value to fill the output tensor with. + dtype: the desired data type of the output tensor. Default: ``float32``. + device: the desired device of the output tensor. Default: if ``None``, + use the default device (see :func:`~.megengine.get_default_device`). - Examples: + Returns: + output tensor. - .. testcode:: + Examples: - import numpy as np - import megengine.functional as F + .. testcode:: - out = F.full([2,3], 1.5) - print(out.numpy()) + import numpy as np + import megengine.functional as F - Outputs: + out = F.full([2,3], 1.5) + print(out.numpy()) - .. testoutput:: + Outputs: - [[1.5 1.5 1.5] - [1.5 1.5 1.5]] + .. testoutput:: + [[1.5 1.5 1.5] + [1.5 1.5 1.5]] """ if isinstance(shape, int): @@ -136,95 +140,107 @@ def full(shape, value, dtype="float32", device=None) -> Tensor: def ones(shape, dtype="float32", device=None) -> Tensor: - """ - Returns a ones tensor with given shape. + r"""Returns a ones tensor with given shape. - :param inp: input tensor. - :return: output zero tensor. + Args: + shape: a list, tuple or integer defining the shape of the output tensor. + dtype: the desired data type of the output tensor. Default: ``float32``. + device: the desired device of the output tensor. Default: if ``None``, + use the default device (see :func:`~.megengine.get_default_device`). - Examples: + Returns: + output tensor. - .. testcode:: + Examples: - import megengine.functional as F + .. testcode:: - out = F.ones((2, 1)) - print(out.numpy()) + import megengine.functional as F - Outputs: + out = F.ones((2, 1)) + print(out.numpy()) - .. testoutput:: + Outputs: - [[1.] - [1.]] + .. testoutput:: + [[1.] + [1.]] """ return full(shape, 1.0, dtype=dtype, device=device) def zeros(shape, dtype="float32", device=None) -> Tensor: - """ - Returns a zero tensor with given shape. + r"""Returns a zero tensor with given shape. + + Args: + shape: a list, tuple or integer defining the shape of the output tensor. + dtype: the desired data type of the output tensor. Default: ``float32``. + device: the desired device of the output tensor. Default: if ``None``, + use the default device (see :func:`~.megengine.get_default_device`). """ return full(shape, 0.0, dtype=dtype, device=device) def zeros_like(inp: Union[Tensor, SymbolVar]) -> Union[Tensor, SymbolVar]: - """ - Returns a zero tensor with the same shape as input tensor. + r"""Returns a zero tensor with the same shape as input tensor. + + Args: + inp: input tensor. - :param inp: input tensor. - :return: output zero tensor. + Return: + output tensor. Examples: - .. testcode:: + .. 
testcode:: - import numpy as np - from megengine import tensor - import megengine.functional as F + import numpy as np + from megengine import tensor + import megengine.functional as F - inp = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3)) - out = F.zeros_like(inp) - print(out.numpy()) + inp = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3)) + out = F.zeros_like(inp) + print(out.numpy()) - Outputs: + Outputs: - .. testoutput:: + .. testoutput:: - [[0 0 0] - [0 0 0]] + [[0 0 0] + [0 0 0]] """ return full_like(inp, 0.0) def ones_like(inp: Union[Tensor, SymbolVar]) -> Union[Tensor, SymbolVar]: - """ - Returns a ones tensor with the same shape as input tensor. + r"""Returns a ones tensor with the same shape as input tensor. - :param inp: input tensor. - :return: output ones tensor. + Args: + inp: input tensor. - Examples: + Return: + output tensor. - .. testcode:: + Examples: - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - inp = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3)) - out = F.ones_like(inp) - print(out.numpy()) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + inp = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3)) + out = F.ones_like(inp) + print(out.numpy()) - .. testoutput:: + Outputs: - [[1 1 1] - [1 1 1]] + .. testoutput:: + [[1 1 1] + [1 1 1]] """ return full_like(inp, 1.0) @@ -232,30 +248,33 @@ def ones_like(inp: Union[Tensor, SymbolVar]) -> Union[Tensor, SymbolVar]: def full_like( inp: Union[Tensor, SymbolVar], value: Union[int, float] ) -> Union[Tensor, SymbolVar]: - """ - Returns a tensor filled with given value with the same shape as input tensor. + r"""Returns a tensor filled with given value with the same shape as input tensor. - :param inp: input tensor. - :param value: target value. - :return: output tensor. + Args: + inp: input tensor. + value: target value. + + Return: + output tensor. Examples: - .. testcode:: - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: + + import numpy as np + from megengine import tensor + import megengine.functional as F - inp = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3)) - out = F.full_like(inp, 2) - print(out.numpy()) + inp = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3)) + out = F.full_like(inp, 2) + print(out.numpy()) - Outputs: + Outputs: - .. testoutput:: + .. testoutput:: - [[2 2 2] - [2 2 2]] + [[2 2 2] + [2 2 2]] """ (x,) = Const(value, dtype=inp.dtype, device=inp.device)(inp) @@ -265,67 +284,69 @@ def full_like( def broadcast_to(inp: Tensor, shape: Union[int, Iterable[int]]) -> Tensor: - """ - Broadcasts a tensor to given shape. + r"""Broadcasts a tensor to given shape. - :param inp: input tensor. - :param shape: target shape. - :return: output tensor. + Args: + inp: input tensor. + shape: target shape. - Examples: + Returns: + output tensor. - .. testcode:: + Examples: - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - data = tensor(np.arange(0, 3, dtype=np.float32).reshape(3)) - out = F.broadcast_to(data, (2, 3)) - print(out.numpy()) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + data = tensor(np.arange(0, 3, dtype=np.float32).reshape(3)) + out = F.broadcast_to(data, (2, 3)) + print(out.numpy()) - .. testoutput:: + Outputs: - [[0. 1. 2.] - [0. 1. 2.]] + .. testoutput:: + [[0. 1. 2.] + [0. 1. 
2.]] """ return _broadcast(inp, shape) def concat(inps: Iterable[Tensor], axis: int = 0, device=None) -> Tensor: - r""" - Concat some tensors + r"""Concat some tensors - :param inps: input tensors to concat. - :param axis: over which dimension the tensors are concatenated. Default: 0 - :param device: which device output will be. Default: None - :return: output tensor. + Args: + inps: input tensors to concat. + axis: over which dimension the tensors are concatenated. Default: 0 + device: which device output will be. Default: None - Examples: + Returns: + output tensor. - .. testcode:: + Examples: - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - data1 = tensor(np.arange(0, 6, dtype=np.float32).reshape((2, 3))) - data2 = tensor(np.arange(6, 12, dtype=np.float32).reshape((2, 3))) - out = F.concat([data1, data2]) - print(out.numpy()) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + data1 = tensor(np.arange(0, 6, dtype=np.float32).reshape((2, 3))) + data2 = tensor(np.arange(6, 12, dtype=np.float32).reshape((2, 3))) + out = F.concat([data1, data2]) + print(out.numpy()) - .. testoutput:: + Outputs: - [[ 0. 1. 2.] - [ 3. 4. 5.] - [ 6. 7. 8.] - [ 9. 10. 11.]] + .. testoutput:: + [[ 0. 1. 2.] + [ 3. 4. 5.] + [ 6. 7. 8.] + [ 9. 10. 11.]] """ if len(inps) == 1: return inps[0] @@ -340,35 +361,36 @@ def concat(inps: Iterable[Tensor], axis: int = 0, device=None) -> Tensor: def stack(inps, axis=0, device=None): - """ - Concats a sequence of tensors along a new axis. + r"""Concats a sequence of tensors along a new axis. The input tensors must have the same shape. - :param inps: input tensors. - :param axis: which axis will be concatenated. - :param device: the device output will be. Default: None - :return: output concatenated tensor. + Args: + inps: input tensors. + axis: which axis will be concatenated. + device: the device output will be. Default: None - Examples: + Returns: + output concatenated tensor. - .. testcode:: + Examples: - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - x1 = tensor(np.arange(0, 3, dtype=np.float32).reshape((3))) - x2 = tensor(np.arange(6, 9, dtype=np.float32).reshape((3))) - out = F.stack([x1, x2], axis=0) - print(out.numpy()) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + x1 = tensor(np.arange(0, 3, dtype=np.float32).reshape((3))) + x2 = tensor(np.arange(6, 9, dtype=np.float32).reshape((3))) + out = F.stack([x1, x2], axis=0) + print(out.numpy()) - .. testoutput:: + Outputs: - [[0. 1. 2.] - [6. 7. 8.]] + .. testoutput:: + [[0. 1. 2.] + [6. 7. 8.]] """ if len(inps) > 0 and not isinstance(inps[0].shape, inps[0].__class__): shapes = {arr.shape for arr in inps} @@ -380,38 +402,39 @@ def stack(inps, axis=0, device=None): def split(inp, nsplits_or_sections, axis=0): - """ - Splits the input tensor into several smaller tensors. + r"""Splits the input tensor into several smaller tensors. When nsplits_or_sections is int, the last tensor may be smaller than others. - :param inp: input tensor. - :param nsplits_or_sections: number of sub tensors or sections information list. - :param axis: which axis will be splited. - :return: output tensor list. + Args: + inp: input tensor. + nsplits_or_sections: number of sub tensors or sections information list. + axis: which axis will be splited. - Examples: + Returns: + output tensor list. - .. 
testcode:: + Examples: - import os - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - x = tensor(np.random.random((10, 20)), dtype=np.float32) - y = F.split(x, 3) - z = F.split(x, [6, 17], axis=1) + import os + import numpy as np + from megengine import tensor + import megengine.functional as F - print([i.numpy().shape for i in y]) - print([i.numpy().shape for i in z]) + x = tensor(np.random.random((10, 20)), dtype=np.float32) + y = F.split(x, 3) + z = F.split(x, [6, 17], axis=1) - Outputs: + print([i.numpy().shape for i in y]) + print([i.numpy().shape for i in z]) - .. testoutput:: + Outputs: - [(4, 20), (3, 20), (3, 20)] - [(10, 6), (10, 11), (10, 3)] + .. testoutput:: + [(4, 20), (3, 20), (3, 20)] + [(10, 6), (10, 11), (10, 3)] """ ndim = len(inp.shape) if axis >= ndim: @@ -491,11 +514,13 @@ def gather(inp: Tensor, axis: int, index: Tensor) -> Tensor: r""" Gathers data from input tensor on axis using index. - For a 3-D tensor, the output is specified by:: + For a 3-D tensor, the output is specified by: - out[i][j][k] = inp[index[i][j][k]][j][k] # if axis == 0 - out[i][j][k] = inp[i][index[i][j][k]][k] # if axis == 1 - out[i][j][k] = inp[i][j][index[i][j][k]] # if axis == 2 + .. code-block:: + + out[i][j][k] = inp[index[i][j][k]][j][k] # if axis == 0 + out[i][j][k] = inp[i][index[i][j][k]][k] # if axis == 1 + out[i][j][k] = inp[i][j][index[i][j][k]] # if axis == 2 if input tensor is a n-dimensional tensor with size :math:`(x_0,x_1,...,x_{i-1},x_i,x_{i+1},...,x_{n-1})` and axis=i, @@ -503,32 +528,34 @@ def gather(inp: Tensor, axis: int, index: Tensor) -> Tensor: :math:`(x_0,x_1,...,x_{i-1},y,x_{i+1},...,x_{n-1})` where :math:`y\ge 1` and output will have the same size as index. - :param inp: input tensor. - :param axis: along which axis to index. - :param index: indices of elements to gather. - :return: output tensor. + Args: + inp: input tensor. + axis: along which axis to index. + index: indices of elements to gather. - Examples: + Return: + output tensor. - .. testcode:: + Examples: - import megengine.functional as F - from megengine import tensor + .. testcode:: - inp = tensor([ - [1,2], [3,4], [5,6], - ]) - index = tensor([[0,2], [1,0]]) - oup = F.gather(inp, 0, index) - print(oup.numpy()) + import megengine.functional as F + from megengine import tensor - Outputs: + inp = tensor([ + [1,2], [3,4], [5,6], + ]) + index = tensor([[0,2], [1,0]]) + oup = F.gather(inp, 0, index) + print(oup.numpy()) - .. testoutput:: + Outputs: - [[1 6] - [3 2]] + .. testoutput:: + [[1 6] + [3 2]] """ input_shape = inp.shape index_shape = index.shape @@ -569,11 +596,13 @@ def scatter(inp: Tensor, axis: int, index: Tensor, source: Tensor) -> Tensor: in source for ``axis != dimension`` and by the corresponding value in index for ``axis = dimension``. - For a 3-D tensor, input tensor is updated as:: + For a 3-D tensor, input tensor is updated as: + + .. code-block:: - inp[index[i][j][k]][j][k] = source[i][j][k] # if axis == 0 - inp[i][index[i][j][k]][k] = source[i][j][k] # if axis == 1 - inp[i][j][index[i][j][k]] = source[i][j][k] # if axis == 2 + inp[index[i][j][k]][j][k] = source[i][j][k] # if axis == 0 + inp[i][index[i][j][k]][k] = source[i][j][k] # if axis == 1 + inp[i][j][index[i][j][k]] = source[i][j][k] # if axis == 2 ``inp``, ``index`` and ``source`` should have same number of dimensions. 
@@ -582,7 +611,7 @@ def scatter(inp: Tensor, axis: int, index: Tensor, source: Tensor) -> Tensor: Moreover, the values of index must be between ``0`` and ``inp.shape(axis) - 1`` inclusive. - .. note:: + Note: Please notice that, due to performance issues, the result is uncertain on the GPU device if scattering different positions from source to the same destination position regard to index tensor. @@ -591,34 +620,36 @@ def scatter(inp: Tensor, axis: int, index: Tensor, source: Tensor) -> Tensor: from source[0][2] which value is 0.2256 or source[1][2] which value is 0.5339 if set the index[1][2] from 1 to 0. - :param inp: inp tensor which to be scattered. - :param axis: axis along which to index. - :param index: indices of elements to scatter. - :param source: source element(s) to scatter. - :return: output tensor. + Args: + inp: inp tensor which to be scattered. + axis: axis along which to index. + index: indices of elements to scatter. + source: source element(s) to scatter. - Examples: + Return: + output tensor. - .. testcode:: + Examples: - import numpy as np - import megengine.functional as F - from megengine import tensor + .. testcode:: - inp = tensor(np.zeros(shape=(3,5),dtype=np.float32)) - source = tensor([[0.9935,0.9465,0.2256,0.8926,0.4396],[0.7723,0.0718,0.5939,0.357,0.4576]]) - index = tensor([[0,2,0,2,1],[2,0,1,1,2]]) - oup = F.scatter(inp, 0, index,source) - print(oup.numpy()) + import numpy as np + import megengine.functional as F + from megengine import tensor - Outputs: + inp = tensor(np.zeros(shape=(3,5),dtype=np.float32)) + source = tensor([[0.9935,0.9465,0.2256,0.8926,0.4396],[0.7723,0.0718,0.5939,0.357,0.4576]]) + index = tensor([[0,2,0,2,1],[2,0,1,1,2]]) + oup = F.scatter(inp, 0, index,source) + print(oup.numpy()) - .. testoutput:: + Outputs: - [[0.9935 0.0718 0.2256 0. 0. ] - [0. 0. 0.5939 0.357 0.4396] - [0.7723 0.9465 0. 0.8926 0.4576]] + .. testoutput:: + [[0.9935 0.0718 0.2256 0. 0. ] + [0. 0. 0.5939 0.357 0.4396] + [0.7723 0.9465 0. 0.8926 0.4576]] """ input_shape = inp.shape index_shape = index.shape @@ -667,38 +698,40 @@ def scatter(inp: Tensor, axis: int, index: Tensor, source: Tensor) -> Tensor: def where(mask: Tensor, x: Tensor, y: Tensor) -> Tensor: - r""" - Selects elements either from Tensor x or Tensor y, according to mask. + r"""Selects elements either from Tensor x or Tensor y, according to mask. .. math:: \textrm{out}_i = x_i \textrm{ if } \textrm{mask}_i \textrm{ is True else } y_i - :param mask: a mask used for choosing ``x`` or ``y``. - :param x: first choice. - :param y: second choice. - :return: output tensor. + Args: + mask: a mask used for choosing ``x`` or ``y``. + x: first choice. + y: second choice. + + Returns: + output tensor. Examples: - .. testcode:: + .. testcode:: - import numpy as np - from megengine import tensor - import megengine.functional as F - mask = tensor(np.array([[True, False], [False, True]], dtype=np.bool)) - x = tensor(np.array([[1, np.inf], [np.nan, 4]], - dtype=np.float32)) - y = tensor(np.array([[5, 6], [7, 8]], dtype=np.float32)) - out = F.where(mask, x, y) - print(out.numpy()) + import numpy as np + from megengine import tensor + import megengine.functional as F + mask = tensor(np.array([[True, False], [False, True]], dtype=np.bool)) + x = tensor(np.array([[1, np.inf], [np.nan, 4]], + dtype=np.float32)) + y = tensor(np.array([[5, 6], [7, 8]], dtype=np.float32)) + out = F.where(mask, x, y) + print(out.numpy()) - Outputs: + Outputs: - .. testoutput:: + .. testoutput:: - [[1. 6.] - [7. 4.]] + [[1. 6.] + [7. 
4.]] """ if not isinstance(x, Tensor): @@ -730,34 +763,33 @@ def where(mask: Tensor, x: Tensor, y: Tensor) -> Tensor: def cond_take(mask: Tensor, x: Tensor) -> Tensor: - r""" - Takes elements from data if specific condition is satisfied on mask. + r"""Takes elements from data if specific condition is satisfied on mask. This operator has two outputs: the first is the elements taken, and the second is the indices corresponding to those elements; they are both 1-dimensional. High-dimension input would first be flattened. - :param mask: condition param; must be the same shape with data. - :param x: input tensor from which to take elements. + Args: + mask: condition param; must be the same shape with data. + x: input tensor from which to take elements. Examples: - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - mask = tensor(np.array([[True, False], [False, True]], dtype=np.bool_)) - x = tensor(np.array([[1, np.inf], [np.nan, 4]], - dtype=np.float32)) - v, index = F.cond_take(mask, x) - print(v.numpy(), index.numpy()) + .. testcode:: - Outputs: + import numpy as np + from megengine import tensor + import megengine.functional as F + mask = tensor(np.array([[True, False], [False, True]], dtype=np.bool_)) + x = tensor(np.array([[1, np.inf], [np.nan, 4]], + dtype=np.float32)) + v, index = F.cond_take(mask, x) + print(v.numpy(), index.numpy()) - .. testoutput:: + Outputs: - [1. 4.] [0 3] + .. testoutput:: + [1. 4.] [0 3] """ if not isinstance(x, (Tensor, SymbolVar)): raise TypeError("input must be a tensor") @@ -774,110 +806,111 @@ def cond_take(mask: Tensor, x: Tensor) -> Tensor: def transpose(inp: Tensor, pattern: Iterable[int]) -> Tensor: - r""" - Swaps shapes and strides according to given pattern. - - :param inp: input tensor. - :param pattern: a list of integers including 0, 1, ... , ``ndim``-1, - and any number of ``'x'`` char in dimensions where this tensor should be broadcasted. For examples: - - * (``'x'``) -> make a 0d (scalar) into a 1d vector - * (0, 1) -> identity for 2d vectors - * (1, 0) -> inverts the first and second dimensions - * (``'x'``, 0) -> make a row out of a 1d vector (N to 1xN) - * (0, ``'x'``) -> make a column out of a 1d vector (N to Nx1) - * (2, 0, 1) -> AxBxC to CxAxB - * (0, ``'x'``, 1) -> AxB to Ax1xB - * (1, ``'x'``, 0) -> AxB to Bx1xA - * (1,) -> this removes dimensions 0. It must be a broadcastable dimension (1xA to A) - - :return: output tensor. + r"""Swaps shapes and strides according to given pattern. + + Args: + inp: input tensor. + pattern: a list of integers including 0, 1, ... , ``ndim``-1, + and any number of ``'x'`` char in dimensions where this tensor should be broadcasted. + For examples: + + * (``'x'``) -> make a 0d (scalar) into a 1d vector + * (0, 1) -> identity for 2d vectors + * (1, 0) -> inverts the first and second dimensions + * (``'x'``, 0) -> make a row out of a 1d vector (N to 1xN) + * (0, ``'x'``) -> make a column out of a 1d vector (N to Nx1) + * (2, 0, 1) -> AxBxC to CxAxB + * (0, ``'x'``, 1) -> AxB to Ax1xB + * (1, ``'x'``, 0) -> AxB to Bx1xA + * (1,) -> this removes dimensions 0. It must be a broadcastable dimension (1xA to A) + + Returns: + output tensor. Examples: - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - x = tensor(np.array([[1, 1], [0, 0]], dtype=np.int32)) - out = F.transpose(x, (1, 0)) - print(out.numpy()) + .. 
testcode:: - Outputs: + import numpy as np + from megengine import tensor + import megengine.functional as F + x = tensor(np.array([[1, 1], [0, 0]], dtype=np.int32)) + out = F.transpose(x, (1, 0)) + print(out.numpy()) - .. testoutput:: + Outputs: - [[1 0] - [1 0]] + .. testoutput:: + [[1 0] + [1 0]] """ return inp.transpose(list(-1 if _ == "x" else _ for _ in pattern)) def reshape(inp: Tensor, target_shape: Iterable[int]) -> Tensor: - r""" - Reshapes a tensor to given target shape; total number of logical elements must + r"""Reshapes a tensor to given target shape; total number of logical elements must remain unchanged - :param inp: input tensor. - :param target_shape: target shape, it can contain an element of -1 representing ``unspec_axis``. + Args: + inp: input tensor. + target_shape: target shape, it can contain an element of -1 representing ``unspec_axis``. Examples: - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - x = tensor(np.arange(12, dtype=np.int32)) - out = F.reshape(x, (3, 4)) - print(out.numpy()) + .. testcode:: - Outputs: + import numpy as np + from megengine import tensor + import megengine.functional as F + x = tensor(np.arange(12, dtype=np.int32)) + out = F.reshape(x, (3, 4)) + print(out.numpy()) - .. testoutput:: + Outputs: - [[ 0 1 2 3] - [ 4 5 6 7] - [ 8 9 10 11]] + .. testoutput:: + [[ 0 1 2 3] + [ 4 5 6 7] + [ 8 9 10 11]] """ return inp.reshape(target_shape) def flatten(inp: Tensor, start_axis: int = 0, end_axis: int = -1) -> Tensor: - r""" - Reshapes the tensor by flattening the sub-tensor from dimension ``start_axis`` to dimension ``end_axis``. + r"""Reshapes the tensor by flattening the sub-tensor from dimension ``start_axis`` to dimension ``end_axis``. - :param inp: input tensor. - :param start_axis: start dimension that the sub-tensor to be flattened. Default: 0 - :param end_axis: end dimension that the sub-tensor to be flattened. Default: -1 - :return: output tensor. + Args: + inp: input tensor. + start_axis: start dimension that the sub-tensor to be flattened. Default: 0 + end_axis: end dimension that the sub-tensor to be flattened. Default: -1 - Examples: + Returns: + output tensor. - .. testcode:: + Examples: - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - inp_shape = (2, 2, 3, 3) - x = tensor( - np.arange(36, dtype=np.int32).reshape(inp_shape), - ) - out = F.flatten(x, 2) - print(x.numpy().shape) - print(out.numpy().shape) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + inp_shape = (2, 2, 3, 3) + x = tensor( + np.arange(36, dtype=np.int32).reshape(inp_shape), + ) + out = F.flatten(x, 2) + print(x.numpy().shape) + print(out.numpy().shape) - .. testoutput:: + Outputs: - (2, 2, 3, 3) - (2, 2, 9) + .. testoutput:: + (2, 2, 3, 3) + (2, 2, 9) """ target_shape = tuple(inp.shape[i] for i in range(start_axis)) + (-1,) if end_axis != -1: @@ -886,31 +919,32 @@ def flatten(inp: Tensor, start_axis: int = 0, end_axis: int = -1) -> Tensor: def expand_dims(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor: - r""" - Adds dimension before given axis. + r"""Adds dimension before given axis. - :param inp: input tensor. - :param axis: place of new axes. - :return: output tensor. + Args: + inp: input tensor. + axis: place of new axes. - Examples: + Returns: + output tensor. - .. testcode:: + Examples: - import numpy as np - from megengine import tensor - import megengine.functional as F + .. 
testcode:: - x = tensor([1, 2]) - out = F.expand_dims(x, 0) - print(out.numpy().shape) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + x = tensor([1, 2]) + out = F.expand_dims(x, 0) + print(out.numpy().shape) - .. testoutput:: + Outputs: - (1, 2) + .. testoutput:: + (1, 2) """ def get_axes(): @@ -944,31 +978,32 @@ def expand_dims(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor: def squeeze(inp: Tensor, axis: Optional[Union[int, Sequence[int]]] = None) -> Tensor: - r""" - Removes dimension of shape 1. + r"""Removes dimension of shape 1. - :param inp: input tensor. - :param axis: place of axis to be removed. - :return: output tensor. + Args: + inp: input tensor. + axis: place of axis to be removed. - Examples: + Returns: + output tensor. - .. testcode:: + Examples: - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - x = tensor(np.array([1, 2], dtype=np.int32).reshape(1, 1, 2, 1)) - out = F.squeeze(x, 3) - print(out.numpy().shape) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + x = tensor(np.array([1, 2], dtype=np.int32).reshape(1, 1, 2, 1)) + out = F.squeeze(x, 3) + print(out.numpy().shape) - .. testoutput:: + Outputs: - (1, 1, 2) + .. testoutput:: + (1, 1, 2) """ return _remove_axis(inp, axis) @@ -980,31 +1015,32 @@ def linspace( dtype="float32", device: Optional[CompNode] = None, ) -> Tensor: - r""" - Returns equally spaced numbers over a specified interval. + r"""Returns equally spaced numbers over a specified interval. - :param start: starting value of the squence, shoule be scalar. - :param stop: last value of the squence, shoule be scalar. - :param num: number of values to generate. - :param dtype: result data type. - :return: generated tensor. + Args: + start: starting value of the squence, shoule be scalar. + stop: last value of the squence, shoule be scalar. + num: number of values to generate. + dtype: result data type. - Examples: + Returns: + generated tensor. - .. testcode:: + Examples: - import numpy as np - import megengine.functional as F + .. testcode:: - a = F.linspace(3, 10, 5) - print(a.numpy()) + import numpy as np + import megengine.functional as F - Outputs: + a = F.linspace(3, 10, 5) + print(a.numpy()) - .. testoutput:: + Outputs: - [ 3. 4.75 6.5 8.25 10. ] + .. testoutput:: + [ 3. 4.75 6.5 8.25 10. ] """ for item in (start, stop, num): cur_device = getattr(item, "device", None) @@ -1039,33 +1075,32 @@ def arange( dtype="float32", device: Optional[CompNode] = None, ) -> Tensor: - r""" - Returns a tensor with values from start to stop with adjacent interval step. - - :param start: starting value of the squence, shoule be scalar. - :param stop: ending value of the squence, shoule be scalar. - :param step: gap between each pair of adjacent values. Default: 1 - :param dtype: result data type. - :return: generated tensor. + r"""Returns a tensor with values from start to stop with adjacent interval step. - Examples: + Args: + start: starting value of the squence, shoule be scalar. + stop: ending value of the squence, shoule be scalar. + step: gap between each pair of adjacent values. Default: 1 + dtype: result data type. - .. testcode:: + Returns: + generated tensor. - import numpy as np - import megengine.functional as F + Examples: - a = F.arange(5) - print(a.numpy()) + .. testcode:: - Outputs: + import numpy as np + import megengine.functional as F - Outputs: + a = F.arange(5) + print(a.numpy()) - .. 
testoutput:: + Outputs: - [0. 1. 2. 3. 4.] + .. testoutput:: + [0. 1. 2. 3. 4.] """ if stop is None: start, stop = 0, start @@ -1083,36 +1118,37 @@ def arange( def repeat(inp: Tensor, repeats: int, axis: Optional[int] = None): - """ - Repeat elements of an array. + r"""Repeat elements of an array. - :param inp: input tensor. - :param repeats: the number of repetitions for each element. - :param axis: the axis along which to repeat values. By default, use the - flattened input array, and return a flat output array. - :return: output tensor. + Args: + inp: input tensor. + repeats: the number of repetitions for each element. + axis: the axis along which to repeat values. By default, use the + flattened input array, and return a flat output array. - Examples: + Returns: + output tensor. - .. testcode:: + Examples: - import numpy as np - import megengine.functional as F - from megengine import tensor + .. testcode:: - x = tensor([[1, 2], [3, 4]], np.int32) - y = F.repeat(x, 2, axis=0) - print(y.numpy()) + import numpy as np + import megengine.functional as F + from megengine import tensor - Outputs: + x = tensor([[1, 2], [3, 4]], np.int32) + y = F.repeat(x, 2, axis=0) + print(y.numpy()) - .. testoutput:: + Outputs: - [[1 2] - [1 2] - [3 4] - [3 4]] + .. testoutput:: + [[1 2] + [1 2] + [3 4] + [3 4]] """ if axis is None: inp = inp.reshape(-1) # flatten @@ -1168,36 +1204,38 @@ def _tile_one_dim(inp, rep, axis): def tile(inp: Tensor, reps: Iterable[int]): - """ - Construct an array by repeating ``inp`` the number of times given by ``reps``. If reps has length d, + r"""Construct an array by repeating ``inp`` the number of times given by ``reps``. If reps has length d, the result will have dimension of ``max(d, inp.ndim)``. It is required that ``d >= inp.dim``. If ``inp.ndim < d``, ``inp`` is promoted to be ``d``-dimensional by prepending new axis. - :param inp: input tensor. - :param reps: The number of repetitions of inp along each axis. - :return: output tensor. + Args: + inp: input tensor. + reps: The number of repetitions of inp along each axis. - Examples: + Returns: + output tensor. - .. testcode:: - import numpy as np - import megengine.functional as F - from megengine import tensor + Examples: - x = tensor([[1, 2], [3, 4]], np.int32) - y = F.tile(x, (2,1)) - print(y.numpy()) + .. testcode:: - Outputs: + import numpy as np + import megengine.functional as F + from megengine import tensor + + x = tensor([[1, 2], [3, 4]], np.int32) + y = F.tile(x, (2,1)) + print(y.numpy()) - .. testoutput:: + Outputs: - [[1 2] - [3 4] - [1 2] - [3 4]] + .. testoutput:: + [[1 2] + [3 4] + [1 2] + [3 4]] """ shape = astensor1d(inp.shape, inp, dtype="int32", device=inp.device) reps = astensor1d(reps, inp, dtype="int32", device=inp.device) @@ -1224,35 +1262,35 @@ def tile(inp: Tensor, reps: Iterable[int]): def copy(inp, device=None): - r""" - Copies tensor to another device. + r"""Copies tensor to another device. - :param inp: input tensor. - :param device: destination device. + Args: + inp: input tensor. + device: destination device. Examples: - .. testcode:: + .. 
testcode:: - import numpy as np - import platform - from megengine import tensor - from megengine.device import get_device_count - import megengine.functional as F + import numpy as np + import platform + from megengine import tensor + from megengine.device import get_device_count + import megengine.functional as F - x = tensor([1, 2, 3], np.int32) - if 1 == get_device_count("gpu"): - y = F.copy(x, "cpu1") - print(y.numpy()) - else: - y = F.copy(x, "xpu1") - print(y.numpy()) + x = tensor([1, 2, 3], np.int32) + if 1 == get_device_count("gpu"): + y = F.copy(x, "cpu1") + print(y.numpy()) + else: + y = F.copy(x, "xpu1") + print(y.numpy()) - Outputs: + Outputs: - .. testoutput:: + .. testoutput:: - [1 2 3] + [1 2 3] """ if device is None: return apply(Identity(), inp)[0] @@ -1264,38 +1302,37 @@ def roll( shift: Union[int, Iterable[int]], axis: Optional[Union[int, Iterable[int]]] = None, ): - """ - Roll the tensor along the given axis(or axes). Elements that are shifted + r"""Roll the tensor along the given axis(or axes). Elements that are shifted beyond the last position are re-introduced at the first position. - :param inp: input tensor. - :param shift: the number of places by which the elements of the tensor are - shifted. If shift is a tuple, axis must be a tuple of the same size, - and each axis will be rolled by the corresponding shift value. - :param axis: axis along which to roll. If axis is not specified, the tensor - will be flattened before rolling and then restored to the original shape. - Duplicate axes is allowed if it is a tuple. Default: None. + Args: + inp: input tensor. + shift: the number of places by which the elements of the tensor are + shifted. If shift is a tuple, axis must be a tuple of the same size, + and each axis will be rolled by the corresponding shift value. + axis: axis along which to roll. If axis is not specified, the tensor + will be flattened before rolling and then restored to the original shape. + Duplicate axes is allowed if it is a tuple. Default: None. Examples: - .. testcode:: + .. testcode:: - import numpy as np - from megengine import tensor - import megengine.functional as F + import numpy as np + from megengine import tensor + import megengine.functional as F - x = tensor([[1,2],[3,4],[5,6]], np.int32) - y = F.roll(x, 1, 0) - print(y.numpy()) - - Outputs: + x = tensor([[1,2],[3,4],[5,6]], np.int32) + y = F.roll(x, 1, 0) + print(y.numpy()) - .. testoutput:: + Outputs: - [[5 6] - [1 2] - [3 4]] + .. testoutput:: + [[5 6] + [1 2] + [3 4]] """ shp_bak = None if axis is None: @@ -1332,30 +1369,29 @@ def roll( def cumsum(inp: Tensor, axis: int): - """ - Computes the cumulative sum of elements along given axis. + r"""Computes the cumulative sum of elements along given axis. - :param inp: input tensor. - :param axis: axis along which cumsum is performed. + Args: + inp: input tensor. + axis: axis along which cumsum is performed. Examples: - .. testcode:: + .. testcode:: - from megengine import tensor - import megengine.functional as F + from megengine import tensor + import megengine.functional as F - x = tensor([[1, 2, 3], [4, 5, 6]], "int32") - y = F.cumsum(x, 1) - print(y.numpy()) - - Outputs: + x = tensor([[1, 2, 3], [4, 5, 6]], "int32") + y = F.cumsum(x, 1) + print(y.numpy()) - .. testoutput:: + Outputs: - [[ 1 3 6] - [ 4 9 15]] + .. 
testoutput:: + [[ 1 3 6] + [ 4 9 15]] """ assert isinstance(inp, Tensor), "input of cumsum must be type of Tensor" assert axis >= 0 and axis < inp.ndim, "input axis {} out of bound".format(axis) diff --git a/imperative/python/megengine/functional/utils.py b/imperative/python/megengine/functional/utils.py index e86fa993..35b14f9e 100644 --- a/imperative/python/megengine/functional/utils.py +++ b/imperative/python/megengine/functional/utils.py @@ -19,37 +19,36 @@ __all__ = ["topk_accuracy"] def _assert_equal( expect: Tensor, actual: Tensor, *, maxerr: float = 0.0001, verbose: bool = False ): - r""" - Asserts two tensors equal and returns expected value (first input). + r"""Asserts two tensors equal and returns expected value (first input). It is a variant of python assert which is symbolically traceable (similar to ``numpy.testing.assert_equal``). If we want to verify the correctness of model, just ``assert`` its states and outputs. While sometimes we need to verify the correctness at different backends for *dumped* model (or in :class:`~jit.trace` context), and no python code could be executed in that case. Thus we have to use :func:`~functional.utils._assert_equal` instead. - :param expect: expected tensor value - :param actual: tensor to check value - :param maxerr: max allowed error; error is defined as the minimal of absolute and relative error - :param verbose: whether to print maxerr to stdout during opr exec - :return: expected tensor + Args: + expect: expected tensor value + actual: tensor to check value + maxerr: max allowed error; error is defined as the minimal of absolute and relative error + verbose: whether to print maxerr to stdout during opr exec Examples: - .. testcode:: + .. testcode:: - import numpy as np - from megengine import tensor - import megengine.functional as F + import numpy as np + from megengine import tensor + import megengine.functional as F - x = tensor([1, 2, 3], np.float32) - y = tensor([1, 2, 3], np.float32) - print(F.utils._assert_equal(x, y, maxerr=0).numpy()) + x = tensor([1, 2, 3], np.float32) + y = tensor([1, 2, 3], np.float32) + print(F.utils._assert_equal(x, y, maxerr=0).numpy()) - Outputs: + Outputs: - .. testoutput:: + .. testoutput:: - [1. 2. 3.] + [1. 2. 3.] """ err = ( abs(expect - actual) diff --git a/imperative/python/megengine/functional/vision.py b/imperative/python/megengine/functional/vision.py index b713a829..94d53497 100644 --- a/imperative/python/megengine/functional/vision.py +++ b/imperative/python/megengine/functional/vision.py @@ -21,31 +21,32 @@ from .tensor import broadcast_to, concat, expand_dims, reshape, transpose def cvt_color(inp: Tensor, mode: str = ""): - r""" - Convert images from one format to another + r"""Convert images from one format to another - :param inp: input images. - :param mode: format mode. - :return: convert result. + Args: + inp: input images. + mode: format mode. - Examples: + Returns: + convert result. - .. testcode:: + Examples: - import numpy as np - import megengine as mge - import megengine.functional as F + .. testcode:: - x = mge.tensor(np.array([[[[-0.58675045, 1.7526233, 0.10702174]]]]).astype(np.float32)) - y = F.vision.cvt_color(x, mode="RGB2GRAY") - print(y.numpy()) + import numpy as np + import megengine as mge + import megengine.functional as F - Outputs: + x = mge.tensor(np.array([[[[-0.58675045, 1.7526233, 0.10702174]]]]).astype(np.float32)) + y = F.vision.cvt_color(x, mode="RGB2GRAY") + print(y.numpy()) - .. testoutput:: + Outputs: - [[[[0.86555195]]]] + .. 
testoutput::
+            [[[[0.86555195]]]]
     """
     mode = mode.upper()
     assert mode in builtin.CvtColor.Mode.__dict__, "unspport mode for cvt_color"
@@ -63,37 +64,38 @@ def roi_pooling(
     mode: str = "max",
     scale: float = 1.0,
 ) -> Tensor:
-    """
-    Applies roi pooling on input feature.
+    r"""Applies roi pooling on input feature.

-    :param inp: tensor that represents the input feature, `(N, C, H, W)` images.
-    :param rois: `(K, 5)` boxes. First column is the index into N. The other 4 columns are xyxy.
-    :param output_shape: `(height, width)` of output rois feature.
-    :param mode: "max" or "average", use max/average align just like max/average pooling. Default: "max"
-    :param scale: scale the input boxes by this number. Default: 1.0
-    :return: `(K, C, output_shape[0], output_shape[1])` feature of rois.
+    Args:
+        inp: tensor that represents the input feature, `(N, C, H, W)` images.
+        rois: `(K, 5)` boxes. First column is the index into N. The other 4 columns are xyxy.
+        output_shape: `(height, width)` of output rois feature.
+        mode: "max" or "average", use max/average align just like max/average pooling. Default: "max"
+        scale: scale the input boxes by this number. Default: 1.0

-    Examples:
+    Returns:
+        ``(K, C, output_shape[0], output_shape[1])`` feature of rois.

-    .. testcode::
+    Examples:

-            import numpy as np
-            from megengine import tensor
-            import megengine.functional as F
+        .. testcode::

-            np.random.seed(42)
-            inp = tensor(np.random.randn(1, 1, 128, 128))
-            rois = tensor(np.random.random((4, 5)))
-            y = F.vision.roi_pooling(inp, rois, (2, 2))
-            print(y.numpy()[0].round(decimals=4))
+            import numpy as np
+            from megengine import tensor
+            import megengine.functional as F

-    Outputs:
+            np.random.seed(42)
+            inp = tensor(np.random.randn(1, 1, 128, 128))
+            rois = tensor(np.random.random((4, 5)))
+            y = F.vision.roi_pooling(inp, rois, (2, 2))
+            print(y.numpy()[0].round(decimals=4))

-    .. testoutput::
+        Outputs:

-        [[[-0.1383 -0.1383]
-          [-0.5035 -0.5035]]]
+        .. testoutput::
+            [[[-0.1383 -0.1383]
+              [-0.5035 -0.5035]]]
     """
     assert mode.lower() in ["max", "average"], "only max/average mode is supported"
     if isinstance(output_shape, int):
@@ -116,17 +118,17 @@ def correlation(
     pad_size: int = 0,
     is_multiply: bool = True,
 ) -> Tensor:
-    """ Applies correlation to inputs.
-
-    :param data1: Input data1 to the correlation. format must be nchw
-    :param data2: Input data2 to the correlation. format must be nchw
-    :param kernel_size: (int (non-negative), optional, default=1) – kernel size for Correlation must be an odd number
-    :param max_displacement: (int (non-negative), optional, default=1) – Max displacement of Correlation
-    :param stride1: (int (non-negative), optional, default=1) – stride1 quantize data1 globally
-    :param stride2: (int (non-negative), optional, default=1) – stride2 quantize data2 within the neighborhood centered around data1
-    :param pad_size: (int (non-negative), optional, default=0) – pad for Correlation
-    :param is_multiply: (boolean, optional, default=True) – operation type is either multiplication or absolute difference
-
+    r"""Applies correlation to inputs.
+
+    Args:
+        data1: Input data1 to the correlation. format must be nchw
+        data2: Input data2 to the correlation. format must be nchw
+        kernel_size: (int (non-negative), optional, default=1) – kernel size for Correlation must be an odd number
+        max_displacement: (int (non-negative), optional, default=1) – Max displacement of Correlation
+        stride1: (int (non-negative), optional, default=1) – stride1 quantize data1 globally
+        stride2: (int (non-negative), optional, default=1) – stride2 quantize data2 within the neighborhood centered around data1
+        pad_size: (int (non-negative), optional, default=0) – pad for Correlation
+        is_multiply: (boolean, optional, default=True) – operation type is either multiplication or absolute difference
     """

     op = builtin.Correlation(
@@ -152,41 +154,42 @@ def roi_align(
     sample_points: Union[int, tuple, list] = 2,
     aligned: bool = True,
 ) -> Tensor:
-    """
-    Applies roi align on input feature.
-
-    :param inp: tensor that represents the input feature, shape is `(N, C, H, W)`.
-    :param rois: `(N, 5)` boxes. First column is the box index. The other 4 columns are ``xyxy``.
-    :param output_shape: `(height, width)` shape of output rois feature.
-    :param mode: "max" or "average", use max/average align just like max/average pooling. Default: "average"
-    :param spatial_scale: scale the input boxes by this number. Default: 1.0
-    :param sample_points: number of inputs samples to take for each output sample.
-        0 to take samples densely. Default: 2
-    :param aligned: wheather to align the input feature, with `aligned=True`,
-        we first appropriately scale the ROI and then shift it by -0.5. Default: True
-    :return: output tensor.
+    r"""Applies roi align on input feature.
+
+    Args:
+        inp: tensor that represents the input feature, shape is `(N, C, H, W)`.
+        rois: `(N, 5)` boxes. First column is the box index. The other 4 columns are ``xyxy``.
+        output_shape: `(height, width)` shape of output rois feature.
+        mode: "max" or "average", use max/average align just like max/average pooling. Default: "average"
+        spatial_scale: scale the input boxes by this number. Default: 1.0
+        sample_points: number of inputs samples to take for each output sample.
+            0 to take samples densely. Default: 2
+        aligned: whether to align the input feature, with `aligned=True`,
+            we first appropriately scale the ROI and then shift it by -0.5. Default: True
+
+    Returns:
+        output tensor.

     Examples:

-    .. testcode::
+        .. testcode::

-        import numpy as np
-        from megengine import tensor
-        import megengine.functional as F
+            import numpy as np
+            from megengine import tensor
+            import megengine.functional as F

-        np.random.seed(42)
-        inp = tensor(np.random.randn(1, 1, 128, 128))
-        rois = tensor(np.random.random((4, 5)))
-        y = F.vision.roi_align(inp, rois, (2, 2))
-        print(y.numpy()[0].round(decimals=4))
-
-    Outputs:
+            np.random.seed(42)
+            inp = tensor(np.random.randn(1, 1, 128, 128))
+            rois = tensor(np.random.random((4, 5)))
+            y = F.vision.roi_align(inp, rois, (2, 2))
+            print(y.numpy()[0].round(decimals=4))

-    .. testoutput::
+        Outputs:

-        [[[0.175 0.175 ]
-          [0.1359 0.1359]]]
+        .. testoutput::
+            [[[0.175 0.175 ]
+              [0.1359 0.1359]]]
     """
     if inp.dtype != np.float32:
         inp = inp.astype(np.float32)
@@ -217,43 +220,43 @@ def nms(
     boxes: Tensor, scores: Tensor, iou_thresh: float, max_output: Optional[int] = None
 ) -> Tensor:
-    r"""
-    Performs non-maximum suppression (NMS) on the boxes according to their intersection-over-union(IoU).
+    r"""Performs non-maximum suppression (NMS) on the boxes according to their intersection-over-union (IoU).
-    :param boxes: tensor of shape `(N, 4)`; the boxes to perform nms on; each box is expected to be in `(x1, y1, x2, y2)` format.
-    :param iou_thresh: IoU threshold for overlapping.
-    :param scores: tensor of shape `(N,)`, the score of boxes.
-    :param max_output: the maximum number of boxes to keep; it is optional if this operator is not traced
-        otherwise it required to be specified; if it is not specified, all boxes are kept.
-    :return: indices of the elements that have been kept by NMS, sorted by scores.
+    Args:
+        boxes: tensor of shape `(N, 4)`; the boxes to perform nms on; each box is expected to be in `(x1, y1, x2, y2)` format.
+        iou_thresh: IoU threshold for overlapping.
+        scores: tensor of shape `(N,)`, the score of boxes.
+        max_output: the maximum number of boxes to keep; it is optional if this operator is not traced,
+            otherwise it is required to be specified; if it is not specified, all boxes are kept.

-    .. note::
+    Returns:
+        indices of the elements that have been kept by NMS, sorted by scores.

-        max_output should be specified and should have valid positive value under tracing
+    Note:
+        max_output should be specified and should have a valid positive value under tracing.

     Examples:

-    .. testcode::
-
-        import numpy as np
-        from megengine import tensor
-        import megengine.functional as F
+        .. testcode::

-        x = np.zeros((100,4))
-        np.random.seed(42)
-        x[:,:2] = np.random.rand(100,2)*20
-        x[:,2:] = np.random.rand(100,2)*20 + 100
-        scores = tensor(np.random.rand(100))
-        inp = tensor(x)
-        result = F.vision.nms(inp, scores, iou_thresh=0.7)
-        print(result.numpy())
+            import numpy as np
+            from megengine import tensor
+            import megengine.functional as F

-    Outputs:
+            x = np.zeros((100,4))
+            np.random.seed(42)
+            x[:,:2] = np.random.rand(100,2)*20
+            x[:,2:] = np.random.rand(100,2)*20 + 100
+            scores = tensor(np.random.rand(100))
+            inp = tensor(x)
+            result = F.vision.nms(inp, scores, iou_thresh=0.7)
+            print(result.numpy())

-    .. testoutput::
+        Outputs:

-        [75 69]
+        .. testoutput::
+            [75 69]
     """
     assert (
         boxes.ndim == 2 and boxes.shape[1] == 4
@@ -286,45 +289,46 @@ def remap(
     scalar: float = 0.0,
     interp_mode: str = "linear",
 ) -> Tensor:
-    r"""
-    Applies remap transformation to batched 2D images.
+    r"""Applies remap transformation to batched 2D images.

     The input images are transformed to the output images by the tensor map_xy.
     The output's H and W are same as map_xy's H and W.

-    :param inp: input image
-    :param map_xy: (batch, oh, ow, 2) transformation matrix
-    :param border_mode: pixel extrapolation method.
-        Default: "replicate". Currently also support "constant", "reflect",
-        "reflect_101", "wrap".
-    :param scalar: value used in case of a constant border. Default: 0
-    :param interp_mode: interpolation methods.
-        Default: "linear". Currently only support "linear" mode.
-    :return: output tensor.
+    Args:
+        inp: input image
+        map_xy: (batch, oh, ow, 2) transformation matrix
+        border_mode: pixel extrapolation method.
+            Default: "replicate". Currently also support "constant", "reflect",
+            "reflect_101", "wrap".
+        scalar: value used in case of a constant border. Default: 0
+        interp_mode: interpolation methods.
+            Default: "linear". Currently only support "linear" mode.
+
+    Returns:
+        output tensor.

     Examples:

-    .. testcode::
+        .. testcode::

-        import numpy as np
-        from megengine import tensor
-        import megengine.functional as F
-        inp_shape = (1, 1, 4, 4)
-        inp = tensor(np.arange(16, dtype=np.float32).reshape(inp_shape))
-        map_xy_shape = (1, 2, 2, 2)
-        map_xy = tensor(np.array([[[1., 0.],[0., 1.]],
-                            [[0., 1.],[0., 1.]]],
-                            dtype=np.float32).reshape(map_xy_shape))
-        out = F.vision.remap(inp, map_xy)
-        print(out.numpy())
-
-    Outputs:
+            import numpy as np
+            from megengine import tensor
+            import megengine.functional as F
+            inp_shape = (1, 1, 4, 4)
+            inp = tensor(np.arange(16, dtype=np.float32).reshape(inp_shape))
+            map_xy_shape = (1, 2, 2, 2)
+            map_xy = tensor(np.array([[[1., 0.],[0., 1.]],
+                                [[0., 1.],[0., 1.]]],
+                                dtype=np.float32).reshape(map_xy_shape))
+            out = F.vision.remap(inp, map_xy)
+            print(out.numpy())

-    .. testoutput::
+        Outputs:

-        [[[[1. 4.]
-           [4. 4.]]]]
+        .. testoutput::
+            [[[[1. 4.]
+               [4. 4.]]]]
     """

     op = builtin.Remap(
@@ -344,27 +348,28 @@ def warp_affine(
     format: str = "NHWC",
     interp_mode: str = "linear",
 ) -> Tensor:
-    """
-    Batched affine transform on 2D images.
-
-    :param inp: input image.
-    :param mat: `(batch, 2, 3)` transformation matrix.
-    :param out_shape: output tensor shape.
-    :param border_mode: pixel extrapolation method.
-        Default: "wrap". Currently "constant", "reflect",
-        "reflect_101", "isolated", "wrap", "replicate", "transparent" are supported.
-    :param border_val: value used in case of a constant border. Default: 0
-    :param format: "NHWC" as default based on historical concerns,
-        "NCHW" is also supported. Default: "NHWC".
-    :param interp_mode: interpolation methods. Could be "linear", "nearest", "cubic", "area".
-        Default: "linear".
-    :return: output tensor.
-
-    .. note::
-
-        Here all available options for params are listed,
-        however it does not mean that you can use all the combinations.
-        On different platforms, different combinations are supported.
+    r"""Batched affine transform on 2D images.
+
+    Args:
+        inp: input image.
+        mat: `(batch, 2, 3)` transformation matrix.
+        out_shape: output tensor shape.
+        border_mode: pixel extrapolation method.
+            Default: "wrap". Currently "constant", "reflect",
+            "reflect_101", "isolated", "wrap", "replicate", "transparent" are supported.
+        border_val: value used in case of a constant border. Default: 0
+        format: "NHWC" as default based on historical concerns,
+            "NCHW" is also supported. Default: "NHWC".
+        interp_mode: interpolation methods. Could be "linear", "nearest", "cubic", "area".
+            Default: "linear".
+
+    Returns:
+        output tensor.
+
+    Note:
+        Here all available options for params are listed,
+        however it does not mean that you can use all the combinations.
+        On different platforms, different combinations are supported.
     """
     op = builtin.WarpAffine(
         border_mode=border_mode,
@@ -387,8 +392,7 @@ def warp_perspective(
     format: str = "NCHW",
     interp_mode: str = "linear",
 ) -> Tensor:
-    r"""
-    Applies perspective transformation to batched 2D images.
+    r"""Applies perspective transformation to batched 2D images.

     The input images are transformed to the output images by the transformation matrix:

@@ -401,48 +405,49 @@
     Optionally, we can set `mat_idx` to assign different transformations to the same image,
     otherwise the input images and transformations should be one-to-one correnspondence.

-    :param inp: input image.
-    :param mat: `(batch, 3, 3)` transformation matrix.
-    :param out_shape: `(h, w)` size of the output image.
-    :param mat_idx: `(batch, )` image batch idx assigned to each matrix. Default: None
-    :param border_mode: pixel extrapolation method.
-        Default: "replicate". Currently also support "constant", "reflect",
-        "reflect_101", "wrap".
-    :param border_val: value used in case of a constant border. Default: 0
-    :param format: "NHWC" is also supported. Default: "NCHW".
-    :param interp_mode: interpolation methods.
-        Default: "linear". Currently only support "linear" mode.
-    :return: output tensor.
-
-    .. note::
-
-        The transformation matrix is the inverse of that used by `cv2.warpPerspective`.
+    Args:
+        inp: input image.
+        mat: `(batch, 3, 3)` transformation matrix.
+        out_shape: `(h, w)` size of the output image.
+        mat_idx: `(batch, )` image batch idx assigned to each matrix. Default: None
+        border_mode: pixel extrapolation method.
+            Default: "replicate". Currently also support "constant", "reflect",
+            "reflect_101", "wrap".
+        border_val: value used in case of a constant border. Default: 0
+        format: "NHWC" is also supported. Default: "NCHW".
+        interp_mode: interpolation methods.
+            Default: "linear". Currently only support "linear" mode.
+
+    Returns:
+        output tensor.
+
+    Note:
+        The transformation matrix is the inverse of that used by `cv2.warpPerspective`.

     Examples:

-    .. testcode::
-
-        import numpy as np
-        from megengine import tensor
-        import megengine.functional as F
+        .. testcode::

-        inp_shape = (1, 1, 4, 4)
-        x = tensor(np.arange(16, dtype=np.float32).reshape(inp_shape))
-        M_shape = (1, 3, 3)
-        # M defines a translation: dst(1, 1, h, w) = rst(1, 1, h+1, w+1)
-        M = tensor(np.array([[1., 0., 1.],
-                             [0., 1., 1.],
-                             [0., 0., 1.]], dtype=np.float32).reshape(M_shape))
-        out = F.vision.warp_perspective(x, M, (2, 2))
-        print(out.numpy())
+            import numpy as np
+            from megengine import tensor
+            import megengine.functional as F

-    Outputs:
+            inp_shape = (1, 1, 4, 4)
+            x = tensor(np.arange(16, dtype=np.float32).reshape(inp_shape))
+            M_shape = (1, 3, 3)
+            # M defines a translation: dst(1, 1, h, w) = rst(1, 1, h+1, w+1)
+            M = tensor(np.array([[1., 0., 1.],
+                        [0., 1., 1.],
+                        [0., 0., 1.]], dtype=np.float32).reshape(M_shape))
+            out = F.vision.warp_perspective(x, M, (2, 2))
+            print(out.numpy())

-    .. testoutput::
+        Outputs:

-        [[[[ 5.  6.]
-           [ 9. 10.]]]]
+        .. testoutput::
+            [[[[ 5.  6.]
+               [ 9. 10.]]]]
     """
     if inp.dtype == np.float32:
         mat = mat.astype("float32")
@@ -467,48 +472,48 @@ def interpolate(
     mode: str = "bilinear",
     align_corners: Optional[bool] = None,
 ) -> Tensor:
-    r"""
-    Down/up samples the input tensor to either the given size or with the given scale_factor. ``size`` can not coexist with ``scale_factor``.
-
-    :param inp: input tensor.
-    :param size: size of the output tensor. Default: None
-    :param scale_factor: scaling factor of the output tensor. Default: None
-    :param mode: interpolation methods, acceptable values are:
-        "bilinear", "linear", "bicubic" and "nearest". Default: "bilinear"
-    :param align_corners: This only has an effect when `mode`
-        is "bilinear" or "linear". Geometrically, we consider the pixels of the input
-        and output as squares rather than points. If set to ``True``, the input
-        and output tensors are aligned by the center points of their corner
-        pixels, preserving the values at the corner pixels. If set to ``False``,
-        the input and output tensors are aligned by the corner points of their
-        corner pixels, and the interpolation uses edge value padding for
-        out-of-boundary values, making this operation *independent* of input size
-
-    :return: output tensor.
+    r"""Down/up samples the input tensor to either the given size or with the given scale_factor.
``size`` can not coexist with ``scale_factor``. + + Args: + inp: input tensor. + size: size of the output tensor. Default: None + scale_factor: scaling factor of the output tensor. Default: None + mode: interpolation methods, acceptable values are: + "bilinear", "linear", "bicubic" and "nearest". Default: "bilinear" + align_corners: This only has an effect when `mode` + is "bilinear" or "linear". Geometrically, we consider the pixels of the input + and output as squares rather than points. If set to ``True``, the input + and output tensors are aligned by the center points of their corner + pixels, preserving the values at the corner pixels. If set to ``False``, + the input and output tensors are aligned by the corner points of their + corner pixels, and the interpolation uses edge value padding for + out-of-boundary values, making this operation *independent* of input size + + Returns: + output tensor. Examples: - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F + .. testcode:: - x = tensor(np.arange(1, 5, dtype=np.float32).reshape(1, 1, 2, 2)) - out = F.vision.interpolate(x, [4, 4], align_corners=False) - print(out.numpy()) - out2 = F.vision.interpolate(x, scale_factor=2.) - np.testing.assert_allclose(out.numpy(), out2.numpy()) + import numpy as np + from megengine import tensor + import megengine.functional as F - Outputs: + x = tensor(np.arange(1, 5, dtype=np.float32).reshape(1, 1, 2, 2)) + out = F.vision.interpolate(x, [4, 4], align_corners=False) + print(out.numpy()) + out2 = F.vision.interpolate(x, scale_factor=2.) + np.testing.assert_allclose(out.numpy(), out2.numpy()) - .. testoutput:: + Outputs: - [[[[1. 1.25 1.75 2. ] - [1.5 1.75 2.25 2.5 ] - [2.5 2.75 3.25 3.5 ] - [3. 3.25 3.75 4. ]]]] + .. testoutput:: + [[[[1. 1.25 1.75 2. ] + [1.5 1.75 2.25 2.5 ] + [2.5 2.75 3.25 3.5 ] + [3. 3.25 3.75 4. ]]]] """ mode = mode.lower() if mode not in ["bilinear", "linear", "bicubic", "nearest"]: @@ -623,15 +628,15 @@ def interpolate( def nvof(src: Tensor, precision: int = 1) -> Tensor: - r""" - Implements NVIDIA Optical Flow SDK. + r"""Implements NVIDIA Optical Flow SDK. + + Args: + src: input tensor with shape (n, t, h, w, c4) and unit8 dtype. + precision: 0:NV_OF_PERF_LEVEL_SLOW 1:NV_OF_PERF_LEVEL_MEDIUM 2:NV_OF_PERF_LEVEL_FAST. - :src shape: input tensor with shape (n, t, h, w, c4). - :src dtype: uint8. - :param precision: 0:NV_OF_PERF_LEVEL_SLOW 1:NV_OF_PERF_LEVEL_MEDIUM 2:NV_OF_PERF_LEVEL_FAST. - :output shape: ``(n, t-1, (h+out_grid_size-1)//out_grid_size, (w+out_grid_size-1)//out_grid_size, c2)``. - By default, out_grid_size = 4. - :output dtype: int16. + Returns: + output tensor with shape: ``(n, t-1, (h+out_grid_size-1)//out_grid_size, (w+out_grid_size-1)//out_grid_size, c2)``. + By default, out_grid_size = 4. dtype: int16. .. code-block:: python @@ -643,7 +648,6 @@ def nvof(src: Tensor, precision: int = 1) -> Tensor: src = tensor(x) result = F.nn.nvof(src, precision=1) print(result.numpy()) - """ assert src.ndim == 5 and src.shape[4] == 4 diff --git a/imperative/python/megengine/hub/exceptions.py b/imperative/python/megengine/hub/exceptions.py index 07d08b2e..26075297 100644 --- a/imperative/python/megengine/hub/exceptions.py +++ b/imperative/python/megengine/hub/exceptions.py @@ -7,24 +7,24 @@ # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class FetcherError(Exception): - """Base class for fetch related error.""" + r"""Base class for fetch related error.""" class InvalidRepo(FetcherError): - """The repo provided was somehow invalid.""" + r"""The repo provided was somehow invalid.""" class InvalidGitHost(FetcherError): - """The git host provided was somehow invalid.""" + r"""The git host provided was somehow invalid.""" class GitPullError(FetcherError): - """A git pull error occurred.""" + r"""A git pull error occurred.""" class GitCheckoutError(FetcherError): - """A git checkout error occurred.""" + r"""A git checkout error occurred.""" class InvalidProtocol(FetcherError): - """The protocol provided was somehow invalid.""" + r"""The protocol provided was somehow invalid.""" diff --git a/imperative/python/megengine/hub/fetcher.py b/imperative/python/megengine/hub/fetcher.py index 65fcfbe5..9590597b 100644 --- a/imperative/python/megengine/hub/fetcher.py +++ b/imperative/python/megengine/hub/fetcher.py @@ -102,24 +102,18 @@ class GitSSHFetcher(RepoFetcherBase): commit: str = None, silent: bool = True, ) -> str: - """ - Fetches git repo by SSH protocol - - :param git_host: - host address of git repo. - Example: github.com - :param repo_info: - a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional - tag/branch. The default branch is ``master`` if not specified. - Example: ``"brain_sdk/MegBrain[:hub]"`` - :param use_cache: - whether to use locally fetched code or completely re-fetch. - :param commit: - commit id on github or gitlab. - :param silent: - whether to accept the stdout and stderr of the subprocess with PIPE, instead of - displaying on the screen. - :return: + """Fetches git repo by SSH protocol + + Args: + git_host: host address of git repo. Eg: github.com + repo_info: a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional + tag/branch. The default branch is ``master`` if not specified. Eg: ``"brain_sdk/MegBrain[:hub]"`` + use_cache: whether to use locally fetched code or completely re-fetch. + commit: commit id on github or gitlab. + silent: whether to accept the stdout and stderr of the subprocess with PIPE, instead of + displaying on the screen. + + Returns: directory where the repo code is stored. """ if not cls._check_git_host(git_host): @@ -217,24 +211,19 @@ class GitHTTPSFetcher(RepoFetcherBase): commit: str = None, silent: bool = True, ) -> str: - """ - Fetches git repo by HTTPS protocol. - - :param git_host: - host address of git repo. - Example: github.com - :param repo_info: - a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional - tag/branch. The default branch is ``master`` if not specified. - Example: ``"brain_sdk/MegBrain[:hub]"`` - :param use_cache: - whether to use locally cached code or completely re-fetch. - :param commit: - commit id on github or gitlab. - :param silent: - whether to accept the stdout and stderr of the subprocess with PIPE, instead of - displaying on the screen. - :return: + """Fetches git repo by HTTPS protocol. + + Args: + git_host: host address of git repo. Eg: github.com + repo_info: a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional + tag/branch. The default branch is ``master`` if not specified. Eg: ``"brain_sdk/MegBrain[:hub]"`` + use_cache: whether to use locally cached code or completely re-fetch. + commit: commit id on github or gitlab. 
+ silent: whether to accept the stdout and stderr of the subprocess with PIPE, instead of + displaying on the screen. + + + Returns: directory where the repo code is stored. """ if not cls._check_git_host(git_host): diff --git a/imperative/python/megengine/hub/hub.py b/imperative/python/megengine/hub/hub.py index aefa8307..953714bb 100644 --- a/imperative/python/megengine/hub/hub.py +++ b/imperative/python/megengine/hub/hub.py @@ -43,9 +43,7 @@ PROTOCOLS = { def _get_megengine_home() -> str: - """ - MGE_HOME setting complies with the XDG Base Directory Specification - """ + r"""MGE_HOME setting complies with the XDG Base Directory Specification""" megengine_home = os.path.expanduser( os.getenv( ENV_MGE_HOME, @@ -95,24 +93,18 @@ def _init_hub( commit: str = None, protocol: str = DEFAULT_PROTOCOL, ): - """ - Imports hubmodule like python import. - - :param repo_info: - a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional - tag/branch. The default branch is ``master`` if not specified. - Example: ``"brain_sdk/MegBrain[:hub]"`` - :param git_host: - host address of git repo. - Example: github.com - :param use_cache: - whether to use locally cached code or completely re-fetch. - :param commit: - commit id on github or gitlab. - :param protocol: - which protocol to use to get the repo, and HTTPS protocol only supports public repo on github. - The value should be one of HTTPS, SSH. - :return: + r"""Imports hubmodule like python import. + + Args: + repo_info: a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional + tag/branch. The default branch is ``master`` if not specified. Eg: ``"brain_sdk/MegBrain[:hub]"`` + git_host: host address of git repo. Eg: github.com + use_cache: whether to use locally cached code or completely re-fetch. + commit: commit id on github or gitlab. + protocol: which protocol to use to get the repo, and HTTPS protocol only supports public repo on github. + The value should be one of HTTPS, SSH. + + Returns: a python module. """ cache_dir = os.path.expanduser(os.path.join(_get_megengine_home(), "hub")) @@ -139,24 +131,18 @@ def list( commit: str = None, protocol: str = DEFAULT_PROTOCOL, ) -> List[str]: - """ - Lists all entrypoints available in repo hubconf. - - :param repo_info: - a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional - tag/branch. The default branch is ``master`` if not specified. - Example: ``"brain_sdk/MegBrain[:hub]"`` - :param git_host: - host address of git repo. - Example: github.com - :param use_cache: - whether to use locally cached code or completely re-fetch. - :param commit: - commit id on github or gitlab. - :param protocol: - which protocol to use to get the repo, and HTTPS protocol only supports public repo on github. - The value should be one of HTTPS, SSH. - :return: + r"""Lists all entrypoints available in repo hubconf. + + Args: + repo_info: a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional + tag/branch. The default branch is ``master`` if not specified. Eg: ``"brain_sdk/MegBrain[:hub]"`` + git_host: host address of git repo. Eg: github.com + use_cache: whether to use locally cached code or completely re-fetch. + commit: commit id on github or gitlab. + protocol: which protocol to use to get the repo, and HTTPS protocol only supports public repo on github. + The value should be one of HTTPS, SSH. + + Returns: all entrypoint names of the model. 
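A hedged usage sketch (not part of the original docstring): the repo name follows the ``repo_owner/repo_name[:branch]`` format described above, and network access plus a valid ``hubconf.py`` in that repo are assumed.

.. code-block:: python

    import megengine.hub as hub

    # List the entrypoints exposed by the repo's hubconf.py.
    entrypoints = hub.list("brain_sdk/MegBrain:hub", git_host="github.com")
    print(entrypoints)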
""" hubmodule = _init_hub(repo_info, git_host, use_cache, commit, protocol) @@ -178,26 +164,19 @@ def load( protocol: str = DEFAULT_PROTOCOL, **kwargs ) -> Any: - """ - Loads model from github or gitlab repo, with pretrained weights. - - :param repo_info: - a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional - tag/branch. The default branch is ``master`` if not specified. - Example: ``"brain_sdk/MegBrain[:hub]"`` - :param entry: - an entrypoint defined in hubconf. - :param git_host: - host address of git repo. - Example: github.com - :param use_cache: - whether to use locally cached code or completely re-fetch. - :param commit: - commit id on github or gitlab. - :param protocol: - which protocol to use to get the repo, and HTTPS protocol only supports public repo on github. - The value should be one of HTTPS, SSH. - :return: + r"""Loads model from github or gitlab repo, with pretrained weights. + + Args: + repo_info: a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional + tag/branch. The default branch is ``master`` if not specified. Eg: ``"brain_sdk/MegBrain[:hub]"`` + entry: an entrypoint defined in hubconf. + git_host: host address of git repo. Eg: github.com + use_cache: whether to use locally cached code or completely re-fetch. + commit: commit id on github or gitlab. + protocol: which protocol to use to get the repo, and HTTPS protocol only supports public repo on github. + The value should be one of HTTPS, SSH. + + Returns: a single model with corresponding pretrained weights. """ hubmodule = _init_hub(repo_info, git_host, use_cache, commit, protocol) @@ -219,30 +198,23 @@ def help( commit: str = None, protocol: str = DEFAULT_PROTOCOL, ) -> str: - """ - This function returns docstring of entrypoint ``entry`` by following steps: + r"""This function returns docstring of entrypoint ``entry`` by following steps: 1. Pull the repo code specified by git and repo_info. 2. Load the entry defined in repo's hubconf.py 3. Return docstring of function entry. - :param repo_info: - a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional - tag/branch. The default branch is ``master`` if not specified. - Example: ``"brain_sdk/MegBrain[:hub]"`` - :param entry: - an entrypoint defined in hubconf.py - :param git_host: - host address of git repo. - Example: github.com - :param use_cache: - whether to use locally cached code or completely re-fetch. - :param commit: - commit id on github or gitlab. - :param protocol: - which protocol to use to get the repo, and HTTPS protocol only supports public repo on github. - The value should be one of HTTPS, SSH. - :return: + Args: + repo_info: a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional + tag/branch. The default branch is ``master`` if not specified. Eg: ``"brain_sdk/MegBrain[:hub]"`` + entry: an entrypoint defined in hubconf.py + git_host: host address of git repo. Eg: github.com + use_cache: whether to use locally cached code or completely re-fetch. + commit: commit id on github or gitlab. + protocol: which protocol to use to get the repo, and HTTPS protocol only supports public repo on github. + The value should be one of HTTPS, SSH. + + Returns: docstring of entrypoint ``entry``. """ hubmodule = _init_hub(repo_info, git_host, use_cache, commit, protocol) @@ -255,16 +227,17 @@ def help( def load_serialized_obj_from_url(url: str, model_dir=None) -> Any: - """ - Loads MegEngine serialized object from the given URL. 
+ """Loads MegEngine serialized object from the given URL. If the object is already present in ``model_dir``, it's deserialized and returned. If no ``model_dir`` is specified, it will be ``MGE_HOME/serialized``. - :param url: url to serialized object. - :param model_dir: dir to cache target serialized file. + Args: + url: url to serialized object. + model_dir: dir to cache target serialized file. - :return: loaded object. + Returns: + loaded object. """ if model_dir is None: model_dir = os.path.join(_get_megengine_home(), "serialized") @@ -297,8 +270,7 @@ def load_serialized_obj_from_url(url: str, model_dir=None) -> Any: class pretrained: - r""" - Decorator which helps to download pretrained weights from the given url. + r"""Decorator which helps to download pretrained weights from the given url. For example, we can decorate a resnet18 function as follows @@ -306,10 +278,10 @@ class pretrained: @hub.pretrained("https://url/to/pretrained_resnet18.pkl") def resnet18(**kwargs): - return ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) - When decorated function is called with ``pretrained=True``, MegEngine will automatically - download and fill the returned model with pretrained weights. + Returns: + When decorated function is called with ``pretrained=True``, MegEngine will automatically + download and fill the returned model with pretrained weights. """ def __init__(self, url): diff --git a/imperative/python/megengine/hub/tools.py b/imperative/python/megengine/hub/tools.py index 524db01a..eb4db06f 100644 --- a/imperative/python/megengine/hub/tools.py +++ b/imperative/python/megengine/hub/tools.py @@ -14,11 +14,11 @@ from typing import Iterator def load_module(name: str, path: str) -> types.ModuleType: - """ - Loads module specified by name and path. + r"""Loads module specified by name and path. - :param name: module name. - :param path: module path. + Args: + name: module name. + path: module path. """ spec = importlib.util.spec_from_file_location(name, path) module = importlib.util.module_from_spec(spec) @@ -27,20 +27,20 @@ def load_module(name: str, path: str) -> types.ModuleType: def check_module_exists(module: str) -> bool: - """ - Checks whether python module exists or not. + r"""Checks whether python module exists or not. - :param module: name of module. + Args: + module: name of module. """ return importlib.util.find_spec(module) is not None @contextmanager def cd(target: str) -> Iterator[None]: - """ - Changes current directory to target. + """Changes current directory to target. - :param target: target directory. + Args: + target: target directory. """ prev = os.getcwd() os.chdir(os.path.expanduser(target)) diff --git a/imperative/python/megengine/jit/graph_opt_config.py b/imperative/python/megengine/jit/graph_opt_config.py index 9cea66b5..47f17fa7 100644 --- a/imperative/python/megengine/jit/graph_opt_config.py +++ b/imperative/python/megengine/jit/graph_opt_config.py @@ -9,12 +9,12 @@ class GraphOptimizationConfig: - r""" - Configuration for graph optimization: False for OFF, True for ON. The default value + r"""Configuration for graph optimization: False for OFF, True for ON. The default value None means that opt_level will decide whther this optimization will be applied or not. 
- :param jit_fuse_dimshuffle: whether to fuse dimshuffle in JIT optimization - :param jit_fuse_reduce: whether to fuse reduce in JIT optimization + Args: + jit_fuse_dimshuffle: whether to fuse dimshuffle in JIT optimization + jit_fuse_reduce: whether to fuse reduce in JIT optimization """ def __init__(self): diff --git a/imperative/python/megengine/jit/sublinear_memory_config.py b/imperative/python/megengine/jit/sublinear_memory_config.py index a2ce3100..c7c20187 100644 --- a/imperative/python/megengine/jit/sublinear_memory_config.py +++ b/imperative/python/megengine/jit/sublinear_memory_config.py @@ -10,26 +10,26 @@ from ..device import get_device_count class SublinearMemoryConfig: - r""" - Configuration for sublinear memory optimization. - - :param thresh_nr_try: number of samples both for searching in linear space - and around current thresh in sublinear memory optimization. Default: 10. - It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_THRESH_NR_TRY'. - :param genetic_nr_iter: number of iterations to find the best checkpoints in genetic algorithm. - Default: 0. - It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_GENETIC_NR_ITER'. - :param genetic_pool_size: number of samples for the crossover random selection - during genetic optimization. Default: 20. - It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_GENETIC_POOL_SIZE'. - :param lb_memory_mb: memory lower bound of bottleneck size in MB for sublinear memory optimization. - It can be used to perform manual tradeoff between memory and speed. Default: 0. - It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_LOWER_BOUND_MB'. - :param num_worker: number of thread workers to search the optimum checkpoints - in sublinear memory optimization. Default: half of cpu number in the system. - Note: the value must be greater or equal to one. - It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_WORKERS'. + r"""Configuration for sublinear memory optimization. + Args: + thresh_nr_try: number of samples both for searching in linear space + and around current thresh in sublinear memory optimization. Default: 10. + It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_THRESH_NR_TRY'. + genetic_nr_iter: number of iterations to find the best checkpoints in genetic algorithm. + Default: 0. + It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_GENETIC_NR_ITER'. + genetic_pool_size: number of samples for the crossover random selection + during genetic optimization. Default: 20. + It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_GENETIC_POOL_SIZE'. + lb_memory_mb: memory lower bound of bottleneck size in MB for sublinear memory optimization. + It can be used to perform manual tradeoff between memory and speed. Default: 0. + It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_LOWER_BOUND_MB'. + num_worker: number of thread workers to search the optimum checkpoints + in sublinear memory optimization. Default: half of cpu number in the system. + Note: the value must be greater or equal to one. + It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_WORKERS'. + Note that the environmental variable MGB_COMP_GRAPH_OPT must be set to 'enable_sublinear_memory_opt=1' in order for the above environmental variable to be effective. 
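A hedged usage sketch (keyword names are taken from the parameter list above; passing the config to :class:`~.trace` via ``sublinear_memory_config`` follows the trace docstring below):

.. code-block:: python

    import megengine.jit as jit

    # Search a bit harder for checkpoints and keep the default memory bound.
    config = jit.SublinearMemoryConfig(genetic_nr_iter=50, lb_memory_mb=0)

    @jit.trace(symbolic=True, sublinear_memory_config=config)
    def step(x):
        return x + 1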
""" diff --git a/imperative/python/megengine/jit/tracing.py b/imperative/python/megengine/jit/tracing.py index 57a59624..4602cc32 100644 --- a/imperative/python/megengine/jit/tracing.py +++ b/imperative/python/megengine/jit/tracing.py @@ -120,21 +120,21 @@ _io_op_types = {AssertEqual, CollectiveComm, RemoteSend, RemoteRecv} class trace: - """ - Wraps a callable and provide: + """Wraps a callable and provide: * tracing via :meth:`.trace` and :meth:`.dump` * accelerated evalutaion via :meth:`.__call__` - :param function: the function will be traced. - :param symbolic: whether to apply symbolic execution for tracing. Default: False - :param capture_as_const: capture global vars or closures as const value. Default: False - :param sublinear_memory_config: configuration for sublinear memory optimization. - If not None, it enables sublinear memory optimization with given setting. - :param profiling: whether to profile compiled trace. Default: False - :param opt_level: optimization level for compiling trace. Default: 2 - :param graph_opt_config: configuration for graph optimization. Default: None - :param symbolic_shape: whether to use symbolic shape for tracing. Default: True + Args: + function: the function will be traced. + symbolic: whether to apply symbolic execution for tracing. Default: False + capture_as_const: capture global vars or closures as const value. Default: False + sublinear_memory_config: configuration for sublinear memory optimization. + If not None, it enables sublinear memory optimization with given setting. + profiling: whether to profile compiled trace. Default: False + opt_level: optimization level for compiling trace. Default: 2 + graph_opt_config: configuration for graph optimization. Default: None + symbolic_shape: whether to use symbolic shape for tracing. Default: True """ def __new__(cls, *args, **kwargs): @@ -696,75 +696,74 @@ class trace: enable_metadata: bool = True, **kwargs ): - r""" - Serializes trace to file system. - - :param file: output file, could be file object or filename. - :param arg_names: names of the input tensors in the traced function. - :param output_names: names of the output tensors in the traced function, - use the default name if not specified. - :param append: whether output is appended to ``file``. - Only works when ``file`` is str. - :param keep_var_name: level for keeping variable names: - - * 0: none of the names are kept - * 1: (default)keep names of output vars - * 2: keep names of all (output and internal) vars - :param keep_opr_name: whether to keep operator names. - :param keep_param_name: whether to keep param names, so param values can be - easily manipulated after loading model - :param keep_opr_priority: whether to keep priority setting for operators - :param strip_info_file: a string for path or a file handler. if is not None, - then the dump information for code strip would be written to ``strip_info_file`` - :param append_json: will be check when `strip_info_file` is not None. if set - true, the information for code strip will be append to strip_info_file. - if set false, will rewrite strip_info_file - :param optimize_for_inference: enbale optmizations, - will skip all optimize options if this is False. Default: True - :param user_info: any type object, which will be pickled to bytes. - :param enable_metadata: whether to save metadata into output file. - - :Keyword Arguments: - - * enable_io16xc32 -- - whether to use float16 for I/O between oprs and use - float32 as internal computation precision. 
Note the output var would be - changed to float16. - * enable_ioc16 -- - whether to use float16 for both I/O and computation - precision. - - * enable_hwcd4 -- - whether to use NHWCD4 data layout. This is faster on some - OpenCL backend. - * enable_nchw88 -- - whether to use NCHW88 data layout, currently - used in X86 AVX backend. - * enable_nchw44 -- - whether to use NCHW44 data layout, currently - used in arm backend. - * enable_nchw44_dot -- - whether to use NCHW44_dot data layout, currently - used in armv8.2+dotprod backend. - * enable_nchw4 -- - whether to use NCHW4 data layout, currently - used in nvidia backend(based on cudnn). - * enable_nchw32 -- - whether to use NCHW32 data layout, currently - used in nvidia backend with tensorcore(based on cudnn). - * enable_chwn4 -- - whether to use CHWN4 data layout, currently - used in nvidia backend with tensorcore. - * enable_nchw64 -- - whether to use NCHW64 data layout, used for fast int4 - support on Nvidia GPU. - - * enable_fuse_conv_bias_nonlinearity: whether to fuse conv+bias+nonlinearty - into one opr. - * enable_fuse_conv_bias_with_z: whether to fuse conv_bias with z - input for inference on nvidia backend(this optimization pass will - result in mismatch of the precision of output of training and - inference) + r"""Serializes trace to file system. + + Args: + file: output file, could be file object or filename. + arg_names: names of the input tensors in the traced function. + output_names: names of the output tensors in the traced function, + use the default name if not specified. + append: whether output is appended to ``file``. + Only works when ``file`` is str. + keep_var_name: level for keeping variable names: + + * 0: none of the names are kept + * 1: (default)keep names of output vars + * 2: keep names of all (output and internal) vars + + keep_opr_name: whether to keep operator names. + keep_param_name: whether to keep param names, so param values can be + easily manipulated after loading model + keep_opr_priority: whether to keep priority setting for operators + strip_info_file: a string for path or a file handler. if is not None, + then the dump information for code strip would be written to ``strip_info_file`` + append_json: will be check when `strip_info_file` is not None. if set + true, the information for code strip will be append to strip_info_file. + if set false, will rewrite strip_info_file + optimize_for_inference: enbale optmizations, + will skip all optimize options if this is False. Default: True + user_info: any type object, which will be pickled to bytes. + enable_metadata: whether to save metadata into output file. + + Keyword Arguments: + + * enable_io16xc32 -- + whether to use float16 for I/O between oprs and use + float32 as internal computation precision. Note the output var would be + changed to float16. + * enable_ioc16 -- + whether to use float16 for both I/O and computation + precision. + * enable_hwcd4 -- + whether to use NHWCD4 data layout. This is faster on some + OpenCL backend. + * enable_nchw88 -- + whether to use NCHW88 data layout, currently + used in X86 AVX backend. + * enable_nchw44 -- + whether to use NCHW44 data layout, currently + used in arm backend. + * enable_nchw44_dot -- + whether to use NCHW44_dot data layout, currently + used in armv8.2+dotprod backend. + * enable_nchw4 -- + whether to use NCHW4 data layout, currently + used in nvidia backend(based on cudnn). 
+ * enable_nchw32 -- + whether to use NCHW32 data layout, currently + used in nvidia backend with tensorcore(based on cudnn). + * enable_chwn4 -- + whether to use CHWN4 data layout, currently + used in nvidia backend with tensorcore. + * enable_nchw64 -- + whether to use NCHW64 data layout, used for fast int4 + support on Nvidia GPU. + * enable_fuse_conv_bias_nonlinearity: whether to fuse conv+bias+nonlinearty + into one opr. + * enable_fuse_conv_bias_with_z: whether to fuse conv_bias with z + input for inference on nvidia backend(this optimization pass will + result in mismatch of the precision of output of training and + inference) """ if not self._capture_as_const: raise ValueError( @@ -1033,10 +1032,10 @@ class trace: ) def get_profile(self): - """ - Get profiling result for compiled trace. + r"""Get profiling result for compiled trace. - :return: a json compatible object. + Return: + a json compatible object. """ if not self._profiler: raise RuntimeError("trace is not set with profiling=True") @@ -1050,9 +1049,7 @@ class trace: class CompiledTensorProxy: - """ - Duck-typed RawTensor - """ + r"""Duck-typed RawTensor""" def __init__(self, handle): self.__handle = handle diff --git a/imperative/python/megengine/logger.py b/imperative/python/megengine/logger.py index 046a3248..f60fefe8 100644 --- a/imperative/python/megengine/logger.py +++ b/imperative/python/megengine/logger.py @@ -17,14 +17,11 @@ _default_level = logging.getLevelName(_default_level_name.upper()) def set_log_file(fout, mode="a"): - r""" - Sets log output file. - - :type fout: str or file-like - :param fout: file-like object that supports write and flush, or string for - the filename - :type mode: str - :param mode: specify the mode to open log file if *fout* is a string + r"""Sets log output file. + + Args: + fout: file-like object that supports write and flush, or string for the filename + mode: specify the mode to open log file if *fout* is a string """ if isinstance(fout, str): fout = open(fout, mode) @@ -39,45 +36,31 @@ class MegEngineLogFormatter(logging.Formatter): max_lines = 256 def _color_exc(self, msg): - r""" - Sets the color of message as the execution type. - """ + r"""Sets the color of message as the execution type.""" return "\x1b[34m{}\x1b[0m".format(msg) def _color_dbg(self, msg): - r""" - Sets the color of message as the debugging type. - """ + r"""Sets the color of message as the debugging type.""" return "\x1b[36m{}\x1b[0m".format(msg) def _color_warn(self, msg): - r""" - Sets the color of message as the warning type. - """ + r"""Sets the color of message as the warning type.""" return "\x1b[1;31m{}\x1b[0m".format(msg) def _color_err(self, msg): - r""" - Sets the color of message as the error type. - """ + r"""Sets the color of message as the error type.""" return "\x1b[1;4;31m{}\x1b[0m".format(msg) def _color_omitted(self, msg): - r""" - Sets the color of message as the omitted type. - """ + r"""Sets the color of message as the omitted type.""" return "\x1b[35m{}\x1b[0m".format(msg) def _color_normal(self, msg): - r""" - Sets the color of message as the normal type. - """ + r"""Sets the color of message as the normal type.""" return msg def _color_date(self, msg): - r""" - Sets the color of message the same as date. 
- """ + r"""Sets the color of message the same as date.""" return "\x1b[32m{}\x1b[0m".format(msg) def format(self, record): @@ -150,9 +133,7 @@ class MegEngineLogFormatter(logging.Formatter): def get_logger(name=None, formatter=MegEngineLogFormatter): - r""" - Gets megengine logger with given name. - """ + r"""Gets megengine logger with given name.""" logger = logging.getLogger(name) if getattr(logger, "_init_done__", None): @@ -170,12 +151,11 @@ def get_logger(name=None, formatter=MegEngineLogFormatter): def set_log_level(level, update_existing=True): - """ - Sets default logging level. + r"""Sets default logging level. - :type level: int e.g. logging.INFO - :param level: loggin level given by python :mod:`logging` module - :param update_existing: whether to update existing loggers + Args: + level: loggin level given by python :mod:`logging` module + update_existing: whether to update existing loggers """ global _default_level # pylint: disable=global-statement _default_level = level @@ -202,12 +182,13 @@ try: _imperative_rt_logger.set_log_handler(_megbrain_logger) def set_mgb_log_level(level): - r""" - Sets megbrain log level + r"""Sets megbrain log level + + Args: + level: new log level - :type level: int e.g. logging.INFO - :param level: new log level - :return: original log level + Returns: + original log level """ _megbrain_logger.setLevel(level) if level == logging.getLevelName("ERROR"): @@ -235,11 +216,10 @@ except ImportError as exc: @contextlib.contextmanager def replace_mgb_log_level(level): - r""" - Replaces megbrain log level in a block and restore after exiting. + r"""Replaces megbrain log level in a block and restore after exiting. - :type level: int e.g. logging.INFO - :param level: new log level + Args: + level: new log level """ old = set_mgb_log_level(level) try: @@ -249,8 +229,6 @@ def replace_mgb_log_level(level): def enable_debug_log(): - r""" - Sets logging level to debug for all components. - """ + r"""Sets logging level to debug for all components.""" set_log_level(logging.DEBUG) set_mgb_log_level(logging.DEBUG) diff --git a/imperative/python/megengine/module/activation.py b/imperative/python/megengine/module/activation.py index 659d56b4..f2f7c9ee 100644 --- a/imperative/python/megengine/module/activation.py +++ b/imperative/python/megengine/module/activation.py @@ -14,8 +14,7 @@ from .module import Module class Softmax(Module): - r""" - Applies a softmax function. Softmax is defined as: + r"""Applies a softmax function. Softmax is defined as: .. math:: \text{Softmax}(x_{i}) = \frac{exp(x_i)}{\sum_j exp(x_j)} @@ -23,29 +22,29 @@ class Softmax(Module): It is applied to all elements along axis, and rescales elements so that they stay in the range `[0, 1]` and sum to 1. - :param axis: Along which axis softmax will be applied. By default, - softmax will apply along the highest ranked axis. + Args: + axis: Along which axis softmax will be applied. By default, + softmax will apply along the highest ranked axis. Examples: - .. testcode:: + .. testcode:: - import numpy as np - import megengine as mge - import megengine.module as M + import numpy as np + import megengine as mge + import megengine.module as M - data = mge.tensor(np.array([-2,-1,0,1,2]).astype(np.float32)) - softmax = M.Softmax() - output = softmax(data) - with np.printoptions(precision=6): - print(output.numpy()) - - Outputs: + data = mge.tensor(np.array([-2,-1,0,1,2]).astype(np.float32)) + softmax = M.Softmax() + output = softmax(data) + with np.printoptions(precision=6): + print(output.numpy()) - .. 
testoutput:: + Outputs: - [0.011656 0.031685 0.086129 0.234122 0.636409] + .. testoutput:: + [0.011656 0.031685 0.086129 0.234122 0.636409] """ def __init__(self, axis=None, **kwargs): @@ -60,32 +59,31 @@ class Softmax(Module): class Sigmoid(Module): - r""" - Applies the element-wise function: + r"""Applies the element-wise function: .. math:: + \text{Sigmoid}(x) = \frac{1}{1 + \exp(-x)} Examples: - .. testcode:: - - import numpy as np - import megengine as mge - import megengine.module as M + .. testcode:: - data = mge.tensor(np.array([-2,-1,0,1,2,]).astype(np.float32)) - sigmoid = M.Sigmoid() - output = sigmoid(data) - with np.printoptions(precision=6): - print(output.numpy()) + import numpy as np + import megengine as mge + import megengine.module as M - Outputs: + data = mge.tensor(np.array([-2,-1,0,1,2,]).astype(np.float32)) + sigmoid = M.Sigmoid() + output = sigmoid(data) + with np.printoptions(precision=6): + print(output.numpy()) - .. testoutput:: + Outputs: - [0.119203 0.268941 0.5 0.731059 0.880797] + .. testoutput:: + [0.119203 0.268941 0.5 0.731059 0.880797] """ def forward(self, inputs): @@ -93,32 +91,31 @@ class Sigmoid(Module): class SiLU(Module): - r""" - Applies the element-wise function: + r"""Applies the element-wise function: .. math:: + \text{SiLU}(x) = \frac{x}{1 + \exp(-x)} Examples: - .. testcode:: + .. testcode:: - import numpy as np - import megengine as mge - import megengine.module as M - - data = mge.tensor(np.array([-2,-1,0,1,2,]).astype(np.float32)) - silu = M.SiLU() - output = silu(data) - with np.printoptions(precision=6): - print(output.numpy()) + import numpy as np + import megengine as mge + import megengine.module as M - Outputs: + data = mge.tensor(np.array([-2,-1,0,1,2,]).astype(np.float32)) + silu = M.SiLU() + output = silu(data) + with np.printoptions(precision=6): + print(output.numpy()) - .. testoutput:: + Outputs: - [-0.238406 -0.268941 0. 0.731059 1.761594] + .. testoutput:: + [-0.238406 -0.268941 0. 0.731059 1.761594] """ def forward(self, inputs): @@ -126,8 +123,7 @@ class SiLU(Module): class GELU(Module): - r""" - Applies the element-wise function: + r"""Applies the element-wise function: .. math:: \text{GELU}(x) = x\Phi(x) @@ -136,24 +132,23 @@ class GELU(Module): Examples: - .. testcode:: + .. testcode:: - import numpy as np - import megengine as mge - import megengine.module as M + import numpy as np + import megengine as mge + import megengine.module as M - data = mge.tensor(np.array([-2,-1,0,1,2,]).astype(np.float32)) - gelu = M.GELU() - output = gelu(data) - with np.printoptions(precision=4): - print(output.numpy()) - - Outputs: + data = mge.tensor(np.array([-2,-1,0,1,2,]).astype(np.float32)) + gelu = M.GELU() + output = gelu(data) + with np.printoptions(precision=4): + print(output.numpy()) - .. testoutput:: + Outputs: - [-0.0455 -0.1587 0. 0.8413 1.9545] + .. testoutput:: + [-0.0455 -0.1587 0. 0.8413 1.9545] """ def forward(self, inputs): @@ -161,31 +156,29 @@ class GELU(Module): class ReLU(Module): - r""" - Applies the element-wise function: + r"""Applies the element-wise function: .. math:: \text{ReLU}(x) = \max(x, 0) Examples: - .. testcode:: - - import numpy as np - import megengine as mge - import megengine.module as M - data = mge.tensor(np.array([-2,-1,0,1,2,]).astype(np.float32)) - relu = M.ReLU() - output = relu(data) - with np.printoptions(precision=6): - print(output.numpy()) + .. 
testcode:: - Outputs: + import numpy as np + import megengine as mge + import megengine.module as M + data = mge.tensor(np.array([-2,-1,0,1,2,]).astype(np.float32)) + relu = M.ReLU() + output = relu(data) + with np.printoptions(precision=6): + print(output.numpy()) - .. testoutput:: + Outputs: - [0. 0. 0. 1. 2.] + .. testoutput:: + [0. 0. 0. 1. 2.] """ def forward(self, x): @@ -193,8 +186,7 @@ class ReLU(Module): class PReLU(Module): - r""" - Applies the element-wise function: + r"""Applies the element-wise function: .. math:: \text{PReLU}(x) = \max(0,x) + a * \min(0,x) @@ -211,28 +203,28 @@ class PReLU(Module): Here :math:`a` is a learnable parameter. When called without arguments, `PReLU()` uses a single paramter :math:`a` across all input channel. If called with `PReLU(num_of_channels)`, each input channle will has it's own :math:`a`. - :param num_parameters: number of :math:`a` to learn, there is only two - values are legitimate: 1, or the number of channels at input. Default: 1 - :param init: the initial value of :math:`a`. Default: 0.25 + Args: + num_parameters: number of :math:`a` to learn, there is only two + values are legitimate: 1, or the number of channels at input. Default: 1 + init: the initial value of :math:`a`. Default: 0.25 Examples: - .. testcode:: + .. testcode:: - import numpy as np - import megengine as mge - import megengine.module as M - data = mge.tensor(np.array([-1.2, -3.7, 2.7]).astype(np.float32)) - prelu = M.PReLU() - output = prelu(data) - print(output.numpy()) - - Outputs: + import numpy as np + import megengine as mge + import megengine.module as M + data = mge.tensor(np.array([-1.2, -3.7, 2.7]).astype(np.float32)) + prelu = M.PReLU() + output = prelu(data) + print(output.numpy()) - .. testoutput:: + Outputs: - [-0.3 -0.925 2.7 ] + .. testoutput:: + [-0.3 -0.925 2.7 ] """ def __init__(self, num_parameters: int = 1, init: float = 0.25, **kwargs): @@ -257,8 +249,7 @@ class PReLU(Module): class LeakyReLU(Module): - r""" - Applies the element-wise function: + r"""Applies the element-wise function: .. math:: \text{LeakyReLU}(x) = \max(0,x) + negative\_slope \times \min(0,x) @@ -274,23 +265,22 @@ class LeakyReLU(Module): Examples: - .. testcode:: + .. testcode:: - import numpy as np - import megengine as mge - import megengine.module as M - data = mge.tensor(np.array([-8, -12, 6, 10]).astype(np.float32)) + import numpy as np + import megengine as mge + import megengine.module as M + data = mge.tensor(np.array([-8, -12, 6, 10]).astype(np.float32)) - leakyrelu = M.LeakyReLU(0.01) - output = leakyrelu(data) - print(output.numpy()) - - Outputs: + leakyrelu = M.LeakyReLU(0.01) + output = leakyrelu(data) + print(output.numpy()) - .. testoutput:: + Outputs: - [-0.08 -0.12 6. 10. ] + .. testoutput:: + [-0.08 -0.12 6. 10. ] """ def __init__(self, negative_slope: float = 0.01, **kwargs): diff --git a/imperative/python/megengine/module/adaptive_pooling.py b/imperative/python/megengine/module/adaptive_pooling.py index 44e33f43..e6c9fc6c 100644 --- a/imperative/python/megengine/module/adaptive_pooling.py +++ b/imperative/python/megengine/module/adaptive_pooling.py @@ -25,8 +25,7 @@ class _AdaptivePoolNd(Module): class AdaptiveMaxPool2d(_AdaptivePoolNd): - r""" - Applies a 2D max adaptive pooling over an input. + r"""Applies a 2D max adaptive pooling over an input. 
For instance, given an input of the size :math:`(N, C, H, W)` and an output shape :math:`(OH, OW)`, this layer generates the output of @@ -40,29 +39,30 @@ class AdaptiveMaxPool2d(_AdaptivePoolNd): \end{aligned} ``kernel_size`` and ``stride`` can be inferred from input shape and out shape: + * padding: (0, 0) * stride: (floor(IH / OH), floor(IW / OW)) * kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w) Examples: - .. testcode:: + .. testcode:: - import numpy as np - import megengine as mge - import megengine.module as M + import numpy as np + import megengine as mge + import megengine.module as M - m = M.AdaptiveMaxPool2d((2, 2)) - inp = mge.tensor(np.arange(0, 16).astype("float32").reshape(1, 1, 4, 4)) - oup = m(inp) - print(oup.numpy()) + m = M.AdaptiveMaxPool2d((2, 2)) + inp = mge.tensor(np.arange(0, 16).astype("float32").reshape(1, 1, 4, 4)) + oup = m(inp) + print(oup.numpy()) - Outputs: + Outputs: - .. testoutput:: + .. testoutput:: - [[[[ 5. 7.] - [13. 15.]]]] + [[[[ 5. 7.] + [13. 15.]]]] """ @@ -71,8 +71,7 @@ class AdaptiveMaxPool2d(_AdaptivePoolNd): class AdaptiveAvgPool2d(_AdaptivePoolNd): - r""" - Applies a 2D average pooling over an input. + r"""Applies a 2D average pooling over an input. For instance, given an input of the size :math:`(N, C, H, W)` and an output shape :math:`(OH, OW)`, this layer generates the output of @@ -84,29 +83,30 @@ class AdaptiveAvgPool2d(_AdaptivePoolNd): input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n) ``kernel_size`` and ``stride`` can be inferred from input shape and out shape: + * padding: (0, 0) * stride: (floor(IH / OH), floor(IW / OW)) * kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w) Examples: - .. testcode:: + .. testcode:: - import numpy as np - import megengine as mge - import megengine.module as M + import numpy as np + import megengine as mge + import megengine.module as M - m = M.AdaptiveAvgPool2d((2, 2)) - inp = mge.tensor(np.arange(0, 16).astype("float32").reshape(1, 1, 4, 4)) - oup = m(inp) - print(oup.numpy()) + m = M.AdaptiveAvgPool2d((2, 2)) + inp = mge.tensor(np.arange(0, 16).astype("float32").reshape(1, 1, 4, 4)) + oup = m(inp) + print(oup.numpy()) - Outputs: + Outputs: - .. testoutput:: + .. testoutput:: - [[[[ 2.5 4.5] - [10.5 12.5]]]] + [[[[ 2.5 4.5] + [10.5 12.5]]]] """ diff --git a/imperative/python/megengine/module/batch_matmul_activation.py b/imperative/python/megengine/module/batch_matmul_activation.py index 301f0a72..b6892fd0 100644 --- a/imperative/python/megengine/module/batch_matmul_activation.py +++ b/imperative/python/megengine/module/batch_matmul_activation.py @@ -14,9 +14,7 @@ from .module import Module class BatchMatMulActivation(Module): - r""" - Batched :func:`~.matmul` with activation(only :func:`~.relu` supported), no transpose anywhere. - """ + r"""Batched :func:`~.matmul` with activation(only :func:`~.relu` supported), no transpose anywhere.""" def __init__( self, diff --git a/imperative/python/megengine/module/batchnorm.py b/imperative/python/megengine/module/batchnorm.py index efeaba17..b7100339 100644 --- a/imperative/python/megengine/module/batchnorm.py +++ b/imperative/python/megengine/module/batchnorm.py @@ -141,37 +141,29 @@ class _BatchNorm(Module): class SyncBatchNorm(_BatchNorm): - r""" - Applies Synchronized Batch Normalization for distributed training. - - :type num_features: int - :param num_features: usually :math:`C` from an input of shape - :math:`(N, C, H, W)` or the highest ranked dimension of an input - less than 4D. 
- :type eps: float - :param eps: a value added to the denominator for numerical stability. - Default: 1e-5 - :type momentum: float - :param momentum: the value used for the ``running_mean`` and ``running_var`` computation. - Default: 0.9 - :type affine: bool - :param affine: a boolean value that when set to True, this module has - learnable affine parameters. Default: True - :type track_running_stats: bool - :param track_running_stats: when set to True, this module tracks the - running mean and variance. When set to False, this module does not - track such statistics and always uses batch statistics in both training - and eval modes. Default: True - :type freeze: bool - :param freeze: when set to True, this module does not update the - running mean and variance, and uses the running mean and variance instead of - the batch mean and batch variance to normalize the input. The parameter takes effect - only when the module is initilized with track_running_stats as True. - Default: False - :type group: :class:`~megengine.distributed.Group` - :param group: communication group, caculate mean and variance between this group. - Default: :obj:`~megengine.distributed.WORLD` - :return: output tensor. + r"""Applies Synchronized Batch Normalization for distributed training. + + Args: + num_features: usually :math:`C` from an input of shape + :math:`(N, C, H, W)` or the highest ranked dimension of an input + less than 4D. + eps: a value added to the denominator for numerical stability. + Default: 1e-5 + momentum: the value used for the ``running_mean`` and ``running_var`` computation. + Default: 0.9 + affine: a boolean value that when set to True, this module has + learnable affine parameters. Default: True + track_running_stats: when set to True, this module tracks the + running mean and variance. When set to False, this module does not + track such statistics and always uses batch statistics in both training + and eval modes. Default: True + freeze: when set to True, this module does not update the + running mean and variance, and uses the running mean and variance instead of + the batch mean and batch variance to normalize the input. The parameter takes effect + only when the module is initilized with track_running_stats as True. + Default: False + group: communication group, caculate mean and variance between this group. + Default: :obj:`~.distributed.WORLD` """ def __init__( @@ -249,8 +241,7 @@ class SyncBatchNorm(_BatchNorm): class BatchNorm1d(_BatchNorm): - r""" - Applies Batch Normalization over a 2D/3D tensor. + r"""Applies Batch Normalization over a 2D/3D tensor. Refer to :class:`~.BatchNorm2d` for more information. """ @@ -263,8 +254,7 @@ class BatchNorm1d(_BatchNorm): class BatchNorm2d(_BatchNorm): - r""" - Applies Batch Normalization over a 4D tensor. + r"""Applies Batch Normalization over a 4D tensor. .. math:: @@ -287,56 +277,50 @@ class BatchNorm2d(_BatchNorm): statistics on `(N, H, W)` slices, it's common terminology to call this Spatial Batch Normalization. - :type num_features: int - :param num_features: usually :math:`C` from an input of shape - :math:`(N, C, H, W)` or the highest ranked dimension of an input - less than 4D. - :type eps: float - :param eps: a value added to the denominator for numerical stability. - Default: 1e-5 - :type momentum: float - :param momentum: the value used for the ``running_mean`` and ``running_var`` computation. - Default: 0.9 - :type affine: bool - :param affine: a boolean value that when set to True, this module has - learnable affine parameters. 
Default: True - :type track_running_stats: bool - :param track_running_stats: when set to True, this module tracks the - running mean and variance. When set to False, this module does not - track such statistics and always uses batch statistics in both training - and eval modes. Default: True - - :type freeze: bool - :param freeze: when set to True, this module does not update the - running mean and variance, and uses the running mean and variance instead of - the batch mean and batch variance to normalize the input. The parameter takes effect - only when the module is initilized with track_running_stats as True. - Default: False + Args: + num_features: usually :math:`C` from an input of shape + :math:`(N, C, H, W)` or the highest ranked dimension of an input + less than 4D. + eps: a value added to the denominator for numerical stability. + Default: 1e-5 + momentum: the value used for the ``running_mean`` and ``running_var`` computation. + Default: 0.9 + affine: a boolean value that when set to True, this module has + learnable affine parameters. Default: True + track_running_stats: when set to True, this module tracks the + running mean and variance. When set to False, this module does not + track such statistics and always uses batch statistics in both training + and eval modes. Default: True + freeze: when set to True, this module does not update the + running mean and variance, and uses the running mean and variance instead of + the batch mean and batch variance to normalize the input. The parameter takes effect + only when the module is initilized with track_running_stats as True. + Default: False Examples: - .. testcode:: + .. testcode:: - import numpy as np - import megengine as mge - import megengine.module as M + import numpy as np + import megengine as mge + import megengine.module as M - # With Learnable Parameters - m = M.BatchNorm2d(4) - inp = mge.tensor(np.random.rand(1, 4, 3, 3).astype("float32")) - oup = m(inp) - print(m.weight.numpy().flatten(), m.bias.numpy().flatten()) - # Without L`e`arnable Parameters - m = M.BatchNorm2d(4, affine=False) - oup = m(inp) - print(m.weight, m.bias) + # With Learnable Parameters + m = M.BatchNorm2d(4) + inp = mge.tensor(np.random.rand(1, 4, 3, 3).astype("float32")) + oup = m(inp) + print(m.weight.numpy().flatten(), m.bias.numpy().flatten()) + # Without L`e`arnable Parameters + m = M.BatchNorm2d(4, affine=False) + oup = m(inp) + print(m.weight, m.bias) - Outputs: + Outputs: - .. testoutput:: + .. testoutput:: - [1. 1. 1. 1.] [0. 0. 0. 0.] - None None + [1. 1. 1. 1.] [0. 0. 0. 0.] + None None """ def _check_input_ndim(self, inp): diff --git a/imperative/python/megengine/module/concat.py b/imperative/python/megengine/module/concat.py index 117f1a81..7e27ade4 100644 --- a/imperative/python/megengine/module/concat.py +++ b/imperative/python/megengine/module/concat.py @@ -13,8 +13,7 @@ from .module import Module class Concat(Module): - r""" - A :class:`~.Module` to do functional :func:`~.concat`. Could be replaced with :class:`~.QATModule` + r"""A :class:`~.Module` to do functional :func:`~.concat`. Could be replaced with :class:`~.QATModule` version :class:`~.qat.Concat` using :func:`~.quantize.quantize_qat`. 
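A hedged usage sketch; the forward signature is assumed to mirror :func:`~.concat` (a list of tensors plus an axis), which is not spelled out in this docstring.

.. code-block:: python

    import numpy as np
    import megengine as mge
    import megengine.module as M

    x = mge.tensor(np.ones((2, 3), dtype="float32"))
    y = mge.tensor(np.zeros((2, 3), dtype="float32"))

    concat = M.Concat()
    out = concat([x, y], 0)   # assumed to behave like F.concat([x, y], axis=0)
    print(out.shape)          # (4, 3) under that assumption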
""" diff --git a/imperative/python/megengine/module/conv.py b/imperative/python/megengine/module/conv.py index 183356db..bd561281 100644 --- a/imperative/python/megengine/module/conv.py +++ b/imperative/python/megengine/module/conv.py @@ -97,8 +97,7 @@ class _ConvNd(Module): class Conv1d(_ConvNd): - r""" - Applies a 1D convolution over an input tensor. + r"""Applies a 1D convolution over an input tensor. For instance, given an input of the size :math:`(N, C_{\text{in}}, H)`, this layer generates an output of the size @@ -121,52 +120,49 @@ class Conv1d(_ConvNd): a depthwise convolution with a depthwise multiplier `K`, can be constructed by arguments :math:`(in\_channels=C_{in}, out\_channels=C_{in} \times K, ..., groups=C_{in})`. - :param in_channels: number of input channels. - :param out_channels: number of output channels. - :param kernel_size: size of weight on spatial dimensions. - :param stride: stride of the 1D convolution operation. - :param padding: size of the paddings added to the input on both sides of its - spatial dimensions. Only zero-padding is supported. Default: 0 - :param dilation: dilation of the 1D convolution operation. Default: 1 - :param groups: number of groups into which the input and output channels are divided, - so as to perform a "grouped convolution". When ``groups`` is not 1, - ``in_channels`` and ``out_channels`` must be divisible by ``groups``, - and there would be an extra dimension at the beginning of the weight's - shape. Default: 1 - :param bias: whether to add a bias onto the result of convolution. Default: - True - :param conv_mode: Supports `cross_correlation`. Default: - `cross_correlation` - :param compute_mode: When set to "default", no special requirements will be - placed on the precision of intermediate results. When set to "float32", - "float32" would be used for accumulator and intermediate result, but only - effective when input and output are of float16 dtype. - - .. note:: - - * ``weight`` usually has shape ``(out_channels, in_channels, kernel_size)`` , - if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, kernel_size)`` - * ``bias`` usually has shape ``(1, out_channels, 1)`` + Args: + in_channels: number of input channels. + out_channels: number of output channels. + kernel_size: size of weight on spatial dimensions. + stride: stride of the 1D convolution operation. + padding: size of the paddings added to the input on both sides of its + spatial dimensions. Only zero-padding is supported. Default: 0 + dilation: dilation of the 1D convolution operation. Default: 1 + groups: number of groups into which the input and output channels are divided, + so as to perform a "grouped convolution". When ``groups`` is not 1, + ``in_channels`` and ``out_channels`` must be divisible by ``groups``, + and there would be an extra dimension at the beginning of the weight's + shape. Default: 1 + bias: whether to add a bias onto the result of convolution. Default: True + conv_mode: Supports `cross_correlation`. Default: `cross_correlation` + compute_mode: When set to "default", no special requirements will be + placed on the precision of intermediate results. When set to "float32", + "float32" would be used for accumulator and intermediate result, but only + effective when input and output are of float16 dtype. 
+ + Note: + * ``weight`` usually has shape ``(out_channels, in_channels, kernel_size)`` , + if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, kernel_size)`` + * ``bias`` usually has shape ``(1, out_channels, 1)`` Examples: - .. testcode:: + .. testcode:: - import numpy as np - import megengine as mge - import megengine.module as M + import numpy as np + import megengine as mge + import megengine.module as M - m = M.Conv1d(in_channels=3, out_channels=1, kernel_size=3) - inp = mge.tensor(np.arange(0, 24).astype("float32").reshape(2, 3, 4)) - oup = m(inp) - print(oup.numpy().shape) + m = M.Conv1d(in_channels=3, out_channels=1, kernel_size=3) + inp = mge.tensor(np.arange(0, 24).astype("float32").reshape(2, 3, 4)) + oup = m(inp) + print(oup.numpy().shape) - Outputs: + Outputs: - .. testoutput:: - - (2, 1, 2) + .. testoutput:: + (2, 1, 2) """ def __init__( @@ -245,8 +241,7 @@ class Conv1d(_ConvNd): class Conv2d(_ConvNd): - r""" - Applies a 2D convolution over an input tensor. + r"""Applies a 2D convolution over an input tensor. For instance, given an input of the size :math:`(N, C_{\text{in}}, H, W)`, this layer generates an output of the size @@ -284,54 +279,51 @@ class Conv2d(_ConvNd): a depthwise convolution with a depthwise multiplier `K`, can be constructed by arguments :math:`(in\_channels=C_{in}, out\_channels=C_{in} \times K, ..., groups=C_{in})`. - :param in_channels: number of input channels. - :param out_channels: number of output channels. - :param kernel_size: size of weight on spatial dimensions. If kernel_size is - an :class:`int`, the actual kernel size would be - ``(kernel_size, kernel_size)``. - :param stride: stride of the 2D convolution operation. Default: 1 - :param padding: size of the paddings added to the input on both sides of its - spatial dimensions. Only zero-padding is supported. Default: 0 - :param dilation: dilation of the 2D convolution operation. Default: 1 - :param groups: number of groups into which the input and output channels are divided, - so as to perform a "grouped convolution". When ``groups`` is not 1, - ``in_channels`` and ``out_channels`` must be divisible by ``groups``, - and there would be an extra dimension at the beginning of the weight's - shape. Default: 1 - :param bias: whether to add a bias onto the result of convolution. Default: - True - :param conv_mode: Supports `cross_correlation`. Default: - `cross_correlation` - :param compute_mode: When set to "default", no special requirements will be - placed on the precision of intermediate results. When set to "float32", - "float32" would be used for accumulator and intermediate result, but only - effective when input and output are of float16 dtype. - - .. note:: - - * ``weight`` usually has shape ``(out_channels, in_channels, height, width)`` , - if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, height, width)`` - * ``bias`` usually has shape ``(1, out_channels, *1)`` + Args: + in_channels: number of input channels. + out_channels: number of output channels. + kernel_size: size of weight on spatial dimensions. If kernel_size is + an :class:`int`, the actual kernel size would be + ``(kernel_size, kernel_size)``. + stride: stride of the 2D convolution operation. Default: 1 + padding: size of the paddings added to the input on both sides of its + spatial dimensions. Only zero-padding is supported. Default: 0 + dilation: dilation of the 2D convolution operation. 
Default: 1 + groups: number of groups into which the input and output channels are divided, + so as to perform a "grouped convolution". When ``groups`` is not 1, + ``in_channels`` and ``out_channels`` must be divisible by ``groups``, + and there would be an extra dimension at the beginning of the weight's + shape. Default: 1 + bias: whether to add a bias onto the result of convolution. Default: True + conv_mode: Supports `cross_correlation`. Default: `cross_correlation` + compute_mode: When set to "default", no special requirements will be + placed on the precision of intermediate results. When set to "float32", + "float32" would be used for accumulator and intermediate result, but only + effective when input and output are of float16 dtype. + + Note: + * ``weight`` usually has shape ``(out_channels, in_channels, height, width)`` , + if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, height, width)`` + * ``bias`` usually has shape ``(1, out_channels, *1)`` Examples: - .. testcode:: - - import numpy as np - import megengine as mge - import megengine.module as M + .. testcode:: - m = M.Conv2d(in_channels=3, out_channels=1, kernel_size=3) - inp = mge.tensor(np.arange(0, 96).astype("float32").reshape(2, 3, 4, 4)) - oup = m(inp) - print(oup.numpy().shape) + import numpy as np + import megengine as mge + import megengine.module as M - Outputs: + m = M.Conv2d(in_channels=3, out_channels=1, kernel_size=3) + inp = mge.tensor(np.arange(0, 96).astype("float32").reshape(2, 3, 4, 4)) + oup = m(inp) + print(oup.numpy().shape) - .. testoutput:: + Outputs: - (2, 1, 2, 2) + .. testoutput:: + (2, 1, 2, 2) """ def __init__( @@ -411,8 +403,7 @@ class Conv2d(_ConvNd): class Conv3d(_ConvNd): - r""" - Applies a 3D convolution over an input tensor. + r"""Applies a 3D convolution over an input tensor. For instance, given an input of the size :math:`(N, C_{\text{in}}, T, H, W)`, this layer generates an output of the size @@ -434,50 +425,47 @@ class Conv3d(_ConvNd): a depthwise convolution with a depthwise multiplier `K`, can be constructed by arguments :math:`(in\_channels=C_{in}, out\_channels=C_{in} \times K, ..., groups=C_{in})`. - :param in_channels: number of input channels. - :param out_channels: number of output channels. - :param kernel_size: size of weight on spatial dimensions. If kernel_size is - an :class:`int`, the actual kernel size would be - `(kernel_size, kernel_size, kernel_size)`. - :param stride: stride of the 3D convolution operation. Default: 1 - :param padding: size of the paddings added to the input on both sides of its - spatial dimensions. Only zero-padding is supported. Default: 0 - :param dilation: dilation of the 3D convolution operation. Default: 1 - :param groups: number of groups into which the input and output channels are divided, - so as to perform a "grouped convolution". When ``groups`` is not 1, - ``in_channels`` and ``out_channels`` must be divisible by ``groups``, - and there would be an extra dimension at the beginning of the weight's - shape. Default: 1 - :param bias: whether to add a bias onto the result of convolution. Default: - True - :param conv_mode: Supports `cross_correlation`. Default: - `cross_correlation` - - .. 
note:: - - * ``weight`` usually has shape ``(out_channels, in_channels, depth, height, width)`` , - if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, depth, height, width)`` - * ``bias`` usually has shape ``(1, out_channels, *1)`` + Args: + in_channels: number of input channels. + out_channels: number of output channels. + kernel_size: size of weight on spatial dimensions. If kernel_size is + an :class:`int`, the actual kernel size would be + `(kernel_size, kernel_size, kernel_size)`. + stride: stride of the 3D convolution operation. Default: 1 + padding: size of the paddings added to the input on both sides of its + spatial dimensions. Only zero-padding is supported. Default: 0 + dilation: dilation of the 3D convolution operation. Default: 1 + groups: number of groups into which the input and output channels are divided, + so as to perform a "grouped convolution". When ``groups`` is not 1, + ``in_channels`` and ``out_channels`` must be divisible by ``groups``, + and there would be an extra dimension at the beginning of the weight's + shape. Default: 1 + bias: whether to add a bias onto the result of convolution. Default: True + conv_mode: Supports `cross_correlation`. Default: `cross_correlation` + + Note: + * ``weight`` usually has shape ``(out_channels, in_channels, depth, height, width)`` , + if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, depth, height, width)`` + * ``bias`` usually has shape ``(1, out_channels, *1)`` Examples: - .. testcode:: - - import numpy as np - import megengine as mge - import megengine.module as M + .. testcode:: - m = M.Conv3d(in_channels=3, out_channels=1, kernel_size=3) - inp = mge.tensor(np.arange(0, 384).astype("float32").reshape(2, 3, 4, 4, 4)) - oup = m(inp) - print(oup.numpy().shape) + import numpy as np + import megengine as mge + import megengine.module as M - Outputs: + m = M.Conv3d(in_channels=3, out_channels=1, kernel_size=3) + inp = mge.tensor(np.arange(0, 384).astype("float32").reshape(2, 3, 4, 4, 4)) + oup = m(inp) + print(oup.numpy().shape) - .. testoutput:: + Outputs: - (2, 1, 2, 2, 2) + .. testoutput:: + (2, 1, 2, 2, 2) """ def __init__( @@ -551,8 +539,7 @@ class Conv3d(_ConvNd): class ConvTranspose2d(_ConvNd): - r""" - Applies a 2D transposed convolution over an input tensor. + r"""Applies a 2D transposed convolution over an input tensor. This module is also known as a deconvolution or a fractionally-strided convolution. :class:`ConvTranspose2d` can be seen as the gradient of :class:`Conv2d` operation @@ -562,35 +549,32 @@ class ConvTranspose2d(_ConvNd): the opposite way, transforming a smaller input to a larger output while preserving the connectivity pattern. - :param in_channels: number of input channels. - :param out_channels: number of output channels. - :param kernel_size: size of weight on spatial dimensions. If ``kernel_size`` is - an :class:`int`, the actual kernel size would be - ``(kernel_size, kernel_size)``. - :param stride: stride of the 2D convolution operation. Default: 1 - :param padding: size of the paddings added to the input on both sides of its - spatial dimensions. Only zero-padding is supported. Default: 0 - :param dilation: dilation of the 2D convolution operation. Default: 1 - :param groups: number of groups into which the input and output channels are divided, - so as to perform a "grouped convolution". 
When ``groups`` is not 1, - ``in_channels`` and ``out_channels`` must be divisible by ``groups``, - and there would be an extra dimension at the beginning of the weight's - shape. Default: 1 - :param bias: wether to add a bias onto the result of convolution. Default: - True - :param conv_mode: Supports `cross_correlation`. Default: - `cross_correlation` - :param compute_mode: When set to "default", no special requirements will be - placed on the precision of intermediate results. When set to "float32", - "float32" would be used for accumulator and intermediate result, but only - effective when input and output are of float16 dtype. - - .. note:: - - * ``weight`` usually has shape ``(in_channels, out_channels, height, width)`` , - if groups is not 1, shape will be ``(groups, in_channels // groups, out_channels // groups, height, width)`` - * ``bias`` usually has shape ``(1, out_channels, *1)`` - + Args: + in_channels: number of input channels. + out_channels: number of output channels. + kernel_size: size of weight on spatial dimensions. If ``kernel_size`` is + an :class:`int`, the actual kernel size would be + ``(kernel_size, kernel_size)``. + stride: stride of the 2D convolution operation. Default: 1 + padding: size of the paddings added to the input on both sides of its + spatial dimensions. Only zero-padding is supported. Default: 0 + dilation: dilation of the 2D convolution operation. Default: 1 + groups: number of groups into which the input and output channels are divided, + so as to perform a "grouped convolution". When ``groups`` is not 1, + ``in_channels`` and ``out_channels`` must be divisible by ``groups``, + and there would be an extra dimension at the beginning of the weight's + shape. Default: 1 + bias: wether to add a bias onto the result of convolution. Default: True + conv_mode: Supports `cross_correlation`. Default: `cross_correlation` + compute_mode: When set to "default", no special requirements will be + placed on the precision of intermediate results. When set to "float32", + "float32" would be used for accumulator and intermediate result, but only + effective when input and output are of float16 dtype. + + Note: + * ``weight`` usually has shape ``(in_channels, out_channels, height, width)`` , + if groups is not 1, shape will be ``(groups, in_channels // groups, out_channels // groups, height, width)`` + * ``bias`` usually has shape ``(1, out_channels, *1)`` """ def __init__( @@ -669,30 +653,28 @@ class ConvTranspose2d(_ConvNd): class LocalConv2d(Conv2d): - r""" - Applies a spatial convolution with untied kernels over an groupped channeled input 4D tensor. + r"""Applies a spatial convolution with untied kernels over an groupped channeled input 4D tensor. It is also known as the locally connected layer. - :param in_channels: number of input channels. - :param out_channels: number of output channels. - :param input_height: the height of the input images. - :param input_width: the width of the input images. - :param kernel_size: size of weight on spatial dimensions. If kernel_size is - an :class:`int`, the actual kernel size would be - ``(kernel_size, kernel_size)``. - :param stride: stride of the 2D convolution operation. Default: 1 - :param padding: size of the paddings added to the input on both sides of its - spatial dimensions. Only zero-padding is supported. Default: 0 - :param groups: number of groups into which the input and output channels are divided, - so as to perform a "grouped convolution". 
When ``groups`` is not 1, - ``in_channels`` and ``out_channels`` must be divisible by ``groups``. Default: 1 - - .. note:: - - * ``weight`` usually has shape ``(out_height, out_width, in_channels, height, width, in_channels)`` , - if groups is not 1, shape will be ``(groups, out_height, out_width, in_channels // groups, height, width, out_channels // groups)`` - * ``bias`` usually has shape ``(1, out_channels, *1)`` - + Args: + in_channels: number of input channels. + out_channels: number of output channels. + input_height: the height of the input images. + input_width: the width of the input images. + kernel_size: size of weight on spatial dimensions. If kernel_size is + an :class:`int`, the actual kernel size would be + ``(kernel_size, kernel_size)``. + stride: stride of the 2D convolution operation. Default: 1 + padding: size of the paddings added to the input on both sides of its + spatial dimensions. Only zero-padding is supported. Default: 0 + groups: number of groups into which the input and output channels are divided, + so as to perform a "grouped convolution". When ``groups`` is not 1, + ``in_channels`` and ``out_channels`` must be divisible by ``groups``. Default: 1 + + Note: + * ``weight`` usually has shape ``(out_height, out_width, in_channels, height, width, in_channels)`` , + if groups is not 1, shape will be ``(groups, out_height, out_width, in_channels // groups, height, width, out_channels // groups)`` + * ``bias`` usually has shape ``(1, out_channels, *1)`` """ def __init__( @@ -755,8 +737,7 @@ class LocalConv2d(Conv2d): class ConvRelu2d(Conv2d): - r""" - A fused :class:`~.Module` including :class:`~.module.Conv2d` and :func:`~.relu`. + r"""A fused :class:`~.Module` including :class:`~.module.Conv2d` and :func:`~.relu`. Could be replaced with :class:`~.QATModule` version :class:`~.qat.ConvRelu2d` using :func:`~.quantize.quantize_qat`. """ @@ -765,38 +746,34 @@ class ConvRelu2d(Conv2d): class DeformableConv2d(_ConvNd): - """ - Deformable Convolution. - - :param in_channels: number of input channels. - :param out_channels: number of output channels. - :param kernel_size: size of weight on spatial dimensions. If kernel_size is - an :class:`int`, the actual kernel size would be - ``(kernel_size, kernel_size)``. - :param stride: stride of the 2D convolution operation. Default: 1 - :param padding: size of the paddings added to the input on both sides of its - spatial dimensions. Only zero-padding is supported. Default: 0 - :param dilation: dilation of the 2D convolution operation. Default: 1 - :param groups: number of groups into which the input and output channels are divided, - so as to perform a "grouped convolution". When ``groups`` is not 1, - ``in_channels`` and ``out_channels`` must be divisible by ``groups``, - and there would be an extra dimension at the beginning of the weight's - shape. Default: 1 - :param bias: whether to add a bias onto the result of convolution. Default: - True - :param conv_mode: Supports `cross_correlation`. Default: - `cross_correlation` - :param compute_mode: When set to "default", no special requirements will be - placed on the precision of intermediate results. When set to "float32", - "float32" would be used for accumulator and intermediate result, but only - effective when input and output are of float16 dtype. - - .. 
note:: - - * ``weight`` usually has shape ``(out_channels, in_channels, height, width)`` , - if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, height, width)`` - * ``bias`` usually has shape ``(1, out_channels, *1)`` - + r"""Deformable Convolution. + + Args: + in_channels: number of input channels. + out_channels: number of output channels. + kernel_size: size of weight on spatial dimensions. If kernel_size is + an :class:`int`, the actual kernel size would be + ``(kernel_size, kernel_size)``. + stride: stride of the 2D convolution operation. Default: 1 + padding: size of the paddings added to the input on both sides of its + spatial dimensions. Only zero-padding is supported. Default: 0 + dilation: dilation of the 2D convolution operation. Default: 1 + groups: number of groups into which the input and output channels are divided, + so as to perform a "grouped convolution". When ``groups`` is not 1, + ``in_channels`` and ``out_channels`` must be divisible by ``groups``, + and there would be an extra dimension at the beginning of the weight's + shape. Default: 1 + bias: whether to add a bias onto the result of convolution. Default: True + conv_mode: Supports `cross_correlation`. Default: `cross_correlation` + compute_mode: When set to "default", no special requirements will be + placed on the precision of intermediate results. When set to "float32", + "float32" would be used for accumulator and intermediate result, but only + effective when input and output are of float16 dtype. + + Note: + * ``weight`` usually has shape ``(out_channels, in_channels, height, width)`` , + if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, height, width)`` + * ``bias`` usually has shape ``(1, out_channels, *1)`` """ def __init__( @@ -877,8 +854,7 @@ class DeformableConv2d(_ConvNd): class ConvTranspose3d(_ConvNd): - r""" - Applies a 3D transposed convolution over an input tensor. + r"""Applies a 3D transposed convolution over an input tensor. Only support the case that groups = 1 and conv_mode = "cross_correlation". @@ -889,23 +865,21 @@ class ConvTranspose3d(_ConvNd): works the opposite way, transforming a smaller input to a larger output while preserving the connectivity pattern. - :param in_channels: number of input channels. - :param out_channels: number of output channels. - :param kernel_size: size of weight on spatial dimensions. If ``kernel_size`` is - an :class:`int`, the actual kernel size would be - ``(kernel_size, kernel_size, kernel_size)``. - :param stride: stride of the 3D convolution operation. Default: 1 - :param padding: size of the paddings added to the input on all sides of its - spatial dimensions. Only zero-padding is supported. Default: 0 - :param dilation: dilation of the 3D convolution operation. Default: 1 - :param bias: wether to add a bias onto the result of convolution. Default: - True - - .. note:: - - * ``weight`` usually has shape ``(in_channels, out_channels, depth, height, width)`` . - * ``bias`` usually has shape ``(1, out_channels, *1)`` - + Args: + in_channels: number of input channels. + out_channels: number of output channels. + kernel_size: size of weight on spatial dimensions. If ``kernel_size`` is + an :class:`int`, the actual kernel size would be + ``(kernel_size, kernel_size, kernel_size)``. + stride: stride of the 3D convolution operation. Default: 1 + padding: size of the paddings added to the input on all sides of its + spatial dimensions. Only zero-padding is supported. 
Default: 0 + dilation: dilation of the 3D convolution operation. Default: 1 + bias: wether to add a bias onto the result of convolution. Default: True + + Note: + * ``weight`` usually has shape ``(in_channels, out_channels, depth, height, width)`` . + * ``bias`` usually has shape ``(1, out_channels, *1)`` """ def __init__( diff --git a/imperative/python/megengine/module/conv_bn.py b/imperative/python/megengine/module/conv_bn.py index d27a292e..fdaabaa5 100644 --- a/imperative/python/megengine/module/conv_bn.py +++ b/imperative/python/megengine/module/conv_bn.py @@ -50,8 +50,7 @@ class _ConvBnActivation2d(Module): class ConvBn2d(_ConvBnActivation2d): - r""" - A fused :class:`~.Module` including :class:`~.module.Conv2d` and :class:`~.module.BatchNorm2d`. + r"""A fused :class:`~.Module` including :class:`~.module.Conv2d` and :class:`~.module.BatchNorm2d`. Could be replaced with :class:`~.QATModule` version :class:`~.qat.ConvBn2d` using :func:`~.quantize.quantize_qat`. """ @@ -61,8 +60,7 @@ class ConvBn2d(_ConvBnActivation2d): class ConvBnRelu2d(_ConvBnActivation2d): - r""" - A fused :class:`~.Module` including :class:`~.module.Conv2d`, :class:`~.module.BatchNorm2d` and :func:`~.relu`. + r"""A fused :class:`~.Module` including :class:`~.module.Conv2d`, :class:`~.module.BatchNorm2d` and :func:`~.relu`. Could be replaced with :class:`~.QATModule` version :class:`~.qat.ConvBnRelu2d` using :func:`~.quantize.quantize_qat`. """ diff --git a/imperative/python/megengine/module/dropout.py b/imperative/python/megengine/module/dropout.py index 08587e91..23a83ec3 100644 --- a/imperative/python/megengine/module/dropout.py +++ b/imperative/python/megengine/module/dropout.py @@ -11,13 +11,13 @@ from .module import Module class Dropout(Module): - r""" - Randomly sets input elements to zeros with the probability :math:`drop\_prob` during training. + r"""Randomly sets input elements to zeros with the probability :math:`drop\_prob` during training. Commonly used in large networks to prevent overfitting. Note that we perform dropout only during training, we also rescale(multiply) the output tensor by :math:`\frac{1}{1 - drop\_prob}`. During inference :class:`~.Dropout` is equal to :class:`~.Identity`. - :param drop_prob: The probability to drop (set to zero) each single element + Args: + drop_prob: The probability to drop (set to zero) each single element """ def __init__(self, drop_prob=0.0, **kwargs): diff --git a/imperative/python/megengine/module/elemwise.py b/imperative/python/megengine/module/elemwise.py index 007c54b8..5c0dd301 100644 --- a/imperative/python/megengine/module/elemwise.py +++ b/imperative/python/megengine/module/elemwise.py @@ -11,67 +11,12 @@ from .module import Module class Elemwise(Module): - r""" - A :class:`~.Module` to do :mod:`~.functional.elemwise` operator. Could be replaced with :class:`~.QATModule` + r"""A :class:`~.Module` to do :mod:`~.functional.elemwise` operator. Could be replaced with :class:`~.QATModule` version :class:`~.qat.Elemwise` using :func:`~.quantize.quantize_qat`. - :param method: the elemwise method, support the following string. - It will do the normal elemwise operator for float. - - * "add": a + b - * "fuse_add_relu": max(x+y, 0) - * "mul": x * y - * "min": min(x, y) - * "max": max(x, y) - * "sub": x - y - * "true_div": x / y - * "fuse_add_sigmoid": sigmoid(x + y) - * "fuse_add_tanh": tanh(x + y) - * "relu": x > 0 ? x : 0 - * "silu": silu(x) - * "gelu": gelu(x) - * "abs": x > 0 ? 
x : -x - * "sigmoid": sigmoid(x) - * "exp": exp(x) - * "tanh": tanh(x) - * "fuse_mul_add3": x * y + z - * "fast_tanh": x * (27. + x * x) / (27. + 9. * x * x) - * "negate": -x - * "acos": acos(x) - * "asin": asin(x) - * "ceil": ceil(x) - * "cos": cos(x) - * "expm1": expm1(x) - * "floor": floor(x) - * "log": log(x) - * "log1p": log1p(x) - * "sin": sin(x) - * "round": round(x) - * "erf": erf(x) - * "erfinv": erfinv(x) - * "erfc": erfc(x) - * "erfcinv": erfcinv(x) - * "abs_grad": abs_grad - * "floor_div": floor_div - * "mod": mod - * "sigmoid_grad": sigmoid_grad - * "switch_gt0": switch_gt0 - * "tanh_grad": tanh_grad - * "lt": less - * "leq": leq - * "eq": equal - * "pow": pow - * "log_sum_exp": log_sum_exp - * "fast_tanh_grad": fast_tanh_grad - * "atan2": atan2 - * "cond_leq_mov": cond_leq_mov - * "h_swish": h_swish - * "fuse_add_h_swish": h_swish(x+y) - * "h_swish_grad": h_swish_grad - * "and": bool binary: x && y - * "or": bool binary: x || y - * "xor": bool binary: x ^ y - * "not": bool unary: ~x + Args: + method: the elemwise method, support the following string. + It will do the normal elemwise operator for float. """ def __init__(self, method, **kwargs): diff --git a/imperative/python/megengine/module/embedding.py b/imperative/python/megengine/module/embedding.py index 1f12f249..9a528103 100644 --- a/imperative/python/megengine/module/embedding.py +++ b/imperative/python/megengine/module/embedding.py @@ -17,42 +17,41 @@ from .module import Module class Embedding(Module): - r""" - A simple lookup table that stores embeddings of a fixed dictionary and size. + r"""A simple lookup table that stores embeddings of a fixed dictionary and size. This module is often used to store word embeddings and retrieve them using indices. The input to the module is a list of indices, and the output is the corresponding word embeddings. The indices should less than num_embeddings. - :param num_embeddings: size of embedding dictionary. - :param embedding_dim: size of each embedding vector. - :param padding_idx: should be set to None, not supportted now. - :param max_norm: should be set to None, not supportted now. - :param norm_type: should be set to None, not supportted now. - :param initial_weight: the learnable weights of the module of shape (num_embeddings, embedding_dim). + Args: + num_embeddings: size of embedding dictionary. + embedding_dim: size of each embedding vector. + padding_idx: should be set to None, not supportted now. + max_norm: should be set to None, not supportted now. + norm_type: should be set to None, not supportted now. + initial_weight: the learnable weights of the module of shape (num_embeddings, embedding_dim). Examples: - .. testcode:: - - import numpy as np - import megengine as mge - import megengine.module as M - weight = mge.tensor(np.array([(1.2,2.3,3.4,4.5,5.6)], dtype=np.float32)) - data = mge.tensor(np.array([(0,0)], dtype=np.int32)) + .. testcode:: - embedding = M.Embedding(1, 5, initial_weight=weight) - output = embedding(data) - with np.printoptions(precision=6): - print(output.numpy()) + import numpy as np + import megengine as mge + import megengine.module as M + weight = mge.tensor(np.array([(1.2,2.3,3.4,4.5,5.6)], dtype=np.float32)) + data = mge.tensor(np.array([(0,0)], dtype=np.int32)) - Outputs: + embedding = M.Embedding(1, 5, initial_weight=weight) + output = embedding(data) + with np.printoptions(precision=6): + print(output.numpy()) - .. testoutput:: + Outputs: - [[[1.2 2.3 3.4 4.5 5.6] - [1.2 2.3 3.4 4.5 5.6]]] + .. 
testoutput:: + [[[1.2 2.3 3.4 4.5 5.6] + [1.2 2.3 3.4 4.5 5.6]]] """ def __init__( @@ -110,36 +109,35 @@ class Embedding(Module): max_norm: Optional[float] = None, norm_type: Optional[float] = None, ): - r""" - Creates Embedding instance from given 2-dimensional FloatTensor. + r"""Creates Embedding instance from given 2-dimensional FloatTensor. - :param embeddings: tensor contained weight for the embedding. - :param freeze: if ``True``, the weight does not get updated during the learning process. Default: True. - :param padding_idx: should be set to None, not support Now. - :param max_norm: should be set to None, not support Now. - :param norm_type: should be set to None, not support Now. + Args: + embeddings: tensor contained weight for the embedding. + freeze: if ``True``, the weight does not get updated during the learning process. Default: True. + padding_idx: should be set to None, not support Now. + max_norm: should be set to None, not support Now. + norm_type: should be set to None, not support Now. Examples: - .. testcode:: - - import numpy as np - import megengine as mge - import megengine.module as M - weight = mge.tensor(np.array([(1.2,2.3,3.4,4.5,5.6)], dtype=np.float32)) - data = mge.tensor(np.array([(0,0)], dtype=np.int32)) + .. testcode:: - embedding = M.Embedding.from_pretrained(weight, freeze=False) - output = embedding(data) - print(output.numpy()) + import numpy as np + import megengine as mge + import megengine.module as M + weight = mge.tensor(np.array([(1.2,2.3,3.4,4.5,5.6)], dtype=np.float32)) + data = mge.tensor(np.array([(0,0)], dtype=np.int32)) - Outputs: + embedding = M.Embedding.from_pretrained(weight, freeze=False) + output = embedding(data) + print(output.numpy()) - .. testoutput:: + Outputs: - [[[1.2 2.3 3.4 4.5 5.6] - [1.2 2.3 3.4 4.5 5.6]]] + .. testoutput:: + [[[1.2 2.3 3.4 4.5 5.6] + [1.2 2.3 3.4 4.5 5.6]]] """ embeddings_shape = embeddings.shape embeddings_dim = len(embeddings_shape) diff --git a/imperative/python/megengine/module/external.py b/imperative/python/megengine/module/external.py index 0bd94cc3..1a7e484c 100644 --- a/imperative/python/megengine/module/external.py +++ b/imperative/python/megengine/module/external.py @@ -19,7 +19,7 @@ from .module import Module class TensorrtRuntimeSubgraph(Module): r"""Load a serialized TensorrtRuntime subgraph. - + See :func:`~.tensorrt_runtime_opr` for more details. """ @@ -41,7 +41,7 @@ class TensorrtRuntimeSubgraph(Module): class CambriconRuntimeSubgraph(Module): r"""Load a serialized CambriconRuntime subgraph. - + See :func:`~.cambricon_runtime_opr` for more details. """ @@ -68,7 +68,7 @@ class CambriconRuntimeSubgraph(Module): class AtlasRuntimeSubgraph(Module): r"""Load a serialized AtlasRuntime subgraph. - + See :func:`~.atlas_runtime_opr` for more details. """ diff --git a/imperative/python/megengine/module/init.py b/imperative/python/megengine/module/init.py index 52b11160..84834755 100644 --- a/imperative/python/megengine/module/init.py +++ b/imperative/python/megengine/module/init.py @@ -18,53 +18,53 @@ from ..tensor import Tensor def fill_(tensor: Tensor, val: Union[float, int]) -> None: - """ - Fills the given ``tensor`` with value ``val``. + """Fills the given ``tensor`` with value ``val``. - :param tensor: tensor to be initialized. - :param val: value to be filled throughout the tensor. + Args: + tensor: tensor to be initialized. + val: value to be filled throughout the tensor. 
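For illustration, a minimal sketch of how ``fill_`` (and the other in-place initializers defined just below) is typically applied to a module's parameters; the ``Linear`` module and the constant value are assumptions made only for this example:

.. code-block:: python

    import megengine.module as M
    import megengine.module.init as init

    linear = M.Linear(4, 2)          # assumed example module with weight and bias
    init.fill_(linear.weight, 0.1)   # overwrite every element of weight with 0.1
    init.zeros_(linear.bias)         # defined below: fill bias with zeros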
""" tensor._reset(full(shape=tensor.shape, value=val, dtype=tensor.dtype)) def zeros_(tensor: Tensor) -> None: - """ - Fills the given ``tensor`` with scalar value `0`. + """Fills the given ``tensor`` with scalar value `0`. - :param tensor: tensor to be initialized. + Args: + tensor: tensor to be initialized. """ fill_(tensor, 0) def ones_(tensor: Tensor) -> None: - """ - Fills the given ``tensor`` with the scalar value `1`. + """Fills the given ``tensor`` with the scalar value `1`. - :param tensor: tensor to be initialized. + Args: + tensor: tensor to be initialized. """ fill_(tensor, 1) def uniform_(tensor: Tensor, a: float = 0.0, b: float = 1.0) -> None: - r""" - Fills the given ``tensor`` with random value sampled from uniform distribution + r"""Fills the given ``tensor`` with random value sampled from uniform distribution :math:`\mathcal{U}(\text{a}, \text{b})`. - :param tensor: tensor to be initialized. - :param a: lower bound of the sampling interval. - :param b: upper bound of the sampling interval. + Args: + tensor: tensor to be initialized. + a: lower bound of the sampling interval. + b: upper bound of the sampling interval. """ tensor._reset(uniform(size=tensor.shape, low=a, high=b).astype(tensor.dtype)) def normal_(tensor: Tensor, mean: float = 0.0, std: float = 1.0) -> None: - r""" - Fills the given ``tensor`` with random value sampled from normal distribution + r"""Fills the given ``tensor`` with random value sampled from normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`. - :param tensor: tensor to be initialized. - :param mean: mean of the normal distribution. - :param std: standard deviation of the normal distribution. + Args: + tensor: tensor to be initialized. + mean: mean of the normal distribution. + std: standard deviation of the normal distribution. """ tensor._reset(normal(size=tensor.shape, mean=mean, std=std).astype(tensor.dtype)) @@ -72,10 +72,9 @@ def normal_(tensor: Tensor, mean: float = 0.0, std: float = 1.0) -> None: def calculate_gain( nonlinearity: str, param: Optional[Union[int, float]] = None ) -> float: - r""" - Returns a recommended gain value (see the table below) for the given nonlinearity + r"""Returns a recommended gain value (see the table below) for the given nonlinearity function. - + ================= ==================================================== nonlinearity gain ================= ==================================================== @@ -87,10 +86,10 @@ def calculate_gain( Leaky Relu :math:`\sqrt{\frac{2}{1 + {\text{negative}_\text{slope}}^2}}` ================= ==================================================== - :param nonlinearity: name of the non-linear function. - :param param: optional parameter for leaky_relu. Only effective when - ``nonlinearity`` is "leaky_relu". - + Args: + nonlinearity: name of the non-linear function. + param: optional parameter for leaky_relu. Only effective when + ``nonlinearity`` is "leaky_relu". """ linear_fns = [ "linear", @@ -124,11 +123,11 @@ def calculate_gain( def calculate_fan_in_and_fan_out(tensor: Tensor) -> Tuple[float, float]: - """ - Calculates fan_in / fan_out value for given weight tensor. This function assumes + r"""Calculates fan_in / fan_out value for given weight tensor. This function assumes input tensor is stored in ``NCHW`` format. - :param tensor: weight tensor in ``NCHW`` format. + Args: + tensor: weight tensor in ``NCHW`` format. 
""" shape = tensor.shape ndim = len(shape) @@ -153,14 +152,14 @@ def calculate_fan_in_and_fan_out(tensor: Tensor) -> Tuple[float, float]: def calculate_correct_fan(tensor: Tensor, mode: str) -> float: - """ - Calculates fan_in / fan_out value for given weight tensor, depending on given + r"""Calculates fan_in / fan_out value for given weight tensor, depending on given ``mode``. - + See :func:`calculate_fan_in_and_fan_out` for details. - :param tensor: weight tensor in ``NCHW`` format. - :param mode: "fan_in" or "fan_out". + Args: + tensor: weight tensor in ``NCHW`` format. + mode: fan_in" or "fan_out". """ mode = mode.lower() valid_modes = ["fan_in", "fan_out"] @@ -174,19 +173,20 @@ def calculate_correct_fan(tensor: Tensor, mode: str) -> float: def xavier_uniform_(tensor: Tensor, gain: float = 1.0) -> None: - r""" - Fills tensor with random values sampled from :math:`\mathcal{U}(-a, a)` + r"""Fills tensor with random values sampled from :math:`\mathcal{U}(-a, a)` where - + .. math:: - a = \text{gain} \times \sqrt{\frac{6}{\text{fan_in} + \text{fan_out}}} + a = \text{gain} \times \sqrt{\frac{6}{\text{fan_in} + \text{fan_out}}} + Also known as Glorot initialization. Detailed information can be retrieved from `Understanding the difficulty of training deep feedforward neural networks` - Glorot, X. & Bengio, Y. (2010). - :param tensor: tensor to be initialized. - :param gain: scaling factor for :math:`a`. + Args: + tensor: tensor to be initialized. + gain: scaling factor for :math:`a`. """ fan_in, fan_out = calculate_fan_in_and_fan_out(tensor) std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) @@ -195,19 +195,20 @@ def xavier_uniform_(tensor: Tensor, gain: float = 1.0) -> None: def xavier_normal_(tensor: Tensor, gain: float = 1.0) -> None: - r""" - Fills tensor with random values sampled from + r"""Fills tensor with random values sampled from :math:`\mathcal{N}(0, \text{std}^2)` where - + .. math:: - \text{std} = \text{gain} \times \sqrt{\frac{2}{\text{fan_in} + \text{fan_out}}} + \text{std} = \text{gain} \times \sqrt{\frac{2}{\text{fan_in} + \text{fan_out}}} + Also known as Glorot initialization. Detailed information can be retrieved from `Understanding the difficulty of training deep feedforward neural networks` - Glorot, X. & Bengio, Y. (2010). - :param tensor: tensor to be initialized. - :param gain: scaling factor for :math:`std`. + Args: + tensor: tensor to be initialized. + gain: scaling factor for :math:`std`. """ fan_in, fan_out = calculate_fan_in_and_fan_out(tensor) std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) @@ -217,25 +218,26 @@ def xavier_normal_(tensor: Tensor, gain: float = 1.0) -> None: def msra_uniform_( tensor: Tensor, a: float = 0, mode: str = "fan_in", nonlinearity: str = "leaky_relu" ) -> None: - r""" - Fills tensor wilth random values sampled from + r"""Fills tensor wilth random values sampled from :math:`\mathcal{U}(-\text{bound}, \text{bound})` where - + .. math:: - \text{bound} = \sqrt{\frac{6}{(1 + a^2) \times \text{fan_in}}} + \text{bound} = \sqrt{\frac{6}{(1 + a^2) \times \text{fan_in}}} + Detailed information can be retrieved from `Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification` - :param tensor: tensor to be initialized. - :param a: optional parameter for calculating gain for leaky_relu. See - :func:`calculate_gain` for details. - :param mode: "fan_in" or "fan_out", used to calculate :math:`gain`, the - scaling factor for :math:`bound`. See :func:`calculate_fan_in_and_fan_out` for - details. 
- :param nonlinearity: name of the non-linear function used to calculate :math:`gain`. - See :func:`calculate_gain` for details. + Args: + tensor: tensor to be initialized. + a: optional parameter for calculating gain for leaky_relu. See + :func:`calculate_gain` for details. + mode: fan_in" or "fan_out", used to calculate :math:`gain`, the + scaling factor for :math:`bound`. See :func:`calculate_fan_in_and_fan_out` for + details. + nonlinearity: name of the non-linear function used to calculate :math:`gain`. + See :func:`calculate_gain` for details. """ fan = calculate_correct_fan(tensor, mode) gain = calculate_gain(nonlinearity, a) @@ -247,25 +249,26 @@ def msra_uniform_( def msra_normal_( tensor: Tensor, a: float = 0, mode: str = "fan_in", nonlinearity: str = "leaky_relu" ) -> None: - r""" - Fills tensor wilth random values sampled from + r"""Fills tensor wilth random values sampled from :math:`\mathcal{N}(0, \text{std}^2)` where - + .. math:: - \text{std} = \sqrt{\frac{2}{(1 + a^2) \times \text{fan_in}}} + \text{std} = \sqrt{\frac{2}{(1 + a^2) \times \text{fan_in}}} + Detailed information can be retrieved from `Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification` - :param tensor: tensor to be initialized - :param a: optional parameter for calculating gain for leaky_relu. See - :func:`calculate_gain` for details. - :param mode: "fan_in" or "fan_out", used to calculate :math:`gain`, the - scaling factor for :math:`gain`. See :func:`calculate_fan_in_and_fan_out` for - details. - :param nonlinearity: name of the non-linear function used to calculate :math:`gain`. - See :func:`calculate_gain` for details. + Args: + tensor: tensor to be initialized + a: optional parameter for calculating gain for leaky_relu. See + :func:`calculate_gain` for details. + mode: fan_in" or "fan_out", used to calculate :math:`gain`, the + scaling factor for :math:`gain`. See :func:`calculate_fan_in_and_fan_out` for + details. + nonlinearity: name of the non-linear function used to calculate :math:`gain`. + See :func:`calculate_gain` for details. """ fan = calculate_correct_fan(tensor, mode) gain = calculate_gain(nonlinearity, a) diff --git a/imperative/python/megengine/module/linear.py b/imperative/python/megengine/module/linear.py index 136040d8..971d9a3e 100644 --- a/imperative/python/megengine/module/linear.py +++ b/imperative/python/megengine/module/linear.py @@ -14,8 +14,7 @@ from .module import Module class Linear(Module): - r""" - Applies a linear transformation to the input. For instance, if input + r"""Applies a linear transformation to the input. For instance, if input is x, then output y is: .. math:: @@ -24,30 +23,31 @@ class Linear(Module): where :math:`y_i= \sum_j W_{ij} x_j + b_i` - :param in_features: size of each input sample. - :param out_features: size of each output sample. - :param bias: if it's ``False``, the layer will not learn an additional ``bias``. - Default: ``True`` + Args: + in_features: size of each input sample. + out_features: size of each output sample. + bias: if it's ``False``, the layer will not learn an additional ``bias``. + Default: ``True`` - Examples: - .. testcode:: + Examples: - import numpy as np - import megengine as mge - import megengine.module as M + .. 
testcode:: - m = M.Linear(in_features=3, out_features=1) - inp = mge.tensor(np.arange(0, 6).astype("float32").reshape(2, 3)) - oup = m(inp) - print(oup.numpy().shape) + import numpy as np + import megengine as mge + import megengine.module as M - Outputs: + m = M.Linear(in_features=3, out_features=1) + inp = mge.tensor(np.arange(0, 6).astype("float32").reshape(2, 3)) + oup = m(inp) + print(oup.numpy().shape) - .. testoutput:: + Outputs: - (2, 1) + .. testoutput:: + (2, 1) """ def __init__( diff --git a/imperative/python/megengine/module/module.py b/imperative/python/megengine/module/module.py index 87b70812..275a36e5 100644 --- a/imperative/python/megengine/module/module.py +++ b/imperative/python/megengine/module/module.py @@ -84,15 +84,14 @@ def _get_XNorm_typeclass(): class Module(metaclass=ABCMeta): - """ - Base Module class. + r"""Base Module class. + + Args: + name: module's name, can be initialized by the ``kwargs`` parameter + of child class. """ def __init__(self, name=None): - """ - :param name: module's name, can be initialized by the ``kwargs`` parameter - of child class. - """ self._modules = [] if name is not None: @@ -118,18 +117,19 @@ class Module(metaclass=ABCMeta): pass def register_forward_pre_hook(self, hook: Callable) -> HookHandler: - """ - Registers a hook to handle forward inputs. `hook` should be a function. + """Registers a hook to handle forward inputs. `hook` should be a function. - :param hook: a function that receive `module` and `inputs`, then return - a modified `inputs` or `None`. - :return: a handler with :meth:`~.HookHandler.remove` interface to delete the hook. + Args: + hook: a function that receive `module` and `inputs`, then return + a modified `inputs` or `None`. + + Returns: + a handler with :meth:`~.HookHandler.remove` interface to delete the hook. """ return HookHandler(self._forward_pre_hooks, hook) def register_forward_hook(self, hook: Callable) -> HookHandler: - """ - Registers a hook to handle forward results. `hook` should be a function that + """Registers a hook to handle forward results. `hook` should be a function that receive `module`, `inputs` and `outputs`, then return a modified `outputs` or `None`. This method return a handler with :meth:`~.HookHandler.remove` interface to delete the hook. @@ -164,19 +164,19 @@ class Module(metaclass=ABCMeta): predicate: Callable[[Any], bool] = lambda _: True, seen: Optional[Set[int]] = None ) -> Union[Iterable[Any], Iterable[Tuple[str, Any]]]: - """ - Scans the module object and returns an iterable for the :class:`~.Tensor` + """Scans the module object and returns an iterable for the :class:`~.Tensor` and :class:`~.Module` attributes that agree with the ``predicate``. For multiple calls of this function with same arguments, the order of objects within the returned iterable is guaranteed to be identical, as long as all the involved module objects' ``__dict__`` does not change thoughout those calls. - :param recursive: whether to recursively scan all the submodules. - :param with_key: whether to yield keys along with yielded objects. - :param with_parent: whether to yield ``self`` along with yielded objects. - :param prefix: prefix appended to the yielded keys. - :param predicate: the predication function applied to scanned objects. - :param seen: a dict that records whether a module has been traversed yet. + Args: + recursive: whether to recursively scan all the submodules. + with_key: whether to yield keys along with yielded objects. 
+ with_parent: whether to yield ``self`` along with yielded objects. + prefix: prefix appended to the yielded keys. + predicate: the predication function applied to scanned objects. + seen: a dict that records whether a module has been traversed yet. """ if seen is None: seen = set([id(self)]) @@ -212,12 +212,12 @@ class Module(metaclass=ABCMeta): ) def parameters(self, recursive: bool = True, **kwargs) -> Iterable[Parameter]: - r""" - Returns an iterable for the :class:`~.Parameter` of the module. + r"""Returns an iterable for the :class:`~.Parameter` of the module. - :param recursive: If ``True``, returns all :class:`~.Parameter` within this - module, else only returns :class:`~.Parameter` that are direct attributes - of this module. + Args: + recursive: If ``True``, returns all :class:`~.Parameter` within this + module, else only returns :class:`~.Parameter` that are direct attributes + of this module. """ if "requires_grad" in kwargs: @@ -237,14 +237,14 @@ class Module(metaclass=ABCMeta): def named_parameters( self, prefix: Optional[str] = None, recursive: bool = True, **kwargs ) -> Iterable[Tuple[str, Parameter]]: - """ - Returns an iterable for key :class:`~.Parameter` pairs of the module, where + r"""Returns an iterable for key :class:`~.Parameter` pairs of the module, where ``key`` is the dotted path from this module to the :class:`~.Parameter`. - :param prefix: prefix prepended to the keys. - :param recursive: if ``True``, returns all :class:`~.Parameter` within this - module, else only returns :class:`~.Parameter` that are direct attributes - of this module. + Args: + prefix: prefix prepended to the keys. + recursive: if ``True``, returns all :class:`~.Parameter` within this + module, else only returns :class:`~.Parameter` that are direct attributes + of this module. """ if "requires_grad" in kwargs: @@ -266,14 +266,13 @@ class Module(metaclass=ABCMeta): ) def buffers(self, recursive: bool = True, **kwargs) -> Iterable[Tensor]: - """ - Returns an iterable for the buffers of the module. + r"""Returns an iterable for the buffers of the module. Buffer is defined to be :class:`~.Tensor` excluding :class:`~.Parameter`. - :param recursive: if ``True``, returns all buffers within this - module, else only returns buffers that are direct attributes - of this module. + Args: + recursive: if ``True``, returns all buffers within this + module, else only returns buffers that are direct attributes """ yield from self._flatten( with_key=False, predicate=_is_buffer, recursive=recursive, **kwargs @@ -282,16 +281,17 @@ class Module(metaclass=ABCMeta): def named_buffers( self, prefix: Optional[str] = None, recursive: bool = True, **kwargs ) -> Iterable[Tuple[str, Tensor]]: - """ - Returns an iterable for key buffer pairs of the module, where + r"""Returns an iterable for key buffer pairs of the module, where ``key`` is the dotted path from this module to the buffer. Buffer is defined to be :class:`~.Tensor` excluding :class:`~.Parameter`. - :param prefix: prefix prepended to the keys. - :param recursive: if ``True``, returns all buffers within this - module, else only returns buffers that are direct attributes - of this module. + Args: + prefix: prefix prepended to the keys. + recursive: if ``True``, returns all buffers within this + module, else only returns buffers that are direct attributes + of this module. 
+ prefix: Optional[str]: """ yield from self._flatten( with_key=True, @@ -302,8 +302,7 @@ class Module(metaclass=ABCMeta): ) def children(self, **kwargs) -> "Iterable[Module]": - """ - Returns an iterable for all the submodules that are direct attributes of this + r"""Returns an iterable for all the submodules that are direct attributes of this module. """ yield from self._flatten( @@ -311,8 +310,7 @@ class Module(metaclass=ABCMeta): ) def named_children(self, **kwargs) -> "Iterable[Tuple[str, Module]]": - """ - Returns an iterable of key-submodule pairs for all the submodules that are + r"""Returns an iterable of key-submodule pairs for all the submodules that are direct attributes of this module, where 'key' is the attribute name of submodules. """ @@ -321,9 +319,7 @@ class Module(metaclass=ABCMeta): ) def modules(self, **kwargs) -> "Iterable[Module]": - """ - Returns an iterable for all the modules within this module, including itself. - """ + r"""Returns an iterable for all the modules within this module, including itself.""" if "with_parent" in kwargs and kwargs["with_parent"]: yield self, None else: @@ -333,12 +329,12 @@ class Module(metaclass=ABCMeta): def named_modules( self, prefix: Optional[str] = None, **kwargs ) -> "Iterable[Tuple[str, Module]]": - """ - Returns an iterable of key-module pairs for all the modules within this + r"""Returns an iterable of key-module pairs for all the modules within this module, including itself, where 'key' is the dotted path from this module to the submodules. - :param prefix: prefix prepended to the path. + Args: + prefix: prefix prepended to the path. """ if "with_parent" in kwargs and kwargs["with_parent"]: yield ("" if prefix is None else prefix), self, None @@ -349,33 +345,31 @@ class Module(metaclass=ABCMeta): ) def apply(self, fn: "Callable[[Module], Any]") -> None: - """ - Applies function ``fn`` to all the modules within this module, including + r"""Applies function ``fn`` to all the modules within this module, including itself. - :param fn: the function to be applied on modules. + Args: + fn: the function to be applied on modules. """ for it in self.modules(): fn(it) @deprecated(version="1.0") def zero_grad(self) -> None: - """ - Sets all parameters' grads to zero - """ + r"""Sets all parameters' grads to zero""" for param in self.parameters(): if param.grad is not None: param.grad.reset_zero() def train(self, mode: bool = True, recursive: bool = True) -> None: - """ - Sets training mode of all the modules within this module (including itself) to + r"""Sets training mode of all the modules within this module (including itself) to ``mode``. This effectively sets the ``training`` attributes of those modules to ``mode``, but only has effect on certain modules (e.g. :class:`~.BatchNorm2d`, :class:`~.Dropout`, :class:`~.Observer`) - :param mode: the training mode to be set on modules. - :param recursive: whether to recursively call submodules' ``train()``. + Args: + mode: the training mode to be set on modules. + recursive: whether to recursively call submodules' ``train()``. """ if not recursive: self.training = mode @@ -387,15 +381,13 @@ class Module(metaclass=ABCMeta): self.apply(fn) def eval(self) -> None: - """ - Sets training mode of all the modules within this module (including itself) to + r"""Sets training mode of all the modules within this module (including itself) to ``False``. See :meth:`~.Module.train` for details. 
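For illustration, a minimal sketch of the ``train()`` / ``eval()`` switch described above; the small ``Sequential`` network is an assumption made only for the example:

.. code-block:: python

    import megengine.module as M

    model = M.Sequential(M.Linear(4, 4), M.Dropout(0.5))

    model.eval()                # sets ``training`` to False recursively; Dropout acts as identity
    assert not model.training

    model.train()               # back to training mode, recursively
    assert model.training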
""" self.train(False) def disable_quantize(self, value=True): - r""" - Sets ``module``'s ``quantize_disabled`` attribute and return ``module``. + r"""Sets ``module``'s ``quantize_disabled`` attribute and return ``module``. Could be used as a decorator. """ @@ -408,8 +400,7 @@ class Module(metaclass=ABCMeta): def replace_param( self, params: dict, start_pos: int, seen: Optional[Set[int]] = None ): - """ - Replaces module's parameters with ``params``, used by :class:`~.ParamPack` to + r"""Replaces module's parameters with ``params``, used by :class:`~.ParamPack` to speedup multimachine training. """ offset = 0 @@ -447,9 +438,7 @@ class Module(metaclass=ABCMeta): return rst def _state_dict(self, rst=None, prefix="", keep_var=False): - r""" - Returns a dictionary containing whole states of the module. - """ + r"""Returns a dictionary containing whole states of the module.""" def is_state(obj): return _is_parameter(obj) or _is_buffer(obj) @@ -479,8 +468,7 @@ class Module(metaclass=ABCMeta): state_dict: Union[dict, Callable[[str, Tensor], Optional[np.ndarray]]], strict=True, ): - r""" - Loads a given dictionary created by :func:`state_dict` into this module. + r"""Loads a given dictionary created by :func:`state_dict` into this module. If ``strict`` is ``True``, the keys of :func:`state_dict` must exactly match the keys returned by :func:`state_dict`. @@ -515,8 +503,7 @@ class Module(metaclass=ABCMeta): if 'bias' in k: M.init.zero_(v) if 'conv' in k: - return v.numpy() * (np.abs(v.numpy()) > 1e-3).astype("float32) - model.load_state_dict(reinit_and_pruning, strict=False) + """ unused = [] if isinstance(state_dict, dict): @@ -558,8 +545,7 @@ class Module(metaclass=ABCMeta): ) def _load_state_dict_with_closure(self, closure): - """ - Advance state_dict load through callable ``closure`` whose signature is + r"""Advance state_dict load through callable ``closure`` whose signature is ``closure(key: str, var: Tensor) -> Union[np.ndarry, None]`` """ XNorm_typeclass = _get_XNorm_typeclass() @@ -642,9 +628,7 @@ class Module(metaclass=ABCMeta): super().__delattr__(name) def _module_info_string(self) -> str: - r""" - Set the extra representation of the module. - """ + r"""Set the extra representation of the module.""" return "" def __repr__(self): diff --git a/imperative/python/megengine/module/normalization.py b/imperative/python/megengine/module/normalization.py index 317fa17d..f2028dbf 100644 --- a/imperative/python/megengine/module/normalization.py +++ b/imperative/python/megengine/module/normalization.py @@ -15,8 +15,7 @@ from .module import Module class GroupNorm(Module): - """ - Simple implementation of GroupNorm. Only support 4d tensor now. + """Simple implementation of GroupNorm. Only support 4d tensor now. Reference: https://arxiv.org/pdf/1803.08494.pdf. """ @@ -64,8 +63,7 @@ class GroupNorm(Module): class InstanceNorm(Module): - """ - Simple implementation of InstanceNorm. Only support 4d tensor now. + """Simple implementation of InstanceNorm. Only support 4d tensor now. Reference: https://arxiv.org/abs/1607.08022. Note that InstanceNorm equals using GroupNome with num_groups=num_channels. """ @@ -108,8 +106,7 @@ class InstanceNorm(Module): class LayerNorm(Module): - """ - Simple implementation of LayerNorm. Support tensor of any shape as input. + """Simple implementation of LayerNorm. Support tensor of any shape as input. Reference: https://arxiv.org/pdf/1803.08494.pdf. 
""" diff --git a/imperative/python/megengine/module/pooling.py b/imperative/python/megengine/module/pooling.py index 28d2b796..5ebdfdb8 100644 --- a/imperative/python/megengine/module/pooling.py +++ b/imperative/python/megengine/module/pooling.py @@ -37,14 +37,14 @@ class _PoolNd(Module): class MaxPool2d(_PoolNd): - r""" - Applies a 2D max pooling over an input. + r"""Applies a 2D max pooling over an input. For instance, given an input of the size :math:`(N, C, H, W)` and :attr:`kernel_size` :math:`(kH, kW)`, this layer generates the output of the size :math:`(N, C, H_{out}, W_{out})` through a process described as: .. math:: + \begin{aligned} out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, kH-1} \max_{n=0, \ldots, kW-1} \text{input}(N_i, C_j, \text{stride[0]} \times h + m, @@ -54,30 +54,30 @@ class MaxPool2d(_PoolNd): If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides for :attr:`padding` number of points. - :param kernel_size: the size of the window to take a max over. - :param stride: the stride of the window. Default value is kernel_size. - :param padding: implicit zero padding to be added on both sides. + Args: + kernel_size: the size of the window to take a max over. + stride: the stride of the window. Default value is kernel_size. + padding: implicit zero padding to be added on both sides. Examples: - .. testcode:: - - import numpy as np - import megengine as mge - import megengine.module as M + .. testcode:: - m = M.MaxPool2d(kernel_size=3, stride=1, padding=0) - inp = mge.tensor(np.arange(0, 16).astype("float32").reshape(1, 1, 4, 4)) - oup = m(inp) - print(oup.numpy()) + import numpy as np + import megengine as mge + import megengine.module as M - Outputs: + m = M.MaxPool2d(kernel_size=3, stride=1, padding=0) + inp = mge.tensor(np.arange(0, 16).astype("float32").reshape(1, 1, 4, 4)) + oup = m(inp) + print(oup.numpy()) - .. testoutput:: + Outputs: - [[[[10. 11.] - [14. 15.]]]] + .. testoutput:: + [[[[10. 11.] + [14. 15.]]]] """ def forward(self, inp): @@ -85,8 +85,7 @@ class MaxPool2d(_PoolNd): class AvgPool2d(_PoolNd): - r""" - Applies a 2D average pooling over an input. + r"""Applies a 2D average pooling over an input. For instance, given an input of the size :math:`(N, C, H, W)` and :attr:`kernel_size` :math:`(kH, kW)`, this layer generates the output of @@ -100,33 +99,13 @@ class AvgPool2d(_PoolNd): If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides for :attr:`padding` number of points. - :param kernel_size: the size of the window. - :param stride: the stride of the window. Default value is kernel_size。 - :param padding: implicit zero padding to be added on both sides. - :param mode: whether to count padding values. "average" mode will do counting and - "average_count_exclude_padding" mode won't do counting. - Default: "average_count_exclude_padding" - - Examples: - - .. testcode:: - - import numpy as np - import megengine as mge - import megengine.module as M - - m = M.AvgPool2d(kernel_size=3, stride=1, padding=0) - inp = mge.tensor(np.arange(0, 16).astype("float32").reshape(1, 1, 4, 4)) - oup = m(inp) - print(oup.numpy()) - - Outputs: - - .. testoutput:: - - [[[[ 5. 6.] - [ 9. 10.]]]] - + Args: + kernel_size: the size of the window. + stride: the stride of the window. Default value is kernel_size。 + padding: implicit zero padding to be added on both sides. + mode: whether to count padding values. "average" mode will do counting and + "average_count_exclude_padding" mode won't do counting. 
+ Default: "average_count_exclude_padding" """ def __init__( diff --git a/imperative/python/megengine/module/qat/batch_matmul_activation.py b/imperative/python/megengine/module/qat/batch_matmul_activation.py index 1b1ff2c7..dbab6c5f 100644 --- a/imperative/python/megengine/module/qat/batch_matmul_activation.py +++ b/imperative/python/megengine/module/qat/batch_matmul_activation.py @@ -10,9 +10,7 @@ from .module import QATModule class BatchMatMulActivation(Float.BatchMatMulActivation, QATModule): - r""" - A :class:`~.QATModule` :class:`~.module.BatchMatMulActivation` with QAT support. - """ + r"""A :class:`~.QATModule` :class:`~.module.BatchMatMulActivation` with QAT support.""" def forward(self, inp): w_qat = self.apply_quant_weight(self.weight) diff --git a/imperative/python/megengine/module/qat/concat.py b/imperative/python/megengine/module/qat/concat.py index bfcca787..f270276b 100644 --- a/imperative/python/megengine/module/qat/concat.py +++ b/imperative/python/megengine/module/qat/concat.py @@ -13,8 +13,7 @@ from .module import QATModule class Concat(Float.Concat, QATModule): - r""" - A :class:`~.QATModule` to do functional :func:`~.concat` with QAT support. + r"""A :class:`~.QATModule` to do functional :func:`~.concat` with QAT support. Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`. """ @@ -23,8 +22,4 @@ class Concat(Float.Concat, QATModule): @classmethod def from_float_module(cls, float_module): - r""" - Return a :class:`~.QATModule` instance converted from - a float :class:`~.Module` instance. - """ return cls(name=float_module.name) diff --git a/imperative/python/megengine/module/qat/conv.py b/imperative/python/megengine/module/qat/conv.py index c8465f8c..c004690f 100644 --- a/imperative/python/megengine/module/qat/conv.py +++ b/imperative/python/megengine/module/qat/conv.py @@ -11,8 +11,7 @@ from .module import QATModule class Conv2d(Float.Conv2d, QATModule): - r""" - A :class:`~.QATModule` :class:`~.module.Conv2d` with QAT support. + r"""A :class:`~.QATModule` :class:`~.module.Conv2d` with QAT support. Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`. """ @@ -50,8 +49,7 @@ class Conv2d(Float.Conv2d, QATModule): class ConvRelu2d(Conv2d): - r""" - A :class:`~.QATModule` include :class:`~.module.Conv2d` and :func:`~.relu` with QAT support. + r"""A :class:`~.QATModule` include :class:`~.module.Conv2d` and :func:`~.relu` with QAT support. Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`. """ @@ -60,8 +58,7 @@ class ConvRelu2d(Conv2d): class ConvTranspose2d(Float.ConvTranspose2d, QATModule): - r""" - A :class:`~.QATModule` :class:`~.module.ConvTranspose2d` with QAT support. + r"""A :class:`~.QATModule` :class:`~.module.ConvTranspose2d` with QAT support. Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`. """ diff --git a/imperative/python/megengine/module/qat/conv_bn.py b/imperative/python/megengine/module/qat/conv_bn.py index 3ee4d407..059f0c69 100644 --- a/imperative/python/megengine/module/qat/conv_bn.py +++ b/imperative/python/megengine/module/qat/conv_bn.py @@ -136,10 +136,6 @@ class _ConvBnActivation2d(Float._ConvBnActivation2d, QATModule): @classmethod def from_float_module(cls, float_module: Float._ConvBnActivation2d): - r""" - Return a :class:`~.QATModule` instance converted from - a float :class:`~.Module` instance. 
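Relatedly, a hedged sketch of converting a float network into its QAT counterpart with :func:`~.quantize.quantize_qat` (the import path and the use of the default qconfig are assumptions for illustration):

.. code-block:: python

    import megengine.module as M
    from megengine.quantization import quantize_qat   # assumed re-export path

    float_model = M.Sequential(
        M.ConvBnRelu2d(3, 8, kernel_size=3),
        M.ConvBn2d(8, 8, kernel_size=3),
    )

    # replaces supported float modules with their QAT versions (default qconfig assumed)
    qat_model = quantize_qat(float_model)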
- """ qat_module = cls( float_module.conv.in_channels, float_module.conv.out_channels, @@ -160,8 +156,7 @@ class _ConvBnActivation2d(Float._ConvBnActivation2d, QATModule): class ConvBn2d(_ConvBnActivation2d): - r""" - A fused :class:`~.QATModule` including :class:`~.module.Conv2d` and :class:`~.module.BatchNorm2d` with QAT support. + r"""A fused :class:`~.QATModule` including :class:`~.module.Conv2d` and :class:`~.module.BatchNorm2d` with QAT support. Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`. """ @@ -170,8 +165,7 @@ class ConvBn2d(_ConvBnActivation2d): class ConvBnRelu2d(_ConvBnActivation2d): - r""" - A fused :class:`~.QATModule` including :class:`~.module.Conv2d`, :class:`~.module.BatchNorm2d` and :func:`~.relu` with QAT support. + r"""A fused :class:`~.QATModule` including :class:`~.module.Conv2d`, :class:`~.module.BatchNorm2d` and :func:`~.relu` with QAT support. Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`. """ diff --git a/imperative/python/megengine/module/qat/elemwise.py b/imperative/python/megengine/module/qat/elemwise.py index 956bf4fa..4023b189 100644 --- a/imperative/python/megengine/module/qat/elemwise.py +++ b/imperative/python/megengine/module/qat/elemwise.py @@ -10,11 +10,8 @@ from .module import QATModule class Elemwise(Float.Elemwise, QATModule): - r""" - A :class:`~.QATModule` to do :mod:`~.functional.elemwise` operator with QAT support. + r"""A :class:`~.QATModule` to do :mod:`~.functional.elemwise` operator with QAT support. Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`. - - :param method: the elemwise method, see :class:`~.module.Elemwise` for detail. """ with_weight = False diff --git a/imperative/python/megengine/module/qat/linear.py b/imperative/python/megengine/module/qat/linear.py index fc92ff18..77673a00 100644 --- a/imperative/python/megengine/module/qat/linear.py +++ b/imperative/python/megengine/module/qat/linear.py @@ -10,15 +10,14 @@ from .module import QATModule class Linear(Float.Linear, QATModule): - r""" - A :class:`~.QATModule` version of :class:`~.module.Linear`. + r"""A :class:`~.QATModule` version of :class:`~.module.Linear`. Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`. - :param in_features: size of each input sample. - :param out_features: size of each output sample. - :param bias: If set to ``False``, the layer will not learn an additive bias. - Default: True - + Args: + in_features: size of each input sample. + out_features: size of each output sample. + bias: If set to ``False``, the layer will not learn an additive bias. + Default: True """ def forward(self, inp): diff --git a/imperative/python/megengine/module/qat/module.py b/imperative/python/megengine/module/qat/module.py index 3800e574..ec4a5c3e 100644 --- a/imperative/python/megengine/module/qat/module.py +++ b/imperative/python/megengine/module/qat/module.py @@ -17,12 +17,11 @@ from ..module import Module class QATModule(Module): - r""" - Base class of quantized-float related :class:`~.Module`, basically for QAT and Calibration. - + r"""Base class of quantized-float related :class:`~.Module`, basically for QAT and Calibration. + Use :meth:`from_float_module` to generate a instance from float :class:`~.Module`. Or use :func:`~.quantize.quantize_qat` to do it recursively and automatically. - + Can also be converted to :class:`~.QuantizedModule` for deployment using :func:`~.quantize.quantize` further. """ @@ -43,8 +42,7 @@ class QATModule(Module): return "QAT." 
+ super().__repr__() def set_qconfig(self, qconfig: QConfig): - r""" - Set quantization related configs with ``qconfig``, including + r"""Set quantization related configs with ``qconfig``, including observer and fake_quant for weight and activation. """ @@ -96,24 +94,19 @@ class QATModule(Module): return oup def apply_quant_weight(self, target: Tensor): - r""" - Apply weight's observer and fake_quant from ``qconfig`` on ``target``. - """ + r"""Apply weight's observer and fake_quant from ``qconfig`` on ``target``.""" return self._apply_fakequant_with_observer( target, self.weight_fake_quant, self.weight_observer ) def apply_quant_activation(self, target: Tensor): - r""" - Apply weight's observer and fake_quant from ``qconfig`` on ``target``. - """ + r"""Apply weight's observer and fake_quant from ``qconfig`` on ``target``.""" return self._apply_fakequant_with_observer( target, self.act_fake_quant, self.act_observer ) def apply_quant_bias(self, target: Tensor, inp: Tensor, w_qat: Tensor): - r""" - Use :func:`~.fake_quant_bias` to process ``target``. Only valid when + r"""Use :func:`~.fake_quant_bias` to process ``target``. Only valid when ``act_fake_quant`` and ``weight_fake_quant`` are both enabled. """ # bias should have the same dtype as activation, so act_fake_quant can also @@ -139,33 +132,25 @@ class QATModule(Module): return None def get_weight_dtype(self): - r""" - Get weight's quantization dtype as the method from ``qconfig``. - """ + r"""Get weight's quantization dtype as the method from ``qconfig``.""" return self._get_method_result( "get_quantized_dtype", self.weight_fake_quant, self.weight_observer ) def get_activation_dtype(self): - r""" - Get activation's quantization dtype as the method from ``qconfig``. - """ + r"""Get activation's quantization dtype as the method from ``qconfig``.""" return self._get_method_result( "get_quantized_dtype", self.act_fake_quant, self.act_observer ) def get_weight_qparams(self): - r""" - Get weight's quantization parameters. - """ + r"""Get weight's quantization parameters.""" return self._get_method_result( "get_qparams", self.weight_fake_quant, self.weight_observer ) def get_activation_qparams(self): - r""" - Get activation's quantization parameters. - """ + r"""Get activation's quantization parameters.""" return self._get_method_result( "get_qparams", self.act_fake_quant, self.act_observer ) @@ -173,7 +158,6 @@ class QATModule(Module): @classmethod @abstractmethod def from_float_module(cls, float_module: Module): - r""" - Return a :class:`~.QATModule` instance converted from + r"""Return a :class:`~.QATModule` instance converted from a float :class:`~.Module` instance. """ diff --git a/imperative/python/megengine/module/qat/quant_dequant.py b/imperative/python/megengine/module/qat/quant_dequant.py index 580b5f91..13916b63 100644 --- a/imperative/python/megengine/module/qat/quant_dequant.py +++ b/imperative/python/megengine/module/qat/quant_dequant.py @@ -10,8 +10,7 @@ from .module import QATModule class QuantStub(Float.QuantStub, QATModule): - r""" - A helper :class:`~.QATModule` simply return input, but will quantize + r"""A helper :class:`~.QATModule` simply return input, but will quantize input after converted to :class:`~.QuantizedModule`. 
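    A hedged sketch (not part of this patch) of where the stubs usually sit in a
    float model before :func:`~.quantize.quantize_qat` rewrites them; the layer
    sizes and names below are illustrative assumptions.

    .. code-block::

        import megengine.module as M
        from megengine.quantization import quantize_qat

        class Net(M.Module):
            def __init__(self):
                super().__init__()
                self.quant = M.QuantStub()      # entry of the to-be-quantized region
                self.conv = M.ConvRelu2d(3, 8, 3, padding=1)
                self.dequant = M.DequantStub()  # exit of the to-be-quantized region

            def forward(self, x):
                return self.dequant(self.conv(self.quant(x)))

        net = Net()
        quantize_qat(net)  # in-place by default: float submodules -> QAT versions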
""" @@ -30,8 +29,7 @@ class QuantStub(Float.QuantStub, QATModule): class DequantStub(Float.DequantStub, QATModule): - r""" - A helper :class:`~.QATModule` simply return input, but will de-quantize + r"""A helper :class:`~.QATModule` simply return input, but will de-quantize input after converted to :class:`~.QuantizedModule`. """ diff --git a/imperative/python/megengine/module/quant_dequant.py b/imperative/python/megengine/module/quant_dequant.py index 8eda151f..6c4bd7b2 100644 --- a/imperative/python/megengine/module/quant_dequant.py +++ b/imperative/python/megengine/module/quant_dequant.py @@ -9,8 +9,7 @@ from .module import Module class QuantStub(Module): - r""" - A helper :class:`~.Module` simply returning input. Could be replaced with :class:`~.QATModule` + r"""A helper :class:`~.Module` simply returning input. Could be replaced with :class:`~.QATModule` version :class:`~.qat.QuantStub` using :func:`~.quantize.quantize_qat`. """ @@ -19,8 +18,7 @@ class QuantStub(Module): class DequantStub(Module): - r""" - A helper :class:`~.Module` simply returning input. Could be replaced with :class:`~.QATModule` + r"""A helper :class:`~.Module` simply returning input. Could be replaced with :class:`~.QATModule` version :class:`~.qat.DequantStub` using :func:`~.quantize.quantize_qat`. """ diff --git a/imperative/python/megengine/module/quantized/concat.py b/imperative/python/megengine/module/quantized/concat.py index 7fef963e..f1eae59f 100644 --- a/imperative/python/megengine/module/quantized/concat.py +++ b/imperative/python/megengine/module/quantized/concat.py @@ -14,9 +14,7 @@ from .module import QuantizedModule class Concat(QuantizedModule): - r""" - A :class:`~.QuantizedModule` to do quantized :func:`~.concat`, used for inference only. - """ + r"""A :class:`~.QuantizedModule` to do quantized :func:`~.concat`, used for inference only.""" def __init__(self, dtype=None, **kwargs): super().__init__(**kwargs) diff --git a/imperative/python/megengine/module/quantized/conv.py b/imperative/python/megengine/module/quantized/conv.py index 4d7a5b5f..b7bc0847 100644 --- a/imperative/python/megengine/module/quantized/conv.py +++ b/imperative/python/megengine/module/quantized/conv.py @@ -75,7 +75,7 @@ class Conv2d(Float.Conv2d, QuantizedModule): @classmethod def from_qat_module(cls, qat_module: QAT.Conv2d): r""" - return a :class:`~.QuantizedModule` instance converted from a + Return a :class:`~.QuantizedModule` instance converted from a :class:`~.QATModule` instance. """ output_dtype = qat_module.get_activation_dtype() @@ -119,7 +119,8 @@ class ConvTranspose2d(Float.ConvTranspose2d, QuantizedModule): The parameter is same with :class:`~.module.ConvTranspose2d` but dtype. - :param dtype: data type of the output, should be qint8. + Args: + dtype: data type of the output, should be qint8. """ def __init__( diff --git a/imperative/python/megengine/module/quantized/conv_bn.py b/imperative/python/megengine/module/quantized/conv_bn.py index e17f89e1..cef6b137 100644 --- a/imperative/python/megengine/module/quantized/conv_bn.py +++ b/imperative/python/megengine/module/quantized/conv_bn.py @@ -11,10 +11,7 @@ from .conv import Conv2d class _ConvBnActivation2d(Conv2d): - r""" - Applies a 2D convolution over a quantized input tensor, used for inference only. - - The parameter is same with :class: `~.module.Conv2d`. + r"""Applies a 2D convolution over a quantized input tensor, used for inference only. 
""" @classmethod diff --git a/imperative/python/megengine/module/quantized/module.py b/imperative/python/megengine/module/quantized/module.py index 3532375a..3c3feb0f 100644 --- a/imperative/python/megengine/module/quantized/module.py +++ b/imperative/python/megengine/module/quantized/module.py @@ -12,8 +12,7 @@ from ..qat import QATModule class QuantizedModule(Module): - r""" - Base class of quantized :class:`~.Module`, + r"""Base class of quantized :class:`~.Module`, which should be converted from :class:`~.QATModule` and not support traning. """ @@ -29,6 +28,6 @@ class QuantizedModule(Module): @abstractmethod def from_qat_module(cls, qat_module: QATModule): r""" - Return a :class:`~.QuantizedModule` instance converted from a - :class:`~.QATModule` instance. + Return a :class:`~.QATModule` instance converted from + a float :class:`~.Module` instance. """ diff --git a/imperative/python/megengine/module/quantized/quant_dequant.py b/imperative/python/megengine/module/quantized/quant_dequant.py index d17ca0de..807cf253 100644 --- a/imperative/python/megengine/module/quantized/quant_dequant.py +++ b/imperative/python/megengine/module/quantized/quant_dequant.py @@ -10,8 +10,7 @@ from .module import QuantizedModule class QuantStub(QuantizedModule): - r""" - Quantized version of :class:`~.qat.QuantStub`, + r"""Quantized version of :class:`~.qat.QuantStub`, will convert input to quantized dtype. """ @@ -24,16 +23,11 @@ class QuantStub(QuantizedModule): @classmethod def from_qat_module(cls, qat_module: QAT.QuantStub): - r""" - Return a :class:`~.QuantizedModule` instance converted from a - :class:`~.QATModule` instance. - """ return cls(qat_module.get_activation_dtype(), name=qat_module.name) class DequantStub(QuantizedModule): - r""" - Quantized version of :class:`~.qat.DequantStub`, + r"""Quantized version of :class:`~.qat.DequantStub`, will restore quantized input to float32 dtype. """ @@ -42,8 +36,4 @@ class DequantStub(QuantizedModule): @classmethod def from_qat_module(cls, qat_module: QAT.DequantStub): - r""" - Return a :class:`~.QuantizedModule` instance converted from a - :class:`~.QATModule` instance. - """ return cls(name=qat_module.name) diff --git a/imperative/python/megengine/module/sequential.py b/imperative/python/megengine/module/sequential.py index 9953140a..7d63efb7 100644 --- a/imperative/python/megengine/module/sequential.py +++ b/imperative/python/megengine/module/sequential.py @@ -12,38 +12,35 @@ from .module import Module class Sequential(Module): - r""" - A sequential container. + r"""A sequential container. Modules will be added to it in the order they are passed in the constructor. Alternatively, an ordered dict of modules can also be passed in. - To make it easier to understand, here is a small example: - Examples: - .. testcode:: - - import numpy as np - import megengine as mge - import megengine.module as M - import megengine.functional as F - from collections import OrderedDict - - batch_size = 64 - data = mge.tensor(np.zeros((batch_size, 28 * 28)), dtype=np.float32) - label = mge.tensor(np.zeros(batch_size,), dtype=np.int32) - - net0 = M.Sequential( - M.Linear(28 * 28, 320), - M.Linear(320, 10) - ) - pred0 = net0(data) - - modules = OrderedDict() - modules["fc0"] = M.Linear(28 * 28, 320) - modules["fc1"] = M.Linear(320, 10) - net1 = M.Sequential(modules) - pred1 = net1(data) + .. 
testcode:: + + import numpy as np + import megengine as mge + import megengine.module as M + import megengine.functional as F + from collections import OrderedDict + + batch_size = 64 + data = mge.tensor(np.zeros((batch_size, 28 * 28)), dtype=np.float32) + label = mge.tensor(np.zeros(batch_size,), dtype=np.int32) + + net0 = M.Sequential( + M.Linear(28 * 28, 320), + M.Linear(320, 10) + ) + pred0 = net0(data) + + modules = OrderedDict() + modules["fc0"] = M.Linear(28 * 28, 320) + modules["fc1"] = M.Linear(320, 10) + net1 = M.Sequential(modules) + pred1 = net1(data) """ def __init__(self, *args, **kwargs): diff --git a/imperative/python/megengine/module/sliding_window.py b/imperative/python/megengine/module/sliding_window.py index eefe6612..2e4cc21e 100644 --- a/imperative/python/megengine/module/sliding_window.py +++ b/imperative/python/megengine/module/sliding_window.py @@ -13,8 +13,7 @@ from .module import Module class SlidingWindow(Module): - r""" - Apply a sliding window to input tensor and copy content in the window to + r"""Apply a sliding window to input tensor and copy content in the window to corresponding output location. Assume input shape is :math:`(N, C, IH, IW)`, then output shape would be :math:`(N, C, OH, OW, window_h, window_w)` where :math:`(OH, OW)` would be computed from padding, stride, window and @@ -26,46 +25,45 @@ class SlidingWindow(Module): \text{where } & ih=-pad_h+oh \times stride_h + (wh-1) \times (dilation_h-1) \\ & iw=-pad_w+ow \times stride_w + (ww-1) \times (dilation_w-1) - - :param kernel_size: the size of the window to take a max over. - :param padding: implicit zero padding to be added on both sides. Default: 0 - :param stride: the stride of the window. Default: 1 - :param dilation: the dilation of the window. Default: 1 + Args: + kernel_size: the size of the window to take a max over. + padding: implicit zero padding to be added on both sides. Default: 0 + stride: the stride of the window. Default: 1 + dilation: the dilation of the window. Default: 1 Example: - .. testcode:: - - from megengine import tensor - import megengine.module as M - import numpy as np + .. testcode:: - inp = tensor(np.arange(30).reshape(1,1,5,6)) - op = M.SlidingWindow(kernel_size=3, padding=1, stride=2, dilation=2) - out = op(inp) - print(out.numpy()) + from megengine import tensor + import megengine.module as M + import numpy as np - Outputs: + inp = tensor(np.arange(30).reshape(1,1,5,6)) + op = M.SlidingWindow(kernel_size=3, padding=1, stride=2, dilation=2) + out = op(inp) + print(out.numpy()) - .. testoutput:: + Outputs: - [[[[[[ 0 0 0] - [ 0 7 9] - [ 0 19 21]] + .. testoutput:: - [[ 0 0 0] - [ 7 9 11] - [19 21 23]]] + [[[[[[ 0 0 0] + [ 0 7 9] + [ 0 19 21]] + [[ 0 0 0] + [ 7 9 11] + [19 21 23]]] - [[[ 0 7 9] - [ 0 19 21] - [ 0 0 0]] - [[ 7 9 11] - [19 21 23] - [ 0 0 0]]]]]] + [[[ 0 7 9] + [ 0 19 21] + [ 0 0 0]] + [[ 7 9 11] + [19 21 23] + [ 0 0 0]]]]]] """ def __init__( @@ -89,21 +87,20 @@ class SlidingWindow(Module): class SlidingWindowTranspose(Module): - r""" - Opposite opration of SlidingWindow, sum over the sliding windows on the - corresponding input location. Given an input of the size - :math:`(N, C, IH, IW, window_h, window_w)` and :attr:`output_size`, the + r"""Opposite opration of SlidingWindow, sum over the sliding windows on the + corresponding input location. Given an input of the size + :math:`(N, C, IH, IW, window_h, window_w)` and :attr:`output_size`, the output shape would be :math:`(N, C, output\_size_{h}, output\_size_{w})` and the arguments must satisfy .. 
math:: - \text{IH} = \lfloor \frac{\text{output_size}_{h} + 2 * \text{padding}_{h} - + \text{IH} = \lfloor \frac{\text{output_size}_{h} + 2 * \text{padding}_{h} - \text{dilation}_{h} * (\text{kernel_size}_{h} - 1) - 1}{\text{stride}_{h}} + 1 \rfloor .. math:: - \text{IW} = \lfloor \frac{\text{output_size}_{w} + 2 * \text{padding}_{w} - + \text{IW} = \lfloor \frac{\text{output_size}_{w} + 2 * \text{padding}_{w} - \text{dilation}_{w} * (\text{kernel_size}_{w} - 1) - 1}{\text{stride}_{w}} + 1 \rfloor - + For each output location, we have: .. math:: @@ -113,36 +110,13 @@ class SlidingWindowTranspose(Module): \text{location}(n, c, ih, iw, wh, ww) &= (n, c, oh+wh, ow+ww) \\ \text{where } & oh=-pad_h+ih \times stride_h + (wh-1) \times (dilation_h-1) \\ & ow=-pad_w+iw \times stride_w + (ww-1) \times (dilation_w-1) - - :param output_size: the size of the output tensor. - :param kernel_size: the size of the window to take a max over. - :param padding: implicit zero padding to be added on both sides. Default: 0 - :param stride: the stride of the window. Default: 1 - :param dilation: the dilation of the window. Default: 1 - - Example: - - .. testcode:: - - from megengine import tensor - import megengine.module as M - import numpy as np - - inp = tensor(np.arange(20).reshape(1,1,4,5)) - unfold = M.SlidingWindow(kernel_size=3, padding=0, stride=1, dilation=1) - fold = M.SlidingWindowTranspose((4,5), kernel_size=3, padding=0, stride=1, dilation=1) - out = fold(unfold(inp)) - print(out.numpy()) - - Outputs: - - .. testoutput:: - - [[[[ 0 2 6 6 4] - [10 24 42 32 18] - [20 44 72 52 28] - [15 32 51 36 19]]]] + Args: + output_size: the size of the output tensor. + kernel_size: the size of the window to take a max over. + padding: implicit zero padding to be added on both sides. Default: 0 + stride: the stride of the window. Default: 1 + dilation: the dilation of the window. Default: 1 """ def __init__( diff --git a/imperative/python/megengine/optimizer/adadelta.py b/imperative/python/megengine/optimizer/adadelta.py index 81565a1c..5544cce9 100644 --- a/imperative/python/megengine/optimizer/adadelta.py +++ b/imperative/python/megengine/optimizer/adadelta.py @@ -15,20 +15,20 @@ from .optimizer import Optimizer class Adadelta(Optimizer): - r""" - Implements Adadelta algorithm. - + r"""Implements Adadelta algorithm. + It has been proposed in `"ADADELTA: An Adaptive Learning Rate Method" `_. - :param params: iterable of parameters to optimize or dicts defining - parameter groups. - :param lr: coefficient that scales delta before it is applied - to the parameters. Default: 1.0 - :param rho: coefficient used for computing a running average - of squared gradients. Default: 0.9 - :param eps: term added to the denominator to improve - numerical stability. Default: 1e-6 - :param weight_decay: weight decay (L2 penalty). Default: 0 + Args: + params: iterable of parameters to optimize or dicts defining + parameter groups. + lr: coefficient that scales delta before it is applied + to the parameters. Default: 1.0 + rho: coefficient used for computing a running average + of squared gradients. Default: 0.9 + eps: term added to the denominator to improve + numerical stability. Default: 1e-6 + weight_decay: weight decay (L2 penalty). 
Default: 0 """ def __init__( diff --git a/imperative/python/megengine/optimizer/adagrad.py b/imperative/python/megengine/optimizer/adagrad.py index fadbf48f..43708ae7 100644 --- a/imperative/python/megengine/optimizer/adagrad.py +++ b/imperative/python/megengine/optimizer/adagrad.py @@ -15,20 +15,20 @@ from .optimizer import Optimizer class Adagrad(Optimizer): - r""" - Implements Adagrad algorithm. - + r"""Implements Adagrad algorithm. + It has been proposed in `"Adaptive Subgradient Methods for Online Learning and Stochastic Optimization" `_. - :param params: iterable of parameters to optimize or dicts defining - parameter groups. - :param lr: coefficient that scales delta before it is applied - to the parameters. Default: 1e-2 - :param lr_decay: learning rate decay. Default: 0 - :param eps: term added to the denominator to improve - numerical stability. Default: 1e-10 - :param weight_decay: weight decay (L2 penalty). Default: 0 + Args: + params: iterable of parameters to optimize or dicts defining + parameter groups. + lr: coefficient that scales delta before it is applied + to the parameters. Default: 1e-2 + lr_decay: learning rate decay. Default: 0 + eps: term added to the denominator to improve + numerical stability. Default: 1e-10 + weight_decay: weight decay (L2 penalty). Default: 0 """ def __init__( diff --git a/imperative/python/megengine/optimizer/adam.py b/imperative/python/megengine/optimizer/adam.py index 9e51c90a..794bdd94 100644 --- a/imperative/python/megengine/optimizer/adam.py +++ b/imperative/python/megengine/optimizer/adam.py @@ -15,17 +15,16 @@ from .optimizer import Optimizer class Adam(Optimizer): - r""" - Implements Adam algorithm proposed in `"Adam: A Method for Stochastic Optimization" `_. + r"""Implements Adam algorithm proposed in `"Adam: A Method for Stochastic Optimization" `_. - :param params: iterable of parameters to optimize or dicts defining + Args: + params: iterable of parameters to optimize or dicts defining parameter groups. - :param lr: learning rate. - :param betas: coefficients used for computing running averages of gradient - and its square. Default: (0.9, 0.999) - :param eps: term added to the denominator to improve numerical stability - Default: 1e-8 - :param weight_decay: weight decay (L2 penalty). Default: 0 + lr: learning rate. + betas: coefficients used for computing running averages of gradient + and its square. Default: (0.9, 0.999) + eps: term added to the denominator to improve numerical stability. Default: 1e-8 + weight_decay: weight decay (L2 penalty). Default: 0 """ def __init__( diff --git a/imperative/python/megengine/optimizer/adamw.py b/imperative/python/megengine/optimizer/adamw.py index cd3f2d91..cdbe9663 100644 --- a/imperative/python/megengine/optimizer/adamw.py +++ b/imperative/python/megengine/optimizer/adamw.py @@ -15,17 +15,16 @@ from .optimizer import Optimizer class AdamW(Optimizer): - r""" - Implements AdamW algorithm proposed in `"Decoupled Weight Decay Regularization" `_. + r"""Implements AdamW algorithm proposed in `"Decoupled Weight Decay Regularization" `_. - :param params: iterable of parameters to optimize or dicts defining + Args: + params: iterable of parameters to optimize or dicts defining parameter groups. - :param lr: learning rate. - :param betas: coefficients used for computing running averages of gradient - and its square. Default: (0.9, 0.999) - :param eps: term added to the denominator to improve numerical stability - Default: 1e-8 - :param weight_decay: weight decay (L2 penalty). 
Default: 1e-2 + lr: learning rate. + betas: coefficients used for computing running averages of gradient + and its square. Default: (0.9, 0.999) + eps: term added to the denominator to improve numerical stability. Default: 1e-8 + weight_decay: weight decay (L2 penalty). Default: 1e-2 """ def __init__( diff --git a/imperative/python/megengine/optimizer/clip_grad.py b/imperative/python/megengine/optimizer/clip_grad.py index 84492ad6..bc6b7495 100644 --- a/imperative/python/megengine/optimizer/clip_grad.py +++ b/imperative/python/megengine/optimizer/clip_grad.py @@ -23,10 +23,13 @@ def clip_grad_norm( The norm is computed over all gradients together, as if they were concatenated into a single vector. Gradients are modified in-place. - :param tensors: an iterable of Tensors or a single Tensor. - :param max_norm: max norm of the gradients. - :param ord: type of the used p-norm. Can be ``'inf'`` for infinity norm. - :return: total norm of the parameters (viewed as a single vector). + Args: + tensors: an iterable of Tensors or a single Tensor. + max_norm: max norm of the gradients. + ord: type of the used p-norm. Can be ``'inf'`` for infinity norm. + + Returns: + total norm of the parameters (viewed as a single vector). """ push_scope("clip_grad_norm") if isinstance(tensors, Tensor): @@ -53,14 +56,15 @@ def clip_grad_value( ): r"""Clips gradient of an iterable of parameters to a specified lower and upper. Gradients are modified in-place. - + The gradients are clipped in the range: - + .. math:: \left[\text{lower}, \text{upper}\right] - :param tensors: an iterable of Tensors or a single Tensor. - :param lower: minimum allowed value of the gradients. - :param upper: maximum allowed value of the gradients. + Args: + tensors: an iterable of Tensors or a single Tensor. + lower: minimum allowed value of the gradients. + upper: maximum allowed value of the gradients. """ push_scope("clip_grad_value") if isinstance(tensors, Tensor): diff --git a/imperative/python/megengine/optimizer/lr_scheduler.py b/imperative/python/megengine/optimizer/lr_scheduler.py index ec4ab57f..370bc2e7 100644 --- a/imperative/python/megengine/optimizer/lr_scheduler.py +++ b/imperative/python/megengine/optimizer/lr_scheduler.py @@ -12,11 +12,11 @@ from .optimizer import Optimizer class LRScheduler(metaclass=ABCMeta): - r""" - Base class for all learning rate based schedulers. + r"""Base class for all learning rate based schedulers. - :param optimizer: wrapped optimizer. - :param current_epoch: the index of current epoch. Default: -1 + Args: + optimizer: wrapped optimizer. + current_epoch: the index of current epoch. Default: -1 """ def __init__( # pylint: disable=too-many-branches @@ -45,25 +45,22 @@ class LRScheduler(metaclass=ABCMeta): self.step() def state_dict(self): - r""" - Returns the state of the scheduler as a :class:`dict`. + r"""Returns the state of the scheduler as a :class:`dict`. It contains an entry for every variable in self.__dict__ which is not the optimizer. """ raise NotImplementedError def load_state_dict(self, state_dict): - r""" - Loads the schedulers state. + r"""Loads the schedulers state. - :type state_dict: dict - :param state_dict: scheduler state. + Args: + state_dict: scheduler state. """ raise NotImplementedError def get_lr(self): - r""" Compute current learning rate for the scheduler. 
- """ + r"""Compute current learning rate for the scheduler.""" raise NotImplementedError def step(self, epoch=None): diff --git a/imperative/python/megengine/optimizer/multi_step_lr.py b/imperative/python/megengine/optimizer/multi_step_lr.py index f21485ff..6fe94403 100644 --- a/imperative/python/megengine/optimizer/multi_step_lr.py +++ b/imperative/python/megengine/optimizer/multi_step_lr.py @@ -14,16 +14,14 @@ from .optimizer import Optimizer class MultiStepLR(LRScheduler): - r""" - Decays the learning rate of each parameter group by gamma once the + r"""Decays the learning rate of each parameter group by gamma once the number of epoch reaches one of the milestones. - :param optimizer: wrapped optimizer. - :type milestones: list - :param milestones: list of epoch indices which should be increasing. - :type gamma: float - :param gamma: multiplicative factor of learning rate decay. Default: 0.1 - :param current_epoch: the index of current epoch. Default: -1 + Args: + optimizer: wrapped optimizer. + milestones: list of epoch indices which should be increasing. + gamma: multiplicative factor of learning rate decay. Default: 0.1 + current_epoch: the index of current epoch. Default: -1 """ def __init__( @@ -45,8 +43,7 @@ class MultiStepLR(LRScheduler): super().__init__(optimizer, current_epoch) def state_dict(self): - r""" - Returns the state of the scheduler as a :class:`dict`. + r"""Returns the state of the scheduler as a :class:`dict`. It contains an entry for every variable in self.__dict__ which is not the optimizer. """ @@ -57,11 +54,10 @@ class MultiStepLR(LRScheduler): } def load_state_dict(self, state_dict): - r""" - Loads the schedulers state. + r"""Loads the schedulers state. - :type state_dict: dict - :param state_dict: scheduler state. + Args: + state_dict: scheduler state. """ tmp_dict = {} for key in ["milestones", "gamma", "current_epoch"]: diff --git a/imperative/python/megengine/optimizer/optimizer.py b/imperative/python/megengine/optimizer/optimizer.py index b6f60cd7..827997fa 100644 --- a/imperative/python/megengine/optimizer/optimizer.py +++ b/imperative/python/megengine/optimizer/optimizer.py @@ -30,11 +30,11 @@ required = _RequiredParameter() class Optimizer(metaclass=ABCMeta): - r""" - Base class for all optimizers. + r"""Base class for all optimizers. - :param params: specifies what Tensors should be optimized. - :param defaults: a dict of default parameters of Optimizer, like learning rate or momentum. + Args: + params: specifies what Tensors should be optimized. + defaults: a dict of default parameters of Optimizer, like learning rate or momentum. """ def __init__( # pylint: disable=too-many-branches @@ -76,14 +76,13 @@ class Optimizer(metaclass=ABCMeta): self._create_state(group) def add_param_group(self, param_group: dict): - r""" - Add a param group to ``param_groups`` of the :class:`~megengine.optim.optimizer.Optimizer`. - + r"""Add a param group to ``param_groups`` of the :class:`~megengine.optim.optimizer.Optimizer`. + This can be useful when fine tuning a pre-trained network as frozen layers can be made trainable and added to the :class:`~megengine.optim.optimizer.Optimizer` as training progresses. - :param param_group: specifies what tensors should be optimized along with group. - + Args: + param_group: specifies what tensors should be optimized along with group. 
""" assert isinstance(param_group, dict), "param group must be a dict" @@ -143,10 +142,7 @@ class Optimizer(metaclass=ABCMeta): return params def step(self): - r""" - Performs a single optimization step. - - """ + r"""Performs a single optimization step.""" # set the globle state `_enable_convert_inputs` to `False` to disable # the `convert_inputs` for param updates set_option("record_computing_path", 0) @@ -176,9 +172,7 @@ class Optimizer(metaclass=ABCMeta): param.grad.reset_zero() def clear_grad(self): - r""" - Set the grad attribute to None for all parameters. - """ + r"""Set the grad attribute to None for all parameters.""" for param_group in self.param_groups: push_scope("clear_grad") for param in param_group["params"]: @@ -186,10 +180,10 @@ class Optimizer(metaclass=ABCMeta): pop_scope("clear_grad") def state_dict(self, keep_var=False) -> Dict: - r""" - Export the optimizer state. + r"""Export the optimizer state. - :return: optimizer state. Can be loaded by :meth:`load_state_dict`. + Return: + optimizer state. Can be loaded by :meth:`load_state_dict`. """ param_groups = [] state = dict() @@ -217,10 +211,10 @@ class Optimizer(metaclass=ABCMeta): return {"param_groups": param_groups, "state": state} def load_state_dict(self, state: dict): - r""" - Loads the optimizer state. + r"""Loads the optimizer state. - :param state: optimizer state. Should be an object returned + Args: + state: optimizer state. Should be an object returned from a call to :meth:`state_dict`. """ if len(self.param_groups) != len(state["param_groups"]): diff --git a/imperative/python/megengine/optimizer/sgd.py b/imperative/python/megengine/optimizer/sgd.py index 9c939eb3..fe5efda2 100644 --- a/imperative/python/megengine/optimizer/sgd.py +++ b/imperative/python/megengine/optimizer/sgd.py @@ -15,17 +15,17 @@ from .optimizer import Optimizer class SGD(Optimizer): - r""" - Implements stochastic gradient descent. - + r"""Implements stochastic gradient descent. + Nesterov momentum is based on the formula from `"On the importance of initialization and momentum in deep learning" `_ . - :param params: iterable of parameters to optimize or dicts defining + Args: + params: iterable of parameters to optimize or dicts defining parameter groups. - :param lr: learning rate. - :param momentum: momentum factor. Default: 0.0 - :param weight_decay: weight decay (L2 penalty). Default: 0.0 + lr: learning rate. + momentum: momentum factor. Default: 0.0 + weight_decay: weight decay (L2 penalty). Default: 0.0 """ def __init__( diff --git a/imperative/python/megengine/quantization/fake_quant.py b/imperative/python/megengine/quantization/fake_quant.py index f6625d70..a35f8628 100644 --- a/imperative/python/megengine/quantization/fake_quant.py +++ b/imperative/python/megengine/quantization/fake_quant.py @@ -72,13 +72,13 @@ class _FakeQuantize(Module): class TQT(_FakeQuantize, QParamsModuleMixin): - r""" - TQT: https://arxiv.org/abs/1903.08066 Trained Quantization Thresholds + r"""TQT: https://arxiv.org/abs/1903.08066 Trained Quantization Thresholds for Accurate and Efficient Fixed-Point Inference of Deep Neural Networks. - :param dtype: a string or :class:`~.QuantDtypeMeta` indicating the target - quantization dtype of input. - :param enable: whether do ``normal_forward`` or ``fake_quant_forward``. + Args: + dtype: a string or :class:`~.QuantDtypeMeta` indicating the target + quantization dtype of input. + enable: whether do ``normal_forward`` or ``fake_quant_forward``. 
""" def __init__( @@ -104,12 +104,12 @@ class TQT(_FakeQuantize, QParamsModuleMixin): class FakeQuantize(_FakeQuantize): - r""" - A module to do quant and dequant according to observer's scale and zero_point. + r"""A module to do quant and dequant according to observer's scale and zero_point. - :param dtype: a string or :class:`~.QuantDtypeMeta` indicating the target - quantization dtype of input. - :param enable: whether do ``normal_forward`` or ``fake_quant_forward``. + Args: + dtype: a string or :class:`~.QuantDtypeMeta` indicating the target + quantization dtype of input. + enable: whether do ``normal_forward`` or ``fake_quant_forward``. """ def fake_quant_forward(self, inp, qparams: QParams = None): @@ -122,14 +122,14 @@ class FakeQuantize(_FakeQuantize): class LSQ(_FakeQuantize, QParamsModuleMixin): - r""" - LSQ: https://arxiv.org/pdf/1902.08153.pdf Estimating and scaling the + r"""LSQ: https://arxiv.org/pdf/1902.08153.pdf Estimating and scaling the task loss gradient at each weight and activation layer's quantizer step size - :param dtype: a string or :class:`~.QuantDtypeMeta` indicating the target - quantization dtype of input. - :param enable: whether do ``normal_forward`` or ``fake_quant_forward``. - :param eps:a small value to avoid division by zero. Default: 1e-5 + Args: + dtype: a string or :class:`~.QuantDtypeMeta` indicating the target + quantization dtype of input. + enable: whether do ``normal_forward`` or ``fake_quant_forward``. + eps: a small value to avoid division by zero. Default: 1e-5 """ def __init__( diff --git a/imperative/python/megengine/quantization/observer.py b/imperative/python/megengine/quantization/observer.py index 381884ae..1c6a58f8 100644 --- a/imperative/python/megengine/quantization/observer.py +++ b/imperative/python/megengine/quantization/observer.py @@ -25,11 +25,11 @@ logger = get_logger(__name__) class Observer(Module, QParamsModuleMixin): - r""" - A base class for Observer Module. Used to record input tensor's statistics for + r"""A base class for Observer Module. Used to record input tensor's statistics for quantization. - :param dtype: a string indicating which dtype to collect scale and zero_point of. + Args: + dtype: a string indicating which dtype to collect scale and zero_point of. """ def __init__(self, dtype: Union[str, QuantDtypeMeta], **kwargs): @@ -73,12 +73,12 @@ class Observer(Module, QParamsModuleMixin): class MinMaxObserver(Observer): - r""" - A Observer Module records input tensor's running min and max values to calc scale. + r"""A Observer Module records input tensor's running min and max values to calc scale. - :param mode: set quantization mode. - :param eps: a initial maximum value to avoid division by zero problem. - :param dtype: a string indicating which dtype to collect scale and zero_point of. + Args: + mode: set quantization mode. + eps: a initial maximum value to avoid division by zero problem. + dtype: a string indicating which dtype to collect scale and zero_point of. """ def __init__( @@ -128,12 +128,12 @@ class MinMaxObserver(Observer): class SyncMinMaxObserver(MinMaxObserver): - r""" - A distributed version of :class:`~.MinMaxObserver`. + r"""A distributed version of :class:`~.MinMaxObserver`. - :param mode: set quantization mode. - :param eps: a initial maximum value to avoid division by zero problem. - :param dtype: a string indicating which dtype to collect scale and zero_point of. + Args: + mode: set quantization mode. + eps: a initial maximum value to avoid division by zero problem. 
+ dtype: a string indicating which dtype to collect scale and zero_point of. """ def forward(self, x_orig): @@ -151,13 +151,13 @@ class SyncMinMaxObserver(MinMaxObserver): class ExponentialMovingAverageObserver(MinMaxObserver): - r""" - A :class:`~.MinMaxObserver` with momentum support for min/max updating. + r"""A :class:`~.MinMaxObserver` with momentum support for min/max updating. - :param momentum: momentum ratio for min/max updating. - :param mode: set quantization mode. - :param eps: a initial maximum value to avoid division by zero problem. - :param dtype: a string indicating which dtype to collect scale and zero_point of. + Args: + momentum: momentum ratio for min/max updating. + mode: set quantization mode. + eps: a initial maximum value to avoid division by zero problem. + dtype: a string indicating which dtype to collect scale and zero_point of. """ def __init__( @@ -196,13 +196,13 @@ class ExponentialMovingAverageObserver(MinMaxObserver): class SyncExponentialMovingAverageObserver(ExponentialMovingAverageObserver): - r""" - A distributed version of :class:`~.ExponentialMovingAverageObserver`. + r"""A distributed version of :class:`~.ExponentialMovingAverageObserver`. - :param momentum: momentum ratio for min/max updating. - :param mode: set quantization mode. - :param eps: a initial maximum value to avoid division by zero problem. - :param dtype: a string indicating which dtype to collect scale and zero_point of. + Args: + momentum: momentum ratio for min/max updating. + mode: set quantization mode. + eps: a initial maximum value to avoid division by zero problem. + dtype: a string indicating which dtype to collect scale and zero_point of. """ def forward(self, x_orig): @@ -227,15 +227,15 @@ class SyncExponentialMovingAverageObserver(ExponentialMovingAverageObserver): class HistogramObserver(MinMaxObserver): - r""" - A :class:`~.MinMaxObserver` using running histogram of tensor values + r"""A :class:`~.MinMaxObserver` using running histogram of tensor values for min/max updating. Usually used for calibration quantization. - :param bins: number of bins to use for the histogram. - :param upsample_rate: which ratio to interpolate histograms in. - :param mode: set quantization mode. - :param eps: a initial maximum value to avoid division by zero problem. - :param dtype: a string indicating which dtype to collect scale and zero_point of. + Args: + bins: number of bins to use for the histogram. + upsample_rate: which ratio to interpolate histograms in. + mode: set quantization mode. + eps: a initial maximum value to avoid division by zero problem. + dtype: a string indicating which dtype to collect scale and zero_point of. """ def __init__( @@ -256,8 +256,7 @@ class HistogramObserver(MinMaxObserver): self.histogram = Tensor([-1] + [0.0] * (bins - 1), dtype="float32") def _non_linear_param_search(self): - r""" - Non-linear parameter search. + r"""Non-linear parameter search. An approximation for L2 error minimization for selecting min/max. By selecting new min/max, we filter out outliers in input distribution. """ @@ -269,8 +268,7 @@ class HistogramObserver(MinMaxObserver): bin_width = (np_max_val - np_min_val) / self.bins def _get_norm(delta_begin, delta_end, density, norm_type): - r""" - Compute the norm of the values uniformaly distributed between + r"""Compute the norm of the values uniformaly distributed between delta_begin and delta_end. 
norm = density * (integral_{begin, end} x^2) = density * (end^3 - begin^3) / 3 @@ -285,8 +283,7 @@ class HistogramObserver(MinMaxObserver): return density * norm def _compute_quantization_error(next_start_bin, next_end_bin, norm_type): - r""" - Compute the quantization error if we use start_bin to end_bin as the + r"""Compute the quantization error if we use start_bin to end_bin as the min and max to do the quantization. """ @@ -488,9 +485,7 @@ class HistogramObserver(MinMaxObserver): class PassiveObserver(Observer): - r""" - An Observer that supports setting :attr:`scale` directly. - """ + r"""An Observer that supports setting :attr:`scale` directly.""" def __init__(self, dtype: Union[str, QuantDtypeMeta], **kwargs): super().__init__(dtype, **kwargs) @@ -510,8 +505,10 @@ class PassiveObserver(Observer): return self.qparams def set_qparams(self, qparams: QParams): - """ - :param qparams: used to set initial scale. + r"""set the ``qparams``. + + Args: + qparams: used to set initial scale. """ self.qparams = deepcopy(qparams) if qparams.scale is None: @@ -527,7 +524,5 @@ class PassiveObserver(Observer): self.orig_scale = qparams.scale.numpy() def forward(self, x): - r""" - Just return input because :attr:`qparams` is set by :func:`~.apply_easy_quant`. - """ + r"""Just return input because :attr:`qparams` is set by :func:`~.apply_easy_quant`.""" return x diff --git a/imperative/python/megengine/quantization/qconfig.py b/imperative/python/megengine/quantization/qconfig.py index 6d5c476a..44cae4e9 100644 --- a/imperative/python/megengine/quantization/qconfig.py +++ b/imperative/python/megengine/quantization/qconfig.py @@ -27,33 +27,33 @@ class QConfig( ["weight_observer", "act_observer", "weight_fake_quant", "act_fake_quant"], ) ): - r""" - A config class indicating how to do quantize toward :class:`~.QATModule` 's + r"""A config class indicating how to do quantize toward :class:`~.QATModule` 's ``activation`` and ``weight``. See :meth:`~.QATModule.set_qconfig` for detail usage. - :param weight_observer: interface to instantiate an :class:`~.Observer` indicating - how to collect scales and zero_point of wegiht. - :param act_observer: similar to ``weight_observer`` but toward activation. - :param weight_fake_quant: interface to instantiate a :class:`~.FakeQuantize` indicating - how to do fake_quant calculation. - :param act_observer: similar to ``weight_fake_quant`` but toward activation. - + Args: + weight_observer: interface to instantiate an :class:`~.Observer` indicating + how to collect scales and zero_point of wegiht. + act_observer: similar to ``weight_observer`` but toward activation. + weight_fake_quant: interface to instantiate a :class:`~.FakeQuantize` indicating + how to do fake_quant calculation. + act_observer: similar to ``weight_fake_quant`` but toward activation. + Examples: - - .. code-block:: - - # Default EMA QConfig for QAT. - ema_fakequant_qconfig = QConfig( - weight_observer=partial(MinMaxObserver, dtype="qint8_narrow"), - act_observer=partial(ExponentialMovingAverageObserver, dtype="qint8"), - weight_fake_quant=partial(FakeQuantize, dtype="qint8_narrow"), - act_fake_quant=partial(FakeQuantize, dtype="qint8"), - ) - + + .. code-block:: + + # Default EMA QConfig for QAT. 
+ ema_fakequant_qconfig = QConfig( + weight_observer=partial(MinMaxObserver, dtype="qint8_narrow"), + act_observer=partial(ExponentialMovingAverageObserver, dtype="qint8"), + weight_fake_quant=partial(FakeQuantize, dtype="qint8_narrow"), + act_fake_quant=partial(FakeQuantize, dtype="qint8"), + ) + Each parameter is a ``class`` rather than an instance. And we recommand using ``functools.partial`` to add initialization parameters of the ``class``, so that don't need to provide parameters in :meth:`~.QATModule.set_qconfig`. - + Usually we choose narrow version dtype (like ``qint8_narrow``) for weight related paramters and normal version for activation related ones. For the result of multiplication and addition as ``a * b + c * d``, if four variables are all -128 of diff --git a/imperative/python/megengine/quantization/quantize.py b/imperative/python/megengine/quantization/quantize.py index 22fe8d15..830334cb 100644 --- a/imperative/python/megengine/quantization/quantize.py +++ b/imperative/python/megengine/quantization/quantize.py @@ -57,14 +57,14 @@ qat_modules = tuple(_qat2quantized_dict.keys()) def quantize(module: Module, inplace: bool = True, mapping: dict = None): - r""" - Recursively convert :class:`~.QATModule` to :class:`~.QuantizedModule` + r"""Recursively convert :class:`~.QATModule` to :class:`~.QuantizedModule` through :meth:`~.Module.apply`. - :param module: root module to do convert recursively. - :param inplace: whether to convert submodules in-place. - :param mapping: a dict indicating how to convert custom modules from QATModule to - QuantizedModule. Will be combined with internal default convert mapping dict. + Args: + module: root module to do convert recursively. + inplace: whether to convert submodules in-place. + mapping: a dict indicating how to convert custom modules from QATModule to + QuantizedModule. Will be combined with internal default convert mapping dict. """ if not inplace: @@ -94,16 +94,16 @@ def quantize_qat( qconfig: QConfig = ema_fakequant_qconfig, mapping: dict = None, ): - r""" - Recursively convert float :class:`~.Module` to :class:`~.QATModule` + r"""Recursively convert float :class:`~.Module` to :class:`~.QATModule` through :meth:`~.Module.apply` and set qconfig relatively. - :param module: root module to do convert recursively. - :param inplace: whether to convert submodules in-place. - :param qconfig: an instance of :class:`~.QConfig` to be set as submodules' qconfig. - default is ``ema_fakequant_qconfig``. - :param mapping: a dict indicating how to convert custom modules from Module to QATModule. - Will be combined with internal default convert mapping dict. + Args: + module: root module to do convert recursively. + inplace: whether to convert submodules in-place. + qconfig: an instance of :class:`~.QConfig` to be set as submodules' qconfig. + default is ``ema_fakequant_qconfig``. + mapping: a dict indicating how to convert custom modules from Module to QATModule. + Will be combined with internal default convert mapping dict. """ if not inplace: @@ -133,12 +133,12 @@ def quantize_qat( def reset_qconfig(module: Module, qconfig: QConfig, inplace: bool = True): - r""" - Reset :class:`~._FakeQuantize` and :class:`~.Observer` according to ``qconfig`` + r"""Reset :class:`~._FakeQuantize` and :class:`~.Observer` according to ``qconfig`` - :param module: root module to reset recursively. - :param qconfig: an instance of :class:`~.QConfig` to be set as submodules' qconfig. - :param inplace: whether to reset submodules in-place. 
+ Args: + module: root module to reset recursively. + qconfig: an instance of :class:`~.QConfig` to be set as submodules' qconfig. + inplace: whether to reset submodules in-place. """ if not inplace: @@ -175,19 +175,17 @@ def _propagate(module: Module, func_str: str, *args, **kargs): def propagate_qconfig(module: QATModule, qconfig: QConfig): - r""" - Recursively set ``module``'s qconfig through :meth:`~.Module.apply`. + r"""Recursively set ``module``'s qconfig through :meth:`~.Module.apply`. - :param module: root module to traverse recursively. - :param qconfig: a instance of :class:`~.QConfig` to be set as submodules' qconfig. + Args: + module: root module to traverse recursively. + qconfig: a instance of :class:`~.QConfig` to be set as submodules' qconfig. """ _propagate(module, "set_qconfig", qconfig) def hook_qat_module(module: Module, func: Callable): - r""" - Add hooks for all :class:`~.QATModule` submodule - """ + r"""Add hooks for all :class:`~.QATModule` submodule""" def is_qat(mod: Module): return isinstance(mod, QATModule) @@ -202,15 +200,16 @@ def hook_qat_module(module: Module, func: Callable): def apply_easy_quant( module: Module, data: Tensor, start: float = 0.8, stop: float = 1.2, num: int = 40 ): - r""" - Implementation of ``EasyQuant``: https://arxiv.org/pdf/2006.16669. + r"""Implementation of ``EasyQuant``: https://arxiv.org/pdf/2006.16669. Search for optimal scales. - :param module: root module. - :param data: input tensor used to search optimal scale. - :param start: lower bound of the search interval. - :param stop: upper bound of the search interval. - :param num: number of samples to search. + Args: + module: root module. + data: input tensor used to search optimal scale. + start: lower bound of the search interval. + stop: upper bound of the search interval. + num: number of samples to search. + module: Module: """ batch_size = data.shape[0] @@ -267,40 +266,40 @@ def apply_easy_quant( def disable_fake_quant(module: Module): - r""" - Recursively disable ``module`` fake quantization in QATModule through :meth:`~.Module.apply` + r"""Recursively disable ``module`` fake quantization in QATModule through :meth:`~.Module.apply` - :param module: root module to do disable fake quantization recursively. + Args: + module: root module to do disable fake quantization recursively. """ _propagate(module, "set_fake_quant", False) def disable_observer(module: Module): - r""" - Recursively disable ``module`` observer in QATModule through :meth:`~.Module.apply` + r"""Recursively disable ``module`` observer in QATModule through :meth:`~.Module.apply` - :param module: root module to do disable observer recursively. + Args: + module: root module to do disable observer recursively. """ _propagate(module, "set_observer", False) def enable_fake_quant(module: Module): - r""" - Recursively enable ``module`` fake quantization in QATModule through :meth:`~.Module.apply` + r"""Recursively enable ``module`` fake quantization in QATModule through :meth:`~.Module.apply` - :param module: root module to do enable fake quantization recursively. + Args: + module: root module to do enable fake quantization recursively. """ _propagate(module, "set_fake_quant", True) def enable_observer(module: Module): - r""" - Recursively enable ``module`` observer in QATModule through :meth:`~.Module.apply` + r"""Recursively enable ``module`` observer in QATModule through :meth:`~.Module.apply` - :param module: root module to do enable observer recursively. 
+ Args: + module: root module to do enable observer recursively. """ _propagate(module, "set_observer", True) diff --git a/imperative/python/megengine/quantization/utils.py b/imperative/python/megengine/quantization/utils.py index a83844fd..e48edc99 100644 --- a/imperative/python/megengine/quantization/utils.py +++ b/imperative/python/megengine/quantization/utils.py @@ -25,8 +25,7 @@ from ..tensor import Tensor class Round(Function): - """ - The functional round have no grad and can not use for quantization-aware-training. + r"""The functional round have no grad and can not use for quantization-aware-training. We use Function and STE(Straight-Through Estimator) to implement backward propagation. """ @@ -68,17 +67,14 @@ def register_method_to_class(cls): class QuantMode(Enum): - """ - Quantization mode enumerate class. - """ + r"""Quantization mode enumerate class.""" SYMMERTIC = 1 ASYMMERTIC = 2 class QParams: - """ - To standardize FakeQuant, Observer and Tensor's qparams format. If custom + r"""To standardize FakeQuant, Observer and Tensor's qparams format. If custom qparams is needed, inherit this class and add custom ``__slots__``. """ @@ -116,8 +112,7 @@ class QParams: class LSQParams: - """ - To standardize LSQ's qparams format. If custom + r"""To standardize LSQ's qparams format. If custom qparams is needed, inherit this class and add custom ``__slots__``. """ @@ -183,8 +178,14 @@ def create_qparams( scale: Tensor = None, zero_point: Tensor = None, ): - """ - Return :class:`~.QParams` according to the mode. + r""" + + Args: + mode: QuantMode: + dtype_meta: Union[str: + QuantDtypeMeta]: + scale: Tensor: + zero_point: Tensor: """ if isinstance(dtype_meta, str): dtype_meta = _builtin_quant_dtypes[dtype_meta] @@ -197,12 +198,11 @@ def create_qparams( def fake_quant_tensor(inp: Tensor, qparams: QParams) -> Tensor: - """ - Apply fake quantization to the inp tensor. - - :param inp: the input tensor which need to be faked. - :param qparams: to get mode, qmin, qmax, scale and zero_point from. + """Apply fake quantization to the inp tensor. + Args: + inp: the input tensor which need to be faked. + qparams: to get mode, qmin, qmax, scale and zero_point from. """ scale = qparams.scale if qparams.mode == QuantMode.ASYMMERTIC: @@ -217,17 +217,16 @@ def fake_quant_tensor(inp: Tensor, qparams: QParams) -> Tensor: def fake_quant_bias(bias: Tensor, inp: Tensor, w_qat: Tensor) -> Tensor: - """ - Apply fake quantization to bias, with the special scale from input tensor + """Apply fake quantization to bias, with the special scale from input tensor and weight tensor, the quantized type set to qint32 also. - :param bias: the bias tensor which need to be faked. - :param inp: the input tensor which contain the quantization parameters. - :param w_qat: the weight tensor which contain the quantization parameters. + Args: + bias: the bias tensor which need to be faked. + inp: the input tensor which contain the quantization parameters. + w_qat: the weight tensor which contain the quantization parameters. - .. warning:: + Warning: Only work for symmetric quantization method now. 
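        This helper is normally driven by :meth:`~.QATModule.apply_quant_bias`; the
        underlying mechanism is the same one :func:`fake_quant_tensor` applies to a
        plain tensor, sketched below with hand-built qparams (the scale value is an
        illustrative assumption).

        .. code-block::

            import megengine as mge
            from megengine.quantization.utils import (
                QuantMode,
                create_qparams,
                fake_quant_tensor,
            )

            x = mge.tensor([[0.13, -1.47], [0.55, 2.01]], dtype="float32")
            # note: the enum member is spelled SYMMERTIC in this codebase
            qparams = create_qparams(QuantMode.SYMMERTIC, "qint8", scale=mge.tensor(0.05))
            y = fake_quant_tensor(x, qparams)  # values snapped to the qint8 grid, still float32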
- """ b_qat = bias if ( diff --git a/imperative/python/megengine/random/rng.py b/imperative/python/megengine/random/rng.py index fe47a62c..661a7420 100644 --- a/imperative/python/megengine/random/rng.py +++ b/imperative/python/megengine/random/rng.py @@ -220,29 +220,29 @@ def _permutation(n: int, seed: int, device: str, handle: int, dtype: str) -> Ten class RNG: - r""" - :class:`RNG` exposes a number of methods for generating random numbers. + r""":class:`RNG` exposes a number of methods for generating random numbers. + + Args: + seed: random seed used to initialize the pseudo-random number generator. Default: None + device: the device of generated tensor. Default: None - :param seed: random seed used to initialize the pseudo-random number generator. - Default: None - :param device: the device of generated tensor. Default: None Examples: - .. testcode:: + .. testcode:: - import megengine.random as rand - rng = rand.RNG(seed=100) - x = rng.uniform(size=(2, 2)) - print(x.numpy()) + import megengine.random as rand + rng = rand.RNG(seed=100) + x = rng.uniform(size=(2, 2)) + print(x.numpy()) - Outputs: + Outputs: - .. testoutput:: - :options: +SKIP + .. testoutput:: + :options: +SKIP - [[0.84811664 0.6147553 ] - [0.59429836 0.64727545]] + [[0.84811664 0.6147553 ] + [0.59429836 0.64727545]] """ @@ -259,32 +259,33 @@ class RNG: def uniform( self, low: float = 0, high: float = 1, size: Optional[Iterable[int]] = None ): - r""" - Random variable with uniform distribution $U(0, 1)$. + r"""Random variable with uniform distribution $U(0, 1)$. + + Args: + low: lower range. Default: 0 + high: upper range. Default: 1 + size: the size of output tensor. Default: None - :param low: lower range. Default: 0 - :param high: upper range. Default: 1 - :param size: the size of output tensor. Default: None - :return: the output tensor. + Returns: + the output tensor. Examples: - .. testcode:: + .. testcode:: - import megengine as mge - import megengine.random as rand + import megengine as mge + import megengine.random as rand - x = rand.uniform(size=(2, 2)) - print(x.numpy()) - - Outputs: - - .. testoutput:: - :options: +SKIP + x = rand.uniform(size=(2, 2)) + print(x.numpy()) + + Outputs: - [[0.91600335 0.6680226 ] - [0.2046729 0.2769141 ]] + .. testoutput:: + :options: +SKIP + [[0.91600335 0.6680226 ] + [0.2046729 0.2769141 ]] """ _seed = self._seed() if callable(self._seed) else self._seed return _uniform( @@ -299,33 +300,34 @@ class RNG: def normal( self, mean: float = 0, std: float = 1, size: Optional[Iterable[int]] = None ): - r""" - Random variable with Gaussian distribution :math:`N(\mu, \sigma)`. + r"""Random variable with Gaussian distribution :math:`N(\mu, \sigma)`. - :param mean: the mean or expectation of the distribution. Default: 0 - :param std: the standard deviation of the distribution (variance = :math:`\sigma ^ 2`). - Default: 1 - :param size: the size of output tensor. Default: None - :return: the output tensor. + Args: + mean: the mean or expectation of the distribution. Default: 0 + std: the standard deviation of the distribution (variance = :math:`\sigma ^ 2`). + Default: 1 + size: the size of output tensor. Default: None + + Returns: + the output tensor. Examples: - .. testcode:: + .. testcode:: - import megengine as mge - import megengine.random as rand + import megengine as mge + import megengine.random as rand - x = rand.normal(mean=0, std=1, size=(2, 2)) - print(x.numpy()) - - Outputs: - - .. 
testoutput:: - :options: +SKIP + x = rand.normal(mean=0, std=1, size=(2, 2)) + print(x.numpy()) - [[-1.4010863 -0.9874344 ] - [ 0.56373274 0.79656655]] + Outputs: + .. testoutput:: + :options: +SKIP + + [[-1.4010863 -0.9874344 ] + [ 0.56373274 0.79656655]] """ _seed = self._seed() if callable(self._seed) else self._seed return _normal( @@ -343,12 +345,12 @@ class RNG: scale: Union[Tensor, float] = 1, size: Optional[Iterable[int]] = None, ): - r""" - Random variable with Gamma distribution :math:`\Gamma(k, \theta)`. + r"""Random variable with Gamma distribution :math:`\Gamma(k, \theta)`. The corresponding probability density function is - + .. math:: + p(x)=x^{k-1} \frac{e^{-x / \theta}}{\theta^{k} \Gamma(k)} \quad \text { for } x>0 \quad k, \theta>0, @@ -357,52 +359,54 @@ class RNG: .. math:: \Gamma(k)=(k-1) ! \quad \text { for } \quad k>0. - :param shape: the shape parameter (sometimes designated "k") of the distribution. - Must be non-negative. - :param scale: the scale parameter (sometimes designated "theta") of the distribution. - Must be non-negative. Default: 1 - :param size: the size of output tensor. If shape and scale are scalars and given size is, e.g., - `(m, n)`, then the output shape is `(m, n)`. If shape or scale is a Tensor and given size - is, e.g., `(m, n)`, then the output shape is `(m, n) + broadcast(shape, scale).shape`. - The broadcast rules are consistent with `numpy.broadcast`. Default: None - :return: the output tensor. + Args: + shape: the shape parameter (sometimes designated "k") of the distribution. + Must be non-negative. + scale: the scale parameter (sometimes designated "theta") of the distribution. + Must be non-negative. Default: 1 + size: the size of output tensor. If shape and scale are scalars and given size is, e.g., + `(m, n)`, then the output shape is `(m, n)`. If shape or scale is a Tensor and given size + is, e.g., `(m, n)`, then the output shape is `(m, n) + broadcast(shape, scale).shape`. + The broadcast rules are consistent with `numpy.broadcast`. Default: None + + Returns: + the output tensor. Examples: - .. testcode:: + .. testcode:: - import megengine as mge - import megengine.random as rand + import megengine as mge + import megengine.random as rand - x = rand.gamma(shape=2, scale=1, size=(2, 2)) - print(x.numpy()) + x = rand.gamma(shape=2, scale=1, size=(2, 2)) + print(x.numpy()) - shape = mge.Tensor([[ 1], - [10]], dtype="float32") - scale = mge.Tensor([1,5], dtype="float32") + shape = mge.Tensor([[ 1], + [10]], dtype="float32") + scale = mge.Tensor([1,5], dtype="float32") - x = rand.gamma(shape=shape, scale=scale) - print(x.numpy()) + x = rand.gamma(shape=shape, scale=scale) + print(x.numpy()) - x = rand.gamma(shape=shape, scale=scale, size=2) - print(x.numpy()) - - Outputs: - - .. testoutput:: - :options: +SKIP - - [[1.5064533 4.0689363 ] - [0.71639484 1.4551026 ]] + x = rand.gamma(shape=shape, scale=scale, size=2) + print(x.numpy()) + + Outputs: - [[ 0.4352188 11.399335 ] - [ 9.1888 52.009277 ]] + .. 
testoutput:: + :options: +SKIP - [[[ 1.1726005 3.9654975 ] - [13.656933 36.559006 ]] - [[ 0.25848487 2.5540342 ] - [11.960409 21.031536 ]]] + [[1.5064533 4.0689363 ] + [0.71639484 1.4551026 ]] + [[ 0.4352188 11.399335 ] + [ 9.1888 52.009277 ]] + + [[[ 1.1726005 3.9654975 ] + [13.656933 36.559006 ]] + [[ 0.25848487 2.5540342 ] + [11.960409 21.031536 ]]] """ _seed = self._seed() if callable(self._seed) else self._seed return _gamma( @@ -415,155 +419,161 @@ class RNG: beta: Union[Tensor, float], size: Optional[Iterable[int]] = None, ): - r""" - Random variable with Beta distribution :math:`\operatorname{Beta}(\alpha, \beta)`. + r"""Random variable with Beta distribution :math:`\operatorname{Beta}(\alpha, \beta)`. The corresponding probability density function is - + .. math:: - p(x)=\frac{1}{\mathrm{~B}(\alpha, \beta)} x^{\alpha-1}(1-x)^{\beta-1} + + p(x)=\frac{1}{\mathrm{~B}(\alpha, \beta)} x^{\alpha-1}(1-x)^{\beta-1} \quad \text { for } \alpha, \beta>0, where :math:`\mathrm{~B}(\alpha, \beta)` is the beta function, .. math:: + \mathrm{~B}(\alpha, \beta)=\int_{0}^{1} t^{\alpha-1}(1-t)^{\beta-1} d t. - :param alpha: the alpha parameter of the distribution. Must be non-negative. - :param beta: the beta parameter of the distribution. Must be non-negative. - :param size: the size of output tensor. If alpha and beta are scalars and given size is, e.g., - `(m, n)`, then the output shape is `(m, n)`. If alpha or beta is a Tensor and given size - is, e.g., `(m, n)`, then the output shape is `(m, n) + broadcast(alpha, beta).shape`. - The broadcast rules are consistent with `numpy.broadcast`. Default: None - :return: the output tensor. + Args: + alpha: the alpha parameter of the distribution. Must be non-negative. + beta: the beta parameter of the distribution. Must be non-negative. + size: the size of output tensor. If alpha and beta are scalars and given size is, e.g., + `(m, n)`, then the output shape is `(m, n)`. If alpha or beta is a Tensor and given size + is, e.g., `(m, n)`, then the output shape is `(m, n) + broadcast(alpha, beta).shape`. + + Returns: + the output tensor. Examples: - .. testcode:: + .. testcode:: - import megengine as mge - import megengine.random as rand + import megengine as mge + import megengine.random as rand - x = rand.beta(alpha=2, beta=1, size=(2, 2)) - print(x.numpy()) + x = rand.beta(alpha=2, beta=1, size=(2, 2)) + print(x.numpy()) - alpha = mge.Tensor([[0.5], - [ 3]], dtype="float32") - beta = mge.Tensor([0.5,5], dtype="float32") + alpha = mge.Tensor([[0.5], + [ 3]], dtype="float32") + beta = mge.Tensor([0.5,5], dtype="float32") - x = rand.beta(alpha=alpha, beta=beta) - print(x.numpy()) + x = rand.beta(alpha=alpha, beta=beta) + print(x.numpy()) - x = rand.beta(alpha=alpha, beta=beta, size=2) - print(x.numpy()) - - Outputs: - - .. testoutput:: - :options: +SKIP - - [[0.582565 0.91763186] - [0.86963767 0.6088103 ]] - - [[0.41503012 0.16438372] - [0.90159506 0.47588003]] - - [[[0.55195075 0.01111084] - [0.95298755 0.25048104]] - [[0.11680304 0.13859665] - [0.997879 0.43259275]]] + x = rand.beta(alpha=alpha, beta=beta, size=2) + print(x.numpy()) + + Outputs: + + .. 
testoutput:: + :options: +SKIP + + [[0.582565 0.91763186] + [0.86963767 0.6088103 ]] + + [[0.41503012 0.16438372] + [0.90159506 0.47588003]] + [[[0.55195075 0.01111084] + [0.95298755 0.25048104]] + [[0.11680304 0.13859665] + [0.997879 0.43259275]]] """ _seed = self._seed() if callable(self._seed) else self._seed return _beta(alpha=alpha, beta=beta, size=size, seed=_seed, handle=self._handle) def poisson(self, lam: Union[float, Tensor], size: Optional[Iterable[int]] = None): - r""" - Random variable with poisson distribution :math:`\operatorname{Poisson}(\lambda)`. + r"""Random variable with poisson distribution :math:`\operatorname{Poisson}(\lambda)`. The corresponding probability density function is .. math:: + f(k ; \lambda)=\frac{\lambda^{k} e^{-\lambda}}{k !}, - + where k is the number of occurrences :math:`({\displaystyle k=0,1,2...})`. - :param lam: the lambda parameter of the distribution. Must be non-negative. - :param size: the size of output tensor. If lam is a scalar and given size is, e.g., `(m, n)`, - then the output shape is `(m, n)`. If lam is a Tensor with shape `(k, v)` and given - size is, e.g., `(m, n)`, then the output shape is `(m, n, k, v)`. Default: None. - :return: the output tensor. + Args: + lam: the lambda parameter of the distribution. Must be non-negative. + size: the size of output tensor. If lam is a scalar and given size is, e.g., `(m, n)`, + then the output shape is `(m, n)`. If lam is a Tensor with shape `(k, v)` and given + size is, e.g., `(m, n)`, then the output shape is `(m, n, k, v)`. Default: None. + + Returns: + the output tensor. + + Examples: - .. testcode:: + .. testcode:: - import megengine as mge - import megengine.random as rand + import megengine as mge + import megengine.random as rand - x = rand.poisson(lam=2., size=(1, 3)) - print(x.numpy()) + x = rand.poisson(lam=2., size=(1, 3)) + print(x.numpy()) - lam = mge.Tensor([[1.,1.], - [10,10]], dtype="float32") + lam = mge.Tensor([[1.,1.], + [10,10]], dtype="float32") - x = rand.poisson(lam=lam) - print(x.numpy()) + x = rand.poisson(lam=lam) + print(x.numpy()) - x = rand.poisson(lam=lam, size=(1,3)) - print(x.numpy()) + x = rand.poisson(lam=lam, size=(1,3)) + print(x.numpy()) - Outputs: - - .. testoutput:: - :options: +SKIP + Outputs: - [[3. 1. 3.]] + .. testoutput:: + :options: +SKIP - [[ 2. 2.] - [12. 11.]] + [[3. 1. 3.]] - [[[[ 1. 1.] - [11. 4.]] - [[ 0. 0.] - [ 9. 13.]] - [[ 0. 1.] - [ 7. 12.]]]] + [[ 2. 2.] + [12. 11.]] + [[[[ 1. 1.] + [11. 4.]] + [[ 0. 0.] + [ 9. 13.]] + [[ 0. 1.] + [ 7. 12.]]]] """ _seed = self._seed() if callable(self._seed) else self._seed return _poisson(lam=lam, size=size, seed=_seed, handle=self._handle) def permutation(self, n: int, *, dtype: str = "int32"): - r""" - Generates a random permutation of integers from :math:`0` to :math:`n - 1`. + r"""Generates a random permutation of integers from :math:`0` to :math:`n - 1`. - :param n: the upper bound. Must be larger than 0. - :param dtype: the output data type. int32, int16 and float32 are - supported. Default: int32 - :return: the output tensor. + Args: + n: the upper bound. Must be larger than 0. + dtype: the output data type. int32, int16 and float32 are supported. Default: int32 + + Returns: + the output tensor. Examples: - .. testcode:: + .. 
testcode:: - import megengine as mge - import megengine.random as rand + import megengine as mge + import megengine.random as rand - x = rand.permutation(n=10, dtype="int32") - print(x.numpy()) + x = rand.permutation(n=10, dtype="int32") + print(x.numpy()) - x = rand.permutation(n=10, dtype="float32") - print(x.numpy()) - - Outputs: - - .. testoutput:: - :options: +SKIP + x = rand.permutation(n=10, dtype="float32") + print(x.numpy()) + + Outputs: - [4 5 0 7 3 8 6 1 9 2] - [3. 4. 9. 0. 6. 8. 7. 1. 5. 2.] + .. testoutput:: + :options: +SKIP + [4 5 0 7 3 8 6 1 9 2] + [3. 4. 9. 0. 6. 8. 7. 1. 5. 2.] """ _seed = self._seed() if callable(self._seed) else self._seed return _permutation( diff --git a/imperative/python/megengine/serialization.py b/imperative/python/megengine/serialization.py index 252b73fa..217c34ae 100644 --- a/imperative/python/megengine/serialization.py +++ b/imperative/python/megengine/serialization.py @@ -14,18 +14,13 @@ from .utils.max_recursion_limit import max_recursion_limit def save(obj, f, pickle_module=pickle, pickle_protocol=pickle.HIGHEST_PROTOCOL): - r""" - Save an object to disk file. - - :type obj: object - :param obj: object to save. Only ``module`` or ``state_dict`` are allowed. - :type f: text file object - :param f: a string of file name or a text file object to which ``obj`` is saved to. - :type pickle_module: - :param pickle_module: Default: ``pickle``. - :type pickle_protocol: - :param pickle_protocol: Default: ``pickle.HIGHEST_PROTOCOL``. + r"""Save an object to disk file. + Args: + obj: object to save. Only ``module`` or ``state_dict`` are allowed. + f: a string of file name or a text file object to which ``obj`` is saved to. + pickle_module: Default: ``pickle``. + pickle_protocol: Default: ``pickle.HIGHEST_PROTOCOL``. """ if isinstance(f, str): with open(f, "wb") as fout: @@ -82,40 +77,30 @@ def _get_callable_map_location(map_location): def load(f, map_location=None, pickle_module=pickle): - r""" - Load an object saved with save() from a file. + r"""Load an object saved with :func:~.megengine.save` from a file. - :type f: text file object - :param f: a string of file name or a text file object from which to load. - :type map_location: str, dict or a function specifying the map rules - :param map_location: Default: ``None``. + Args: + f: a string of file name or a text file object from which to load. + map_location: Default: ``None``. + pickle_module: Default: ``pickle``. - .. note:: - - map_location defines device mapping. See examples for usage. - - :type pickle_module: - :param pickle_module: Default: ``pickle``. - - .. note:: - - If you will call :func:`mge.set_default_device()`, please do it - before :func:`mge.load()`. + Note: + * ``map_location`` defines device mapping. See examples for usage. + * If you will call :func:`~.megengine.set_default_device()`, please do it + before :func:`~.megengine.load()`. Examples: - - .. testcode: - - import megengine as mge - # Load tensors to the same device as defined in model.pkl - mge.load('model.pkl') - # Load all tensors to gpu0. - mge.load('model.pkl', map_location='gpu0') - # Load all tensors originally on gpu0 to cpu0 - mge.load('model.pkl', map_location={'gpu0':'cpu0'}) - # Load all tensors to cpu0 - mge.load('model.pkl', map_location=lambda dev: 'cpu0') - + .. code-block:: + + import megengine as mge + # Load tensors to the same device as defined in model.pkl + mge.load('model.pkl') + # Load all tensors to gpu0. 
+ mge.load('model.pkl', map_location='gpu0') + # Load all tensors originally on gpu0 to cpu0 + mge.load('model.pkl', map_location={'gpu0':'cpu0'}) + # Load all tensors to cpu0 + mge.load('model.pkl', map_location=lambda dev: 'cpu0') """ if isinstance(f, str): with open(f, "rb") as fin: diff --git a/imperative/python/megengine/tensor.py b/imperative/python/megengine/tensor.py index 24763557..955efaa8 100644 --- a/imperative/python/megengine/tensor.py +++ b/imperative/python/megengine/tensor.py @@ -26,16 +26,15 @@ logger = get_logger(__name__) class Tensor(_Tensor, ArrayMethodMixin): - r""" - A tensor object represents a multidimensional, homogeneous array of fixed-size items. - - :param data: The value of returned Tensor. - :type data: Tensor, :class:`~.numpy.ndarray`, :class:`list` or python number. - :param dtype: The dtype of returned Tensor. Uses data's dtype if not specified. - :param device: The desired device of returned Tensor. Uses :func:`get_default_device` if not specified. - :param is_const: Whether make it a ``ImutableTensor`` in tracing mode. - :param no_cache: Whether cache it for memory sharing. - :param name: Used to improve convenience in graph operation on dumped model. + r"""A tensor object represents a multidimensional, homogeneous array of fixed-size items. + + Args: + data(Tensor, :class:`~.numpy.ndarray`, :class:`list` or python number.): The value of returned Tensor. + dtype: The dtype of returned Tensor. Uses data's dtype if not specified. + device: The desired device of returned Tensor. Uses :func:`get_default_device` if not specified. + is_const: Whether make it a ``ImutableTensor`` in tracing mode. + no_cache: Whether cache it for memory sharing. + name: Used to improve convenience in graph operation on dumped model. """ grad = None @@ -88,18 +87,16 @@ class Tensor(_Tensor, ArrayMethodMixin): @property def shape(self) -> Union[tuple, "Tensor"]: - r""" - Returns a :class:`tuple` or a :class:`~.Tensor` represents tensor dimensions. - - .. note:: + r"""Returns a :class:`tuple` or a :class:`~.Tensor` represents tensor dimensions. + Note: The shape of a tensor was usually represented by a :class:`tuple`. - But if a tensor was treated as symbolic placeholder with tracing, + But if a tensor was treated as symbolic placeholder with tracing, it's shape could also be a :class:`~.Tensor`. See :class:`~.trace` for more details. - The shape property is usually used to get the current shape of a tensor, - but may also be used to reshape the tensor in-place by assigning a tuple of tensor dimensions to it. - As with :func:`~.reshape`, one of the new shape dimensions can be -1, + The shape property is usually used to get the current shape of a tensor, + but may also be used to reshape the tensor in-place by assigning a tuple of tensor dimensions to it. + As with :func:`~.reshape`, one of the new shape dimensions can be -1, in which case its value is inferred from the size of the tensor and the remaining dimensions. """ shape = super().shape @@ -113,23 +110,17 @@ class Tensor(_Tensor, ArrayMethodMixin): @property def device(self) -> CompNode: - r""" - Returns a string represents the device a :class:`~.Tensor` storaged on. - """ + r"""Returns a string represents the device a :class:`~.Tensor` storaged on.""" return super().device @property def dtype(self) -> np.dtype: - r""" - Returns a :class:`numpy.dtype` object represents the data type of a :class:`~.Tensor`. 
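A minimal sketch of the ``Tensor`` attributes covered above; the array values and the printed device string are illustrative only and depend on the installed backend:

.. code-block::

    import numpy as np
    import megengine as mge

    x = mge.Tensor(np.arange(6, dtype="float32"))
    print(x.shape)   # (6,)
    print(x.dtype)   # float32
    print(x.device)  # default device, e.g. "xpux:0"

    # assigning a tuple to .shape reshapes in place; one dimension may be -1
    x.shape = (2, -1)
    print(x.numpy())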
- """ + r"""Returns a :class:`numpy.dtype` object represents the data type of a :class:`~.Tensor`.""" return super().dtype @property def qparams(self): - r""" - Returns a :class:`~.QParams` object containing quantization params of a :class:`~.Tensor`. - """ + r"""Returns a :class:`~.QParams` object containing quantization params of a :class:`~.Tensor`.""" from .quantization.utils import create_qparams # pylint: disable=all if self._qparams is None: @@ -137,15 +128,11 @@ class Tensor(_Tensor, ArrayMethodMixin): return self._qparams def numpy(self) -> np.ndarray: - r""" - Returns self :class:`~.Tensor` as a :class:`numpy.ndarray`. - """ + r"""Returns self :class:`~.Tensor` as a :class:`numpy.ndarray`.""" return super().numpy() def detach(self): - r""" - Returns a new :class:`~.Tensor`, detached from the current graph. - """ + r"""Returns a new :class:`~.Tensor`, detached from the current graph.""" return super().detach() def _reset(self, other): @@ -180,9 +167,7 @@ class Tensor(_Tensor, ArrayMethodMixin): self *= 0 def to(self, device): - r""" - Copy self :class:`~.Tensor` to specified device. See :func:`~.copy` - """ + r"""Copy self :class:`~.Tensor` to specified device. See :func:`~.copy`""" if isinstance(device, str) and not _valid_device(device): raise ValueError( "invalid device name {}. For the correct format of the device name, please refer to the instruction of megengine.device.set_default_device()".format( @@ -208,13 +193,11 @@ class Tensor(_Tensor, ArrayMethodMixin): return id(self) def __getnewargs__(self): - r""" __getnewargs__ will be called for pickle serialization or deep copy - """ + r""" __getnewargs__ will be called for pickle serialization or deep copy""" return (self.numpy(), self.dtype, self.device.logical_name) def __getstate__(self): - r""" __getstate__ will be called for pickle serialization or deep copy - """ + r""" __getstate__ will be called for pickle serialization or deep copy""" state = {} if self._qparams is not None: state["qparams"] = self._qparams @@ -245,13 +228,10 @@ tensor = Tensor class Parameter(Tensor): - r""" - A kind of Tensor that is to be considered a module parameter. - - .. note:: + r"""A kind of Tensor that is to be considered a module parameter. + Note: Operations happened on Parameter usually return a Tensor instead of Parameter. For example, with a Parameter ``x``, ``x.reshape/to/sum/...`` will result into a Tensor. Any operations between Parameter and Tensor will have Tensor as outputs. - """ diff --git a/imperative/python/megengine/tools/compare_binary_iodump.py b/imperative/python/megengine/tools/compare_binary_iodump.py index 9a4ef87a..4898b566 100755 --- a/imperative/python/megengine/tools/compare_binary_iodump.py +++ b/imperative/python/megengine/tools/compare_binary_iodump.py @@ -16,12 +16,14 @@ import numpy as np def load_tensor_binary(fobj): - """ - Load a tensor dumped by the :class:`BinaryOprIODump` plugin; the actual + """Load a tensor dumped by the :class:`BinaryOprIODump` plugin; the actual tensor value dump is implemented by ``mgb::debug::dump_tensor``. - :param fobj: file object, or a string that contains the file name. - :return: tuple ``(tensor_value, tensor_name)``. + Args: + fobj: file object, or a string that contains the file name. + + Returns: + tuple ``(tensor_value, tensor_name)``. 
""" if isinstance(fobj, str): with open(fobj, "rb") as fin: diff --git a/imperative/python/megengine/tools/network_visualize.py b/imperative/python/megengine/tools/network_visualize.py index 1b682e5f..0b2e148e 100755 --- a/imperative/python/megengine/tools/network_visualize.py +++ b/imperative/python/megengine/tools/network_visualize.py @@ -47,20 +47,28 @@ def visualize( logging_to_stdout: bool = True, bar_length_max: int = 20, ): - r""" - Load megengine dumped model and visualize graph structure with tensorboard log files. + r"""Load megengine dumped model and visualize graph structure with tensorboard log files. Can also record and print model's statistics like :func:`~.module_stats` - :param model_path: dir path for megengine dumped model. - :param log_path: dir path for tensorboard graph log. - :param input: user defined input data for running model and calculating stats, alternative with inp_dict, used when the model has only one input. - :param inp_dict: input dict for running model and calculating stats, alternative with input, used when the model has more than one input. When both input and inp_dict are None, a random input will be used. - :param cal_params: whether calculate and record params size. - :param cal_flops: whether calculate and record op flops. - :param cal_activations: whether calculate and record op activations. - :param logging_to_stdout: whether print all calculated statistic details. - :param bar_length_max: size of bar indicating max flops or parameter size in net stats. - + Args: + model_path: dir path for megengine dumped model. + log_path: dir path for tensorboard graph log. + input: user defined input data for running model and calculating stats, alternative with inp_dict, used when the model has only one input. + inp_dict: input dict for running model and calculating stats, alternative with input, used when the model has more than one input. When both input and inp_dict are None, a random input will be used. + cal_params: whether calculate and record params size. + cal_flops: whether calculate and record op flops. + cal_activations: whether calculate and record op activations. + logging_to_stdout: whether print all calculated statistic details. + bar_length_max: size of bar indicating max flops or parameter size in net stats. + model_path: str: + log_path: str: + input: np.ndarray: + inp_dict: dict: + cal_params: bool: + cal_flops: bool: + cal_activations: bool: + logging_to_stdout: bool: + bar_length_max: int: """ if log_path: try: diff --git a/imperative/python/megengine/tools/profile_analyze.py b/imperative/python/megengine/tools/profile_analyze.py index 071e0660..23031fdc 100755 --- a/imperative/python/megengine/tools/profile_analyze.py +++ b/imperative/python/megengine/tools/profile_analyze.py @@ -23,7 +23,7 @@ from megengine.utils.profile_analyzer import ( def _tabulate_ml(tab, **kwargs): - """Tabulate profile output with multi-line support.""" + r"""Tabulate profile output with multi-line support.""" new_tab = [] new_tab_is_row = [] for row in tab: @@ -48,7 +48,7 @@ def _tabulate_ml(tab, **kwargs): def _tabulate_confluence(tab, **kwargs): - """Tabulate profile output.""" + r"""Tabulate profile output.""" kwargs.pop("tablefmt", None) s = tabulate(tab, tablefmt="orgtbl", **kwargs) lines = s.split("\n") @@ -57,9 +57,7 @@ def _tabulate_confluence(tab, **kwargs): def main(passed_args=None): # pylint: disable=too-many-statements - """ - Analyses profile info from :mod:`~.utils.profile_analyzer` . 
- + r"""Analyses profile info from :mod:`~.utils.profile_analyzer` . Run this file with ``--help`` to get more usage. """ parser = argparse.ArgumentParser( diff --git a/imperative/python/megengine/traced_module/expr.py b/imperative/python/megengine/traced_module/expr.py index e42ca241..836f765c 100644 --- a/imperative/python/megengine/traced_module/expr.py +++ b/imperative/python/megengine/traced_module/expr.py @@ -33,9 +33,7 @@ def rstrip(s: str, __chars: str): class Expr: - """ - ``Expr`` represents the operations(i.e. CallMethod, CallFunction, Apply, GetAttr, Input, Constant) on ``Node``. - """ + """``Expr`` represents the operations(i.e. CallMethod, CallFunction, Apply, GetAttr, Input, Constant) on ``Node``.""" __total_id = 0 inputs = None # type: List[Node] diff --git a/imperative/python/megengine/traced_module/fake_quant.py b/imperative/python/megengine/traced_module/fake_quant.py index 120cdc6b..8dd29aa0 100644 --- a/imperative/python/megengine/traced_module/fake_quant.py +++ b/imperative/python/megengine/traced_module/fake_quant.py @@ -35,8 +35,9 @@ class FakeQuantize(_FakeQuantize, QParamsModuleMixin): return self.qparams def set_qparams(self, qparams: QParams): - """ - :param qparams: used to set initial scale. + r""" + Args: + qparams: used to set initial scale. """ if qparams.scale is None: raise AssertionError("Can not get an initialized scale") diff --git a/imperative/python/megengine/traced_module/node.py b/imperative/python/megengine/traced_module/node.py index 6e033058..db1326dd 100644 --- a/imperative/python/megengine/traced_module/node.py +++ b/imperative/python/megengine/traced_module/node.py @@ -17,11 +17,12 @@ from ..tensor import Tensor class Node: - """ - ``Node`` represents the variables (Tensor/Module/other python object) used in Module's forward method. They are inputs/outputs of Expr(the operations on variables). + r"""``Node`` represents the variables (Tensor/Module/other python object) used in Module's forward method. + They are inputs/outputs of Expr(the operations on variables). - param expr: the Expr which produces the node - param name: the name of the node + Args: + expr: the Expr which produces the node + name: the name of the node """ expr = None @@ -90,14 +91,7 @@ class Node: class ModuleNode(Node): - """ - ``ModuleNode`` represents the Module objects. - - Attributes: - module_type: type of the Module correspending to the ModuleNode - graph: the InternalGraph which will be interpreted when call Module's forward method - attr_type_map: record the type of Module's attributes - """ + r"""``ModuleNode`` represents the Module objects.""" module_type = Module # type: Type[Module] _owner = None # type: weakref.ReferenceType @@ -123,9 +117,7 @@ class ModuleNode(Node): class TensorNode(Node): - """ - ``TensorNode`` represents the Tensor objects. - """ + r"""``TensorNode`` represents the Tensor objects.""" _shape = None # type: Tuple[int] _dtype = None # type: numpy.dtype diff --git a/imperative/python/megengine/traced_module/traced_module.py b/imperative/python/megengine/traced_module/traced_module.py index 4bd9553c..b18f66a7 100644 --- a/imperative/python/megengine/traced_module/traced_module.py +++ b/imperative/python/megengine/traced_module/traced_module.py @@ -341,13 +341,12 @@ class _InsertExprs: class InternalGraph: - """ - ``InternalGraph`` is a graph consist of ``Node`` and ``Expr``, it is used to represent the execution procedure of Module's forward method. 
+ r"""``InternalGraph`` is a graph consist of ``Node`` and ``Expr``, it is used to represent the execution procedure of Module's forward method. Attributes: - _exprs: List of Exprs in order of execution - _inputs: Input Nodes of InternalGraph - _outputs: Output Nodes of InternalGraph + _exprs: List of Exprs in order of execution + _inputs: Input Nodes of InternalGraph + _outputs: Output Nodes of InternalGraph """ _exprs = None # type: List[Expr] @@ -733,9 +732,7 @@ class InternalGraph: n.inputs[idx] = repl_node def compile(self): - """ - Delete unused expr. - """ + """Delete unused expr.""" dep_exprs = self.get_dep_exprs(self.outputs) i = 0 while i < len(self._exprs): @@ -1403,9 +1400,7 @@ class ExprFilterExprId(ExprFilter): class TracedModule(Module): - """ - `TracedModule` is the Module created by tracing normal module. It owns an argdef to graph(InternalGraph) map. The forward method of `TracedModule` will get a graph from `argdef_graph_map` according to the argdef of input args/kwargs and interpret it. - """ + r"""`TracedModule` is the Module created by tracing normal module. It owns an argdef to graph(InternalGraph) map. The forward method of `TracedModule` will get a graph from `argdef_graph_map` according to the argdef of input args/kwargs and interpret it.""" # m_node = None # type: ModuleNode argdef_graph_map = None @@ -1526,8 +1521,7 @@ class TracedModule(Module): obj._update_ref(mnode_map, graph) def flatten(self): - """ - Get a new module, which eliminates ``GetAttr`` and has no hierarchy. + r"""Get a new module, which eliminates ``GetAttr`` and has no hierarchy. :return: :class:`TracedModule` """ @@ -1661,18 +1655,16 @@ def cpp_apply_module_trace(opdef, *args): def register_as_builtin(mod_cls: Type[Module]) -> None: - """ - Registers class ``mod_cls`` (subclass of megengine.module.Module) as builtin module. + r"""Registers class ``mod_cls`` (subclass of megengine.module.Module) as builtin module. - param mod_cls: the Module class which will be threated as builtin module in tracing + Args: + mod_cls: the Module class which will be threated as builtin module in tracing """ module_tracer.register_as_builtin(mod_cls) def wrap(func: Callable): - """ - Call this function to register func as a builtin function. - """ + r"""Call this function to register func as a builtin function.""" assert callable(func), "func must be a callable" assert hasattr(func, "__code__") fn_name = func.__code__.co_name @@ -1713,12 +1705,12 @@ def _register_all_builtin_module(): def trace_module(mod: Module, *args: Tensor, **kwargs: Tensor) -> TracedModule: - """ - Traces module ``mod`` and returns corresponding TracedModule. + r"""Traces module ``mod`` and returns corresponding TracedModule. 
- param mod: the module will be converted to TracedModule - param input: the positional arguments passed to forward method of ``mod`` - param kwargs: the keyword arguments passed to forward method of ``mod`` + Args: + mod: the module will be converted to TracedModule + input: the positional arguments passed to forward method of ``mod`` + kwargs: the keyword arguments passed to forward method of ``mod`` """ assert active_module_tracer() is None assert isinstance(mod, Module) diff --git a/imperative/python/megengine/traced_module/utils.py b/imperative/python/megengine/traced_module/utils.py index c7c52e73..197c602e 100644 --- a/imperative/python/megengine/traced_module/utils.py +++ b/imperative/python/megengine/traced_module/utils.py @@ -53,8 +53,7 @@ def replace_container_with_module_container(container): class _ModuleList(Module, MutableSequence): - r""" - A List-like container. + r"""A List-like container. Using a ``ModuleList``, one can visit, add, delete and modify submodules just like an ordinary python list. @@ -140,8 +139,7 @@ class _ModuleList(Module, MutableSequence): class _ModuleDict(Module, MutableMapping): - r""" - A Dict-like container. + r"""A Dict-like container. Using a ``ModuleDict``, one can visit, add, delete and modify submodules just like an ordinary python dict. diff --git a/imperative/python/megengine/utils/comp_graph_tools.py b/imperative/python/megengine/utils/comp_graph_tools.py index 840f1e9e..9b5898c3 100644 --- a/imperative/python/megengine/utils/comp_graph_tools.py +++ b/imperative/python/megengine/utils/comp_graph_tools.py @@ -37,8 +37,7 @@ __all__ = [ def get_dep_vars( var: Union[_VarNode, List[_VarNode]], var_type: Union[str, List[str]] = None ) -> List[_VarNode]: - """ - Returns :class:`.tensor.core.megbrain_graph.VarNode` of type ``var_type`` that input ``var`` + r"""Returns :class:`.tensor.core.megbrain_graph.VarNode` of type ``var_type`` that input ``var`` depands on. If ``var_type`` is None, returns all types. """ outputs = [] @@ -67,30 +66,23 @@ def get_dep_vars( def get_owner_opr_inputs(var: _VarNode) -> List[_VarNode]: - """ - Gets the inputs of owner opr of a variable. - """ + r"""Gets the inputs of owner opr of a variable. """ return var.owner.inputs def get_owner_opr_type(var: _VarNode) -> str: - """ - Gets the type of owner opr of a variable. - - """ + r"""Gets the type of owner opr of a variable.""" return var.owner.type def get_opr_type(opr: _OpNode) -> str: - """ - Gets the type of an opr. - """ + r"""Gets the type of an opr.""" assert isinstance(opr, _OpNode) return opr.type class _OprStableOrderHeapq: - """heap implementation for operator comparison in stable order""" + r"""heap implementation for operator comparison in stable order""" _list = None _extra_priority = None @@ -125,18 +117,22 @@ class _OprStableOrderHeapq: def graph_traversal(outputs: _VarNode): - """ - Helper function to traverse the computing graph and return enough useful information. + r"""Helper function to traverse the computing graph and return enough useful information. + + Args: + outputs: model outputs. + + Returns: + tuple (map_oprs, map_vars, var2oprs, opr2receivers, indegree2opr, opr2indegree) - :param outputs: model outputs. 
- :return: tuple (map_oprs, map_vars, var2oprs, opr2receivers, indegree2opr, opr2indegree) WHERE - map_oprs is dict from opr_id to actual opr - map_vars is dict from var_id to actual var - var2oprs is dict from var to dest oprs along with index - opr2receivers is dict from current opr to next opr - indegree2opr is dict from in_degree to opr in computing graph - opr2indegree is dict from opr in computing graph to in_degree + + * map_oprs is dict from opr_id to actual opr + * map_vars is dict from var_id to actual var + * var2oprs is dict from var to dest oprs along with index + * opr2receivers is dict from current opr to next opr + * indegree2opr is dict from in_degree to opr in computing graph + * opr2indegree is dict from opr in computing graph to in_degree (indegree2opr, opr2indegree) are only used in topological sort in get_oprs_seq function """ @@ -185,13 +181,15 @@ def graph_traversal(outputs: _VarNode): def get_oprs_seq( outputs: List[_VarNode], prune_reshape=False, prune_immtensor=True ) -> List[_OpNode]: - """ - Gets oprs in some topological order for a dumped model. + r"""Gets oprs in some topological order for a dumped model. - :param outputs: model outputs. - :param prune_reshape: whether to prune the useless operators used by Reshape opr during inference. - :param prune_immtensor: whether to prune the ImmutableTensor opr. - :return: opr list with some correct execution order. + Args: + outputs: model outputs. + prune_reshape: whether to prune the useless operators used by Reshape opr during inference. + prune_immtensor: whether to prune the ImmutableTensor opr. + + Returns: + opr list with some correct execution order. """ def topological_sort(map_oprs, opr2receivers, indegree2opr, opr2indegree): @@ -289,14 +287,14 @@ def get_oprs_seq( def replace_vars( dst: List[_VarNode], varmap: Dict[_VarNode, _VarNode] ) -> List[_VarNode]: - """ - Replaces vars in the graph. + r"""Replaces vars in the graph. - :param dst: target vars representing the graph. - :param varmap: the map that specifies how to replace the vars. + Args: + dst: target vars representing the graph. + varmap: the map that specifies how to replace the vars. - :return: new vars that correspond to ``dst`` with all the dependencies - replaced. + Returns: + new vars that correspond to ``dst`` with all the dependencies replaced. """ dst_vec = [] repl_src_vec = [] @@ -315,14 +313,14 @@ def replace_vars( def replace_oprs(dst: List[_VarNode], oprmap: Dict[_OpNode, _OpNode]) -> List[_VarNode]: - """ - Replaces operators in the graph. + """Replaces operators in the graph. - :param dst: target vars representing the graph. - :param oprmap: the map that specifies how to replace the operators. + Args: + dst: target vars representing the graph. + oprmap: the map that specifies how to replace the operators. - :return: new vars that correspond to ``dst`` with all the dependencies - replaced. + Returns: + new vars that correspond to ``dst`` with all the dependencies replaced. """ dst_vec = [] repl_src_vec = [] @@ -341,13 +339,14 @@ def replace_oprs(dst: List[_VarNode], oprmap: Dict[_OpNode, _OpNode]) -> List[_V def find_vars_by_name(dst: List[_VarNode], names: List[str]) -> List[_VarNode]: - """ - Gets VarNode list by names in the graph. + r"""Gets VarNode list by names in the graph. - :param dst: target vars representing the graph. - :param names: name list for target VarNode. + Args: + dst: target vars representing the graph. + names: name list for target VarNode. - :return: results found by names. + Returns: + results found by names. 
""" output_names = names.copy() all_vars = get_dep_vars(dst) + dst @@ -366,16 +365,16 @@ def find_vars_by_name(dst: List[_VarNode], names: List[str]) -> List[_VarNode]: def convert_inputs( dst: List[_VarNode], inputs: List[_VarNode] = None ) -> Tuple[List[_VarNode], Dict[str, _VarNode]]: - """ - Replaces ``Host2DeviceCopy`` with :class:`~.InputNode` in the graph + r"""Replaces ``Host2DeviceCopy`` with :class:`~.InputNode` in the graph to :meth:`~.InputNode.set_value` and run. - :param dst: target vars representing the graph. - :param inputs: indicates which inputs to be replaced. All - inputs(``Host2DeiceCopy``) will be replaced if not specified. + Args: + dst: target vars representing the graph. + inputs: indicates which inputs to be replaced. All + inputs(``Host2DeiceCopy``) will be replaced if not specified. - :return: new vars that correspond to ``dst`` with all inputs - replaced, and new inputs dict. + Returns: + new vars that correspond to ``dst`` with all inputs replaced, and new inputs dict. """ if inputs is None: inputs = get_dep_vars(dst, "Host2DeviceCopy") @@ -395,14 +394,14 @@ def convert_inputs( def convert_outputs(dst: List[_VarNode]) -> Tuple[List[_VarNode], Dict[str, _VarNode]]: - """ - Wraps ``dst`` with :class:`~.OutputNode` in the graph to get outputs + r"""Wraps ``dst`` with :class:`~.OutputNode` in the graph to get outputs with :meth:`~.OutputNode.get_value`. - :param dst: target vars representing the graph. + Args: + dst: target vars representing the graph. - :return: new vars that correspond to ``dst`` with all inputs - replaced, and outputs dict. + Returns: + new vars that correspond to ``dst`` with all inputs replaced, and outputs dict. """ output_dict = OrderedDict([(i.name, G.OutputNode(i)) for i in dst]) new_output_nodes = [i.outputs[0] for i in output_dict.values()] @@ -412,15 +411,16 @@ def convert_outputs(dst: List[_VarNode]) -> Tuple[List[_VarNode], Dict[str, _Var def embed_inputs( dst: List[_VarNode], data: List[np.ndarray], inputs: List[_VarNode] = None ) -> Tuple[List[_VarNode], Dict[str, _VarNode]]: - """ - Embeds ``data`` to the graph's inputs of ``dst``. - - :param dst: target vars representing the graph. - :param data: data to be embeded. - :param inputs: indicates which inputs to be replaced. All - inputs(``Host2DeiceCopy``) will be replaced if not specified. - :return: new vars that correspond to ``dst`` with all inputs - replaced, and new inputs dict. + r"""Embeds ``data`` to the graph's inputs of ``dst``. + + Args: + dst: target vars representing the graph. + data: data to be embeded. + inputs: indicates which inputs to be replaced. All + inputs(``Host2DeiceCopy``) will be replaced if not specified. + + Returns: + new vars that correspond to ``dst`` with all inputs replaced, and new inputs dict. """ if inputs is None: inputs = get_dep_vars(dst, "Host2DeviceCopy") @@ -439,12 +439,12 @@ def embed_inputs( class GraphInference: - """ - Loads a serialized computing graph as a GraphInference object which can be used + r"""Loads a serialized computing graph as a GraphInference object which can be used to execute the computing graph. - :param file: could be file object or filename. - :param outputs: only compile the subgraph with outputs as its endpoints. + Args: + file: could be file object or filename. + outputs: only compile the subgraph with outputs as its endpoints. 
""" def __init__( @@ -472,10 +472,14 @@ class GraphInference: def run( self, *inp_args: np.ndarray, inp_dict: Dict[str, np.ndarray] = None ) -> Dict[str, np.ndarray]: - """ - :param inp_args: list of input datas. - :param inp_dict: dict of named input datas. - :return: a dict {output_name: output_value}. + r""" + + Args: + inp_args: list of input datas. + inp_dict: dict of named input datas. + + Returns: + a dict {output_name: output_value}. """ assert len(inp_args) <= len( self._inp_dict diff --git a/imperative/python/megengine/utils/deprecation.py b/imperative/python/megengine/utils/deprecation.py index c89665ce..42b510c2 100644 --- a/imperative/python/megengine/utils/deprecation.py +++ b/imperative/python/megengine/utils/deprecation.py @@ -12,11 +12,13 @@ from deprecated.sphinx import deprecated def deprecated_func(version, origin, name, tbd): - """ - :param version: version to deprecate this function - :param origin: origin module path - :param name: function name - :param tbd: to be discussed, if true, ignore warnings + r""" + + Args: + version: version to deprecate this function + origin: origin module path + name: function name + tbd: to be discussed, if true, ignore warnings """ should_warning = not tbd diff --git a/imperative/python/megengine/utils/http_download.py b/imperative/python/megengine/utils/http_download.py index 6cfcf97a..6342be48 100644 --- a/imperative/python/megengine/utils/http_download.py +++ b/imperative/python/megengine/utils/http_download.py @@ -23,16 +23,16 @@ HTTP_CONNECTION_TIMEOUT = 5 class HTTPDownloadError(BaseException): - """The class that represents http request error.""" + r"""The class that represents http request error.""" def download_from_url(url: str, dst: str, http_read_timeout=120): - """ - Downloads file from given url to ``dst``. + r"""Downloads file from given url to ``dst``. - :param url: source URL. - :param dst: saving path. - :param http_read_timeout: how many seconds to wait for data before giving up. + Args: + url: source URL. + dst: saving path. + http_read_timeout: how many seconds to wait for data before giving up. """ dst = os.path.expanduser(dst) dst_dir = os.path.dirname(dst) diff --git a/imperative/python/megengine/utils/max_recursion_limit.py b/imperative/python/megengine/utils/max_recursion_limit.py index 9ac86845..4af5f9c1 100644 --- a/imperative/python/megengine/utils/max_recursion_limit.py +++ b/imperative/python/megengine/utils/max_recursion_limit.py @@ -16,9 +16,7 @@ if platform.system() != "Windows": class AlternativeRecursionLimit: - r""" - A reentrant context manager for setting global recursion limits. - """ + r"""A reentrant context manager for setting global recursion limits.""" def __init__(self, new_py_limit): self.new_py_limit = new_py_limit @@ -74,7 +72,5 @@ _max_recursion_limit_context_manager = AlternativeRecursionLimit(2 ** 31 - 1) def max_recursion_limit(): - r""" - Sets recursion limit to the max possible value. - """ + r"""Sets recursion limit to the max possible value.""" return _max_recursion_limit_context_manager diff --git a/imperative/python/megengine/utils/module_stats.py b/imperative/python/megengine/utils/module_stats.py index 43f0f1bc..fe46c0ea 100644 --- a/imperative/python/megengine/utils/module_stats.py +++ b/imperative/python/megengine/utils/module_stats.py @@ -420,18 +420,17 @@ def module_stats( logging_to_stdout: bool = True, bar_length_max: int = 20, ): - r""" - Calculate and print ``model``'s statistics by adding hook and record Module's inputs outputs size. 
- - :param model: model that need to get stats info. - :param inputs: user defined input data for running model and calculating stats, alternative with input_shapes. - :param input_shapes: shapes to generate random inputs for running model and calculating stats, alternative with inputs. - :param cal_params: whether calculate and record params size. - :param cal_flops: whether calculate and record op flops. - :param cal_activations: whether calculate and record op activations. - :param logging_to_stdout: whether print all calculated statistic details. - :param bar_length_max: size of bar indicating max flops or parameter size in net stats. - + r"""Calculate and print ``model``'s statistics by adding hook and record Module's inputs outputs size. + + Args: + model: model that need to get stats info. + inputs: user defined input data for running model and calculating stats, alternative with input_shapes. + input_shapes: shapes to generate random inputs for running model and calculating stats, alternative with inputs. + cal_params: whether calculate and record params size. + cal_flops: whether calculate and record op flops. + cal_activations: whether calculate and record op activations. + logging_to_stdout: whether print all calculated statistic details. + bar_length_max: size of bar indicating max flops or parameter size in net stats. """ has_inputs = False if inputs is not None: diff --git a/imperative/python/megengine/utils/module_utils.py b/imperative/python/megengine/utils/module_utils.py index 2ee8e79b..13513a0c 100644 --- a/imperative/python/megengine/utils/module_utils.py +++ b/imperative/python/megengine/utils/module_utils.py @@ -14,9 +14,12 @@ from ..tensor import Tensor def get_expand_structure(obj: Module, key: str): - """ - Gets Module's attribute compatible with complex key from Module's :meth:`~.named_children`. + r"""Gets Module's attribute compatible with complex key from Module's :meth:`~.named_children`. Supports handling structure containing list or dict. + + Args: + obj: Module: + key: str: """ def f(_, __, cur): @@ -26,8 +29,7 @@ def get_expand_structure(obj: Module, key: str): def set_expand_structure(obj: Module, key: str, value): - """ - Sets Module's attribute compatible with complex key from Module's :meth:`~.named_children`. + r"""Sets Module's attribute compatible with complex key from Module's :meth:`~.named_children`. Supports handling structure containing list or dict. """ @@ -48,10 +50,11 @@ def set_expand_structure(obj: Module, key: str, value): def set_module_mode_safe( module: Module, training: bool = False, ): - """Adjust module to training/eval mode temporarily. + r"""Adjust module to training/eval mode temporarily. - :param module: used module. - :param training: training (bool): training mode. True for train mode, False fro eval mode. + Args: + module: used module. + training: training (bool): training mode. True for train mode, False fro eval mode. """ backup_stats = {} diff --git a/imperative/python/megengine/utils/naming.py b/imperative/python/megengine/utils/naming.py index 4a7eed68..7f1bdbec 100644 --- a/imperative/python/megengine/utils/naming.py +++ b/imperative/python/megengine/utils/naming.py @@ -10,8 +10,7 @@ from ..core._imperative_rt.core2 import pop_scope, push_scope class AutoNaming: - r""" - Name all executed operators automaticlly during tracing and record all tensors + r"""Name all executed operators automaticlly during tracing and record all tensors renamed by the user. 
""" diff --git a/imperative/python/megengine/utils/network.py b/imperative/python/megengine/utils/network.py index 7209ca06..3409331e 100644 --- a/imperative/python/megengine/utils/network.py +++ b/imperative/python/megengine/utils/network.py @@ -48,9 +48,7 @@ class Network: @property def metadata(self): - r""" - Load metadata as a dict. - """ + r"""Load metadata as a dict.""" if not self._metadata.is_valid: logger.info("metadata is not valid!") return None @@ -71,10 +69,11 @@ class Network: @classmethod def load(cls, model_path: str, outspec: List[str] = None): - """ - Loads a computing graph as a Network object. - :param model_path: file path of mge model. - :param outspec: only load the subgraph with outspec as its endpoints. + r"""Loads a computing graph as a Network object. + + Args: + model_path: file path of mge model. + outspec: only load the subgraph with outspec as its endpoints. """ self = cls() ret = G.load_graph(model_path) @@ -116,52 +115,50 @@ class Network: self.all_vars_map[o.var.id] = o def optimize_for_inference(self, dest_vars, **kwargs): - r""" - Applies optimize_for_inference pass for operator graph. - - :param dest_vars: list of output vars in the operator graph - - :Keyword Arguments: - - * enable_io16xc32 -- - whether to use float16 for I/O between oprs and use - float32 as internal computation precision. Note the output var would be - changed to float16. - * enable_ioc16 -- - whether to use float16 for both I/O and computation - precision. - - * enable_hwcd4 -- - whether to use NHWCD4 data layout. This is faster on some - OpenCL backend. - * enable_nchw88 -- - whether to use NCHW88 data layout, currently - used in X86 AVX backend. - * enable_nchw44 -- - whether to use NCHW44 data layout, currently - used in arm backend. - * enable_nchw44_dot -- - whether to use NCHW44_dot data layout, currently - used in armv8.2+dotprod backend. - * enable_nchw4 -- - whether to use NCHW4 data layout, currently - used in nvidia backend(based on cudnn). - * enable_nchw32 -- - whether to use NCHW32 data layout, currently - used in nvidia backend with tensorcore(based on cudnn). - * enable_chwn4 -- - whether to use CHWN4 data layout, currently - used in nvidia backend with tensorcore. - * enable_nchw64 -- - whether to use NCHW64 data layout, used for fast int4 - support on Nvidia GPU. - - * enable_fuse_conv_bias_nonlinearity: whether to fuse conv+bias+nonlinearty - into one opr. - * enable_fuse_conv_bias_with_z: whether to fuse conv_bias with z - input for inference on nvidia backend(this optimization pass will - result in mismatch of the precision of output of training and - inference) + r"""Applies optimize_for_inference pass for operator graph. + + Args: + dest_vars: list of output vars in the operator graph + + Keyword Arguments: + + * enable_io16xc32 -- + whether to use float16 for I/O between oprs and use + float32 as internal computation precision. Note the output var would be + changed to float16. + * enable_ioc16 -- + whether to use float16 for both I/O and computation + precision. + * enable_hwcd4 -- + whether to use NHWCD4 data layout. This is faster on some + OpenCL backend. + * enable_nchw88 -- + whether to use NCHW88 data layout, currently + used in X86 AVX backend. + * enable_nchw44 -- + whether to use NCHW44 data layout, currently + used in arm backend. + * enable_nchw44_dot -- + whether to use NCHW44_dot data layout, currently + used in armv8.2+dotprod backend. + * enable_nchw4 -- + whether to use NCHW4 data layout, currently + used in nvidia backend(based on cudnn). 
+ * enable_nchw32 -- + whether to use NCHW32 data layout, currently + used in nvidia backend with tensorcore(based on cudnn). + * enable_chwn4 -- + whether to use CHWN4 data layout, currently + used in nvidia backend with tensorcore. + * enable_nchw64 -- + whether to use NCHW64 data layout, used for fast int4 + support on Nvidia GPU. + * enable_fuse_conv_bias_nonlinearity: whether to fuse conv+bias+nonlinearty + into one opr. + * enable_fuse_conv_bias_with_z: whether to fuse conv_bias with z + input for inference on nvidia backend(this optimization pass will + result in mismatch of the precision of output of training and + inference) """ if not isinstance(dest_vars, Sequence): @@ -186,35 +183,33 @@ class Network: enable_metadata=True, **kwargs ): - """ - Serializes graph to file. - - :param file: output file, could be file object or filename. - :param append: whether output is appended to ``file``. - Only works when ``file`` is str. - :param keep_var_name: level for keeping variable names: - - * 0: none of the names are kept - * 1: (default)keep names of output vars - * 2: keep names of all (output and internal) vars - :param keep_opr_name: whether to keep operator names. - :param keep_param_name: whether to keep param names, so param values can be - easily manipulated after loading model - :param keep_opr_priority: whether to keep priority setting for operators - :param strip_info_file: a string for path or a file handler. if is not None, - then the dump information for code strip would be written to ``strip_info_file`` - :param append_json: will be check when `strip_info_file` is not None. if set - true, the information for code strip will be append to strip_info_file. - if set false, will rewrite strip_info_file - :param optimize_for_inference: enbale optmizations, - will skip all optimize options if this is False. Default: True - :param user_info: any type object, which will be pickled to bytes. - :param enable_metadata: whether to save metadata into output file. - - :Keyword Arguments: - - See also :py:meth:`optimize_for_inference`. - + r"""Serializes graph to file. + + Args: + file: output file, could be file object or filename. + append: whether output is appended to ``file``. + Only works when ``file`` is str. + keep_var_name: level for keeping variable names: + + * 0: none of the names are kept + * 1: (default)keep names of output vars + * 2: keep names of all (output and internal) vars + + keep_opr_name: whether to keep operator names. + keep_param_name: whether to keep param names, so param values can be + easily manipulated after loading model + keep_opr_priority: whether to keep priority setting for operators + strip_info_file: a string for path or a file handler. if is not None, + then the dump information for code strip would be written to ``strip_info_file`` + append_json: will be check when `strip_info_file` is not None. if set + true, the information for code strip will be append to strip_info_file. + if set false, will rewrite strip_info_file + optimize_for_inference: enbale optmizations, + will skip all optimize options if this is False. Default: True + user_info: any type object, which will be pickled to bytes. + enable_metadata: whether to save metadata into output file. + + See more detials in :meth:`~.trace.dump`. """ def _set_var_name(var): @@ -262,22 +257,19 @@ class Network: file.write(dump_content) def make_const(self, data, name=None, device=None): - """Makes an ImmutableTensor OpNode to provide a parameter for the network. 
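A sketch of editing and re-serializing a dumped graph with the ``Network`` methods above; the file names and the constant value are placeholders:

.. code-block::

    import numpy as np
    from megengine.utils.network import Network

    # "model.mge" is a placeholder for a model dumped via trace(...).dump(...)
    net = Network.load("model.mge")

    # bake an extra constant into the graph and expose it as an output
    extra = net.make_const(np.ones((1,), dtype="float32"), name="extra_const")
    net.add_output(extra)

    # serialize the edited graph; optimize_for_inference defaults to True
    net.dump("model_edited.mge", keep_var_name=2)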
- """ + r"""Makes an ImmutableTensor OpNode to provide a parameter for the network.""" node = ImmutableTensor(data, name, device, self.graph) node.compile(self.graph) return node.outputs[0] def make_input_node(self, shape, dtype, name=None, device=None): - """Makes a Host2DeviceCopy OpNode to provide an input varnode for the network. - """ + r"""Makes a Host2DeviceCopy OpNode to provide an input varnode for the network.""" node = Host2DeviceCopy(shape, dtype, name, device) node.compile(self.graph) return node.outputs[0] def add_output(self, *vars: VarNode): - """Adds vars into the network output node list - """ + r"""Adds vars into the network output node list""" if not all([var.owner for var in vars]): self.add_dep_oprs(*vars) for var in vars: @@ -287,8 +279,7 @@ class Network: self.output_vars.append(var) def remove_output(self, *vars: VarNode): - """Removes vars from the network output node list. - """ + r"""Removes vars from the network output node list""" for var in vars: # use list pop instead of remove to avoid # compare VarNode use elemwise equal @@ -321,12 +312,12 @@ class Network: return list(vars) def modify_opr_names(self, modifier): - """Modifies names of operators **inplace**; useful for merging loaded + r"""Modifies names of operators **inplace**; useful for merging loaded network into another network - :param modifier: a string to be prepended to the name, or a function - that maps from name to name - :type modifier: str or callable + Args: + modifier(str or callable): a string to be prepended to the name, or a function + that maps from name to name """ if isinstance(modifier, str): om = modifier @@ -339,11 +330,12 @@ class Network: i.name = v1 def reset_batch_size(self, batchsize, *, blacklist=()): - """Helper for reset batch size; first dimension of all data providers + r"""Helper for reset batch size; first dimension of all data providers not in blacklist are assumed to be the batch size - :param blacklist: data provider names whose first dimension is not - batchbatch size + Args: + blacklist: data provider names whose first dimension is not + batchbatch size """ blacklist = set(blacklist) prev_batchsize = None @@ -366,9 +358,10 @@ class Network: assert not blacklist, "unused items in blacklist: {}".format(blacklist) def replace_vars(self, repl_dict: Dict[VarNode, VarNode]): - """ - Replaces vars in the graph. - :param repl_dict: the map {old_var: new_var} that specifies how to replace the vars. + r"""Replaces vars in the graph. + + Args: + repl_dict: the map {old_var: new_var} that specifies how to replace the vars. """ if not all([var.owner for var in repl_dict.values()]): self.add_dep_oprs(*list(repl_dict.values())) @@ -386,9 +379,10 @@ class Network: self._compile() def replace_oprs(self, repl_dict: Dict[OpNode, OpNode]): - """ - Replaces operators in the graph. - :param oprmap: the map {old_opr: new_opr} that specifies how to replace the operators. + r"""Replaces operators in the graph. + + Args: + repl_dict: the map {old_opr: new_opr} that specifies how to replace the operators. """ for opr in self.all_oprs: if opr in repl_dict: @@ -427,45 +421,38 @@ class Network: return rst def get_var_receive_oprs(self, var): - """ Gets all oprs which use var as input - """ + r"""Gets all oprs which use var as input""" return self.opr_filter.has_input(var).as_list() def get_dep_oprs(self, var): - """Gets dependent oprs of var - """ + r"""Gets dependent oprs of var""" return get_oprs_seq(var, False, False) @property def opr_filter(self): - """Filter on all opnodes of the Network. 
- """ + r"""Filter on all opnodes of the Network.""" oprs = self.all_oprs return NodeFilter(itertools.islice(oprs, len(oprs))) @property def var_filter(self): - """Filter on all varnode of the Network. - """ + r"""Filter on all varnode of the Network.""" vars = self.all_vars return NodeFilter(itertools.islice(vars, len(vars))) @property def params_filter(self): # all immutable tensor - """Filter on all parameters (ImmutableTensor Opr) of the Network - """ + r"""Filter on all parameters (ImmutableTensor Opr) of the Network""" return self.opr_filter.param_provider() @property def data_providers_filter(self): # all host2devicecopy - """Filter on all input nodes (Host2DeviceCopy Opr) of the Network - """ + r"""Filter on all input nodes (Host2DeviceCopy Opr) of the Network""" return self.opr_filter.data_provider() @property def dest_vars(self): - """Output varnodes of the Network. - """ + r"""Output varnodes of the Network.""" return self.output_vars @property @@ -485,9 +472,7 @@ class Network: return self.opr_filter.as_dict() def _add_opr(self, opr) -> Optional[OpNode]: - """ - Used for loading and building graph. - """ + r"""Used for loading and building graph.""" assert isinstance(opr, _imperative_rt.graph.OperatorNode) # TODO: use megbrain C++ RTTI to replace type string @@ -520,9 +505,7 @@ class Network: return None def _get_var(self, x): - """ - Convert :class:`~._imperative_rt.graph.VarNode` to :class:`~.VarNode`. - """ + r"""Convert :class:`~._imperative_rt.graph.VarNode` to :class:`~.VarNode`.""" assert isinstance(x, _imperative_rt.graph.VarNode) if x.id not in self.all_vars_map or self.all_vars_map[x.id].var != x: self.all_vars_map[x.id] = VarNode.load(x, self._get_opr(x.owner)) @@ -530,24 +513,25 @@ class Network: def set_symbolic_shape(option: bool): - """ - Set the VarNode use symbolic shape or not, return the last status. + r"""Set the VarNode use symbolic shape or not, return the last status. Please set to True and must recover after dump if want to change the input batch size. - :param option: True for enable symbolic shape. + + Args: + option: True for enable symbolic shape. """ return _set_symbolic_shape(option) def as_varnode(obj): - """convert a :class:`.VarNode` compatible object to :class:`.VarNode`. + r"""convert a :class:`.VarNode` compatible object to :class:`.VarNode`. - :param obj: it must be one of the following: + Args: + obj: it must be one of the following: - 1. a :class:`.VarNode` object - 2. a :class:`.OpNode` object that has unique output - 3. an iterable that produces either type 1 or 2, with length 1 + 1. a :class:`.VarNode` object + 2. a :class:`.OpNode` object that has unique output + 3. an iterable that produces either type 1 or 2, with length 1 - :rtype: :class:`.VarNode` """ if type(obj) is VarNode: return obj @@ -575,8 +559,9 @@ def as_varnode(obj): def as_oprnode(obj): - """convert a :class:`.OpNode` compatible object to - :class:`.OpNode`; it works like :func:`as_varnode`.""" + r"""convert a :class:`.OpNode` compatible object to + :class:`.OpNode`; it works like :func:`as_varnode`.i + """ if type(obj) is VarNode: return obj.owner @@ -595,23 +580,24 @@ def as_oprnode(obj): class NodeFilter: - """Filter on node iterator. This class is an iterator of + r"""Filter on node iterator. This class is an iterator of :class:`.NetworkNode` objects and multiple filtering conditions and mappers can be chained. - Example:: + Example: - # find all :class:`.ImmutableTensor` nodes - for i in NodeFilter(node_iter).param_provider(): - print(i) + .. 
code-block:: - # find all :class:`.ImmutableTensor` nodes that end with ':W' - for i in NodeFilter(node_iter).param_provider().name('*:W'): - print(i) + # find all :class:`.ImmutableTensor` nodes + for i in NodeFilter(node_iter).param_provider(): + print(i) - # number of inputs - nr_input = NodeFilter(node_iter).data_provider().as_count() + # find all :class:`.ImmutableTensor` nodes that end with ':W' + for i in NodeFilter(node_iter).param_provider().name('*:W'): + print(i) + # number of inputs + nr_input = NodeFilter(node_iter).data_provider().as_count() """ _iter = None @@ -638,106 +624,117 @@ class NodeFilter: @classmethod def make_all_deps(cls, *dest_vars): - """make a :class:`NodeFilter` that contains all deps of given vars""" + r"""make a :class:`NodeFilter` that contains all deps of given vars""" return cls(list(get_oprs_seq(dest_vars, False, False))) def __iter__(self): - """to be overwritten by subclass to implement filters""" + r"""to be overwritten by subclass to implement filters""" return iter(self._iter) def type(self, node_type): - """filter by specific node type + r"""filter by specific node type - :param node_type: node type class - :return: a new :class:`NodeFilter` object + Args: + node_type: node type class + + Returns: + a new :class:`NodeFilter` object """ return NodeFilterType(self, node_type) def check_type(self, node_type): - """assert that all oprs produced by this iterator are instances of + r"""assert that all oprs produced by this iterator are instances of certain type - :param node_type: node type class - :return: a new :class:`NodeFilter` object - :raises TypeError: if type check failed + Args: + node_type: node type class + + Returns: + a new :class:`NodeFilter` object + + Raises: + TypeError if type check failed """ return NodeFilterCheckType(self, node_type) def not_type(self, node_type): - """remove oprs of specific type + r"""remove oprs of specific type - :param node_type: node type class - :return: a new :class:`NodeFilter` object + Args: + node_type: node type class + + Returns: + a new :class:`NodeFilter` object """ return NodeFilterNotType(self, node_type) def param_provider(self): - """get :class:`.ParamProvider` oprs; shorthand for - ``.type(ParamProvider)``""" + r"""get :class:`~.ImmutableTensor` oprs; shorthand for + ``.type(ImmutableTensor)`` + """ return self.type(ImmutableTensor) def data_provider(self): - """get :class:`.DataProvider` oprs; shorthand for - ``.type(DataProvider)``""" + r"""get :class:`.Host2DeviceCopy` oprs; shorthand for + ``.type(Host2DeviceCopy)`` + """ return self.type(Host2DeviceCopy) def name(self, pattern, ignorecase=True): - """filter by node name - - :param pattern: a string in glob syntax that can contain ``?`` and - ``*`` to match a single or arbitrary characters. - :type pattern: :class:`str` - :param ignorecase: whether to ignroe case - :type ignorecase: bool - :return: a new :class:`NodeFilter` object + r"""filter by node name + + Args: + pattern(str): a string in glob syntax that can contain ``?`` and + ``*`` to match a single or arbitrary characters.
+ ignorecase(bool, optional): whether to ignore case + + Returns: + a new :class:`NodeFilter` object """ return NodeFilterName(self, pattern, ignorecase) def has_input(self, var): - """an opr is kept if it has given var as one of its inputs + r"""an opr is kept if it has the given var as one of its inputs - :param var: var node to checked - :return: a new :class:`NodeFilter` object + Args: + var: var node to be checked + + Returns: + a new :class:`NodeFilter` object """ return NodeFilterHasInput(self, var) def as_list(self): - """consume this iterator and return its content as a list - - :rtype: [:class:`.GraphNodeBase`] - """ + r"""consume this iterator and return its content as a list""" return list(self) def as_unique(self): - """assert that this iterator yields only one node and return it + r"""assert that this iterator yields only one node and return it + + Returns: + :class:`.GraphNodeBase`: the unique node - :return: the unique node - :rtype: :class:`.GraphNodeBase` - :raises ValueError: if this iterator does not yield a unique node + Raises: + ValueError if this iterator does not yield a unique node """ (opr,) = self return opr def as_dict(self): - """construct an ordered dict to map from node names to objects in + r"""construct an ordered dict to map from node names to objects in this iterator - - :rtype: :class:`OrderedDict` """ return collections.OrderedDict((i.name, i) for i in self) def as_count(self): - """consume this iterator and get the number of elements - - :rtype: int - """ + r"""consume this iterator and get the number of elements""" return sum(1 for _ in self) class NodeFilterType(NodeFilter): - """see :meth:`NodeFilter.type`""" + r"""see :meth:`NodeFilter.type`""" _node_type = None @@ -753,7 +750,7 @@ class NodeFilterType(NodeFilter): class NodeFilterNotType(NodeFilterType): - """see :meth:`NodeFilter.not_type`""" + r"""see :meth:`NodeFilter.not_type`""" def __iter__(self): for i in self._iter: @@ -762,7 +759,7 @@ class NodeFilterNotType(NodeFilterType): class NodeFilterCheckType(NodeFilterType): - """see :meth:`NodeFilter.check_type`""" + r"""see :meth:`NodeFilter.check_type`""" def __iter__(self): for i in self._iter: @@ -774,7 +771,7 @@ class NodeFilterCheckType(NodeFilterType): class NodeFilterHasInput(NodeFilter): - """see :meth:`NodeFilter.has_input`""" + r"""see :meth:`NodeFilter.has_input`""" _var = None @@ -793,7 +790,7 @@ class NodeFilterHasInput(NodeFilter): class NodeFilterName(NodeFilter): - """see :meth:`NodeFilter.name`""" + r"""see :meth:`NodeFilter.name`""" _re = None diff --git a/imperative/python/megengine/utils/network_node.py b/imperative/python/megengine/utils/network_node.py index a77d38ea..768792bb 100644 --- a/imperative/python/megengine/utils/network_node.py +++ b/imperative/python/megengine/utils/network_node.py @@ -58,13 +58,11 @@ class VarNode(NetworkNode, SymbolVar, ArrayMethodMixin, metaclass=VarNodeMeta): @property def partial_shape(self): - """Return the tuple type inferred shape of VarNode - """ + r"""Return the tuple type inferred shape of VarNode""" return tuple(self._get_var_shape().numpy()) def shapeof(self, axis): - """Return the symbolic shape of axis - """ + r"""Return the symbolic shape of axis""" return self._get_var_shape(axis=axis) if self.var else None @property @@ -73,8 +71,8 @@ class VarNode(NetworkNode, SymbolVar, ArrayMethodMixin, metaclass=VarNodeMeta): @property def shape(self): - """Return the symbolic shape if using set_symbolic_shape(True) - else inferred shape + r"""Return the symbolic shape if using
set_symbolic_shape(True) + else inferred shape """ rst = None if self.var: diff --git a/imperative/python/megengine/utils/profile_analyzer.py b/imperative/python/megengine/utils/profile_analyzer.py index 59945ca4..0fa3372c 100644 --- a/imperative/python/megengine/utils/profile_analyzer.py +++ b/imperative/python/megengine/utils/profile_analyzer.py @@ -15,8 +15,7 @@ import numpy as np class NonExistNum: - """ - An object that behaves like a number but means a field does not exist; It is + r"""An object that behaves like a number but means a field does not exist; it is always greater than any real number. """ @@ -59,27 +58,27 @@ class NonExistNum: class OprProfRst: - """Opr profiling result dumped from megengine profiler.""" + r"""Opr profiling result dumped from megengine profiler. + + Args: + entry: profiling json exec_graph items, used to set up the + name, type and id of opr_info. + """ opr_info = None - """A dict containing operator info: name, id and type.""" + r"""A dict containing operator info: name, id and type.""" time_dict = None - """ + r""" A mapping from ``"host"`` or ``"device"`` to list of profiling results.""" footprint = None - """ + r""" A mapping from ``"memory"`` or ``"computation"`` to the actual number of corresponding operations.""" def __init__(self, entry: dict): - """ - Opr profiling initialization, which sets up name, type and id of opr_info. - - :param entry: profiling json exec_graph items. - """ assert isinstance(entry, dict) self.opr_info = collections.OrderedDict() for key in ["name", "type", "id"]: @@ -88,38 +87,47 @@ class OprProfRst: self.footprint = collections.defaultdict(NonExistNum) def update_device_prof_info(self, dev_time: dict): - """ - Updates device profiling info. + r"""Updates device profiling info. - :param dev_time: device time for single opr, - is an attribute of profiling result. + Args: + dev_time: device time for single opr, + is an attribute of profiling result. """ assert isinstance(dev_time, dict) self.time_dict["device"].append(copy.deepcopy(dev_time)) def update_host_prof_info(self, host_time: dict): - """ - Updates host profiling info. + r"""Updates host profiling info. - :param host_time: host time for single opr, - is an attribute of profiling result. + Args: + host_time: host time for single opr, + is an attribute of profiling result. """ assert isinstance(host_time, dict) self.time_dict["host"].append(copy.deepcopy(host_time)) def update_footprint(self, footprint: dict): - """ - Updates opr footprint. + r"""Updates opr footprint. - :param footprint: footprint for single opr, - is an attribute of profiling result. + Args: + footprint: footprint for single opr, + is an attribute of profiling result. + """ assert isinstance(footprint, dict) self.footprint.update(footprint) class Record: - """A record of analyzing result""" + r"""A record of analyzing result + + Args: + time: opr running time, evaluated by applying a user-provided + function to OprProfRst. + info: opr information, could be original opr information or + aggregate information if aggregating is enabled. + footprint: contains footprint information, for now, we have + ``"computation"``, ``"memory"``, ``"in_shapes"``, ``"out_shapes"``. + """ __slot__ = [ "time", @@ -135,17 +143,6 @@ ] def __init__(self, time: float, info: dict, footprint: dict): - """ - Initializes single record. - - :param time: opr running time, evaluated by applying users providing - function to OprProfRst.
- :param info: opr information, could be original opr information or - aggregate infomation if aggregating enabled. - :param footprint: contains footprint information, for now, we have - ``"computation"``, ``"memory"``, ``"in_shapes"``, ``"out_shapes"``. - """ - assert isinstance(footprint, dict) self.time = time self.info = collections.OrderedDict(copy.deepcopy(info)) @@ -161,10 +158,10 @@ self.opr_id = int(self.opr_id) def get_column_by_name(self, name: str = None): - """ - Extracts column value by its column name. + r"""Extracts column value by its column name. - :param name: column name, None for time. + Args: + name: column name, None for time. """ if name is None: @@ -173,13 +170,14 @@ class ProfileAnalyzer: - def __init__(self, obj: dict, opr_filter: Callable = lambda opr, inp, out: True): - """ - Initializes ProfileAnalyzer. + r"""Initializes ProfileAnalyzer. - :param obj: dict dumped from json str. - :param opr_filter: function that filter oprs. - """ + Args: + obj: dict dumped from json str. + opr_filter: function that filters oprs. + """ + + def __init__(self, obj: dict, opr_filter: Callable = lambda opr, inp, out: True): self._opr_set = dict() # type: dict assert isinstance(obj, dict), type(obj) varz = obj["graph_exec"]["var"] @@ -212,14 +210,14 @@ class ProfileAnalyzer: def _aggregate( self, records: List[Record], aop: Union[str, Callable], atype: Optional[str] ) -> List[Record]: - """ - Aggregate operation. - - :param records: selected records. - :param aop: aggregate operation, if aop is str, we would replace it - with associated numpy function wth aop name". - :param atype: the type aggregated by, None for aggregating all into single - record. + r"""Aggregate operation. + + Args: + records: selected records. + aop: aggregate operation; if aop is a str, it will be replaced + with the associated numpy function of the same name. + atype: the type aggregated by, None for aggregating all into a single + record. """ if aop is None: assert atype is None, "must specify aggregate op" @@ -258,11 +256,11 @@ class ProfileAnalyzer: return rst def _sort(self, records: List[Record], sort_by: str) -> List[Record]: - """ - Sort operation. + r"""Sort operation. - :param records: the records after aggregate operation. - :param sort_by: keyword for sorting the list. + Args: + records: the records after aggregate operation. + sort_by: keyword for sorting the list. """ if sort_by is None: return records @@ -283,18 +281,20 @@ sort_by: str = None, top_k: int = 0, ) -> List[Record]: - """ - Select operation. - - :param time_func: time_func provided by user, would apply to every - OprProfRst. - :param opr_filter: filter satisfied operatiors. - :param aggregate: function that apply to list of records which are - aggregated by atype. - :param aggregate_by: the type aggregated by. - :param sort_by: keyword for sorting all records. - :param top_k: specify the maximum number of records. - :return: the records that go through select, aggregate, sort. + r"""Select operation. + + Args: + time_func: time function provided by the user, applied to every + OprProfRst. + opr_filter: filter satisfied operators. + aggregate: function that applies to the list of records which are + aggregated by atype. + aggregate_by: the type aggregated by. + sort_by: keyword for sorting all records. + top_k: specify the maximum number of records. + + Returns: + the records that go through select, aggregate, sort.
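+ + Example: a minimal sketch; ``prof_json`` is a hypothetical profiling dump, and the positional argument order follows the Args above: + + .. code-block:: + + analyzer = ProfileAnalyzer(json.loads(prof_json)) + top5 = analyzer.select( + TimeFuncHelper.eval_time_func("device", "end", np.max), + lambda opr: True, + top_k=5, + )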
""" records = [] @@ -313,23 +313,20 @@ class ProfileAnalyzer: class TimeFuncHelper: - """Time Function Helper for users.""" + r"""Time Function Helper for users.""" @staticmethod def _eval_time(prof_type, end_key, func, opr_prof): - """ - Eval time. - - :type prof_type: str - :param prof_type: 'host' or 'device'. - :type end_key: str - :param end_key: 'kern' or 'end'. - :type func: function - :param func: apply to list of all ``thread`` of ``gpu`` time. - :type opr_prof: `class OprProfRst` - :param opr_prof: operator profiling result. - :rtype: float - :return: time. + r"""Eval time. + + Args: + prof_type: host' or 'device'. + end_key: kern' or 'end'. + func: apply to list of all ``thread`` of ``gpu`` time. + opr_prof: operator profiling result. + + Returns: + time. """ if prof_type not in opr_prof.time_dict: @@ -339,13 +336,15 @@ class TimeFuncHelper: @staticmethod def eval_time_func(prof_type: str, end_key: str, func: Callable) -> float: - """ - Eval oprerator profile time. + r"""Eval oprerator profile time. - :param prof_type: 'host' or 'device'. - :param end_key: 'kern' or 'end'. - :param func: apply to list of all ``thread`` of ``gpu`` time. - :return: eval time results. + Args: + prof_type: host' or 'device'. + end_key: kern' or 'end'. + func: apply to list of all ``thread`` of ``gpu`` time. + + Returns: + eval time results. """ return functools.partial(TimeFuncHelper._eval_time, prof_type, end_key, func) @@ -353,19 +352,16 @@ class TimeFuncHelper: def _min_start( prof_type, end_key, func, opr_prof ): # pylint: disable=unused-argument - """ - Eval minimum start time. - - :type prof_type: str - :param prof_type: 'host' or 'device'. - :type end_key: str - :param end_key: 'kern' or 'end'. - :type func: function - :param func: apply to list of all ``thread`` of ``gpu`` time. - :type opr_prof: `class OprProfRst` - :param opr_prof: operator profiling result. - :rtype: float - :return: time. + r"""Eval minimum start time. + + Args: + prof_type(str): 'host' or 'device'. + end_key(str): 'kern' or 'end'. + func(function): apply to list of all ``thread`` of ``gpu`` time. + opr_prof(OprProfRst): operator profiling result. + + Returns: + time. """ if prof_type not in opr_prof.time_dict: return None @@ -376,31 +372,30 @@ class TimeFuncHelper: def min_start_func( prof_type: str, end_key: str, func: Callable ) -> float: # pylint: disable=unused-argument - """ - Eval oprerator profile min start time. + r"""Eval oprerator profile min start time. + + Args: + prof_type(str): 'host' or 'device'. + end_key(str): 'kern' or 'end'. + func(function): apply to list of all ``thread`` of ``gpu`` time. - :param prof_type: 'host' or 'device'. - :param end_key: 'kern' or 'end'. - :param func: apply to list of all ``thread`` of ``gpu`` time. - :return: eval time results. + Returns: + eval time results. """ return functools.partial(TimeFuncHelper._min_start, prof_type, end_key, func) @staticmethod def _max_end(prof_type, end_key, func, opr_prof): # pylint: disable=unused-argument - """ - Eval maximum end time - - :type prof_type: str - :param prof_type: 'host' or 'device'. - :type end_key: str - :param end_key: 'kern' or 'end'. - :type func: function - :param func: apply to list of all ``thread`` of ``gpu`` time. - :type opr_prof: `class OprProfRst` - :param opr_prof: operator profiling result. - :rtype: float - :return: time. + r"""Eval maximum end time + + Args: + prof_type(str): 'host' or 'device'. + end_key(str): 'kern' or 'end'. + func(function): apply to list of all ``thread`` of ``gpu`` time. 
+ opr_prof(OprProfRst): operator profiling result. + + Returns: + time. """ if prof_type not in opr_prof.time_dict: return None @@ -409,12 +404,14 @@ class TimeFuncHelper: @staticmethod def max_end_func(prof_type: str, end_key: str, func: Callable) -> float: - """ - Eval oprerator profile max end time. + r"""Eval operator profile max end time. + + Args: + prof_type(str): 'host' or 'device'. + end_key(str): 'kern' or 'end'. + func(function): apply to list of all ``thread`` of ``gpu`` time. - :param prof_type: 'host' or 'device'. - :param end_key: 'kern' or 'end'. - :param func: apply to list of all ``thread`` of ``gpu`` time. - :return: eval time results. + Returns: + eval time results. """ return functools.partial(TimeFuncHelper._max_end, prof_type, end_key, func) diff --git a/imperative/python/megengine/utils/profiler.py b/imperative/python/megengine/utils/profiler.py index 77a5e240..0af5f174 100644 --- a/imperative/python/megengine/utils/profiler.py +++ b/imperative/python/megengine/utils/profiler.py @@ -29,28 +29,29 @@ _living_profilers = WeakSet() class Profiler(ContextDecorator): - r""" - Profile graph execution in imperative mode. - - :type path: Optional[str] - :param path: default path prefix for profiler to dump. + r"""Profile graph execution in imperative mode. + Args: + path: default path prefix for profiler to dump. + Examples: + + .. code-block:: - .. code-block:: + import megengine as mge + import megengine.module as M + from megengine.utils.profiler import Profiler - import megengine as mge - import megengine.module as M - from megengine.utils.profiler import Profiler + # With Learnable Parameters + profiler = Profiler() - # With Learnable Parameters - profiler = Profiler() - for iter in range(0, 10): - # Only profile record of last iter would be saved - with profiler: - # your code here + for iter in range(0, 10): + # Only profile record of last iter would be saved - # Then open the profile file in chrome timeline window + with profiler: + # your code here + + # Then open the profile file in chrome timeline window """ CHROME_TIMELINE = "chrome_timeline.json" diff --git a/imperative/python/megengine/utils/tensor_sanity_check.py b/imperative/python/megengine/utils/tensor_sanity_check.py index 8fae8ac7..f06ffe1e 100644 --- a/imperative/python/megengine/utils/tensor_sanity_check.py +++ b/imperative/python/megengine/utils/tensor_sanity_check.py @@ -3,20 +3,18 @@ from ..core._imperative_rt.core2 import sync class TensorSanityCheck: - r""" - An object that checks whether the input tensors of each operator have changed before and after the operation. + r"""An object that checks whether the input tensors of each operator have changed before and after the operation. Examples: - - .. code-block:: python - - from megengine import tensor - from megengine.utils.tensor_sanity_check import TensorSanityCheck - with TensorSanityCheck() as checker: - a = tensor([1, 2]) - b = tensor([3, 4]) - c = a + b - + + ..
code-block:: python + + from megengine import tensor + from megengine.utils.tensor_sanity_check import TensorSanityCheck + with TensorSanityCheck() as checker: + a = tensor([1, 2]) + b = tensor([3, 4]) + c = a + b """ def __init__(self): diff --git a/imperative/python/megengine/utils/tuple_function.py b/imperative/python/megengine/utils/tuple_function.py index aa11538f..f8b84fdd 100644 --- a/imperative/python/megengine/utils/tuple_function.py +++ b/imperative/python/megengine/utils/tuple_function.py @@ -11,11 +11,11 @@ import functools def get_ndtuple(value, *, n, allow_zero=True): - r""" - Converts possibly 1D tuple to nd tuple. + r"""Converts possibly 1D tuple to nd tuple. - :type allow_zero: bool - :param allow_zero: whether to allow zero tuple value.""" + Args: + allow_zero: whether to allow zero tuple value + """ if not isinstance(value, collections.abc.Iterable): value = int(value) value = tuple([value for i in range(n)])
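For reference, a minimal usage sketch of ``get_ndtuple`` as documented above, based only on the non-iterable branch visible in this hunk (values chosen for illustration):

    from megengine.utils.tuple_function import get_ndtuple

    get_ndtuple(3, n=2)   # a scalar is broadcast to an n-tuple: (3, 3)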