diff --git a/imperative/python/megengine/module/batchnorm.py b/imperative/python/megengine/module/batchnorm.py
index 1bc3fd95..b50b94ea 100644
--- a/imperative/python/megengine/module/batchnorm.py
+++ b/imperative/python/megengine/module/batchnorm.py
@@ -252,14 +252,6 @@ class BatchNorm2d(_BatchNorm):
     keep running estimates, batch statistics is used during
     evaluation time instead.
 
-    .. note::
-        This :attr:`momentum` argument is different from one used in optimizer
-        classes and the conventional notion of momentum. Mathematically, the
-        update rule for running statistics here is
-        :math:`\hat{x}_\text{new} = \text{momentum} \times \hat{x} + (1 - \text{momentum}) \times x_t`,
-        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
-        new observed value.
-
     Because the Batch Normalization is done over the `C` dimension, computing
     statistics on `(N, H, W)` slices, it's common terminology to call this
     Spatial Batch Normalization.