diff --git a/fastNLP/core/dataset/dataset.py b/fastNLP/core/dataset/dataset.py
index 025d33e5..cd3cae59 100644
--- a/fastNLP/core/dataset/dataset.py
+++ b/fastNLP/core/dataset/dataset.py
@@ -156,6 +156,7 @@ import _pickle as pickle
 from copy import deepcopy
 from typing import Optional, List, Callable, Union, Dict, Any, Mapping
 from types import LambdaType
+from subprocess import DEVNULL
 import sys
 import time
 
@@ -231,7 +232,7 @@ def _multi_proc(ds, _apply_field, func, counter, queue):
     """
     idx = -1
     import contextlib
-    with contextlib.redirect_stdout(None): # 避免打印触发 rich 的锁
+    with contextlib.redirect_stdout(DEVNULL): # 避免打印触发 rich 的锁
         logger.set_stdout(stdout='raw')
         results = []
         try:
diff --git a/fastNLP/modules/mix_modules/utils.py b/fastNLP/modules/mix_modules/utils.py
index 142644f9..e709b0ac 100644
--- a/fastNLP/modules/mix_modules/utils.py
+++ b/fastNLP/modules/mix_modules/utils.py
@@ -86,12 +86,12 @@ def _torch2paddle(torch_tensor: 'torch.Tensor', device: str = None, no_gradient:
     if not no_gradient:
         # 保持梯度并保持反向传播
         # paddle的stop_gradient和torch的requires_grad表现是相反的
-        paddle_tensor = paddle.to_tensor(torch_tensor.detach().numpy(), stop_gradient=False)
+        paddle_tensor = paddle.to_tensor(torch_tensor.detach().cpu().numpy(), stop_gradient=False)
         hook = paddle_tensor.register_hook(
             lambda grad: torch.autograd.backward(torch_tensor, torch.tensor(grad.numpy()))
         )
     else:
-        paddle_tensor = paddle.to_tensor(torch_tensor.detach().numpy(), stop_gradient=True)
+        paddle_tensor = paddle.to_tensor(torch_tensor.detach().cpu().numpy(), stop_gradient=True)
 
     paddle_tensor = paddle_to(paddle_tensor, device)
 
diff --git a/fastNLP/transformers/torch/tokenization_utils_base.py b/fastNLP/transformers/torch/tokenization_utils_base.py
index 8ed5a2e2..3a033c96 100644
--- a/fastNLP/transformers/torch/tokenization_utils_base.py
+++ b/fastNLP/transformers/torch/tokenization_utils_base.py
@@ -2179,7 +2179,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
         if padding is True:
             if verbose:
                 if max_length is not None and (truncation is False or truncation == "do_not_truncate"):
-                    logger.warn(
+                    logger.warning_once(
                         "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
                         "To pad to max length, use `padding='max_length'`."
                     )
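
Note on the mix_modules/utils.py hunks (illustration only, not part of the patch): torch does not allow .numpy() on a CUDA tensor, so _torch2paddle previously failed whenever the input lived on the GPU; adding .cpu() copies the data to host memory before handing it to paddle. A minimal sketch of the before/after behaviour, assuming torch and paddle are installed and a CUDA device is available (names below are illustrative):

    # Sketch of the failure the .cpu() change avoids.
    import torch
    import paddle

    t = torch.ones(2, 3, requires_grad=True, device="cuda")

    try:
        paddle.to_tensor(t.detach().numpy())        # old code path
    except TypeError as e:
        # "can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() ..."
        print("GPU tensor cannot be converted directly:", e)

    p = paddle.to_tensor(t.detach().cpu().numpy())  # patched code path
    print(p.shape)                                  # [2, 3]

Note on the tokenization_utils_base.py hunk (also illustration only): logger.warn is typically the deprecated alias of logger.warning, and warning_once, as the name suggests, additionally deduplicates the message, so the padding/truncation hint should appear once rather than on every tokenization call. A small sketch, assuming fastNLP's logger is importable as shown:

    from fastNLP import logger

    for _ in range(3):
        logger.warning_once("`max_length` is ignored when `padding`=`True` "
                            "and there is no truncation strategy.")
    # the message is expected to be logged a single time, not three times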