From 20b42a8c3b1fb606d2214e175c2749d1e65d1a61 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Tue, 4 Jan 2022 11:48:43 +0800 Subject: [PATCH] fix(dnn): add naive lstm kernel GitOrigin-RevId: f08ef810cf936768a022c10f226d80e499355659 --- dnn/include/megdnn/oprs/nn.h | 2 +- dnn/scripts/opr_param_defs.py | 130 +++---- dnn/src/common/handle.cpp | 1 + dnn/src/common/lstm.cpp | 73 ++-- dnn/src/common/lstm_cell.cpp | 109 ++++-- dnn/src/common/opr_trait.h | 6 +- dnn/src/common/rnn.cpp | 53 ++- dnn/src/common/rnn_cell.cpp | 54 ++- dnn/src/cuda/cudnn_wrapper.cpp | 114 ------ dnn/src/cuda/cudnn_wrapper.h | 39 -- dnn/src/cuda/handle_create.cpp | 15 +- dnn/src/cuda/lstm/opr_impl.cpp | 112 ------ dnn/src/cuda/lstm/opr_impl.h | 56 --- dnn/src/cuda/lstm/utils.cpp | 39 -- dnn/src/cuda/lstm/utils.h | 23 -- dnn/src/cuda/lstm_cell/opr_impl.cpp | 42 -- dnn/src/cuda/lstm_cell/opr_impl.h | 36 -- dnn/src/cuda/rnn/opr_impl.cpp | 170 --------- dnn/src/cuda/rnn/opr_impl.h | 57 --- dnn/src/cuda/rnn/utils.cpp | 138 ------- dnn/src/cuda/rnn/utils.h | 56 --- dnn/src/cuda/rnn_cell/opr_impl.cpp | 35 -- dnn/src/cuda/rnn_cell/opr_impl.h | 40 -- dnn/src/naive/handle.cpp | 4 +- dnn/src/naive/lstm/opr_impl.cpp | 82 ++-- dnn/src/naive/lstm/opr_impl.h | 15 +- dnn/src/naive/lstm/template_impl.cpp | 3 +- dnn/src/naive/lstm_cell/opr_impl.cpp | 12 +- dnn/src/naive/lstm_cell/opr_impl.h | 2 + dnn/src/naive/rnn/funcs.h | 445 +++++++++++++++++++++- dnn/src/naive/rnn/funcs.tpp | 449 ---------------------- dnn/src/naive/rnn/opr_impl.cpp | 116 +++--- dnn/src/naive/rnn/opr_impl.h | 13 +- dnn/src/naive/rnn/rnn.cpp | 10 +- dnn/src/naive/rnn/rnn.h | 1 + dnn/src/naive/rnn/template_impl.cpp | 4 +- dnn/src/naive/rnn_cell/opr_impl.cpp | 12 +- dnn/src/naive/rnn_cell/opr_impl.h | 1 + dnn/test/common/deduce_layout_proxy.h | 49 ++- dnn/test/common/exec_proxy.h | 38 ++ dnn/test/common/rnn.h | 51 --- dnn/test/naive/lstm.cpp | 197 ++++++++++ dnn/test/naive/lstmcell.cpp | 140 +++++++ dnn/test/naive/rnn.cpp | 74 ++-- dnn/test/naive/rnn_cell.cpp | 104 +++++ imperative/python/megengine/module/rnn.py | 507 ++++++++++++++++++------- imperative/python/test/unit/module/test_rnn.py | 12 +- src/opr/impl/dnn/rnn.cpp | 5 +- src/opr/include/megbrain/opr/dnn/rnn.h | 30 +- 49 files changed, 1838 insertions(+), 1938 deletions(-) delete mode 100644 dnn/src/cuda/lstm/opr_impl.cpp delete mode 100644 dnn/src/cuda/lstm/opr_impl.h delete mode 100644 dnn/src/cuda/lstm/utils.cpp delete mode 100644 dnn/src/cuda/lstm/utils.h delete mode 100644 dnn/src/cuda/lstm_cell/opr_impl.cpp delete mode 100644 dnn/src/cuda/lstm_cell/opr_impl.h delete mode 100644 dnn/src/cuda/rnn/opr_impl.cpp delete mode 100644 dnn/src/cuda/rnn/opr_impl.h delete mode 100644 dnn/src/cuda/rnn/utils.cpp delete mode 100644 dnn/src/cuda/rnn/utils.h delete mode 100644 dnn/src/cuda/rnn_cell/opr_impl.cpp delete mode 100644 dnn/src/cuda/rnn_cell/opr_impl.h delete mode 100644 dnn/src/naive/rnn/funcs.tpp delete mode 100644 dnn/test/common/rnn.h create mode 100644 dnn/test/naive/lstm.cpp create mode 100644 dnn/test/naive/lstmcell.cpp create mode 100644 dnn/test/naive/rnn_cell.cpp diff --git a/dnn/include/megdnn/oprs/nn.h b/dnn/include/megdnn/oprs/nn.h index a9ce7e30..38500626 100644 --- a/dnn/include/megdnn/oprs/nn.h +++ b/dnn/include/megdnn/oprs/nn.h @@ -2059,7 +2059,7 @@ public: _megdnn_tensor_in bias_ih, _megdnn_tensor_in hx, _megdnn_tensor_in weight_hh, _megdnn_tensor_in bias_hh, _megdnn_tensor_out dst, _megdnn_workspace workspace) = 0; - static void deduce_layout( + void deduce_layout( const TensorLayout& 
input, const TensorLayout& weight_ih, const TensorLayout& bias_ih, const TensorLayout& hx, const TensorLayout& weight_hh, const TensorLayout& bias_hh, diff --git a/dnn/scripts/opr_param_defs.py b/dnn/scripts/opr_param_defs.py index 46d6f61b..f6b36aa7 100755 --- a/dnn/scripts/opr_param_defs.py +++ b/dnn/scripts/opr_param_defs.py @@ -36,18 +36,13 @@ pdef('Axis').add_fields('int32', 'axis', 0) add_enum(Doc('Format', 'convolution data/filter/output format; see ' ':class:`RelayoutFormat` for more details'), 'NCHW = 0', 'NHWC = 1', 'NHWCD4 = 2', 'NCHW4 = 3', 'NCHW8 = 4', 'NCHW32 = 5', 'NCHW88 = 6', - 'NCHW44 = 7', 'NCHW44_DOT = 8', + 'NCHW44 = 7','NCHW44_DOT = 8', Doc('NCHW_WINOGRAD = 9', 'NCHW layout with weights tranformed by winograd'), - Doc('NCHW88_WINOGRAD = 10', - 'NCHW88 layout with weights tranformed by winograd'), - Doc('NCHW44_WINOGRAD = 11', - 'NCHW44 layout with weights tranformed by winograd'), - Doc('NCHW4_NCHW32 = 12', - 'NCHW4_NCHW32 means input tensors are nchw4 layout, output tensor is nchw32 layout'), - Doc('NCHW32_NCHW4 = 13', - 'NCHW32_NCHW4 means input tensors are nchw32 layout, output tensor is nchw4 layout'), - Doc('NCHW4_NCHW = 14', - 'NCHW4_NCHW means input tensors are nchw4 layout, output tensor is nchw layout'), + Doc('NCHW88_WINOGRAD = 10', 'NCHW88 layout with weights tranformed by winograd'), + Doc('NCHW44_WINOGRAD = 11', 'NCHW44 layout with weights tranformed by winograd'), + Doc('NCHW4_NCHW32 = 12', 'NCHW4_NCHW32 means input tensors are nchw4 layout, output tensor is nchw32 layout'), + Doc('NCHW32_NCHW4 = 13', 'NCHW32_NCHW4 means input tensors are nchw32 layout, output tensor is nchw4 layout'), + Doc('NCHW4_NCHW = 14', 'NCHW4_NCHW means input tensors are nchw4 layout, output tensor is nchw layout'), Doc('NHWC_NCHW = 15', 'NHWC_NCHW means input tensors are nhwc layout, ' 'output tensor is nchw layout'), Doc('NHWC_NCHW4_IC_SMALL = 16', 'NHWC_NCHW4_IC_SMALL means input tensors are nhwc(c < 4) layout, ' @@ -101,13 +96,10 @@ pdef('Axis').add_fields('int32', 'axis', 0) add_enum(Doc('Format', 'convolution data/filter/output format; see ' ':class:`RelayoutFormat` for more details'), 'NCHW = 0', 'NHWC = 1', 'NHWCD4 = 2', 'NCHW4 = 3', 'NCHW8 = 4', 'NCHW32 = 5', 'NCHW88 = 6', - 'NCHW44 = 7', 'NCHW44_DOT = 8', - Doc('NCHW4_NCHW32 = 9', - 'NCHW4_NCHW32 means input tensors are nchw4 layout, output tensor is nchw32 layout'), - Doc('NCHW32_NCHW4 = 10', - 'NCHW32_NCHW4 means input tensors are nchw32 layout, output tensor is nchw4 layout'), - Doc('NCHW4_NCHW = 11', - 'NCHW4_NCHW means input tensors are nchw4 layout, output tensor is nchw layout'), + 'NCHW44 = 7','NCHW44_DOT = 8', + Doc('NCHW4_NCHW32 = 9', 'NCHW4_NCHW32 means input tensors are nchw4 layout, output tensor is nchw32 layout'), + Doc('NCHW32_NCHW4 = 10', 'NCHW32_NCHW4 means input tensors are nchw32 layout, output tensor is nchw4 layout'), + Doc('NCHW4_NCHW = 11', 'NCHW4_NCHW means input tensors are nchw4 layout, output tensor is nchw layout'), Doc('NHWC_NCHW = 12', 'NHWC_NCHW means input tensors are nhwc layout, ' 'output tensor is nchw layout'), Doc('NHWC_NCHW4_IC_SMALL = 13', 'NHWC_NCHW4_IC_SMALL means input tensors are nhwc(c < 4) layout, ' @@ -115,11 +107,11 @@ pdef('Axis').add_fields('int32', 'axis', 0) Doc('NCHW_NCHW4_IC_SMALL = 14', 'NCHW_NCHW4_IC_SMALL means input tensors are nchw(c < 4) layout, ' 'output tensor is nchw4 layout, padding c=4'), Doc('CHWN4 = 15', 'CHWN4 is currently only used on Nvidia platform for fast implementation ' - 'of convolution using CUDA/SASS. 
The channels are splitted to groups of 4 channels.'), + 'of convolution using CUDA/SASS. The channels are splitted to groups of 4 channels.'), Doc('NCHW64 = 16', 'NCHW64 is designed for convolution implementation to utilizing TensorCore ' - 'instructions for 4-bit integers on Nvidia platforms'), + 'instructions for 4-bit integers on Nvidia platforms'), Doc('NCHW4_NHWC = 17', 'NCHW4_NHWC means input tensors are nchw4 layout, output tensor is nhwc layout')). - add_enum_alias('ComputeMode', 'ConvolutionV1', name_field='compute_mode') + add_enum_alias('ComputeMode', 'ConvolutionV1',name_field='compute_mode') ) @@ -141,7 +133,7 @@ pdef('Axis').add_fields('int32', 'axis', 0) add_enum_alias('ConvMode', 'ConvolutionV0', 'Mode'). add_enum('PoolMode', 'AVERAGE = 0', 'MAX = 1'). add_enum('NonlineMode', 'IDENTITY = 0', 'RELU = 1', 'SIGMOID = 2'). - add_fields('uint32', 'pool_shape_h', 1, 'pool_shape_w', 1, 'pool_stride_h', 1, 'pool_stride_w', 1, + add_fields('uint32', 'pool_shape_h', 1, 'pool_shape_w', 1, 'pool_stride_h', 1, 'pool_stride_w', 1, \ 'pool_pad_h', 0, 'pool_pad_w', 0, 'conv_stride_h', 1, 'conv_stride_w', 1, 'conv_pad_h', 0, 'conv_pad_w', 0)) (pdef('ConvBias', 'legacy conv_bias', version=0, is_legacy=True). @@ -224,8 +216,8 @@ pdef('Axis').add_fields('int32', 'axis', 0) (pdef('SeparableConv'). add_enum_alias('Mode', 'ConvolutionV0'). add_enum('BorderMode', 'BORDER_REPLICATE = 0', 'BORDER_REFLECT = 1', - 'BORDER_REFLECT_101 = 2', 'BORDER_WRAP = 3', - 'BORDER_CONSTANT = 4', 'BORDER_TRANSPARENT = 5', 'BORDER_ISOLATED = 6'). + 'BORDER_REFLECT_101 = 2','BORDER_WRAP = 3', + 'BORDER_CONSTANT = 4', 'BORDER_TRANSPARENT = 5','BORDER_ISOLATED = 6'). add_fields('bool', 'is_symm_kernel', 'true'). add_fields('uint32', 'pad_h', 0, 'pad_w', 0, 'stride_h', 1, 'stride_w', 1, 'ksize_h', 3, 'ksize_w', 3, 'anchor_h', 1, 'anchor_w', 1)) @@ -255,7 +247,7 @@ pdef('Axis').add_fields('int32', 'axis', 0) ) (pdef('Pooling', version=1). - add_enum_alias('Mode', 'PoolingV0'). + add_enum_alias('Mode','PoolingV0'). add_fields('uint32', 'pad_h', 0, 'pad_w', 0, 'stride_h', 2, 'stride_w', 2, 'window_h', 2, 'window_w', 2). add_enum_alias('Format', 'Convolution') @@ -310,8 +302,7 @@ pdef('Axis').add_fields('int32', 'axis', 0) ). add_fields('float32', 'scale', '1.f')) -INTERP_MODES = ['NEAREST = 0', 'LINEAR = 1', - 'AREA = 2', 'CUBIC = 3', 'LANCZOS4 = 4'] +INTERP_MODES = ['NEAREST = 0', 'LINEAR = 1', 'AREA = 2', 'CUBIC = 3', 'LANCZOS4 = 4'] BORDER_MODES = [Doc('REPLICATE = 0', 'aaaaaa|abcdefgh|hhhhhhh'), Doc('REFLECT = 1', 'fedcba|abcdefgh|hgfedcb'), Doc('REFLECT_101 = 2', 'gfedcb|abcdefgh|gfedcba'), @@ -332,8 +323,8 @@ BORDER_MODES = [Doc('REPLICATE = 0', 'aaaaaa|abcdefgh|hhhhhhh'), add_fields('float32', Doc('border_val', 'used for CONSTANT bmode'), '.0f')) (pdef('WarpPerspective', version=2). - add_enum_alias('InterpolationMode', 'WarpPerspectiveV1', name_field="imode"). - add_enum_alias('BorderMode', 'WarpPerspectiveV1', name_field="bmode"). + add_enum_alias('InterpolationMode','WarpPerspectiveV1',name_field="imode"). + add_enum_alias('BorderMode','WarpPerspectiveV1',name_field="bmode"). add_enum_alias('Format', 'Convolution'). 
add_fields('float32', Doc('border_val', 'used for CONSTANT bmode'), '.0f')) @@ -408,7 +399,7 @@ pdef('Elemwise').add_enum( Doc('RMULH = 43', 'binary: rounded higher l bits of x * y, where l is the bit ' 'length of x.'), - Doc('ATAN2 = 44', 'binary: atan2(y,x)'), + Doc('ATAN2 = 44','binary: atan2(y,x)'), Doc('ERF = 45', 'unary: erf(x)'), Doc('ERFINV = 46', 'unary: inverse function of erf(x)'), Doc('ERFC = 47', 'unary: erfc(x)'), @@ -643,7 +634,7 @@ Currently, ```DEFAULT``` mode means: Doc('axis', 'axis along which reduction is performed; if INT_MAX is given, ' 'reduce to given target shape (only used in megbrain)'), - (1 << 31)-1). + (1<<31)-1). add_enum('DataType', Doc('DEFAULT = 0', ''' @@ -698,7 +689,7 @@ Currently, ```DEFAULT``` mode means: add_fields('int32', Doc('axis', 'axis along which cumsum is performed, default with INT_MAX'), - (1 << 31)-1). + (1<<31)-1). add_fields('bool', Doc('exclusive', 'whether the current element is taken into account'), @@ -770,8 +761,7 @@ pdef('Sleep').add_fields('float32', Doc('time', 'time to sleep in seconds'), 0) (pdef('UniformRNG', version=1). add_fields('uint64', 'seed', 0). add_fields( - 'dtype', Doc( - 'dtype', 'The dtype of output Tensor. Only support Float32.'), + 'dtype', Doc('dtype', 'The dtype of output Tensor. Only support Float32.'), 'DTypeEnum::Float32')) (pdef('GaussianRNG', version=0, is_legacy=True). @@ -782,8 +772,7 @@ pdef('Sleep').add_fields('float32', Doc('time', 'time to sleep in seconds'), 0) add_fields('uint64', 'seed', 0). add_fields('float32', 'mean', 0, 'std', 1). add_fields( - 'dtype', Doc( - 'dtype', 'The dtype of output Tensor. Only support Float32.'), + 'dtype', Doc('dtype', 'The dtype of output Tensor. Only support Float32.'), 'DTypeEnum::Float32')) (pdef('GammaRNG'). @@ -830,7 +819,7 @@ pdef('Sleep').add_fields('float32', Doc('time', 'time to sleep in seconds'), 0) ('YUV2GRAY_NV12', 'BT601_YUV2GRAY_NV12'), ('YUV2GRAY_YV12', 'BT601_YUV2GRAY_YV12'), ('YUV2GRAY_YU12', 'BT601_YUV2GRAY_YU12')], - name_field='mode')) + name_field = 'mode')) (pdef('WarpAffine', version=0, is_legacy=True) .add_enum_alias('InterpolationMode', 'WarpPerspectiveV1', name_field='imode') @@ -853,7 +842,7 @@ pdef('Sleep').add_fields('float32', Doc('time', 'time to sleep in seconds'), 0) (pdef('GaussianBlur') .add_enum_alias('BorderMode', 'WarpPerspectiveV1', name_field='border_mode') .add_fields('uint32', 'kernel_height', 0, 'kernel_width', 0) - .add_fields('float32', 'sigma_x', '0.f', 'sigma_y', '0.f')) + .add_fields('float32','sigma_x', '0.f', 'sigma_y', '0.f')) (pdef('Resize', version=0, is_legacy=True) .add_enum_alias('InterpolationMode', 'WarpPerspectiveV1', name_field='imode')) @@ -866,7 +855,7 @@ pdef('Sleep').add_fields('float32', Doc('time', 'time to sleep in seconds'), 0) .add_enum_alias('InterpolationMode', 'WarpPerspectiveV1', name_field='imode') .add_enum_alias('Format', 'Convolution', default=1)) -(pdef('Remap', version=0, is_legacy=True) +(pdef('Remap', version=0,is_legacy=True) .add_enum_alias('InterpolationMode', 'WarpPerspectiveV1', name_field='imode') .add_enum_alias('BorderMode', 'WarpPerspectiveV1', name_field='border_type') .add_enum_alias('Format', 'ConvolutionV0', default=1) @@ -920,8 +909,8 @@ pdef('Sleep').add_fields('float32', Doc('time', 'time to sleep in seconds'), 0) (pdef('SeparableConv3D'). add_enum_alias('Mode', 'Convolution3D'). add_enum('BorderMode', 'BORDER_REPLICATE = 0', 'BORDER_REFLECT = 1', - 'BORDER_REFLECT_101 = 2', 'BORDER_WRAP = 3', - 'BORDER_CONSTANT = 4', 'BORDER_TRANSPARENT = 5', 'BORDER_ISOLATED = 6'). 
+ 'BORDER_REFLECT_101 = 2','BORDER_WRAP = 3', + 'BORDER_CONSTANT = 4', 'BORDER_TRANSPARENT = 5','BORDER_ISOLATED = 6'). add_fields('bool', 'is_symm_kernel', 'true'). add_fields('uint32', 'pad_d', 0, 'pad_h', 0, 'pad_w', 0, 'stride_d', 0, 'stride_h', 1, 'stride_w', 1, @@ -1034,10 +1023,10 @@ Note: NCHW_NCHW4_WEIGHT will auto pad oc and ic, you should remove oc in later o 'NCHW_NCHW4 = 24', 'NCHW4_NCHW = 25', 'NCHW_NCHW4_WEIGHT = 26', - 'NCHW_NCHW64 = 27', - 'NCHW64_NCHW = 28', - 'NCHW_NHWC = 29', - 'NHWC_NCHW = 30', + 'NCHW_NCHW64 = 27', + 'NCHW64_NCHW = 28', + 'NCHW_NHWC = 29', + 'NHWC_NCHW = 30', ) ) @@ -1059,7 +1048,7 @@ Note: NCHW_NCHW4_WEIGHT will auto pad oc and ic, you should remove oc in later o add_fields('bool', 'is_symm_kernel', 'true'). add_fields('uint32', 'ksize_h', 3, 'ksize_w', 3, 'anchor_h', 1, 'anchor_w', 1)) -(pdef('LocalShare', 'Local share convolution', version=0, is_legacy=True). +(pdef('LocalShare', 'Local share convolution',version=0, is_legacy=True). add_enum_alias('Mode', 'ConvolutionV0'). add_fields( 'uint32', @@ -1100,7 +1089,7 @@ Note: NCHW_NCHW4_WEIGHT will auto pad oc and ic, you should remove oc in later o ) -(pdef('ROIAlign', version=0, is_legacy=True). +(pdef('ROIAlign',version=0,is_legacy=True). add_enum('Mode', 'MAX = 0', 'AVERAGE = 1', name_field='mode'). add_enum_alias('Format', 'ConvolutionV0'). add_fields('float32', 'spatial_scale', '1.0'). @@ -1144,7 +1133,7 @@ Note: NCHW_NCHW4_WEIGHT will auto pad oc and ic, you should remove oc in later o Doc('part_size', 'size of each deformable part'), 1, Doc('sample_per_part', 'sample count of each bbox'), 1)) -(pdef('BatchConvBias', 'Batch convolution (unshare weights on the batch dimension)', version=0, is_legacy=True). +(pdef('BatchConvBias', 'Batch convolution (unshare weights on the batch dimension)',version=0,is_legacy=True). add_enum_alias('NonlineMode', 'ConvBiasV0'). add_enum_alias('Mode', 'ConvolutionV0'). add_fields( @@ -1163,7 +1152,7 @@ Note: NCHW_NCHW4_WEIGHT will auto pad oc and ic, you should remove oc in later o add_enum_alias('ComputeMode', 'ConvolutionV1', name_field="compute_mode") ) -(pdef('BatchConvBias', 'Batch convolution (unshare weights on the batch dimension)', version=1). +(pdef('BatchConvBias', 'Batch convolution (unshare weights on the batch dimension)',version=1). add_enum_alias('NonlineMode', 'ConvBiasV0'). add_enum_alias('Mode', 'ConvolutionV0'). add_fields( @@ -1183,8 +1172,8 @@ Note: NCHW_NCHW4_WEIGHT will auto pad oc and ic, you should remove oc in later o ) (pdef('FakeQuant'). - add_fields('int32', 'qmin', '-2147483648'). - add_fields('int32', 'qmax', '2147483647') + add_fields('int32','qmin','-2147483648'). + add_fields('int32','qmax','2147483647') ) (pdef('TQT'). add_fields('int32', 'qmin', '-2147483648'). @@ -1203,13 +1192,13 @@ PADDING_MODES = [Doc('REPLICATE = 0', 'aaaaaa|abcdefgh|hhhhhhh'), Doc('REFLECT = 1', 'fedcba|abcdefgh|hgfedcb'), Doc('CONSTANT = 2', 'iiiiii|abcdefgh|iiiiiii')] (pdef('Padding'). - add_fields('uint32', Doc('front_offset_dim0', 'offset in dim 0'), 0). - add_fields('uint32', Doc('front_offset_dim1', 'offset in dim 1'), 0). - add_fields('uint32', Doc('front_offset_dim2', 'offset in dim 2'), 0). - add_fields('uint32', Doc('front_offset_dim3', 'offset in dim 3'), 0). - add_fields('uint32', Doc('front_offset_dim4', 'offset in dim 4'), 0). - add_fields('uint32', Doc('front_offset_dim5', 'offset in dim 5'), 0). - add_fields('uint32', Doc('front_offset_dim6', 'offset in dim 6'), 0). + add_fields('uint32', Doc('front_offset_dim0','offset in dim 0'), 0). 
+ add_fields('uint32', Doc('front_offset_dim1','offset in dim 1'), 0). + add_fields('uint32', Doc('front_offset_dim2','offset in dim 2'), 0). + add_fields('uint32', Doc('front_offset_dim3','offset in dim 3'), 0). + add_fields('uint32', Doc('front_offset_dim4','offset in dim 4'), 0). + add_fields('uint32', Doc('front_offset_dim5','offset in dim 5'), 0). + add_fields('uint32', Doc('front_offset_dim6','offset in dim 6'), 0). add_fields('uint32', Doc('back_offset_dim0', 'back offset in dim0'), 0). add_fields('uint32', Doc('back_offset_dim1', 'back offset in dim1'), 0). add_fields('uint32', Doc('back_offset_dim2', 'back offset in dim2'), 0). @@ -1217,7 +1206,7 @@ PADDING_MODES = [Doc('REPLICATE = 0', 'aaaaaa|abcdefgh|hhhhhhh'), add_fields('uint32', Doc('back_offset_dim4', 'back offset in dim4'), 0). add_fields('uint32', Doc('back_offset_dim5', 'back offset in dim5'), 0). add_fields('uint32', Doc('back_offset_dim6', 'back offset in dim6'), 0). - add_fields('float32', Doc('padding_val', 'param of padding opr'), 0). + add_fields('float32', Doc('padding_val','param of padding opr'), 0). add_enum('PaddingMode', *PADDING_MODES, name_field='padding_mode', default=2, member_alias=[(i, 'PADDING_{}'.format(i)) for i in PADDING_MODES] @@ -1241,22 +1230,21 @@ PADDING_MODES = [Doc('REPLICATE = 0', 'aaaaaa|abcdefgh|hhhhhhh'), ) (pdef('RNN'). - add_fields('uint32', 'num_layers', '1'). - add_fields('bool', 'bidirectional', 'false'). - add_fields('bool', 'bias', 'true'). - add_fields('uint32', 'hidden_size', '128'). - add_fields('uint32', 'proj_size', '0'). - add_fields('float32', 'dropout', '0.f'). + add_fields('uint32', Doc('num_layers', 'Number of recurrent layers'), '1'). + add_fields('bool', Doc('bidirectional', 'If becomes a bidirectional RNN'), 'false'). + add_fields('bool', Doc('bias', 'If the layer use bias weights b_ih and b_hh'), 'true'). + add_fields('uint32', Doc('hidden_size', 'The number of features in the hidden state'), '128'). + add_fields('float32', Doc('dropout', 'If introduce a Dropout layer on the outputs of each RNN layer'), '0.f'). add_enum_alias('NonlineMode', 'RNNCell'). add_enum_alias('FwdMode', 'BN', name_field='fwd_mode') ) (pdef('LSTM'). - add_fields('uint32', 'num_layers', '1'). - add_fields('bool', 'bidirectional', 'false'). - add_fields('bool', 'bias', 'true'). - add_fields('uint32', 'hidden_size', '128'). - add_fields('uint32', 'proj_size', '0'). - add_fields('float32', 'dropout', '0.f'). + add_fields('uint32', Doc('num_layers', 'Number of recurrent layers'), '1'). + add_fields('bool', Doc('bidirectional', 'If becomes a bidirectional LSTM'), 'false'). + add_fields('bool', Doc('bias', 'If the layer use bias weights b_ih and b_hh'), 'true'). + add_fields('uint32', Doc('hidden_size', 'The number of features in the hidden state'), '128'). + add_fields('uint32', Doc('proj_size', 'If use LSTM with projections of corresponding size'), '0'). + add_fields('float32', Doc('dropout', 'If introduce a Dropout layer on the outputs of each LSTM layer'), '0.f'). 
add_enum_alias('FwdMode', 'BN', name_field='fwd_mode') ) diff --git a/dnn/src/common/handle.cpp b/dnn/src/common/handle.cpp index 17f4718c..ee9c3372 100644 --- a/dnn/src/common/handle.cpp +++ b/dnn/src/common/handle.cpp @@ -224,5 +224,6 @@ std::unique_ptr Handle::create_operator() { #define INST(opr) template std::unique_ptr Handle::create_operator(); MEGDNN_FOREACH_OPR_CLASS(INST) + #undef INST // vim: syntax=cpp.doxygen diff --git a/dnn/src/common/lstm.cpp b/dnn/src/common/lstm.cpp index 4275a582..211cdf68 100644 --- a/dnn/src/common/lstm.cpp +++ b/dnn/src/common/lstm.cpp @@ -10,19 +10,12 @@ */ #include "megdnn/oprs.h" #include "src/common/utils.h" -// #include "src/cuda/lstm/utils.h" namespace megdnn { -/*size_t get_reserve_size(Handle* handle, megdnn::LSTMForward::Param& param, const -TensorLayout& input) { #if CUDNN_MAJOR >= 6 auto holder = -megdnn::cuda::lstm::get_RNNDescHolder_v6(handle, param, input); return -holder.reserveSpace_size; # else return 0; #endif -}*/ - void LSTM::deduce_layout( const TensorLayout& input, const TensorLayout& hx, const TensorLayout& cx, - const TensorLayout& flatten_weights, TensorLayout& output, TensorLayout& hy, + const TensorLayout& /*flatten_weights*/, TensorLayout& output, TensorLayout& hy, TensorLayout& cy, TensorLayout& reserve_space) { // input: [seq_len, batch_size, input_size] // hx: [D * num_layers, batch_size, hidden_size] @@ -34,24 +27,30 @@ void LSTM::deduce_layout( TensorShape{seq_len, batch_size, D * hidden_size}, input.dtype); hy = TensorLayout(hx); cy = TensorLayout(cx); - // reserve_space = {{get_reserve_size(this->handle(), param(), input)}, - // dtype::Byte()}; - reserve_space = {{get_reserve_size_in_bytes(input)}, dtype::Byte()}; + reserve_space = {{get_reserve_size_in_bytes(input)}, input.dtype}; } void LSTM::check_exec( const TensorLayout& input, const TensorLayout& hx, const TensorLayout& cx, const TensorLayout& flatten_weights, const TensorLayout& output, const TensorLayout& hy, const TensorLayout& cy, - const TensorLayout& reserve_space, size_t workspace_in_bytes) { + const TensorLayout& /*reserve_space*/, size_t /*workspace_in_bytes*/) { auto errmsg = [&]() { std::string msg; msg.append("input="); msg.append(input.to_string()); + msg.append(", output="); + msg.append(output.to_string()); msg.append(", hx="); msg.append(hx.to_string()); msg.append(", cx="); msg.append(cx.to_string()); + msg.append(", hy="); + msg.append(hy.to_string()); + msg.append(", cy="); + msg.append(cy.to_string()); + msg.append(", flatten_weights="); + msg.append(flatten_weights.to_string()); msg.append(", hidden_size="); msg.append(std::to_string(param().hidden_size)); msg.append(", num_layers="); @@ -61,9 +60,29 @@ void LSTM::check_exec( return msg; }; size_t D = param().bidirectional ? 2 : 1; + size_t b = param().bias ? 
1 : 0; size_t num_layers = param().num_layers; + size_t input_size = input.shape[2]; + size_t gate_hidden_size = 4 * param().hidden_size; + // first layer{ weight_ih_l[k][_reverse].shape = (4*hidden_size, input_size) + // weight_hh_l[k][_reverse].shape = (4*hidden_size, hidden_size)} + // other layers{ weight_ih_l[k][_reverse].shape = (4*hidden_size, num_directions * + // hidden_size) + // weight_hh_l[k][_reverse].shape = (4*hidden_size, hidden_size)} + // bias: 2 * num_directions * num_layers + // size_dim1 = D * first layer + (layer -1) * other layer + bias + size_t size_dim1 = D * (input_size + param().hidden_size) + + (num_layers - 1) * D * ((D + 1) * param().hidden_size) + + b * 2 * D * num_layers; #define ASSERT_BRIEF(_content) megdnn_assert(_content, "%s", errmsg().c_str()); + ASSERT_BRIEF(input.ndim == 3) + ASSERT_BRIEF(output.ndim == 3) + ASSERT_BRIEF(flatten_weights.shape[0] == gate_hidden_size) + ASSERT_BRIEF(flatten_weights.shape[0] == size_dim1) + ASSERT_BRIEF(output.shape[0] == input.shape[0]) + ASSERT_BRIEF(output.shape[1] == input.shape[1]) + ASSERT_BRIEF(output.shape[2] == D * param().hidden_size) ASSERT_BRIEF(hx.ndim == 3) ASSERT_BRIEF(hx.shape[0] == D * num_layers) ASSERT_BRIEF(hx.shape[1] == input.shape[1]) // batch_size @@ -72,14 +91,22 @@ void LSTM::check_exec( ASSERT_BRIEF(cx.shape[0] == D * num_layers) ASSERT_BRIEF(cx.shape[1] == input.shape[1]) // batch_size ASSERT_BRIEF(cx.shape[2] == param().hidden_size) + ASSERT_BRIEF(hy.ndim == 3) + ASSERT_BRIEF(hy.shape[0] == D * num_layers) + ASSERT_BRIEF(hy.shape[1] == input.shape[1]) // batch_size + ASSERT_BRIEF(hy.shape[2] == param().hidden_size) + ASSERT_BRIEF(cy.ndim == 3) + ASSERT_BRIEF(cy.shape[0] == D * num_layers) + ASSERT_BRIEF(cy.shape[1] == input.shape[1]) // batch_size + ASSERT_BRIEF(cy.shape[2] == param().hidden_size) #undef ASSERT_BRIEF } void LSTMBackward::deduce_layout( - const TensorLayout& x, const TensorLayout& y, const TensorLayout& hx, - const TensorLayout& cx, const TensorLayout& dy, const TensorLayout& dhy, - const TensorLayout& dcy, const TensorLayout& flatten_weights, - const TensorLayout& reserve_space, TensorLayout& dx, TensorLayout& dhx, + const TensorLayout& x, const TensorLayout& /*y*/, const TensorLayout& hx, + const TensorLayout& cx, const TensorLayout& /*dy*/, const TensorLayout& /*dhy*/, + const TensorLayout& /*dcy*/, const TensorLayout& flatten_weights, + const TensorLayout& /*reserve_space*/, TensorLayout& dx, TensorLayout& dhx, TensorLayout& dcx, TensorLayout& dw) { dx = x; dhx = hx; @@ -87,12 +114,14 @@ void LSTMBackward::deduce_layout( dw = flatten_weights; } +// TODO: add shape check of BWD void LSTMBackward::check_exec( - const TensorLayout& x, const TensorLayout& y, const TensorLayout& hx, - const TensorLayout& cx, const TensorLayout& dy, const TensorLayout& dhy, - const TensorLayout& dcy, const TensorLayout& flatten_weights, - const TensorLayout& reserve_space, const TensorLayout& dx, - const TensorLayout& dhx, const TensorLayout& dcx, const TensorLayout& dw, - size_t workspace_in_bytes) {} + const TensorLayout& /*x*/, const TensorLayout& /*y*/, + const TensorLayout& /*hx*/, const TensorLayout& /*cx*/, + const TensorLayout& /*dy*/, const TensorLayout& /*dhy*/, + const TensorLayout& /*dcy*/, const TensorLayout& /*flatten_weights*/, + const TensorLayout& /*reserve_space*/, const TensorLayout& /*dx*/, + const TensorLayout& /*dhx*/, const TensorLayout& /*dcx*/, + const TensorLayout& /*dw*/, size_t /*workspace_in_bytes*/) {} } // namespace megdnn \ No newline at end of file diff --git 
a/dnn/src/common/lstm_cell.cpp b/dnn/src/common/lstm_cell.cpp index 35abdf4e..859ed782 100644 --- a/dnn/src/common/lstm_cell.cpp +++ b/dnn/src/common/lstm_cell.cpp @@ -20,8 +20,6 @@ void LSTMCell::deduce_layout( const TensorLayout& weight_hh, const TensorLayout& bias_hh, const TensorLayout& cx, TensorLayout& h_new, TensorLayout& c_new, TensorLayout& gates) { - // size_t batch_size = hx.shape[0]; - // size_t hidden_size = hx.shape[1]; h_new = TensorLayout(hx, hx.dtype); c_new = TensorLayout(cx, cx.dtype); auto opr = handle()->create_operator(); @@ -36,6 +34,39 @@ void LSTMCell::check_exec( const TensorLayout& cx, const TensorLayout& h_new, const TensorLayout& c_new, const TensorLayout& gates, size_t workspace_in_bytes) { TensorLayout h_new_expected, c_new_expected, gates_expected; + auto errmsg = [&]() { + std::string msg; + msg.append("input="); + msg.append(input.to_string()); + msg.append(", weight_ih="); + msg.append(weight_ih.to_string()); + msg.append(", bias_ih="); + msg.append(bias_ih.to_string()); + msg.append(", hx="); + msg.append(hx.to_string()); + msg.append(", weight_hh="); + msg.append(weight_hh.to_string()); + msg.append(", bias_hh="); + msg.append(bias_hh.to_string()); + msg.append(", cx="); + msg.append(cx.to_string()); + return msg; + }; +#define ASSERT_BRIEF(_content) megdnn_assert(_content, "%s", errmsg().c_str()); + + ASSERT_BRIEF(input.ndim == 2) + ASSERT_BRIEF(input.shape[1] == weight_ih.shape[1]) + ASSERT_BRIEF(weight_ih.shape[0] == weight_hh.shape[0]) + ASSERT_BRIEF(weight_hh.shape[0] == 4 * weight_hh.shape[1]) + ASSERT_BRIEF(bias_ih.shape[0] == bias_hh.shape[0]) + ASSERT_BRIEF(hx.ndim == 2) + ASSERT_BRIEF(hx.shape[0] == input.shape[0]) + ASSERT_BRIEF(hx.shape[1] == cx.shape[1]) // hidden_size + ASSERT_BRIEF(cx.ndim == 2) + ASSERT_BRIEF(cx.shape[0] == input.shape[0]) + ASSERT_BRIEF(cx.shape[1] == weight_hh.shape[1]) +#undef ASSERT_BRIEF + deduce_layout( input, weight_ih, bias_ih, hx, weight_hh, bias_hh, cx, h_new_expected, c_new_expected, gates_expected); @@ -57,15 +88,15 @@ size_t get_workspace_in_bytes( const TensorLayout& input, const TensorLayout& weight_ih, const TensorLayout& bias_ih, const TensorLayout& hx, const TensorLayout& weight_hh, const TensorLayout& bias_hh, - const TensorLayout& cx, const TensorLayout& h_new, const TensorLayout& c_new, - const TensorLayout& gates, Handle* handle) { + const TensorLayout& /*cx*/, const TensorLayout& /*h_new*/, + const TensorLayout& /*c_new*/, const TensorLayout& gates, Handle* handle) { TensorLayout tmp_layout; auto opr = handle->create_operator(); opr->param().nonlineMode = param::RNNCell::NonlineMode::IDENTITY; opr->deduce_layout(input, weight_ih, bias_ih, hx, weight_hh, bias_hh, tmp_layout); size_t rnn_cell_need = opr->get_workspace_in_bytes( input, weight_ih, bias_ih, hx, weight_hh, bias_hh, gates); - size_t lstm_cell_need = tmp_layout.span().dist_byte(); + size_t lstm_cell_need = 2 * tmp_layout.span().dist_byte(); return rnn_cell_need > lstm_cell_need ? 
rnn_cell_need : lstm_cell_need; } @@ -76,37 +107,48 @@ void exec( _megdnn_tensor_out gates, _megdnn_workspace workspace, Handle* handle) { auto opr = handle->create_operator(); opr->param().nonlineMode = param::RNNCell::NonlineMode::IDENTITY; - /*TensorLayout tmp_layout; - opr->deduce_layout(input.layout, weight_ih.layout, - hx.layout, weight_hh.layout, - bias.layout, tmp_layout); - auto workspace_ptr = workspace.raw_ptr; - // TensorND tmp{static_cast(workspace.raw_ptr), tmp_layout}; - TensorND tmp{workspace_ptr, tmp_layout}; - auto new_workspace = Workspace{workspace_ptr + tmp.layout.span().dist_byte(), - workspace.size - - tmp.layout.span().dist_byte()};*/ - // opr->exec(input, weight_ih, hx, weight_hh, bias, tmp, new_workspace); opr->exec(input, weight_ih, bias_ih, hx, weight_hh, bias_hh, gates, workspace); // activation - // size_t batch_size = tmp.layout.shape[0]; size_t batch_size = hx.layout.shape[0]; size_t hidden_size = hx.layout.shape[1]; - // sigmoid: i f o - // TensorLayout gates_ifo_layout{TensorShape({batch_size, hidden_size * 3}), - // tmp.layout.dtype}; - TensorND tmp{static_cast(workspace.raw_ptr), gates.layout}; + + auto copy_opr = handle->create_operator(); + TensorND copy_gates{static_cast(workspace.raw_ptr), gates.layout}; + TensorLayout hidden_layout{TensorShape{hidden_size}, hx.layout.dtype}; + TensorLayout gateinfo_layout{TensorShape{batch_size, hidden_size}, hx.layout.dtype}; + for (size_t i = 0; i < batch_size; i++) { + for (size_t j = 0; j < 4; j++) { + TensorND half_step_states{ + // output + static_cast(gates.raw_ptr()) + + (4 * i + j) * hidden_layout.span().dist_byte(), + hidden_layout}; + TensorND half_step_output{ + static_cast(copy_gates.raw_ptr()) + + j * gateinfo_layout.span().dist_byte() + + i * hidden_layout.span().dist_byte(), + hidden_layout}; + copy_opr->exec(half_step_states, half_step_output); + } + } + void* workspace_ptr = workspace.raw_ptr + copy_gates.layout.span().dist_byte(); + copy_opr->exec(copy_gates, gates); + + // sigmoid: i f + TensorND tmp{static_cast(workspace_ptr), copy_gates.layout}; TensorLayout gates_ifo_layout{ - TensorShape({batch_size, hidden_size * 3}), gates.layout.dtype}; - TensorND gates_ifo_origin{gates.raw_ptr(), gates_ifo_layout}; + TensorShape({batch_size, hidden_size * 2}), copy_gates.layout.dtype}; + TensorND gates_ifo_origin{copy_gates.raw_ptr(), gates_ifo_layout}; TensorND gates_ifo{tmp.raw_ptr(), gates_ifo_layout}; auto sigmoid = handle->create_operator(); sigmoid->param().mode = Elemwise::Param::Mode::SIGMOID; sigmoid->exec({gates_ifo_origin}, gates_ifo); // tanh: g - TensorLayout g_layout{TensorShape({batch_size, hidden_size}), gates.layout.dtype}; + TensorLayout g_layout{ + TensorShape({batch_size, hidden_size}), copy_gates.layout.dtype}; TensorND g_origin{ - static_cast(gates.raw_ptr()) + gates_ifo_layout.span().dist_byte(), + static_cast(copy_gates.raw_ptr()) + + gates_ifo_layout.span().dist_byte(), g_layout}; TensorND g{ static_cast(tmp.raw_ptr()) + gates_ifo_layout.span().dist_byte(), @@ -114,13 +156,24 @@ void exec( auto tanh = handle->create_operator(); tanh->param().mode = Elemwise::Param::Mode::TANH; tanh->exec({g_origin}, g); + // sigmoid: o + TensorLayout three_gates_ifo_layout{ + TensorShape({batch_size, hidden_size * 3}), copy_gates.layout.dtype}; + TensorLayout o_layout{ + TensorShape({batch_size, hidden_size}), copy_gates.layout.dtype}; + TensorND o_origin{ + static_cast(copy_gates.raw_ptr()) + + three_gates_ifo_layout.span().dist_byte(), + o_layout}; + TensorND o{ + static_cast(tmp.raw_ptr()) + + 
three_gates_ifo_layout.span().dist_byte(), + o_layout}; + sigmoid->exec({o_origin}, o); // extract i f o TensorND i{static_cast(tmp.raw_ptr()), g_layout}; TensorND f{ static_cast(tmp.raw_ptr()) + g_layout.span().dist_byte(), g_layout}; - TensorND o{ - static_cast(tmp.raw_ptr()) + g_layout.span().dist_byte() * 2, - g_layout}; // calculate new cell state auto elewise_mul_add = handle->create_operator(); elewise_mul_add->param().mode = Elemwise::Param::Mode::FUSE_MUL_ADD4; diff --git a/dnn/src/common/opr_trait.h b/dnn/src/common/opr_trait.h index 43e3a8b5..f1558e75 100644 --- a/dnn/src/common/opr_trait.h +++ b/dnn/src/common/opr_trait.h @@ -139,8 +139,12 @@ DEF(LayerNormForward, 6, true, true); DEF(LayerNormBackward, 8, true, true); DEF(DropoutForward, 3, true, true); DEF(DropoutBackward, 3, true, true); -DEF(RNNCellForward, 6, true, true); +DEF(RNNCellForward, 7, true, true); DEF(RNNForward, 6, true, true); +DEF(RNNBackward, 10, true, true); +DEF(LSTMCellForward, 10, true, true); +DEF(LSTMForward, 8, true, true); +DEF(LSTMBackward, 13, true, true); } // namespace megdnn // vim: syntax=cpp.doxygen diff --git a/dnn/src/common/rnn.cpp b/dnn/src/common/rnn.cpp index 2cf3b2f0..3bb54a24 100644 --- a/dnn/src/common/rnn.cpp +++ b/dnn/src/common/rnn.cpp @@ -16,10 +16,8 @@ namespace megdnn { void RNN::deduce_layout( const TensorLayout& input, const TensorLayout& hx, - const TensorLayout& flatten_weights, TensorLayout& output, TensorLayout& hy, + const TensorLayout& /*flatten_weights*/, TensorLayout& output, TensorLayout& hy, TensorLayout& reserve_space) { - // input: [seq_len, batch_size, input_size] - // hx: [D * num_layers, batch_size, hidden_size] size_t seq_len = input.shape[0]; size_t batch_size = input.shape[1]; size_t D = param().bidirectional ? 2 : 1; @@ -27,22 +25,26 @@ void RNN::deduce_layout( output = TensorLayout( TensorShape{seq_len, batch_size, D * hidden_size}, input.dtype); hy = TensorLayout(hx); - // reserve_space = {{get_reserve_size(this->handle(), param(), input)}, - // dtype::Byte()}; - reserve_space = {{get_reserve_size_in_bytes(input)}, dtype::Byte()}; + reserve_space = {{get_reserve_size_in_bytes(input)}, input.dtype}; } void RNN::check_exec( const TensorLayout& input, const TensorLayout& hx, const TensorLayout& flatten_weights, const TensorLayout& output, - const TensorLayout& hy, const TensorLayout& reserve_space, - size_t workspace_in_bytes) { + const TensorLayout& hy, const TensorLayout& /*reserve_space*/, + size_t /*workspace_in_bytes*/) { auto errmsg = [&]() { std::string msg; msg.append("input="); msg.append(input.to_string()); + msg.append(", output="); + msg.append(output.to_string()); msg.append(", hx="); msg.append(hx.to_string()); + msg.append(", flatten_weights="); + msg.append(flatten_weights.to_string()); + msg.append(", hy="); + msg.append(hy.to_string()); msg.append(", hidden_size="); msg.append(std::to_string(param().hidden_size)); msg.append(", num_layers="); @@ -52,20 +54,38 @@ void RNN::check_exec( return msg; }; size_t D = param().bidirectional ? 2 : 1; + size_t b = param().bias ? 
1 : 0; size_t num_layers = param().num_layers; + size_t input_size = input.shape[2]; + size_t gate_hidden_size = param().hidden_size; + // calculate size_dim1 the same as lstm + size_t size_dim1 = D * (input_size + param().hidden_size) + + (num_layers - 1) * D * ((D + 1) * param().hidden_size) + + b * 2 * D * num_layers; #define ASSERT_BRIEF(_content) megdnn_assert(_content, "%s", errmsg().c_str()); ASSERT_BRIEF(hx.ndim == 3) + ASSERT_BRIEF(input.ndim == 3) + ASSERT_BRIEF(output.ndim == 3) + ASSERT_BRIEF(hy.ndim == 3) + ASSERT_BRIEF(flatten_weights.shape[0] == gate_hidden_size) + ASSERT_BRIEF(flatten_weights.shape[0] == size_dim1) ASSERT_BRIEF(hx.shape[0] == D * num_layers) ASSERT_BRIEF(hx.shape[1] == input.shape[1]) // batch_size ASSERT_BRIEF(hx.shape[2] == param().hidden_size) + ASSERT_BRIEF(output.shape[0] == input.shape[0]) + ASSERT_BRIEF(output.shape[1] == input.shape[1]) + ASSERT_BRIEF(output.shape[2] == D * param().hidden_size) + ASSERT_BRIEF(hy.shape[0] == hx.shape[0]) + ASSERT_BRIEF(hy.shape[1] == hx.shape[1]) + ASSERT_BRIEF(hy.shape[2] == hx.shape[2]) #undef ASSERT_BRIEF } void RNNBackward::deduce_layout( - const TensorLayout& x, const TensorLayout& y, const TensorLayout& hx, - const TensorLayout& dy, const TensorLayout& dhy, - const TensorLayout& flatten_weights, const TensorLayout& reserve_space, + const TensorLayout& x, const TensorLayout& /*y*/, const TensorLayout& hx, + const TensorLayout& /*dy*/, const TensorLayout& /*dhy*/, + const TensorLayout& flatten_weights, const TensorLayout& /*reserve_space*/, TensorLayout& dx, TensorLayout& dhx, TensorLayout& dw) { dx = x; dhx = hx; @@ -73,10 +93,11 @@ void RNNBackward::deduce_layout( } void RNNBackward::check_exec( - const TensorLayout& x, const TensorLayout& y, const TensorLayout& hx, - const TensorLayout& dy, const TensorLayout& dhy, - const TensorLayout& flatten_weights, const TensorLayout& reserve_space, - const TensorLayout& dx, const TensorLayout& dhx, const TensorLayout& dw, - size_t workspace_in_bytes) {} + const TensorLayout& /*x*/, const TensorLayout& /*y*/, + const TensorLayout& /*hx*/, const TensorLayout& /*dy*/, + const TensorLayout& /*dhy*/, const TensorLayout& /*flatten_weights*/, + const TensorLayout& /*reserve_space*/, const TensorLayout& /*dx*/, + const TensorLayout& /*dhx*/, const TensorLayout& /*dw*/, + size_t /*workspace_in_bytes*/) {} } // namespace megdnn diff --git a/dnn/src/common/rnn_cell.cpp b/dnn/src/common/rnn_cell.cpp index 0cdbc259..aa30addb 100644 --- a/dnn/src/common/rnn_cell.cpp +++ b/dnn/src/common/rnn_cell.cpp @@ -16,16 +16,11 @@ namespace megdnn { void RNNCell::deduce_layout( const TensorLayout& input, const TensorLayout& weight_ih, - const TensorLayout& bias_ih, const TensorLayout& hx, - const TensorLayout& weight_hh, const TensorLayout& bias_hh, TensorLayout& dst) { - // megdnn_assert(hx.ndim == 2); + const TensorLayout& /*bias_ih*/, const TensorLayout& hx, + const TensorLayout& /*weight_hh*/, const TensorLayout& /*bias_hh*/, + TensorLayout& dst) { size_t batch_size = hx.shape[0]; - // size_t hidden_size = weight_hh.shape[1]; size_t gate_hidden_size = weight_ih.shape[0]; - // size_t input_size = weight_ih.shape[1]; - // megdnn_assert(input.shape[1] == input_size); - // megdnn_assert(hx.shape[1] == hidden_size); - // megdnn_assert_eq_dtype(input, hx); dst = TensorLayout(TensorShape({batch_size, gate_hidden_size}), input.dtype); } @@ -36,6 +31,37 @@ void RNNCell::check_exec( const TensorLayout& weight_hh, const TensorLayout& bias_hh, const TensorLayout& dst, size_t workspace_in_bytes) { 
TensorLayout dst_expected; + auto errmsg = [&]() { + std::string msg; + msg.append("input="); + msg.append(input.to_string()); + msg.append(", weight_ih="); + msg.append(weight_ih.to_string()); + msg.append(", bias_ih="); + msg.append(bias_ih.to_string()); + msg.append(", hx="); + msg.append(hx.to_string()); + msg.append(", weight_hh="); + msg.append(weight_hh.to_string()); + msg.append(", bias_hh="); + msg.append(bias_hh.to_string()); + msg.append(", dst="); + msg.append(dst.to_string()); + return msg; + }; +#define ASSERT_BRIEF(_content) megdnn_assert(_content, "%s", errmsg().c_str()); + + ASSERT_BRIEF(input.ndim == 2) + ASSERT_BRIEF(hx.ndim == 2) + ASSERT_BRIEF(hx.shape[0] == input.shape[0]) // batch + ASSERT_BRIEF(input.shape[1] == weight_ih.shape[1]) + ASSERT_BRIEF(hx.shape[0] == dst.shape[0]) // batch + ASSERT_BRIEF(hx.shape[1] == dst.shape[1]) + ASSERT_BRIEF(hx.shape[1] == weight_ih.shape[0]) // hidden_size + ASSERT_BRIEF(weight_ih.shape[0] == weight_hh.shape[0]) + ASSERT_BRIEF(weight_hh.shape[0] == weight_hh.shape[1]) + ASSERT_BRIEF(bias_ih.shape[0] == bias_hh.shape[0]) +#undef ASSERT_BRIEF megdnn_assert_eq_dtype(input, dst); megdnn_assert_eq_dtype(hx, dst); deduce_layout(input, weight_ih, bias_ih, hx, weight_hh, bias_hh, dst_expected); @@ -53,12 +79,15 @@ namespace rnn_cell { size_t get_workspace_in_bytes( const TensorLayout& input, const TensorLayout& weight_ih, - const TensorLayout& bias_ih, const TensorLayout& hx, - const TensorLayout& weight_hh, const TensorLayout& bias_hh, + const TensorLayout& /*bias_ih*/, const TensorLayout& hx, + const TensorLayout& weight_hh, const TensorLayout& /*bias_hh*/, const TensorLayout& dst, Handle* handle) { auto opr = handle->create_operator(); opr->param().transposeB = true; - return dst.span().dist_byte() + opr->get_workspace_in_bytes(hx, weight_hh, dst); + return dst.span().dist_byte() + + std::max( + opr->get_workspace_in_bytes(hx, weight_hh, dst), + opr->get_workspace_in_bytes(input, weight_ih, dst)); } void exec( @@ -74,14 +103,11 @@ void exec( opr->param().transposeB = true; opr->exec(input, weight_ih, tmp, new_workspace); opr->exec(hx, weight_hh, dst, new_workspace); - // if (this->param().bias) add_bias(dst, tmp, bias, dst); - // if (this->param().bias) { auto add_opr = handle->create_operator(); add_opr->param().mode = Elemwise::Param::Mode::ADD; add_opr->exec({dst, tmp}, dst); add_opr->exec({dst, bias_ih}, dst); add_opr->exec({dst, bias_hh}, dst); - // } // activation using NonlineMode = param::RNNCell::NonlineMode; diff --git a/dnn/src/cuda/cudnn_wrapper.cpp b/dnn/src/cuda/cudnn_wrapper.cpp index 16d73ef6..f8080afb 100644 --- a/dnn/src/cuda/cudnn_wrapper.cpp +++ b/dnn/src/cuda/cudnn_wrapper.cpp @@ -160,29 +160,6 @@ void TensorDesc::set( } } -void TensorDesc::set_nd(const TensorLayout& layout, int pad) { - int nbDims = layout.ndim < pad ? 
pad : layout.ndim; - int dimA[nbDims], strideA[nbDims]; - - for (size_t i = 0; i < layout.ndim; ++i) { - dimA[i] = layout.shape[i]; - // strideA[i] = layout.stride[i]; - } - for (size_t i = layout.ndim; i < nbDims; ++i) { - dimA[i] = 1; // unused - // strideA[i] = 1; - } - // stride - for (size_t i = 0; i < nbDims; ++i) { - strideA[i] = 1; - for (size_t j = i + 1; j < nbDims; ++j) { - strideA[i] *= dimA[j]; - } - } - cudnn_check(cudnnSetTensorNdDescriptor( - desc, to_cudnn_dtype(layout.dtype), nbDims, dimA, strideA)); -} - std::string TensorDesc::to_string() { cudnnDataType_t data_type; int n; @@ -456,97 +433,6 @@ void Conv3DDesc::set(const param::Convolution3D& param, const size_t nr_group) { desc, 3, padA, filterStrideA, dilationA, mode, CUDNN_DATA_FLOAT)); } -DropoutDesc::DropoutDesc() { - cudnn_check(cudnnCreateDropoutDescriptor(&desc)); -} - -DropoutDesc::~DropoutDesc() { - cudnn_check(cudnnDestroyDropoutDescriptor(desc)); -} - -void DropoutDesc::set(float dropout, Handle* handle, TensorND& state) { - cudnn_check(cudnnSetDropoutDescriptor( - desc, cudnn_handle(handle), dropout, state.raw_ptr(), - state.layout.span().dist_byte(), 0 // seed - )); -} - -void DropoutDesc::set_no_dropout(Handle* handle) { - cudnn_check( - cudnnSetDropoutDescriptor(desc, cudnn_handle(handle), 0, nullptr, 0, 0)); -} - -RNNDesc::RNNDesc() { - cudnn_check(cudnnCreateRNNDescriptor(&desc)); -} - -RNNDesc::~RNNDesc() { - cudnn_check(cudnnDestroyRNNDescriptor(desc)); -} - -void RNNDesc::set( - size_t input_size, size_t hidden_size, size_t proj_size, size_t num_layers, - bool bidirectional, bool bias, const megdnn::DType dtype, cudnnRNNMode_t mode, - DropoutDesc& dropout_desc, Handle* handle) { - cudnnRNNMode_t rnn_mode = mode; - cudnnRNNBiasMode_t bias_mode = bias ? CUDNN_RNN_DOUBLE_BIAS : CUDNN_RNN_NO_BIAS; - cudnnDirectionMode_t dir_mode = - bidirectional ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; - cudnnDataType_t math_prec; - - // math precision - if (dtype.enumv() == DTypeEnum::Float16) - math_prec = CUDNN_DATA_HALF; - else - math_prec = CUDNN_DATA_FLOAT; - -#if false // CUDNN_MAJOR >= 8 - cudnn_check(cudnnSetRNNDescriptor_v8( - desc, CUDNN_RNN_ALGO_STANDARD, mode, bias_mode, dir_mode, - CUDNN_LINEAR_INPUT, to_cudnn_dtype(dtype), math_prec, CUDNN_DEFAULT_MATH, - input_size, hidden_size, proj_size, num_layers, dropout_desc.desc, - CUDNN_RNN_PADDED_IO_DISABLED)); -#else - cudnn_check(cudnnSetRNNDescriptor_v6( - cudnn_handle(handle), desc, hidden_size, num_layers, dropout_desc.desc, - CUDNN_LINEAR_INPUT, dir_mode, mode, CUDNN_RNN_ALGO_STANDARD, math_prec)); -#endif -} - -RNNDataDesc::RNNDataDesc() { - cudnn_check(cudnnCreateRNNDataDescriptor(&desc)); -} - -RNNDataDesc::~RNNDataDesc() { - cudnn_check(cudnnDestroyRNNDataDescriptor(desc)); -} - -void RNNDataDesc::set( - int batchSize, int vectorSize, int maxSeqLength, const int* devSeqLengths, - DType dtype) { - // for now, all tensor are padded in python - // int seqLengthArray[batchSize]; - // for (int i = 0; i < batchSize; ++i) seqLengthArray[i] = maxSeqLength; - cudnn_check(cudnnSetRNNDataDescriptor( - desc, to_cudnn_dtype(dtype), CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED, - maxSeqLength, batchSize, vectorSize, devSeqLengths, nullptr)); -} - -RNNWeightFilterDesc::RNNWeightFilterDesc() { - cudnn_check(cudnnCreateFilterDescriptor(&desc)); -} - -RNNWeightFilterDesc::~RNNWeightFilterDesc() { - cudnn_check(cudnnDestroyFilterDescriptor(desc)); -} - -void RNNWeightFilterDesc::set(const TensorLayout& flatten_weights) { - int weight_elem_num = flatten_weights.total_nr_elems(); - int dimW[] = {weight_elem_num, 1, 1}; - cudnn_check(cudnnSetFilterNdDescriptor( - desc, to_cudnn_dtype(flatten_weights.dtype), CUDNN_TENSOR_NCHW, 3, dimW)); -} - ////////////////////////// CudnnAlgoPack ////////////////////////// #define V1(v) #v diff --git a/dnn/src/cuda/cudnn_wrapper.h b/dnn/src/cuda/cudnn_wrapper.h index 5198e9b5..ef0ab5ab 100644 --- a/dnn/src/cuda/cudnn_wrapper.h +++ b/dnn/src/cuda/cudnn_wrapper.h @@ -30,7 +30,6 @@ public: void set( const TensorLayout& layout, const param::Convolution::Format = param::Convolution::Format::NCHW); - void set_nd(const TensorLayout& layout, int pad = 3); // at least 3 dimensions std::string to_string(); ~TensorDesc(); cudnnTensorDescriptor_t desc; @@ -122,44 +121,6 @@ public: static const std::unordered_map conv3d_fwd_algos(); }; -class DropoutDesc { -public: - DropoutDesc(); - void set(float dropout, Handle* handle, TensorND& state); - void set_no_dropout(Handle* handle); - ~DropoutDesc(); - cudnnDropoutDescriptor_t desc; -}; - -class RNNDesc { -public: - RNNDesc(); - void set( - size_t input_size, size_t hidden_size, size_t proj_size, size_t num_layers, - bool bidirectional, bool bias, const megdnn::DType dtype, - cudnnRNNMode_t mode, DropoutDesc& dropout_desc, Handle* handle); - ~RNNDesc(); - cudnnRNNDescriptor_t desc; -}; - -class RNNDataDesc { -public: - RNNDataDesc(); - void set( - int batchSize, int vectorSize, int maxSeqLength, const int* devSeqLengths, - DType dtype); - ~RNNDataDesc(); - cudnnRNNDataDescriptor_t desc; -}; - -class RNNWeightFilterDesc { -public: - RNNWeightFilterDesc(); - void set(const TensorLayout& flatten_weights); - ~RNNWeightFilterDesc(); - cudnnFilterDescriptor_t desc; -}; - } // namespace cuda } // namespace megdnn diff --git a/dnn/src/cuda/handle_create.cpp b/dnn/src/cuda/handle_create.cpp index 8d383b25..061af596 100644 --- 
a/dnn/src/cuda/handle_create.cpp +++ b/dnn/src/cuda/handle_create.cpp @@ -10,7 +10,7 @@ * implied. */ -#include "src/common/handle_impl.h" +// #include "src/common/handle_impl.h" #include "src/cuda/adaptive_pooling/opr_impl.h" #include "src/cuda/add_update/opr_impl.h" @@ -52,8 +52,6 @@ #include "src/cuda/local_share/opr_impl.h" #include "src/cuda/lrn/opr_impl.h" #include "src/cuda/lsq/opr_impl.h" -#include "src/cuda/lstm/opr_impl.h" -#include "src/cuda/lstm_cell/opr_impl.h" #include "src/cuda/mask_conv/opr_impl.h" #include "src/cuda/matrix_inverse/opr_impl.h" #include "src/cuda/matrix_mul/opr_impl.h" @@ -70,8 +68,6 @@ #include "src/cuda/repeat/opr_impl.h" #include "src/cuda/resize/opr_impl.h" #include "src/cuda/rng/opr_impl.h" -#include "src/cuda/rnn/opr_impl.h" -#include "src/cuda/rnn_cell/opr_impl.h" #include "src/cuda/roi_align/opr_impl.h" #include "src/cuda/roi_copy/opr_impl.h" #include "src/cuda/roi_pooling/opr_impl.h" @@ -94,6 +90,7 @@ namespace megdnn { namespace cuda { +// After Adding CUDA LSTM, the declaration of CUDA Backend should be restored // MEGDNN_FOREACH_OPR_CLASS(MEGDNN_SPECIALIZE_CREATE_OPERATOR) MEGDNN_SPECIALIZE_CREATE_OPERATOR(ConvolutionForward); MEGDNN_SPECIALIZE_CREATE_OPERATOR(ConvolutionBackwardData); @@ -222,6 +219,8 @@ MEGDNN_SPECIALIZE_CREATE_OPERATOR(PaddingForward); MEGDNN_SPECIALIZE_CREATE_OPERATOR(PaddingBackward); MEGDNN_SPECIALIZE_CREATE_OPERATOR(LayerNormForward); MEGDNN_SPECIALIZE_CREATE_OPERATOR(LayerNormBackward); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(DropoutForward); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(DropoutBackward); template std::unique_ptr HandleImpl::create_operator() { @@ -232,9 +231,11 @@ std::unique_ptr HandleImpl::create_operator() { #define MEGDNN_INST_CREATE_OPERATOR(opr) \ template std::unique_ptr HandleImpl::create_operator(); +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpragmas" +#pragma GCC diagnostic ignored "-Winstantiation-after-specialization" MEGDNN_FOREACH_OPR_CLASS(MEGDNN_INST_CREATE_OPERATOR) - - +#pragma GCC diagnostic pop } // namespace cuda } // namespace megdnn diff --git a/dnn/src/cuda/lstm/opr_impl.cpp b/dnn/src/cuda/lstm/opr_impl.cpp deleted file mode 100644 index 6dfc9ef0..00000000 --- a/dnn/src/cuda/lstm/opr_impl.cpp +++ /dev/null @@ -1,112 +0,0 @@ -/** - * \file dnn/src/cuda/lstm/opr_impl.cpp - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- */ -#include "src/cuda/lstm/opr_impl.h" -#include "src/cuda/lstm/utils.h" -#include "src/cuda/utils.h" - -#include - -namespace megdnn { -namespace cuda { - -void LSTMImpl::exec( - _megdnn_tensor_in input, _megdnn_tensor_in hx, _megdnn_tensor_in cx, - _megdnn_tensor_in flatten_weights, _megdnn_tensor_out output, - _megdnn_tensor_out hy, _megdnn_tensor_out cy, _megdnn_tensor_out reserve_space, - _megdnn_workspace workspace) { - Handle* handle = this->handle(); - - rnn::RNNForwardDescHolder_v6 desc_holder = - lstm::get_RNNDescHolder_v6(this->handle(), param(), input.layout); - auto x_desc_arr = rnn::get_descs(desc_holder.x_descs); - auto y_desc_arr = rnn::get_descs(desc_holder.y_descs); - RNNWeightFilterDesc w_desc; - w_desc.set(flatten_weights.layout); - - if (param().fwd_mode == param::LSTM::FwdMode::TRAINING) { - cudnn_check(cudnnRNNForwardTraining( - cudnn_handle(handle), desc_holder.rnn_desc.desc, desc_holder.seq_len, - x_desc_arr.data(), input.raw_ptr(), desc_holder.hx_desc.desc, - hx.raw_ptr(), desc_holder.cx_desc.desc, cx.raw_ptr(), w_desc.desc, - flatten_weights.raw_ptr(), y_desc_arr.data(), output.raw_ptr(), - desc_holder.hy_desc.desc, hy.raw_ptr(), desc_holder.cy_desc.desc, - cy.raw_ptr(), workspace.raw_ptr, desc_holder.workspace_size, - reserve_space.raw_ptr(), desc_holder.reserveSpace_size)); - } else { - cudnn_check(cudnnRNNForwardInference( - cudnn_handle(handle), desc_holder.rnn_desc.desc, desc_holder.seq_len, - x_desc_arr.data(), input.raw_ptr(), desc_holder.hx_desc.desc, - hx.raw_ptr(), desc_holder.cx_desc.desc, nullptr, w_desc.desc, - flatten_weights.raw_ptr(), y_desc_arr.data(), output.raw_ptr(), - desc_holder.hy_desc.desc, hy.raw_ptr(), desc_holder.cy_desc.desc, - cy.raw_ptr(), workspace.raw_ptr, desc_holder.workspace_size)); - } -} - -size_t LSTMImpl::get_workspace_in_bytes( - const TensorLayout& input, const TensorLayout& hx, const TensorLayout& cx, - const TensorLayout& flatten_weights, const TensorLayout& output, - const TensorLayout& hy, const TensorLayout& cy, - const TensorLayout& reserve_space) { - rnn::RNNForwardDescHolder_v6 desc_holder = - lstm::get_RNNDescHolder_v6(this->handle(), param(), input); - return desc_holder.workspace_size; -} - -size_t LSTMImpl::get_reserve_size_in_bytes(const TensorLayout& input) { - rnn::RNNForwardDescHolder_v6 desc_holder = - lstm::get_RNNDescHolder_v6(this->handle(), param(), input); - return desc_holder.reserveSpace_size; -} - -void LSTMBackwardImpl::exec( - _megdnn_tensor_in x, _megdnn_tensor_in y, _megdnn_tensor_in hx, - _megdnn_tensor_in cx, _megdnn_tensor_in dy, _megdnn_tensor_in dhy, - _megdnn_tensor_in dcy, _megdnn_tensor_in flatten_weights, - _megdnn_tensor_in reserve_space, _megdnn_tensor_out dx, _megdnn_tensor_out dhx, - _megdnn_tensor_out dcx, _megdnn_tensor_out dw, _megdnn_workspace workspace) { - Handle* handle = this->handle(); - size_t seq_len = x.layout.shape[0]; - auto desc_holder = lstm::get_RNNDescHolder_v6(handle, param(), x.layout); - auto x_desc_arr_ptr = rnn::get_descs(desc_holder.x_descs).data(); - auto y_desc_arr_ptr = rnn::get_descs(desc_holder.y_descs).data(); - RNNWeightFilterDesc w_desc; - w_desc.set(flatten_weights.layout); - - cudnn_check(cudnnRNNBackwardData( - cudnn_handle(handle), desc_holder.rnn_desc.desc, seq_len, y_desc_arr_ptr, - y.raw_ptr(), y_desc_arr_ptr, dy.raw_ptr(), desc_holder.hy_desc.desc, - dhy.raw_ptr(), desc_holder.cy_desc.desc, dcy.raw_ptr(), w_desc.desc, - flatten_weights.raw_ptr(), desc_holder.hx_desc.desc, hx.raw_ptr(), - desc_holder.cx_desc.desc, cx.raw_ptr(), 
x_desc_arr_ptr, dx.raw_ptr(), - desc_holder.hx_desc.desc, dhx.raw_ptr(), desc_holder.cx_desc.desc, - dcx.raw_ptr(), workspace.raw_ptr, desc_holder.workspace_size, - reserve_space.raw_ptr(), desc_holder.reserveSpace_size)); - - cudnn_check(cudnnRNNBackwardWeights( - cudnn_handle(handle), desc_holder.rnn_desc.desc, seq_len, x_desc_arr_ptr, - x.raw_ptr(), desc_holder.hx_desc.desc, hx.raw_ptr(), y_desc_arr_ptr, - y.raw_ptr(), workspace.raw_ptr, desc_holder.workspace_size, w_desc.desc, - dw.raw_ptr(), reserve_space.raw_ptr(), desc_holder.reserveSpace_size)); -} - -size_t LSTMBackwardImpl::get_workspace_in_bytes( - const TensorLayout& x, const TensorLayout& y, const TensorLayout& hx, - const TensorLayout& cx, const TensorLayout& dy, const TensorLayout& dhy, - const TensorLayout& dcy, const TensorLayout& flatten_weights, - const TensorLayout& reserve_space, const TensorLayout& dx, - const TensorLayout& dhx, const TensorLayout& dcx, const TensorLayout& dw) { - auto desc_holder = lstm::get_RNNDescHolder_v6(this->handle(), param(), x); - return desc_holder.workspace_size; -} - -} // namespace cuda -} // namespace megdnn diff --git a/dnn/src/cuda/lstm/opr_impl.h b/dnn/src/cuda/lstm/opr_impl.h deleted file mode 100644 index a01032f7..00000000 --- a/dnn/src/cuda/lstm/opr_impl.h +++ /dev/null @@ -1,56 +0,0 @@ -/** - * \file dnn/src/cuda/lstm/opr_impl.h - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -#pragma once -#include "megdnn/oprs.h" - -namespace megdnn { -namespace cuda { - -class LSTMImpl : public LSTM { -public: - using LSTM::LSTM; - - void exec( - _megdnn_tensor_in input, _megdnn_tensor_in hx, _megdnn_tensor_in cx, - _megdnn_tensor_in flatten_weights, _megdnn_tensor_out output, - _megdnn_tensor_out hy, _megdnn_tensor_out cy, - _megdnn_tensor_out reserve_space, _megdnn_workspace workspace); - - size_t get_workspace_in_bytes( - const TensorLayout& input, const TensorLayout& hx, const TensorLayout& cx, - const TensorLayout& flatten_weights, const TensorLayout& output, - const TensorLayout& hy, const TensorLayout& cy, - const TensorLayout& reserve_space); - size_t get_reserve_size_in_bytes(const TensorLayout& input); -}; - -class LSTMBackwardImpl : public LSTMBackward { -public: - using LSTMBackward::LSTMBackward; - - virtual void exec( - _megdnn_tensor_in x, _megdnn_tensor_in y, _megdnn_tensor_in hx, - _megdnn_tensor_in cx, _megdnn_tensor_in dy, _megdnn_tensor_in dhy, - _megdnn_tensor_in dcy, _megdnn_tensor_in flatten_weights, - _megdnn_tensor_in reserve_space, _megdnn_tensor_out dx, - _megdnn_tensor_out dhx, _megdnn_tensor_out dcx, _megdnn_tensor_out dw, - _megdnn_workspace workspace); - - virtual size_t get_workspace_in_bytes( - const TensorLayout& x, const TensorLayout& y, const TensorLayout& hx, - const TensorLayout& cx, const TensorLayout& dy, const TensorLayout& dhy, - const TensorLayout& dcy, const TensorLayout& flatten_weights, - const TensorLayout& reserve_space, const TensorLayout& dx, - const TensorLayout& dhx, const TensorLayout& dcx, const TensorLayout& dw); -}; - -} // namespace cuda -} // namespace megdnn \ No newline at end of file diff --git a/dnn/src/cuda/lstm/utils.cpp b/dnn/src/cuda/lstm/utils.cpp deleted file mode 100644 index 7fe1ab0e..00000000 --- 
a/dnn/src/cuda/lstm/utils.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/** - * \file dnn/src/cuda/lstm/utils.cpp - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -#include "src/cuda/lstm/utils.h" -#include "src/cuda/utils.h" - -#include - -namespace megdnn { -namespace cuda { -namespace lstm { - -RNNForwardDescHolder_v6 get_RNNDescHolder_v6( - Handle* handle, megdnn::LSTMForward::Param& _param, const TensorLayout& input) { - size_t seq_len = input.shape[0]; - size_t batch_size = input.shape[1]; - size_t input_size = input.shape[2]; - - cudnnRNNMode_t mode = CUDNN_LSTM; - - using FwdMode = param::LSTM::FwdMode; - - RNNForwardDescHolder_v6 desc_holder( - handle, seq_len, batch_size, _param.hidden_size, input_size, - _param.proj_size, _param.num_layers, _param.bidirectional, _param.bias, - input.dtype, mode); - return desc_holder; -} - -} // namespace lstm -} // namespace cuda -} // namespace megdnn \ No newline at end of file diff --git a/dnn/src/cuda/lstm/utils.h b/dnn/src/cuda/lstm/utils.h deleted file mode 100644 index 2623c6fd..00000000 --- a/dnn/src/cuda/lstm/utils.h +++ /dev/null @@ -1,23 +0,0 @@ -/** - * \file dnn/src/cuda/lstm/utils.h - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -#pragma once -#include "src/cuda/cudnn_wrapper.h" -#include "src/cuda/rnn/utils.h" - -namespace megdnn { -namespace cuda { -namespace lstm { -using megdnn::cuda::rnn::RNNForwardDescHolder_v6; -RNNForwardDescHolder_v6 get_RNNDescHolder_v6( - Handle* handle, megdnn::LSTMForward::Param& _param, const TensorLayout& input); -} // namespace lstm -} // namespace cuda -} // namespace megdnn \ No newline at end of file diff --git a/dnn/src/cuda/lstm_cell/opr_impl.cpp b/dnn/src/cuda/lstm_cell/opr_impl.cpp deleted file mode 100644 index b01c5de0..00000000 --- a/dnn/src/cuda/lstm_cell/opr_impl.cpp +++ /dev/null @@ -1,42 +0,0 @@ -/** - * \file dnn/src/cuda/lstm_cell/opr_impl.cpp - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- */ -#include "src/cuda/lstm_cell/opr_impl.h" -#include "megdnn/dtype.h" -#include "megdnn/oprs/base.h" -#include "src/common/lstm_cell.h" -#include "src/common/opr_delegate.h" -#include "src/common/utils.h" - -namespace megdnn { -namespace cuda { -size_t LSTMCellImpl::get_workspace_in_bytes( - const TensorLayout& input, const TensorLayout& weight_ih, - const TensorLayout& bias_ih, const TensorLayout& hx, - const TensorLayout& weight_hh, const TensorLayout& bias_hh, - const TensorLayout& cx, const TensorLayout& h_new, const TensorLayout& c_new, - const TensorLayout& gates) { - return megdnn::lstm_cell::get_workspace_in_bytes( - input, weight_ih, bias_ih, hx, weight_hh, bias_hh, cx, h_new, c_new, gates, - handle()); -} - -void LSTMCellImpl::exec( - _megdnn_tensor_in input, _megdnn_tensor_in weight_ih, _megdnn_tensor_in bias_ih, - _megdnn_tensor_in hx, _megdnn_tensor_in weight_hh, _megdnn_tensor_in bias_hh, - _megdnn_tensor_in cx, _megdnn_tensor_out h_new, _megdnn_tensor_out c_new, - _megdnn_tensor_out gates, _megdnn_workspace workspace) { - megdnn::lstm_cell::exec( - input, weight_ih, bias_ih, hx, weight_hh, bias_hh, cx, h_new, c_new, gates, - workspace, handle()); -} -} // namespace cuda - -} // namespace megdnn \ No newline at end of file diff --git a/dnn/src/cuda/lstm_cell/opr_impl.h b/dnn/src/cuda/lstm_cell/opr_impl.h deleted file mode 100644 index f8578162..00000000 --- a/dnn/src/cuda/lstm_cell/opr_impl.h +++ /dev/null @@ -1,36 +0,0 @@ -/** - * \file dnn/src/cuda/lstm_cell/opr_impl.h - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -#pragma once -#include "megdnn/oprs.h" -#include "src/cuda/rnn_cell/opr_impl.h" - -namespace megdnn { -namespace cuda { - -class LSTMCellImpl : public LSTMCell { -public: - using LSTMCell::LSTMCell; - void exec( - _megdnn_tensor_in input, _megdnn_tensor_in weight_ih, - _megdnn_tensor_in bias_ih, _megdnn_tensor_in hx, - _megdnn_tensor_in weight_hh, _megdnn_tensor_in bias_hh, - _megdnn_tensor_in cx, _megdnn_tensor_out h_new, _megdnn_tensor_out c_new, - _megdnn_tensor_out gates, _megdnn_workspace workspace) override; - size_t get_workspace_in_bytes( - const TensorLayout& input, const TensorLayout& weight_ih, - const TensorLayout& bias_ih, const TensorLayout& hx, - const TensorLayout& weight_hh, const TensorLayout& bias_hh, - const TensorLayout& cx, const TensorLayout& h_new, - const TensorLayout& c_new, const TensorLayout& gates) override; -}; - -} // namespace cuda -} // namespace megdnn \ No newline at end of file diff --git a/dnn/src/cuda/rnn/opr_impl.cpp b/dnn/src/cuda/rnn/opr_impl.cpp deleted file mode 100644 index 937345a2..00000000 --- a/dnn/src/cuda/rnn/opr_impl.cpp +++ /dev/null @@ -1,170 +0,0 @@ -/** - * \file dnn/src/cuda/rnn/opr_impl.cpp - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- */ -#include "src/cuda/rnn/opr_impl.h" -#include "src/common/rnn.h" -#include "src/cuda/utils.h" - -//#include -#include -#include -#include - -namespace megdnn { -namespace cuda { - -using namespace std; - -void RNNImpl::exec( - _megdnn_tensor_in input, _megdnn_tensor_in hx, - _megdnn_tensor_in flatten_weights, _megdnn_tensor_out output, - _megdnn_tensor_out hy, _megdnn_tensor_out reserve_space, - _megdnn_workspace workspace) { - Handle* handle = this->handle(); - -#if false // CUDNN_MAJOR >= 8 - rnn::RNNForwardDescHolder desc_holder = this->get_desc_holder(input.layout); - - void* workspace_ptr = workspace.raw_ptr; - void* reserveSpace_ptr = static_cast(workspace_ptr) + desc_holder.workspace_size; - - cudnn_check(cudnnRNNForward( - cudnn_handle(handle), desc_holder.rnn_desc.desc, desc_holder.fwdMode, desc_holder.devSeqLengths, - desc_holder.x_desc.desc, input.raw_ptr(), desc_holder.y_desc.desc, output.raw_ptr(), - desc_holder.h_desc.desc, hx.raw_ptr(), hy.raw_ptr(), - desc_holder.h_desc.desc, nullptr, nullptr, - desc_holder.weight_size, flatten_weights.raw_ptr(), desc_holder.workspace_size, workspace_ptr, - desc_holder.reserveSpace_size, reserveSpace_ptr - )); -#else - rnn::RNNForwardDescHolder_v6 desc_holder = - rnn::get_RNNDescHolder_v6(this->handle(), param(), input.layout); - auto x_desc_arr = rnn::get_descs(desc_holder.x_descs); - auto y_desc_arr = rnn::get_descs(desc_holder.y_descs); - RNNWeightFilterDesc w_desc; - w_desc.set(flatten_weights.layout); - - if (param().fwd_mode == param::RNN::FwdMode::TRAINING) { - cudnn_check(cudnnRNNForwardTraining( - cudnn_handle(handle), desc_holder.rnn_desc.desc, desc_holder.seq_len, - x_desc_arr.data(), input.raw_ptr(), desc_holder.hx_desc.desc, - hx.raw_ptr(), desc_holder.cx_desc.desc, NULL, w_desc.desc, - flatten_weights.raw_ptr(), y_desc_arr.data(), output.raw_ptr(), - desc_holder.hy_desc.desc, hy.raw_ptr(), desc_holder.cy_desc.desc, NULL, - workspace.raw_ptr, desc_holder.workspace_size, reserve_space.raw_ptr(), - desc_holder.reserveSpace_size)); - } else { - cudnn_check(cudnnRNNForwardInference( - cudnn_handle(handle), desc_holder.rnn_desc.desc, desc_holder.seq_len, - x_desc_arr.data(), input.raw_ptr(), desc_holder.hx_desc.desc, - hx.raw_ptr(), desc_holder.cx_desc.desc, nullptr, w_desc.desc, - flatten_weights.raw_ptr(), y_desc_arr.data(), output.raw_ptr(), - desc_holder.hy_desc.desc, hy.raw_ptr(), desc_holder.cy_desc.desc, - nullptr, workspace.raw_ptr, desc_holder.workspace_size)); - } -#endif -} - -size_t RNNImpl::get_workspace_in_bytes( - const TensorLayout& input, const TensorLayout& hx, - const TensorLayout& flatten_weights, const TensorLayout& output, - const TensorLayout& hy, const TensorLayout& reserve_space) { -#if false // CUDNN_MAJOR >= 8 - rnn::RNNForwardDescHolder desc_holder = this->get_desc_holder(input); -#else - rnn::RNNForwardDescHolder_v6 desc_holder = - rnn::get_RNNDescHolder_v6(this->handle(), param(), input); -#endif - return desc_holder.workspace_size; -} - -size_t RNNImpl::get_reserve_size_in_bytes(const TensorLayout& input) { - rnn::RNNForwardDescHolder_v6 desc_holder = - rnn::get_RNNDescHolder_v6(this->handle(), param(), input); - return desc_holder.reserveSpace_size; -} - -/*rnn::RNNForwardDescHolder RNNImpl::get_desc_holder(const TensorLayout& input) { - Handle* handle = this->handle(); - size_t seq_len = input.shape[0]; - size_t batch_size = input.shape[1]; - size_t input_size = input.shape[2]; - auto _param = param(); - - cudnnRNNMode_t mode; - using NonlineMode = param::RNN::NonlineMode; - switch 
(_param.nonlineMode) { - case NonlineMode::RELU: - mode = CUDNN_RNN_RELU; - break; - case NonlineMode::TANH: - mode = CUDNN_RNN_TANH; - break; - } - - cudnnForwardMode_t fwdMode = CUDNN_FWD_MODE_TRAINING; - using FwdMode = param::RNN::FwdMode; - switch (_param.fwd_mode) { - case FwdMode::TRAINING: - fwdMode = CUDNN_FWD_MODE_TRAINING; - break; - case FwdMode::INFERENCE: - fwdMode = CUDNN_FWD_MODE_INFERENCE; - break; - } - - rnn::RNNForwardDescHolder desc_holder( - handle, seq_len, batch_size, _param.hidden_size, input_size, - _param.proj_size, _param.num_layers, _param.bidirectional, _param.bias, - input.dtype, mode, fwdMode); - return desc_holder; -}*/ - -void RNNBackwardImpl::exec( - _megdnn_tensor_in x, _megdnn_tensor_in y, _megdnn_tensor_in hx, - _megdnn_tensor_in dy, _megdnn_tensor_in dhy, _megdnn_tensor_in flatten_weights, - _megdnn_tensor_in reserve_space, _megdnn_tensor_out dx, _megdnn_tensor_out dhx, - _megdnn_tensor_out dw, _megdnn_workspace workspace) { - Handle* handle = this->handle(); - size_t seq_len = x.layout.shape[0]; - auto desc_holder = rnn::get_RNNDescHolder_v6(handle, param(), x.layout); - auto x_desc_arr_ptr = rnn::get_descs(desc_holder.x_descs).data(); - auto y_desc_arr_ptr = rnn::get_descs(desc_holder.y_descs).data(); - RNNWeightFilterDesc w_desc; - w_desc.set(flatten_weights.layout); - - cudnn_check(cudnnRNNBackwardData( - cudnn_handle(handle), desc_holder.rnn_desc.desc, seq_len, y_desc_arr_ptr, - y.raw_ptr(), y_desc_arr_ptr, dy.raw_ptr(), desc_holder.hy_desc.desc, - dhy.raw_ptr(), desc_holder.cy_desc.desc, NULL, w_desc.desc, - flatten_weights.raw_ptr(), desc_holder.hx_desc.desc, hx.raw_ptr(), - desc_holder.cx_desc.desc, NULL, x_desc_arr_ptr, dx.raw_ptr(), - desc_holder.hx_desc.desc, dhx.raw_ptr(), desc_holder.cx_desc.desc, NULL, - workspace.raw_ptr, desc_holder.workspace_size, reserve_space.raw_ptr(), - desc_holder.reserveSpace_size)); - - cudnn_check(cudnnRNNBackwardWeights( - cudnn_handle(handle), desc_holder.rnn_desc.desc, seq_len, x_desc_arr_ptr, - x.raw_ptr(), desc_holder.hx_desc.desc, hx.raw_ptr(), y_desc_arr_ptr, - y.raw_ptr(), workspace.raw_ptr, desc_holder.workspace_size, w_desc.desc, - dw.raw_ptr(), reserve_space.raw_ptr(), desc_holder.reserveSpace_size)); -} - -size_t RNNBackwardImpl::get_workspace_in_bytes( - const TensorLayout& x, const TensorLayout& y, const TensorLayout& hx, - const TensorLayout& dy, const TensorLayout& dhy, - const TensorLayout& flatten_weights, const TensorLayout& reserve_space, - const TensorLayout& dx, const TensorLayout& dhx, const TensorLayout& dw) { - auto desc_holder = rnn::get_RNNDescHolder_v6(this->handle(), param(), x); - return desc_holder.workspace_size; -} - -} // namespace cuda -} // namespace megdnn diff --git a/dnn/src/cuda/rnn/opr_impl.h b/dnn/src/cuda/rnn/opr_impl.h deleted file mode 100644 index 701e7876..00000000 --- a/dnn/src/cuda/rnn/opr_impl.h +++ /dev/null @@ -1,57 +0,0 @@ -/** - * \file dnn/src/cuda/rnn/opr_impl.h - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- */ -#pragma once -#include "megdnn/oprs.h" -#include "src/cuda/cudnn_wrapper.h" -#include "src/cuda/rnn/utils.h" - -namespace megdnn { -namespace cuda { - -class RNNImpl : public RNN { -public: - using RNN::RNN; - - void exec( - _megdnn_tensor_in input, _megdnn_tensor_in hx, - _megdnn_tensor_in flatten_weights, _megdnn_tensor_out output, - _megdnn_tensor_out hy, _megdnn_tensor_out reserve_space, - _megdnn_workspace workspace); - - size_t get_workspace_in_bytes( - const TensorLayout& input, const TensorLayout& hx, - const TensorLayout& flatten_weights, const TensorLayout& output, - const TensorLayout& hy, const TensorLayout& reserve_space); - size_t get_reserve_size_in_bytes(const TensorLayout& input); - // private: - // rnn::RNNForwardDescHolder get_desc_holder(const TensorLayout& input); -}; - -class RNNBackwardImpl : public RNNBackward { -public: - using RNNBackward::RNNBackward; - - virtual void exec( - _megdnn_tensor_in x, _megdnn_tensor_in y, _megdnn_tensor_in hx, - _megdnn_tensor_in dy, _megdnn_tensor_in dhy, - _megdnn_tensor_in flatten_weights, _megdnn_tensor_in reserve_space, - _megdnn_tensor_out dx, _megdnn_tensor_out dhx, _megdnn_tensor_out dw, - _megdnn_workspace workspace); - - virtual size_t get_workspace_in_bytes( - const TensorLayout& x, const TensorLayout& y, const TensorLayout& hx, - const TensorLayout& dy, const TensorLayout& dhy, - const TensorLayout& flatten_weights, const TensorLayout& reserve_space, - const TensorLayout& dx, const TensorLayout& dhx, const TensorLayout& dw); -}; - -} // namespace cuda -} // namespace megdnn diff --git a/dnn/src/cuda/rnn/utils.cpp b/dnn/src/cuda/rnn/utils.cpp deleted file mode 100644 index 9e4ff825..00000000 --- a/dnn/src/cuda/rnn/utils.cpp +++ /dev/null @@ -1,138 +0,0 @@ -/** - * \file dnn/src/cuda/rnn/utils.cpp - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -#include "src/cuda/rnn/utils.h" -#include "src/cuda/utils.h" - -#include - -namespace megdnn { -namespace cuda { -namespace rnn { -/*RNNForwardDescHolder::RNNForwardDescHolder(Handle* handle, size_t seq_len, size_t -batch_size, size_t hidden_size, size_t input_size, size_t proj_size, size_t num_layers, -bool bidirectional, bool bias, DType dtype, cudnnRNNMode_t _mode, cudnnForwardMode_t -_fwdMode) : mode(_mode), fwdMode(_fwdMode) -{ - size_t D = bidirectional ? 2 : 1; - - // TODO: set dropout to 0 in inference mode - dropout_desc.set_no_dropout(handle); - - // seq len is unified (not packed) - // cuda_check(cudaMalloc((void**)&devSeqLengths, sizeof(int32_t) * batch_size)); - devSeqLengths = (int32_t*)malloc(sizeof(int32_t) * batch_size); - for (size_t i = 0; i < batch_size; ++i) devSeqLengths[i] = seq_len; - - // proj size should be smaller than hidden size according to cudnn api - // otherwise it is disabled - proj_size = (proj_size > hidden_size || proj_size == 0) ? 
hidden_size : -proj_size; rnn_desc.set( input_size, hidden_size, proj_size, num_layers, bidirectional, -bias, dtype, mode, dropout_desc, handle - ); - - x_desc.set(batch_size, input_size, seq_len, devSeqLengths, dtype); - y_desc.set(batch_size, D * proj_size, seq_len, - devSeqLengths, dtype); - h_desc.set_nd(TensorLayout(TensorShape{D * num_layers, batch_size, proj_size}, -dtype)); - - cudnn_check(cudnnGetRNNWeightSpaceSize(cudnn_handle(handle), rnn_desc.desc, -&weight_size)); - - cudnn_check(cudnnGetRNNTempSpaceSizes( - cudnn_handle(handle), rnn_desc.desc, fwdMode, x_desc.desc, -&workspace_size, &reserveSpace_size - )); -} - -RNNForwardDescHolder::~RNNForwardDescHolder() { - // cuda_check(cudaFree(devSeqLengths)); - free(devSeqLengths); -}*/ - -RNNForwardDescHolder_v6::RNNForwardDescHolder_v6( - Handle* handle, size_t seq_len, size_t batch_size, size_t hidden_size, - size_t input_size, size_t proj_size, size_t num_layers, bool bidirectional, - bool bias, DType dtype, cudnnRNNMode_t _mode) - : mode(_mode), seq_len(seq_len) { - size_t D = bidirectional ? 2 : 1; - - // TODO: set dropout to 0 in inference mode - dropout_desc.set_no_dropout(handle); - - proj_size = (proj_size > hidden_size || proj_size == 0) ? hidden_size : proj_size; - rnn_desc.set( - input_size, hidden_size, proj_size, num_layers, bidirectional, bias, dtype, - mode, dropout_desc, handle); - - x_descs.resize(seq_len); - y_descs.resize(seq_len); - for (size_t i = 0; i < seq_len; ++i) { - x_descs[i].set_nd(TensorLayout(TensorShape{batch_size, input_size}, dtype), 3); - y_descs[i].set_nd( - TensorLayout(TensorShape{batch_size, D * hidden_size}, dtype), 3); - } - -#define SET_H(_var) \ - _var.set_nd(TensorLayout( \ - TensorShape{D * num_layers, batch_size, hidden_size}, dtype)); - - SET_H(hx_desc) - SET_H(cx_desc) - SET_H(hy_desc) - SET_H(cy_desc) -#undef SET_H - - std::vector x_desc_arr = get_descs(x_descs); - cudnn_check(cudnnGetRNNWorkspaceSize( - cudnn_handle(handle), rnn_desc.desc, seq_len, x_desc_arr.data(), - &workspace_size)); - - cudnn_check(cudnnGetRNNTrainingReserveSize( - cudnn_handle(handle), rnn_desc.desc, seq_len, x_desc_arr.data(), - &reserveSpace_size)); -} - -RNNForwardDescHolder_v6 get_RNNDescHolder_v6( - Handle* handle, megdnn::RNNForward::Param& _param, const TensorLayout& input) { - size_t seq_len = input.shape[0]; - size_t batch_size = input.shape[1]; - size_t input_size = input.shape[2]; - - cudnnRNNMode_t mode; - using NonlineMode = param::RNN::NonlineMode; - switch (_param.nonlineMode) { - case NonlineMode::RELU: - mode = CUDNN_RNN_RELU; - break; - case NonlineMode::TANH: - mode = CUDNN_RNN_TANH; - break; - } - - RNNForwardDescHolder_v6 desc_holder( - handle, seq_len, batch_size, _param.hidden_size, input_size, - _param.proj_size, _param.num_layers, _param.bidirectional, _param.bias, - input.dtype, mode); - return desc_holder; -} - -std::vector get_descs(const std::vector& descs) { - std::vector r; - r.reserve(descs.size()); - for (auto& desc : descs) { - r.emplace_back(desc.desc); - } - return r; -} -} // namespace rnn -} // namespace cuda -} // namespace megdnn \ No newline at end of file diff --git a/dnn/src/cuda/rnn/utils.h b/dnn/src/cuda/rnn/utils.h deleted file mode 100644 index d2a652dc..00000000 --- a/dnn/src/cuda/rnn/utils.h +++ /dev/null @@ -1,56 +0,0 @@ -/** - * \file dnn/src/cuda/rnn/utils.h - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
- * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -#pragma once -#include "src/cuda/cudnn_wrapper.h" - -namespace megdnn { -namespace cuda { -namespace rnn { -// v8, not for now -/*struct RNNForwardDescHolder { - - int32_t* devSeqLengths; - cudnnRNNMode_t mode; - cudnnForwardMode_t fwdMode; - RNNDesc rnn_desc; - DropoutDesc dropout_desc; - RNNDataDesc x_desc, y_desc; - TensorDesc h_desc; - size_t weight_size, workspace_size, reserveSpace_size; - - RNNForwardDescHolder(Handle* handle, size_t seq_len, size_t batch_size, size_t -hidden_size, size_t input_size, size_t proj_size, size_t num_layers, bool bidirectional, - bool bias, DType dtype, -cudnnRNNMode_t _mode, cudnnForwardMode_t _fwdMode); ~RNNForwardDescHolder(); -};*/ - -struct RNNForwardDescHolder_v6 { - cudnnRNNMode_t mode; - RNNDesc rnn_desc; - int seq_len; - DropoutDesc dropout_desc; - std::vector x_descs, y_descs; - TensorDesc hx_desc, cx_desc, hy_desc, cy_desc; - - size_t workspace_size, reserveSpace_size; - - RNNForwardDescHolder_v6( - Handle* handle, size_t seq_len, size_t batch_size, size_t hidden_size, - size_t input_size, size_t proj_size, size_t num_layers, bool bidirectional, - bool bias, DType dtype, cudnnRNNMode_t _mode); -}; - -RNNForwardDescHolder_v6 get_RNNDescHolder_v6( - Handle* handle, megdnn::RNNForward::Param& _param, const TensorLayout& input); -std::vector get_descs(const std::vector& descs); -} // namespace rnn -} // namespace cuda -} // namespace megdnn \ No newline at end of file diff --git a/dnn/src/cuda/rnn_cell/opr_impl.cpp b/dnn/src/cuda/rnn_cell/opr_impl.cpp deleted file mode 100644 index 4fb5f2ed..00000000 --- a/dnn/src/cuda/rnn_cell/opr_impl.cpp +++ /dev/null @@ -1,35 +0,0 @@ -/** - * \file dnn/src/cuda/rnn_cell/opr_impl.cpp - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- */ -#include "src/cuda/rnn_cell/opr_impl.h" -#include "src/common/rnn_cell.h" - -namespace megdnn { -namespace cuda { -size_t RNNCellImpl::get_workspace_in_bytes( - const TensorLayout& input, const TensorLayout& weight_ih, - const TensorLayout& bias_ih, const TensorLayout& hx, - const TensorLayout& weight_hh, const TensorLayout& bias_hh, - const TensorLayout& dst) { - return megdnn::rnn_cell::get_workspace_in_bytes( - input, weight_ih, bias_hh, hx, weight_hh, bias_hh, dst, handle()); -} - -void RNNCellImpl::exec( - _megdnn_tensor_in input, _megdnn_tensor_in weight_ih, _megdnn_tensor_in bias_ih, - _megdnn_tensor_in hx, _megdnn_tensor_in weight_hh, _megdnn_tensor_in bias_hh, - _megdnn_tensor_out dst, _megdnn_workspace workspace) { - megdnn::rnn_cell::exec( - input, weight_ih, bias_ih, hx, weight_hh, bias_hh, dst, workspace, - param().nonlineMode, handle()); -} -} // namespace cuda - -} // namespace megdnn \ No newline at end of file diff --git a/dnn/src/cuda/rnn_cell/opr_impl.h b/dnn/src/cuda/rnn_cell/opr_impl.h deleted file mode 100644 index 916d8586..00000000 --- a/dnn/src/cuda/rnn_cell/opr_impl.h +++ /dev/null @@ -1,40 +0,0 @@ -/** - * \file dnn/src/cuda/rnn_cell/opr_impl.h - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -#pragma once -#include "megdnn/oprs.h" - -namespace megdnn { -namespace cuda { - -class RNNCellImpl : public RNNCell { -public: - using RNNCell::RNNCell; - void exec( - _megdnn_tensor_in input, _megdnn_tensor_in weight_ih, - _megdnn_tensor_in bias_ih, _megdnn_tensor_in hx, - _megdnn_tensor_in weight_hh, _megdnn_tensor_in bias_hh, - _megdnn_tensor_out dst, _megdnn_workspace workspace) override; - size_t get_workspace_in_bytes( - const TensorLayout& input, const TensorLayout& weight_ih, - const TensorLayout& bias_ih, const TensorLayout& hx, - const TensorLayout& weight_hh, const TensorLayout& bias_hh, - const TensorLayout& dst) override; - /* - private: - void add_bias(_megdnn_tensor_in A, - _megdnn_tensor_in B, - _megdnn_tensor_in bias, - _megdnn_tensor_out C); - */ -}; - -} // namespace cuda -} // namespace megdnn \ No newline at end of file diff --git a/dnn/src/naive/handle.cpp b/dnn/src/naive/handle.cpp index d92cdb72..87568168 100644 --- a/dnn/src/naive/handle.cpp +++ b/dnn/src/naive/handle.cpp @@ -9,7 +9,6 @@ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. 
*/ - #include "src/naive/handle.h" #include "src/common/handle_impl.h" @@ -140,4 +139,5 @@ MEGDNN_FOREACH_OPR_CLASS(MEGDNN_SPECIALIZE_CREATE_OPERATOR) } // namespace naive } // namespace megdnn -// vim: syntax=cpp.doxygen + +// vim: syntax=cpp.doxygen \ No newline at end of file diff --git a/dnn/src/naive/lstm/opr_impl.cpp b/dnn/src/naive/lstm/opr_impl.cpp index 73f6ac1d..5e11c56b 100644 --- a/dnn/src/naive/lstm/opr_impl.cpp +++ b/dnn/src/naive/lstm/opr_impl.cpp @@ -12,6 +12,9 @@ #include "src/naive/rnn/funcs.h" #include "src/naive/rnn/rnn.h" +#include "midout.h" +MIDOUT_DECL(megdnn_naive_lstm_fwd) + namespace megdnn { namespace naive { using rnn::LSTMCellWeightWrapper; @@ -21,29 +24,32 @@ void LSTMImpl::exec( _megdnn_tensor_in flatten_weights, _megdnn_tensor_out output, _megdnn_tensor_out hy, _megdnn_tensor_out cy, _megdnn_tensor_out reserve_space, _megdnn_workspace workspace) { - auto _param = param(); - size_t D = _param.bidirectional ? 2 : 1; - size_t num_layers = _param.num_layers; - size_t input_size = input.layout.shape[2]; - std::vector cells; - size_t used_workspace_size = rnn::get_cells( - D, num_layers, input_size, _param.hidden_size, _param.bias, cells, - flatten_weights, workspace); + MIDOUT_BEGIN(megdnn_naive_lstm_fwd) { + auto _param = param(); + size_t D = _param.bidirectional ? 2 : 1; + size_t num_layers = _param.num_layers; + size_t input_size = input.layout.shape[2]; + std::vector cells; + size_t used_workspace_size = rnn::get_cells( + D, num_layers, input_size, _param.hidden_size, _param.bias, cells, + flatten_weights, workspace); - Workspace new_workspace( - workspace.raw_ptr + used_workspace_size, - workspace.size - used_workspace_size); - TensorNDArray states = {hx, cx}, states_new = {hy, cy}; - rnn::exec_internal( - cells, input, states, states_new, output, reserve_space, num_layers, D, - this->handle(), new_workspace); + Workspace new_workspace( + workspace.raw_ptr + used_workspace_size, + workspace.size - used_workspace_size); + TensorNDArray states = {hx, cx}, states_new = {hy, cy}; + rnn::exec_internal( + cells, input, states, states_new, output, reserve_space, num_layers, D, + param::RNNCell::NonlineMode::IDENTITY, this->handle(), new_workspace); + } + MIDOUT_END(); } size_t LSTMImpl::get_workspace_in_bytes( - const TensorLayout& input, const TensorLayout& hx, const TensorLayout& cx, - const TensorLayout& flatten_weights, const TensorLayout& output, - const TensorLayout& hy, const TensorLayout& cy, - const TensorLayout& reserve_space) { + const TensorLayout& input, const TensorLayout& /*hx*/, + const TensorLayout& /*cx*/, const TensorLayout& flatten_weights, + const TensorLayout& output, const TensorLayout& /*hy*/, + const TensorLayout& /*cy*/, const TensorLayout& /*reserve_space*/) { size_t workspace_size = rnn::get_workspace_in_bytes( input, flatten_weights, param().hidden_size, param().bidirectional ? 2 : 1, this->handle()); @@ -77,6 +83,7 @@ void LSTMBackwardImpl::exec( size_t num_layers = param().num_layers; size_t D = param().bidirectional ? 
2 : 1; size_t input_size = x.layout.shape[2]; + size_t batch_size = x.layout.shape[1]; size_t hidden_size = param().hidden_size; size_t used_workspace_size = 0; @@ -90,10 +97,27 @@ void LSTMBackwardImpl::exec( Workspace new_workspace = Workspace( workspace.raw_ptr + used_workspace_size, workspace.size - used_workspace_size); + TensorNDArray states = {hx, cx}; + std::vector hx_param; + TensorLayout unfold_hx_layout{ + TensorShape{batch_size, hidden_size}, hx.layout.dtype}; + for (size_t layer = 0; layer < num_layers; ++layer) { + for (size_t d = 0; d < D; ++d) { + TensorNDArray unfold_hx; + size_t idx = layer * D + d; + size_t states_offset = idx * unfold_hx_layout.span().dist_byte(); + for (size_t i = 0; i < states.size(); ++i) { + unfold_hx.push_back(TensorND{ + static_cast(states[i].raw_ptr()) + states_offset, + unfold_hx_layout}); + } + hx_param.push_back(unfold_hx); + } + } used_workspace_size += rnn::get_inputs_for_exec( - x, y, reserve_space, num_layers, D, hidden_size, cells, layer_inputs, - layer_outputs, cell_seq_states, param::RNNCell::NonlineMode::IDENTITY, - new_workspace); + x, y, hx_param, reserve_space, num_layers, D, hidden_size, cells, + layer_inputs, layer_outputs, cell_seq_states, + param::RNNCell::NonlineMode::IDENTITY, new_workspace); // dhy arr, dhx arr TensorNDArray dhy_arr = {dhy, dcy}, dhx_arr = {dhx, dcx}; @@ -110,11 +134,12 @@ void LSTMBackwardImpl::exec( } size_t LSTMBackwardImpl::get_workspace_in_bytes( - const TensorLayout& x, const TensorLayout& y, const TensorLayout& hx, - const TensorLayout& cx, const TensorLayout& dy, const TensorLayout& dhy, - const TensorLayout& dcy, const TensorLayout& flatten_weights, - const TensorLayout& reserve_space, const TensorLayout& dx, - const TensorLayout& dhx, const TensorLayout& dcx, const TensorLayout& dw) { + const TensorLayout& x, const TensorLayout& y, const TensorLayout& /*hx*/, + const TensorLayout& /*cx*/, const TensorLayout& /*dy*/, + const TensorLayout& /*dhy*/, const TensorLayout& /*dcy*/, + const TensorLayout& flatten_weights, const TensorLayout& /*reserve_space*/, + const TensorLayout& /*dx*/, const TensorLayout& /*dhx*/, + const TensorLayout& /*dcx*/, const TensorLayout& /*dw*/) { size_t D = param().bidirectional ? 
2 : 1; size_t num_layers = param().num_layers; size_t hidden_size = param().hidden_size; @@ -142,5 +167,6 @@ size_t LSTMBackwardImpl::get_workspace_in_bytes( return workspace_size; } } // namespace naive +} // namespace megdnn -} // namespace megdnn \ No newline at end of file +// vim: syntax=cpp.doxygen \ No newline at end of file diff --git a/dnn/src/naive/lstm/opr_impl.h b/dnn/src/naive/lstm/opr_impl.h index 9ff5aa9d..6622f51d 100644 --- a/dnn/src/naive/lstm/opr_impl.h +++ b/dnn/src/naive/lstm/opr_impl.h @@ -22,14 +22,16 @@ public: _megdnn_tensor_in input, _megdnn_tensor_in hx, _megdnn_tensor_in cx, _megdnn_tensor_in flatten_weights, _megdnn_tensor_out output, _megdnn_tensor_out hy, _megdnn_tensor_out cy, - _megdnn_tensor_out reserve_space, _megdnn_workspace workspace); + _megdnn_tensor_out reserve_space, _megdnn_workspace workspace) override; size_t get_workspace_in_bytes( const TensorLayout& input, const TensorLayout& hx, const TensorLayout& cx, const TensorLayout& flatten_weights, const TensorLayout& output, const TensorLayout& hy, const TensorLayout& cy, - const TensorLayout& reserve_space); - size_t get_reserve_size_in_bytes(const TensorLayout& input); + const TensorLayout& reserve_space) override; + size_t get_reserve_size_in_bytes(const TensorLayout& input) override; + + bool is_thread_safe() const override { return true; } }; class LSTMBackwardImpl : public LSTMBackward { @@ -42,14 +44,17 @@ public: _megdnn_tensor_in dcy, _megdnn_tensor_in flatten_weights, _megdnn_tensor_in reserve_space, _megdnn_tensor_out dx, _megdnn_tensor_out dhx, _megdnn_tensor_out dcx, _megdnn_tensor_out dw, - _megdnn_workspace workspace); + _megdnn_workspace workspace) override; + + bool is_thread_safe() const override { return true; } virtual size_t get_workspace_in_bytes( const TensorLayout& x, const TensorLayout& y, const TensorLayout& hx, const TensorLayout& cx, const TensorLayout& dy, const TensorLayout& dhy, const TensorLayout& dcy, const TensorLayout& flatten_weights, const TensorLayout& reserve_space, const TensorLayout& dx, - const TensorLayout& dhx, const TensorLayout& dcx, const TensorLayout& dw); + const TensorLayout& dhx, const TensorLayout& dcx, + const TensorLayout& dw) override; }; } // namespace naive diff --git a/dnn/src/naive/lstm/template_impl.cpp b/dnn/src/naive/lstm/template_impl.cpp index 108ee238..541b828b 100644 --- a/dnn/src/naive/lstm/template_impl.cpp +++ b/dnn/src/naive/lstm/template_impl.cpp @@ -19,7 +19,8 @@ void cell_opr_exec( _megdnn_tensor_in input, _megdnn_tensor_in weight_ih, _megdnn_tensor_in weight_hh, _megdnn_tensor_in bias_ih, _megdnn_tensor_in bias_hh, const TensorNDArray& states, - TensorNDArray& states_new, _megdnn_workspace workspace, Handle* handle) { + TensorNDArray& states_new, _megdnn_workspace workspace, + param::RNNCell::NonlineMode /*nonline_mode*/, Handle* handle) { auto opr = handle->create_operator(); TensorLayout gates, h_new, c_new; opr->deduce_layout( diff --git a/dnn/src/naive/lstm_cell/opr_impl.cpp b/dnn/src/naive/lstm_cell/opr_impl.cpp index 10b0dd6b..139a7c81 100644 --- a/dnn/src/naive/lstm_cell/opr_impl.cpp +++ b/dnn/src/naive/lstm_cell/opr_impl.cpp @@ -11,6 +11,9 @@ #include "src/naive/lstm_cell/opr_impl.h" #include "src/common/lstm_cell.h" +#include "midout.h" +MIDOUT_DECL(megdnn_naive_lstmcell_fwd) + namespace megdnn { namespace naive { size_t LSTMCellImpl::get_workspace_in_bytes( @@ -29,9 +32,12 @@ void LSTMCellImpl::exec( _megdnn_tensor_in hx, _megdnn_tensor_in weight_hh, _megdnn_tensor_in bias_hh, _megdnn_tensor_in cx, _megdnn_tensor_out 
h_new, _megdnn_tensor_out c_new, _megdnn_tensor_out gates, _megdnn_workspace workspace) { - megdnn::lstm_cell::exec( - input, weight_ih, bias_ih, hx, weight_hh, bias_hh, cx, h_new, c_new, gates, - workspace, handle()); + MIDOUT_BEGIN(megdnn_naive_lstmcell_fwd) { + megdnn::lstm_cell::exec( + input, weight_ih, bias_ih, hx, weight_hh, bias_hh, cx, h_new, c_new, + gates, workspace, handle()); + } + MIDOUT_END(); } } // namespace naive diff --git a/dnn/src/naive/lstm_cell/opr_impl.h b/dnn/src/naive/lstm_cell/opr_impl.h index 4f56c8eb..3aad44b2 100644 --- a/dnn/src/naive/lstm_cell/opr_impl.h +++ b/dnn/src/naive/lstm_cell/opr_impl.h @@ -30,6 +30,8 @@ public: const TensorLayout& weight_hh, const TensorLayout& bias_hh, const TensorLayout& cx, const TensorLayout& h_new, const TensorLayout& c_new, const TensorLayout& gates) override; + + bool is_thread_safe() const override { return true; } }; } // namespace naive diff --git a/dnn/src/naive/rnn/funcs.h b/dnn/src/naive/rnn/funcs.h index 2ea83416..f157f8a9 100644 --- a/dnn/src/naive/rnn/funcs.h +++ b/dnn/src/naive/rnn/funcs.h @@ -8,8 +8,8 @@ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ -#ifndef _RNN_H -#define _RNN_H +// #ifndef _RNN_H +// #define _RNN_H #include "megdnn/oprs.h" namespace megdnn { namespace naive { @@ -20,7 +20,8 @@ void cell_opr_exec( _megdnn_tensor_in input, _megdnn_tensor_in weight_ih, _megdnn_tensor_in weight_hh, _megdnn_tensor_in bias_ih, _megdnn_tensor_in bias_hh, const TensorNDArray& states, - TensorNDArray& states_new, _megdnn_workspace workspace, Handle* handle); + TensorNDArray& states_new, _megdnn_workspace workspace, + param::RNNCell::NonlineMode nonline_mode, Handle* handle); template size_t cell_opr_get_workspace_in_bytes( @@ -33,29 +34,274 @@ size_t get_workspace_in_bytes( const TensorLayout& input, const TensorLayout& flatten_weights, size_t hidden_size, size_t D, // num_directions - Handle* handle); + Handle* handle) { + size_t seq_len = input.shape[0]; + size_t batch_size = input.shape[1]; + size_t input_size = input.shape[2]; + size_t gate_hidden_size = flatten_weights.shape[0]; + // concat workspace + TensorLayout direction_output_layout{ + TensorShape{seq_len, batch_size, hidden_size}, input.dtype}; + TensorLayout output_layout{{seq_len, batch_size, D * hidden_size}, input.dtype}; + TensorLayoutArray layer_layouts; + for (size_t i = 0; i < D; ++i) + layer_layouts.push_back(direction_output_layout); + auto concat_opr = handle->create_operator(); + concat_opr->param().axis = -1; + size_t concat_workspace = + concat_opr->get_workspace_in_bytes(layer_layouts, output_layout); + // cell workspace + TensorLayout weight_ih{{gate_hidden_size, input_size}, flatten_weights.dtype}; + TensorLayout D_weight_ih{ + {gate_hidden_size, D * hidden_size}, flatten_weights.dtype}; + TensorLayout weight_hh{{gate_hidden_size, hidden_size}, flatten_weights.dtype}; + TensorLayout bias{{gate_hidden_size}, flatten_weights.dtype}; + TensorLayout hx{{batch_size, hidden_size}, input.dtype}; + TensorLayout cell_input = {{input.shape[1], input.shape[2]}, input.dtype}; + TensorLayout D_cell_input = {{input.shape[1], D * hidden_size}, input.dtype}; + + size_t cell_workspace = cell_opr_get_workspace_in_bytes( + cell_input, weight_ih, weight_hh, bias, bias, hx, handle); + size_t D_cell_workspace = cell_opr_get_workspace_in_bytes( + D_cell_input, D_weight_ih, weight_hh, bias, bias, hx, handle); + + return std::max(std::max(cell_workspace, 
D_cell_workspace), concat_workspace); +} template void exec_internal( std::vector& cells, _megdnn_tensor_in input, const TensorNDArray& states, TensorNDArray& states_new, _megdnn_tensor_out output, - _megdnn_tensor_out reserve_space, size_t num_layers, - size_t D, // D is num_directions - Handle* handle, _megdnn_workspace workspace); + _megdnn_tensor_out reserve_space, size_t num_layers, size_t D, + param::RNNCell::NonlineMode nonline_mode, Handle* handle, + _megdnn_workspace workspace) { + size_t seq_len = input.layout.shape[0]; + size_t batch_size = input.layout.shape[1]; + size_t input_size = input.layout.shape[2]; + size_t hidden_size = cells[0].weight_hh.layout.shape[1]; + TensorLayout cell_output_layout{ + TensorShape{batch_size, hidden_size}, states[0].layout.dtype}; + TensorLayout cell_first_input_layout{ + TensorShape{batch_size, input_size}, input.layout.dtype}; + TensorLayout cell_input_layout{ + TensorShape{batch_size, D * hidden_size}, input.layout.dtype}; + TensorLayout direction_output_layout{ + TensorShape{seq_len, batch_size, hidden_size}, output.layout.dtype}; + TensorND tmp_output{workspace.raw_ptr, output.layout}; + _megdnn_workspace new_workspace{ + workspace.raw_ptr + tmp_output.layout.span().dist_byte(), + workspace.size - tmp_output.layout.span().dist_byte()}; + + auto cell_opr = handle->create_operator(); + auto copy_opr = handle->create_operator(); + + // copy states to states_new + for (size_t i = 0; i < states.size(); ++i) + copy_opr->exec(states[i], states_new[i]); + void* reserve_ptr = reserve_space.raw_ptr(); + + // layer 1 + for (size_t d = 0; d < D; ++d) { + size_t cell_idx = d; + auto& cell = cells[cell_idx]; + + TensorNDArray cur_states; + size_t states_offset = cell_idx * cell_output_layout.span().dist_byte(); + for (size_t i = 0; i < states.size(); ++i) { + cur_states.push_back(TensorND{ + static_cast(states_new[i].raw_ptr()) + states_offset, + cell_output_layout}); + } + + for (size_t i = 0; i < seq_len; ++i) { + size_t step = d == 0 ? 
i : seq_len - 1 - i; + TensorND step_input{ + static_cast(input.raw_ptr()) + + step * cell_first_input_layout.span().dist_byte(), + cell_first_input_layout}; + TensorND step_output{ + static_cast(output.raw_ptr()) + + (step * D) * cell_output_layout.span().dist_byte() + + d * cell_output_layout.span().dist_byte() / batch_size, + cell_output_layout}; + TensorNDArray tmp_states; + for (size_t s = 0; s < cur_states.size(); ++s) { + tmp_states.push_back(TensorND{reserve_ptr, cur_states[s].layout}); + size_t size_in_bytes = cur_states[s].layout.span().dist_byte(); + reserve_ptr = static_cast(reserve_ptr) + size_in_bytes; + } + cell_opr_exec( + step_input, cell.weight_ih, cell.weight_hh, cell.bias_ih, + cell.bias_hh, cur_states, tmp_states, new_workspace, nonline_mode, + handle); + for (size_t s = 0; s < tmp_states.size(); ++s) { + copy_opr->exec(tmp_states[s], cur_states[s]); + } + TensorLayout half_output_layout{ + TensorShape{hidden_size}, states[0].layout.dtype}; + if (D == 2) { + for (size_t i = 0; i < batch_size; i++) { + TensorND half_cur_states{ + // output + static_cast(cur_states[0].raw_ptr()) + + i * half_output_layout.span().dist_byte(), + half_output_layout}; + TensorND half_step_output{ + static_cast(step_output.raw_ptr()) + + i * half_output_layout.span().dist_byte() * 2, + half_output_layout}; + copy_opr->exec(half_cur_states, half_step_output); + } + } else + copy_opr->exec(cur_states[0], step_output); + } + } + + for (size_t layer = 1; layer < num_layers; ++layer) { + for (size_t d = 0; d < D; ++d) { + size_t cell_idx = layer * D + d; + auto& cell = cells[cell_idx]; + + TensorNDArray cur_states; + size_t states_offset = cell_idx * cell_output_layout.span().dist_byte(); + for (size_t i = 0; i < states.size(); ++i) { + cur_states.push_back(TensorND{ + static_cast(states_new[i].raw_ptr()) + states_offset, + cell_output_layout}); + } + + for (size_t i = 0; i < seq_len; ++i) { + size_t step = d == 0 ? 
i : seq_len - 1 - i; + TensorND step_input{ + static_cast(output.raw_ptr()) + + step * cell_input_layout.span().dist_byte(), + cell_input_layout}; + TensorND step_output{ + static_cast(tmp_output.raw_ptr()) + + (step * D) * cell_output_layout.span().dist_byte() + + d * cell_output_layout.span().dist_byte() / batch_size, + cell_output_layout}; + TensorNDArray tmp_states; + for (size_t s = 0; s < cur_states.size(); ++s) { + tmp_states.push_back(TensorND{reserve_ptr, cur_states[s].layout}); + size_t size_in_bytes = cur_states[s].layout.span().dist_byte(); + reserve_ptr = static_cast(reserve_ptr) + size_in_bytes; + } + cell_opr_exec( + step_input, cell.weight_ih, cell.weight_hh, cell.bias_ih, + cell.bias_hh, cur_states, tmp_states, new_workspace, + nonline_mode, handle); + // copy states to cur_states + for (size_t s = 0; s < tmp_states.size(); ++s) { + copy_opr->exec(tmp_states[s], cur_states[s]); + } + TensorLayout half_output_layout{ + TensorShape{hidden_size}, states[0].layout.dtype}; + if (D == 2) { + for (size_t i = 0; i < batch_size; i++) { + TensorND half_cur_states{ + // output + static_cast(cur_states[0].raw_ptr()) + + i * half_output_layout.span().dist_byte(), + half_output_layout}; + TensorND half_step_output{ + static_cast(step_output.raw_ptr()) + + i * half_output_layout.span().dist_byte() * 2, + half_output_layout}; + copy_opr->exec(half_cur_states, half_step_output); + } + } else + copy_opr->exec(cur_states[0], step_output); + } + } + copy_opr->exec(tmp_output, output); + } +} template size_t get_cells( size_t D, size_t num_layers, size_t input_size, size_t hidden_size, bool bias, std::vector& cells, _megdnn_tensor_in flatten_weights, - _megdnn_workspace workspace); + _megdnn_workspace workspace) { + cells.reserve(D * num_layers); + void* weight_ptr = flatten_weights.raw_ptr(); + for (size_t layer = 0; layer < num_layers; ++layer) { + for (size_t d = 0; d < D; ++d) { + size_t cell_input_size = D * hidden_size; + if (layer == 0) + cell_input_size = input_size; + Cell cell( + weight_ptr, hidden_size, cell_input_size, bias, + flatten_weights.layout.dtype, workspace); + weight_ptr = + static_cast(weight_ptr) + cell.weight_size_in_bytes(); + cells.push_back(cell); + } + } + return cells[0].workspace_size_in_bytes(); +} template size_t get_inputs_for_exec( - _megdnn_tensor_in x, _megdnn_tensor_in y, _megdnn_tensor_in reserve_space, + _megdnn_tensor_in x, _megdnn_tensor_in y, + const std::vector unfold_hx, _megdnn_tensor_in reserve_space, size_t num_layers, size_t D, size_t hidden_size, const std::vector& cells, TensorNDArray& layer_inputs, TensorNDArray& layer_outputs, std::vector>& cell_seq_states, - param::RNNCell::NonlineMode nonlineMode, _megdnn_workspace workspace); + param::RNNCell::NonlineMode /*nonlineMode*/, _megdnn_workspace workspace) { + // return used workspace size + + layer_inputs.push_back(x); + size_t seq_len = x.layout.shape[0]; + size_t batch_size = x.layout.shape[1]; + size_t num_states = cells[0].num_states(); + TensorLayout cell_output_layout{{batch_size, hidden_size}, y.layout.dtype}; + TensorLayout direction_output_layout{ + {seq_len, batch_size, hidden_size}, y.layout.dtype}; + void* workspace_ptr = workspace.raw_ptr; + + // extract intermedia states from reserve space + for (size_t layer = 0; layer < num_layers; ++layer) { + TensorND layer_output{workspace_ptr, y.layout}; + workspace_ptr = static_cast(workspace_ptr) + + layer_output.layout.span().dist_byte(); + for (size_t d = 0; d < D; ++d) { + cell_seq_states.push_back(std::vector()); + 
cell_seq_states[cell_seq_states.size() - 1].push_back( + {unfold_hx[layer * D + d]}); + // reverse direction is stored with reversed order of sequence order + for (size_t i = 0; i < seq_len; ++i) { + size_t step = i; + if (d == 1) + step = seq_len - i - 1; + size_t offset = ((layer * D + d) * seq_len + step) * + cell_output_layout.span().dist_byte() * num_states; + TensorNDArray cur_states; + for (size_t s = 0; s < num_states; ++s) { + TensorND h{ + static_cast(reserve_space.raw_ptr()) + offset + + s * cell_output_layout.span().dist_byte(), + cell_output_layout}; + cur_states.push_back(h); + } + TensorND hy{ + static_cast(reserve_space.raw_ptr()) + offset, + cell_output_layout}; + // states + cell_seq_states[cell_seq_states.size() - 1].push_back(cur_states); + // output + offset = i * D * cell_output_layout.span().dist_byte(); + memcpy(static_cast(layer_output.raw_ptr()) + offset, + hy.raw_ptr(), hy.layout.span().dist_byte()); + } + } + layer_outputs.push_back(layer_output); + if (layer != num_layers - 1) + layer_inputs.push_back(layer_output); + } + return static_cast(workspace_ptr) - + static_cast((void*)workspace.raw_ptr); +} template void backward_exec_internal( @@ -65,11 +311,182 @@ void backward_exec_internal( const std::vector>& cell_seq_states, _megdnn_tensor_in dy, const TensorNDArray& dhy, _megdnn_tensor_out dx, TensorNDArray& dstates, _megdnn_tensor_out dw, Handle* handle, - _megdnn_workspace workspace); + _megdnn_workspace workspace) { + /* + layer_inputs: array of input of each layer, element 0: [seq_len, batch_size, + input_size], element others: [seq_len, batch_size, D * hidden_size] + layer_outputs: array of outputs of each rnn. To access outputs of the cell at + (layer, d), use layer_outputs[layer]. The shape is [seq_len, batch_size, + output_size(D*hidden_size)] (in sequence order) cell_seq_states: array of states + of each cell at each step. 
To access the states of the cell at (layer, d) at + sequence step (step), use cell_seq_states[layer*D + d][step] + */ + size_t seq_len = layer_inputs[0].layout.shape[0]; + size_t batch_size = layer_inputs[0].layout.shape[1]; + DType dtype = layer_inputs[0].layout.dtype; + size_t cell_y_size = layer_outputs[0].layout.shape[2] / D; + size_t hidden_size = cell_y_size; + TensorLayout cell_y_layout = {{batch_size, cell_y_size}, dtype}; + void* workspace_ptr = workspace.raw_ptr; + + TensorND layer_output_grad{ + workspace_ptr, {{seq_len, batch_size, D * hidden_size}, dtype}}; + workspace_ptr = static_cast(workspace_ptr) + + layer_output_grad.layout.span().dist_byte(); + memcpy(layer_output_grad.raw_ptr(), dy.raw_ptr(), dy.layout.span().dist_byte()); + TensorNDArray direction_dx_arr; + for (size_t i = 0; i < D; ++i) { + TensorLayout direction_dx_layout{{seq_len, batch_size, hidden_size}, dtype}; + direction_dx_arr.push_back(TensorND(workspace_ptr, direction_dx_layout)); + workspace_ptr = static_cast(workspace_ptr) + + direction_dx_layout.span().dist_byte(); + } + TensorNDArray L0_direction_dx_arr; + for (size_t i = 0; i < D; ++i) { + TensorLayout direction_dx_layout{{seq_len, batch_size, input_size}, dtype}; + L0_direction_dx_arr.push_back(TensorND(workspace_ptr, direction_dx_layout)); + workspace_ptr = static_cast(workspace_ptr) + + direction_dx_layout.span().dist_byte(); + } + + std::vector dstates_arr; + for (size_t layer = 0; layer < num_layers; ++layer) { + for (size_t d = 0; d < D; ++d) { + TensorNDArray cell_states; + cell_states.reserve(dstates.size()); + for (size_t i = 0; i < dstates.size(); ++i) { + size_t offset = (layer * D + d) * cell_y_layout.span().dist_byte(); + TensorND dhx_cell{ + static_cast(dstates[i].raw_ptr()) + offset, + cell_y_layout}; + memcpy(dhx_cell.raw_ptr(), + static_cast(dhy[i].raw_ptr()) + offset, + cell_y_layout.span().dist_byte()); + cell_states.emplace_back(dhx_cell); + } + dstates_arr.push_back(cell_states); + } + } + + memset(dw.raw_ptr(), 0, dw.layout.span().dist_byte()); + + std::vector cell_grads; + size_t used_workspace_size = static_cast(workspace_ptr) - + static_cast((void*)(workspace.raw_ptr)); + workspace_ptr = + static_cast(workspace_ptr) + + get_cells( + D, num_layers, input_size, hidden_size, bias, cell_grads, dw, + Workspace( + workspace.raw_ptr + used_workspace_size, + workspace.size - used_workspace_size)); + + auto add_opr = handle->create_operator(); + add_opr->param().mode = Elemwise::Mode::ADD; + auto copy_opr = handle->create_operator(); + + // initialize dx to zero + memset(dx.raw_ptr(), 0, dx.layout.span().dist_byte()); + + // calculate grads + for (int layer = (int)num_layers - 1; layer >= 0; --layer) { + for (int d = (int)D - 1; d >= 0; --d) { + Cell& cell = cells[layer * D + d]; + Cell& cell_grad = cell_grads[layer * D + d]; + size_t input_size = layer_inputs[layer].layout.shape[2]; + const TensorND& x_arr = layer_inputs[layer]; + const TensorND& y_arr = layer_outputs[layer]; + TensorLayout x_layout = {{batch_size, input_size}, dtype}; + + // tmp tensors + void* tmp_workspace_ptr = workspace_ptr; + TensorND dwi_tmp{tmp_workspace_ptr, cell_grad.weight_ih.layout}; + tmp_workspace_ptr = static_cast(tmp_workspace_ptr) + + dwi_tmp.layout.span().dist_byte(); + TensorND dwh_tmp{tmp_workspace_ptr, cell_grad.weight_hh.layout}; + tmp_workspace_ptr = static_cast(tmp_workspace_ptr) + + dwh_tmp.layout.span().dist_byte(); + TensorND dbias_tmp{tmp_workspace_ptr, cell_grad.bias_ih.layout}; + tmp_workspace_ptr = static_cast(tmp_workspace_ptr) + + 
dbias_tmp.layout.span().dist_byte(); + size_t used_workspace_size = + static_cast(tmp_workspace_ptr) - + static_cast((void*)(workspace.raw_ptr)); + + for (size_t i = 0; i < seq_len; ++i) { + size_t step = i; + if (d == 0) + step = seq_len - i - 1; + TensorND x{ + static_cast(x_arr.raw_ptr()) + + step * x_layout.span().dist_byte(), + x_layout}, + y{static_cast(y_arr.raw_ptr()) + + (step * D + d) * cell_y_layout.span().dist_byte(), + cell_y_layout}; + const TensorNDArray& cell_states = cell_seq_states[layer * D + d][step]; + TensorNDArray& dstates_new = dstates_arr[layer * D + d]; + + TensorND dy_t{ + static_cast(layer_output_grad.raw_ptr()) + + (step * D + d) * cell_y_layout.span().dist_byte(), + cell_y_layout}; + add_opr->exec({dstates_new[0], dy_t}, dy_t); + + TensorND dx_t; + if (layer == 0) + dx_t = {static_cast(L0_direction_dx_arr[d].raw_ptr()) + + step * x_layout.span().dist_byte(), + x_layout}; + else + dx_t = {static_cast(direction_dx_arr[d].raw_ptr()) + + step * x_layout.span().dist_byte(), + x_layout}; + TensorNDArray douts = {dy_t}; + for (size_t s = 1; s < dstates_new.size(); ++s) + douts.push_back(dstates_new[s]); + cell.backward( + handle, nonlineMode, x, cell_states, y, douts, dx_t, + dstates_new, dwi_tmp, dwh_tmp, dbias_tmp, + Workspace( + workspace.raw_ptr + used_workspace_size, + workspace.size - used_workspace_size)); + // add step gradient to overall gradient + add_opr->exec({dwi_tmp, cell_grad.weight_ih}, cell_grad.weight_ih); + add_opr->exec({dwh_tmp, cell_grad.weight_hh}, cell_grad.weight_hh); + add_opr->exec({dbias_tmp, cell_grad.bias_ih}, cell_grad.bias_ih); + add_opr->exec({dbias_tmp, cell_grad.bias_hh}, cell_grad.bias_hh); + } + } + // add gradient of different directions to layer_output_grad. + if (layer == 0) { + for (size_t i = 0; i < D; ++i) + add_opr->exec({L0_direction_dx_arr[i], dx}, dx); + } else { + if (D == 1) + copy_opr->exec(direction_dx_arr[0], layer_output_grad); + else { + for (size_t t = 0; t < seq_len; ++t) { + size_t offset = t * D * cell_y_layout.span().dist_byte(); + for (size_t d = 0; d < D; ++d) { + TensorND src{ + static_cast(direction_dx_arr[d].raw_ptr()) + + offset, + cell_y_layout}; + TensorND dst{ + static_cast(layer_output_grad.raw_ptr()) + + offset + d * cell_y_layout.span().dist_byte(), + cell_y_layout}; + copy_opr->exec(src, dst); + } + } + } + } + } +} } // namespace rnn } // namespace naive } // namespace megdnn - -#include "funcs.tpp" -#endif +// #include "funcs.tpp" +// #endif diff --git a/dnn/src/naive/rnn/funcs.tpp b/dnn/src/naive/rnn/funcs.tpp deleted file mode 100644 index 60703f21..00000000 --- a/dnn/src/naive/rnn/funcs.tpp +++ /dev/null @@ -1,449 +0,0 @@ -/** - * \file dnn/src/naive/rnn/funcs.cpp - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- */ -#include "funcs.h" - -namespace megdnn { -namespace naive { -namespace rnn { - -template -size_t get_workspace_in_bytes( - const TensorLayout& input, const TensorLayout& flatten_weights, - size_t hidden_size, - size_t D, // num_directions - Handle* handle) { - size_t seq_len = input.shape[0]; - size_t batch_size = input.shape[1]; - size_t input_size = input.shape[2]; - size_t gate_hidden_size = flatten_weights.shape[0]; - // concat workspace - TensorLayout direction_output_layout{ - TensorShape{seq_len, batch_size, hidden_size}, input.dtype}; - TensorLayout output_layout{{seq_len, batch_size, D * hidden_size}, input.dtype}; - TensorLayoutArray layer_layouts; - for (size_t i = 0; i < D; ++i) - layer_layouts.push_back(direction_output_layout); - auto concat_opr = handle->create_operator(); - concat_opr->param().axis = -1; - size_t concat_workspace = - concat_opr->get_workspace_in_bytes(layer_layouts, output_layout); - // cell workspace - TensorLayout weight_ih{{gate_hidden_size, input_size}, flatten_weights.dtype}; - TensorLayout weight_hh{{gate_hidden_size, hidden_size}, flatten_weights.dtype}; - TensorLayout bias{{gate_hidden_size}, flatten_weights.dtype}; - TensorLayout hx{{batch_size, hidden_size}, input.dtype}; - size_t cell_workspace = cell_opr_get_workspace_in_bytes( - input, weight_ih, weight_hh, bias, bias, hx, handle); - - return std::max(cell_workspace, concat_workspace); -} - -template -void exec_internal( - std::vector& cells, _megdnn_tensor_in input, const TensorNDArray& states, - TensorNDArray& states_new, _megdnn_tensor_out output, - _megdnn_tensor_out reserve_space, size_t num_layers, - size_t D, // D is num_directions - Handle* handle, _megdnn_workspace workspace) { - size_t seq_len = input.layout.shape[0]; - size_t batch_size = input.layout.shape[1]; - size_t input_size = input.layout.shape[2]; - size_t hidden_size = cells[0].weight_hh.layout.shape[1]; - TensorLayout cell_output_layout{ - TensorShape{batch_size, hidden_size}, states[0].layout.dtype}; - TensorLayout cell_first_input_layout{ - TensorShape{batch_size, input_size}, input.layout.dtype}; - TensorLayout cell_input_layout{ - TensorShape{batch_size, D * hidden_size}, input.layout.dtype}; - TensorLayout direction_output_layout{ - TensorShape{seq_len, batch_size, hidden_size}, output.layout.dtype}; - TensorND tmp_output{workspace.raw_ptr, output.layout}; - _megdnn_workspace new_workspace{ - workspace.raw_ptr + tmp_output.layout.span().dist_byte(), - workspace.size - tmp_output.layout.span().dist_byte()}; - - auto cell_opr = handle->create_operator(); - auto copy_opr = handle->create_operator(); - - // copy states to states_new - for (size_t i = 0; i < states.size(); ++i) - copy_opr->exec(states[i], states_new[i]); - void* reserve_ptr = reserve_space.raw_ptr(); - - // layer 1 - // TensorNDArray layer_outputs; - for (size_t d = 0; d < D; ++d) { - size_t cell_idx = d; - auto& cell = cells[cell_idx]; - - TensorNDArray cur_states; - size_t states_offset = cell_idx * cell_output_layout.span().dist_byte(); - for (size_t i = 0; i < states.size(); ++i) { - cur_states.push_back(TensorND{ - static_cast(states_new[i].raw_ptr()) + states_offset, - cell_output_layout}); - } - // TensorND direction_output_tensor{output.raw_ptr + d * - // direction_output_layout.span().dist_byte(), - // direction_output_layout}; - for (size_t i = 0; i < seq_len; ++i) { - size_t step = d == 0 ? 
i : seq_len - 1 - i; - TensorND step_input{ - static_cast(input.raw_ptr()) + - step * cell_first_input_layout.span().dist_byte(), - cell_first_input_layout}; - TensorND step_output{ - static_cast(output.raw_ptr()) + - (step * D + d) * cell_output_layout.span().dist_byte(), - cell_output_layout}; - // temporary states of each step (use reserve space) - TensorNDArray tmp_states; - for (size_t s = 0; s < cur_states.size(); ++s) { - tmp_states.push_back(TensorND{reserve_ptr, cur_states[s].layout}); - size_t size_in_bytes = cur_states[s].layout.span().dist_byte(); - reserve_ptr = static_cast(reserve_ptr) + size_in_bytes; - } - cell_opr_exec( - step_input, cell.weight_ih, cell.weight_hh, cell.bias_ih, - cell.bias_hh, cur_states, tmp_states, new_workspace, handle); - // copy states to cur_states - for (size_t s = 0; s < tmp_states.size(); ++s) { - copy_opr->exec(tmp_states[s], cur_states[s]); - } - // copy h to step output - copy_opr->exec(cur_states[0], step_output); - } - } - - for (size_t layer = 1; layer < num_layers; ++layer) { - // TensorNDArray layer_outputs; - - for (size_t d = 0; d < D; ++d) { - size_t cell_idx = layer * D + d; - auto& cell = cells[cell_idx]; - - TensorNDArray cur_states; - size_t states_offset = cell_idx * cell_output_layout.span().dist_byte(); - for (size_t i = 0; i < states.size(); ++i) { - cur_states.push_back(TensorND{ - static_cast(states_new[i].raw_ptr()) + states_offset, - cell_output_layout}); - } - // TensorND direction_output_tensor{output.raw_ptr + d * - // direction_output_layout.span().dist_byte(), - // direction_output_layout}; - - for (size_t i = 0; i < seq_len; ++i) { - size_t step = d == 0 ? i : seq_len - 1 - i; - TensorND step_input{ - static_cast(output.raw_ptr()) + - step * cell_input_layout.span().dist_byte(), - cell_input_layout}; - TensorND step_output{ - static_cast(tmp_output.raw_ptr()) + - (step * D + d) * cell_output_layout.span().dist_byte(), - cell_output_layout}; - // temporary states of each step (use reserve space) - TensorNDArray tmp_states; - for (size_t s = 0; s < cur_states.size(); ++s) { - tmp_states.push_back(TensorND{reserve_ptr, cur_states[s].layout}); - size_t size_in_bytes = cur_states[s].layout.span().dist_byte(); - reserve_ptr = static_cast(reserve_ptr) + size_in_bytes; - } - cell_opr_exec( - step_input, cell.weight_ih, cell.weight_hh, cell.bias_ih, - cell.bias_hh, cur_states, cur_states, new_workspace, handle); - // copy states to cur_states - for (size_t s = 0; s < tmp_states.size(); ++s) { - copy_opr->exec(tmp_states[s], cur_states[s]); - } - // copy h to step_output - copy_opr->exec(cur_states[0], step_output); - } - } - // copy layer output to output - copy_opr->exec(tmp_output, output); - } - // output: [d0, d1, d0, d1 ...] 
-} - -template -size_t get_cells( - size_t D, size_t num_layers, size_t input_size, size_t hidden_size, bool bias, - std::vector& cells, _megdnn_tensor_in flatten_weights, - _megdnn_workspace workspace) { - cells.reserve(D * num_layers); - void* weight_ptr = flatten_weights.raw_ptr(); - for (size_t layer = 0; layer < num_layers; ++layer) { - for (size_t d = 0; d < D; ++d) { - size_t cell_input_size = D * hidden_size; - if (layer == 0) - cell_input_size = input_size; - Cell cell( - weight_ptr, hidden_size, cell_input_size, bias, - flatten_weights.layout.dtype, workspace); - weight_ptr = - static_cast(weight_ptr) + cell.weight_size_in_bytes(); - cells.push_back(cell); - } - } - // return used workspace - return cells[0].workspace_size_in_bytes(); -} - -template -size_t get_inputs_for_exec( - _megdnn_tensor_in x, _megdnn_tensor_in y, _megdnn_tensor_in reserve_space, - size_t num_layers, size_t D, size_t hidden_size, const std::vector& cells, - TensorNDArray& layer_inputs, TensorNDArray& layer_outputs, - std::vector>& cell_seq_states, - param::RNNCell::NonlineMode nonlineMode, _megdnn_workspace workspace) { - // return used workspace size - - layer_inputs.push_back(x); - size_t seq_len = x.layout.shape[0]; - size_t batch_size = x.layout.shape[1]; - size_t num_states = cells[0].num_states(); - TensorLayout cell_output_layout{{batch_size, hidden_size}, y.layout.dtype}; - TensorLayout direction_output_layout{ - {seq_len, batch_size, hidden_size}, y.layout.dtype}; - void* workspace_ptr = workspace.raw_ptr; - - // extract intermedia states from reserve space - for (int layer = 0; layer < num_layers; ++layer) { - TensorND layer_output{workspace_ptr, y.layout}; - workspace_ptr = static_cast(workspace_ptr) + - layer_output.layout.span().dist_byte(); - for (int d = 0; d < D; ++d) { - cell_seq_states.push_back(std::vector()); - // reverse direction is stored with reversed order of sequence order - for (int i = 0; i < seq_len; ++i) { - size_t step = i; - if (d == 1) - step = seq_len - i - 1; - size_t offset = ((layer * D + d) * seq_len + step) * - cell_output_layout.span().dist_byte() * num_states; - TensorNDArray cur_states; - for (int s = 0; s < num_states; ++s) { - TensorND h{ - static_cast(reserve_space.raw_ptr()) + offset + - s * cell_output_layout.span().dist_byte(), - cell_output_layout}; - cur_states.push_back(h); - } - TensorND hy{ - static_cast(reserve_space.raw_ptr()) + offset, - cell_output_layout}; // the first hidden state is the output - // states - cell_seq_states[cell_seq_states.size() - 1].push_back(cur_states); - // output - offset = i * D * cell_output_layout.span().dist_byte(); - memcpy(static_cast(layer_output.raw_ptr()) + offset, hy.raw_ptr(), - hy.layout.span().dist_byte()); - } - } - layer_outputs.push_back(layer_output); - if (layer != num_layers - 1) - layer_inputs.push_back(layer_output); - } - return static_cast(workspace_ptr) - - static_cast((void*)workspace.raw_ptr); -} - -template -// using Cell = RNNCellWeightWrapper; -void backward_exec_internal( - std::vector& cells, size_t D, size_t num_layers, size_t input_size, - bool bias, param::RNNCell::NonlineMode nonlineMode, - const TensorNDArray& layer_inputs, const TensorNDArray& layer_outputs, - const std::vector>& cell_seq_states, - _megdnn_tensor_in dy, const TensorNDArray& dhy, _megdnn_tensor_out dx, - TensorNDArray& dstates, _megdnn_tensor_out dw, Handle* handle, - _megdnn_workspace workspace) { - /* - layer_inputs: array of input of each layer, element 0: [seq_len, batch_size, - input_size], element others: [seq_len, 
batch_size, D * hidden_size] - layer_outputs: array of outputs of each rnn. To access outputs of the cell at - (layer, d), use layer_outputs[layer]. The shape is [seq_len, batch_size, - output_size(D*hidden_size)] (in sequence order) cell_seq_states: arrray of states - of each cell at each step. To access the states of the cell at (layer, d) at - sequence step (step), use cell_seq_states[layer*D + d][step] - */ - size_t seq_len = layer_inputs[0].layout.shape[0]; - size_t batch_size = layer_inputs[0].layout.shape[1]; - DType dtype = layer_inputs[0].layout.dtype; - size_t cell_y_size = - layer_outputs[0].layout.shape[2] / D; // should all be the same - size_t hidden_size = cell_y_size; - TensorLayout cell_y_layout = {{batch_size, cell_y_size}, dtype}; - void* workspace_ptr = workspace.raw_ptr; - - TensorND layer_output_grad{ - workspace_ptr, {{seq_len, batch_size, D * hidden_size}, dtype}}; - workspace_ptr = static_cast(workspace_ptr) + - layer_output_grad.layout.span().dist_byte(); - memcpy(layer_output_grad.raw_ptr(), dy.raw_ptr(), dy.layout.span().dist_byte()); - TensorNDArray direction_dx_arr; // for layer 1 to layer num_layers-1 - for (int i = 0; i < D; ++i) { - TensorLayout direction_dx_layout{{seq_len, batch_size, hidden_size}, dtype}; - direction_dx_arr.push_back(TensorND(workspace_ptr, direction_dx_layout)); - workspace_ptr = static_cast(workspace_ptr) + - direction_dx_layout.span().dist_byte(); - } - TensorNDArray L0_direction_dx_arr; - for (int i = 0; i < D; ++i) { - TensorLayout direction_dx_layout{{seq_len, batch_size, input_size}, dtype}; - L0_direction_dx_arr.push_back(TensorND(workspace_ptr, direction_dx_layout)); - workspace_ptr = static_cast(workspace_ptr) + - direction_dx_layout.span().dist_byte(); - } - // cell states for each layer and each direction - std::vector dstates_arr; - for (int layer = 0; layer < num_layers; ++layer) { - for (int d = 0; d < D; ++d) { - TensorNDArray cell_states; - cell_states.reserve(dstates.size()); - for (int i = 0; i < dstates.size(); ++i) { - size_t offset = (layer * D + d) * cell_y_layout.span().dist_byte(); - TensorND dhx_cell{ - static_cast(dstates[i].raw_ptr()) + offset, - cell_y_layout}; - memcpy(dhx_cell.raw_ptr(), static_cast(dhy[i].raw_ptr()) + offset, - cell_y_layout.span().dist_byte()); - cell_states.emplace_back(dhx_cell); - } - dstates_arr.push_back(cell_states); - } - } - - // init gradient on weight to zero - memset(dw.raw_ptr(), 0, dw.layout.span().dist_byte()); - // use cells to contain gradient - std::vector cell_grads; - size_t used_workspace_size = static_cast(workspace_ptr) - - static_cast((void*)(workspace.raw_ptr)); - workspace_ptr = - static_cast(workspace_ptr) + - get_cells( - D, num_layers, input_size, hidden_size, bias, cell_grads, dw, - Workspace( - workspace.raw_ptr + used_workspace_size, - workspace.size - used_workspace_size)); - - auto add_opr = handle->create_operator(); - add_opr->param().mode = Elemwise::Mode::ADD; - auto copy_opr = handle->create_operator(); - - // initialize dx to zero - memset(dx.raw_ptr(), 0, dx.layout.span().dist_byte()); - - // calculate grads - for (int layer = num_layers - 1; layer >= 0; --layer) { - for (int d = D - 1; d >= 0; --d) { - Cell& cell = cells[layer * D + d]; - Cell& cell_grad = cell_grads[layer * D + d]; - size_t input_size = layer_inputs[layer].layout.shape[2]; - const TensorND& x_arr = layer_inputs[layer]; - const TensorND& y_arr = layer_outputs[layer]; - TensorLayout x_layout = {{batch_size, input_size}, dtype}; - - // tmp tensors - void* tmp_workspace_ptr = 
workspace_ptr; - TensorND dwi_tmp{tmp_workspace_ptr, cell_grad.weight_ih.layout}; - tmp_workspace_ptr = static_cast(tmp_workspace_ptr) + - dwi_tmp.layout.span().dist_byte(); - TensorND dwh_tmp{tmp_workspace_ptr, cell_grad.weight_hh.layout}; - tmp_workspace_ptr = static_cast(tmp_workspace_ptr) + - dwh_tmp.layout.span().dist_byte(); - TensorND dbias_tmp{tmp_workspace_ptr, cell_grad.bias_ih.layout}; - tmp_workspace_ptr = static_cast(tmp_workspace_ptr) + - dbias_tmp.layout.span().dist_byte(); - size_t used_workspace_size = - static_cast(tmp_workspace_ptr) - - static_cast((void*)(workspace.raw_ptr)); - - for (int i = 0; i < seq_len; ++i) { - // reverse time step (not seq step). Here step means seq step - size_t step = i; - if (d == 0) - step = seq_len - i - 1; - TensorND x{ - static_cast(x_arr.raw_ptr()) + - step * x_layout.span().dist_byte(), - x_layout}, - y{static_cast(y_arr.raw_ptr()) + - (step * D + d) * cell_y_layout.span().dist_byte(), - cell_y_layout}; - const TensorNDArray& cell_states = cell_seq_states[layer * D + d][step]; - TensorNDArray& dstates_new = dstates_arr[layer * D + d]; - // dy should be d_output + d_hidden - TensorND dy_t{ - static_cast(layer_output_grad.raw_ptr()) + - (step * D + d) * cell_y_layout.span().dist_byte(), - cell_y_layout}; - add_opr->exec({dstates_new[0], dy_t}, dy_t); - // dx for layer 0 has a different size - TensorND dx_t; - if (layer == 0) - dx_t = {static_cast(L0_direction_dx_arr[d].raw_ptr()) + - step * x_layout.span().dist_byte(), - x_layout}; - else - dx_t = {static_cast(direction_dx_arr[d].raw_ptr()) + - step * x_layout.span().dist_byte(), - x_layout}; - TensorNDArray douts = {dy_t}; - for (int s = 1; s < dstates_new.size(); ++s) - douts.push_back(dstates_new[s]); - cell.backward( - handle, nonlineMode, x, cell_states, y, douts, dx_t, - dstates_new, dwi_tmp, dwh_tmp, dbias_tmp, - Workspace( - workspace.raw_ptr + used_workspace_size, - workspace.size - used_workspace_size)); - // add step gradient to overall gradient - add_opr->exec({dwi_tmp, cell_grad.weight_ih}, cell_grad.weight_ih); - add_opr->exec({dwh_tmp, cell_grad.weight_hh}, cell_grad.weight_hh); - add_opr->exec({dbias_tmp, cell_grad.bias_ih}, cell_grad.bias_ih); - add_opr->exec({dbias_tmp, cell_grad.bias_hh}, cell_grad.bias_hh); - } - } - // add gradient of different directions to layer_output_grad. - // Layer 0 to dx - if (layer == 0) { - for (int i = 0; i < D; ++i) - add_opr->exec({L0_direction_dx_arr[i], dx}, dx); - } else { - if (D == 1) - copy_opr->exec(direction_dx_arr[0], layer_output_grad); - else { // D == 2, arrange as [(d0, d1), (d0, d1), ...] 
- for (size_t t = 0; t < seq_len; ++t) { - size_t offset = t * D * cell_y_layout.span().dist_byte(); - for (size_t d = 0; d < D; ++d) { - TensorND src{ - static_cast(direction_dx_arr[d].raw_ptr()) + - offset, - cell_y_layout}; - TensorND dst{ - static_cast(layer_output_grad.raw_ptr()) + - offset + d * cell_y_layout.span().dist_byte(), - cell_y_layout}; - copy_opr->exec(src, dst); - } - } - } - } - } -} - -} // namespace rnn -} // namespace naive -} // namespace megdnn diff --git a/dnn/src/naive/rnn/opr_impl.cpp b/dnn/src/naive/rnn/opr_impl.cpp index aaa8b4a1..4379cf1e 100644 --- a/dnn/src/naive/rnn/opr_impl.cpp +++ b/dnn/src/naive/rnn/opr_impl.cpp @@ -22,6 +22,9 @@ #include +#include "midout.h" +MIDOUT_DECL(megdnn_naive_rnn_fwd) + namespace megdnn { namespace naive { @@ -32,33 +35,40 @@ void RNNImpl::exec( _megdnn_tensor_in flatten_weights, _megdnn_tensor_out output, _megdnn_tensor_out hy, _megdnn_tensor_out reserve_space, _megdnn_workspace workspace) { - auto _param = param(); - size_t D = _param.bidirectional ? 2 : 1; - size_t num_layers = _param.num_layers; - size_t input_size = input.layout.shape[2]; - std::vector cells; - size_t used_workspace_size = rnn::get_cells( - D, num_layers, input_size, _param.hidden_size, _param.bias, cells, - flatten_weights, workspace); - - Workspace new_workspace( - workspace.raw_ptr + used_workspace_size, - workspace.size - used_workspace_size); - TensorNDArray states, states_new; - states.push_back(hx); - states_new.push_back(hy); - rnn::exec_internal( - cells, input, states, states_new, output, reserve_space, num_layers, D, - this->handle(), new_workspace); + MIDOUT_BEGIN(megdnn_naive_rnn_fwd) { + auto _param = param(); + size_t D = _param.bidirectional ? 2 : 1; + size_t num_layers = _param.num_layers; + size_t input_size = input.layout.shape[2]; + std::vector cells; + size_t used_workspace_size = rnn::get_cells( + D, num_layers, input_size, _param.hidden_size, _param.bias, cells, + flatten_weights, workspace); + + Workspace new_workspace( + workspace.raw_ptr + used_workspace_size, + workspace.size - used_workspace_size); + TensorNDArray states, states_new; + states.push_back(hx); + states_new.push_back(hy); + rnn::exec_internal( + cells, input, states, states_new, output, reserve_space, num_layers, D, + _param.nonlineMode, this->handle(), new_workspace); + } + MIDOUT_END(); } size_t RNNImpl::get_workspace_in_bytes( const TensorLayout& input, const TensorLayout& hx, const TensorLayout& flatten_weights, const TensorLayout& output, - const TensorLayout& hy, const TensorLayout& reserve_space) { + const TensorLayout& /*hy*/, const TensorLayout& /*reserve_space*/) { + auto _param = param(); + size_t D = _param.bidirectional ? 2 : 1; + size_t last_dim = std::max(input.shape[2], D * hx.shape[1]); + TensorLayout last_input = {{input.shape[0], input.shape[1], last_dim}, input.dtype}; size_t workspace_size = rnn::get_workspace_in_bytes( - input, flatten_weights, param().hidden_size, param().bidirectional ? 2 : 1, - this->handle()); + last_input, flatten_weights, param().hidden_size, + param().bidirectional ? 
2 : 1, this->handle()); if (!param().bias) { // use fake bias (all 0) TensorLayout bias_layout = {{param().hidden_size}, flatten_weights.dtype}; workspace_size += bias_layout.span().dist_byte(); @@ -82,50 +92,23 @@ void RNNBackwardImpl::exec( _megdnn_tensor_in reserve_space, _megdnn_tensor_out dx, _megdnn_tensor_out dhx, _megdnn_tensor_out dw, _megdnn_workspace workspace) { TensorNDArray layer_inputs; - // layer_inputs.push_back(x); TensorNDArray layer_outputs; std::vector> cell_seq_states; size_t num_layers = param().num_layers; size_t D = param().bidirectional ? 2 : 1; - // size_t seq_len = x.layout.shape[0]; - // size_t batch_size = x.layout.shape[1]; size_t input_size = x.layout.shape[2]; + size_t batch_size = x.layout.shape[1]; size_t hidden_size = param().hidden_size; size_t used_workspace_size = 0; // get cells std::vector cells; - // workspace_ptr = static_cast(workspace_ptr) + used_workspace_size += rnn::get_cells( D, num_layers, input_size, hidden_size, param().bias, cells, flatten_weights, workspace); - // extract intermedia states from reserve space - /*for (int layer = 0; layer < num_layers; ++layer) { - TensorND layer_output{workspace_ptr, y.layout}; - workspace_ptr = static_cast(workspace_ptr) + - layer_output.layout.span().dist_byte(); for (int d = 0; d < D; ++d) { - cell_seq_states.push_back(std::vector()); - // reverse direction is stored with reversed order of sequence order - for (int i = 0; i < seq_len; ++i) { - size_t step = i; - if (d == 1) step = seq_len - i - 1; - size_t offset = ((layer * D + d) * seq_len + step) * - cell_output_layout.span().dist_byte(); TensorND - hy{static_cast(reserve_space.raw_ptr) + offset, cell_output_layout}; - // states - cell_seq_states[cell_seq_states.size() - 1].push_back({hy}); - // output - offset = i * D * cell_output_layout.span().dist_byte(); - memcpy(static_cast(layer_output.raw_ptr) + offset, - hy.raw_ptr, hy.layout.span().dist_byte()); - } - } - cell_seq_outputs.push_back(layer_output); - if (layer != num_layers - 1) layer_inputs.push_back(layer_output); - }*/ // nonlinear mode - param::RNNCell::NonlineMode nonlineMode; + param::RNNCell::NonlineMode nonlineMode = param::RNNCell::NonlineMode::TANH; using ModeRNN = param::RNN::NonlineMode; using ModeRNNCell = param::RNNCell::NonlineMode; switch (param().nonlineMode) { @@ -135,22 +118,34 @@ void RNNBackwardImpl::exec( case ModeRNN::TANH: nonlineMode = ModeRNNCell::TANH; break; + case ModeRNN::IDENTITY: + break; } // get formatted inputs Workspace new_workspace = Workspace( workspace.raw_ptr + used_workspace_size, workspace.size - used_workspace_size); + TensorLayout unfold_hx_layout{ + TensorShape{batch_size, hidden_size}, hx.layout.dtype}; + std::vector hx_param; + for (size_t layer = 0; layer < num_layers; ++layer) { + for (size_t d = 0; d < D; ++d) { + TensorNDArray unfold_hx; + size_t idx = layer * D + d; + size_t states_offset = idx * unfold_hx_layout.span().dist_byte(); + unfold_hx.push_back(TensorND{ + static_cast(hx.raw_ptr()) + states_offset, + unfold_hx_layout}); + hx_param.push_back(unfold_hx); + } + } used_workspace_size += rnn::get_inputs_for_exec( - x, y, reserve_space, num_layers, D, hidden_size, cells, layer_inputs, - layer_outputs, cell_seq_states, nonlineMode, new_workspace); + x, y, hx_param, reserve_space, num_layers, D, hidden_size, cells, + layer_inputs, layer_outputs, cell_seq_states, nonlineMode, new_workspace); - // dhy arr, dhx arr TensorNDArray dhy_arr = {dhy}, dhx_arr = {dhx}; - // exec - /*size_t used_workspace_size = static_cast(workspace_ptr) - - 
static_cast((void*)workspace.raw_ptr);*/ new_workspace = Workspace( workspace.raw_ptr + used_workspace_size, workspace.size - used_workspace_size); @@ -161,10 +156,11 @@ void RNNBackwardImpl::exec( } size_t RNNBackwardImpl::get_workspace_in_bytes( - const TensorLayout& x, const TensorLayout& y, const TensorLayout& hx, - const TensorLayout& dy, const TensorLayout& dhy, - const TensorLayout& flatten_weights, const TensorLayout& reserve_space, - const TensorLayout& dx, const TensorLayout& dhx, const TensorLayout& dw) { + const TensorLayout& x, const TensorLayout& y, const TensorLayout& /*hx*/, + const TensorLayout& /*dy*/, const TensorLayout& /*dhy*/, + const TensorLayout& flatten_weights, const TensorLayout& /*reserve_space*/, + const TensorLayout& /*dx*/, const TensorLayout& /*dhx*/, + const TensorLayout& /*dw*/) { size_t D = param().bidirectional ? 2 : 1; size_t num_layers = param().num_layers; size_t hidden_size = param().hidden_size; diff --git a/dnn/src/naive/rnn/opr_impl.h b/dnn/src/naive/rnn/opr_impl.h index 8ae6809c..b52edbd1 100644 --- a/dnn/src/naive/rnn/opr_impl.h +++ b/dnn/src/naive/rnn/opr_impl.h @@ -22,13 +22,14 @@ public: _megdnn_tensor_in input, _megdnn_tensor_in hx, _megdnn_tensor_in flatten_weights, _megdnn_tensor_out output, _megdnn_tensor_out hy, _megdnn_tensor_out reserve_space, - _megdnn_workspace workspace); + _megdnn_workspace workspace) override; size_t get_workspace_in_bytes( const TensorLayout& input, const TensorLayout& hx, const TensorLayout& flatten_weights, const TensorLayout& output, - const TensorLayout& hy, const TensorLayout& reserve_space); - size_t get_reserve_size_in_bytes(const TensorLayout& input); + const TensorLayout& hy, const TensorLayout& reserve_space) override; + size_t get_reserve_size_in_bytes(const TensorLayout& input) override; + bool is_thread_safe() const override { return true; } }; class RNNBackwardImpl : public RNNBackward { @@ -40,13 +41,15 @@ public: _megdnn_tensor_in dy, _megdnn_tensor_in dhy, _megdnn_tensor_in flatten_weights, _megdnn_tensor_in reserve_space, _megdnn_tensor_out dx, _megdnn_tensor_out dhx, _megdnn_tensor_out dw, - _megdnn_workspace workspace); + _megdnn_workspace workspace) override; virtual size_t get_workspace_in_bytes( const TensorLayout& x, const TensorLayout& y, const TensorLayout& hx, const TensorLayout& dy, const TensorLayout& dhy, const TensorLayout& flatten_weights, const TensorLayout& reserve_space, - const TensorLayout& dx, const TensorLayout& dhx, const TensorLayout& dw); + const TensorLayout& dx, const TensorLayout& dhx, + const TensorLayout& dw) override; + bool is_thread_safe() const override { return true; } }; } // namespace naive diff --git a/dnn/src/naive/rnn/rnn.cpp b/dnn/src/naive/rnn/rnn.cpp index 8ea43496..d1889f51 100644 --- a/dnn/src/naive/rnn/rnn.cpp +++ b/dnn/src/naive/rnn/rnn.cpp @@ -218,7 +218,11 @@ void LSTMCellWeightWrapper::backward( x, weight_ih, bias_ih, states[0], weight_hh, bias_hh, states[1], dstates[0], dstates[1], gates_tensor, new_workspace); // no information left in the workspace - // i, f, o, g + + // BUG: The order of gate_grad if i_g f_g o_g g_g , but it should be i_g f_g g_g o_g + // The returned gradient includes both horizontal and vertical gradients, + // horizontal grad = douts[1] vertical gradients = douts[1] + // Here the variable is confusing !!! 
TensorLayout single_gate = {{gates.shape[0], gates.shape[1] / 4}, gates.dtype}; TensorND i, f, o, g, i_grad, f_grad, o_grad, g_grad; // grad refers to the grad of gates before activation @@ -239,8 +243,8 @@ void LSTMCellWeightWrapper::backward( g_grad = { static_cast(o_grad.raw_ptr()) + single_gate.span().dist_byte(), single_gate}; - // activation auto elem_opr = handle->create_operator(); + elem_opr->param().mode = Elemwise::Mode::SIGMOID; elem_opr->exec({i}, i); elem_opr->exec({f}, f); @@ -254,8 +258,8 @@ void LSTMCellWeightWrapper::backward( mul_opr->exec({douts[0], tanh_cy}, dstates[0]); elem_opr->param().mode = Elemwise::Mode::SIGMOID_GRAD; elem_opr->exec({o, dstates[0]}, o_grad); // grad of gate o - // use dstates[0] as tmp tensor to store dhy * o mul_opr->exec({douts[0], o}, dstates[0]); + elem_opr->param().mode = Elemwise::Mode::TANH_GRAD; elem_opr->exec({tanh_cy, dstates[0]}, dstates[1]); // grad of cy from hy elem_opr->param().mode = Elemwise::Mode::ADD; diff --git a/dnn/src/naive/rnn/rnn.h b/dnn/src/naive/rnn/rnn.h index bb5fdf08..cbe2d464 100644 --- a/dnn/src/naive/rnn/rnn.h +++ b/dnn/src/naive/rnn/rnn.h @@ -38,6 +38,7 @@ public: _megdnn_tensor_out dwi, _megdnn_tensor_out dwh, _megdnn_tensor_out dbias, _megdnn_workspace workspace) const; virtual size_t num_states() const; + virtual ~CellWeightsWrapperBase() {} }; class RNNCellWeightWrapper : public CellWeightsWrapperBase { diff --git a/dnn/src/naive/rnn/template_impl.cpp b/dnn/src/naive/rnn/template_impl.cpp index d3f870ad..b82e72a2 100644 --- a/dnn/src/naive/rnn/template_impl.cpp +++ b/dnn/src/naive/rnn/template_impl.cpp @@ -19,8 +19,10 @@ void cell_opr_exec( _megdnn_tensor_in input, _megdnn_tensor_in weight_ih, _megdnn_tensor_in weight_hh, _megdnn_tensor_in bias_ih, _megdnn_tensor_in bias_hh, const TensorNDArray& states, - TensorNDArray& states_new, _megdnn_workspace workspace, Handle* handle) { + TensorNDArray& states_new, _megdnn_workspace workspace, + param::RNNCell::NonlineMode nonline_mode, Handle* handle) { auto opr = handle->create_operator(); + opr->param().nonlineMode = nonline_mode; opr->exec( input, weight_ih, bias_ih, states[0], weight_hh, bias_hh, states_new[0], workspace); diff --git a/dnn/src/naive/rnn_cell/opr_impl.cpp b/dnn/src/naive/rnn_cell/opr_impl.cpp index 8a3d692c..85eccf99 100644 --- a/dnn/src/naive/rnn_cell/opr_impl.cpp +++ b/dnn/src/naive/rnn_cell/opr_impl.cpp @@ -11,6 +11,9 @@ #include "src/naive/rnn_cell/opr_impl.h" #include "src/common/rnn_cell.h" +#include "midout.h" +MIDOUT_DECL(megdnn_naive_rnncell_fwd) + namespace megdnn { namespace naive { size_t RNNCellImpl::get_workspace_in_bytes( @@ -26,9 +29,12 @@ void RNNCellImpl::exec( _megdnn_tensor_in input, _megdnn_tensor_in weight_ih, _megdnn_tensor_in bias_ih, _megdnn_tensor_in hx, _megdnn_tensor_in weight_hh, _megdnn_tensor_in bias_hh, _megdnn_tensor_out dst, _megdnn_workspace workspace) { - megdnn::rnn_cell::exec( - input, weight_ih, bias_ih, hx, weight_hh, bias_hh, dst, workspace, - param().nonlineMode, handle()); + MIDOUT_BEGIN(megdnn_naive_rnncell_fwd) { + megdnn::rnn_cell::exec( + input, weight_ih, bias_ih, hx, weight_hh, bias_hh, dst, workspace, + param().nonlineMode, handle()); + } + MIDOUT_END(); } } // namespace naive } // namespace megdnn \ No newline at end of file diff --git a/dnn/src/naive/rnn_cell/opr_impl.h b/dnn/src/naive/rnn_cell/opr_impl.h index fe73edb5..59e34cc3 100644 --- a/dnn/src/naive/rnn_cell/opr_impl.h +++ b/dnn/src/naive/rnn_cell/opr_impl.h @@ -27,6 +27,7 @@ public: const TensorLayout& bias_ih, const TensorLayout& hx, 
const TensorLayout& weight_hh, const TensorLayout& bias_hh, const TensorLayout& dst) override; + bool is_thread_safe() const override { return true; } }; } // namespace naive diff --git a/dnn/test/common/deduce_layout_proxy.h b/dnn/test/common/deduce_layout_proxy.h index 26080d70..5992e779 100644 --- a/dnn/test/common/deduce_layout_proxy.h +++ b/dnn/test/common/deduce_layout_proxy.h @@ -77,17 +77,18 @@ struct DeduceLayoutProxy { }; template -struct DeduceLayoutProxy { - static void deduce_layout(Opr* opr, TensorLayoutArray& layouts) { - megdnn_assert(layouts.size() == 6); - opr->deduce_layout( - layouts[0], layouts[1], layouts[2], layouts[3], layouts[4], layouts[5]); - } +struct DeduceLayoutProxy { + static void deduce_layout(Opr*, TensorLayoutArray&) {} }; template -struct DeduceLayoutProxy { - static void deduce_layout(Opr*, TensorLayoutArray&) {} +struct DeduceLayoutProxy { + static void deduce_layout(Opr* opr, TensorLayoutArray& layouts) { + megdnn_assert(layouts.size() == 7); + opr->deduce_layout( + layouts[0], layouts[1], layouts[2], layouts[3], layouts[4], layouts[5], + layouts[6]); + } }; template @@ -109,6 +110,38 @@ struct DeduceLayoutProxy { layouts[6], layouts[7], layouts[8]); } }; + +template +struct DeduceLayoutProxy { + static void deduce_layout(Opr* opr, TensorLayoutArray& layouts) { + megdnn_assert(layouts.size() == 10); + opr->deduce_layout( + layouts[0], layouts[1], layouts[2], layouts[3], layouts[4], layouts[5], + layouts[6], layouts[7], layouts[8], layouts[9]); + } +}; + +template +struct DeduceLayoutProxy { + static void deduce_layout(Opr*, TensorLayoutArray&) {} +}; + +template +struct DeduceLayoutProxy { + static void deduce_layout(Opr* opr, TensorLayoutArray& layouts) { + megdnn_assert(layouts.size() == 13); + opr->deduce_layout( + layouts[0], layouts[1], layouts[2], layouts[3], layouts[4], layouts[5], + layouts[6], layouts[7], layouts[8], layouts[9], layouts[10], + layouts[11], layouts[12]); + } +}; + +template +struct DeduceLayoutProxy { + static void deduce_layout(Opr*, TensorLayoutArray&) {} +}; + } // namespace test } // namespace megdnn diff --git a/dnn/test/common/exec_proxy.h b/dnn/test/common/exec_proxy.h index 4255afeb..da8c9c46 100644 --- a/dnn/test/common/exec_proxy.h +++ b/dnn/test/common/exec_proxy.h @@ -23,6 +23,44 @@ template struct ExecProxy; template +struct ExecProxy { + WorkspaceWrapper W; + void exec(Opr* opr, const TensorNDArray& tensors) { + if (!W.valid()) { + W = WorkspaceWrapper(opr->handle(), 0); + } + W.update(opr->get_workspace_in_bytes( + tensors[0].layout, tensors[1].layout, tensors[2].layout, + tensors[3].layout, tensors[4].layout, tensors[5].layout, + tensors[6].layout, tensors[7].layout, tensors[8].layout, + tensors[9].layout, tensors[10].layout, tensors[11].layout, + tensors[12].layout)); + opr->exec( + tensors[0], tensors[1], tensors[2], tensors[3], tensors[4], tensors[5], + tensors[6], tensors[7], tensors[8], tensors[9], tensors[10], + tensors[11], tensors[12], W.workspace()); + } +}; + +template +struct ExecProxy { + WorkspaceWrapper W; + void exec(Opr* opr, const TensorNDArray& tensors) { + if (!W.valid()) { + W = WorkspaceWrapper(opr->handle(), 0); + } + W.update(opr->get_workspace_in_bytes( + tensors[0].layout, tensors[1].layout, tensors[2].layout, + tensors[3].layout, tensors[4].layout, tensors[5].layout, + tensors[6].layout, tensors[7].layout, tensors[8].layout, + tensors[9].layout)); + opr->exec( + tensors[0], tensors[1], tensors[2], tensors[3], tensors[4], tensors[5], + tensors[6], tensors[7], tensors[8], tensors[9], 
W.workspace()); + } +}; + +template struct ExecProxy { WorkspaceWrapper W; void exec(Opr* opr, const TensorNDArray& tensors) { diff --git a/dnn/test/common/rnn.h b/dnn/test/common/rnn.h deleted file mode 100644 index 691b2cae..00000000 --- a/dnn/test/common/rnn.h +++ /dev/null @@ -1,51 +0,0 @@ -/** - * \file dnn/test/common/rnn.h - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -#pragma once -#include - -#include "megdnn/basic_types.h" -#include "megdnn/opr_param_defs.h" - -namespace megdnn { -namespace test { -namespace rnn { -struct TestArg { - param::RNN param; - TensorShape input, hx, flatten_weights; - TestArg(param::RNN param, TensorShape input, TensorShape hx, - TensorShape flatten_weights) - : param(param), input(input), hx(hx), flatten_weights(flatten_weights) {} -}; - -inline std::vector get_args() { - std::vector args; - size_t batch_size = 2; - size_t input_size = 3; - size_t hidden_size = 2; - size_t seq_len = 2; - size_t gate_hidden_size = hidden_size; - param::RNN param; - param.num_layers = 1; - param.bidirectional = false; - param.bias = false; - param.hidden_size = hidden_size; - param.nonlineMode = param::RNN::NonlineMode::RELU; - - args.emplace_back( - param, TensorShape{seq_len, batch_size, input_size}, - TensorShape{batch_size, hidden_size}, - TensorShape{gate_hidden_size, input_size + hidden_size}); - return args; -} - -} // namespace rnn -} // namespace test -} // namespace megdnn \ No newline at end of file diff --git a/dnn/test/naive/lstm.cpp b/dnn/test/naive/lstm.cpp new file mode 100644 index 00000000..96f51ca9 --- /dev/null +++ b/dnn/test/naive/lstm.cpp @@ -0,0 +1,197 @@ +/** + * \file dnn/test/naive/lstm.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// #include "test/common/lstm.h" +#include "megdnn/dtype.h" +#include "megdnn/oprs.h" +#include "test/common/checker.h" +#include "test/naive/fixture.h" + +namespace megdnn { +namespace test { + +TEST_F(NAIVE, LSTM_FORWARD) { + Checker checker(handle(), true); + size_t batch_size = 2; + size_t input_size = 3; + size_t hidden_size = 2; + size_t seq_len = 2; + size_t gate_hidden_size = 4 * hidden_size; + LSTM::Param param; + param.num_layers = 1; + param.bidirectional = false; + param.bias = false; + param.hidden_size = hidden_size; + checker.set_param(param).exect( + Testcase{ + TensorValue( + {seq_len, batch_size, input_size}, dtype::Float32(), + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}), // input + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {1, 2, 3, 4}), // hx + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {2, 3, 4, 5}), // cx + TensorValue( + {gate_hidden_size, input_size + hidden_size}, + dtype::Float32(), + {3, 6, 1, 3, 2, 7, 2, 1, 3, 2, 1, 1, 3, 6, + 1, 3, 2, 7, 2, 1, 3, 2, 1, 1, 9, 3, 5, 1, + 9, 3, 5, 1, 9, 3, 5, 1, 9, 3, 5, 1}), // flattern weights + {}, + {}, + {}, + {}}, + Testcase{ + {}, + {}, + {}, + {}, + TensorValue( + {seq_len, batch_size, hidden_size}, dtype::Float32(), + {0.9951, 0.9993, 0.9999, 1.0000, 0.9993, 0.9999, 1.0000, + 1.0000}), // output + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {0.9993, 0.9999, 1.0000, 1.0000}), // hy + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {4.0000, 5.0000, 6.0000, 7.0000}), // cy + TensorValue( + {2, 2, 2, 2}, dtype::Float32(), + {0.995054, 0.999328, 0.99990, 0.999987, 3., 4., 5., 6., + 0.999329, 0.999328, 0.99990, 1., 4., 5., 6., + 7.}) // reserve space + }); + param.bidirectional = true; + checker.set_param(param).exect( + Testcase{ + TensorValue( + {seq_len, batch_size, input_size}, dtype::Float32(), + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}), // input + TensorValue( + {2, batch_size, hidden_size}, dtype::Float32(), + {1, 2, 3, 4, 5, 6, 7, 8}), // hx + TensorValue( + {2, batch_size, hidden_size}, dtype::Float32(), + {2, 3, 4, 5, 6, 7, 8, 9}), // cx + TensorValue( + {gate_hidden_size, 2 * (input_size + hidden_size)}, + dtype::Float32(), + {3, 6, 1, 3, 2, 7, 2, 1, 3, 2, 1, 1, 3, 6, 1, 3, 2, + 7, 2, 1, 3, 2, 1, 1, 9, 3, 5, 1, 9, 3, 5, 1, 9, 3, + 5, 1, 9, 3, 5, 1, 3, 6, 1, 3, 2, 7, 2, 1, 3, 2, 1, + 1, 3, 6, 1, 3, 2, 7, 2, 1, 3, 2, 1, 1, 9, 3, 5, 1, + 9, 3, 5, 1, 9, 3, 5, 1, 9, 3, 5, 1}), // flattern weights + {}, + {}, + {}, + {}}, + Testcase{ + {}, + {}, + {}, + {}, + TensorValue( + {seq_len, batch_size, 2 * hidden_size}, dtype::Float32(), + {0.9951, 0.9993, 1.0000, 1.0000, 0.9999, 1.0000, 1.0000, + 1.0000, 0.9993, 0.9999, 1.0000, 1.0000, 1.0000, 1.0000, + 1.0000, 1.0000}), // output + TensorValue( + {2, batch_size, hidden_size}, dtype::Float32(), + {0.9993, 0.9999, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, + 1.0000}), // hy + TensorValue( + {2, batch_size, hidden_size}, dtype::Float32(), + {4.0000, 5.0000, 6.0000, 7.0000, 8.0000, 9.0000, 10.0000, + 11.0000}), // cy + TensorValue( + {4, 2, 2, 2}, dtype::Float32(), + {0.995054, 0.999328, 0.99990, 0.999987, 3., 4., + 5., 6., 0.999329, 0.999328, 0.99990, 1., + 4., 5., 6., 7., 1., 0.999328, + 0.99990, 0.999987, 7., 8., 9., 10., + 0.999329, 0.999328, 0.99990, 1., 8., 9., + 10., 11.}) // reserve space + }); + param.num_layers = 2; + checker.set_param(param).exect( + Testcase{ + TensorValue( + {seq_len, batch_size, input_size}, dtype::Float32(), + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}), // input + TensorValue( + 
{4, batch_size, hidden_size}, dtype::Float32(), + {1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8}), // hx + TensorValue( + {4, batch_size, hidden_size}, dtype::Float32(), + {2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9}), // cx + TensorValue( + {8, 22}, dtype::Float32(), + { + 3, 6, 1, 3, 2, 7, 2, 1, 3, 2, 1, 1, 3, 6, 1, 3, + 2, 7, 2, 1, 3, 2, 1, 1, 9, 3, 5, 1, 9, 3, 5, 1, + 9, 3, 5, 1, 9, 3, 5, 1, 3, 6, 1, 3, 2, 7, 2, 1, + 3, 2, 1, 1, 3, 6, 1, 3, 2, 7, 2, 1, 3, 2, 1, 1, + 9, 3, 5, 1, 9, 3, 5, 1, 9, 3, 5, 1, 9, 3, 5, 1, + 3, 6, 1, 3, 2, 7, 2, 1, 3, 2, 1, 1, 2, 7, 2, 1, + 3, 6, 1, 3, 2, 7, 2, 1, 3, 2, 1, 1, 2, 7, 2, 1, + 3, 6, 1, 3, 2, 7, 2, 1, 3, 2, 1, 1, 2, 7, 2, 1, + 3, 6, 1, 3, 2, 7, 2, 1, 3, 2, 1, 1, 2, 7, 2, 1, + 9, 3, 5, 1, 9, 3, 5, 1, 9, 3, 5, 1, 9, 3, 5, 1, + 9, 3, 5, 1, 9, 3, 5, 1, 9, 3, 5, 1, 9, 3, 5, 1, + }), // flattern weights + {}, + {}, + {}, + {}}, + Testcase{ + {}, + {}, + {}, + {}, + TensorValue( + {seq_len, batch_size, 2 * hidden_size}, dtype::Float32(), + {0.9951, 0.9993, 1.0000, 1.0000, 0.9999, 1.0000, 1.0000, + 1.0000, 0.9993, 0.9999, 1.0000, 1.0000, 1.0000, 1.0000, + 1.0000, 1.0000}), // output + TensorValue( + {4, batch_size, hidden_size}, dtype::Float32(), + {0.9993, 0.9999, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, + 1.0000, 0.9993, 0.9999, 1.0000, 1.0000, 1.0000, 1.0000, + 1.0000, 1.0000}), // hy + TensorValue( + {4, batch_size, hidden_size}, dtype::Float32(), + {4.0000, 5.0000, 6.0000, 7.0000, 8.0000, 9.0000, 10.0000, + 11.0000, 4.0000, 5.0000, 6.0000, 7.0000, 8.0000, 9.0000, + 10.0000, 11.0000}), // cy + TensorValue( + {8, 2, 2, 2}, dtype::Float32(), + { + 0.995054, 0.999328, 0.99990, 0.999987, 3., + 4., 5., 6., 0.999329, 0.999328, + 0.99990, 1., 4., 5., 6., + 7., 1., 0.999328, 0.99990, 0.999987, + 7., 8., 9., 10., 0.999329, + 0.999328, 0.99990, 1., 8., 9., + 10., 11., 0.995054, 0.999328, 0.99990, + 0.999987, 3., 4., 5., 6., + 0.999329, 0.999328, 0.99990, 1., 4., + 5., 6., 7., 1., 0.999328, + 0.99990, 0.999987, 7., 8., 9., + 10., 0.999329, 0.999328, 0.99990, 1., + 8., 9., 10., 11., + }) // reserve space + }); +} + +} // namespace test +} // namespace megdnn \ No newline at end of file diff --git a/dnn/test/naive/lstmcell.cpp b/dnn/test/naive/lstmcell.cpp new file mode 100644 index 00000000..1d6091a3 --- /dev/null +++ b/dnn/test/naive/lstmcell.cpp @@ -0,0 +1,140 @@ +/** + * \file dnn/test/naive/lstmcell.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "megdnn/dtype.h" +#include "megdnn/oprs.h" +#include "test/common/checker.h" +#include "test/naive/fixture.h" + +namespace megdnn { +namespace test { +TEST_F(NAIVE, LSTMCELL) { + Checker checker(handle(), true); + for (size_t batch : {1, 4}) + for (size_t n : {3, 4, 5, 23, 100}) + for (size_t out : {3, 6, 25, 100}) { + checker.exec( + {{batch, n}, + {out * 4, n}, + {1, out * 4}, + {batch, out}, + {out * 4, out}, + {1, out * 4}, + {batch, out}, + {}, + {}, + {}}); + } + size_t batch_size = 2; + size_t input_size = 3; + size_t hidden_size = 2; + checker.exect( + Testcase{ + TensorValue( + {batch_size, input_size}, dtype::Float32(), + {1, 2, 3, 4, 5, 6}), // input + TensorValue( + {4 * hidden_size, input_size}, dtype::Float32(), + { + 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, + 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, + 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, + 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, + }), // weight_ih + TensorValue( + {4 * hidden_size}, dtype::Float32(), + {0, 0, 0, 0, 0, 0, 0, 0}), // bias_ih + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {1, 2, 3, 4}), // hx + TensorValue( + {4 * hidden_size, hidden_size}, dtype::Float32(), + {0.3535, 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, + 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, + 0.3535, 0.3535}), // weight_hh + TensorValue( + {4 * hidden_size}, dtype::Float32(), + {0, 0, 0, 0, 0, 0, 0, 0}), // bias_hh + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {2, 3, 4, 5}), // cx + {}, + {}, + {}}, + Testcase{ + {}, + {}, + {}, + {}, + {}, + {}, + {}, + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {0.9541, 0.9593, 0.9995, 0.9996}), // hy + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {2.8771, 3.8373, 4.9979, 5.9975}), // cy + TensorValue( + {batch_size, 4 * hidden_size}, dtype::Float32(), + {3.18198, 3.18198, 7.7781, 7.7781, 3.18198, 3.18198, + 7.77817, 7.77817, 3.18198, 3.18198, 7.77817, 7.77817, + 3.18198, 3.18198, 7.77817, 7.77817}), // cy + }); + batch_size = 2; + input_size = 2; + hidden_size = 1; + checker.exect( + Testcase{ + TensorValue( + {batch_size, input_size}, dtype::Float32(), + {1, 2, 3, 4}), // input + TensorValue( + {4 * hidden_size, input_size}, dtype::Float32(), + {0.3535, 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, + 0.3535}), // weight_ih + TensorValue( + {4 * hidden_size}, dtype::Float32(), + {0.3535, 0.3535, 0.3535, 0.3535}), // bias_ih + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), {1, 2}), // hx + TensorValue( + {4 * hidden_size, hidden_size}, dtype::Float32(), + {0.3535, 0.3535, 0.3535, 0.3535}), // weight_hh + TensorValue( + {4 * hidden_size}, dtype::Float32(), + {0.3535, 0.3535, 0.3535, 0.3535}), // bias_hh + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), {4, 5}), // cx + {}, + {}, + {}}, + Testcase{ + {}, + {}, + {}, + {}, + {}, + {}, + {}, + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {0.8927, 0.9799}), // hy + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {4.4393, 5.8788}), // cy + TensorValue( + {batch_size, 4 * hidden_size}, dtype::Float32(), + {2.1210, 3.8885, 2.1210, 3.8885, 2.1210, 3.8885, 2.1210, + 3.8885}), // gates + }); +} +} // namespace test +} // namespace megdnn \ No newline at end of file diff --git a/dnn/test/naive/rnn.cpp b/dnn/test/naive/rnn.cpp index 11e9162e..46b95e92 100644 --- a/dnn/test/naive/rnn.cpp +++ b/dnn/test/naive/rnn.cpp @@ -8,7 +8,6 @@ * distributed under the License is distributed on an 
"AS IS" BASIS, WITHOUT * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ -#include "test/common/rnn.h" #include "megdnn/dtype.h" #include "megdnn/oprs.h" #include "test/common/checker.h" @@ -17,22 +16,7 @@ namespace megdnn { namespace test { -/*TEST_F(NAIVE, RNN) { - std::vector args = rnn::get_args(); - Checker checker(handle()); - for (auto&& arg : args) { - checker.set_param(arg.param) - .set_dtype(0, dtype::Float32()) - .set_dtype(1, dtype::Float32()) - .set_dtype(2, dtype::Float32()) - .set_dtype(3, dtype::Float32()) - .set_dtype(4, dtype::Float32()) - .set_dtype(5, dtype::Float32()) - .execs({arg.input, arg.hx, arg.flatten_weights, {}, {}, {}}); - } -}*/ - -TEST_F(NAIVE, RNN_HAND_MADE) { +TEST_F(NAIVE, RNN_FORWARD) { Checker checker(handle(), false); size_t batch_size = 2; size_t input_size = 3; @@ -49,14 +33,17 @@ TEST_F(NAIVE, RNN_HAND_MADE) { Testcase{ TensorValue( {seq_len, batch_size, input_size}, dtype::Float32(), - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}), // input + {-0.66536, 0.08049, 0.12008, 0.63423, 1.37801, 0.02591, + 0.09153, 0.82866, -1.70429, -1.26624, -0.06421, + 0.35816}), // input TensorValue( {batch_size, hidden_size}, dtype::Float32(), - {2, 1, 3, 5}), // hx + {-3.19544, -1.24232, 1.99512, -0.25692}), // hx TensorValue( {gate_hidden_size, input_size + hidden_size}, dtype::Float32(), - {3, 6, 1, 3, 2, 7, 9, 3, 5, 1}), // weights + {0.35355, 0.35355, 0.35355, 0.35355, 0.35355, 0.35355, + 0.35355, 0.35355, 0.35355, 0.35355}), // flattern weights {}, {}, {}}, @@ -66,13 +53,54 @@ TEST_F(NAIVE, RNN_HAND_MADE) { {}, TensorValue( {seq_len, batch_size, hidden_size}, dtype::Float32(), - {39, 39, 90, 84, 300, 216, 546, 366}), // output + {0.0, 0.0, 1.3351, 1.3351, 0.0, 0.0, 0.6003, + 0.6003}), // output TensorValue( {batch_size, hidden_size}, dtype::Float32(), - {21, 11, 42, 20}), // hy + {0.0, 0.0, 0.6003, 0.6003}), // hy TensorValue( {1, 2, 2, 2}, dtype::Float32(), - {2, 1, 3, 5, 21, 11, 42, 20}) // reserve space + {0.0, 0.0, 1.33512, 1.33512, 0.0, 0.0, 0.60031, + 0.60031}) // reserve space + }); + param.num_layers = 2; + checker.set_param(param).exect( + Testcase{ + TensorValue( + {seq_len, batch_size, input_size}, dtype::Float32(), + {-0.66536, 0.08049, 0.12008, 0.63423, 1.37801, 0.02591, + 0.09153, 0.82866, -1.70429, -1.26624, -0.06421, + 0.35816}), // input + TensorValue( + {2, batch_size, hidden_size}, dtype::Float32(), + {-3.19544, -1.24232, 1.99512, -0.25692, -3.19544, -1.24232, + 1.99512, -0.25692}), // hx + TensorValue( + {2, 9}, dtype::Float32(), + {0.35355, 0.35355, 0.35355, 0.35355, 0.35355, 0.35355, + 0.35355, 0.35355, 0.35355, 0.35355, 0.35355, 0.35355, + 0.35355, 0.35355, 0.35355, 0.35355, 0.35355, + 0.35355}), // weights + {}, + {}, + {}}, + Testcase{ + {}, + {}, + {}, + TensorValue( + {seq_len, batch_size, hidden_size}, dtype::Float32(), + {0.0, 0.0, 1.5586, 1.5586, 0.0, 0.0, 1.5266, + 1.5266}), // output + TensorValue( + {2, batch_size, hidden_size}, dtype::Float32(), + {0.0, 0.0, 0.6003, 0.6003, 0.0, 0.0, 1.5266, + 1.5266}), // hy + TensorValue( + {2, 2, 2, 2}, dtype::Float32(), + {0.0, 0.0, 1.33512, 1.33512, 0.0, 0.0, 0.60031, 0.60031, + 0.0, 0.0, 1.55861, 1.55861, 0.0, 0.0, 1.52658, + 1.52658}) // reserve space }); } diff --git a/dnn/test/naive/rnn_cell.cpp b/dnn/test/naive/rnn_cell.cpp new file mode 100644 index 00000000..48de6128 --- /dev/null +++ b/dnn/test/naive/rnn_cell.cpp @@ -0,0 +1,104 @@ +/** + * \file dnn/test/naive/rnncell.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * 
Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "megdnn/dtype.h" +#include "megdnn/oprs.h" +#include "test/common/checker.h" +#include "test/naive/fixture.h" + +namespace megdnn { +namespace test { + +TEST_F(NAIVE, RNNCELL) { + Checker checker(handle(), false); + for (size_t batch : {1, 4}) + for (size_t inp : {3, 4, 5, 23, 100}) + for (size_t hidden : {3, 6, 25, 100}) { + checker.exec( + {{batch, inp}, + {hidden, inp}, + {1, hidden}, + {batch, hidden}, + {hidden, hidden}, + {1, hidden}, + {}}); + } + size_t batch_size = 2; + size_t input_size = 3; + size_t hidden_size = 2; + RNNCell::Param param; + param.nonlineMode = param::RNNCell::NonlineMode::TANH; + checker.set_param(param).exect( + Testcase{ + TensorValue( + {batch_size, input_size}, dtype::Float32(), + {1, 2, 3, 4, 5, 6}), // input + TensorValue( + {hidden_size, input_size}, dtype::Float32(), + {0.3535, 0.3535, 0.3535, 0.3535, 0.3535, + 0.3535}), // weight_ih + TensorValue({1, hidden_size}, dtype::Float32(), {0, 0}), // bias_ih + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {1, 2, 3, 4}), // hx + TensorValue( + {hidden_size, hidden_size}, dtype::Float32(), + {0.3535, 0.3535, 0.3535, 0.3535}), // weight_hh + TensorValue({1, hidden_size}, dtype::Float32(), {0, 0}), // bias_hh + {}}, + Testcase{ + {}, + {}, + {}, + {}, + {}, + {}, + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {0.9966, 0.9966, 1.0, 1.0}), // dst + }); + batch_size = 2; + input_size = 2; + hidden_size = 1; + param.nonlineMode = param::RNNCell::NonlineMode::RELU; + checker.set_param(param).exect( + Testcase{ + TensorValue( + {batch_size, input_size}, dtype::Float32(), + {1, 2, 3, 4}), // input + TensorValue( + {hidden_size, input_size}, dtype::Float32(), + {0.3535, 0.3535}), // weight_ih + TensorValue( + {1, hidden_size}, dtype::Float32(), {0.3535}), // bias_ih + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {-1, -2}), // hx + TensorValue( + {hidden_size, hidden_size}, dtype::Float32(), + {0.3535}), // weight_hh + TensorValue( + {1, hidden_size}, dtype::Float32(), {0.3535}), // bias_hh + {}}, + Testcase{ + {}, + {}, + {}, + {}, + {}, + {}, + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {1.414, 2.4745}), // hy + }); +} + +} // namespace test +} // namespace megdnn \ No newline at end of file diff --git a/imperative/python/megengine/module/rnn.py b/imperative/python/megengine/module/rnn.py index 571e3a19..a9340753 100644 --- a/imperative/python/megengine/module/rnn.py +++ b/imperative/python/megengine/module/rnn.py @@ -15,9 +15,8 @@ import numpy as np from ..core._imperative_rt.core2 import apply from ..core.ops import builtin -from ..device import is_cuda_available -from ..functional import concat, expand_dims, repeat, stack, zeros -from ..functional.nn import concat +from ..core.ops.builtin import BatchNorm +from ..functional import stack, zeros from ..tensor import Parameter, Tensor from . 
import init from .module import Module @@ -25,13 +24,7 @@ from .module import Module class RNNCellBase(Module): def __init__( - self, - input_size: int, - hidden_size: int, - bias: bool, - num_chunks: int, - device=None, - dtype=None, + self, input_size: int, hidden_size: int, bias: bool, num_chunks: int, ) -> None: # num_chunks indicates the number of gates super(RNNCellBase, self).__init__() @@ -41,127 +34,195 @@ class RNNCellBase(Module): self.bias = bias # initialize weights - common_kwargs = {"device": device, "dtype": dtype} self.gate_hidden_size = num_chunks * hidden_size self.weight_ih = Parameter( - np.random.uniform(size=(self.gate_hidden_size, input_size)).astype( - np.float32 - ), - **common_kwargs, + np.zeros((self.gate_hidden_size, input_size), dtype=np.float32) ) self.weight_hh = Parameter( - np.random.uniform(size=(self.gate_hidden_size, hidden_size)).astype( - np.float32 - ), - **common_kwargs, + np.zeros((self.gate_hidden_size, hidden_size), dtype=np.float32) ) if bias: self.bias_ih = Parameter( - np.random.uniform(size=(self.gate_hidden_size)).astype(np.float32), - **common_kwargs, + np.zeros((self.gate_hidden_size), dtype=np.float32) ) self.bias_hh = Parameter( - np.random.uniform(size=(self.gate_hidden_size)).astype(np.float32), - **common_kwargs, + np.zeros((self.gate_hidden_size), dtype=np.float32) ) else: - self.bias_ih = zeros(shape=(self.gate_hidden_size), **common_kwargs) - self.bias_hh = zeros(shape=(self.gate_hidden_size), **common_kwargs) + self.bias_ih = zeros(shape=(self.gate_hidden_size)) + self.bias_hh = zeros(shape=(self.gate_hidden_size)) self.reset_parameters() # if bias is False self.bias will remain zero - def get_op(self): - return builtin.RNNCell() - def reset_parameters(self) -> None: stdv = 1.0 / math.sqrt(self.hidden_size) for weight in self.parameters(): init.uniform_(weight, -stdv, stdv) + @abstractmethod def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor: - if hx is None: - hx = zeros( - shape=(input.shape[0], self.gate_hidden_size), - dtype=input.dtype, - device=input.device, - ) - op = self.get_op() - return apply( - op, input, self.weight_ih, self.bias_ih, hx, self.weight_hh, self.bias_hh - )[0] - # return linear(input, self.weight_ih, self.bias_ih) + linear(hx, self.weight_hh, self.bias_hh) + raise NotImplementedError("forward not implemented !") class RNNCell(RNNCellBase): + + r"""An Elman RNN cell with tanh or ReLU non-linearity. + + .. math:: + + h' = \tanh(W_{ih} x + b_{ih} + W_{hh} h + b_{hh}) + + If :attr:`nonlinearity` is `'relu'`, then ReLU is used in place of tanh. + + Args: + input_size: The number of expected features in the input `x` + hidden_size: The number of features in the hidden state `h` + bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. + Default: ``True`` + nonlinearity: The non-linearity to use. Can be either ``'tanh'`` or ``'relu'``. Default: ``'tanh'`` + + Inputs: input, hidden + - **input** of shape `(batch, input_size)`: tensor containing input features + - **hidden** of shape `(batch, hidden_size)`: tensor containing the initial hidden + state for each element in the batch. + Defaults to zero if not provided. 
+ + Outputs: h' + - **h'** of shape `(batch, hidden_size)`: tensor containing the next hidden state + for each element in the batch + + Shape: + - Input1: :math:`(N, H_{in})` tensor containing input features where + :math:`H_{in}` = `input_size` + - Input2: :math:`(N, H_{out})` tensor containing the initial hidden + state for each element in the batch where :math:`H_{out}` = `hidden_size` + Defaults to zero if not provided. + - Output: :math:`(N, H_{out})` tensor containing the next hidden state + for each element in the batch + + + Examples: + + .. code-block:: + + import numpy as np + import megengine as mge + import megengine.module as M + + m = M.RNNCell(10, 20) + inp = mge.tensor(np.random.randn(3, 10), dtype=np.float32) + hx = mge.tensor(np.random.randn(3, 20), dtype=np.float32) + out = m(inp, hx) + print(out.numpy().shape) + + Outputs: + + .. code-block:: + + (3, 20) + + """ + def __init__( self, input_size: int, hidden_size: int, bias: bool = True, nonlinearity: str = "tanh", - device=None, - dtype=None, ) -> None: self.nonlinearity = nonlinearity - super(RNNCell, self).__init__( - input_size, hidden_size, bias, num_chunks=1, device=device, dtype=dtype - ) - # self.activate = tanh if nonlinearity == "tanh" else relu - - def get_op(self): - return builtin.RNNCell(nonlineMode=self.nonlinearity) + super(RNNCell, self).__init__(input_size, hidden_size, bias, num_chunks=1) def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor: - return super().forward(input, hx) + if hx is None: + hx = zeros(shape=(input.shape[0], self.gate_hidden_size),) + op = builtin.RNNCell(nonlineMode=self.nonlinearity) + return apply( + op, input, self.weight_ih, self.bias_ih, hx, self.weight_hh, self.bias_hh + )[0] class LSTMCell(RNNCellBase): - def __init__( - self, - input_size: int, - hidden_size: int, - bias: bool = True, - device=None, - dtype=None, - ) -> None: - super(LSTMCell, self).__init__( - input_size, hidden_size, bias, num_chunks=4, device=device, dtype=dtype - ) - def get_op(self): - return builtin.LSTMCell() + r"""A long short-term memory (LSTM) cell. + + .. math:: + + \begin{array}{ll} + i = \sigma(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\ + f = \sigma(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\ + g = \tanh(W_{ig} x + b_{ig} + W_{hg} h + b_{hg}) \\ + o = \sigma(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\ + c' = f * c + i * g \\ + h' = o * \tanh(c') \\ + \end{array} + + where :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product. + + Args: + input_size: The number of expected features in the input `x` + hidden_size: The number of features in the hidden state `h` + bias: If ``False``, then the layer does not use bias weights `b_ih` and + `b_hh`. Default: ``True`` + + Inputs: input, (h_0, c_0) + - **input** of shape `(batch, input_size)`: tensor containing input features + - **h_0** of shape `(batch, hidden_size)`: tensor containing the initial hidden + state for each element in the batch. + - **c_0** of shape `(batch, hidden_size)`: tensor containing the initial cell state + for each element in the batch. + + If `(h_0, c_0)` is not provided, both **h_0** and **c_0** default to zero. + + Outputs: (h_1, c_1) + - **h_1** of shape `(batch, hidden_size)`: tensor containing the next hidden state + for each element in the batch + - **c_1** of shape `(batch, hidden_size)`: tensor containing the next cell state + for each element in the batch + + Examples: + + .. 
code-block:: + + import numpy as np + import megengine as mge + import megengine.module as M + + m = M.LSTMCell(10, 20) + inp = mge.tensor(np.random.randn(3, 10), dtype=np.float32) + hx = mge.tensor(np.random.randn(3, 20), dtype=np.float32) + cx = mge.tensor(np.random.randn(3, 20), dtype=np.float32) + hy, cy = m(inp, (hx, cx)) + print(hy.numpy().shape) + print(cy.numpy().shape) + + Outputs: + + .. code-block:: + + (3, 20) + (3, 20) + + """ + + def __init__(self, input_size: int, hidden_size: int, bias: bool = True,) -> None: + super(LSTMCell, self).__init__(input_size, hidden_size, bias, num_chunks=4) def forward( self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = None ) -> Tuple[Tensor, Tensor]: # hx: (h, c) if hx is None: - h = zeros( - shape=(input.shape[0], self.hidden_size), - dtype=input.dtype, - device=input.device, - ) - c = zeros( - shape=(input.shape[0], self.hidden_size), - dtype=input.dtype, - device=input.device, - ) + h = zeros(shape=(input.shape[0], self.hidden_size)) + c = zeros(shape=(input.shape[0], self.hidden_size)) else: h, c = hx - op = self.get_op() + op = builtin.LSTMCell() return apply( op, input, self.weight_ih, self.bias_ih, h, self.weight_hh, self.bias_hh, c )[:2] -def is_gpu(device: str) -> bool: - if "xpux" in device and is_cuda_available(): - return True - if "gpu" in device: - return True - return False - - class RNNBase(Module): def __init__( self, @@ -173,11 +234,8 @@ class RNNBase(Module): dropout: float = 0, bidirectional: bool = False, proj_size: int = 0, - device=None, - dtype=None, ) -> None: super(RNNBase, self).__init__() - # self.mode = mode self.input_size = input_size self.hidden_size = hidden_size self.num_layers = num_layers @@ -210,26 +268,25 @@ class RNNBase(Module): for layer in range(self.num_layers): self.cells.append([]) for _ in range(self.num_directions): - self.cells[layer].append(self.create_cell(layer, device, dtype)) + self.cells[layer].append(self.create_cell(layer)) # parameters have been initialized during the creation of the cells # if flatten, then delete cells - self._flatten_parameters(device, dtype, self.cells) + self._flatten_parameters(self.cells) - def _flatten_parameters(self, device, dtype, cells): + def _flatten_parameters(self, cells): gate_hidden_size = cells[0][0].gate_hidden_size size_dim1 = 0 for layer in range(self.num_layers): for direction in range(self.num_directions): size_dim1 += cells[layer][direction].weight_ih.shape[1] size_dim1 += cells[layer][direction].weight_hh.shape[1] - # if self.bias: - # size_dim1 += 2 * self.num_directions * self.num_layers - size_dim1 += 2 * self.num_directions * self.num_layers + if self.bias: + size_dim1 += 2 * self.num_directions * self.num_layers + self._flatten_weights = Parameter( np.zeros((gate_hidden_size, size_dim1), dtype=np.float32) ) self.reset_parameters() - # TODO: if no bias, set the bias to zero def reset_parameters(self) -> None: stdv = 1.0 / math.sqrt(self.hidden_size) @@ -237,7 +294,7 @@ class RNNBase(Module): init.uniform_(weight, -stdv, stdv) @abstractmethod - def create_cell(self, layer, device, dtype): + def create_cell(self, layer): raise NotImplementedError("Cell not implemented !") @abstractmethod @@ -265,81 +322,137 @@ class RNNBase(Module): else: batch_size = input.shape[1] if hx is None: - hx = self.init_hidden(batch_size, input.device, input.dtype) + hx = self.init_hidden(batch_size) output, h = self.apply_op(input, hx) if self.batch_first: output = output.transpose((1, 0, 2)) return output, h - if is_gpu(str(input.device)) or True: - # 
return output, h_n - output, h = self.apply_op(input, hx) - if self.batch_first: - output = output.transpose((1, 0, 2)) - return output, h - order_settings = [(0, input.shape[0]), (input.shape[0] - 1, -1, -1)] - h_n = [] - for layer in range(self.num_layers): - layer_outputs = [] - for direction in range(self.num_directions): - direction_outputs = [None for _ in range(input.shape[0])] - cell = self.cells[layer][direction] - hidden = self._apply_fn_to_hx( - hx, lambda x: x[layer * self.num_directions + direction] - ) - for step in range(*(order_settings[direction])): - hidden = cell(input[step], hidden) # [batch_size, hidden_size] - direction_outputs[step] = self.get_output_from_hidden(hidden) - direction_output = stack( - direction_outputs, axis=0 - ) # [seq_len, batch_size, hidden_size] - layer_outputs.append(direction_output) - h_n.append(hidden) - layer_output = concat( - layer_outputs, axis=-1 - ) # [seq_len, batch_size, D*hidden_size] - input = layer_output - if self.batch_first: - layer_output = layer_output.transpose((1, 0, 2)) - return layer_output, self._stack_h_n(h_n) +class RNN(RNNBase): + r"""Applies a multi-layer Elman RNN with :math:`\tanh` or :math:`\text{ReLU}` non-linearity to an + input sequence. + + + For each element in the input sequence, each layer computes the following + function: + + .. math:: + h_t = \tanh(W_{ih} x_t + b_{ih} + W_{hh} h_{(t-1)} + b_{hh}) + + where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is + the input at time `t`, and :math:`h_{(t-1)}` is the hidden state of the + previous layer at time `t-1` or the initial hidden state at time `0`. + If :attr:`nonlinearity` is ``'relu'``, then :math:`\text{ReLU}` is used instead of :math:`\tanh`. + + Args: + input_size: The number of expected features in the input `x` + hidden_size: The number of features in the hidden state `h` + num_layers: Number of recurrent layers. E.g., setting ``num_layers=2`` + would mean stacking two RNNs together to form a `stacked RNN`, + with the second RNN taking in outputs of the first RNN and + computing the final results. Default: 1 + nonlinearity: The non-linearity to use. Can be either ``'tanh'`` or ``'relu'``. Default: ``'tanh'`` + bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. + Default: ``True`` + batch_first: If ``True``, then the input and output tensors are provided + as `(batch, seq, feature)` instead of `(seq, batch, feature)`. + Note that this does not apply to hidden or cell states. See the + Inputs/Outputs sections below for details. Default: ``False`` + dropout: If non-zero, introduces a `Dropout` layer on the outputs of each + RNN layer except the last layer, with dropout probability equal to + :attr:`dropout`. Default: 0 + bidirectional: If ``True``, becomes a bidirectional RNN. Default: ``False`` + + Inputs: input, h_0 + * **input**: tensor of shape :math:`(L, N, H_{in})` when ``batch_first=False`` or + :math:`(N, L, H_{in})` when ``batch_first=True`` containing the features of + the input sequence. The input can also be a packed variable length sequence. + See :func:`torch.nn.utils.rnn.pack_padded_sequence` or + :func:`torch.nn.utils.rnn.pack_sequence` for details. + * **h_0**: tensor of shape :math:`(D * \text{num\_layers}, N, H_{out})` containing the initial hidden + state for each element in the batch. Defaults to zeros if not provided. + + where: + + .. 
math:: + \begin{aligned} + N ={} & \text{batch size} \\ + L ={} & \text{sequence length} \\ + D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\ + H_{in} ={} & \text{input\_size} \\ + H_{out} ={} & \text{hidden\_size} + \end{aligned} + + Outputs: output, h_n + * **output**: tensor of shape :math:`(L, N, D * H_{out})` when ``batch_first=False`` or + :math:`(N, L, D * H_{out})` when ``batch_first=True`` containing the output features + `(h_t)` from the last layer of the RNN, for each `t`. If a + :class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output + will also be a packed sequence. + * **h_n**: tensor of shape :math:`(D * \text{num\_layers}, N, H_{out})` containing the final hidden state + for each element in the batch. + + + Examples: + + .. code-block:: + + import numpy as np + import megengine as mge + import megengine.module as M + + m = M.RNN(10,20,2,batch_first=False,nonlinearity="relu",bias=True,bidirectional=True) + inp = mge.tensor(np.random.randn(6, 30, 10), dtype=np.float32) + hx = mge.tensor(np.random.randn(4, 30, 20), dtype=np.float32) + out, hn = m(inp, hx) + print(out.numpy().shape) + + Outputs: + + .. code-block:: + + (6, 30, 40) + + """ -class RNN(RNNBase): def __init__(self, *args, **kwargs) -> None: self.nonlinearity = kwargs.pop("nonlinearity", "tanh") super(RNN, self).__init__(*args, **kwargs) - def create_cell(self, layer, device, dtype): + def create_cell(self, layer): if layer == 0: input_size = self.input_size else: input_size = self.num_directions * self.hidden_size - return RNNCell( - input_size, self.hidden_size, self.bias, self.nonlinearity, device, dtype - ) + return RNNCell(input_size, self.hidden_size, self.bias, self.nonlinearity) - def init_hidden(self, batch_size, device, dtype): + def init_hidden(self, batch_size): hidden_shape = ( self.num_directions * self.num_layers, batch_size, self.hidden_size, ) - return zeros(shape=hidden_shape, dtype=dtype, device=device) + return zeros(shape=hidden_shape) def get_output_from_hidden(self, hx): return hx def apply_op(self, input, hx): + fwd_mode = ( + BatchNorm.FwdMode.TRAINING if self.training else BatchNorm.FwdMode.INFERENCE + ) + op = builtin.RNN( num_layers=self.num_layers, bidirectional=self.bidirectional, bias=self.bias, hidden_size=self.hidden_size, - proj_size=self.proj_size, dropout=self.dropout, nonlineMode=self.nonlinearity, + fwd_mode=fwd_mode, ) output, h = apply(op, input, hx, self._flatten_weights)[:2] output = output + h.sum() * 0 @@ -348,30 +461,149 @@ class RNN(RNNBase): class LSTM(RNNBase): + + r"""Applies a multi-layer long short-term memory LSTM to an input + sequence. + + + For each element in the input sequence, each layer computes the following + function: + + .. math:: + \begin{array}{ll} \\ + i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{t-1} + b_{hi}) \\ + f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{t-1} + b_{hf}) \\ + g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{t-1} + b_{hg}) \\ + o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{t-1} + b_{ho}) \\ + c_t = f_t \odot c_{t-1} + i_t \odot g_t \\ + h_t = o_t \odot \tanh(c_t) \\ + \end{array} + + where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell + state at time `t`, :math:`x_t` is the input at time `t`, :math:`h_{t-1}` + is the hidden state of the layer at time `t-1` or the initial hidden + state at time `0`, and :math:`i_t`, :math:`f_t`, :math:`g_t`, + :math:`o_t` are the input, forget, cell, and output gates, respectively. 
+ :math:`\sigma` is the sigmoid function, and :math:`\odot` is the Hadamard product. + + In a multilayer LSTM, the input :math:`x^{(l)}_t` of the :math:`l` -th layer + (:math:`l >= 2`) is the hidden state :math:`h^{(l-1)}_t` of the previous layer multiplied by + dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random + variable which is :math:`0` with probability :attr:`dropout`. + + If ``proj_size > 0`` is specified, LSTM with projections will be used. This changes + the LSTM cell in the following way. First, the dimension of :math:`h_t` will be changed from + ``hidden_size`` to ``proj_size`` (dimensions of :math:`W_{hi}` will be changed accordingly). + Second, the output hidden state of each layer will be multiplied by a learnable projection + matrix: :math:`h_t = W_{hr}h_t`. Note that as a consequence of this, the output + of LSTM network will be of different shape as well. See Inputs/Outputs sections below for exact + dimensions of all variables. You can find more details in https://arxiv.org/abs/1402.1128. + + Args: + input_size: The number of expected features in the input `x` + hidden_size: The number of features in the hidden state `h` + num_layers: Number of recurrent layers. E.g., setting ``num_layers=2`` + would mean stacking two LSTMs together to form a `stacked LSTM`, + with the second LSTM taking in outputs of the first LSTM and + computing the final results. Default: 1 + bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. + Default: ``True`` + batch_first: If ``True``, then the input and output tensors are provided + as `(batch, seq, feature)` instead of `(seq, batch, feature)`. + Note that this does not apply to hidden or cell states. See the + Inputs/Outputs sections below for details. Default: ``False`` + dropout: If non-zero, introduces a `Dropout` layer on the outputs of each + LSTM layer except the last layer, with dropout probability equal to + :attr:`dropout`. Default: 0 + bidirectional: If ``True``, becomes a bidirectional LSTM. Default: ``False`` + proj_size: If ``> 0``, will use LSTM with projections of corresponding size. Default: 0 + + Inputs: input, (h_0, c_0) + * **input**: tensor of shape :math:`(L, N, H_{in})` when ``batch_first=False`` or + :math:`(N, L, H_{in})` when ``batch_first=True`` containing the features of + the input sequence. The input can also be a packed variable length sequence. + See :func:`torch.nn.utils.rnn.pack_padded_sequence` or + :func:`torch.nn.utils.rnn.pack_sequence` for details. + * **h_0**: tensor of shape :math:`(D * \text{num\_layers}, N, H_{out})` containing the + initial hidden state for each element in the batch. + Defaults to zeros if (h_0, c_0) is not provided. + * **c_0**: tensor of shape :math:`(D * \text{num\_layers}, N, H_{cell})` containing the + initial cell state for each element in the batch. + Defaults to zeros if (h_0, c_0) is not provided. + + where: + + .. math:: + \begin{aligned} + N ={} & \text{batch size} \\ + L ={} & \text{sequence length} \\ + D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\ + H_{in} ={} & \text{input\_size} \\ + H_{cell} ={} & \text{hidden\_size} \\ + H_{out} ={} & \text{proj\_size if } \text{proj\_size}>0 \text{ otherwise hidden\_size} \\ + \end{aligned} + + Outputs: output, (h_n, c_n) + * **output**: tensor of shape :math:`(L, N, D * H_{out})` when ``batch_first=False`` or + :math:`(N, L, D * H_{out})` when ``batch_first=True`` containing the output features + `(h_t)` from the last layer of the LSTM, for each `t`. 
If a + :class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output + will also be a packed sequence. + * **h_n**: tensor of shape :math:`(D * \text{num\_layers}, N, H_{out})` containing the + final hidden state for each element in the batch. + * **c_n**: tensor of shape :math:`(D * \text{num\_layers}, N, H_{cell})` containing the + final cell state for each element in the batch. + + Examples: + + .. code-block:: + + import numpy as np + import megengine as mge + import megengine.module as M + + m = M.LSTM(10, 20, 2, batch_first=False, bidirectional=True, bias=True) + inp = mge.tensor(np.random.randn(6, 30, 10), dtype=np.float32) + hx = mge.tensor(np.random.randn(4, 30, 20), dtype=np.float32) + cx = mge.tensor(np.random.randn(4, 30, 20), dtype=np.float32) + out, (hn, cn) = m(inp,(hx,cx)) + print(out.numpy().shape) + + Outputs: + + .. code-block:: + + (6, 30, 40) + + """ + def __init__(self, *args, **kwargs) -> None: super(LSTM, self).__init__(*args, **kwargs) - def create_cell(self, layer, device, dtype): + def create_cell(self, layer): if layer == 0: input_size = self.input_size else: input_size = self.num_directions * self.hidden_size - return LSTMCell(input_size, self.hidden_size, self.bias, device, dtype) + return LSTMCell(input_size, self.hidden_size, self.bias) - def init_hidden(self, batch_size, device, dtype): + def init_hidden(self, batch_size): hidden_shape = ( self.num_directions * self.num_layers, batch_size, self.hidden_size, ) - h = zeros(shape=hidden_shape, dtype=dtype, device=device) - c = zeros(shape=hidden_shape, dtype=dtype, device=device) + h = zeros(shape=hidden_shape) + c = zeros(shape=hidden_shape) return (h, c) def get_output_from_hidden(self, hx): return hx[0] def apply_op(self, input, hx): + fwd_mode = ( + BatchNorm.FwdMode.TRAINING if self.training else BatchNorm.FwdMode.INFERENCE + ) op = builtin.LSTM( num_layers=self.num_layers, bidirectional=self.bidirectional, @@ -379,6 +611,7 @@ class LSTM(RNNBase): hidden_size=self.hidden_size, proj_size=self.proj_size, dropout=self.dropout, + fwd_mode=fwd_mode, ) output, h, c = apply(op, input, hx[0], hx[1], self._flatten_weights)[:3] placeholders = [output.sum() * 0, h.sum() * 0, c.sum() * 0] diff --git a/imperative/python/test/unit/module/test_rnn.py b/imperative/python/test/unit/module/test_rnn.py index 35d2065f..eca46cd6 100644 --- a/imperative/python/test/unit/module/test_rnn.py +++ b/imperative/python/test/unit/module/test_rnn.py @@ -11,6 +11,7 @@ import pytest import megengine as mge import megengine.functional as F +from megengine.device import get_device_count from megengine.module import LSTM, RNN, LSTMCell, RNNCell @@ -20,6 +21,7 @@ def assert_tuple_equal(src, ref): assert i == j +@pytest.mark.skipif(get_device_count("gpu") > 0, reason="no algorithm on cuda") @pytest.mark.parametrize( "batch_size, input_size, hidden_size, init_hidden", [(3, 10, 20, True), (3, 10, 20, False), (1, 10, 20, False)], @@ -35,7 +37,7 @@ def test_rnn_cell(batch_size, input_size, hidden_size, init_hidden): assert_tuple_equal(h_new.shape, (batch_size, hidden_size)) -# is batch_size == 0 tolerated ? it will cause error in slice operation xx[:, ...] 
+@pytest.mark.skipif(get_device_count("gpu") > 0, reason="no algorithm on cuda") @pytest.mark.parametrize( "batch_size, input_size, hidden_size, init_hidden", [(3, 10, 20, True), (3, 10, 20, False), (1, 10, 20, False)], @@ -53,6 +55,7 @@ def test_lstm_cell(batch_size, input_size, hidden_size, init_hidden): assert_tuple_equal(c_new.shape, (batch_size, hidden_size)) +@pytest.mark.skipif(get_device_count("gpu") > 0, reason="no algorithm on cuda") @pytest.mark.parametrize( "batch_size, seq_len, input_size, hidden_size, num_layers, bidirectional, init_hidden, batch_first", [ @@ -70,7 +73,6 @@ def test_lstm_cell(batch_size, input_size, hidden_size, init_hidden): ), ], ) -# (0, 1, 1, 1, 1, False, True, False)]) def test_rnn( batch_size, seq_len, @@ -113,6 +115,7 @@ def test_rnn( ) +@pytest.mark.skipif(get_device_count("gpu") > 0, reason="no algorithm on cuda") @pytest.mark.parametrize( "batch_size, seq_len, input_size, hidden_size, num_layers, bidirectional, init_hidden, batch_first", [ @@ -130,7 +133,6 @@ def test_rnn( ), ], ) -# (0, 1, 1, 1, 1, False, True, False)]) def test_lstm( batch_size, seq_len, @@ -175,7 +177,3 @@ def test_lstm( assert_tuple_equal( h_n[1].shape, (num_directions * num_layers, batch_size, hidden_size) ) - - -if __name__ == "__main__": - test_lstm(5, 10, 10, 20, 1, False, False, True) diff --git a/src/opr/impl/dnn/rnn.cpp b/src/opr/impl/dnn/rnn.cpp index 0099cc3a..fa42fe52 100644 --- a/src/opr/impl/dnn/rnn.cpp +++ b/src/opr/impl/dnn/rnn.cpp @@ -123,7 +123,7 @@ MGB_IMPL_OPR_GRAD(LSTMCell) { SymbolVar input(opr.input(0)), weight_ih(opr.input(1)), hx(opr.input(3)), weight_hh(opr.input(4)), cx(opr.input(6)); SymbolVar h_out(opr.output(0)), c_out(opr.output(1)), gates(opr.output(2)), - h_og{out_grad.at(0)}, c_og{out_grad.at(1)}, tmp; + h_og{out_grad.at(0)}, c_og{out_grad.at(1)}; size_t ghs = gates.shape()[1] / 4; // gate_hidden_size SymbolVarArray gates_array = Split::make( gates, Split::Options::make_partition(gates, 1, {ghs, ghs, ghs, ghs})); @@ -141,7 +141,7 @@ MGB_IMPL_OPR_GRAD(LSTMCell) { f_grad = Elemwise::make({f, c_og * cx}, Mode::SIGMOID_GRAD); i_grad = Elemwise::make({i, c_og * g}, Mode::SIGMOID_GRAD); g_grad = Elemwise::make({g, c_og * i}, Mode::TANH_GRAD); - SymbolVar rnn_cell_grad = Concat::make({i_grad, f_grad, o_grad, g_grad}, {-1}); + SymbolVar rnn_cell_grad = Concat::make({i_grad, f_grad, o_grad, g_grad}, -1); SymbolVar result; if (wrt_idx < 6) { @@ -258,7 +258,6 @@ MGB_IMPL_OPR_GRAD(LSTM) { SymbolVarArray grads = LSTMBackward::make( opr.input(0), opr.output(0), opr.input(1), opr.input(2), out_grad.at(0), out_grad.at(1), out_grad.at(2), opr.input(3), opr.output(3), opr.param()); - SymbolVar res; return grads.at(wrt_idx).node(); // input, hx, cx, weights } #endif diff --git a/src/opr/include/megbrain/opr/dnn/rnn.h b/src/opr/include/megbrain/opr/dnn/rnn.h index ab15359d..38c0e34c 100644 --- a/src/opr/include/megbrain/opr/dnn/rnn.h +++ b/src/opr/include/megbrain/opr/dnn/rnn.h @@ -25,11 +25,11 @@ MGB_DEFINE_OPR_CLASS( public: using NonlineMode = Param::NonlineMode; - RNNCellForward( + MGE_WIN_DECLSPEC_FUC RNNCellForward( VarNode* input, VarNode* weight_ih, VarNode* bias_ih, VarNode* hx, VarNode* weight_hh, VarNode* bias_hh, const Param& param, const OperatorNodeConfig& config); - static SymbolVar make( + MGE_WIN_DECLSPEC_FUC static SymbolVar make( SymbolVar input, SymbolVar weight_ih, SymbolVar bias_ih, SymbolVar hx, SymbolVar weight_hh, SymbolVar bias_hh, const Param& param = {}, const OperatorNodeConfig& config = {}); @@ -39,11 +39,11 @@ using RNNCell = 
RNNCellForward; MGB_DEFINE_OPR_CLASS( LSTMCellForward, intl::MegDNNOprWrapperFwd) // { public: - LSTMCellForward( + MGE_WIN_DECLSPEC_FUC LSTMCellForward( VarNode* input, VarNode* weight_ih, VarNode* bias_ih, VarNode* hx, VarNode* weight_hh, VarNode* bias_hh, VarNode* cx, const Param& param, const OperatorNodeConfig& config); - static SymbolVar make( + MGE_WIN_DECLSPEC_FUC static SymbolVar make( SymbolVar input, SymbolVar weight_ih, SymbolVar bias_ih, SymbolVar hx, SymbolVar weight_hh, SymbolVar bias_hh, SymbolVar cx, const Param& param = {}, const OperatorNodeConfig& config = {}); @@ -51,17 +51,11 @@ public: using LSTMCell = LSTMCellForward; MGB_DEFINE_OPR_CLASS(RNNForward, intl::MegDNNOprWrapperFwd) // { - /*private: - SymbolVarArray weight_ih_arr; // 1d, idx: direction * num_layers + layer - SymbolVarArray weight_hh_arr; - SymbolVarArray bias_arr; - */ - public: - RNNForward( + MGE_WIN_DECLSPEC_FUC RNNForward( VarNode* input, VarNode* hx, VarNode* flatten_weights, const Param& param, const OperatorNodeConfig& config); - static SymbolVar make( + MGE_WIN_DECLSPEC_FUC static SymbolVar make( SymbolVar input, SymbolVar hx, SymbolVar flatten_weights, const Param& param = {}, const OperatorNodeConfig& config = {}); }; @@ -70,11 +64,11 @@ using RNN = RNNForward; MGB_DEFINE_OPR_CLASS( RNNBackward, intl::MegDNNOprWrapperBwd) // { public: - RNNBackward( + MGE_WIN_DECLSPEC_FUC RNNBackward( VarNode* x, VarNode* y, VarNode* hx, VarNode* dy, VarNode* dhy, VarNode* flatten_weights, VarNode* reserve_space, const Param& param, const OperatorNodeConfig& config); - static SymbolVarArray make( + MGE_WIN_DECLSPEC_FUC static SymbolVarArray make( SymbolVar x, SymbolVar y, SymbolVar hx, SymbolVar dy, SymbolVar dhy, SymbolVar flatten_weights, SymbolVar reserve_space, const Param& param = {}, const OperatorNodeConfig& config = {}); @@ -88,10 +82,10 @@ private: MGB_DEFINE_OPR_CLASS( LSTMForward, intl::MegDNNOprWrapperFwd) // { public: - LSTMForward( + MGE_WIN_DECLSPEC_FUC LSTMForward( VarNode* input, VarNode* hx, VarNode* cx, VarNode* flatten_weights, const Param& param, const OperatorNodeConfig& config); - static SymbolVar make( + MGE_WIN_DECLSPEC_FUC static SymbolVar make( SymbolVar input, SymbolVar hx, SymbolVar cx, SymbolVar flatten_weights, const Param& param = {}, const OperatorNodeConfig& config = {}); }; @@ -100,11 +94,11 @@ using LSTM = LSTMForward; MGB_DEFINE_OPR_CLASS( LSTMBackward, intl::MegDNNOprWrapperBwd) // { public: - LSTMBackward( + MGE_WIN_DECLSPEC_FUC LSTMBackward( VarNode* x, VarNode* y, VarNode* hx, VarNode* cx, VarNode* dy, VarNode* dhy, VarNode* dcy, VarNode* flatten_weights, VarNode* reserve_space, const Param& param, const OperatorNodeConfig& config); - static SymbolVarArray make( + MGE_WIN_DECLSPEC_FUC static SymbolVarArray make( SymbolVar x, SymbolVar y, SymbolVar hx, SymbolVar cx, SymbolVar dy, SymbolVar dhy, SymbolVar dcy, SymbolVar flatten_weights, SymbolVar reserve_space, const Param& param = {},
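
The flattened weight buffer built by ``RNNBase._flatten_parameters`` (in ``imperative/python/megengine/module/rnn.py``, earlier in this patch) is sized from the cells themselves: one column block per ``weight_ih`` and ``weight_hh`` of every cell, plus one column per bias vector when ``bias=True``. The following is a minimal stand-alone sketch of that sizing rule, not part of the patch. The helper name ``flattened_weight_shape`` is hypothetical; it assumes each cell's ``weight_ih`` has shape ``(gate_hidden_size, layer_input_size)`` and ``weight_hh`` has shape ``(gate_hidden_size, hidden_size)``, with ``num_chunks`` equal to 4 for LSTM cells (as set in ``LSTMCell.__init__``) and assumed to be 1 for plain RNN cells.

.. code-block::

    # Hypothetical helper mirroring the size_dim1 accumulation in
    # RNNBase._flatten_parameters; illustrative only, not part of the patch.
    def flattened_weight_shape(input_size, hidden_size, num_layers,
                               bidirectional, bias, num_chunks):
        num_directions = 2 if bidirectional else 1
        gate_hidden_size = num_chunks * hidden_size      # rows of _flatten_weights
        size_dim1 = 0                                    # columns of _flatten_weights
        for layer in range(num_layers):
            # layer 0 sees the raw input; deeper layers see D * hidden_size features
            layer_input = input_size if layer == 0 else num_directions * hidden_size
            for _ in range(num_directions):
                size_dim1 += layer_input                 # columns of weight_ih
                size_dim1 += hidden_size                 # columns of weight_hh
        if bias:
            # one column each for bias_ih and bias_hh of every cell
            size_dim1 += 2 * num_directions * num_layers
        return (gate_hidden_size, size_dim1)

    # e.g. the LSTM docstring example above: M.LSTM(10, 20, 2, bidirectional=True, bias=True)
    print(flattened_weight_shape(10, 20, 2, bidirectional=True, bias=True, num_chunks=4))
    # -> (80, 188) under the assumptions stated above

Under these assumptions the buffer for the bidirectional two-layer LSTM example works out to ``gate_hidden_size = 80`` rows and ``188`` columns, which is the shape of the ``_flatten_weights`` parameter passed to ``builtin.LSTM`` in ``apply_op``.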