From 20b42a8c3b1fb606d2214e175c2749d1e65d1a61 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Tue, 4 Jan 2022 11:48:43 +0800 Subject: [PATCH] fix(dnn): add naive lstm kernel GitOrigin-RevId: f08ef810cf936768a022c10f226d80e499355659 --- dnn/include/megdnn/oprs/nn.h | 2 +- dnn/scripts/opr_param_defs.py | 130 +++---- dnn/src/common/handle.cpp | 1 + dnn/src/common/lstm.cpp | 73 ++-- dnn/src/common/lstm_cell.cpp | 109 ++++-- dnn/src/common/opr_trait.h | 6 +- dnn/src/common/rnn.cpp | 53 ++- dnn/src/common/rnn_cell.cpp | 54 ++- dnn/src/cuda/cudnn_wrapper.cpp | 114 ------ dnn/src/cuda/cudnn_wrapper.h | 39 -- dnn/src/cuda/handle_create.cpp | 15 +- dnn/src/cuda/lstm/opr_impl.cpp | 112 ------ dnn/src/cuda/lstm/opr_impl.h | 56 --- dnn/src/cuda/lstm/utils.cpp | 39 -- dnn/src/cuda/lstm/utils.h | 23 -- dnn/src/cuda/lstm_cell/opr_impl.cpp | 42 -- dnn/src/cuda/lstm_cell/opr_impl.h | 36 -- dnn/src/cuda/rnn/opr_impl.cpp | 170 --------- dnn/src/cuda/rnn/opr_impl.h | 57 --- dnn/src/cuda/rnn/utils.cpp | 138 ------- dnn/src/cuda/rnn/utils.h | 56 --- dnn/src/cuda/rnn_cell/opr_impl.cpp | 35 -- dnn/src/cuda/rnn_cell/opr_impl.h | 40 -- dnn/src/naive/handle.cpp | 4 +- dnn/src/naive/lstm/opr_impl.cpp | 82 ++-- dnn/src/naive/lstm/opr_impl.h | 15 +- dnn/src/naive/lstm/template_impl.cpp | 3 +- dnn/src/naive/lstm_cell/opr_impl.cpp | 12 +- dnn/src/naive/lstm_cell/opr_impl.h | 2 + dnn/src/naive/rnn/funcs.h | 445 +++++++++++++++++++++- dnn/src/naive/rnn/funcs.tpp | 449 ---------------------- dnn/src/naive/rnn/opr_impl.cpp | 116 +++--- dnn/src/naive/rnn/opr_impl.h | 13 +- dnn/src/naive/rnn/rnn.cpp | 10 +- dnn/src/naive/rnn/rnn.h | 1 + dnn/src/naive/rnn/template_impl.cpp | 4 +- dnn/src/naive/rnn_cell/opr_impl.cpp | 12 +- dnn/src/naive/rnn_cell/opr_impl.h | 1 + dnn/test/common/deduce_layout_proxy.h | 49 ++- dnn/test/common/exec_proxy.h | 38 ++ dnn/test/common/rnn.h | 51 --- dnn/test/naive/lstm.cpp | 197 ++++++++++ dnn/test/naive/lstmcell.cpp | 140 +++++++ dnn/test/naive/rnn.cpp | 74 ++-- dnn/test/naive/rnn_cell.cpp | 104 +++++ imperative/python/megengine/module/rnn.py | 507 ++++++++++++++++++------- imperative/python/test/unit/module/test_rnn.py | 12 +- src/opr/impl/dnn/rnn.cpp | 5 +- src/opr/include/megbrain/opr/dnn/rnn.h | 30 +- 49 files changed, 1838 insertions(+), 1938 deletions(-) delete mode 100644 dnn/src/cuda/lstm/opr_impl.cpp delete mode 100644 dnn/src/cuda/lstm/opr_impl.h delete mode 100644 dnn/src/cuda/lstm/utils.cpp delete mode 100644 dnn/src/cuda/lstm/utils.h delete mode 100644 dnn/src/cuda/lstm_cell/opr_impl.cpp delete mode 100644 dnn/src/cuda/lstm_cell/opr_impl.h delete mode 100644 dnn/src/cuda/rnn/opr_impl.cpp delete mode 100644 dnn/src/cuda/rnn/opr_impl.h delete mode 100644 dnn/src/cuda/rnn/utils.cpp delete mode 100644 dnn/src/cuda/rnn/utils.h delete mode 100644 dnn/src/cuda/rnn_cell/opr_impl.cpp delete mode 100644 dnn/src/cuda/rnn_cell/opr_impl.h delete mode 100644 dnn/src/naive/rnn/funcs.tpp delete mode 100644 dnn/test/common/rnn.h create mode 100644 dnn/test/naive/lstm.cpp create mode 100644 dnn/test/naive/lstmcell.cpp create mode 100644 dnn/test/naive/rnn_cell.cpp diff --git a/dnn/include/megdnn/oprs/nn.h b/dnn/include/megdnn/oprs/nn.h index a9ce7e30..38500626 100644 --- a/dnn/include/megdnn/oprs/nn.h +++ b/dnn/include/megdnn/oprs/nn.h @@ -2059,7 +2059,7 @@ public: _megdnn_tensor_in bias_ih, _megdnn_tensor_in hx, _megdnn_tensor_in weight_hh, _megdnn_tensor_in bias_hh, _megdnn_tensor_out dst, _megdnn_workspace workspace) = 0; - static void deduce_layout( + void deduce_layout( const TensorLayout& 
input, const TensorLayout& weight_ih, const TensorLayout& bias_ih, const TensorLayout& hx, const TensorLayout& weight_hh, const TensorLayout& bias_hh, diff --git a/dnn/scripts/opr_param_defs.py b/dnn/scripts/opr_param_defs.py index 46d6f61b..f6b36aa7 100755 --- a/dnn/scripts/opr_param_defs.py +++ b/dnn/scripts/opr_param_defs.py @@ -36,18 +36,13 @@ pdef('Axis').add_fields('int32', 'axis', 0) add_enum(Doc('Format', 'convolution data/filter/output format; see ' ':class:`RelayoutFormat` for more details'), 'NCHW = 0', 'NHWC = 1', 'NHWCD4 = 2', 'NCHW4 = 3', 'NCHW8 = 4', 'NCHW32 = 5', 'NCHW88 = 6', - 'NCHW44 = 7', 'NCHW44_DOT = 8', + 'NCHW44 = 7','NCHW44_DOT = 8', Doc('NCHW_WINOGRAD = 9', 'NCHW layout with weights tranformed by winograd'), - Doc('NCHW88_WINOGRAD = 10', - 'NCHW88 layout with weights tranformed by winograd'), - Doc('NCHW44_WINOGRAD = 11', - 'NCHW44 layout with weights tranformed by winograd'), - Doc('NCHW4_NCHW32 = 12', - 'NCHW4_NCHW32 means input tensors are nchw4 layout, output tensor is nchw32 layout'), - Doc('NCHW32_NCHW4 = 13', - 'NCHW32_NCHW4 means input tensors are nchw32 layout, output tensor is nchw4 layout'), - Doc('NCHW4_NCHW = 14', - 'NCHW4_NCHW means input tensors are nchw4 layout, output tensor is nchw layout'), + Doc('NCHW88_WINOGRAD = 10', 'NCHW88 layout with weights tranformed by winograd'), + Doc('NCHW44_WINOGRAD = 11', 'NCHW44 layout with weights tranformed by winograd'), + Doc('NCHW4_NCHW32 = 12', 'NCHW4_NCHW32 means input tensors are nchw4 layout, output tensor is nchw32 layout'), + Doc('NCHW32_NCHW4 = 13', 'NCHW32_NCHW4 means input tensors are nchw32 layout, output tensor is nchw4 layout'), + Doc('NCHW4_NCHW = 14', 'NCHW4_NCHW means input tensors are nchw4 layout, output tensor is nchw layout'), Doc('NHWC_NCHW = 15', 'NHWC_NCHW means input tensors are nhwc layout, ' 'output tensor is nchw layout'), Doc('NHWC_NCHW4_IC_SMALL = 16', 'NHWC_NCHW4_IC_SMALL means input tensors are nhwc(c < 4) layout, ' @@ -101,13 +96,10 @@ pdef('Axis').add_fields('int32', 'axis', 0) add_enum(Doc('Format', 'convolution data/filter/output format; see ' ':class:`RelayoutFormat` for more details'), 'NCHW = 0', 'NHWC = 1', 'NHWCD4 = 2', 'NCHW4 = 3', 'NCHW8 = 4', 'NCHW32 = 5', 'NCHW88 = 6', - 'NCHW44 = 7', 'NCHW44_DOT = 8', - Doc('NCHW4_NCHW32 = 9', - 'NCHW4_NCHW32 means input tensors are nchw4 layout, output tensor is nchw32 layout'), - Doc('NCHW32_NCHW4 = 10', - 'NCHW32_NCHW4 means input tensors are nchw32 layout, output tensor is nchw4 layout'), - Doc('NCHW4_NCHW = 11', - 'NCHW4_NCHW means input tensors are nchw4 layout, output tensor is nchw layout'), + 'NCHW44 = 7','NCHW44_DOT = 8', + Doc('NCHW4_NCHW32 = 9', 'NCHW4_NCHW32 means input tensors are nchw4 layout, output tensor is nchw32 layout'), + Doc('NCHW32_NCHW4 = 10', 'NCHW32_NCHW4 means input tensors are nchw32 layout, output tensor is nchw4 layout'), + Doc('NCHW4_NCHW = 11', 'NCHW4_NCHW means input tensors are nchw4 layout, output tensor is nchw layout'), Doc('NHWC_NCHW = 12', 'NHWC_NCHW means input tensors are nhwc layout, ' 'output tensor is nchw layout'), Doc('NHWC_NCHW4_IC_SMALL = 13', 'NHWC_NCHW4_IC_SMALL means input tensors are nhwc(c < 4) layout, ' @@ -115,11 +107,11 @@ pdef('Axis').add_fields('int32', 'axis', 0) Doc('NCHW_NCHW4_IC_SMALL = 14', 'NCHW_NCHW4_IC_SMALL means input tensors are nchw(c < 4) layout, ' 'output tensor is nchw4 layout, padding c=4'), Doc('CHWN4 = 15', 'CHWN4 is currently only used on Nvidia platform for fast implementation ' - 'of convolution using CUDA/SASS. 
The channels are splitted to groups of 4 channels.'), + 'of convolution using CUDA/SASS. The channels are splitted to groups of 4 channels.'), Doc('NCHW64 = 16', 'NCHW64 is designed for convolution implementation to utilizing TensorCore ' - 'instructions for 4-bit integers on Nvidia platforms'), + 'instructions for 4-bit integers on Nvidia platforms'), Doc('NCHW4_NHWC = 17', 'NCHW4_NHWC means input tensors are nchw4 layout, output tensor is nhwc layout')). - add_enum_alias('ComputeMode', 'ConvolutionV1', name_field='compute_mode') + add_enum_alias('ComputeMode', 'ConvolutionV1',name_field='compute_mode') ) @@ -141,7 +133,7 @@ pdef('Axis').add_fields('int32', 'axis', 0) add_enum_alias('ConvMode', 'ConvolutionV0', 'Mode'). add_enum('PoolMode', 'AVERAGE = 0', 'MAX = 1'). add_enum('NonlineMode', 'IDENTITY = 0', 'RELU = 1', 'SIGMOID = 2'). - add_fields('uint32', 'pool_shape_h', 1, 'pool_shape_w', 1, 'pool_stride_h', 1, 'pool_stride_w', 1, + add_fields('uint32', 'pool_shape_h', 1, 'pool_shape_w', 1, 'pool_stride_h', 1, 'pool_stride_w', 1, \ 'pool_pad_h', 0, 'pool_pad_w', 0, 'conv_stride_h', 1, 'conv_stride_w', 1, 'conv_pad_h', 0, 'conv_pad_w', 0)) (pdef('ConvBias', 'legacy conv_bias', version=0, is_legacy=True). @@ -224,8 +216,8 @@ pdef('Axis').add_fields('int32', 'axis', 0) (pdef('SeparableConv'). add_enum_alias('Mode', 'ConvolutionV0'). add_enum('BorderMode', 'BORDER_REPLICATE = 0', 'BORDER_REFLECT = 1', - 'BORDER_REFLECT_101 = 2', 'BORDER_WRAP = 3', - 'BORDER_CONSTANT = 4', 'BORDER_TRANSPARENT = 5', 'BORDER_ISOLATED = 6'). + 'BORDER_REFLECT_101 = 2','BORDER_WRAP = 3', + 'BORDER_CONSTANT = 4', 'BORDER_TRANSPARENT = 5','BORDER_ISOLATED = 6'). add_fields('bool', 'is_symm_kernel', 'true'). add_fields('uint32', 'pad_h', 0, 'pad_w', 0, 'stride_h', 1, 'stride_w', 1, 'ksize_h', 3, 'ksize_w', 3, 'anchor_h', 1, 'anchor_w', 1)) @@ -255,7 +247,7 @@ pdef('Axis').add_fields('int32', 'axis', 0) ) (pdef('Pooling', version=1). - add_enum_alias('Mode', 'PoolingV0'). + add_enum_alias('Mode','PoolingV0'). add_fields('uint32', 'pad_h', 0, 'pad_w', 0, 'stride_h', 2, 'stride_w', 2, 'window_h', 2, 'window_w', 2). add_enum_alias('Format', 'Convolution') @@ -310,8 +302,7 @@ pdef('Axis').add_fields('int32', 'axis', 0) ). add_fields('float32', 'scale', '1.f')) -INTERP_MODES = ['NEAREST = 0', 'LINEAR = 1', - 'AREA = 2', 'CUBIC = 3', 'LANCZOS4 = 4'] +INTERP_MODES = ['NEAREST = 0', 'LINEAR = 1', 'AREA = 2', 'CUBIC = 3', 'LANCZOS4 = 4'] BORDER_MODES = [Doc('REPLICATE = 0', 'aaaaaa|abcdefgh|hhhhhhh'), Doc('REFLECT = 1', 'fedcba|abcdefgh|hgfedcb'), Doc('REFLECT_101 = 2', 'gfedcb|abcdefgh|gfedcba'), @@ -332,8 +323,8 @@ BORDER_MODES = [Doc('REPLICATE = 0', 'aaaaaa|abcdefgh|hhhhhhh'), add_fields('float32', Doc('border_val', 'used for CONSTANT bmode'), '.0f')) (pdef('WarpPerspective', version=2). - add_enum_alias('InterpolationMode', 'WarpPerspectiveV1', name_field="imode"). - add_enum_alias('BorderMode', 'WarpPerspectiveV1', name_field="bmode"). + add_enum_alias('InterpolationMode','WarpPerspectiveV1',name_field="imode"). + add_enum_alias('BorderMode','WarpPerspectiveV1',name_field="bmode"). add_enum_alias('Format', 'Convolution'). 
add_fields('float32', Doc('border_val', 'used for CONSTANT bmode'), '.0f')) @@ -408,7 +399,7 @@ pdef('Elemwise').add_enum( Doc('RMULH = 43', 'binary: rounded higher l bits of x * y, where l is the bit ' 'length of x.'), - Doc('ATAN2 = 44', 'binary: atan2(y,x)'), + Doc('ATAN2 = 44','binary: atan2(y,x)'), Doc('ERF = 45', 'unary: erf(x)'), Doc('ERFINV = 46', 'unary: inverse function of erf(x)'), Doc('ERFC = 47', 'unary: erfc(x)'), @@ -643,7 +634,7 @@ Currently, ```DEFAULT``` mode means: Doc('axis', 'axis along which reduction is performed; if INT_MAX is given, ' 'reduce to given target shape (only used in megbrain)'), - (1 << 31)-1). + (1<<31)-1). add_enum('DataType', Doc('DEFAULT = 0', ''' @@ -698,7 +689,7 @@ Currently, ```DEFAULT``` mode means: add_fields('int32', Doc('axis', 'axis along which cumsum is performed, default with INT_MAX'), - (1 << 31)-1). + (1<<31)-1). add_fields('bool', Doc('exclusive', 'whether the current element is taken into account'), @@ -770,8 +761,7 @@ pdef('Sleep').add_fields('float32', Doc('time', 'time to sleep in seconds'), 0) (pdef('UniformRNG', version=1). add_fields('uint64', 'seed', 0). add_fields( - 'dtype', Doc( - 'dtype', 'The dtype of output Tensor. Only support Float32.'), + 'dtype', Doc('dtype', 'The dtype of output Tensor. Only support Float32.'), 'DTypeEnum::Float32')) (pdef('GaussianRNG', version=0, is_legacy=True). @@ -782,8 +772,7 @@ pdef('Sleep').add_fields('float32', Doc('time', 'time to sleep in seconds'), 0) add_fields('uint64', 'seed', 0). add_fields('float32', 'mean', 0, 'std', 1). add_fields( - 'dtype', Doc( - 'dtype', 'The dtype of output Tensor. Only support Float32.'), + 'dtype', Doc('dtype', 'The dtype of output Tensor. Only support Float32.'), 'DTypeEnum::Float32')) (pdef('GammaRNG'). @@ -830,7 +819,7 @@ pdef('Sleep').add_fields('float32', Doc('time', 'time to sleep in seconds'), 0) ('YUV2GRAY_NV12', 'BT601_YUV2GRAY_NV12'), ('YUV2GRAY_YV12', 'BT601_YUV2GRAY_YV12'), ('YUV2GRAY_YU12', 'BT601_YUV2GRAY_YU12')], - name_field='mode')) + name_field = 'mode')) (pdef('WarpAffine', version=0, is_legacy=True) .add_enum_alias('InterpolationMode', 'WarpPerspectiveV1', name_field='imode') @@ -853,7 +842,7 @@ pdef('Sleep').add_fields('float32', Doc('time', 'time to sleep in seconds'), 0) (pdef('GaussianBlur') .add_enum_alias('BorderMode', 'WarpPerspectiveV1', name_field='border_mode') .add_fields('uint32', 'kernel_height', 0, 'kernel_width', 0) - .add_fields('float32', 'sigma_x', '0.f', 'sigma_y', '0.f')) + .add_fields('float32','sigma_x', '0.f', 'sigma_y', '0.f')) (pdef('Resize', version=0, is_legacy=True) .add_enum_alias('InterpolationMode', 'WarpPerspectiveV1', name_field='imode')) @@ -866,7 +855,7 @@ pdef('Sleep').add_fields('float32', Doc('time', 'time to sleep in seconds'), 0) .add_enum_alias('InterpolationMode', 'WarpPerspectiveV1', name_field='imode') .add_enum_alias('Format', 'Convolution', default=1)) -(pdef('Remap', version=0, is_legacy=True) +(pdef('Remap', version=0,is_legacy=True) .add_enum_alias('InterpolationMode', 'WarpPerspectiveV1', name_field='imode') .add_enum_alias('BorderMode', 'WarpPerspectiveV1', name_field='border_type') .add_enum_alias('Format', 'ConvolutionV0', default=1) @@ -920,8 +909,8 @@ pdef('Sleep').add_fields('float32', Doc('time', 'time to sleep in seconds'), 0) (pdef('SeparableConv3D'). add_enum_alias('Mode', 'Convolution3D'). add_enum('BorderMode', 'BORDER_REPLICATE = 0', 'BORDER_REFLECT = 1', - 'BORDER_REFLECT_101 = 2', 'BORDER_WRAP = 3', - 'BORDER_CONSTANT = 4', 'BORDER_TRANSPARENT = 5', 'BORDER_ISOLATED = 6'). 
+ 'BORDER_REFLECT_101 = 2','BORDER_WRAP = 3', + 'BORDER_CONSTANT = 4', 'BORDER_TRANSPARENT = 5','BORDER_ISOLATED = 6'). add_fields('bool', 'is_symm_kernel', 'true'). add_fields('uint32', 'pad_d', 0, 'pad_h', 0, 'pad_w', 0, 'stride_d', 0, 'stride_h', 1, 'stride_w', 1, @@ -1034,10 +1023,10 @@ Note: NCHW_NCHW4_WEIGHT will auto pad oc and ic, you should remove oc in later o 'NCHW_NCHW4 = 24', 'NCHW4_NCHW = 25', 'NCHW_NCHW4_WEIGHT = 26', - 'NCHW_NCHW64 = 27', - 'NCHW64_NCHW = 28', - 'NCHW_NHWC = 29', - 'NHWC_NCHW = 30', + 'NCHW_NCHW64 = 27', + 'NCHW64_NCHW = 28', + 'NCHW_NHWC = 29', + 'NHWC_NCHW = 30', ) ) @@ -1059,7 +1048,7 @@ Note: NCHW_NCHW4_WEIGHT will auto pad oc and ic, you should remove oc in later o add_fields('bool', 'is_symm_kernel', 'true'). add_fields('uint32', 'ksize_h', 3, 'ksize_w', 3, 'anchor_h', 1, 'anchor_w', 1)) -(pdef('LocalShare', 'Local share convolution', version=0, is_legacy=True). +(pdef('LocalShare', 'Local share convolution',version=0, is_legacy=True). add_enum_alias('Mode', 'ConvolutionV0'). add_fields( 'uint32', @@ -1100,7 +1089,7 @@ Note: NCHW_NCHW4_WEIGHT will auto pad oc and ic, you should remove oc in later o ) -(pdef('ROIAlign', version=0, is_legacy=True). +(pdef('ROIAlign',version=0,is_legacy=True). add_enum('Mode', 'MAX = 0', 'AVERAGE = 1', name_field='mode'). add_enum_alias('Format', 'ConvolutionV0'). add_fields('float32', 'spatial_scale', '1.0'). @@ -1144,7 +1133,7 @@ Note: NCHW_NCHW4_WEIGHT will auto pad oc and ic, you should remove oc in later o Doc('part_size', 'size of each deformable part'), 1, Doc('sample_per_part', 'sample count of each bbox'), 1)) -(pdef('BatchConvBias', 'Batch convolution (unshare weights on the batch dimension)', version=0, is_legacy=True). +(pdef('BatchConvBias', 'Batch convolution (unshare weights on the batch dimension)',version=0,is_legacy=True). add_enum_alias('NonlineMode', 'ConvBiasV0'). add_enum_alias('Mode', 'ConvolutionV0'). add_fields( @@ -1163,7 +1152,7 @@ Note: NCHW_NCHW4_WEIGHT will auto pad oc and ic, you should remove oc in later o add_enum_alias('ComputeMode', 'ConvolutionV1', name_field="compute_mode") ) -(pdef('BatchConvBias', 'Batch convolution (unshare weights on the batch dimension)', version=1). +(pdef('BatchConvBias', 'Batch convolution (unshare weights on the batch dimension)',version=1). add_enum_alias('NonlineMode', 'ConvBiasV0'). add_enum_alias('Mode', 'ConvolutionV0'). add_fields( @@ -1183,8 +1172,8 @@ Note: NCHW_NCHW4_WEIGHT will auto pad oc and ic, you should remove oc in later o ) (pdef('FakeQuant'). - add_fields('int32', 'qmin', '-2147483648'). - add_fields('int32', 'qmax', '2147483647') + add_fields('int32','qmin','-2147483648'). + add_fields('int32','qmax','2147483647') ) (pdef('TQT'). add_fields('int32', 'qmin', '-2147483648'). @@ -1203,13 +1192,13 @@ PADDING_MODES = [Doc('REPLICATE = 0', 'aaaaaa|abcdefgh|hhhhhhh'), Doc('REFLECT = 1', 'fedcba|abcdefgh|hgfedcb'), Doc('CONSTANT = 2', 'iiiiii|abcdefgh|iiiiiii')] (pdef('Padding'). - add_fields('uint32', Doc('front_offset_dim0', 'offset in dim 0'), 0). - add_fields('uint32', Doc('front_offset_dim1', 'offset in dim 1'), 0). - add_fields('uint32', Doc('front_offset_dim2', 'offset in dim 2'), 0). - add_fields('uint32', Doc('front_offset_dim3', 'offset in dim 3'), 0). - add_fields('uint32', Doc('front_offset_dim4', 'offset in dim 4'), 0). - add_fields('uint32', Doc('front_offset_dim5', 'offset in dim 5'), 0). - add_fields('uint32', Doc('front_offset_dim6', 'offset in dim 6'), 0). + add_fields('uint32', Doc('front_offset_dim0','offset in dim 0'), 0). 
+ add_fields('uint32', Doc('front_offset_dim1','offset in dim 1'), 0). + add_fields('uint32', Doc('front_offset_dim2','offset in dim 2'), 0). + add_fields('uint32', Doc('front_offset_dim3','offset in dim 3'), 0). + add_fields('uint32', Doc('front_offset_dim4','offset in dim 4'), 0). + add_fields('uint32', Doc('front_offset_dim5','offset in dim 5'), 0). + add_fields('uint32', Doc('front_offset_dim6','offset in dim 6'), 0). add_fields('uint32', Doc('back_offset_dim0', 'back offset in dim0'), 0). add_fields('uint32', Doc('back_offset_dim1', 'back offset in dim1'), 0). add_fields('uint32', Doc('back_offset_dim2', 'back offset in dim2'), 0). @@ -1217,7 +1206,7 @@ PADDING_MODES = [Doc('REPLICATE = 0', 'aaaaaa|abcdefgh|hhhhhhh'), add_fields('uint32', Doc('back_offset_dim4', 'back offset in dim4'), 0). add_fields('uint32', Doc('back_offset_dim5', 'back offset in dim5'), 0). add_fields('uint32', Doc('back_offset_dim6', 'back offset in dim6'), 0). - add_fields('float32', Doc('padding_val', 'param of padding opr'), 0). + add_fields('float32', Doc('padding_val','param of padding opr'), 0). add_enum('PaddingMode', *PADDING_MODES, name_field='padding_mode', default=2, member_alias=[(i, 'PADDING_{}'.format(i)) for i in PADDING_MODES] @@ -1241,22 +1230,21 @@ PADDING_MODES = [Doc('REPLICATE = 0', 'aaaaaa|abcdefgh|hhhhhhh'), ) (pdef('RNN'). - add_fields('uint32', 'num_layers', '1'). - add_fields('bool', 'bidirectional', 'false'). - add_fields('bool', 'bias', 'true'). - add_fields('uint32', 'hidden_size', '128'). - add_fields('uint32', 'proj_size', '0'). - add_fields('float32', 'dropout', '0.f'). + add_fields('uint32', Doc('num_layers', 'Number of recurrent layers'), '1'). + add_fields('bool', Doc('bidirectional', 'If becomes a bidirectional RNN'), 'false'). + add_fields('bool', Doc('bias', 'If the layer use bias weights b_ih and b_hh'), 'true'). + add_fields('uint32', Doc('hidden_size', 'The number of features in the hidden state'), '128'). + add_fields('float32', Doc('dropout', 'If introduce a Dropout layer on the outputs of each RNN layer'), '0.f'). add_enum_alias('NonlineMode', 'RNNCell'). add_enum_alias('FwdMode', 'BN', name_field='fwd_mode') ) (pdef('LSTM'). - add_fields('uint32', 'num_layers', '1'). - add_fields('bool', 'bidirectional', 'false'). - add_fields('bool', 'bias', 'true'). - add_fields('uint32', 'hidden_size', '128'). - add_fields('uint32', 'proj_size', '0'). - add_fields('float32', 'dropout', '0.f'). + add_fields('uint32', Doc('num_layers', 'Number of recurrent layers'), '1'). + add_fields('bool', Doc('bidirectional', 'If becomes a bidirectional LSTM'), 'false'). + add_fields('bool', Doc('bias', 'If the layer use bias weights b_ih and b_hh'), 'true'). + add_fields('uint32', Doc('hidden_size', 'The number of features in the hidden state'), '128'). + add_fields('uint32', Doc('proj_size', 'If use LSTM with projections of corresponding size'), '0'). + add_fields('float32', Doc('dropout', 'If introduce a Dropout layer on the outputs of each LSTM layer'), '0.f'). 
add_enum_alias('FwdMode', 'BN', name_field='fwd_mode') ) diff --git a/dnn/src/common/handle.cpp b/dnn/src/common/handle.cpp index 17f4718c..ee9c3372 100644 --- a/dnn/src/common/handle.cpp +++ b/dnn/src/common/handle.cpp @@ -224,5 +224,6 @@ std::unique_ptr Handle::create_operator() { #define INST(opr) template std::unique_ptr Handle::create_operator(); MEGDNN_FOREACH_OPR_CLASS(INST) + #undef INST // vim: syntax=cpp.doxygen diff --git a/dnn/src/common/lstm.cpp b/dnn/src/common/lstm.cpp index 4275a582..211cdf68 100644 --- a/dnn/src/common/lstm.cpp +++ b/dnn/src/common/lstm.cpp @@ -10,19 +10,12 @@ */ #include "megdnn/oprs.h" #include "src/common/utils.h" -// #include "src/cuda/lstm/utils.h" namespace megdnn { -/*size_t get_reserve_size(Handle* handle, megdnn::LSTMForward::Param& param, const -TensorLayout& input) { #if CUDNN_MAJOR >= 6 auto holder = -megdnn::cuda::lstm::get_RNNDescHolder_v6(handle, param, input); return -holder.reserveSpace_size; # else return 0; #endif -}*/ - void LSTM::deduce_layout( const TensorLayout& input, const TensorLayout& hx, const TensorLayout& cx, - const TensorLayout& flatten_weights, TensorLayout& output, TensorLayout& hy, + const TensorLayout& /*flatten_weights*/, TensorLayout& output, TensorLayout& hy, TensorLayout& cy, TensorLayout& reserve_space) { // input: [seq_len, batch_size, input_size] // hx: [D * num_layers, batch_size, hidden_size] @@ -34,24 +27,30 @@ void LSTM::deduce_layout( TensorShape{seq_len, batch_size, D * hidden_size}, input.dtype); hy = TensorLayout(hx); cy = TensorLayout(cx); - // reserve_space = {{get_reserve_size(this->handle(), param(), input)}, - // dtype::Byte()}; - reserve_space = {{get_reserve_size_in_bytes(input)}, dtype::Byte()}; + reserve_space = {{get_reserve_size_in_bytes(input)}, input.dtype}; } void LSTM::check_exec( const TensorLayout& input, const TensorLayout& hx, const TensorLayout& cx, const TensorLayout& flatten_weights, const TensorLayout& output, const TensorLayout& hy, const TensorLayout& cy, - const TensorLayout& reserve_space, size_t workspace_in_bytes) { + const TensorLayout& /*reserve_space*/, size_t /*workspace_in_bytes*/) { auto errmsg = [&]() { std::string msg; msg.append("input="); msg.append(input.to_string()); + msg.append(", output="); + msg.append(output.to_string()); msg.append(", hx="); msg.append(hx.to_string()); msg.append(", cx="); msg.append(cx.to_string()); + msg.append(", hy="); + msg.append(hy.to_string()); + msg.append(", cy="); + msg.append(cy.to_string()); + msg.append(", flatten_weights="); + msg.append(flatten_weights.to_string()); msg.append(", hidden_size="); msg.append(std::to_string(param().hidden_size)); msg.append(", num_layers="); @@ -61,9 +60,29 @@ void LSTM::check_exec( return msg; }; size_t D = param().bidirectional ? 2 : 1; + size_t b = param().bias ? 
1 : 0; size_t num_layers = param().num_layers; + size_t input_size = input.shape[2]; + size_t gate_hidden_size = 4 * param().hidden_size; + // first layer{ weight_ih_l[k][_reverse].shape = (4*hidden_size, input_size) + // weight_hh_l[k][_reverse].shape = (4*hidden_size, hidden_size)} + // other layers{ weight_ih_l[k][_reverse].shape = (4*hidden_size, num_directions * + // hidden_size) + // weight_hh_l[k][_reverse].shape = (4*hidden_size, hidden_size)} + // bias: 2 * num_directions * num_layers + // size_dim1 = D * first layer + (layer -1) * other layer + bias + size_t size_dim1 = D * (input_size + param().hidden_size) + + (num_layers - 1) * D * ((D + 1) * param().hidden_size) + + b * 2 * D * num_layers; #define ASSERT_BRIEF(_content) megdnn_assert(_content, "%s", errmsg().c_str()); + ASSERT_BRIEF(input.ndim == 3) + ASSERT_BRIEF(output.ndim == 3) + ASSERT_BRIEF(flatten_weights.shape[0] == gate_hidden_size) + ASSERT_BRIEF(flatten_weights.shape[0] == size_dim1) + ASSERT_BRIEF(output.shape[0] == input.shape[0]) + ASSERT_BRIEF(output.shape[1] == input.shape[1]) + ASSERT_BRIEF(output.shape[2] == D * param().hidden_size) ASSERT_BRIEF(hx.ndim == 3) ASSERT_BRIEF(hx.shape[0] == D * num_layers) ASSERT_BRIEF(hx.shape[1] == input.shape[1]) // batch_size @@ -72,14 +91,22 @@ void LSTM::check_exec( ASSERT_BRIEF(cx.shape[0] == D * num_layers) ASSERT_BRIEF(cx.shape[1] == input.shape[1]) // batch_size ASSERT_BRIEF(cx.shape[2] == param().hidden_size) + ASSERT_BRIEF(hy.ndim == 3) + ASSERT_BRIEF(hy.shape[0] == D * num_layers) + ASSERT_BRIEF(hy.shape[1] == input.shape[1]) // batch_size + ASSERT_BRIEF(hy.shape[2] == param().hidden_size) + ASSERT_BRIEF(cy.ndim == 3) + ASSERT_BRIEF(cy.shape[0] == D * num_layers) + ASSERT_BRIEF(cy.shape[1] == input.shape[1]) // batch_size + ASSERT_BRIEF(cy.shape[2] == param().hidden_size) #undef ASSERT_BRIEF } void LSTMBackward::deduce_layout( - const TensorLayout& x, const TensorLayout& y, const TensorLayout& hx, - const TensorLayout& cx, const TensorLayout& dy, const TensorLayout& dhy, - const TensorLayout& dcy, const TensorLayout& flatten_weights, - const TensorLayout& reserve_space, TensorLayout& dx, TensorLayout& dhx, + const TensorLayout& x, const TensorLayout& /*y*/, const TensorLayout& hx, + const TensorLayout& cx, const TensorLayout& /*dy*/, const TensorLayout& /*dhy*/, + const TensorLayout& /*dcy*/, const TensorLayout& flatten_weights, + const TensorLayout& /*reserve_space*/, TensorLayout& dx, TensorLayout& dhx, TensorLayout& dcx, TensorLayout& dw) { dx = x; dhx = hx; @@ -87,12 +114,14 @@ void LSTMBackward::deduce_layout( dw = flatten_weights; } +// TODO: add shape check of BWD void LSTMBackward::check_exec( - const TensorLayout& x, const TensorLayout& y, const TensorLayout& hx, - const TensorLayout& cx, const TensorLayout& dy, const TensorLayout& dhy, - const TensorLayout& dcy, const TensorLayout& flatten_weights, - const TensorLayout& reserve_space, const TensorLayout& dx, - const TensorLayout& dhx, const TensorLayout& dcx, const TensorLayout& dw, - size_t workspace_in_bytes) {} + const TensorLayout& /*x*/, const TensorLayout& /*y*/, + const TensorLayout& /*hx*/, const TensorLayout& /*cx*/, + const TensorLayout& /*dy*/, const TensorLayout& /*dhy*/, + const TensorLayout& /*dcy*/, const TensorLayout& /*flatten_weights*/, + const TensorLayout& /*reserve_space*/, const TensorLayout& /*dx*/, + const TensorLayout& /*dhx*/, const TensorLayout& /*dcx*/, + const TensorLayout& /*dw*/, size_t /*workspace_in_bytes*/) {} } // namespace megdnn \ No newline at end of file diff --git 
a/dnn/src/common/lstm_cell.cpp b/dnn/src/common/lstm_cell.cpp index 35abdf4e..859ed782 100644 --- a/dnn/src/common/lstm_cell.cpp +++ b/dnn/src/common/lstm_cell.cpp @@ -20,8 +20,6 @@ void LSTMCell::deduce_layout( const TensorLayout& weight_hh, const TensorLayout& bias_hh, const TensorLayout& cx, TensorLayout& h_new, TensorLayout& c_new, TensorLayout& gates) { - // size_t batch_size = hx.shape[0]; - // size_t hidden_size = hx.shape[1]; h_new = TensorLayout(hx, hx.dtype); c_new = TensorLayout(cx, cx.dtype); auto opr = handle()->create_operator(); @@ -36,6 +34,39 @@ void LSTMCell::check_exec( const TensorLayout& cx, const TensorLayout& h_new, const TensorLayout& c_new, const TensorLayout& gates, size_t workspace_in_bytes) { TensorLayout h_new_expected, c_new_expected, gates_expected; + auto errmsg = [&]() { + std::string msg; + msg.append("input="); + msg.append(input.to_string()); + msg.append(", weight_ih="); + msg.append(weight_ih.to_string()); + msg.append(", bias_ih="); + msg.append(bias_ih.to_string()); + msg.append(", hx="); + msg.append(hx.to_string()); + msg.append(", weight_hh="); + msg.append(weight_hh.to_string()); + msg.append(", bias_hh="); + msg.append(bias_hh.to_string()); + msg.append(", cx="); + msg.append(cx.to_string()); + return msg; + }; +#define ASSERT_BRIEF(_content) megdnn_assert(_content, "%s", errmsg().c_str()); + + ASSERT_BRIEF(input.ndim == 2) + ASSERT_BRIEF(input.shape[1] == weight_ih.shape[1]) + ASSERT_BRIEF(weight_ih.shape[0] == weight_hh.shape[0]) + ASSERT_BRIEF(weight_hh.shape[0] == 4 * weight_hh.shape[1]) + ASSERT_BRIEF(bias_ih.shape[0] == bias_hh.shape[0]) + ASSERT_BRIEF(hx.ndim == 2) + ASSERT_BRIEF(hx.shape[0] == input.shape[0]) + ASSERT_BRIEF(hx.shape[1] == cx.shape[1]) // hidden_size + ASSERT_BRIEF(cx.ndim == 2) + ASSERT_BRIEF(cx.shape[0] == input.shape[0]) + ASSERT_BRIEF(cx.shape[1] == weight_hh.shape[1]) +#undef ASSERT_BRIEF + deduce_layout( input, weight_ih, bias_ih, hx, weight_hh, bias_hh, cx, h_new_expected, c_new_expected, gates_expected); @@ -57,15 +88,15 @@ size_t get_workspace_in_bytes( const TensorLayout& input, const TensorLayout& weight_ih, const TensorLayout& bias_ih, const TensorLayout& hx, const TensorLayout& weight_hh, const TensorLayout& bias_hh, - const TensorLayout& cx, const TensorLayout& h_new, const TensorLayout& c_new, - const TensorLayout& gates, Handle* handle) { + const TensorLayout& /*cx*/, const TensorLayout& /*h_new*/, + const TensorLayout& /*c_new*/, const TensorLayout& gates, Handle* handle) { TensorLayout tmp_layout; auto opr = handle->create_operator(); opr->param().nonlineMode = param::RNNCell::NonlineMode::IDENTITY; opr->deduce_layout(input, weight_ih, bias_ih, hx, weight_hh, bias_hh, tmp_layout); size_t rnn_cell_need = opr->get_workspace_in_bytes( input, weight_ih, bias_ih, hx, weight_hh, bias_hh, gates); - size_t lstm_cell_need = tmp_layout.span().dist_byte(); + size_t lstm_cell_need = 2 * tmp_layout.span().dist_byte(); return rnn_cell_need > lstm_cell_need ? 
rnn_cell_need : lstm_cell_need; } @@ -76,37 +107,48 @@ void exec( _megdnn_tensor_out gates, _megdnn_workspace workspace, Handle* handle) { auto opr = handle->create_operator(); opr->param().nonlineMode = param::RNNCell::NonlineMode::IDENTITY; - /*TensorLayout tmp_layout; - opr->deduce_layout(input.layout, weight_ih.layout, - hx.layout, weight_hh.layout, - bias.layout, tmp_layout); - auto workspace_ptr = workspace.raw_ptr; - // TensorND tmp{static_cast(workspace.raw_ptr), tmp_layout}; - TensorND tmp{workspace_ptr, tmp_layout}; - auto new_workspace = Workspace{workspace_ptr + tmp.layout.span().dist_byte(), - workspace.size - - tmp.layout.span().dist_byte()};*/ - // opr->exec(input, weight_ih, hx, weight_hh, bias, tmp, new_workspace); opr->exec(input, weight_ih, bias_ih, hx, weight_hh, bias_hh, gates, workspace); // activation - // size_t batch_size = tmp.layout.shape[0]; size_t batch_size = hx.layout.shape[0]; size_t hidden_size = hx.layout.shape[1]; - // sigmoid: i f o - // TensorLayout gates_ifo_layout{TensorShape({batch_size, hidden_size * 3}), - // tmp.layout.dtype}; - TensorND tmp{static_cast(workspace.raw_ptr), gates.layout}; + + auto copy_opr = handle->create_operator(); + TensorND copy_gates{static_cast(workspace.raw_ptr), gates.layout}; + TensorLayout hidden_layout{TensorShape{hidden_size}, hx.layout.dtype}; + TensorLayout gateinfo_layout{TensorShape{batch_size, hidden_size}, hx.layout.dtype}; + for (size_t i = 0; i < batch_size; i++) { + for (size_t j = 0; j < 4; j++) { + TensorND half_step_states{ + // output + static_cast(gates.raw_ptr()) + + (4 * i + j) * hidden_layout.span().dist_byte(), + hidden_layout}; + TensorND half_step_output{ + static_cast(copy_gates.raw_ptr()) + + j * gateinfo_layout.span().dist_byte() + + i * hidden_layout.span().dist_byte(), + hidden_layout}; + copy_opr->exec(half_step_states, half_step_output); + } + } + void* workspace_ptr = workspace.raw_ptr + copy_gates.layout.span().dist_byte(); + copy_opr->exec(copy_gates, gates); + + // sigmoid: i f + TensorND tmp{static_cast(workspace_ptr), copy_gates.layout}; TensorLayout gates_ifo_layout{ - TensorShape({batch_size, hidden_size * 3}), gates.layout.dtype}; - TensorND gates_ifo_origin{gates.raw_ptr(), gates_ifo_layout}; + TensorShape({batch_size, hidden_size * 2}), copy_gates.layout.dtype}; + TensorND gates_ifo_origin{copy_gates.raw_ptr(), gates_ifo_layout}; TensorND gates_ifo{tmp.raw_ptr(), gates_ifo_layout}; auto sigmoid = handle->create_operator(); sigmoid->param().mode = Elemwise::Param::Mode::SIGMOID; sigmoid->exec({gates_ifo_origin}, gates_ifo); // tanh: g - TensorLayout g_layout{TensorShape({batch_size, hidden_size}), gates.layout.dtype}; + TensorLayout g_layout{ + TensorShape({batch_size, hidden_size}), copy_gates.layout.dtype}; TensorND g_origin{ - static_cast(gates.raw_ptr()) + gates_ifo_layout.span().dist_byte(), + static_cast(copy_gates.raw_ptr()) + + gates_ifo_layout.span().dist_byte(), g_layout}; TensorND g{ static_cast(tmp.raw_ptr()) + gates_ifo_layout.span().dist_byte(), @@ -114,13 +156,24 @@ void exec( auto tanh = handle->create_operator(); tanh->param().mode = Elemwise::Param::Mode::TANH; tanh->exec({g_origin}, g); + // sigmoid: o + TensorLayout three_gates_ifo_layout{ + TensorShape({batch_size, hidden_size * 3}), copy_gates.layout.dtype}; + TensorLayout o_layout{ + TensorShape({batch_size, hidden_size}), copy_gates.layout.dtype}; + TensorND o_origin{ + static_cast(copy_gates.raw_ptr()) + + three_gates_ifo_layout.span().dist_byte(), + o_layout}; + TensorND o{ + static_cast(tmp.raw_ptr()) + + 
three_gates_ifo_layout.span().dist_byte(), + o_layout}; + sigmoid->exec({o_origin}, o); // extract i f o TensorND i{static_cast(tmp.raw_ptr()), g_layout}; TensorND f{ static_cast(tmp.raw_ptr()) + g_layout.span().dist_byte(), g_layout}; - TensorND o{ - static_cast(tmp.raw_ptr()) + g_layout.span().dist_byte() * 2, - g_layout}; // calculate new cell state auto elewise_mul_add = handle->create_operator(); elewise_mul_add->param().mode = Elemwise::Param::Mode::FUSE_MUL_ADD4; diff --git a/dnn/src/common/opr_trait.h b/dnn/src/common/opr_trait.h index 43e3a8b5..f1558e75 100644 --- a/dnn/src/common/opr_trait.h +++ b/dnn/src/common/opr_trait.h @@ -139,8 +139,12 @@ DEF(LayerNormForward, 6, true, true); DEF(LayerNormBackward, 8, true, true); DEF(DropoutForward, 3, true, true); DEF(DropoutBackward, 3, true, true); -DEF(RNNCellForward, 6, true, true); +DEF(RNNCellForward, 7, true, true); DEF(RNNForward, 6, true, true); +DEF(RNNBackward, 10, true, true); +DEF(LSTMCellForward, 10, true, true); +DEF(LSTMForward, 8, true, true); +DEF(LSTMBackward, 13, true, true); } // namespace megdnn // vim: syntax=cpp.doxygen diff --git a/dnn/src/common/rnn.cpp b/dnn/src/common/rnn.cpp index 2cf3b2f0..3bb54a24 100644 --- a/dnn/src/common/rnn.cpp +++ b/dnn/src/common/rnn.cpp @@ -16,10 +16,8 @@ namespace megdnn { void RNN::deduce_layout( const TensorLayout& input, const TensorLayout& hx, - const TensorLayout& flatten_weights, TensorLayout& output, TensorLayout& hy, + const TensorLayout& /*flatten_weights*/, TensorLayout& output, TensorLayout& hy, TensorLayout& reserve_space) { - // input: [seq_len, batch_size, input_size] - // hx: [D * num_layers, batch_size, hidden_size] size_t seq_len = input.shape[0]; size_t batch_size = input.shape[1]; size_t D = param().bidirectional ? 2 : 1; @@ -27,22 +25,26 @@ void RNN::deduce_layout( output = TensorLayout( TensorShape{seq_len, batch_size, D * hidden_size}, input.dtype); hy = TensorLayout(hx); - // reserve_space = {{get_reserve_size(this->handle(), param(), input)}, - // dtype::Byte()}; - reserve_space = {{get_reserve_size_in_bytes(input)}, dtype::Byte()}; + reserve_space = {{get_reserve_size_in_bytes(input)}, input.dtype}; } void RNN::check_exec( const TensorLayout& input, const TensorLayout& hx, const TensorLayout& flatten_weights, const TensorLayout& output, - const TensorLayout& hy, const TensorLayout& reserve_space, - size_t workspace_in_bytes) { + const TensorLayout& hy, const TensorLayout& /*reserve_space*/, + size_t /*workspace_in_bytes*/) { auto errmsg = [&]() { std::string msg; msg.append("input="); msg.append(input.to_string()); + msg.append(", output="); + msg.append(output.to_string()); msg.append(", hx="); msg.append(hx.to_string()); + msg.append(", flatten_weights="); + msg.append(flatten_weights.to_string()); + msg.append(", hy="); + msg.append(hy.to_string()); msg.append(", hidden_size="); msg.append(std::to_string(param().hidden_size)); msg.append(", num_layers="); @@ -52,20 +54,38 @@ void RNN::check_exec( return msg; }; size_t D = param().bidirectional ? 2 : 1; + size_t b = param().bias ? 
1 : 0; size_t num_layers = param().num_layers; + size_t input_size = input.shape[2]; + size_t gate_hidden_size = param().hidden_size; + // calculate size_dim1 the same as lstm + size_t size_dim1 = D * (input_size + param().hidden_size) + + (num_layers - 1) * D * ((D + 1) * param().hidden_size) + + b * 2 * D * num_layers; #define ASSERT_BRIEF(_content) megdnn_assert(_content, "%s", errmsg().c_str()); ASSERT_BRIEF(hx.ndim == 3) + ASSERT_BRIEF(input.ndim == 3) + ASSERT_BRIEF(output.ndim == 3) + ASSERT_BRIEF(hy.ndim == 3) + ASSERT_BRIEF(flatten_weights.shape[0] == gate_hidden_size) + ASSERT_BRIEF(flatten_weights.shape[0] == size_dim1) ASSERT_BRIEF(hx.shape[0] == D * num_layers) ASSERT_BRIEF(hx.shape[1] == input.shape[1]) // batch_size ASSERT_BRIEF(hx.shape[2] == param().hidden_size) + ASSERT_BRIEF(output.shape[0] == input.shape[0]) + ASSERT_BRIEF(output.shape[1] == input.shape[1]) + ASSERT_BRIEF(output.shape[2] == D * param().hidden_size) + ASSERT_BRIEF(hy.shape[0] == hx.shape[0]) + ASSERT_BRIEF(hy.shape[1] == hx.shape[1]) + ASSERT_BRIEF(hy.shape[2] == hx.shape[2]) #undef ASSERT_BRIEF } void RNNBackward::deduce_layout( - const TensorLayout& x, const TensorLayout& y, const TensorLayout& hx, - const TensorLayout& dy, const TensorLayout& dhy, - const TensorLayout& flatten_weights, const TensorLayout& reserve_space, + const TensorLayout& x, const TensorLayout& /*y*/, const TensorLayout& hx, + const TensorLayout& /*dy*/, const TensorLayout& /*dhy*/, + const TensorLayout& flatten_weights, const TensorLayout& /*reserve_space*/, TensorLayout& dx, TensorLayout& dhx, TensorLayout& dw) { dx = x; dhx = hx; @@ -73,10 +93,11 @@ void RNNBackward::deduce_layout( } void RNNBackward::check_exec( - const TensorLayout& x, const TensorLayout& y, const TensorLayout& hx, - const TensorLayout& dy, const TensorLayout& dhy, - const TensorLayout& flatten_weights, const TensorLayout& reserve_space, - const TensorLayout& dx, const TensorLayout& dhx, const TensorLayout& dw, - size_t workspace_in_bytes) {} + const TensorLayout& /*x*/, const TensorLayout& /*y*/, + const TensorLayout& /*hx*/, const TensorLayout& /*dy*/, + const TensorLayout& /*dhy*/, const TensorLayout& /*flatten_weights*/, + const TensorLayout& /*reserve_space*/, const TensorLayout& /*dx*/, + const TensorLayout& /*dhx*/, const TensorLayout& /*dw*/, + size_t /*workspace_in_bytes*/) {} } // namespace megdnn diff --git a/dnn/src/common/rnn_cell.cpp b/dnn/src/common/rnn_cell.cpp index 0cdbc259..aa30addb 100644 --- a/dnn/src/common/rnn_cell.cpp +++ b/dnn/src/common/rnn_cell.cpp @@ -16,16 +16,11 @@ namespace megdnn { void RNNCell::deduce_layout( const TensorLayout& input, const TensorLayout& weight_ih, - const TensorLayout& bias_ih, const TensorLayout& hx, - const TensorLayout& weight_hh, const TensorLayout& bias_hh, TensorLayout& dst) { - // megdnn_assert(hx.ndim == 2); + const TensorLayout& /*bias_ih*/, const TensorLayout& hx, + const TensorLayout& /*weight_hh*/, const TensorLayout& /*bias_hh*/, + TensorLayout& dst) { size_t batch_size = hx.shape[0]; - // size_t hidden_size = weight_hh.shape[1]; size_t gate_hidden_size = weight_ih.shape[0]; - // size_t input_size = weight_ih.shape[1]; - // megdnn_assert(input.shape[1] == input_size); - // megdnn_assert(hx.shape[1] == hidden_size); - // megdnn_assert_eq_dtype(input, hx); dst = TensorLayout(TensorShape({batch_size, gate_hidden_size}), input.dtype); } @@ -36,6 +31,37 @@ void RNNCell::check_exec( const TensorLayout& weight_hh, const TensorLayout& bias_hh, const TensorLayout& dst, size_t workspace_in_bytes) { 
TensorLayout dst_expected; + auto errmsg = [&]() { + std::string msg; + msg.append("input="); + msg.append(input.to_string()); + msg.append(", weight_ih="); + msg.append(weight_ih.to_string()); + msg.append(", bias_ih="); + msg.append(bias_ih.to_string()); + msg.append(", hx="); + msg.append(hx.to_string()); + msg.append(", weight_hh="); + msg.append(weight_hh.to_string()); + msg.append(", bias_hh="); + msg.append(bias_hh.to_string()); + msg.append(", dst="); + msg.append(dst.to_string()); + return msg; + }; +#define ASSERT_BRIEF(_content) megdnn_assert(_content, "%s", errmsg().c_str()); + + ASSERT_BRIEF(input.ndim == 2) + ASSERT_BRIEF(hx.ndim == 2) + ASSERT_BRIEF(hx.shape[0] == input.shape[0]) // batch + ASSERT_BRIEF(input.shape[1] == weight_ih.shape[1]) + ASSERT_BRIEF(hx.shape[0] == dst.shape[0]) // batch + ASSERT_BRIEF(hx.shape[1] == dst.shape[1]) + ASSERT_BRIEF(hx.shape[1] == weight_ih.shape[0]) // hidden_size + ASSERT_BRIEF(weight_ih.shape[0] == weight_hh.shape[0]) + ASSERT_BRIEF(weight_hh.shape[0] == weight_hh.shape[1]) + ASSERT_BRIEF(bias_ih.shape[0] == bias_hh.shape[0]) +#undef ASSERT_BRIEF megdnn_assert_eq_dtype(input, dst); megdnn_assert_eq_dtype(hx, dst); deduce_layout(input, weight_ih, bias_ih, hx, weight_hh, bias_hh, dst_expected); @@ -53,12 +79,15 @@ namespace rnn_cell { size_t get_workspace_in_bytes( const TensorLayout& input, const TensorLayout& weight_ih, - const TensorLayout& bias_ih, const TensorLayout& hx, - const TensorLayout& weight_hh, const TensorLayout& bias_hh, + const TensorLayout& /*bias_ih*/, const TensorLayout& hx, + const TensorLayout& weight_hh, const TensorLayout& /*bias_hh*/, const TensorLayout& dst, Handle* handle) { auto opr = handle->create_operator(); opr->param().transposeB = true; - return dst.span().dist_byte() + opr->get_workspace_in_bytes(hx, weight_hh, dst); + return dst.span().dist_byte() + + std::max( + opr->get_workspace_in_bytes(hx, weight_hh, dst), + opr->get_workspace_in_bytes(input, weight_ih, dst)); } void exec( @@ -74,14 +103,11 @@ void exec( opr->param().transposeB = true; opr->exec(input, weight_ih, tmp, new_workspace); opr->exec(hx, weight_hh, dst, new_workspace); - // if (this->param().bias) add_bias(dst, tmp, bias, dst); - // if (this->param().bias) { auto add_opr = handle->create_operator(); add_opr->param().mode = Elemwise::Param::Mode::ADD; add_opr->exec({dst, tmp}, dst); add_opr->exec({dst, bias_ih}, dst); add_opr->exec({dst, bias_hh}, dst); - // } // activation using NonlineMode = param::RNNCell::NonlineMode; diff --git a/dnn/src/cuda/cudnn_wrapper.cpp b/dnn/src/cuda/cudnn_wrapper.cpp index 16d73ef6..f8080afb 100644 --- a/dnn/src/cuda/cudnn_wrapper.cpp +++ b/dnn/src/cuda/cudnn_wrapper.cpp @@ -160,29 +160,6 @@ void TensorDesc::set( } } -void TensorDesc::set_nd(const TensorLayout& layout, int pad) { - int nbDims = layout.ndim < pad ? 
pad : layout.ndim; - int dimA[nbDims], strideA[nbDims]; - - for (size_t i = 0; i < layout.ndim; ++i) { - dimA[i] = layout.shape[i]; - // strideA[i] = layout.stride[i]; - } - for (size_t i = layout.ndim; i < nbDims; ++i) { - dimA[i] = 1; // unused - // strideA[i] = 1; - } - // stride - for (size_t i = 0; i < nbDims; ++i) { - strideA[i] = 1; - for (size_t j = i + 1; j < nbDims; ++j) { - strideA[i] *= dimA[j]; - } - } - cudnn_check(cudnnSetTensorNdDescriptor( - desc, to_cudnn_dtype(layout.dtype), nbDims, dimA, strideA)); -} - std::string TensorDesc::to_string() { cudnnDataType_t data_type; int n; @@ -456,97 +433,6 @@ void Conv3DDesc::set(const param::Convolution3D& param, const size_t nr_group) { desc, 3, padA, filterStrideA, dilationA, mode, CUDNN_DATA_FLOAT)); } -DropoutDesc::DropoutDesc() { - cudnn_check(cudnnCreateDropoutDescriptor(&desc)); -} - -DropoutDesc::~DropoutDesc() { - cudnn_check(cudnnDestroyDropoutDescriptor(desc)); -} - -void DropoutDesc::set(float dropout, Handle* handle, TensorND& state) { - cudnn_check(cudnnSetDropoutDescriptor( - desc, cudnn_handle(handle), dropout, state.raw_ptr(), - state.layout.span().dist_byte(), 0 // seed - )); -} - -void DropoutDesc::set_no_dropout(Handle* handle) { - cudnn_check( - cudnnSetDropoutDescriptor(desc, cudnn_handle(handle), 0, nullptr, 0, 0)); -} - -RNNDesc::RNNDesc() { - cudnn_check(cudnnCreateRNNDescriptor(&desc)); -} - -RNNDesc::~RNNDesc() { - cudnn_check(cudnnDestroyRNNDescriptor(desc)); -} - -void RNNDesc::set( - size_t input_size, size_t hidden_size, size_t proj_size, size_t num_layers, - bool bidirectional, bool bias, const megdnn::DType dtype, cudnnRNNMode_t mode, - DropoutDesc& dropout_desc, Handle* handle) { - cudnnRNNMode_t rnn_mode = mode; - cudnnRNNBiasMode_t bias_mode = bias ? CUDNN_RNN_DOUBLE_BIAS : CUDNN_RNN_NO_BIAS; - cudnnDirectionMode_t dir_mode = - bidirectional ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; - cudnnDataType_t math_prec; - - // math precision - if (dtype.enumv() == DTypeEnum::Float16) - math_prec = CUDNN_DATA_HALF; - else - math_prec = CUDNN_DATA_FLOAT; - -#if false // CUDNN_MAJOR >= 8 - cudnn_check(cudnnSetRNNDescriptor_v8( - desc, CUDNN_RNN_ALGO_STANDARD, mode, bias_mode, dir_mode, - CUDNN_LINEAR_INPUT, to_cudnn_dtype(dtype), math_prec, CUDNN_DEFAULT_MATH, - input_size, hidden_size, proj_size, num_layers, dropout_desc.desc, - CUDNN_RNN_PADDED_IO_DISABLED)); -#else - cudnn_check(cudnnSetRNNDescriptor_v6( - cudnn_handle(handle), desc, hidden_size, num_layers, dropout_desc.desc, - CUDNN_LINEAR_INPUT, dir_mode, mode, CUDNN_RNN_ALGO_STANDARD, math_prec)); -#endif -} - -RNNDataDesc::RNNDataDesc() { - cudnn_check(cudnnCreateRNNDataDescriptor(&desc)); -} - -RNNDataDesc::~RNNDataDesc() { - cudnn_check(cudnnDestroyRNNDataDescriptor(desc)); -} - -void RNNDataDesc::set( - int batchSize, int vectorSize, int maxSeqLength, const int* devSeqLengths, - DType dtype) { - // for now, all tensor are padded in python - // int seqLengthArray[batchSize]; - // for (int i = 0; i < batchSize; ++i) seqLengthArray[i] = maxSeqLength; - cudnn_check(cudnnSetRNNDataDescriptor( - desc, to_cudnn_dtype(dtype), CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED, - maxSeqLength, batchSize, vectorSize, devSeqLengths, nullptr)); -} - -RNNWeightFilterDesc::RNNWeightFilterDesc() { - cudnn_check(cudnnCreateFilterDescriptor(&desc)); -} - -RNNWeightFilterDesc::~RNNWeightFilterDesc() { - cudnn_check(cudnnDestroyFilterDescriptor(desc)); -} - -void RNNWeightFilterDesc::set(const TensorLayout& flatten_weights) { - int weight_elem_num = flatten_weights.total_nr_elems(); - int dimW[] = {weight_elem_num, 1, 1}; - cudnn_check(cudnnSetFilterNdDescriptor( - desc, to_cudnn_dtype(flatten_weights.dtype), CUDNN_TENSOR_NCHW, 3, dimW)); -} - ////////////////////////// CudnnAlgoPack ////////////////////////// #define V1(v) #v diff --git a/dnn/src/cuda/cudnn_wrapper.h b/dnn/src/cuda/cudnn_wrapper.h index 5198e9b5..ef0ab5ab 100644 --- a/dnn/src/cuda/cudnn_wrapper.h +++ b/dnn/src/cuda/cudnn_wrapper.h @@ -30,7 +30,6 @@ public: void set( const TensorLayout& layout, const param::Convolution::Format = param::Convolution::Format::NCHW); - void set_nd(const TensorLayout& layout, int pad = 3); // at least 3 dimensions std::string to_string(); ~TensorDesc(); cudnnTensorDescriptor_t desc; @@ -122,44 +121,6 @@ public: static const std::unordered_map conv3d_fwd_algos(); }; -class DropoutDesc { -public: - DropoutDesc(); - void set(float dropout, Handle* handle, TensorND& state); - void set_no_dropout(Handle* handle); - ~DropoutDesc(); - cudnnDropoutDescriptor_t desc; -}; - -class RNNDesc { -public: - RNNDesc(); - void set( - size_t input_size, size_t hidden_size, size_t proj_size, size_t num_layers, - bool bidirectional, bool bias, const megdnn::DType dtype, - cudnnRNNMode_t mode, DropoutDesc& dropout_desc, Handle* handle); - ~RNNDesc(); - cudnnRNNDescriptor_t desc; -}; - -class RNNDataDesc { -public: - RNNDataDesc(); - void set( - int batchSize, int vectorSize, int maxSeqLength, const int* devSeqLengths, - DType dtype); - ~RNNDataDesc(); - cudnnRNNDataDescriptor_t desc; -}; - -class RNNWeightFilterDesc { -public: - RNNWeightFilterDesc(); - void set(const TensorLayout& flatten_weights); - ~RNNWeightFilterDesc(); - cudnnFilterDescriptor_t desc; -}; - } // namespace cuda } // namespace megdnn diff --git a/dnn/src/cuda/handle_create.cpp b/dnn/src/cuda/handle_create.cpp index 8d383b25..061af596 100644 --- 
a/dnn/src/cuda/handle_create.cpp +++ b/dnn/src/cuda/handle_create.cpp @@ -10,7 +10,7 @@ * implied. */ -#include "src/common/handle_impl.h" +// #include "src/common/handle_impl.h" #include "src/cuda/adaptive_pooling/opr_impl.h" #include "src/cuda/add_update/opr_impl.h" @@ -52,8 +52,6 @@ #include "src/cuda/local_share/opr_impl.h" #include "src/cuda/lrn/opr_impl.h" #include "src/cuda/lsq/opr_impl.h" -#include "src/cuda/lstm/opr_impl.h" -#include "src/cuda/lstm_cell/opr_impl.h" #include "src/cuda/mask_conv/opr_impl.h" #include "src/cuda/matrix_inverse/opr_impl.h" #include "src/cuda/matrix_mul/opr_impl.h" @@ -70,8 +68,6 @@ #include "src/cuda/repeat/opr_impl.h" #include "src/cuda/resize/opr_impl.h" #include "src/cuda/rng/opr_impl.h" -#include "src/cuda/rnn/opr_impl.h" -#include "src/cuda/rnn_cell/opr_impl.h" #include "src/cuda/roi_align/opr_impl.h" #include "src/cuda/roi_copy/opr_impl.h" #include "src/cuda/roi_pooling/opr_impl.h" @@ -94,6 +90,7 @@ namespace megdnn { namespace cuda { +// After Adding CUDA LSTM, the declaration of CUDA Backend should be restored // MEGDNN_FOREACH_OPR_CLASS(MEGDNN_SPECIALIZE_CREATE_OPERATOR) MEGDNN_SPECIALIZE_CREATE_OPERATOR(ConvolutionForward); MEGDNN_SPECIALIZE_CREATE_OPERATOR(ConvolutionBackwardData); @@ -222,6 +219,8 @@ MEGDNN_SPECIALIZE_CREATE_OPERATOR(PaddingForward); MEGDNN_SPECIALIZE_CREATE_OPERATOR(PaddingBackward); MEGDNN_SPECIALIZE_CREATE_OPERATOR(LayerNormForward); MEGDNN_SPECIALIZE_CREATE_OPERATOR(LayerNormBackward); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(DropoutForward); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(DropoutBackward); template std::unique_ptr HandleImpl::create_operator() { @@ -232,9 +231,11 @@ std::unique_ptr HandleImpl::create_operator() { #define MEGDNN_INST_CREATE_OPERATOR(opr) \ template std::unique_ptr HandleImpl::create_operator(); +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpragmas" +#pragma GCC diagnostic ignored "-Winstantiation-after-specialization" MEGDNN_FOREACH_OPR_CLASS(MEGDNN_INST_CREATE_OPERATOR) - - +#pragma GCC diagnostic pop } // namespace cuda } // namespace megdnn diff --git a/dnn/src/cuda/lstm/opr_impl.cpp b/dnn/src/cuda/lstm/opr_impl.cpp deleted file mode 100644 index 6dfc9ef0..00000000 --- a/dnn/src/cuda/lstm/opr_impl.cpp +++ /dev/null @@ -1,112 +0,0 @@ -/** - * \file dnn/src/cuda/lstm/opr_impl.cpp - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- */ -#include "src/cuda/lstm/opr_impl.h" -#include "src/cuda/lstm/utils.h" -#include "src/cuda/utils.h" - -#include - -namespace megdnn { -namespace cuda { - -void LSTMImpl::exec( - _megdnn_tensor_in input, _megdnn_tensor_in hx, _megdnn_tensor_in cx, - _megdnn_tensor_in flatten_weights, _megdnn_tensor_out output, - _megdnn_tensor_out hy, _megdnn_tensor_out cy, _megdnn_tensor_out reserve_space, - _megdnn_workspace workspace) { - Handle* handle = this->handle(); - - rnn::RNNForwardDescHolder_v6 desc_holder = - lstm::get_RNNDescHolder_v6(this->handle(), param(), input.layout); - auto x_desc_arr = rnn::get_descs(desc_holder.x_descs); - auto y_desc_arr = rnn::get_descs(desc_holder.y_descs); - RNNWeightFilterDesc w_desc; - w_desc.set(flatten_weights.layout); - - if (param().fwd_mode == param::LSTM::FwdMode::TRAINING) { - cudnn_check(cudnnRNNForwardTraining( - cudnn_handle(handle), desc_holder.rnn_desc.desc, desc_holder.seq_len, - x_desc_arr.data(), input.raw_ptr(), desc_holder.hx_desc.desc, - hx.raw_ptr(), desc_holder.cx_desc.desc, cx.raw_ptr(), w_desc.desc, - flatten_weights.raw_ptr(), y_desc_arr.data(), output.raw_ptr(), - desc_holder.hy_desc.desc, hy.raw_ptr(), desc_holder.cy_desc.desc, - cy.raw_ptr(), workspace.raw_ptr, desc_holder.workspace_size, - reserve_space.raw_ptr(), desc_holder.reserveSpace_size)); - } else { - cudnn_check(cudnnRNNForwardInference( - cudnn_handle(handle), desc_holder.rnn_desc.desc, desc_holder.seq_len, - x_desc_arr.data(), input.raw_ptr(), desc_holder.hx_desc.desc, - hx.raw_ptr(), desc_holder.cx_desc.desc, nullptr, w_desc.desc, - flatten_weights.raw_ptr(), y_desc_arr.data(), output.raw_ptr(), - desc_holder.hy_desc.desc, hy.raw_ptr(), desc_holder.cy_desc.desc, - cy.raw_ptr(), workspace.raw_ptr, desc_holder.workspace_size)); - } -} - -size_t LSTMImpl::get_workspace_in_bytes( - const TensorLayout& input, const TensorLayout& hx, const TensorLayout& cx, - const TensorLayout& flatten_weights, const TensorLayout& output, - const TensorLayout& hy, const TensorLayout& cy, - const TensorLayout& reserve_space) { - rnn::RNNForwardDescHolder_v6 desc_holder = - lstm::get_RNNDescHolder_v6(this->handle(), param(), input); - return desc_holder.workspace_size; -} - -size_t LSTMImpl::get_reserve_size_in_bytes(const TensorLayout& input) { - rnn::RNNForwardDescHolder_v6 desc_holder = - lstm::get_RNNDescHolder_v6(this->handle(), param(), input); - return desc_holder.reserveSpace_size; -} - -void LSTMBackwardImpl::exec( - _megdnn_tensor_in x, _megdnn_tensor_in y, _megdnn_tensor_in hx, - _megdnn_tensor_in cx, _megdnn_tensor_in dy, _megdnn_tensor_in dhy, - _megdnn_tensor_in dcy, _megdnn_tensor_in flatten_weights, - _megdnn_tensor_in reserve_space, _megdnn_tensor_out dx, _megdnn_tensor_out dhx, - _megdnn_tensor_out dcx, _megdnn_tensor_out dw, _megdnn_workspace workspace) { - Handle* handle = this->handle(); - size_t seq_len = x.layout.shape[0]; - auto desc_holder = lstm::get_RNNDescHolder_v6(handle, param(), x.layout); - auto x_desc_arr_ptr = rnn::get_descs(desc_holder.x_descs).data(); - auto y_desc_arr_ptr = rnn::get_descs(desc_holder.y_descs).data(); - RNNWeightFilterDesc w_desc; - w_desc.set(flatten_weights.layout); - - cudnn_check(cudnnRNNBackwardData( - cudnn_handle(handle), desc_holder.rnn_desc.desc, seq_len, y_desc_arr_ptr, - y.raw_ptr(), y_desc_arr_ptr, dy.raw_ptr(), desc_holder.hy_desc.desc, - dhy.raw_ptr(), desc_holder.cy_desc.desc, dcy.raw_ptr(), w_desc.desc, - flatten_weights.raw_ptr(), desc_holder.hx_desc.desc, hx.raw_ptr(), - desc_holder.cx_desc.desc, cx.raw_ptr(), 
x_desc_arr_ptr, dx.raw_ptr(), - desc_holder.hx_desc.desc, dhx.raw_ptr(), desc_holder.cx_desc.desc, - dcx.raw_ptr(), workspace.raw_ptr, desc_holder.workspace_size, - reserve_space.raw_ptr(), desc_holder.reserveSpace_size)); - - cudnn_check(cudnnRNNBackwardWeights( - cudnn_handle(handle), desc_holder.rnn_desc.desc, seq_len, x_desc_arr_ptr, - x.raw_ptr(), desc_holder.hx_desc.desc, hx.raw_ptr(), y_desc_arr_ptr, - y.raw_ptr(), workspace.raw_ptr, desc_holder.workspace_size, w_desc.desc, - dw.raw_ptr(), reserve_space.raw_ptr(), desc_holder.reserveSpace_size)); -} - -size_t LSTMBackwardImpl::get_workspace_in_bytes( - const TensorLayout& x, const TensorLayout& y, const TensorLayout& hx, - const TensorLayout& cx, const TensorLayout& dy, const TensorLayout& dhy, - const TensorLayout& dcy, const TensorLayout& flatten_weights, - const TensorLayout& reserve_space, const TensorLayout& dx, - const TensorLayout& dhx, const TensorLayout& dcx, const TensorLayout& dw) { - auto desc_holder = lstm::get_RNNDescHolder_v6(this->handle(), param(), x); - return desc_holder.workspace_size; -} - -} // namespace cuda -} // namespace megdnn diff --git a/dnn/src/cuda/lstm/opr_impl.h b/dnn/src/cuda/lstm/opr_impl.h deleted file mode 100644 index a01032f7..00000000 --- a/dnn/src/cuda/lstm/opr_impl.h +++ /dev/null @@ -1,56 +0,0 @@ -/** - * \file dnn/src/cuda/lstm/opr_impl.h - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -#pragma once -#include "megdnn/oprs.h" - -namespace megdnn { -namespace cuda { - -class LSTMImpl : public LSTM { -public: - using LSTM::LSTM; - - void exec( - _megdnn_tensor_in input, _megdnn_tensor_in hx, _megdnn_tensor_in cx, - _megdnn_tensor_in flatten_weights, _megdnn_tensor_out output, - _megdnn_tensor_out hy, _megdnn_tensor_out cy, - _megdnn_tensor_out reserve_space, _megdnn_workspace workspace); - - size_t get_workspace_in_bytes( - const TensorLayout& input, const TensorLayout& hx, const TensorLayout& cx, - const TensorLayout& flatten_weights, const TensorLayout& output, - const TensorLayout& hy, const TensorLayout& cy, - const TensorLayout& reserve_space); - size_t get_reserve_size_in_bytes(const TensorLayout& input); -}; - -class LSTMBackwardImpl : public LSTMBackward { -public: - using LSTMBackward::LSTMBackward; - - virtual void exec( - _megdnn_tensor_in x, _megdnn_tensor_in y, _megdnn_tensor_in hx, - _megdnn_tensor_in cx, _megdnn_tensor_in dy, _megdnn_tensor_in dhy, - _megdnn_tensor_in dcy, _megdnn_tensor_in flatten_weights, - _megdnn_tensor_in reserve_space, _megdnn_tensor_out dx, - _megdnn_tensor_out dhx, _megdnn_tensor_out dcx, _megdnn_tensor_out dw, - _megdnn_workspace workspace); - - virtual size_t get_workspace_in_bytes( - const TensorLayout& x, const TensorLayout& y, const TensorLayout& hx, - const TensorLayout& cx, const TensorLayout& dy, const TensorLayout& dhy, - const TensorLayout& dcy, const TensorLayout& flatten_weights, - const TensorLayout& reserve_space, const TensorLayout& dx, - const TensorLayout& dhx, const TensorLayout& dcx, const TensorLayout& dw); -}; - -} // namespace cuda -} // namespace megdnn \ No newline at end of file diff --git a/dnn/src/cuda/lstm/utils.cpp b/dnn/src/cuda/lstm/utils.cpp deleted file mode 100644 index 7fe1ab0e..00000000 --- 
a/dnn/src/cuda/lstm/utils.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/** - * \file dnn/src/cuda/lstm/utils.cpp - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -#include "src/cuda/lstm/utils.h" -#include "src/cuda/utils.h" - -#include - -namespace megdnn { -namespace cuda { -namespace lstm { - -RNNForwardDescHolder_v6 get_RNNDescHolder_v6( - Handle* handle, megdnn::LSTMForward::Param& _param, const TensorLayout& input) { - size_t seq_len = input.shape[0]; - size_t batch_size = input.shape[1]; - size_t input_size = input.shape[2]; - - cudnnRNNMode_t mode = CUDNN_LSTM; - - using FwdMode = param::LSTM::FwdMode; - - RNNForwardDescHolder_v6 desc_holder( - handle, seq_len, batch_size, _param.hidden_size, input_size, - _param.proj_size, _param.num_layers, _param.bidirectional, _param.bias, - input.dtype, mode); - return desc_holder; -} - -} // namespace lstm -} // namespace cuda -} // namespace megdnn \ No newline at end of file diff --git a/dnn/src/cuda/lstm/utils.h b/dnn/src/cuda/lstm/utils.h deleted file mode 100644 index 2623c6fd..00000000 --- a/dnn/src/cuda/lstm/utils.h +++ /dev/null @@ -1,23 +0,0 @@ -/** - * \file dnn/src/cuda/lstm/utils.h - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -#pragma once -#include "src/cuda/cudnn_wrapper.h" -#include "src/cuda/rnn/utils.h" - -namespace megdnn { -namespace cuda { -namespace lstm { -using megdnn::cuda::rnn::RNNForwardDescHolder_v6; -RNNForwardDescHolder_v6 get_RNNDescHolder_v6( - Handle* handle, megdnn::LSTMForward::Param& _param, const TensorLayout& input); -} // namespace lstm -} // namespace cuda -} // namespace megdnn \ No newline at end of file diff --git a/dnn/src/cuda/lstm_cell/opr_impl.cpp b/dnn/src/cuda/lstm_cell/opr_impl.cpp deleted file mode 100644 index b01c5de0..00000000 --- a/dnn/src/cuda/lstm_cell/opr_impl.cpp +++ /dev/null @@ -1,42 +0,0 @@ -/** - * \file dnn/src/cuda/lstm_cell/opr_impl.cpp - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- */ -#include "src/cuda/lstm_cell/opr_impl.h" -#include "megdnn/dtype.h" -#include "megdnn/oprs/base.h" -#include "src/common/lstm_cell.h" -#include "src/common/opr_delegate.h" -#include "src/common/utils.h" - -namespace megdnn { -namespace cuda { -size_t LSTMCellImpl::get_workspace_in_bytes( - const TensorLayout& input, const TensorLayout& weight_ih, - const TensorLayout& bias_ih, const TensorLayout& hx, - const TensorLayout& weight_hh, const TensorLayout& bias_hh, - const TensorLayout& cx, const TensorLayout& h_new, const TensorLayout& c_new, - const TensorLayout& gates) { - return megdnn::lstm_cell::get_workspace_in_bytes( - input, weight_ih, bias_ih, hx, weight_hh, bias_hh, cx, h_new, c_new, gates, - handle()); -} - -void LSTMCellImpl::exec( - _megdnn_tensor_in input, _megdnn_tensor_in weight_ih, _megdnn_tensor_in bias_ih, - _megdnn_tensor_in hx, _megdnn_tensor_in weight_hh, _megdnn_tensor_in bias_hh, - _megdnn_tensor_in cx, _megdnn_tensor_out h_new, _megdnn_tensor_out c_new, - _megdnn_tensor_out gates, _megdnn_workspace workspace) { - megdnn::lstm_cell::exec( - input, weight_ih, bias_ih, hx, weight_hh, bias_hh, cx, h_new, c_new, gates, - workspace, handle()); -} -} // namespace cuda - -} // namespace megdnn \ No newline at end of file diff --git a/dnn/src/cuda/lstm_cell/opr_impl.h b/dnn/src/cuda/lstm_cell/opr_impl.h deleted file mode 100644 index f8578162..00000000 --- a/dnn/src/cuda/lstm_cell/opr_impl.h +++ /dev/null @@ -1,36 +0,0 @@ -/** - * \file dnn/src/cuda/lstm_cell/opr_impl.h - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -#pragma once -#include "megdnn/oprs.h" -#include "src/cuda/rnn_cell/opr_impl.h" - -namespace megdnn { -namespace cuda { - -class LSTMCellImpl : public LSTMCell { -public: - using LSTMCell::LSTMCell; - void exec( - _megdnn_tensor_in input, _megdnn_tensor_in weight_ih, - _megdnn_tensor_in bias_ih, _megdnn_tensor_in hx, - _megdnn_tensor_in weight_hh, _megdnn_tensor_in bias_hh, - _megdnn_tensor_in cx, _megdnn_tensor_out h_new, _megdnn_tensor_out c_new, - _megdnn_tensor_out gates, _megdnn_workspace workspace) override; - size_t get_workspace_in_bytes( - const TensorLayout& input, const TensorLayout& weight_ih, - const TensorLayout& bias_ih, const TensorLayout& hx, - const TensorLayout& weight_hh, const TensorLayout& bias_hh, - const TensorLayout& cx, const TensorLayout& h_new, - const TensorLayout& c_new, const TensorLayout& gates) override; -}; - -} // namespace cuda -} // namespace megdnn \ No newline at end of file diff --git a/dnn/src/cuda/rnn/opr_impl.cpp b/dnn/src/cuda/rnn/opr_impl.cpp deleted file mode 100644 index 937345a2..00000000 --- a/dnn/src/cuda/rnn/opr_impl.cpp +++ /dev/null @@ -1,170 +0,0 @@ -/** - * \file dnn/src/cuda/rnn/opr_impl.cpp - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- */ -#include "src/cuda/rnn/opr_impl.h" -#include "src/common/rnn.h" -#include "src/cuda/utils.h" - -//#include -#include -#include -#include - -namespace megdnn { -namespace cuda { - -using namespace std; - -void RNNImpl::exec( - _megdnn_tensor_in input, _megdnn_tensor_in hx, - _megdnn_tensor_in flatten_weights, _megdnn_tensor_out output, - _megdnn_tensor_out hy, _megdnn_tensor_out reserve_space, - _megdnn_workspace workspace) { - Handle* handle = this->handle(); - -#if false // CUDNN_MAJOR >= 8 - rnn::RNNForwardDescHolder desc_holder = this->get_desc_holder(input.layout); - - void* workspace_ptr = workspace.raw_ptr; - void* reserveSpace_ptr = static_cast(workspace_ptr) + desc_holder.workspace_size; - - cudnn_check(cudnnRNNForward( - cudnn_handle(handle), desc_holder.rnn_desc.desc, desc_holder.fwdMode, desc_holder.devSeqLengths, - desc_holder.x_desc.desc, input.raw_ptr(), desc_holder.y_desc.desc, output.raw_ptr(), - desc_holder.h_desc.desc, hx.raw_ptr(), hy.raw_ptr(), - desc_holder.h_desc.desc, nullptr, nullptr, - desc_holder.weight_size, flatten_weights.raw_ptr(), desc_holder.workspace_size, workspace_ptr, - desc_holder.reserveSpace_size, reserveSpace_ptr - )); -#else - rnn::RNNForwardDescHolder_v6 desc_holder = - rnn::get_RNNDescHolder_v6(this->handle(), param(), input.layout); - auto x_desc_arr = rnn::get_descs(desc_holder.x_descs); - auto y_desc_arr = rnn::get_descs(desc_holder.y_descs); - RNNWeightFilterDesc w_desc; - w_desc.set(flatten_weights.layout); - - if (param().fwd_mode == param::RNN::FwdMode::TRAINING) { - cudnn_check(cudnnRNNForwardTraining( - cudnn_handle(handle), desc_holder.rnn_desc.desc, desc_holder.seq_len, - x_desc_arr.data(), input.raw_ptr(), desc_holder.hx_desc.desc, - hx.raw_ptr(), desc_holder.cx_desc.desc, NULL, w_desc.desc, - flatten_weights.raw_ptr(), y_desc_arr.data(), output.raw_ptr(), - desc_holder.hy_desc.desc, hy.raw_ptr(), desc_holder.cy_desc.desc, NULL, - workspace.raw_ptr, desc_holder.workspace_size, reserve_space.raw_ptr(), - desc_holder.reserveSpace_size)); - } else { - cudnn_check(cudnnRNNForwardInference( - cudnn_handle(handle), desc_holder.rnn_desc.desc, desc_holder.seq_len, - x_desc_arr.data(), input.raw_ptr(), desc_holder.hx_desc.desc, - hx.raw_ptr(), desc_holder.cx_desc.desc, nullptr, w_desc.desc, - flatten_weights.raw_ptr(), y_desc_arr.data(), output.raw_ptr(), - desc_holder.hy_desc.desc, hy.raw_ptr(), desc_holder.cy_desc.desc, - nullptr, workspace.raw_ptr, desc_holder.workspace_size)); - } -#endif -} - -size_t RNNImpl::get_workspace_in_bytes( - const TensorLayout& input, const TensorLayout& hx, - const TensorLayout& flatten_weights, const TensorLayout& output, - const TensorLayout& hy, const TensorLayout& reserve_space) { -#if false // CUDNN_MAJOR >= 8 - rnn::RNNForwardDescHolder desc_holder = this->get_desc_holder(input); -#else - rnn::RNNForwardDescHolder_v6 desc_holder = - rnn::get_RNNDescHolder_v6(this->handle(), param(), input); -#endif - return desc_holder.workspace_size; -} - -size_t RNNImpl::get_reserve_size_in_bytes(const TensorLayout& input) { - rnn::RNNForwardDescHolder_v6 desc_holder = - rnn::get_RNNDescHolder_v6(this->handle(), param(), input); - return desc_holder.reserveSpace_size; -} - -/*rnn::RNNForwardDescHolder RNNImpl::get_desc_holder(const TensorLayout& input) { - Handle* handle = this->handle(); - size_t seq_len = input.shape[0]; - size_t batch_size = input.shape[1]; - size_t input_size = input.shape[2]; - auto _param = param(); - - cudnnRNNMode_t mode; - using NonlineMode = param::RNN::NonlineMode; - switch 
(_param.nonlineMode) { - case NonlineMode::RELU: - mode = CUDNN_RNN_RELU; - break; - case NonlineMode::TANH: - mode = CUDNN_RNN_TANH; - break; - } - - cudnnForwardMode_t fwdMode = CUDNN_FWD_MODE_TRAINING; - using FwdMode = param::RNN::FwdMode; - switch (_param.fwd_mode) { - case FwdMode::TRAINING: - fwdMode = CUDNN_FWD_MODE_TRAINING; - break; - case FwdMode::INFERENCE: - fwdMode = CUDNN_FWD_MODE_INFERENCE; - break; - } - - rnn::RNNForwardDescHolder desc_holder( - handle, seq_len, batch_size, _param.hidden_size, input_size, - _param.proj_size, _param.num_layers, _param.bidirectional, _param.bias, - input.dtype, mode, fwdMode); - return desc_holder; -}*/ - -void RNNBackwardImpl::exec( - _megdnn_tensor_in x, _megdnn_tensor_in y, _megdnn_tensor_in hx, - _megdnn_tensor_in dy, _megdnn_tensor_in dhy, _megdnn_tensor_in flatten_weights, - _megdnn_tensor_in reserve_space, _megdnn_tensor_out dx, _megdnn_tensor_out dhx, - _megdnn_tensor_out dw, _megdnn_workspace workspace) { - Handle* handle = this->handle(); - size_t seq_len = x.layout.shape[0]; - auto desc_holder = rnn::get_RNNDescHolder_v6(handle, param(), x.layout); - auto x_desc_arr_ptr = rnn::get_descs(desc_holder.x_descs).data(); - auto y_desc_arr_ptr = rnn::get_descs(desc_holder.y_descs).data(); - RNNWeightFilterDesc w_desc; - w_desc.set(flatten_weights.layout); - - cudnn_check(cudnnRNNBackwardData( - cudnn_handle(handle), desc_holder.rnn_desc.desc, seq_len, y_desc_arr_ptr, - y.raw_ptr(), y_desc_arr_ptr, dy.raw_ptr(), desc_holder.hy_desc.desc, - dhy.raw_ptr(), desc_holder.cy_desc.desc, NULL, w_desc.desc, - flatten_weights.raw_ptr(), desc_holder.hx_desc.desc, hx.raw_ptr(), - desc_holder.cx_desc.desc, NULL, x_desc_arr_ptr, dx.raw_ptr(), - desc_holder.hx_desc.desc, dhx.raw_ptr(), desc_holder.cx_desc.desc, NULL, - workspace.raw_ptr, desc_holder.workspace_size, reserve_space.raw_ptr(), - desc_holder.reserveSpace_size)); - - cudnn_check(cudnnRNNBackwardWeights( - cudnn_handle(handle), desc_holder.rnn_desc.desc, seq_len, x_desc_arr_ptr, - x.raw_ptr(), desc_holder.hx_desc.desc, hx.raw_ptr(), y_desc_arr_ptr, - y.raw_ptr(), workspace.raw_ptr, desc_holder.workspace_size, w_desc.desc, - dw.raw_ptr(), reserve_space.raw_ptr(), desc_holder.reserveSpace_size)); -} - -size_t RNNBackwardImpl::get_workspace_in_bytes( - const TensorLayout& x, const TensorLayout& y, const TensorLayout& hx, - const TensorLayout& dy, const TensorLayout& dhy, - const TensorLayout& flatten_weights, const TensorLayout& reserve_space, - const TensorLayout& dx, const TensorLayout& dhx, const TensorLayout& dw) { - auto desc_holder = rnn::get_RNNDescHolder_v6(this->handle(), param(), x); - return desc_holder.workspace_size; -} - -} // namespace cuda -} // namespace megdnn diff --git a/dnn/src/cuda/rnn/opr_impl.h b/dnn/src/cuda/rnn/opr_impl.h deleted file mode 100644 index 701e7876..00000000 --- a/dnn/src/cuda/rnn/opr_impl.h +++ /dev/null @@ -1,57 +0,0 @@ -/** - * \file dnn/src/cuda/rnn/opr_impl.h - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- */ -#pragma once -#include "megdnn/oprs.h" -#include "src/cuda/cudnn_wrapper.h" -#include "src/cuda/rnn/utils.h" - -namespace megdnn { -namespace cuda { - -class RNNImpl : public RNN { -public: - using RNN::RNN; - - void exec( - _megdnn_tensor_in input, _megdnn_tensor_in hx, - _megdnn_tensor_in flatten_weights, _megdnn_tensor_out output, - _megdnn_tensor_out hy, _megdnn_tensor_out reserve_space, - _megdnn_workspace workspace); - - size_t get_workspace_in_bytes( - const TensorLayout& input, const TensorLayout& hx, - const TensorLayout& flatten_weights, const TensorLayout& output, - const TensorLayout& hy, const TensorLayout& reserve_space); - size_t get_reserve_size_in_bytes(const TensorLayout& input); - // private: - // rnn::RNNForwardDescHolder get_desc_holder(const TensorLayout& input); -}; - -class RNNBackwardImpl : public RNNBackward { -public: - using RNNBackward::RNNBackward; - - virtual void exec( - _megdnn_tensor_in x, _megdnn_tensor_in y, _megdnn_tensor_in hx, - _megdnn_tensor_in dy, _megdnn_tensor_in dhy, - _megdnn_tensor_in flatten_weights, _megdnn_tensor_in reserve_space, - _megdnn_tensor_out dx, _megdnn_tensor_out dhx, _megdnn_tensor_out dw, - _megdnn_workspace workspace); - - virtual size_t get_workspace_in_bytes( - const TensorLayout& x, const TensorLayout& y, const TensorLayout& hx, - const TensorLayout& dy, const TensorLayout& dhy, - const TensorLayout& flatten_weights, const TensorLayout& reserve_space, - const TensorLayout& dx, const TensorLayout& dhx, const TensorLayout& dw); -}; - -} // namespace cuda -} // namespace megdnn diff --git a/dnn/src/cuda/rnn/utils.cpp b/dnn/src/cuda/rnn/utils.cpp deleted file mode 100644 index 9e4ff825..00000000 --- a/dnn/src/cuda/rnn/utils.cpp +++ /dev/null @@ -1,138 +0,0 @@ -/** - * \file dnn/src/cuda/rnn/utils.cpp - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -#include "src/cuda/rnn/utils.h" -#include "src/cuda/utils.h" - -#include - -namespace megdnn { -namespace cuda { -namespace rnn { -/*RNNForwardDescHolder::RNNForwardDescHolder(Handle* handle, size_t seq_len, size_t -batch_size, size_t hidden_size, size_t input_size, size_t proj_size, size_t num_layers, -bool bidirectional, bool bias, DType dtype, cudnnRNNMode_t _mode, cudnnForwardMode_t -_fwdMode) : mode(_mode), fwdMode(_fwdMode) -{ - size_t D = bidirectional ? 2 : 1; - - // TODO: set dropout to 0 in inference mode - dropout_desc.set_no_dropout(handle); - - // seq len is unified (not packed) - // cuda_check(cudaMalloc((void**)&devSeqLengths, sizeof(int32_t) * batch_size)); - devSeqLengths = (int32_t*)malloc(sizeof(int32_t) * batch_size); - for (size_t i = 0; i < batch_size; ++i) devSeqLengths[i] = seq_len; - - // proj size should be smaller than hidden size according to cudnn api - // otherwise it is disabled - proj_size = (proj_size > hidden_size || proj_size == 0) ? 
hidden_size : -proj_size; rnn_desc.set( input_size, hidden_size, proj_size, num_layers, bidirectional, -bias, dtype, mode, dropout_desc, handle - ); - - x_desc.set(batch_size, input_size, seq_len, devSeqLengths, dtype); - y_desc.set(batch_size, D * proj_size, seq_len, - devSeqLengths, dtype); - h_desc.set_nd(TensorLayout(TensorShape{D * num_layers, batch_size, proj_size}, -dtype)); - - cudnn_check(cudnnGetRNNWeightSpaceSize(cudnn_handle(handle), rnn_desc.desc, -&weight_size)); - - cudnn_check(cudnnGetRNNTempSpaceSizes( - cudnn_handle(handle), rnn_desc.desc, fwdMode, x_desc.desc, -&workspace_size, &reserveSpace_size - )); -} - -RNNForwardDescHolder::~RNNForwardDescHolder() { - // cuda_check(cudaFree(devSeqLengths)); - free(devSeqLengths); -}*/ - -RNNForwardDescHolder_v6::RNNForwardDescHolder_v6( - Handle* handle, size_t seq_len, size_t batch_size, size_t hidden_size, - size_t input_size, size_t proj_size, size_t num_layers, bool bidirectional, - bool bias, DType dtype, cudnnRNNMode_t _mode) - : mode(_mode), seq_len(seq_len) { - size_t D = bidirectional ? 2 : 1; - - // TODO: set dropout to 0 in inference mode - dropout_desc.set_no_dropout(handle); - - proj_size = (proj_size > hidden_size || proj_size == 0) ? hidden_size : proj_size; - rnn_desc.set( - input_size, hidden_size, proj_size, num_layers, bidirectional, bias, dtype, - mode, dropout_desc, handle); - - x_descs.resize(seq_len); - y_descs.resize(seq_len); - for (size_t i = 0; i < seq_len; ++i) { - x_descs[i].set_nd(TensorLayout(TensorShape{batch_size, input_size}, dtype), 3); - y_descs[i].set_nd( - TensorLayout(TensorShape{batch_size, D * hidden_size}, dtype), 3); - } - -#define SET_H(_var) \ - _var.set_nd(TensorLayout( \ - TensorShape{D * num_layers, batch_size, hidden_size}, dtype)); - - SET_H(hx_desc) - SET_H(cx_desc) - SET_H(hy_desc) - SET_H(cy_desc) -#undef SET_H - - std::vector x_desc_arr = get_descs(x_descs); - cudnn_check(cudnnGetRNNWorkspaceSize( - cudnn_handle(handle), rnn_desc.desc, seq_len, x_desc_arr.data(), - &workspace_size)); - - cudnn_check(cudnnGetRNNTrainingReserveSize( - cudnn_handle(handle), rnn_desc.desc, seq_len, x_desc_arr.data(), - &reserveSpace_size)); -} - -RNNForwardDescHolder_v6 get_RNNDescHolder_v6( - Handle* handle, megdnn::RNNForward::Param& _param, const TensorLayout& input) { - size_t seq_len = input.shape[0]; - size_t batch_size = input.shape[1]; - size_t input_size = input.shape[2]; - - cudnnRNNMode_t mode; - using NonlineMode = param::RNN::NonlineMode; - switch (_param.nonlineMode) { - case NonlineMode::RELU: - mode = CUDNN_RNN_RELU; - break; - case NonlineMode::TANH: - mode = CUDNN_RNN_TANH; - break; - } - - RNNForwardDescHolder_v6 desc_holder( - handle, seq_len, batch_size, _param.hidden_size, input_size, - _param.proj_size, _param.num_layers, _param.bidirectional, _param.bias, - input.dtype, mode); - return desc_holder; -} - -std::vector get_descs(const std::vector& descs) { - std::vector r; - r.reserve(descs.size()); - for (auto& desc : descs) { - r.emplace_back(desc.desc); - } - return r; -} -} // namespace rnn -} // namespace cuda -} // namespace megdnn \ No newline at end of file diff --git a/dnn/src/cuda/rnn/utils.h b/dnn/src/cuda/rnn/utils.h deleted file mode 100644 index d2a652dc..00000000 --- a/dnn/src/cuda/rnn/utils.h +++ /dev/null @@ -1,56 +0,0 @@ -/** - * \file dnn/src/cuda/rnn/utils.h - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
- * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -#pragma once -#include "src/cuda/cudnn_wrapper.h" - -namespace megdnn { -namespace cuda { -namespace rnn { -// v8, not for now -/*struct RNNForwardDescHolder { - - int32_t* devSeqLengths; - cudnnRNNMode_t mode; - cudnnForwardMode_t fwdMode; - RNNDesc rnn_desc; - DropoutDesc dropout_desc; - RNNDataDesc x_desc, y_desc; - TensorDesc h_desc; - size_t weight_size, workspace_size, reserveSpace_size; - - RNNForwardDescHolder(Handle* handle, size_t seq_len, size_t batch_size, size_t -hidden_size, size_t input_size, size_t proj_size, size_t num_layers, bool bidirectional, - bool bias, DType dtype, -cudnnRNNMode_t _mode, cudnnForwardMode_t _fwdMode); ~RNNForwardDescHolder(); -};*/ - -struct RNNForwardDescHolder_v6 { - cudnnRNNMode_t mode; - RNNDesc rnn_desc; - int seq_len; - DropoutDesc dropout_desc; - std::vector x_descs, y_descs; - TensorDesc hx_desc, cx_desc, hy_desc, cy_desc; - - size_t workspace_size, reserveSpace_size; - - RNNForwardDescHolder_v6( - Handle* handle, size_t seq_len, size_t batch_size, size_t hidden_size, - size_t input_size, size_t proj_size, size_t num_layers, bool bidirectional, - bool bias, DType dtype, cudnnRNNMode_t _mode); -}; - -RNNForwardDescHolder_v6 get_RNNDescHolder_v6( - Handle* handle, megdnn::RNNForward::Param& _param, const TensorLayout& input); -std::vector get_descs(const std::vector& descs); -} // namespace rnn -} // namespace cuda -} // namespace megdnn \ No newline at end of file diff --git a/dnn/src/cuda/rnn_cell/opr_impl.cpp b/dnn/src/cuda/rnn_cell/opr_impl.cpp deleted file mode 100644 index 4fb5f2ed..00000000 --- a/dnn/src/cuda/rnn_cell/opr_impl.cpp +++ /dev/null @@ -1,35 +0,0 @@ -/** - * \file dnn/src/cuda/rnn_cell/opr_impl.cpp - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- */ -#include "src/cuda/rnn_cell/opr_impl.h" -#include "src/common/rnn_cell.h" - -namespace megdnn { -namespace cuda { -size_t RNNCellImpl::get_workspace_in_bytes( - const TensorLayout& input, const TensorLayout& weight_ih, - const TensorLayout& bias_ih, const TensorLayout& hx, - const TensorLayout& weight_hh, const TensorLayout& bias_hh, - const TensorLayout& dst) { - return megdnn::rnn_cell::get_workspace_in_bytes( - input, weight_ih, bias_hh, hx, weight_hh, bias_hh, dst, handle()); -} - -void RNNCellImpl::exec( - _megdnn_tensor_in input, _megdnn_tensor_in weight_ih, _megdnn_tensor_in bias_ih, - _megdnn_tensor_in hx, _megdnn_tensor_in weight_hh, _megdnn_tensor_in bias_hh, - _megdnn_tensor_out dst, _megdnn_workspace workspace) { - megdnn::rnn_cell::exec( - input, weight_ih, bias_ih, hx, weight_hh, bias_hh, dst, workspace, - param().nonlineMode, handle()); -} -} // namespace cuda - -} // namespace megdnn \ No newline at end of file diff --git a/dnn/src/cuda/rnn_cell/opr_impl.h b/dnn/src/cuda/rnn_cell/opr_impl.h deleted file mode 100644 index 916d8586..00000000 --- a/dnn/src/cuda/rnn_cell/opr_impl.h +++ /dev/null @@ -1,40 +0,0 @@ -/** - * \file dnn/src/cuda/rnn_cell/opr_impl.h - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -#pragma once -#include "megdnn/oprs.h" - -namespace megdnn { -namespace cuda { - -class RNNCellImpl : public RNNCell { -public: - using RNNCell::RNNCell; - void exec( - _megdnn_tensor_in input, _megdnn_tensor_in weight_ih, - _megdnn_tensor_in bias_ih, _megdnn_tensor_in hx, - _megdnn_tensor_in weight_hh, _megdnn_tensor_in bias_hh, - _megdnn_tensor_out dst, _megdnn_workspace workspace) override; - size_t get_workspace_in_bytes( - const TensorLayout& input, const TensorLayout& weight_ih, - const TensorLayout& bias_ih, const TensorLayout& hx, - const TensorLayout& weight_hh, const TensorLayout& bias_hh, - const TensorLayout& dst) override; - /* - private: - void add_bias(_megdnn_tensor_in A, - _megdnn_tensor_in B, - _megdnn_tensor_in bias, - _megdnn_tensor_out C); - */ -}; - -} // namespace cuda -} // namespace megdnn \ No newline at end of file diff --git a/dnn/src/naive/handle.cpp b/dnn/src/naive/handle.cpp index d92cdb72..87568168 100644 --- a/dnn/src/naive/handle.cpp +++ b/dnn/src/naive/handle.cpp @@ -9,7 +9,6 @@ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. 
*/ - #include "src/naive/handle.h" #include "src/common/handle_impl.h" @@ -140,4 +139,5 @@ MEGDNN_FOREACH_OPR_CLASS(MEGDNN_SPECIALIZE_CREATE_OPERATOR) } // namespace naive } // namespace megdnn -// vim: syntax=cpp.doxygen + +// vim: syntax=cpp.doxygen \ No newline at end of file diff --git a/dnn/src/naive/lstm/opr_impl.cpp b/dnn/src/naive/lstm/opr_impl.cpp index 73f6ac1d..5e11c56b 100644 --- a/dnn/src/naive/lstm/opr_impl.cpp +++ b/dnn/src/naive/lstm/opr_impl.cpp @@ -12,6 +12,9 @@ #include "src/naive/rnn/funcs.h" #include "src/naive/rnn/rnn.h" +#include "midout.h" +MIDOUT_DECL(megdnn_naive_lstm_fwd) + namespace megdnn { namespace naive { using rnn::LSTMCellWeightWrapper; @@ -21,29 +24,32 @@ void LSTMImpl::exec( _megdnn_tensor_in flatten_weights, _megdnn_tensor_out output, _megdnn_tensor_out hy, _megdnn_tensor_out cy, _megdnn_tensor_out reserve_space, _megdnn_workspace workspace) { - auto _param = param(); - size_t D = _param.bidirectional ? 2 : 1; - size_t num_layers = _param.num_layers; - size_t input_size = input.layout.shape[2]; - std::vector cells; - size_t used_workspace_size = rnn::get_cells( - D, num_layers, input_size, _param.hidden_size, _param.bias, cells, - flatten_weights, workspace); + MIDOUT_BEGIN(megdnn_naive_lstm_fwd) { + auto _param = param(); + size_t D = _param.bidirectional ? 2 : 1; + size_t num_layers = _param.num_layers; + size_t input_size = input.layout.shape[2]; + std::vector cells; + size_t used_workspace_size = rnn::get_cells( + D, num_layers, input_size, _param.hidden_size, _param.bias, cells, + flatten_weights, workspace); - Workspace new_workspace( - workspace.raw_ptr + used_workspace_size, - workspace.size - used_workspace_size); - TensorNDArray states = {hx, cx}, states_new = {hy, cy}; - rnn::exec_internal( - cells, input, states, states_new, output, reserve_space, num_layers, D, - this->handle(), new_workspace); + Workspace new_workspace( + workspace.raw_ptr + used_workspace_size, + workspace.size - used_workspace_size); + TensorNDArray states = {hx, cx}, states_new = {hy, cy}; + rnn::exec_internal( + cells, input, states, states_new, output, reserve_space, num_layers, D, + param::RNNCell::NonlineMode::IDENTITY, this->handle(), new_workspace); + } + MIDOUT_END(); } size_t LSTMImpl::get_workspace_in_bytes( - const TensorLayout& input, const TensorLayout& hx, const TensorLayout& cx, - const TensorLayout& flatten_weights, const TensorLayout& output, - const TensorLayout& hy, const TensorLayout& cy, - const TensorLayout& reserve_space) { + const TensorLayout& input, const TensorLayout& /*hx*/, + const TensorLayout& /*cx*/, const TensorLayout& flatten_weights, + const TensorLayout& output, const TensorLayout& /*hy*/, + const TensorLayout& /*cy*/, const TensorLayout& /*reserve_space*/) { size_t workspace_size = rnn::get_workspace_in_bytes( input, flatten_weights, param().hidden_size, param().bidirectional ? 2 : 1, this->handle()); @@ -77,6 +83,7 @@ void LSTMBackwardImpl::exec( size_t num_layers = param().num_layers; size_t D = param().bidirectional ? 
2 : 1; size_t input_size = x.layout.shape[2]; + size_t batch_size = x.layout.shape[1]; size_t hidden_size = param().hidden_size; size_t used_workspace_size = 0; @@ -90,10 +97,27 @@ void LSTMBackwardImpl::exec( Workspace new_workspace = Workspace( workspace.raw_ptr + used_workspace_size, workspace.size - used_workspace_size); + TensorNDArray states = {hx, cx}; + std::vector hx_param; + TensorLayout unfold_hx_layout{ + TensorShape{batch_size, hidden_size}, hx.layout.dtype}; + for (size_t layer = 0; layer < num_layers; ++layer) { + for (size_t d = 0; d < D; ++d) { + TensorNDArray unfold_hx; + size_t idx = layer * D + d; + size_t states_offset = idx * unfold_hx_layout.span().dist_byte(); + for (size_t i = 0; i < states.size(); ++i) { + unfold_hx.push_back(TensorND{ + static_cast(states[i].raw_ptr()) + states_offset, + unfold_hx_layout}); + } + hx_param.push_back(unfold_hx); + } + } used_workspace_size += rnn::get_inputs_for_exec( - x, y, reserve_space, num_layers, D, hidden_size, cells, layer_inputs, - layer_outputs, cell_seq_states, param::RNNCell::NonlineMode::IDENTITY, - new_workspace); + x, y, hx_param, reserve_space, num_layers, D, hidden_size, cells, + layer_inputs, layer_outputs, cell_seq_states, + param::RNNCell::NonlineMode::IDENTITY, new_workspace); // dhy arr, dhx arr TensorNDArray dhy_arr = {dhy, dcy}, dhx_arr = {dhx, dcx}; @@ -110,11 +134,12 @@ void LSTMBackwardImpl::exec( } size_t LSTMBackwardImpl::get_workspace_in_bytes( - const TensorLayout& x, const TensorLayout& y, const TensorLayout& hx, - const TensorLayout& cx, const TensorLayout& dy, const TensorLayout& dhy, - const TensorLayout& dcy, const TensorLayout& flatten_weights, - const TensorLayout& reserve_space, const TensorLayout& dx, - const TensorLayout& dhx, const TensorLayout& dcx, const TensorLayout& dw) { + const TensorLayout& x, const TensorLayout& y, const TensorLayout& /*hx*/, + const TensorLayout& /*cx*/, const TensorLayout& /*dy*/, + const TensorLayout& /*dhy*/, const TensorLayout& /*dcy*/, + const TensorLayout& flatten_weights, const TensorLayout& /*reserve_space*/, + const TensorLayout& /*dx*/, const TensorLayout& /*dhx*/, + const TensorLayout& /*dcx*/, const TensorLayout& /*dw*/) { size_t D = param().bidirectional ? 
2 : 1; size_t num_layers = param().num_layers; size_t hidden_size = param().hidden_size; @@ -142,5 +167,6 @@ size_t LSTMBackwardImpl::get_workspace_in_bytes( return workspace_size; } } // namespace naive +} // namespace megdnn -} // namespace megdnn \ No newline at end of file +// vim: syntax=cpp.doxygen \ No newline at end of file diff --git a/dnn/src/naive/lstm/opr_impl.h b/dnn/src/naive/lstm/opr_impl.h index 9ff5aa9d..6622f51d 100644 --- a/dnn/src/naive/lstm/opr_impl.h +++ b/dnn/src/naive/lstm/opr_impl.h @@ -22,14 +22,16 @@ public: _megdnn_tensor_in input, _megdnn_tensor_in hx, _megdnn_tensor_in cx, _megdnn_tensor_in flatten_weights, _megdnn_tensor_out output, _megdnn_tensor_out hy, _megdnn_tensor_out cy, - _megdnn_tensor_out reserve_space, _megdnn_workspace workspace); + _megdnn_tensor_out reserve_space, _megdnn_workspace workspace) override; size_t get_workspace_in_bytes( const TensorLayout& input, const TensorLayout& hx, const TensorLayout& cx, const TensorLayout& flatten_weights, const TensorLayout& output, const TensorLayout& hy, const TensorLayout& cy, - const TensorLayout& reserve_space); - size_t get_reserve_size_in_bytes(const TensorLayout& input); + const TensorLayout& reserve_space) override; + size_t get_reserve_size_in_bytes(const TensorLayout& input) override; + + bool is_thread_safe() const override { return true; } }; class LSTMBackwardImpl : public LSTMBackward { @@ -42,14 +44,17 @@ public: _megdnn_tensor_in dcy, _megdnn_tensor_in flatten_weights, _megdnn_tensor_in reserve_space, _megdnn_tensor_out dx, _megdnn_tensor_out dhx, _megdnn_tensor_out dcx, _megdnn_tensor_out dw, - _megdnn_workspace workspace); + _megdnn_workspace workspace) override; + + bool is_thread_safe() const override { return true; } virtual size_t get_workspace_in_bytes( const TensorLayout& x, const TensorLayout& y, const TensorLayout& hx, const TensorLayout& cx, const TensorLayout& dy, const TensorLayout& dhy, const TensorLayout& dcy, const TensorLayout& flatten_weights, const TensorLayout& reserve_space, const TensorLayout& dx, - const TensorLayout& dhx, const TensorLayout& dcx, const TensorLayout& dw); + const TensorLayout& dhx, const TensorLayout& dcx, + const TensorLayout& dw) override; }; } // namespace naive diff --git a/dnn/src/naive/lstm/template_impl.cpp b/dnn/src/naive/lstm/template_impl.cpp index 108ee238..541b828b 100644 --- a/dnn/src/naive/lstm/template_impl.cpp +++ b/dnn/src/naive/lstm/template_impl.cpp @@ -19,7 +19,8 @@ void cell_opr_exec( _megdnn_tensor_in input, _megdnn_tensor_in weight_ih, _megdnn_tensor_in weight_hh, _megdnn_tensor_in bias_ih, _megdnn_tensor_in bias_hh, const TensorNDArray& states, - TensorNDArray& states_new, _megdnn_workspace workspace, Handle* handle) { + TensorNDArray& states_new, _megdnn_workspace workspace, + param::RNNCell::NonlineMode /*nonline_mode*/, Handle* handle) { auto opr = handle->create_operator(); TensorLayout gates, h_new, c_new; opr->deduce_layout( diff --git a/dnn/src/naive/lstm_cell/opr_impl.cpp b/dnn/src/naive/lstm_cell/opr_impl.cpp index 10b0dd6b..139a7c81 100644 --- a/dnn/src/naive/lstm_cell/opr_impl.cpp +++ b/dnn/src/naive/lstm_cell/opr_impl.cpp @@ -11,6 +11,9 @@ #include "src/naive/lstm_cell/opr_impl.h" #include "src/common/lstm_cell.h" +#include "midout.h" +MIDOUT_DECL(megdnn_naive_lstmcell_fwd) + namespace megdnn { namespace naive { size_t LSTMCellImpl::get_workspace_in_bytes( @@ -29,9 +32,12 @@ void LSTMCellImpl::exec( _megdnn_tensor_in hx, _megdnn_tensor_in weight_hh, _megdnn_tensor_in bias_hh, _megdnn_tensor_in cx, _megdnn_tensor_out 
h_new, _megdnn_tensor_out c_new, _megdnn_tensor_out gates, _megdnn_workspace workspace) { - megdnn::lstm_cell::exec( - input, weight_ih, bias_ih, hx, weight_hh, bias_hh, cx, h_new, c_new, gates, - workspace, handle()); + MIDOUT_BEGIN(megdnn_naive_lstmcell_fwd) { + megdnn::lstm_cell::exec( + input, weight_ih, bias_ih, hx, weight_hh, bias_hh, cx, h_new, c_new, + gates, workspace, handle()); + } + MIDOUT_END(); } } // namespace naive diff --git a/dnn/src/naive/lstm_cell/opr_impl.h b/dnn/src/naive/lstm_cell/opr_impl.h index 4f56c8eb..3aad44b2 100644 --- a/dnn/src/naive/lstm_cell/opr_impl.h +++ b/dnn/src/naive/lstm_cell/opr_impl.h @@ -30,6 +30,8 @@ public: const TensorLayout& weight_hh, const TensorLayout& bias_hh, const TensorLayout& cx, const TensorLayout& h_new, const TensorLayout& c_new, const TensorLayout& gates) override; + + bool is_thread_safe() const override { return true; } }; } // namespace naive diff --git a/dnn/src/naive/rnn/funcs.h b/dnn/src/naive/rnn/funcs.h index 2ea83416..f157f8a9 100644 --- a/dnn/src/naive/rnn/funcs.h +++ b/dnn/src/naive/rnn/funcs.h @@ -8,8 +8,8 @@ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ -#ifndef _RNN_H -#define _RNN_H +// #ifndef _RNN_H +// #define _RNN_H #include "megdnn/oprs.h" namespace megdnn { namespace naive { @@ -20,7 +20,8 @@ void cell_opr_exec( _megdnn_tensor_in input, _megdnn_tensor_in weight_ih, _megdnn_tensor_in weight_hh, _megdnn_tensor_in bias_ih, _megdnn_tensor_in bias_hh, const TensorNDArray& states, - TensorNDArray& states_new, _megdnn_workspace workspace, Handle* handle); + TensorNDArray& states_new, _megdnn_workspace workspace, + param::RNNCell::NonlineMode nonline_mode, Handle* handle); template size_t cell_opr_get_workspace_in_bytes( @@ -33,29 +34,274 @@ size_t get_workspace_in_bytes( const TensorLayout& input, const TensorLayout& flatten_weights, size_t hidden_size, size_t D, // num_directions - Handle* handle); + Handle* handle) { + size_t seq_len = input.shape[0]; + size_t batch_size = input.shape[1]; + size_t input_size = input.shape[2]; + size_t gate_hidden_size = flatten_weights.shape[0]; + // concat workspace + TensorLayout direction_output_layout{ + TensorShape{seq_len, batch_size, hidden_size}, input.dtype}; + TensorLayout output_layout{{seq_len, batch_size, D * hidden_size}, input.dtype}; + TensorLayoutArray layer_layouts; + for (size_t i = 0; i < D; ++i) + layer_layouts.push_back(direction_output_layout); + auto concat_opr = handle->create_operator(); + concat_opr->param().axis = -1; + size_t concat_workspace = + concat_opr->get_workspace_in_bytes(layer_layouts, output_layout); + // cell workspace + TensorLayout weight_ih{{gate_hidden_size, input_size}, flatten_weights.dtype}; + TensorLayout D_weight_ih{ + {gate_hidden_size, D * hidden_size}, flatten_weights.dtype}; + TensorLayout weight_hh{{gate_hidden_size, hidden_size}, flatten_weights.dtype}; + TensorLayout bias{{gate_hidden_size}, flatten_weights.dtype}; + TensorLayout hx{{batch_size, hidden_size}, input.dtype}; + TensorLayout cell_input = {{input.shape[1], input.shape[2]}, input.dtype}; + TensorLayout D_cell_input = {{input.shape[1], D * hidden_size}, input.dtype}; + + size_t cell_workspace = cell_opr_get_workspace_in_bytes( + cell_input, weight_ih, weight_hh, bias, bias, hx, handle); + size_t D_cell_workspace = cell_opr_get_workspace_in_bytes( + D_cell_input, D_weight_ih, weight_hh, bias, bias, hx, handle); + + return std::max(std::max(cell_workspace, 
D_cell_workspace), concat_workspace); +} template void exec_internal( std::vector& cells, _megdnn_tensor_in input, const TensorNDArray& states, TensorNDArray& states_new, _megdnn_tensor_out output, - _megdnn_tensor_out reserve_space, size_t num_layers, - size_t D, // D is num_directions - Handle* handle, _megdnn_workspace workspace); + _megdnn_tensor_out reserve_space, size_t num_layers, size_t D, + param::RNNCell::NonlineMode nonline_mode, Handle* handle, + _megdnn_workspace workspace) { + size_t seq_len = input.layout.shape[0]; + size_t batch_size = input.layout.shape[1]; + size_t input_size = input.layout.shape[2]; + size_t hidden_size = cells[0].weight_hh.layout.shape[1]; + TensorLayout cell_output_layout{ + TensorShape{batch_size, hidden_size}, states[0].layout.dtype}; + TensorLayout cell_first_input_layout{ + TensorShape{batch_size, input_size}, input.layout.dtype}; + TensorLayout cell_input_layout{ + TensorShape{batch_size, D * hidden_size}, input.layout.dtype}; + TensorLayout direction_output_layout{ + TensorShape{seq_len, batch_size, hidden_size}, output.layout.dtype}; + TensorND tmp_output{workspace.raw_ptr, output.layout}; + _megdnn_workspace new_workspace{ + workspace.raw_ptr + tmp_output.layout.span().dist_byte(), + workspace.size - tmp_output.layout.span().dist_byte()}; + + auto cell_opr = handle->create_operator(); + auto copy_opr = handle->create_operator(); + + // copy states to states_new + for (size_t i = 0; i < states.size(); ++i) + copy_opr->exec(states[i], states_new[i]); + void* reserve_ptr = reserve_space.raw_ptr(); + + // layer 1 + for (size_t d = 0; d < D; ++d) { + size_t cell_idx = d; + auto& cell = cells[cell_idx]; + + TensorNDArray cur_states; + size_t states_offset = cell_idx * cell_output_layout.span().dist_byte(); + for (size_t i = 0; i < states.size(); ++i) { + cur_states.push_back(TensorND{ + static_cast(states_new[i].raw_ptr()) + states_offset, + cell_output_layout}); + } + + for (size_t i = 0; i < seq_len; ++i) { + size_t step = d == 0 ? 
i : seq_len - 1 - i; + TensorND step_input{ + static_cast(input.raw_ptr()) + + step * cell_first_input_layout.span().dist_byte(), + cell_first_input_layout}; + TensorND step_output{ + static_cast(output.raw_ptr()) + + (step * D) * cell_output_layout.span().dist_byte() + + d * cell_output_layout.span().dist_byte() / batch_size, + cell_output_layout}; + TensorNDArray tmp_states; + for (size_t s = 0; s < cur_states.size(); ++s) { + tmp_states.push_back(TensorND{reserve_ptr, cur_states[s].layout}); + size_t size_in_bytes = cur_states[s].layout.span().dist_byte(); + reserve_ptr = static_cast(reserve_ptr) + size_in_bytes; + } + cell_opr_exec( + step_input, cell.weight_ih, cell.weight_hh, cell.bias_ih, + cell.bias_hh, cur_states, tmp_states, new_workspace, nonline_mode, + handle); + for (size_t s = 0; s < tmp_states.size(); ++s) { + copy_opr->exec(tmp_states[s], cur_states[s]); + } + TensorLayout half_output_layout{ + TensorShape{hidden_size}, states[0].layout.dtype}; + if (D == 2) { + for (size_t i = 0; i < batch_size; i++) { + TensorND half_cur_states{ + // output + static_cast(cur_states[0].raw_ptr()) + + i * half_output_layout.span().dist_byte(), + half_output_layout}; + TensorND half_step_output{ + static_cast(step_output.raw_ptr()) + + i * half_output_layout.span().dist_byte() * 2, + half_output_layout}; + copy_opr->exec(half_cur_states, half_step_output); + } + } else + copy_opr->exec(cur_states[0], step_output); + } + } + + for (size_t layer = 1; layer < num_layers; ++layer) { + for (size_t d = 0; d < D; ++d) { + size_t cell_idx = layer * D + d; + auto& cell = cells[cell_idx]; + + TensorNDArray cur_states; + size_t states_offset = cell_idx * cell_output_layout.span().dist_byte(); + for (size_t i = 0; i < states.size(); ++i) { + cur_states.push_back(TensorND{ + static_cast(states_new[i].raw_ptr()) + states_offset, + cell_output_layout}); + } + + for (size_t i = 0; i < seq_len; ++i) { + size_t step = d == 0 ? 
i : seq_len - 1 - i; + TensorND step_input{ + static_cast(output.raw_ptr()) + + step * cell_input_layout.span().dist_byte(), + cell_input_layout}; + TensorND step_output{ + static_cast(tmp_output.raw_ptr()) + + (step * D) * cell_output_layout.span().dist_byte() + + d * cell_output_layout.span().dist_byte() / batch_size, + cell_output_layout}; + TensorNDArray tmp_states; + for (size_t s = 0; s < cur_states.size(); ++s) { + tmp_states.push_back(TensorND{reserve_ptr, cur_states[s].layout}); + size_t size_in_bytes = cur_states[s].layout.span().dist_byte(); + reserve_ptr = static_cast(reserve_ptr) + size_in_bytes; + } + cell_opr_exec( + step_input, cell.weight_ih, cell.weight_hh, cell.bias_ih, + cell.bias_hh, cur_states, tmp_states, new_workspace, + nonline_mode, handle); + // copy states to cur_states + for (size_t s = 0; s < tmp_states.size(); ++s) { + copy_opr->exec(tmp_states[s], cur_states[s]); + } + TensorLayout half_output_layout{ + TensorShape{hidden_size}, states[0].layout.dtype}; + if (D == 2) { + for (size_t i = 0; i < batch_size; i++) { + TensorND half_cur_states{ + // output + static_cast(cur_states[0].raw_ptr()) + + i * half_output_layout.span().dist_byte(), + half_output_layout}; + TensorND half_step_output{ + static_cast(step_output.raw_ptr()) + + i * half_output_layout.span().dist_byte() * 2, + half_output_layout}; + copy_opr->exec(half_cur_states, half_step_output); + } + } else + copy_opr->exec(cur_states[0], step_output); + } + } + copy_opr->exec(tmp_output, output); + } +} template size_t get_cells( size_t D, size_t num_layers, size_t input_size, size_t hidden_size, bool bias, std::vector& cells, _megdnn_tensor_in flatten_weights, - _megdnn_workspace workspace); + _megdnn_workspace workspace) { + cells.reserve(D * num_layers); + void* weight_ptr = flatten_weights.raw_ptr(); + for (size_t layer = 0; layer < num_layers; ++layer) { + for (size_t d = 0; d < D; ++d) { + size_t cell_input_size = D * hidden_size; + if (layer == 0) + cell_input_size = input_size; + Cell cell( + weight_ptr, hidden_size, cell_input_size, bias, + flatten_weights.layout.dtype, workspace); + weight_ptr = + static_cast(weight_ptr) + cell.weight_size_in_bytes(); + cells.push_back(cell); + } + } + return cells[0].workspace_size_in_bytes(); +} template size_t get_inputs_for_exec( - _megdnn_tensor_in x, _megdnn_tensor_in y, _megdnn_tensor_in reserve_space, + _megdnn_tensor_in x, _megdnn_tensor_in y, + const std::vector unfold_hx, _megdnn_tensor_in reserve_space, size_t num_layers, size_t D, size_t hidden_size, const std::vector& cells, TensorNDArray& layer_inputs, TensorNDArray& layer_outputs, std::vector>& cell_seq_states, - param::RNNCell::NonlineMode nonlineMode, _megdnn_workspace workspace); + param::RNNCell::NonlineMode /*nonlineMode*/, _megdnn_workspace workspace) { + // return used workspace size + + layer_inputs.push_back(x); + size_t seq_len = x.layout.shape[0]; + size_t batch_size = x.layout.shape[1]; + size_t num_states = cells[0].num_states(); + TensorLayout cell_output_layout{{batch_size, hidden_size}, y.layout.dtype}; + TensorLayout direction_output_layout{ + {seq_len, batch_size, hidden_size}, y.layout.dtype}; + void* workspace_ptr = workspace.raw_ptr; + + // extract intermedia states from reserve space + for (size_t layer = 0; layer < num_layers; ++layer) { + TensorND layer_output{workspace_ptr, y.layout}; + workspace_ptr = static_cast(workspace_ptr) + + layer_output.layout.span().dist_byte(); + for (size_t d = 0; d < D; ++d) { + cell_seq_states.push_back(std::vector()); + 
cell_seq_states[cell_seq_states.size() - 1].push_back( + {unfold_hx[layer * D + d]}); + // reverse direction is stored with reversed order of sequence order + for (size_t i = 0; i < seq_len; ++i) { + size_t step = i; + if (d == 1) + step = seq_len - i - 1; + size_t offset = ((layer * D + d) * seq_len + step) * + cell_output_layout.span().dist_byte() * num_states; + TensorNDArray cur_states; + for (size_t s = 0; s < num_states; ++s) { + TensorND h{ + static_cast(reserve_space.raw_ptr()) + offset + + s * cell_output_layout.span().dist_byte(), + cell_output_layout}; + cur_states.push_back(h); + } + TensorND hy{ + static_cast(reserve_space.raw_ptr()) + offset, + cell_output_layout}; + // states + cell_seq_states[cell_seq_states.size() - 1].push_back(cur_states); + // output + offset = i * D * cell_output_layout.span().dist_byte(); + memcpy(static_cast(layer_output.raw_ptr()) + offset, + hy.raw_ptr(), hy.layout.span().dist_byte()); + } + } + layer_outputs.push_back(layer_output); + if (layer != num_layers - 1) + layer_inputs.push_back(layer_output); + } + return static_cast(workspace_ptr) - + static_cast((void*)workspace.raw_ptr); +} template void backward_exec_internal( @@ -65,11 +311,182 @@ void backward_exec_internal( const std::vector>& cell_seq_states, _megdnn_tensor_in dy, const TensorNDArray& dhy, _megdnn_tensor_out dx, TensorNDArray& dstates, _megdnn_tensor_out dw, Handle* handle, - _megdnn_workspace workspace); + _megdnn_workspace workspace) { + /* + layer_inputs: array of input of each layer, element 0: [seq_len, batch_size, + input_size], element others: [seq_len, batch_size, D * hidden_size] + layer_outputs: array of outputs of each rnn. To access outputs of the cell at + (layer, d), use layer_outputs[layer]. The shape is [seq_len, batch_size, + output_size(D*hidden_size)] (in sequence order) cell_seq_states: array of states + of each cell at each step. 
To access the states of the cell at (layer, d) at + sequence step (step), use cell_seq_states[layer*D + d][step] + */ + size_t seq_len = layer_inputs[0].layout.shape[0]; + size_t batch_size = layer_inputs[0].layout.shape[1]; + DType dtype = layer_inputs[0].layout.dtype; + size_t cell_y_size = layer_outputs[0].layout.shape[2] / D; + size_t hidden_size = cell_y_size; + TensorLayout cell_y_layout = {{batch_size, cell_y_size}, dtype}; + void* workspace_ptr = workspace.raw_ptr; + + TensorND layer_output_grad{ + workspace_ptr, {{seq_len, batch_size, D * hidden_size}, dtype}}; + workspace_ptr = static_cast(workspace_ptr) + + layer_output_grad.layout.span().dist_byte(); + memcpy(layer_output_grad.raw_ptr(), dy.raw_ptr(), dy.layout.span().dist_byte()); + TensorNDArray direction_dx_arr; + for (size_t i = 0; i < D; ++i) { + TensorLayout direction_dx_layout{{seq_len, batch_size, hidden_size}, dtype}; + direction_dx_arr.push_back(TensorND(workspace_ptr, direction_dx_layout)); + workspace_ptr = static_cast(workspace_ptr) + + direction_dx_layout.span().dist_byte(); + } + TensorNDArray L0_direction_dx_arr; + for (size_t i = 0; i < D; ++i) { + TensorLayout direction_dx_layout{{seq_len, batch_size, input_size}, dtype}; + L0_direction_dx_arr.push_back(TensorND(workspace_ptr, direction_dx_layout)); + workspace_ptr = static_cast(workspace_ptr) + + direction_dx_layout.span().dist_byte(); + } + + std::vector dstates_arr; + for (size_t layer = 0; layer < num_layers; ++layer) { + for (size_t d = 0; d < D; ++d) { + TensorNDArray cell_states; + cell_states.reserve(dstates.size()); + for (size_t i = 0; i < dstates.size(); ++i) { + size_t offset = (layer * D + d) * cell_y_layout.span().dist_byte(); + TensorND dhx_cell{ + static_cast(dstates[i].raw_ptr()) + offset, + cell_y_layout}; + memcpy(dhx_cell.raw_ptr(), + static_cast(dhy[i].raw_ptr()) + offset, + cell_y_layout.span().dist_byte()); + cell_states.emplace_back(dhx_cell); + } + dstates_arr.push_back(cell_states); + } + } + + memset(dw.raw_ptr(), 0, dw.layout.span().dist_byte()); + + std::vector cell_grads; + size_t used_workspace_size = static_cast(workspace_ptr) - + static_cast((void*)(workspace.raw_ptr)); + workspace_ptr = + static_cast(workspace_ptr) + + get_cells( + D, num_layers, input_size, hidden_size, bias, cell_grads, dw, + Workspace( + workspace.raw_ptr + used_workspace_size, + workspace.size - used_workspace_size)); + + auto add_opr = handle->create_operator(); + add_opr->param().mode = Elemwise::Mode::ADD; + auto copy_opr = handle->create_operator(); + + // initialize dx to zero + memset(dx.raw_ptr(), 0, dx.layout.span().dist_byte()); + + // calculate grads + for (int layer = (int)num_layers - 1; layer >= 0; --layer) { + for (int d = (int)D - 1; d >= 0; --d) { + Cell& cell = cells[layer * D + d]; + Cell& cell_grad = cell_grads[layer * D + d]; + size_t input_size = layer_inputs[layer].layout.shape[2]; + const TensorND& x_arr = layer_inputs[layer]; + const TensorND& y_arr = layer_outputs[layer]; + TensorLayout x_layout = {{batch_size, input_size}, dtype}; + + // tmp tensors + void* tmp_workspace_ptr = workspace_ptr; + TensorND dwi_tmp{tmp_workspace_ptr, cell_grad.weight_ih.layout}; + tmp_workspace_ptr = static_cast(tmp_workspace_ptr) + + dwi_tmp.layout.span().dist_byte(); + TensorND dwh_tmp{tmp_workspace_ptr, cell_grad.weight_hh.layout}; + tmp_workspace_ptr = static_cast(tmp_workspace_ptr) + + dwh_tmp.layout.span().dist_byte(); + TensorND dbias_tmp{tmp_workspace_ptr, cell_grad.bias_ih.layout}; + tmp_workspace_ptr = static_cast(tmp_workspace_ptr) + + 
dbias_tmp.layout.span().dist_byte(); + size_t used_workspace_size = + static_cast(tmp_workspace_ptr) - + static_cast((void*)(workspace.raw_ptr)); + + for (size_t i = 0; i < seq_len; ++i) { + size_t step = i; + if (d == 0) + step = seq_len - i - 1; + TensorND x{ + static_cast(x_arr.raw_ptr()) + + step * x_layout.span().dist_byte(), + x_layout}, + y{static_cast(y_arr.raw_ptr()) + + (step * D + d) * cell_y_layout.span().dist_byte(), + cell_y_layout}; + const TensorNDArray& cell_states = cell_seq_states[layer * D + d][step]; + TensorNDArray& dstates_new = dstates_arr[layer * D + d]; + + TensorND dy_t{ + static_cast(layer_output_grad.raw_ptr()) + + (step * D + d) * cell_y_layout.span().dist_byte(), + cell_y_layout}; + add_opr->exec({dstates_new[0], dy_t}, dy_t); + + TensorND dx_t; + if (layer == 0) + dx_t = {static_cast(L0_direction_dx_arr[d].raw_ptr()) + + step * x_layout.span().dist_byte(), + x_layout}; + else + dx_t = {static_cast(direction_dx_arr[d].raw_ptr()) + + step * x_layout.span().dist_byte(), + x_layout}; + TensorNDArray douts = {dy_t}; + for (size_t s = 1; s < dstates_new.size(); ++s) + douts.push_back(dstates_new[s]); + cell.backward( + handle, nonlineMode, x, cell_states, y, douts, dx_t, + dstates_new, dwi_tmp, dwh_tmp, dbias_tmp, + Workspace( + workspace.raw_ptr + used_workspace_size, + workspace.size - used_workspace_size)); + // add step gradient to overall gradient + add_opr->exec({dwi_tmp, cell_grad.weight_ih}, cell_grad.weight_ih); + add_opr->exec({dwh_tmp, cell_grad.weight_hh}, cell_grad.weight_hh); + add_opr->exec({dbias_tmp, cell_grad.bias_ih}, cell_grad.bias_ih); + add_opr->exec({dbias_tmp, cell_grad.bias_hh}, cell_grad.bias_hh); + } + } + // add gradient of different directions to layer_output_grad. + if (layer == 0) { + for (size_t i = 0; i < D; ++i) + add_opr->exec({L0_direction_dx_arr[i], dx}, dx); + } else { + if (D == 1) + copy_opr->exec(direction_dx_arr[0], layer_output_grad); + else { + for (size_t t = 0; t < seq_len; ++t) { + size_t offset = t * D * cell_y_layout.span().dist_byte(); + for (size_t d = 0; d < D; ++d) { + TensorND src{ + static_cast(direction_dx_arr[d].raw_ptr()) + + offset, + cell_y_layout}; + TensorND dst{ + static_cast(layer_output_grad.raw_ptr()) + + offset + d * cell_y_layout.span().dist_byte(), + cell_y_layout}; + copy_opr->exec(src, dst); + } + } + } + } + } +} } // namespace rnn } // namespace naive } // namespace megdnn - -#include "funcs.tpp" -#endif +// #include "funcs.tpp" +// #endif diff --git a/dnn/src/naive/rnn/funcs.tpp b/dnn/src/naive/rnn/funcs.tpp deleted file mode 100644 index 60703f21..00000000 --- a/dnn/src/naive/rnn/funcs.tpp +++ /dev/null @@ -1,449 +0,0 @@ -/** - * \file dnn/src/naive/rnn/funcs.cpp - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- */ -#include "funcs.h" - -namespace megdnn { -namespace naive { -namespace rnn { - -template -size_t get_workspace_in_bytes( - const TensorLayout& input, const TensorLayout& flatten_weights, - size_t hidden_size, - size_t D, // num_directions - Handle* handle) { - size_t seq_len = input.shape[0]; - size_t batch_size = input.shape[1]; - size_t input_size = input.shape[2]; - size_t gate_hidden_size = flatten_weights.shape[0]; - // concat workspace - TensorLayout direction_output_layout{ - TensorShape{seq_len, batch_size, hidden_size}, input.dtype}; - TensorLayout output_layout{{seq_len, batch_size, D * hidden_size}, input.dtype}; - TensorLayoutArray layer_layouts; - for (size_t i = 0; i < D; ++i) - layer_layouts.push_back(direction_output_layout); - auto concat_opr = handle->create_operator(); - concat_opr->param().axis = -1; - size_t concat_workspace = - concat_opr->get_workspace_in_bytes(layer_layouts, output_layout); - // cell workspace - TensorLayout weight_ih{{gate_hidden_size, input_size}, flatten_weights.dtype}; - TensorLayout weight_hh{{gate_hidden_size, hidden_size}, flatten_weights.dtype}; - TensorLayout bias{{gate_hidden_size}, flatten_weights.dtype}; - TensorLayout hx{{batch_size, hidden_size}, input.dtype}; - size_t cell_workspace = cell_opr_get_workspace_in_bytes( - input, weight_ih, weight_hh, bias, bias, hx, handle); - - return std::max(cell_workspace, concat_workspace); -} - -template -void exec_internal( - std::vector& cells, _megdnn_tensor_in input, const TensorNDArray& states, - TensorNDArray& states_new, _megdnn_tensor_out output, - _megdnn_tensor_out reserve_space, size_t num_layers, - size_t D, // D is num_directions - Handle* handle, _megdnn_workspace workspace) { - size_t seq_len = input.layout.shape[0]; - size_t batch_size = input.layout.shape[1]; - size_t input_size = input.layout.shape[2]; - size_t hidden_size = cells[0].weight_hh.layout.shape[1]; - TensorLayout cell_output_layout{ - TensorShape{batch_size, hidden_size}, states[0].layout.dtype}; - TensorLayout cell_first_input_layout{ - TensorShape{batch_size, input_size}, input.layout.dtype}; - TensorLayout cell_input_layout{ - TensorShape{batch_size, D * hidden_size}, input.layout.dtype}; - TensorLayout direction_output_layout{ - TensorShape{seq_len, batch_size, hidden_size}, output.layout.dtype}; - TensorND tmp_output{workspace.raw_ptr, output.layout}; - _megdnn_workspace new_workspace{ - workspace.raw_ptr + tmp_output.layout.span().dist_byte(), - workspace.size - tmp_output.layout.span().dist_byte()}; - - auto cell_opr = handle->create_operator(); - auto copy_opr = handle->create_operator(); - - // copy states to states_new - for (size_t i = 0; i < states.size(); ++i) - copy_opr->exec(states[i], states_new[i]); - void* reserve_ptr = reserve_space.raw_ptr(); - - // layer 1 - // TensorNDArray layer_outputs; - for (size_t d = 0; d < D; ++d) { - size_t cell_idx = d; - auto& cell = cells[cell_idx]; - - TensorNDArray cur_states; - size_t states_offset = cell_idx * cell_output_layout.span().dist_byte(); - for (size_t i = 0; i < states.size(); ++i) { - cur_states.push_back(TensorND{ - static_cast(states_new[i].raw_ptr()) + states_offset, - cell_output_layout}); - } - // TensorND direction_output_tensor{output.raw_ptr + d * - // direction_output_layout.span().dist_byte(), - // direction_output_layout}; - for (size_t i = 0; i < seq_len; ++i) { - size_t step = d == 0 ? 
i : seq_len - 1 - i; - TensorND step_input{ - static_cast(input.raw_ptr()) + - step * cell_first_input_layout.span().dist_byte(), - cell_first_input_layout}; - TensorND step_output{ - static_cast(output.raw_ptr()) + - (step * D + d) * cell_output_layout.span().dist_byte(), - cell_output_layout}; - // temporary states of each step (use reserve space) - TensorNDArray tmp_states; - for (size_t s = 0; s < cur_states.size(); ++s) { - tmp_states.push_back(TensorND{reserve_ptr, cur_states[s].layout}); - size_t size_in_bytes = cur_states[s].layout.span().dist_byte(); - reserve_ptr = static_cast(reserve_ptr) + size_in_bytes; - } - cell_opr_exec( - step_input, cell.weight_ih, cell.weight_hh, cell.bias_ih, - cell.bias_hh, cur_states, tmp_states, new_workspace, handle); - // copy states to cur_states - for (size_t s = 0; s < tmp_states.size(); ++s) { - copy_opr->exec(tmp_states[s], cur_states[s]); - } - // copy h to step output - copy_opr->exec(cur_states[0], step_output); - } - } - - for (size_t layer = 1; layer < num_layers; ++layer) { - // TensorNDArray layer_outputs; - - for (size_t d = 0; d < D; ++d) { - size_t cell_idx = layer * D + d; - auto& cell = cells[cell_idx]; - - TensorNDArray cur_states; - size_t states_offset = cell_idx * cell_output_layout.span().dist_byte(); - for (size_t i = 0; i < states.size(); ++i) { - cur_states.push_back(TensorND{ - static_cast(states_new[i].raw_ptr()) + states_offset, - cell_output_layout}); - } - // TensorND direction_output_tensor{output.raw_ptr + d * - // direction_output_layout.span().dist_byte(), - // direction_output_layout}; - - for (size_t i = 0; i < seq_len; ++i) { - size_t step = d == 0 ? i : seq_len - 1 - i; - TensorND step_input{ - static_cast(output.raw_ptr()) + - step * cell_input_layout.span().dist_byte(), - cell_input_layout}; - TensorND step_output{ - static_cast(tmp_output.raw_ptr()) + - (step * D + d) * cell_output_layout.span().dist_byte(), - cell_output_layout}; - // temporary states of each step (use reserve space) - TensorNDArray tmp_states; - for (size_t s = 0; s < cur_states.size(); ++s) { - tmp_states.push_back(TensorND{reserve_ptr, cur_states[s].layout}); - size_t size_in_bytes = cur_states[s].layout.span().dist_byte(); - reserve_ptr = static_cast(reserve_ptr) + size_in_bytes; - } - cell_opr_exec( - step_input, cell.weight_ih, cell.weight_hh, cell.bias_ih, - cell.bias_hh, cur_states, cur_states, new_workspace, handle); - // copy states to cur_states - for (size_t s = 0; s < tmp_states.size(); ++s) { - copy_opr->exec(tmp_states[s], cur_states[s]); - } - // copy h to step_output - copy_opr->exec(cur_states[0], step_output); - } - } - // copy layer output to output - copy_opr->exec(tmp_output, output); - } - // output: [d0, d1, d0, d1 ...] 
-} - -template -size_t get_cells( - size_t D, size_t num_layers, size_t input_size, size_t hidden_size, bool bias, - std::vector& cells, _megdnn_tensor_in flatten_weights, - _megdnn_workspace workspace) { - cells.reserve(D * num_layers); - void* weight_ptr = flatten_weights.raw_ptr(); - for (size_t layer = 0; layer < num_layers; ++layer) { - for (size_t d = 0; d < D; ++d) { - size_t cell_input_size = D * hidden_size; - if (layer == 0) - cell_input_size = input_size; - Cell cell( - weight_ptr, hidden_size, cell_input_size, bias, - flatten_weights.layout.dtype, workspace); - weight_ptr = - static_cast(weight_ptr) + cell.weight_size_in_bytes(); - cells.push_back(cell); - } - } - // return used workspace - return cells[0].workspace_size_in_bytes(); -} - -template -size_t get_inputs_for_exec( - _megdnn_tensor_in x, _megdnn_tensor_in y, _megdnn_tensor_in reserve_space, - size_t num_layers, size_t D, size_t hidden_size, const std::vector& cells, - TensorNDArray& layer_inputs, TensorNDArray& layer_outputs, - std::vector>& cell_seq_states, - param::RNNCell::NonlineMode nonlineMode, _megdnn_workspace workspace) { - // return used workspace size - - layer_inputs.push_back(x); - size_t seq_len = x.layout.shape[0]; - size_t batch_size = x.layout.shape[1]; - size_t num_states = cells[0].num_states(); - TensorLayout cell_output_layout{{batch_size, hidden_size}, y.layout.dtype}; - TensorLayout direction_output_layout{ - {seq_len, batch_size, hidden_size}, y.layout.dtype}; - void* workspace_ptr = workspace.raw_ptr; - - // extract intermedia states from reserve space - for (int layer = 0; layer < num_layers; ++layer) { - TensorND layer_output{workspace_ptr, y.layout}; - workspace_ptr = static_cast(workspace_ptr) + - layer_output.layout.span().dist_byte(); - for (int d = 0; d < D; ++d) { - cell_seq_states.push_back(std::vector()); - // reverse direction is stored with reversed order of sequence order - for (int i = 0; i < seq_len; ++i) { - size_t step = i; - if (d == 1) - step = seq_len - i - 1; - size_t offset = ((layer * D + d) * seq_len + step) * - cell_output_layout.span().dist_byte() * num_states; - TensorNDArray cur_states; - for (int s = 0; s < num_states; ++s) { - TensorND h{ - static_cast(reserve_space.raw_ptr()) + offset + - s * cell_output_layout.span().dist_byte(), - cell_output_layout}; - cur_states.push_back(h); - } - TensorND hy{ - static_cast(reserve_space.raw_ptr()) + offset, - cell_output_layout}; // the first hidden state is the output - // states - cell_seq_states[cell_seq_states.size() - 1].push_back(cur_states); - // output - offset = i * D * cell_output_layout.span().dist_byte(); - memcpy(static_cast(layer_output.raw_ptr()) + offset, hy.raw_ptr(), - hy.layout.span().dist_byte()); - } - } - layer_outputs.push_back(layer_output); - if (layer != num_layers - 1) - layer_inputs.push_back(layer_output); - } - return static_cast(workspace_ptr) - - static_cast((void*)workspace.raw_ptr); -} - -template -// using Cell = RNNCellWeightWrapper; -void backward_exec_internal( - std::vector& cells, size_t D, size_t num_layers, size_t input_size, - bool bias, param::RNNCell::NonlineMode nonlineMode, - const TensorNDArray& layer_inputs, const TensorNDArray& layer_outputs, - const std::vector>& cell_seq_states, - _megdnn_tensor_in dy, const TensorNDArray& dhy, _megdnn_tensor_out dx, - TensorNDArray& dstates, _megdnn_tensor_out dw, Handle* handle, - _megdnn_workspace workspace) { - /* - layer_inputs: array of input of each layer, element 0: [seq_len, batch_size, - input_size], element others: [seq_len, 
batch_size, D * hidden_size] - layer_outputs: array of outputs of each rnn. To access outputs of the cell at - (layer, d), use layer_outputs[layer]. The shape is [seq_len, batch_size, - output_size(D*hidden_size)] (in sequence order) cell_seq_states: arrray of states - of each cell at each step. To access the states of the cell at (layer, d) at - sequence step (step), use cell_seq_states[layer*D + d][step] - */ - size_t seq_len = layer_inputs[0].layout.shape[0]; - size_t batch_size = layer_inputs[0].layout.shape[1]; - DType dtype = layer_inputs[0].layout.dtype; - size_t cell_y_size = - layer_outputs[0].layout.shape[2] / D; // should all be the same - size_t hidden_size = cell_y_size; - TensorLayout cell_y_layout = {{batch_size, cell_y_size}, dtype}; - void* workspace_ptr = workspace.raw_ptr; - - TensorND layer_output_grad{ - workspace_ptr, {{seq_len, batch_size, D * hidden_size}, dtype}}; - workspace_ptr = static_cast(workspace_ptr) + - layer_output_grad.layout.span().dist_byte(); - memcpy(layer_output_grad.raw_ptr(), dy.raw_ptr(), dy.layout.span().dist_byte()); - TensorNDArray direction_dx_arr; // for layer 1 to layer num_layers-1 - for (int i = 0; i < D; ++i) { - TensorLayout direction_dx_layout{{seq_len, batch_size, hidden_size}, dtype}; - direction_dx_arr.push_back(TensorND(workspace_ptr, direction_dx_layout)); - workspace_ptr = static_cast(workspace_ptr) + - direction_dx_layout.span().dist_byte(); - } - TensorNDArray L0_direction_dx_arr; - for (int i = 0; i < D; ++i) { - TensorLayout direction_dx_layout{{seq_len, batch_size, input_size}, dtype}; - L0_direction_dx_arr.push_back(TensorND(workspace_ptr, direction_dx_layout)); - workspace_ptr = static_cast(workspace_ptr) + - direction_dx_layout.span().dist_byte(); - } - // cell states for each layer and each direction - std::vector dstates_arr; - for (int layer = 0; layer < num_layers; ++layer) { - for (int d = 0; d < D; ++d) { - TensorNDArray cell_states; - cell_states.reserve(dstates.size()); - for (int i = 0; i < dstates.size(); ++i) { - size_t offset = (layer * D + d) * cell_y_layout.span().dist_byte(); - TensorND dhx_cell{ - static_cast(dstates[i].raw_ptr()) + offset, - cell_y_layout}; - memcpy(dhx_cell.raw_ptr(), static_cast(dhy[i].raw_ptr()) + offset, - cell_y_layout.span().dist_byte()); - cell_states.emplace_back(dhx_cell); - } - dstates_arr.push_back(cell_states); - } - } - - // init gradient on weight to zero - memset(dw.raw_ptr(), 0, dw.layout.span().dist_byte()); - // use cells to contain gradient - std::vector cell_grads; - size_t used_workspace_size = static_cast(workspace_ptr) - - static_cast((void*)(workspace.raw_ptr)); - workspace_ptr = - static_cast(workspace_ptr) + - get_cells( - D, num_layers, input_size, hidden_size, bias, cell_grads, dw, - Workspace( - workspace.raw_ptr + used_workspace_size, - workspace.size - used_workspace_size)); - - auto add_opr = handle->create_operator(); - add_opr->param().mode = Elemwise::Mode::ADD; - auto copy_opr = handle->create_operator(); - - // initialize dx to zero - memset(dx.raw_ptr(), 0, dx.layout.span().dist_byte()); - - // calculate grads - for (int layer = num_layers - 1; layer >= 0; --layer) { - for (int d = D - 1; d >= 0; --d) { - Cell& cell = cells[layer * D + d]; - Cell& cell_grad = cell_grads[layer * D + d]; - size_t input_size = layer_inputs[layer].layout.shape[2]; - const TensorND& x_arr = layer_inputs[layer]; - const TensorND& y_arr = layer_outputs[layer]; - TensorLayout x_layout = {{batch_size, input_size}, dtype}; - - // tmp tensors - void* tmp_workspace_ptr = 
workspace_ptr; - TensorND dwi_tmp{tmp_workspace_ptr, cell_grad.weight_ih.layout}; - tmp_workspace_ptr = static_cast(tmp_workspace_ptr) + - dwi_tmp.layout.span().dist_byte(); - TensorND dwh_tmp{tmp_workspace_ptr, cell_grad.weight_hh.layout}; - tmp_workspace_ptr = static_cast(tmp_workspace_ptr) + - dwh_tmp.layout.span().dist_byte(); - TensorND dbias_tmp{tmp_workspace_ptr, cell_grad.bias_ih.layout}; - tmp_workspace_ptr = static_cast(tmp_workspace_ptr) + - dbias_tmp.layout.span().dist_byte(); - size_t used_workspace_size = - static_cast(tmp_workspace_ptr) - - static_cast((void*)(workspace.raw_ptr)); - - for (int i = 0; i < seq_len; ++i) { - // reverse time step (not seq step). Here step means seq step - size_t step = i; - if (d == 0) - step = seq_len - i - 1; - TensorND x{ - static_cast(x_arr.raw_ptr()) + - step * x_layout.span().dist_byte(), - x_layout}, - y{static_cast(y_arr.raw_ptr()) + - (step * D + d) * cell_y_layout.span().dist_byte(), - cell_y_layout}; - const TensorNDArray& cell_states = cell_seq_states[layer * D + d][step]; - TensorNDArray& dstates_new = dstates_arr[layer * D + d]; - // dy should be d_output + d_hidden - TensorND dy_t{ - static_cast(layer_output_grad.raw_ptr()) + - (step * D + d) * cell_y_layout.span().dist_byte(), - cell_y_layout}; - add_opr->exec({dstates_new[0], dy_t}, dy_t); - // dx for layer 0 has a different size - TensorND dx_t; - if (layer == 0) - dx_t = {static_cast(L0_direction_dx_arr[d].raw_ptr()) + - step * x_layout.span().dist_byte(), - x_layout}; - else - dx_t = {static_cast(direction_dx_arr[d].raw_ptr()) + - step * x_layout.span().dist_byte(), - x_layout}; - TensorNDArray douts = {dy_t}; - for (int s = 1; s < dstates_new.size(); ++s) - douts.push_back(dstates_new[s]); - cell.backward( - handle, nonlineMode, x, cell_states, y, douts, dx_t, - dstates_new, dwi_tmp, dwh_tmp, dbias_tmp, - Workspace( - workspace.raw_ptr + used_workspace_size, - workspace.size - used_workspace_size)); - // add step gradient to overall gradient - add_opr->exec({dwi_tmp, cell_grad.weight_ih}, cell_grad.weight_ih); - add_opr->exec({dwh_tmp, cell_grad.weight_hh}, cell_grad.weight_hh); - add_opr->exec({dbias_tmp, cell_grad.bias_ih}, cell_grad.bias_ih); - add_opr->exec({dbias_tmp, cell_grad.bias_hh}, cell_grad.bias_hh); - } - } - // add gradient of different directions to layer_output_grad. - // Layer 0 to dx - if (layer == 0) { - for (int i = 0; i < D; ++i) - add_opr->exec({L0_direction_dx_arr[i], dx}, dx); - } else { - if (D == 1) - copy_opr->exec(direction_dx_arr[0], layer_output_grad); - else { // D == 2, arrange as [(d0, d1), (d0, d1), ...] 
- for (size_t t = 0; t < seq_len; ++t) { - size_t offset = t * D * cell_y_layout.span().dist_byte(); - for (size_t d = 0; d < D; ++d) { - TensorND src{ - static_cast(direction_dx_arr[d].raw_ptr()) + - offset, - cell_y_layout}; - TensorND dst{ - static_cast(layer_output_grad.raw_ptr()) + - offset + d * cell_y_layout.span().dist_byte(), - cell_y_layout}; - copy_opr->exec(src, dst); - } - } - } - } - } -} - -} // namespace rnn -} // namespace naive -} // namespace megdnn diff --git a/dnn/src/naive/rnn/opr_impl.cpp b/dnn/src/naive/rnn/opr_impl.cpp index aaa8b4a1..4379cf1e 100644 --- a/dnn/src/naive/rnn/opr_impl.cpp +++ b/dnn/src/naive/rnn/opr_impl.cpp @@ -22,6 +22,9 @@ #include +#include "midout.h" +MIDOUT_DECL(megdnn_naive_rnn_fwd) + namespace megdnn { namespace naive { @@ -32,33 +35,40 @@ void RNNImpl::exec( _megdnn_tensor_in flatten_weights, _megdnn_tensor_out output, _megdnn_tensor_out hy, _megdnn_tensor_out reserve_space, _megdnn_workspace workspace) { - auto _param = param(); - size_t D = _param.bidirectional ? 2 : 1; - size_t num_layers = _param.num_layers; - size_t input_size = input.layout.shape[2]; - std::vector cells; - size_t used_workspace_size = rnn::get_cells( - D, num_layers, input_size, _param.hidden_size, _param.bias, cells, - flatten_weights, workspace); - - Workspace new_workspace( - workspace.raw_ptr + used_workspace_size, - workspace.size - used_workspace_size); - TensorNDArray states, states_new; - states.push_back(hx); - states_new.push_back(hy); - rnn::exec_internal( - cells, input, states, states_new, output, reserve_space, num_layers, D, - this->handle(), new_workspace); + MIDOUT_BEGIN(megdnn_naive_rnn_fwd) { + auto _param = param(); + size_t D = _param.bidirectional ? 2 : 1; + size_t num_layers = _param.num_layers; + size_t input_size = input.layout.shape[2]; + std::vector cells; + size_t used_workspace_size = rnn::get_cells( + D, num_layers, input_size, _param.hidden_size, _param.bias, cells, + flatten_weights, workspace); + + Workspace new_workspace( + workspace.raw_ptr + used_workspace_size, + workspace.size - used_workspace_size); + TensorNDArray states, states_new; + states.push_back(hx); + states_new.push_back(hy); + rnn::exec_internal( + cells, input, states, states_new, output, reserve_space, num_layers, D, + _param.nonlineMode, this->handle(), new_workspace); + } + MIDOUT_END(); } size_t RNNImpl::get_workspace_in_bytes( const TensorLayout& input, const TensorLayout& hx, const TensorLayout& flatten_weights, const TensorLayout& output, - const TensorLayout& hy, const TensorLayout& reserve_space) { + const TensorLayout& /*hy*/, const TensorLayout& /*reserve_space*/) { + auto _param = param(); + size_t D = _param.bidirectional ? 2 : 1; + size_t last_dim = std::max(input.shape[2], D * hx.shape[1]); + TensorLayout last_input = {{input.shape[0], input.shape[1], last_dim}, input.dtype}; size_t workspace_size = rnn::get_workspace_in_bytes( - input, flatten_weights, param().hidden_size, param().bidirectional ? 2 : 1, - this->handle()); + last_input, flatten_weights, param().hidden_size, + param().bidirectional ? 
2 : 1, this->handle()); if (!param().bias) { // use fake bias (all 0) TensorLayout bias_layout = {{param().hidden_size}, flatten_weights.dtype}; workspace_size += bias_layout.span().dist_byte(); @@ -82,50 +92,23 @@ void RNNBackwardImpl::exec( _megdnn_tensor_in reserve_space, _megdnn_tensor_out dx, _megdnn_tensor_out dhx, _megdnn_tensor_out dw, _megdnn_workspace workspace) { TensorNDArray layer_inputs; - // layer_inputs.push_back(x); TensorNDArray layer_outputs; std::vector> cell_seq_states; size_t num_layers = param().num_layers; size_t D = param().bidirectional ? 2 : 1; - // size_t seq_len = x.layout.shape[0]; - // size_t batch_size = x.layout.shape[1]; size_t input_size = x.layout.shape[2]; + size_t batch_size = x.layout.shape[1]; size_t hidden_size = param().hidden_size; size_t used_workspace_size = 0; // get cells std::vector cells; - // workspace_ptr = static_cast(workspace_ptr) + used_workspace_size += rnn::get_cells( D, num_layers, input_size, hidden_size, param().bias, cells, flatten_weights, workspace); - // extract intermedia states from reserve space - /*for (int layer = 0; layer < num_layers; ++layer) { - TensorND layer_output{workspace_ptr, y.layout}; - workspace_ptr = static_cast(workspace_ptr) + - layer_output.layout.span().dist_byte(); for (int d = 0; d < D; ++d) { - cell_seq_states.push_back(std::vector()); - // reverse direction is stored with reversed order of sequence order - for (int i = 0; i < seq_len; ++i) { - size_t step = i; - if (d == 1) step = seq_len - i - 1; - size_t offset = ((layer * D + d) * seq_len + step) * - cell_output_layout.span().dist_byte(); TensorND - hy{static_cast(reserve_space.raw_ptr) + offset, cell_output_layout}; - // states - cell_seq_states[cell_seq_states.size() - 1].push_back({hy}); - // output - offset = i * D * cell_output_layout.span().dist_byte(); - memcpy(static_cast(layer_output.raw_ptr) + offset, - hy.raw_ptr, hy.layout.span().dist_byte()); - } - } - cell_seq_outputs.push_back(layer_output); - if (layer != num_layers - 1) layer_inputs.push_back(layer_output); - }*/ // nonlinear mode - param::RNNCell::NonlineMode nonlineMode; + param::RNNCell::NonlineMode nonlineMode = param::RNNCell::NonlineMode::TANH; using ModeRNN = param::RNN::NonlineMode; using ModeRNNCell = param::RNNCell::NonlineMode; switch (param().nonlineMode) { @@ -135,22 +118,34 @@ void RNNBackwardImpl::exec( case ModeRNN::TANH: nonlineMode = ModeRNNCell::TANH; break; + case ModeRNN::IDENTITY: + break; } // get formatted inputs Workspace new_workspace = Workspace( workspace.raw_ptr + used_workspace_size, workspace.size - used_workspace_size); + TensorLayout unfold_hx_layout{ + TensorShape{batch_size, hidden_size}, hx.layout.dtype}; + std::vector hx_param; + for (size_t layer = 0; layer < num_layers; ++layer) { + for (size_t d = 0; d < D; ++d) { + TensorNDArray unfold_hx; + size_t idx = layer * D + d; + size_t states_offset = idx * unfold_hx_layout.span().dist_byte(); + unfold_hx.push_back(TensorND{ + static_cast(hx.raw_ptr()) + states_offset, + unfold_hx_layout}); + hx_param.push_back(unfold_hx); + } + } used_workspace_size += rnn::get_inputs_for_exec( - x, y, reserve_space, num_layers, D, hidden_size, cells, layer_inputs, - layer_outputs, cell_seq_states, nonlineMode, new_workspace); + x, y, hx_param, reserve_space, num_layers, D, hidden_size, cells, + layer_inputs, layer_outputs, cell_seq_states, nonlineMode, new_workspace); - // dhy arr, dhx arr TensorNDArray dhy_arr = {dhy}, dhx_arr = {dhx}; - // exec - /*size_t used_workspace_size = static_cast(workspace_ptr) - - 
static_cast((void*)workspace.raw_ptr);*/ new_workspace = Workspace( workspace.raw_ptr + used_workspace_size, workspace.size - used_workspace_size); @@ -161,10 +156,11 @@ void RNNBackwardImpl::exec( } size_t RNNBackwardImpl::get_workspace_in_bytes( - const TensorLayout& x, const TensorLayout& y, const TensorLayout& hx, - const TensorLayout& dy, const TensorLayout& dhy, - const TensorLayout& flatten_weights, const TensorLayout& reserve_space, - const TensorLayout& dx, const TensorLayout& dhx, const TensorLayout& dw) { + const TensorLayout& x, const TensorLayout& y, const TensorLayout& /*hx*/, + const TensorLayout& /*dy*/, const TensorLayout& /*dhy*/, + const TensorLayout& flatten_weights, const TensorLayout& /*reserve_space*/, + const TensorLayout& /*dx*/, const TensorLayout& /*dhx*/, + const TensorLayout& /*dw*/) { size_t D = param().bidirectional ? 2 : 1; size_t num_layers = param().num_layers; size_t hidden_size = param().hidden_size; diff --git a/dnn/src/naive/rnn/opr_impl.h b/dnn/src/naive/rnn/opr_impl.h index 8ae6809c..b52edbd1 100644 --- a/dnn/src/naive/rnn/opr_impl.h +++ b/dnn/src/naive/rnn/opr_impl.h @@ -22,13 +22,14 @@ public: _megdnn_tensor_in input, _megdnn_tensor_in hx, _megdnn_tensor_in flatten_weights, _megdnn_tensor_out output, _megdnn_tensor_out hy, _megdnn_tensor_out reserve_space, - _megdnn_workspace workspace); + _megdnn_workspace workspace) override; size_t get_workspace_in_bytes( const TensorLayout& input, const TensorLayout& hx, const TensorLayout& flatten_weights, const TensorLayout& output, - const TensorLayout& hy, const TensorLayout& reserve_space); - size_t get_reserve_size_in_bytes(const TensorLayout& input); + const TensorLayout& hy, const TensorLayout& reserve_space) override; + size_t get_reserve_size_in_bytes(const TensorLayout& input) override; + bool is_thread_safe() const override { return true; } }; class RNNBackwardImpl : public RNNBackward { @@ -40,13 +41,15 @@ public: _megdnn_tensor_in dy, _megdnn_tensor_in dhy, _megdnn_tensor_in flatten_weights, _megdnn_tensor_in reserve_space, _megdnn_tensor_out dx, _megdnn_tensor_out dhx, _megdnn_tensor_out dw, - _megdnn_workspace workspace); + _megdnn_workspace workspace) override; virtual size_t get_workspace_in_bytes( const TensorLayout& x, const TensorLayout& y, const TensorLayout& hx, const TensorLayout& dy, const TensorLayout& dhy, const TensorLayout& flatten_weights, const TensorLayout& reserve_space, - const TensorLayout& dx, const TensorLayout& dhx, const TensorLayout& dw); + const TensorLayout& dx, const TensorLayout& dhx, + const TensorLayout& dw) override; + bool is_thread_safe() const override { return true; } }; } // namespace naive diff --git a/dnn/src/naive/rnn/rnn.cpp b/dnn/src/naive/rnn/rnn.cpp index 8ea43496..d1889f51 100644 --- a/dnn/src/naive/rnn/rnn.cpp +++ b/dnn/src/naive/rnn/rnn.cpp @@ -218,7 +218,11 @@ void LSTMCellWeightWrapper::backward( x, weight_ih, bias_ih, states[0], weight_hh, bias_hh, states[1], dstates[0], dstates[1], gates_tensor, new_workspace); // no information left in the workspace - // i, f, o, g + + // BUG: The order of gate_grad if i_g f_g o_g g_g , but it should be i_g f_g g_g o_g + // The returned gradient includes both horizontal and vertical gradients, + // horizontal grad = douts[1] vertical gradients = douts[1] + // Here the variable is confusing !!! 
TensorLayout single_gate = {{gates.shape[0], gates.shape[1] / 4}, gates.dtype}; TensorND i, f, o, g, i_grad, f_grad, o_grad, g_grad; // grad refers to the grad of gates before activation @@ -239,8 +243,8 @@ void LSTMCellWeightWrapper::backward( g_grad = { static_cast(o_grad.raw_ptr()) + single_gate.span().dist_byte(), single_gate}; - // activation auto elem_opr = handle->create_operator(); + elem_opr->param().mode = Elemwise::Mode::SIGMOID; elem_opr->exec({i}, i); elem_opr->exec({f}, f); @@ -254,8 +258,8 @@ void LSTMCellWeightWrapper::backward( mul_opr->exec({douts[0], tanh_cy}, dstates[0]); elem_opr->param().mode = Elemwise::Mode::SIGMOID_GRAD; elem_opr->exec({o, dstates[0]}, o_grad); // grad of gate o - // use dstates[0] as tmp tensor to store dhy * o mul_opr->exec({douts[0], o}, dstates[0]); + elem_opr->param().mode = Elemwise::Mode::TANH_GRAD; elem_opr->exec({tanh_cy, dstates[0]}, dstates[1]); // grad of cy from hy elem_opr->param().mode = Elemwise::Mode::ADD; diff --git a/dnn/src/naive/rnn/rnn.h b/dnn/src/naive/rnn/rnn.h index bb5fdf08..cbe2d464 100644 --- a/dnn/src/naive/rnn/rnn.h +++ b/dnn/src/naive/rnn/rnn.h @@ -38,6 +38,7 @@ public: _megdnn_tensor_out dwi, _megdnn_tensor_out dwh, _megdnn_tensor_out dbias, _megdnn_workspace workspace) const; virtual size_t num_states() const; + virtual ~CellWeightsWrapperBase() {} }; class RNNCellWeightWrapper : public CellWeightsWrapperBase { diff --git a/dnn/src/naive/rnn/template_impl.cpp b/dnn/src/naive/rnn/template_impl.cpp index d3f870ad..b82e72a2 100644 --- a/dnn/src/naive/rnn/template_impl.cpp +++ b/dnn/src/naive/rnn/template_impl.cpp @@ -19,8 +19,10 @@ void cell_opr_exec( _megdnn_tensor_in input, _megdnn_tensor_in weight_ih, _megdnn_tensor_in weight_hh, _megdnn_tensor_in bias_ih, _megdnn_tensor_in bias_hh, const TensorNDArray& states, - TensorNDArray& states_new, _megdnn_workspace workspace, Handle* handle) { + TensorNDArray& states_new, _megdnn_workspace workspace, + param::RNNCell::NonlineMode nonline_mode, Handle* handle) { auto opr = handle->create_operator(); + opr->param().nonlineMode = nonline_mode; opr->exec( input, weight_ih, bias_ih, states[0], weight_hh, bias_hh, states_new[0], workspace); diff --git a/dnn/src/naive/rnn_cell/opr_impl.cpp b/dnn/src/naive/rnn_cell/opr_impl.cpp index 8a3d692c..85eccf99 100644 --- a/dnn/src/naive/rnn_cell/opr_impl.cpp +++ b/dnn/src/naive/rnn_cell/opr_impl.cpp @@ -11,6 +11,9 @@ #include "src/naive/rnn_cell/opr_impl.h" #include "src/common/rnn_cell.h" +#include "midout.h" +MIDOUT_DECL(megdnn_naive_rnncell_fwd) + namespace megdnn { namespace naive { size_t RNNCellImpl::get_workspace_in_bytes( @@ -26,9 +29,12 @@ void RNNCellImpl::exec( _megdnn_tensor_in input, _megdnn_tensor_in weight_ih, _megdnn_tensor_in bias_ih, _megdnn_tensor_in hx, _megdnn_tensor_in weight_hh, _megdnn_tensor_in bias_hh, _megdnn_tensor_out dst, _megdnn_workspace workspace) { - megdnn::rnn_cell::exec( - input, weight_ih, bias_ih, hx, weight_hh, bias_hh, dst, workspace, - param().nonlineMode, handle()); + MIDOUT_BEGIN(megdnn_naive_rnncell_fwd) { + megdnn::rnn_cell::exec( + input, weight_ih, bias_ih, hx, weight_hh, bias_hh, dst, workspace, + param().nonlineMode, handle()); + } + MIDOUT_END(); } } // namespace naive } // namespace megdnn \ No newline at end of file diff --git a/dnn/src/naive/rnn_cell/opr_impl.h b/dnn/src/naive/rnn_cell/opr_impl.h index fe73edb5..59e34cc3 100644 --- a/dnn/src/naive/rnn_cell/opr_impl.h +++ b/dnn/src/naive/rnn_cell/opr_impl.h @@ -27,6 +27,7 @@ public: const TensorLayout& bias_ih, const TensorLayout& hx, 
const TensorLayout& weight_hh, const TensorLayout& bias_hh, const TensorLayout& dst) override; + bool is_thread_safe() const override { return true; } }; } // namespace naive diff --git a/dnn/test/common/deduce_layout_proxy.h b/dnn/test/common/deduce_layout_proxy.h index 26080d70..5992e779 100644 --- a/dnn/test/common/deduce_layout_proxy.h +++ b/dnn/test/common/deduce_layout_proxy.h @@ -77,17 +77,18 @@ struct DeduceLayoutProxy { }; template -struct DeduceLayoutProxy { - static void deduce_layout(Opr* opr, TensorLayoutArray& layouts) { - megdnn_assert(layouts.size() == 6); - opr->deduce_layout( - layouts[0], layouts[1], layouts[2], layouts[3], layouts[4], layouts[5]); - } +struct DeduceLayoutProxy { + static void deduce_layout(Opr*, TensorLayoutArray&) {} }; template -struct DeduceLayoutProxy { - static void deduce_layout(Opr*, TensorLayoutArray&) {} +struct DeduceLayoutProxy { + static void deduce_layout(Opr* opr, TensorLayoutArray& layouts) { + megdnn_assert(layouts.size() == 7); + opr->deduce_layout( + layouts[0], layouts[1], layouts[2], layouts[3], layouts[4], layouts[5], + layouts[6]); + } }; template @@ -109,6 +110,38 @@ struct DeduceLayoutProxy { layouts[6], layouts[7], layouts[8]); } }; + +template +struct DeduceLayoutProxy { + static void deduce_layout(Opr* opr, TensorLayoutArray& layouts) { + megdnn_assert(layouts.size() == 10); + opr->deduce_layout( + layouts[0], layouts[1], layouts[2], layouts[3], layouts[4], layouts[5], + layouts[6], layouts[7], layouts[8], layouts[9]); + } +}; + +template +struct DeduceLayoutProxy { + static void deduce_layout(Opr*, TensorLayoutArray&) {} +}; + +template +struct DeduceLayoutProxy { + static void deduce_layout(Opr* opr, TensorLayoutArray& layouts) { + megdnn_assert(layouts.size() == 13); + opr->deduce_layout( + layouts[0], layouts[1], layouts[2], layouts[3], layouts[4], layouts[5], + layouts[6], layouts[7], layouts[8], layouts[9], layouts[10], + layouts[11], layouts[12]); + } +}; + +template +struct DeduceLayoutProxy { + static void deduce_layout(Opr*, TensorLayoutArray&) {} +}; + } // namespace test } // namespace megdnn diff --git a/dnn/test/common/exec_proxy.h b/dnn/test/common/exec_proxy.h index 4255afeb..da8c9c46 100644 --- a/dnn/test/common/exec_proxy.h +++ b/dnn/test/common/exec_proxy.h @@ -23,6 +23,44 @@ template struct ExecProxy; template +struct ExecProxy { + WorkspaceWrapper W; + void exec(Opr* opr, const TensorNDArray& tensors) { + if (!W.valid()) { + W = WorkspaceWrapper(opr->handle(), 0); + } + W.update(opr->get_workspace_in_bytes( + tensors[0].layout, tensors[1].layout, tensors[2].layout, + tensors[3].layout, tensors[4].layout, tensors[5].layout, + tensors[6].layout, tensors[7].layout, tensors[8].layout, + tensors[9].layout, tensors[10].layout, tensors[11].layout, + tensors[12].layout)); + opr->exec( + tensors[0], tensors[1], tensors[2], tensors[3], tensors[4], tensors[5], + tensors[6], tensors[7], tensors[8], tensors[9], tensors[10], + tensors[11], tensors[12], W.workspace()); + } +}; + +template +struct ExecProxy { + WorkspaceWrapper W; + void exec(Opr* opr, const TensorNDArray& tensors) { + if (!W.valid()) { + W = WorkspaceWrapper(opr->handle(), 0); + } + W.update(opr->get_workspace_in_bytes( + tensors[0].layout, tensors[1].layout, tensors[2].layout, + tensors[3].layout, tensors[4].layout, tensors[5].layout, + tensors[6].layout, tensors[7].layout, tensors[8].layout, + tensors[9].layout)); + opr->exec( + tensors[0], tensors[1], tensors[2], tensors[3], tensors[4], tensors[5], + tensors[6], tensors[7], tensors[8], tensors[9], 
W.workspace()); + } +}; + +template struct ExecProxy { WorkspaceWrapper W; void exec(Opr* opr, const TensorNDArray& tensors) { diff --git a/dnn/test/common/rnn.h b/dnn/test/common/rnn.h deleted file mode 100644 index 691b2cae..00000000 --- a/dnn/test/common/rnn.h +++ /dev/null @@ -1,51 +0,0 @@ -/** - * \file dnn/test/common/rnn.h - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -#pragma once -#include - -#include "megdnn/basic_types.h" -#include "megdnn/opr_param_defs.h" - -namespace megdnn { -namespace test { -namespace rnn { -struct TestArg { - param::RNN param; - TensorShape input, hx, flatten_weights; - TestArg(param::RNN param, TensorShape input, TensorShape hx, - TensorShape flatten_weights) - : param(param), input(input), hx(hx), flatten_weights(flatten_weights) {} -}; - -inline std::vector get_args() { - std::vector args; - size_t batch_size = 2; - size_t input_size = 3; - size_t hidden_size = 2; - size_t seq_len = 2; - size_t gate_hidden_size = hidden_size; - param::RNN param; - param.num_layers = 1; - param.bidirectional = false; - param.bias = false; - param.hidden_size = hidden_size; - param.nonlineMode = param::RNN::NonlineMode::RELU; - - args.emplace_back( - param, TensorShape{seq_len, batch_size, input_size}, - TensorShape{batch_size, hidden_size}, - TensorShape{gate_hidden_size, input_size + hidden_size}); - return args; -} - -} // namespace rnn -} // namespace test -} // namespace megdnn \ No newline at end of file diff --git a/dnn/test/naive/lstm.cpp b/dnn/test/naive/lstm.cpp new file mode 100644 index 00000000..96f51ca9 --- /dev/null +++ b/dnn/test/naive/lstm.cpp @@ -0,0 +1,197 @@ +/** + * \file dnn/test/naive/lstm.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +// #include "test/common/lstm.h" +#include "megdnn/dtype.h" +#include "megdnn/oprs.h" +#include "test/common/checker.h" +#include "test/naive/fixture.h" + +namespace megdnn { +namespace test { + +TEST_F(NAIVE, LSTM_FORWARD) { + Checker checker(handle(), true); + size_t batch_size = 2; + size_t input_size = 3; + size_t hidden_size = 2; + size_t seq_len = 2; + size_t gate_hidden_size = 4 * hidden_size; + LSTM::Param param; + param.num_layers = 1; + param.bidirectional = false; + param.bias = false; + param.hidden_size = hidden_size; + checker.set_param(param).exect( + Testcase{ + TensorValue( + {seq_len, batch_size, input_size}, dtype::Float32(), + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}), // input + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {1, 2, 3, 4}), // hx + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {2, 3, 4, 5}), // cx + TensorValue( + {gate_hidden_size, input_size + hidden_size}, + dtype::Float32(), + {3, 6, 1, 3, 2, 7, 2, 1, 3, 2, 1, 1, 3, 6, + 1, 3, 2, 7, 2, 1, 3, 2, 1, 1, 9, 3, 5, 1, + 9, 3, 5, 1, 9, 3, 5, 1, 9, 3, 5, 1}), // flattern weights + {}, + {}, + {}, + {}}, + Testcase{ + {}, + {}, + {}, + {}, + TensorValue( + {seq_len, batch_size, hidden_size}, dtype::Float32(), + {0.9951, 0.9993, 0.9999, 1.0000, 0.9993, 0.9999, 1.0000, + 1.0000}), // output + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {0.9993, 0.9999, 1.0000, 1.0000}), // hy + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {4.0000, 5.0000, 6.0000, 7.0000}), // cy + TensorValue( + {2, 2, 2, 2}, dtype::Float32(), + {0.995054, 0.999328, 0.99990, 0.999987, 3., 4., 5., 6., + 0.999329, 0.999328, 0.99990, 1., 4., 5., 6., + 7.}) // reserve space + }); + param.bidirectional = true; + checker.set_param(param).exect( + Testcase{ + TensorValue( + {seq_len, batch_size, input_size}, dtype::Float32(), + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}), // input + TensorValue( + {2, batch_size, hidden_size}, dtype::Float32(), + {1, 2, 3, 4, 5, 6, 7, 8}), // hx + TensorValue( + {2, batch_size, hidden_size}, dtype::Float32(), + {2, 3, 4, 5, 6, 7, 8, 9}), // cx + TensorValue( + {gate_hidden_size, 2 * (input_size + hidden_size)}, + dtype::Float32(), + {3, 6, 1, 3, 2, 7, 2, 1, 3, 2, 1, 1, 3, 6, 1, 3, 2, + 7, 2, 1, 3, 2, 1, 1, 9, 3, 5, 1, 9, 3, 5, 1, 9, 3, + 5, 1, 9, 3, 5, 1, 3, 6, 1, 3, 2, 7, 2, 1, 3, 2, 1, + 1, 3, 6, 1, 3, 2, 7, 2, 1, 3, 2, 1, 1, 9, 3, 5, 1, + 9, 3, 5, 1, 9, 3, 5, 1, 9, 3, 5, 1}), // flattern weights + {}, + {}, + {}, + {}}, + Testcase{ + {}, + {}, + {}, + {}, + TensorValue( + {seq_len, batch_size, 2 * hidden_size}, dtype::Float32(), + {0.9951, 0.9993, 1.0000, 1.0000, 0.9999, 1.0000, 1.0000, + 1.0000, 0.9993, 0.9999, 1.0000, 1.0000, 1.0000, 1.0000, + 1.0000, 1.0000}), // output + TensorValue( + {2, batch_size, hidden_size}, dtype::Float32(), + {0.9993, 0.9999, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, + 1.0000}), // hy + TensorValue( + {2, batch_size, hidden_size}, dtype::Float32(), + {4.0000, 5.0000, 6.0000, 7.0000, 8.0000, 9.0000, 10.0000, + 11.0000}), // cy + TensorValue( + {4, 2, 2, 2}, dtype::Float32(), + {0.995054, 0.999328, 0.99990, 0.999987, 3., 4., + 5., 6., 0.999329, 0.999328, 0.99990, 1., + 4., 5., 6., 7., 1., 0.999328, + 0.99990, 0.999987, 7., 8., 9., 10., + 0.999329, 0.999328, 0.99990, 1., 8., 9., + 10., 11.}) // reserve space + }); + param.num_layers = 2; + checker.set_param(param).exect( + Testcase{ + TensorValue( + {seq_len, batch_size, input_size}, dtype::Float32(), + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}), // input + TensorValue( + 
{4, batch_size, hidden_size}, dtype::Float32(), + {1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8}), // hx + TensorValue( + {4, batch_size, hidden_size}, dtype::Float32(), + {2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9}), // cx + TensorValue( + {8, 22}, dtype::Float32(), + { + 3, 6, 1, 3, 2, 7, 2, 1, 3, 2, 1, 1, 3, 6, 1, 3, + 2, 7, 2, 1, 3, 2, 1, 1, 9, 3, 5, 1, 9, 3, 5, 1, + 9, 3, 5, 1, 9, 3, 5, 1, 3, 6, 1, 3, 2, 7, 2, 1, + 3, 2, 1, 1, 3, 6, 1, 3, 2, 7, 2, 1, 3, 2, 1, 1, + 9, 3, 5, 1, 9, 3, 5, 1, 9, 3, 5, 1, 9, 3, 5, 1, + 3, 6, 1, 3, 2, 7, 2, 1, 3, 2, 1, 1, 2, 7, 2, 1, + 3, 6, 1, 3, 2, 7, 2, 1, 3, 2, 1, 1, 2, 7, 2, 1, + 3, 6, 1, 3, 2, 7, 2, 1, 3, 2, 1, 1, 2, 7, 2, 1, + 3, 6, 1, 3, 2, 7, 2, 1, 3, 2, 1, 1, 2, 7, 2, 1, + 9, 3, 5, 1, 9, 3, 5, 1, 9, 3, 5, 1, 9, 3, 5, 1, + 9, 3, 5, 1, 9, 3, 5, 1, 9, 3, 5, 1, 9, 3, 5, 1, + }), // flattern weights + {}, + {}, + {}, + {}}, + Testcase{ + {}, + {}, + {}, + {}, + TensorValue( + {seq_len, batch_size, 2 * hidden_size}, dtype::Float32(), + {0.9951, 0.9993, 1.0000, 1.0000, 0.9999, 1.0000, 1.0000, + 1.0000, 0.9993, 0.9999, 1.0000, 1.0000, 1.0000, 1.0000, + 1.0000, 1.0000}), // output + TensorValue( + {4, batch_size, hidden_size}, dtype::Float32(), + {0.9993, 0.9999, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, + 1.0000, 0.9993, 0.9999, 1.0000, 1.0000, 1.0000, 1.0000, + 1.0000, 1.0000}), // hy + TensorValue( + {4, batch_size, hidden_size}, dtype::Float32(), + {4.0000, 5.0000, 6.0000, 7.0000, 8.0000, 9.0000, 10.0000, + 11.0000, 4.0000, 5.0000, 6.0000, 7.0000, 8.0000, 9.0000, + 10.0000, 11.0000}), // cy + TensorValue( + {8, 2, 2, 2}, dtype::Float32(), + { + 0.995054, 0.999328, 0.99990, 0.999987, 3., + 4., 5., 6., 0.999329, 0.999328, + 0.99990, 1., 4., 5., 6., + 7., 1., 0.999328, 0.99990, 0.999987, + 7., 8., 9., 10., 0.999329, + 0.999328, 0.99990, 1., 8., 9., + 10., 11., 0.995054, 0.999328, 0.99990, + 0.999987, 3., 4., 5., 6., + 0.999329, 0.999328, 0.99990, 1., 4., + 5., 6., 7., 1., 0.999328, + 0.99990, 0.999987, 7., 8., 9., + 10., 0.999329, 0.999328, 0.99990, 1., + 8., 9., 10., 11., + }) // reserve space + }); +} + +} // namespace test +} // namespace megdnn \ No newline at end of file diff --git a/dnn/test/naive/lstmcell.cpp b/dnn/test/naive/lstmcell.cpp new file mode 100644 index 00000000..1d6091a3 --- /dev/null +++ b/dnn/test/naive/lstmcell.cpp @@ -0,0 +1,140 @@ +/** + * \file dnn/test/naive/lstmcell.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "megdnn/dtype.h" +#include "megdnn/oprs.h" +#include "test/common/checker.h" +#include "test/naive/fixture.h" + +namespace megdnn { +namespace test { +TEST_F(NAIVE, LSTMCELL) { + Checker checker(handle(), true); + for (size_t batch : {1, 4}) + for (size_t n : {3, 4, 5, 23, 100}) + for (size_t out : {3, 6, 25, 100}) { + checker.exec( + {{batch, n}, + {out * 4, n}, + {1, out * 4}, + {batch, out}, + {out * 4, out}, + {1, out * 4}, + {batch, out}, + {}, + {}, + {}}); + } + size_t batch_size = 2; + size_t input_size = 3; + size_t hidden_size = 2; + checker.exect( + Testcase{ + TensorValue( + {batch_size, input_size}, dtype::Float32(), + {1, 2, 3, 4, 5, 6}), // input + TensorValue( + {4 * hidden_size, input_size}, dtype::Float32(), + { + 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, + 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, + 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, + 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, + }), // weight_ih + TensorValue( + {4 * hidden_size}, dtype::Float32(), + {0, 0, 0, 0, 0, 0, 0, 0}), // bias_ih + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {1, 2, 3, 4}), // hx + TensorValue( + {4 * hidden_size, hidden_size}, dtype::Float32(), + {0.3535, 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, + 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, + 0.3535, 0.3535}), // weight_hh + TensorValue( + {4 * hidden_size}, dtype::Float32(), + {0, 0, 0, 0, 0, 0, 0, 0}), // bias_hh + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {2, 3, 4, 5}), // cx + {}, + {}, + {}}, + Testcase{ + {}, + {}, + {}, + {}, + {}, + {}, + {}, + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {0.9541, 0.9593, 0.9995, 0.9996}), // hy + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {2.8771, 3.8373, 4.9979, 5.9975}), // cy + TensorValue( + {batch_size, 4 * hidden_size}, dtype::Float32(), + {3.18198, 3.18198, 7.7781, 7.7781, 3.18198, 3.18198, + 7.77817, 7.77817, 3.18198, 3.18198, 7.77817, 7.77817, + 3.18198, 3.18198, 7.77817, 7.77817}), // cy + }); + batch_size = 2; + input_size = 2; + hidden_size = 1; + checker.exect( + Testcase{ + TensorValue( + {batch_size, input_size}, dtype::Float32(), + {1, 2, 3, 4}), // input + TensorValue( + {4 * hidden_size, input_size}, dtype::Float32(), + {0.3535, 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, 0.3535, + 0.3535}), // weight_ih + TensorValue( + {4 * hidden_size}, dtype::Float32(), + {0.3535, 0.3535, 0.3535, 0.3535}), // bias_ih + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), {1, 2}), // hx + TensorValue( + {4 * hidden_size, hidden_size}, dtype::Float32(), + {0.3535, 0.3535, 0.3535, 0.3535}), // weight_hh + TensorValue( + {4 * hidden_size}, dtype::Float32(), + {0.3535, 0.3535, 0.3535, 0.3535}), // bias_hh + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), {4, 5}), // cx + {}, + {}, + {}}, + Testcase{ + {}, + {}, + {}, + {}, + {}, + {}, + {}, + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {0.8927, 0.9799}), // hy + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {4.4393, 5.8788}), // cy + TensorValue( + {batch_size, 4 * hidden_size}, dtype::Float32(), + {2.1210, 3.8885, 2.1210, 3.8885, 2.1210, 3.8885, 2.1210, + 3.8885}), // gates + }); +} +} // namespace test +} // namespace megdnn \ No newline at end of file diff --git a/dnn/test/naive/rnn.cpp b/dnn/test/naive/rnn.cpp index 11e9162e..46b95e92 100644 --- a/dnn/test/naive/rnn.cpp +++ b/dnn/test/naive/rnn.cpp @@ -8,7 +8,6 @@ * distributed under the License is distributed on an 
"AS IS" BASIS, WITHOUT * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ -#include "test/common/rnn.h" #include "megdnn/dtype.h" #include "megdnn/oprs.h" #include "test/common/checker.h" @@ -17,22 +16,7 @@ namespace megdnn { namespace test { -/*TEST_F(NAIVE, RNN) { - std::vector args = rnn::get_args(); - Checker checker(handle()); - for (auto&& arg : args) { - checker.set_param(arg.param) - .set_dtype(0, dtype::Float32()) - .set_dtype(1, dtype::Float32()) - .set_dtype(2, dtype::Float32()) - .set_dtype(3, dtype::Float32()) - .set_dtype(4, dtype::Float32()) - .set_dtype(5, dtype::Float32()) - .execs({arg.input, arg.hx, arg.flatten_weights, {}, {}, {}}); - } -}*/ - -TEST_F(NAIVE, RNN_HAND_MADE) { +TEST_F(NAIVE, RNN_FORWARD) { Checker checker(handle(), false); size_t batch_size = 2; size_t input_size = 3; @@ -49,14 +33,17 @@ TEST_F(NAIVE, RNN_HAND_MADE) { Testcase{ TensorValue( {seq_len, batch_size, input_size}, dtype::Float32(), - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}), // input + {-0.66536, 0.08049, 0.12008, 0.63423, 1.37801, 0.02591, + 0.09153, 0.82866, -1.70429, -1.26624, -0.06421, + 0.35816}), // input TensorValue( {batch_size, hidden_size}, dtype::Float32(), - {2, 1, 3, 5}), // hx + {-3.19544, -1.24232, 1.99512, -0.25692}), // hx TensorValue( {gate_hidden_size, input_size + hidden_size}, dtype::Float32(), - {3, 6, 1, 3, 2, 7, 9, 3, 5, 1}), // weights + {0.35355, 0.35355, 0.35355, 0.35355, 0.35355, 0.35355, + 0.35355, 0.35355, 0.35355, 0.35355}), // flattern weights {}, {}, {}}, @@ -66,13 +53,54 @@ TEST_F(NAIVE, RNN_HAND_MADE) { {}, TensorValue( {seq_len, batch_size, hidden_size}, dtype::Float32(), - {39, 39, 90, 84, 300, 216, 546, 366}), // output + {0.0, 0.0, 1.3351, 1.3351, 0.0, 0.0, 0.6003, + 0.6003}), // output TensorValue( {batch_size, hidden_size}, dtype::Float32(), - {21, 11, 42, 20}), // hy + {0.0, 0.0, 0.6003, 0.6003}), // hy TensorValue( {1, 2, 2, 2}, dtype::Float32(), - {2, 1, 3, 5, 21, 11, 42, 20}) // reserve space + {0.0, 0.0, 1.33512, 1.33512, 0.0, 0.0, 0.60031, + 0.60031}) // reserve space + }); + param.num_layers = 2; + checker.set_param(param).exect( + Testcase{ + TensorValue( + {seq_len, batch_size, input_size}, dtype::Float32(), + {-0.66536, 0.08049, 0.12008, 0.63423, 1.37801, 0.02591, + 0.09153, 0.82866, -1.70429, -1.26624, -0.06421, + 0.35816}), // input + TensorValue( + {2, batch_size, hidden_size}, dtype::Float32(), + {-3.19544, -1.24232, 1.99512, -0.25692, -3.19544, -1.24232, + 1.99512, -0.25692}), // hx + TensorValue( + {2, 9}, dtype::Float32(), + {0.35355, 0.35355, 0.35355, 0.35355, 0.35355, 0.35355, + 0.35355, 0.35355, 0.35355, 0.35355, 0.35355, 0.35355, + 0.35355, 0.35355, 0.35355, 0.35355, 0.35355, + 0.35355}), // weights + {}, + {}, + {}}, + Testcase{ + {}, + {}, + {}, + TensorValue( + {seq_len, batch_size, hidden_size}, dtype::Float32(), + {0.0, 0.0, 1.5586, 1.5586, 0.0, 0.0, 1.5266, + 1.5266}), // output + TensorValue( + {2, batch_size, hidden_size}, dtype::Float32(), + {0.0, 0.0, 0.6003, 0.6003, 0.0, 0.0, 1.5266, + 1.5266}), // hy + TensorValue( + {2, 2, 2, 2}, dtype::Float32(), + {0.0, 0.0, 1.33512, 1.33512, 0.0, 0.0, 0.60031, 0.60031, + 0.0, 0.0, 1.55861, 1.55861, 0.0, 0.0, 1.52658, + 1.52658}) // reserve space }); } diff --git a/dnn/test/naive/rnn_cell.cpp b/dnn/test/naive/rnn_cell.cpp new file mode 100644 index 00000000..48de6128 --- /dev/null +++ b/dnn/test/naive/rnn_cell.cpp @@ -0,0 +1,104 @@ +/** + * \file dnn/test/naive/rnncell.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * 
Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "megdnn/dtype.h" +#include "megdnn/oprs.h" +#include "test/common/checker.h" +#include "test/naive/fixture.h" + +namespace megdnn { +namespace test { + +TEST_F(NAIVE, RNNCELL) { + Checker checker(handle(), false); + for (size_t batch : {1, 4}) + for (size_t inp : {3, 4, 5, 23, 100}) + for (size_t hidden : {3, 6, 25, 100}) { + checker.exec( + {{batch, inp}, + {hidden, inp}, + {1, hidden}, + {batch, hidden}, + {hidden, hidden}, + {1, hidden}, + {}}); + } + size_t batch_size = 2; + size_t input_size = 3; + size_t hidden_size = 2; + RNNCell::Param param; + param.nonlineMode = param::RNNCell::NonlineMode::TANH; + checker.set_param(param).exect( + Testcase{ + TensorValue( + {batch_size, input_size}, dtype::Float32(), + {1, 2, 3, 4, 5, 6}), // input + TensorValue( + {hidden_size, input_size}, dtype::Float32(), + {0.3535, 0.3535, 0.3535, 0.3535, 0.3535, + 0.3535}), // weight_ih + TensorValue({1, hidden_size}, dtype::Float32(), {0, 0}), // bias_ih + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {1, 2, 3, 4}), // hx + TensorValue( + {hidden_size, hidden_size}, dtype::Float32(), + {0.3535, 0.3535, 0.3535, 0.3535}), // weight_hh + TensorValue({1, hidden_size}, dtype::Float32(), {0, 0}), // bias_hh + {}}, + Testcase{ + {}, + {}, + {}, + {}, + {}, + {}, + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {0.9966, 0.9966, 1.0, 1.0}), // dst + }); + batch_size = 2; + input_size = 2; + hidden_size = 1; + param.nonlineMode = param::RNNCell::NonlineMode::RELU; + checker.set_param(param).exect( + Testcase{ + TensorValue( + {batch_size, input_size}, dtype::Float32(), + {1, 2, 3, 4}), // input + TensorValue( + {hidden_size, input_size}, dtype::Float32(), + {0.3535, 0.3535}), // weight_ih + TensorValue( + {1, hidden_size}, dtype::Float32(), {0.3535}), // bias_ih + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {-1, -2}), // hx + TensorValue( + {hidden_size, hidden_size}, dtype::Float32(), + {0.3535}), // weight_hh + TensorValue( + {1, hidden_size}, dtype::Float32(), {0.3535}), // bias_hh + {}}, + Testcase{ + {}, + {}, + {}, + {}, + {}, + {}, + TensorValue( + {batch_size, hidden_size}, dtype::Float32(), + {1.414, 2.4745}), // hy + }); +} + +} // namespace test +} // namespace megdnn \ No newline at end of file diff --git a/imperative/python/megengine/module/rnn.py b/imperative/python/megengine/module/rnn.py index 571e3a19..a9340753 100644 --- a/imperative/python/megengine/module/rnn.py +++ b/imperative/python/megengine/module/rnn.py @@ -15,9 +15,8 @@ import numpy as np from ..core._imperative_rt.core2 import apply from ..core.ops import builtin -from ..device import is_cuda_available -from ..functional import concat, expand_dims, repeat, stack, zeros -from ..functional.nn import concat +from ..core.ops.builtin import BatchNorm +from ..functional import stack, zeros from ..tensor import Parameter, Tensor from . 
import init from .module import Module @@ -25,13 +24,7 @@ from .module import Module class RNNCellBase(Module): def __init__( - self, - input_size: int, - hidden_size: int, - bias: bool, - num_chunks: int, - device=None, - dtype=None, + self, input_size: int, hidden_size: int, bias: bool, num_chunks: int, ) -> None: # num_chunks indicates the number of gates super(RNNCellBase, self).__init__() @@ -41,127 +34,195 @@ class RNNCellBase(Module): self.bias = bias # initialize weights - common_kwargs = {"device": device, "dtype": dtype} self.gate_hidden_size = num_chunks * hidden_size self.weight_ih = Parameter( - np.random.uniform(size=(self.gate_hidden_size, input_size)).astype( - np.float32 - ), - **common_kwargs, + np.zeros((self.gate_hidden_size, input_size), dtype=np.float32) ) self.weight_hh = Parameter( - np.random.uniform(size=(self.gate_hidden_size, hidden_size)).astype( - np.float32 - ), - **common_kwargs, + np.zeros((self.gate_hidden_size, hidden_size), dtype=np.float32) ) if bias: self.bias_ih = Parameter( - np.random.uniform(size=(self.gate_hidden_size)).astype(np.float32), - **common_kwargs, + np.zeros((self.gate_hidden_size), dtype=np.float32) ) self.bias_hh = Parameter( - np.random.uniform(size=(self.gate_hidden_size)).astype(np.float32), - **common_kwargs, + np.zeros((self.gate_hidden_size), dtype=np.float32) ) else: - self.bias_ih = zeros(shape=(self.gate_hidden_size), **common_kwargs) - self.bias_hh = zeros(shape=(self.gate_hidden_size), **common_kwargs) + self.bias_ih = zeros(shape=(self.gate_hidden_size)) + self.bias_hh = zeros(shape=(self.gate_hidden_size)) self.reset_parameters() # if bias is False self.bias will remain zero - def get_op(self): - return builtin.RNNCell() - def reset_parameters(self) -> None: stdv = 1.0 / math.sqrt(self.hidden_size) for weight in self.parameters(): init.uniform_(weight, -stdv, stdv) + @abstractmethod def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor: - if hx is None: - hx = zeros( - shape=(input.shape[0], self.gate_hidden_size), - dtype=input.dtype, - device=input.device, - ) - op = self.get_op() - return apply( - op, input, self.weight_ih, self.bias_ih, hx, self.weight_hh, self.bias_hh - )[0] - # return linear(input, self.weight_ih, self.bias_ih) + linear(hx, self.weight_hh, self.bias_hh) + raise NotImplementedError("forward not implemented !") class RNNCell(RNNCellBase): + + r"""An Elman RNN cell with tanh or ReLU non-linearity. + + .. math:: + + h' = \tanh(W_{ih} x + b_{ih} + W_{hh} h + b_{hh}) + + If :attr:`nonlinearity` is `'relu'`, then ReLU is used in place of tanh. + + Args: + input_size: The number of expected features in the input `x` + hidden_size: The number of features in the hidden state `h` + bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. + Default: ``True`` + nonlinearity: The non-linearity to use. Can be either ``'tanh'`` or ``'relu'``. Default: ``'tanh'`` + + Inputs: input, hidden + - **input** of shape `(batch, input_size)`: tensor containing input features + - **hidden** of shape `(batch, hidden_size)`: tensor containing the initial hidden + state for each element in the batch. + Defaults to zero if not provided. 
+ + Outputs: h' + - **h'** of shape `(batch, hidden_size)`: tensor containing the next hidden state + for each element in the batch + + Shape: + - Input1: :math:`(N, H_{in})` tensor containing input features where + :math:`H_{in}` = `input_size` + - Input2: :math:`(N, H_{out})` tensor containing the initial hidden + state for each element in the batch where :math:`H_{out}` = `hidden_size` + Defaults to zero if not provided. + - Output: :math:`(N, H_{out})` tensor containing the next hidden state + for each element in the batch + + + Examples: + + .. code-block:: + + import numpy as np + import megengine as mge + import megengine.module as M + + m = M.RNNCell(10, 20) + inp = mge.tensor(np.random.randn(3, 10), dtype=np.float32) + hx = mge.tensor(np.random.randn(3, 20), dtype=np.float32) + out = m(inp, hx) + print(out.numpy().shape) + + Outputs: + + .. code-block:: + + (3, 20) + + """ + def __init__( self, input_size: int, hidden_size: int, bias: bool = True, nonlinearity: str = "tanh", - device=None, - dtype=None, ) -> None: self.nonlinearity = nonlinearity - super(RNNCell, self).__init__( - input_size, hidden_size, bias, num_chunks=1, device=device, dtype=dtype - ) - # self.activate = tanh if nonlinearity == "tanh" else relu - - def get_op(self): - return builtin.RNNCell(nonlineMode=self.nonlinearity) + super(RNNCell, self).__init__(input_size, hidden_size, bias, num_chunks=1) def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor: - return super().forward(input, hx) + if hx is None: + hx = zeros(shape=(input.shape[0], self.gate_hidden_size),) + op = builtin.RNNCell(nonlineMode=self.nonlinearity) + return apply( + op, input, self.weight_ih, self.bias_ih, hx, self.weight_hh, self.bias_hh + )[0] class LSTMCell(RNNCellBase): - def __init__( - self, - input_size: int, - hidden_size: int, - bias: bool = True, - device=None, - dtype=None, - ) -> None: - super(LSTMCell, self).__init__( - input_size, hidden_size, bias, num_chunks=4, device=device, dtype=dtype - ) - def get_op(self): - return builtin.LSTMCell() + r"""A long short-term memory (LSTM) cell. + + .. math:: + + \begin{array}{ll} + i = \sigma(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\ + f = \sigma(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\ + g = \tanh(W_{ig} x + b_{ig} + W_{hg} h + b_{hg}) \\ + o = \sigma(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\ + c' = f * c + i * g \\ + h' = o * \tanh(c') \\ + \end{array} + + where :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product. + + Args: + input_size: The number of expected features in the input `x` + hidden_size: The number of features in the hidden state `h` + bias: If ``False``, then the layer does not use bias weights `b_ih` and + `b_hh`. Default: ``True`` + + Inputs: input, (h_0, c_0) + - **input** of shape `(batch, input_size)`: tensor containing input features + - **h_0** of shape `(batch, hidden_size)`: tensor containing the initial hidden + state for each element in the batch. + - **c_0** of shape `(batch, hidden_size)`: tensor containing the initial cell state + for each element in the batch. + + If `(h_0, c_0)` is not provided, both **h_0** and **c_0** default to zero. + + Outputs: (h_1, c_1) + - **h_1** of shape `(batch, hidden_size)`: tensor containing the next hidden state + for each element in the batch + - **c_1** of shape `(batch, hidden_size)`: tensor containing the next cell state + for each element in the batch + + Examples: + + .. 
code-block:: + + import numpy as np + import megengine as mge + import megengine.module as M + + m = M.LSTMCell(10, 20) + inp = mge.tensor(np.random.randn(3, 10), dtype=np.float32) + hx = mge.tensor(np.random.randn(3, 20), dtype=np.float32) + cx = mge.tensor(np.random.randn(3, 20), dtype=np.float32) + hy, cy = m(inp, (hx, cx)) + print(hy.numpy().shape) + print(cy.numpy().shape) + + Outputs: + + .. code-block:: + + (3, 20) + (3, 20) + + """ + + def __init__(self, input_size: int, hidden_size: int, bias: bool = True,) -> None: + super(LSTMCell, self).__init__(input_size, hidden_size, bias, num_chunks=4) def forward( self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = None ) -> Tuple[Tensor, Tensor]: # hx: (h, c) if hx is None: - h = zeros( - shape=(input.shape[0], self.hidden_size), - dtype=input.dtype, - device=input.device, - ) - c = zeros( - shape=(input.shape[0], self.hidden_size), - dtype=input.dtype, - device=input.device, - ) + h = zeros(shape=(input.shape[0], self.hidden_size)) + c = zeros(shape=(input.shape[0], self.hidden_size)) else: h, c = hx - op = self.get_op() + op = builtin.LSTMCell() return apply( op, input, self.weight_ih, self.bias_ih, h, self.weight_hh, self.bias_hh, c )[:2] -def is_gpu(device: str) -> bool: - if "xpux" in device and is_cuda_available(): - return True - if "gpu" in device: - return True - return False - - class RNNBase(Module): def __init__( self, @@ -173,11 +234,8 @@ class RNNBase(Module): dropout: float = 0, bidirectional: bool = False, proj_size: int = 0, - device=None, - dtype=None, ) -> None: super(RNNBase, self).__init__() - # self.mode = mode self.input_size = input_size self.hidden_size = hidden_size self.num_layers = num_layers @@ -210,26 +268,25 @@ class RNNBase(Module): for layer in range(self.num_layers): self.cells.append([]) for _ in range(self.num_directions): - self.cells[layer].append(self.create_cell(layer, device, dtype)) + self.cells[layer].append(self.create_cell(layer)) # parameters have been initialized during the creation of the cells # if flatten, then delete cells - self._flatten_parameters(device, dtype, self.cells) + self._flatten_parameters(self.cells) - def _flatten_parameters(self, device, dtype, cells): + def _flatten_parameters(self, cells): gate_hidden_size = cells[0][0].gate_hidden_size size_dim1 = 0 for layer in range(self.num_layers): for direction in range(self.num_directions): size_dim1 += cells[layer][direction].weight_ih.shape[1] size_dim1 += cells[layer][direction].weight_hh.shape[1] - # if self.bias: - # size_dim1 += 2 * self.num_directions * self.num_layers - size_dim1 += 2 * self.num_directions * self.num_layers + if self.bias: + size_dim1 += 2 * self.num_directions * self.num_layers + self._flatten_weights = Parameter( np.zeros((gate_hidden_size, size_dim1), dtype=np.float32) ) self.reset_parameters() - # TODO: if no bias, set the bias to zero def reset_parameters(self) -> None: stdv = 1.0 / math.sqrt(self.hidden_size) @@ -237,7 +294,7 @@ class RNNBase(Module): init.uniform_(weight, -stdv, stdv) @abstractmethod - def create_cell(self, layer, device, dtype): + def create_cell(self, layer): raise NotImplementedError("Cell not implemented !") @abstractmethod @@ -265,81 +322,137 @@ class RNNBase(Module): else: batch_size = input.shape[1] if hx is None: - hx = self.init_hidden(batch_size, input.device, input.dtype) + hx = self.init_hidden(batch_size) output, h = self.apply_op(input, hx) if self.batch_first: output = output.transpose((1, 0, 2)) return output, h - if is_gpu(str(input.device)) or True: - # 
return output, h_n - output, h = self.apply_op(input, hx) - if self.batch_first: - output = output.transpose((1, 0, 2)) - return output, h - order_settings = [(0, input.shape[0]), (input.shape[0] - 1, -1, -1)] - h_n = [] - for layer in range(self.num_layers): - layer_outputs = [] - for direction in range(self.num_directions): - direction_outputs = [None for _ in range(input.shape[0])] - cell = self.cells[layer][direction] - hidden = self._apply_fn_to_hx( - hx, lambda x: x[layer * self.num_directions + direction] - ) - for step in range(*(order_settings[direction])): - hidden = cell(input[step], hidden) # [batch_size, hidden_size] - direction_outputs[step] = self.get_output_from_hidden(hidden) - direction_output = stack( - direction_outputs, axis=0 - ) # [seq_len, batch_size, hidden_size] - layer_outputs.append(direction_output) - h_n.append(hidden) - layer_output = concat( - layer_outputs, axis=-1 - ) # [seq_len, batch_size, D*hidden_size] - input = layer_output - if self.batch_first: - layer_output = layer_output.transpose((1, 0, 2)) - return layer_output, self._stack_h_n(h_n) +class RNN(RNNBase): + r"""Applies a multi-layer Elman RNN with :math:`\tanh` or :math:`\text{ReLU}` non-linearity to an + input sequence. + + + For each element in the input sequence, each layer computes the following + function: + + .. math:: + h_t = \tanh(W_{ih} x_t + b_{ih} + W_{hh} h_{(t-1)} + b_{hh}) + + where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is + the input at time `t`, and :math:`h_{(t-1)}` is the hidden state of the + previous layer at time `t-1` or the initial hidden state at time `0`. + If :attr:`nonlinearity` is ``'relu'``, then :math:`\text{ReLU}` is used instead of :math:`\tanh`. + + Args: + input_size: The number of expected features in the input `x` + hidden_size: The number of features in the hidden state `h` + num_layers: Number of recurrent layers. E.g., setting ``num_layers=2`` + would mean stacking two RNNs together to form a `stacked RNN`, + with the second RNN taking in outputs of the first RNN and + computing the final results. Default: 1 + nonlinearity: The non-linearity to use. Can be either ``'tanh'`` or ``'relu'``. Default: ``'tanh'`` + bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. + Default: ``True`` + batch_first: If ``True``, then the input and output tensors are provided + as `(batch, seq, feature)` instead of `(seq, batch, feature)`. + Note that this does not apply to hidden or cell states. See the + Inputs/Outputs sections below for details. Default: ``False`` + dropout: If non-zero, introduces a `Dropout` layer on the outputs of each + RNN layer except the last layer, with dropout probability equal to + :attr:`dropout`. Default: 0 + bidirectional: If ``True``, becomes a bidirectional RNN. Default: ``False`` + + Inputs: input, h_0 + * **input**: tensor of shape :math:`(L, N, H_{in})` when ``batch_first=False`` or + :math:`(N, L, H_{in})` when ``batch_first=True`` containing the features of + the input sequence. The input can also be a packed variable length sequence. + See :func:`torch.nn.utils.rnn.pack_padded_sequence` or + :func:`torch.nn.utils.rnn.pack_sequence` for details. + * **h_0**: tensor of shape :math:`(D * \text{num\_layers}, N, H_{out})` containing the initial hidden + state for each element in the batch. Defaults to zeros if not provided. + + where: + + .. 
math:: + \begin{aligned} + N ={} & \text{batch size} \\ + L ={} & \text{sequence length} \\ + D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\ + H_{in} ={} & \text{input\_size} \\ + H_{out} ={} & \text{hidden\_size} + \end{aligned} + + Outputs: output, h_n + * **output**: tensor of shape :math:`(L, N, D * H_{out})` when ``batch_first=False`` or + :math:`(N, L, D * H_{out})` when ``batch_first=True`` containing the output features + `(h_t)` from the last layer of the RNN, for each `t`. If a + :class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output + will also be a packed sequence. + * **h_n**: tensor of shape :math:`(D * \text{num\_layers}, N, H_{out})` containing the final hidden state + for each element in the batch. + + + Examples: + + .. code-block:: + + import numpy as np + import megengine as mge + import megengine.module as M + + m = M.RNN(10,20,2,batch_first=False,nonlinearity="relu",bias=True,bidirectional=True) + inp = mge.tensor(np.random.randn(6, 30, 10), dtype=np.float32) + hx = mge.tensor(np.random.randn(4, 30, 20), dtype=np.float32) + out, hn = m(inp, hx) + print(out.numpy().shape) + + Outputs: + + .. code-block:: + + (6, 30, 40) + + """ -class RNN(RNNBase): def __init__(self, *args, **kwargs) -> None: self.nonlinearity = kwargs.pop("nonlinearity", "tanh") super(RNN, self).__init__(*args, **kwargs) - def create_cell(self, layer, device, dtype): + def create_cell(self, layer): if layer == 0: input_size = self.input_size else: input_size = self.num_directions * self.hidden_size - return RNNCell( - input_size, self.hidden_size, self.bias, self.nonlinearity, device, dtype - ) + return RNNCell(input_size, self.hidden_size, self.bias, self.nonlinearity) - def init_hidden(self, batch_size, device, dtype): + def init_hidden(self, batch_size): hidden_shape = ( self.num_directions * self.num_layers, batch_size, self.hidden_size, ) - return zeros(shape=hidden_shape, dtype=dtype, device=device) + return zeros(shape=hidden_shape) def get_output_from_hidden(self, hx): return hx def apply_op(self, input, hx): + fwd_mode = ( + BatchNorm.FwdMode.TRAINING if self.training else BatchNorm.FwdMode.INFERENCE + ) + op = builtin.RNN( num_layers=self.num_layers, bidirectional=self.bidirectional, bias=self.bias, hidden_size=self.hidden_size, - proj_size=self.proj_size, dropout=self.dropout, nonlineMode=self.nonlinearity, + fwd_mode=fwd_mode, ) output, h = apply(op, input, hx, self._flatten_weights)[:2] output = output + h.sum() * 0 @@ -348,30 +461,149 @@ class RNN(RNNBase): class LSTM(RNNBase): + + r"""Applies a multi-layer long short-term memory LSTM to an input + sequence. + + + For each element in the input sequence, each layer computes the following + function: + + .. math:: + \begin{array}{ll} \\ + i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{t-1} + b_{hi}) \\ + f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{t-1} + b_{hf}) \\ + g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{t-1} + b_{hg}) \\ + o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{t-1} + b_{ho}) \\ + c_t = f_t \odot c_{t-1} + i_t \odot g_t \\ + h_t = o_t \odot \tanh(c_t) \\ + \end{array} + + where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell + state at time `t`, :math:`x_t` is the input at time `t`, :math:`h_{t-1}` + is the hidden state of the layer at time `t-1` or the initial hidden + state at time `0`, and :math:`i_t`, :math:`f_t`, :math:`g_t`, + :math:`o_t` are the input, forget, cell, and output gates, respectively. 
+ :math:`\sigma` is the sigmoid function, and :math:`\odot` is the Hadamard product. + + In a multilayer LSTM, the input :math:`x^{(l)}_t` of the :math:`l` -th layer + (:math:`l >= 2`) is the hidden state :math:`h^{(l-1)}_t` of the previous layer multiplied by + dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random + variable which is :math:`0` with probability :attr:`dropout`. + + If ``proj_size > 0`` is specified, LSTM with projections will be used. This changes + the LSTM cell in the following way. First, the dimension of :math:`h_t` will be changed from + ``hidden_size`` to ``proj_size`` (dimensions of :math:`W_{hi}` will be changed accordingly). + Second, the output hidden state of each layer will be multiplied by a learnable projection + matrix: :math:`h_t = W_{hr}h_t`. Note that as a consequence of this, the output + of LSTM network will be of different shape as well. See Inputs/Outputs sections below for exact + dimensions of all variables. You can find more details in https://arxiv.org/abs/1402.1128. + + Args: + input_size: The number of expected features in the input `x` + hidden_size: The number of features in the hidden state `h` + num_layers: Number of recurrent layers. E.g., setting ``num_layers=2`` + would mean stacking two LSTMs together to form a `stacked LSTM`, + with the second LSTM taking in outputs of the first LSTM and + computing the final results. Default: 1 + bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. + Default: ``True`` + batch_first: If ``True``, then the input and output tensors are provided + as `(batch, seq, feature)` instead of `(seq, batch, feature)`. + Note that this does not apply to hidden or cell states. See the + Inputs/Outputs sections below for details. Default: ``False`` + dropout: If non-zero, introduces a `Dropout` layer on the outputs of each + LSTM layer except the last layer, with dropout probability equal to + :attr:`dropout`. Default: 0 + bidirectional: If ``True``, becomes a bidirectional LSTM. Default: ``False`` + proj_size: If ``> 0``, will use LSTM with projections of corresponding size. Default: 0 + + Inputs: input, (h_0, c_0) + * **input**: tensor of shape :math:`(L, N, H_{in})` when ``batch_first=False`` or + :math:`(N, L, H_{in})` when ``batch_first=True`` containing the features of + the input sequence. The input can also be a packed variable length sequence. + See :func:`torch.nn.utils.rnn.pack_padded_sequence` or + :func:`torch.nn.utils.rnn.pack_sequence` for details. + * **h_0**: tensor of shape :math:`(D * \text{num\_layers}, N, H_{out})` containing the + initial hidden state for each element in the batch. + Defaults to zeros if (h_0, c_0) is not provided. + * **c_0**: tensor of shape :math:`(D * \text{num\_layers}, N, H_{cell})` containing the + initial cell state for each element in the batch. + Defaults to zeros if (h_0, c_0) is not provided. + + where: + + .. math:: + \begin{aligned} + N ={} & \text{batch size} \\ + L ={} & \text{sequence length} \\ + D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\ + H_{in} ={} & \text{input\_size} \\ + H_{cell} ={} & \text{hidden\_size} \\ + H_{out} ={} & \text{proj\_size if } \text{proj\_size}>0 \text{ otherwise hidden\_size} \\ + \end{aligned} + + Outputs: output, (h_n, c_n) + * **output**: tensor of shape :math:`(L, N, D * H_{out})` when ``batch_first=False`` or + :math:`(N, L, D * H_{out})` when ``batch_first=True`` containing the output features + `(h_t)` from the last layer of the LSTM, for each `t`. 
If a + :class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output + will also be a packed sequence. + * **h_n**: tensor of shape :math:`(D * \text{num\_layers}, N, H_{out})` containing the + final hidden state for each element in the batch. + * **c_n**: tensor of shape :math:`(D * \text{num\_layers}, N, H_{cell})` containing the + final cell state for each element in the batch. + + Examples: + + .. code-block:: + + import numpy as np + import megengine as mge + import megengine.module as M + + m = M.LSTM(10, 20, 2, batch_first=False, bidirectional=True, bias=True) + inp = mge.tensor(np.random.randn(6, 30, 10), dtype=np.float32) + hx = mge.tensor(np.random.randn(4, 30, 20), dtype=np.float32) + cx = mge.tensor(np.random.randn(4, 30, 20), dtype=np.float32) + out, (hn, cn) = m(inp,(hx,cx)) + print(out.numpy().shape) + + Outputs: + + .. code-block:: + + (6, 30, 40) + + """ + def __init__(self, *args, **kwargs) -> None: super(LSTM, self).__init__(*args, **kwargs) - def create_cell(self, layer, device, dtype): + def create_cell(self, layer): if layer == 0: input_size = self.input_size else: input_size = self.num_directions * self.hidden_size - return LSTMCell(input_size, self.hidden_size, self.bias, device, dtype) + return LSTMCell(input_size, self.hidden_size, self.bias) - def init_hidden(self, batch_size, device, dtype): + def init_hidden(self, batch_size): hidden_shape = ( self.num_directions * self.num_layers, batch_size, self.hidden_size, ) - h = zeros(shape=hidden_shape, dtype=dtype, device=device) - c = zeros(shape=hidden_shape, dtype=dtype, device=device) + h = zeros(shape=hidden_shape) + c = zeros(shape=hidden_shape) return (h, c) def get_output_from_hidden(self, hx): return hx[0] def apply_op(self, input, hx): + fwd_mode = ( + BatchNorm.FwdMode.TRAINING if self.training else BatchNorm.FwdMode.INFERENCE + ) op = builtin.LSTM( num_layers=self.num_layers, bidirectional=self.bidirectional, @@ -379,6 +611,7 @@ class LSTM(RNNBase): hidden_size=self.hidden_size, proj_size=self.proj_size, dropout=self.dropout, + fwd_mode=fwd_mode, ) output, h, c = apply(op, input, hx[0], hx[1], self._flatten_weights)[:3] placeholders = [output.sum() * 0, h.sum() * 0, c.sum() * 0] diff --git a/imperative/python/test/unit/module/test_rnn.py b/imperative/python/test/unit/module/test_rnn.py index 35d2065f..eca46cd6 100644 --- a/imperative/python/test/unit/module/test_rnn.py +++ b/imperative/python/test/unit/module/test_rnn.py @@ -11,6 +11,7 @@ import pytest import megengine as mge import megengine.functional as F +from megengine.device import get_device_count from megengine.module import LSTM, RNN, LSTMCell, RNNCell @@ -20,6 +21,7 @@ def assert_tuple_equal(src, ref): assert i == j +@pytest.mark.skipif(get_device_count("gpu") > 0, reason="no algorithm on cuda") @pytest.mark.parametrize( "batch_size, input_size, hidden_size, init_hidden", [(3, 10, 20, True), (3, 10, 20, False), (1, 10, 20, False)], @@ -35,7 +37,7 @@ def test_rnn_cell(batch_size, input_size, hidden_size, init_hidden): assert_tuple_equal(h_new.shape, (batch_size, hidden_size)) -# is batch_size == 0 tolerated ? it will cause error in slice operation xx[:, ...] 
+@pytest.mark.skipif(get_device_count("gpu") > 0, reason="no algorithm on cuda") @pytest.mark.parametrize( "batch_size, input_size, hidden_size, init_hidden", [(3, 10, 20, True), (3, 10, 20, False), (1, 10, 20, False)], @@ -53,6 +55,7 @@ def test_lstm_cell(batch_size, input_size, hidden_size, init_hidden): assert_tuple_equal(c_new.shape, (batch_size, hidden_size)) +@pytest.mark.skipif(get_device_count("gpu") > 0, reason="no algorithm on cuda") @pytest.mark.parametrize( "batch_size, seq_len, input_size, hidden_size, num_layers, bidirectional, init_hidden, batch_first", [ @@ -70,7 +73,6 @@ def test_lstm_cell(batch_size, input_size, hidden_size, init_hidden): ), ], ) -# (0, 1, 1, 1, 1, False, True, False)]) def test_rnn( batch_size, seq_len, @@ -113,6 +115,7 @@ def test_rnn( ) +@pytest.mark.skipif(get_device_count("gpu") > 0, reason="no algorithm on cuda") @pytest.mark.parametrize( "batch_size, seq_len, input_size, hidden_size, num_layers, bidirectional, init_hidden, batch_first", [ @@ -130,7 +133,6 @@ def test_rnn( ), ], ) -# (0, 1, 1, 1, 1, False, True, False)]) def test_lstm( batch_size, seq_len, @@ -175,7 +177,3 @@ def test_lstm( assert_tuple_equal( h_n[1].shape, (num_directions * num_layers, batch_size, hidden_size) ) - - -if __name__ == "__main__": - test_lstm(5, 10, 10, 20, 1, False, False, True) diff --git a/src/opr/impl/dnn/rnn.cpp b/src/opr/impl/dnn/rnn.cpp index 0099cc3a..fa42fe52 100644 --- a/src/opr/impl/dnn/rnn.cpp +++ b/src/opr/impl/dnn/rnn.cpp @@ -123,7 +123,7 @@ MGB_IMPL_OPR_GRAD(LSTMCell) { SymbolVar input(opr.input(0)), weight_ih(opr.input(1)), hx(opr.input(3)), weight_hh(opr.input(4)), cx(opr.input(6)); SymbolVar h_out(opr.output(0)), c_out(opr.output(1)), gates(opr.output(2)), - h_og{out_grad.at(0)}, c_og{out_grad.at(1)}, tmp; + h_og{out_grad.at(0)}, c_og{out_grad.at(1)}; size_t ghs = gates.shape()[1] / 4; // gate_hidden_size SymbolVarArray gates_array = Split::make( gates, Split::Options::make_partition(gates, 1, {ghs, ghs, ghs, ghs})); @@ -141,7 +141,7 @@ MGB_IMPL_OPR_GRAD(LSTMCell) { f_grad = Elemwise::make({f, c_og * cx}, Mode::SIGMOID_GRAD); i_grad = Elemwise::make({i, c_og * g}, Mode::SIGMOID_GRAD); g_grad = Elemwise::make({g, c_og * i}, Mode::TANH_GRAD); - SymbolVar rnn_cell_grad = Concat::make({i_grad, f_grad, o_grad, g_grad}, {-1}); + SymbolVar rnn_cell_grad = Concat::make({i_grad, f_grad, o_grad, g_grad}, -1); SymbolVar result; if (wrt_idx < 6) { @@ -258,7 +258,6 @@ MGB_IMPL_OPR_GRAD(LSTM) { SymbolVarArray grads = LSTMBackward::make( opr.input(0), opr.output(0), opr.input(1), opr.input(2), out_grad.at(0), out_grad.at(1), out_grad.at(2), opr.input(3), opr.output(3), opr.param()); - SymbolVar res; return grads.at(wrt_idx).node(); // input, hx, cx, weights } #endif diff --git a/src/opr/include/megbrain/opr/dnn/rnn.h b/src/opr/include/megbrain/opr/dnn/rnn.h index ab15359d..38c0e34c 100644 --- a/src/opr/include/megbrain/opr/dnn/rnn.h +++ b/src/opr/include/megbrain/opr/dnn/rnn.h @@ -25,11 +25,11 @@ MGB_DEFINE_OPR_CLASS( public: using NonlineMode = Param::NonlineMode; - RNNCellForward( + MGE_WIN_DECLSPEC_FUC RNNCellForward( VarNode* input, VarNode* weight_ih, VarNode* bias_ih, VarNode* hx, VarNode* weight_hh, VarNode* bias_hh, const Param& param, const OperatorNodeConfig& config); - static SymbolVar make( + MGE_WIN_DECLSPEC_FUC static SymbolVar make( SymbolVar input, SymbolVar weight_ih, SymbolVar bias_ih, SymbolVar hx, SymbolVar weight_hh, SymbolVar bias_hh, const Param& param = {}, const OperatorNodeConfig& config = {}); @@ -39,11 +39,11 @@ using RNNCell = 
RNNCellForward; MGB_DEFINE_OPR_CLASS( LSTMCellForward, intl::MegDNNOprWrapperFwd) // { public: - LSTMCellForward( + MGE_WIN_DECLSPEC_FUC LSTMCellForward( VarNode* input, VarNode* weight_ih, VarNode* bias_ih, VarNode* hx, VarNode* weight_hh, VarNode* bias_hh, VarNode* cx, const Param& param, const OperatorNodeConfig& config); - static SymbolVar make( + MGE_WIN_DECLSPEC_FUC static SymbolVar make( SymbolVar input, SymbolVar weight_ih, SymbolVar bias_ih, SymbolVar hx, SymbolVar weight_hh, SymbolVar bias_hh, SymbolVar cx, const Param& param = {}, const OperatorNodeConfig& config = {}); @@ -51,17 +51,11 @@ public: using LSTMCell = LSTMCellForward; MGB_DEFINE_OPR_CLASS(RNNForward, intl::MegDNNOprWrapperFwd) // { - /*private: - SymbolVarArray weight_ih_arr; // 1d, idx: direction * num_layers + layer - SymbolVarArray weight_hh_arr; - SymbolVarArray bias_arr; - */ - public: - RNNForward( + MGE_WIN_DECLSPEC_FUC RNNForward( VarNode* input, VarNode* hx, VarNode* flatten_weights, const Param& param, const OperatorNodeConfig& config); - static SymbolVar make( + MGE_WIN_DECLSPEC_FUC static SymbolVar make( SymbolVar input, SymbolVar hx, SymbolVar flatten_weights, const Param& param = {}, const OperatorNodeConfig& config = {}); }; @@ -70,11 +64,11 @@ using RNN = RNNForward; MGB_DEFINE_OPR_CLASS( RNNBackward, intl::MegDNNOprWrapperBwd) // { public: - RNNBackward( + MGE_WIN_DECLSPEC_FUC RNNBackward( VarNode* x, VarNode* y, VarNode* hx, VarNode* dy, VarNode* dhy, VarNode* flatten_weights, VarNode* reserve_space, const Param& param, const OperatorNodeConfig& config); - static SymbolVarArray make( + MGE_WIN_DECLSPEC_FUC static SymbolVarArray make( SymbolVar x, SymbolVar y, SymbolVar hx, SymbolVar dy, SymbolVar dhy, SymbolVar flatten_weights, SymbolVar reserve_space, const Param& param = {}, const OperatorNodeConfig& config = {}); @@ -88,10 +82,10 @@ private: MGB_DEFINE_OPR_CLASS( LSTMForward, intl::MegDNNOprWrapperFwd) // { public: - LSTMForward( + MGE_WIN_DECLSPEC_FUC LSTMForward( VarNode* input, VarNode* hx, VarNode* cx, VarNode* flatten_weights, const Param& param, const OperatorNodeConfig& config); - static SymbolVar make( + MGE_WIN_DECLSPEC_FUC static SymbolVar make( SymbolVar input, SymbolVar hx, SymbolVar cx, SymbolVar flatten_weights, const Param& param = {}, const OperatorNodeConfig& config = {}); }; @@ -100,11 +94,11 @@ using LSTM = LSTMForward; MGB_DEFINE_OPR_CLASS( LSTMBackward, intl::MegDNNOprWrapperBwd) // { public: - LSTMBackward( + MGE_WIN_DECLSPEC_FUC LSTMBackward( VarNode* x, VarNode* y, VarNode* hx, VarNode* cx, VarNode* dy, VarNode* dhy, VarNode* dcy, VarNode* flatten_weights, VarNode* reserve_space, const Param& param, const OperatorNodeConfig& config); - static SymbolVarArray make( + MGE_WIN_DECLSPEC_FUC static SymbolVarArray make( SymbolVar x, SymbolVar y, SymbolVar hx, SymbolVar cx, SymbolVar dy, SymbolVar dhy, SymbolVar dcy, SymbolVar flatten_weights, SymbolVar reserve_space, const Param& param = {},
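
The flattened weight buffer built by ``RNNBase._flatten_parameters`` (in ``imperative/python/megengine/module/rnn.py``, earlier in this patch) is sized from the cells themselves: one column block per ``weight_ih`` and ``weight_hh`` of every cell, plus one column per bias vector when ``bias=True``. The following is a minimal stand-alone sketch of that sizing rule, not part of the patch. The helper name ``flattened_weight_shape`` is hypothetical; it assumes each cell's ``weight_ih`` has shape ``(gate_hidden_size, layer_input_size)`` and ``weight_hh`` has shape ``(gate_hidden_size, hidden_size)``, with ``num_chunks`` equal to 4 for LSTM cells (as set in ``LSTMCell.__init__``) and assumed to be 1 for plain RNN cells.

.. code-block::

    # Hypothetical helper mirroring the size_dim1 accumulation in
    # RNNBase._flatten_parameters; illustrative only, not part of the patch.
    def flattened_weight_shape(input_size, hidden_size, num_layers,
                               bidirectional, bias, num_chunks):
        num_directions = 2 if bidirectional else 1
        gate_hidden_size = num_chunks * hidden_size      # rows of _flatten_weights
        size_dim1 = 0                                    # columns of _flatten_weights
        for layer in range(num_layers):
            # layer 0 sees the raw input; deeper layers see D * hidden_size features
            layer_input = input_size if layer == 0 else num_directions * hidden_size
            for _ in range(num_directions):
                size_dim1 += layer_input                 # columns of weight_ih
                size_dim1 += hidden_size                 # columns of weight_hh
        if bias:
            # one column each for bias_ih and bias_hh of every cell
            size_dim1 += 2 * num_directions * num_layers
        return (gate_hidden_size, size_dim1)

    # e.g. the LSTM docstring example above: M.LSTM(10, 20, 2, bidirectional=True, bias=True)
    print(flattened_weight_shape(10, 20, 2, bidirectional=True, bias=True, num_chunks=4))
    # -> (80, 188) under the assumptions stated above

Under these assumptions the buffer for the bidirectional two-layer LSTM example works out to ``gate_hidden_size = 80`` rows and ``188`` columns, which is the shape of the ``_flatten_weights`` parameter passed to ``builtin.LSTM`` in ``apply_op``.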