/**
 * \file dnn/src/naive/rnn/funcs.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
// #ifndef _RNN_H
// #define _RNN_H
#include <algorithm>
#include <cstring>

#include "megdnn/oprs.h"

namespace megdnn {
namespace naive {
namespace rnn {

template <class CellOpr>
void cell_opr_exec(
        _megdnn_tensor_in input, _megdnn_tensor_in weight_ih,
        _megdnn_tensor_in weight_hh, _megdnn_tensor_in bias_ih,
        _megdnn_tensor_in bias_hh, const TensorNDArray& states,
        TensorNDArray& states_new, _megdnn_workspace workspace,
        param::RNNCell::NonlineMode nonline_mode, Handle* handle);

template <class CellOpr>
size_t cell_opr_get_workspace_in_bytes(
        const TensorLayout& input, const TensorLayout& weight_ih,
        const TensorLayout& weight_hh, const TensorLayout& bias_ih,
        const TensorLayout& bias_hh, const TensorLayout& hx, Handle* handle);

template <class CellOpr>
size_t get_workspace_in_bytes(
        const TensorLayout& input, const TensorLayout& flatten_weights,
        size_t hidden_size,
        size_t D,  // num_directions
        Handle* handle) {
    size_t seq_len = input.shape[0];
    size_t batch_size = input.shape[1];
    size_t input_size = input.shape[2];
    size_t gate_hidden_size = flatten_weights.shape[0];
    // concat workspace
    TensorLayout direction_output_layout{
            TensorShape{seq_len, batch_size, hidden_size}, input.dtype};
    TensorLayout output_layout{{seq_len, batch_size, D * hidden_size}, input.dtype};
    TensorLayoutArray layer_layouts;
    for (size_t i = 0; i < D; ++i)
        layer_layouts.push_back(direction_output_layout);
    auto concat_opr = handle->create_operator<ConcatForward>();
    concat_opr->param().axis = -1;
    size_t concat_workspace =
            concat_opr->get_workspace_in_bytes(layer_layouts, output_layout);
    // cell workspace
    TensorLayout weight_ih{{gate_hidden_size, input_size}, flatten_weights.dtype};
    TensorLayout D_weight_ih{
            {gate_hidden_size, D * hidden_size}, flatten_weights.dtype};
    TensorLayout weight_hh{{gate_hidden_size, hidden_size}, flatten_weights.dtype};
    TensorLayout bias{{gate_hidden_size}, flatten_weights.dtype};
    TensorLayout hx{{batch_size, hidden_size}, input.dtype};
    TensorLayout cell_input = {{input.shape[1], input.shape[2]}, input.dtype};
    TensorLayout D_cell_input = {{input.shape[1], D * hidden_size}, input.dtype};
    size_t cell_workspace = cell_opr_get_workspace_in_bytes<CellOpr>(
            cell_input, weight_ih, weight_hh, bias, bias, hx, handle);
    size_t D_cell_workspace = cell_opr_get_workspace_in_bytes<CellOpr>(
            D_cell_input, D_weight_ih, weight_hh, bias, bias, hx, handle);
    return std::max(std::max(cell_workspace, D_cell_workspace), concat_workspace);
}

template <class Cell, class CellOpr>
void exec_internal(
        std::vector<Cell>& cells, _megdnn_tensor_in input, const TensorNDArray& states,
        TensorNDArray& states_new, _megdnn_tensor_out output,
        _megdnn_tensor_out reserve_space, size_t num_layers, size_t D,
        param::RNNCell::NonlineMode nonline_mode, Handle* handle,
        _megdnn_workspace workspace) {
    size_t seq_len = input.layout.shape[0];
    size_t batch_size = input.layout.shape[1];
    size_t input_size = input.layout.shape[2];
    size_t hidden_size = cells[0].weight_hh.layout.shape[1];
    TensorLayout cell_output_layout{
            TensorShape{batch_size, hidden_size}, states[0].layout.dtype};
    TensorLayout cell_first_input_layout{
            TensorShape{batch_size, input_size}, input.layout.dtype};
    TensorLayout cell_input_layout{
            TensorShape{batch_size, D * hidden_size}, input.layout.dtype};
    TensorLayout direction_output_layout{
            TensorShape{seq_len, batch_size, hidden_size}, output.layout.dtype};
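    // The front of `workspace` backs `tmp_output`, a [seq_len, batch_size,
    // D * hidden_size] buffer that layers >= 1 write their step outputs into before
    // they are copied back to `output`; the remaining bytes are handed to the cell
    // operator as `new_workspace`.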
    TensorND tmp_output{workspace.raw_ptr, output.layout};
    _megdnn_workspace new_workspace{
            workspace.raw_ptr + tmp_output.layout.span().dist_byte(),
            workspace.size - tmp_output.layout.span().dist_byte()};
    auto cell_opr = handle->create_operator<CellOpr>();
    // TypeCvt with identical src/dst dtypes acts as a plain device copy here
    auto copy_opr = handle->create_operator<TypeCvtForward>();
    // copy states to states_new
    for (size_t i = 0; i < states.size(); ++i)
        copy_opr->exec(states[i], states_new[i]);
    void* reserve_ptr = reserve_space.raw_ptr();

    // first layer (layer 0): step inputs are read directly from the input tensor
    for (size_t d = 0; d < D; ++d) {
        size_t cell_idx = d;
        auto& cell = cells[cell_idx];
        TensorNDArray cur_states;
        size_t states_offset = cell_idx * cell_output_layout.span().dist_byte();
        for (size_t i = 0; i < states.size(); ++i) {
            cur_states.push_back(TensorND{
                    static_cast<uint8_t*>(states_new[i].raw_ptr()) + states_offset,
                    cell_output_layout});
        }
        for (size_t i = 0; i < seq_len; ++i) {
            // the reverse direction walks the sequence back to front
            size_t step = d == 0 ? i : seq_len - 1 - i;
            TensorND step_input{
                    static_cast<uint8_t*>(input.raw_ptr()) +
                            step * cell_first_input_layout.span().dist_byte(),
                    cell_first_input_layout};
            TensorND step_output{
                    static_cast<uint8_t*>(output.raw_ptr()) +
                            (step * D) * cell_output_layout.span().dist_byte() +
                            d * cell_output_layout.span().dist_byte() / batch_size,
                    cell_output_layout};
            // the new states of every step live in reserve_space for backward
            TensorNDArray tmp_states;
            for (size_t s = 0; s < cur_states.size(); ++s) {
                tmp_states.push_back(TensorND{reserve_ptr, cur_states[s].layout});
                size_t size_in_bytes = cur_states[s].layout.span().dist_byte();
                reserve_ptr = static_cast<uint8_t*>(reserve_ptr) + size_in_bytes;
            }
            cell_opr_exec<CellOpr>(
                    step_input, cell.weight_ih, cell.weight_hh, cell.bias_ih,
                    cell.bias_hh, cur_states, tmp_states, new_workspace, nonline_mode,
                    handle);
            // copy states to cur_states
            for (size_t s = 0; s < tmp_states.size(); ++s) {
                copy_opr->exec(tmp_states[s], cur_states[s]);
            }
            TensorLayout half_output_layout{
                    TensorShape{hidden_size}, states[0].layout.dtype};
            if (D == 2) {
                // interleave the two directions per sample: [batch, 2 * hidden_size]
                for (size_t i = 0; i < batch_size; i++) {
                    TensorND half_cur_states{  // output
                            static_cast<uint8_t*>(cur_states[0].raw_ptr()) +
                                    i * half_output_layout.span().dist_byte(),
                            half_output_layout};
                    TensorND half_step_output{
                            static_cast<uint8_t*>(step_output.raw_ptr()) +
                                    i * half_output_layout.span().dist_byte() * 2,
                            half_output_layout};
                    copy_opr->exec(half_cur_states, half_step_output);
                }
            } else
                copy_opr->exec(cur_states[0], step_output);
        }
    }
    // layers >= 1: step inputs are the previous layer's D-direction output
    for (size_t layer = 1; layer < num_layers; ++layer) {
        for (size_t d = 0; d < D; ++d) {
            size_t cell_idx = layer * D + d;
            auto& cell = cells[cell_idx];
            TensorNDArray cur_states;
            size_t states_offset = cell_idx * cell_output_layout.span().dist_byte();
            for (size_t i = 0; i < states.size(); ++i) {
                cur_states.push_back(TensorND{
                        static_cast<uint8_t*>(states_new[i].raw_ptr()) + states_offset,
                        cell_output_layout});
            }
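            // Each step of a layer >= 1 consumes the full [batch_size, D * hidden_size]
            // slice produced by the previous layer and stages its own result in
            // tmp_output, so `output` is not overwritten while it is still being read
            // as this layer's input.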
            for (size_t i = 0; i < seq_len; ++i) {
                size_t step = d == 0 ? i : seq_len - 1 - i;
                TensorND step_input{
                        static_cast<uint8_t*>(output.raw_ptr()) +
                                step * cell_input_layout.span().dist_byte(),
                        cell_input_layout};
                TensorND step_output{
                        static_cast<uint8_t*>(tmp_output.raw_ptr()) +
                                (step * D) * cell_output_layout.span().dist_byte() +
                                d * cell_output_layout.span().dist_byte() / batch_size,
                        cell_output_layout};
                TensorNDArray tmp_states;
                for (size_t s = 0; s < cur_states.size(); ++s) {
                    tmp_states.push_back(TensorND{reserve_ptr, cur_states[s].layout});
                    size_t size_in_bytes = cur_states[s].layout.span().dist_byte();
                    reserve_ptr = static_cast<uint8_t*>(reserve_ptr) + size_in_bytes;
                }
                cell_opr_exec<CellOpr>(
                        step_input, cell.weight_ih, cell.weight_hh, cell.bias_ih,
                        cell.bias_hh, cur_states, tmp_states, new_workspace,
                        nonline_mode, handle);
                // copy states to cur_states
                for (size_t s = 0; s < tmp_states.size(); ++s) {
                    copy_opr->exec(tmp_states[s], cur_states[s]);
                }
                TensorLayout half_output_layout{
                        TensorShape{hidden_size}, states[0].layout.dtype};
                if (D == 2) {
                    for (size_t i = 0; i < batch_size; i++) {
                        TensorND half_cur_states{  // output
                                static_cast<uint8_t*>(cur_states[0].raw_ptr()) +
                                        i * half_output_layout.span().dist_byte(),
                                half_output_layout};
                        TensorND half_step_output{
                                static_cast<uint8_t*>(step_output.raw_ptr()) +
                                        i * half_output_layout.span().dist_byte() * 2,
                                half_output_layout};
                        copy_opr->exec(half_cur_states, half_step_output);
                    }
                } else
                    copy_opr->exec(cur_states[0], step_output);
            }
        }
        copy_opr->exec(tmp_output, output);
    }
}

template <class Cell>
size_t get_cells(
        size_t D, size_t num_layers, size_t input_size, size_t hidden_size, bool bias,
        std::vector<Cell>& cells, _megdnn_tensor_in flatten_weights,
        _megdnn_workspace workspace) {
    cells.reserve(D * num_layers);
    void* weight_ptr = flatten_weights.raw_ptr();
    for (size_t layer = 0; layer < num_layers; ++layer) {
        for (size_t d = 0; d < D; ++d) {
            // only the first layer sees the raw input size; deeper layers see
            // D * hidden_size
            size_t cell_input_size = D * hidden_size;
            if (layer == 0)
                cell_input_size = input_size;
            Cell cell(
                    weight_ptr, hidden_size, cell_input_size, bias,
                    flatten_weights.layout.dtype, workspace);
            weight_ptr =
                    static_cast<uint8_t*>(weight_ptr) + cell.weight_size_in_bytes();
            cells.push_back(cell);
        }
    }
    return cells[0].workspace_size_in_bytes();
}

template <class Cell>
size_t get_inputs_for_exec(
        _megdnn_tensor_in x, _megdnn_tensor_in y,
        const std::vector<TensorND> unfold_hx, _megdnn_tensor_in reserve_space,
        size_t num_layers, size_t D, size_t hidden_size,
        const std::vector<Cell>& cells, TensorNDArray& layer_inputs,
        TensorNDArray& layer_outputs,
        std::vector<std::vector<TensorNDArray>>& cell_seq_states,
        param::RNNCell::NonlineMode /*nonlineMode*/, _megdnn_workspace workspace) {
    // returns the used workspace size in bytes
    layer_inputs.push_back(x);
    size_t seq_len = x.layout.shape[0];
    size_t batch_size = x.layout.shape[1];
    size_t num_states = cells[0].num_states();
    TensorLayout cell_output_layout{{batch_size, hidden_size}, y.layout.dtype};
    TensorLayout direction_output_layout{
            {seq_len, batch_size, hidden_size}, y.layout.dtype};
    void* workspace_ptr = workspace.raw_ptr;
    // extract intermediate states from reserve space
    for (size_t layer = 0; layer < num_layers; ++layer) {
        TensorND layer_output{workspace_ptr, y.layout};
        workspace_ptr = static_cast<uint8_t*>(workspace_ptr) +
                        layer_output.layout.span().dist_byte();
        for (size_t d = 0; d < D; ++d) {
            cell_seq_states.push_back(std::vector<TensorNDArray>());
            // the initial state of cell (layer, d) comes from the unfolded hx,
            // indexed with the same layer * D + d convention used everywhere else
            cell_seq_states[cell_seq_states.size() - 1].push_back(
                    {unfold_hx[layer * D + d]});
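            // reserve_space holds, for every cell (layer, d) and every step, its
            // num_states tensors of shape [batch_size, hidden_size] back to back, so
            // the states of a given step can be recovered by the offset arithmetic
            // below.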
            // the reverse direction is stored in reversed sequence order
            for (size_t i = 0; i < seq_len; ++i) {
                size_t step = i;
                if (d == 1)
                    step = seq_len - i - 1;
                size_t offset = ((layer * D + d) * seq_len + step) *
                                cell_output_layout.span().dist_byte() * num_states;
                TensorNDArray cur_states;
                for (size_t s = 0; s < num_states; ++s) {
                    TensorND h{
                            static_cast<uint8_t*>(reserve_space.raw_ptr()) + offset +
                                    s * cell_output_layout.span().dist_byte(),
                            cell_output_layout};
                    cur_states.push_back(h);
                }
                TensorND hy{
                        static_cast<uint8_t*>(reserve_space.raw_ptr()) + offset,
                        cell_output_layout};
                // states
                cell_seq_states[cell_seq_states.size() - 1].push_back(cur_states);
                // output
                offset = i * D * cell_output_layout.span().dist_byte();
                memcpy(static_cast<uint8_t*>(layer_output.raw_ptr()) + offset,
                       hy.raw_ptr(), hy.layout.span().dist_byte());
            }
        }
        layer_outputs.push_back(layer_output);
        if (layer != num_layers - 1)
            layer_inputs.push_back(layer_output);
    }
    return static_cast<uint8_t*>(workspace_ptr) -
           static_cast<uint8_t*>((void*)workspace.raw_ptr);
}

template <class Cell>
void backward_exec_internal(
        std::vector<Cell>& cells, size_t D, size_t num_layers, size_t input_size,
        bool bias, param::RNNCell::NonlineMode nonlineMode,
        const TensorNDArray& layer_inputs, const TensorNDArray& layer_outputs,
        const std::vector<std::vector<TensorNDArray>>& cell_seq_states,
        _megdnn_tensor_in dy, const TensorNDArray& dhy, _megdnn_tensor_out dx,
        TensorNDArray& dstates, _megdnn_tensor_out dw, Handle* handle,
        _megdnn_workspace workspace) {
    /*
        layer_inputs: array of the input of each layer,
            element 0: [seq_len, batch_size, input_size],
            other elements: [seq_len, batch_size, D * hidden_size]
        layer_outputs: array of the outputs of each layer. To access the outputs of
            the cell at (layer, d), use layer_outputs[layer]. The shape is
            [seq_len, batch_size, output_size (D * hidden_size)] (in sequence order).
        cell_seq_states: array of the states of each cell at each step. To access the
            states of the cell at (layer, d) at sequence step `step`, use
            cell_seq_states[layer * D + d][step].
    */
    size_t seq_len = layer_inputs[0].layout.shape[0];
    size_t batch_size = layer_inputs[0].layout.shape[1];
    DType dtype = layer_inputs[0].layout.dtype;
    size_t cell_y_size = layer_outputs[0].layout.shape[2] / D;
    size_t hidden_size = cell_y_size;
    TensorLayout cell_y_layout = {{batch_size, cell_y_size}, dtype};
    void* workspace_ptr = workspace.raw_ptr;
    TensorND layer_output_grad{
            workspace_ptr, {{seq_len, batch_size, D * hidden_size}, dtype}};
    workspace_ptr = static_cast<uint8_t*>(workspace_ptr) +
                    layer_output_grad.layout.span().dist_byte();
    memcpy(layer_output_grad.raw_ptr(), dy.raw_ptr(), dy.layout.span().dist_byte());
    TensorNDArray direction_dx_arr;
    for (size_t i = 0; i < D; ++i) {
        TensorLayout direction_dx_layout{{seq_len, batch_size, hidden_size}, dtype};
        direction_dx_arr.push_back(TensorND(workspace_ptr, direction_dx_layout));
        workspace_ptr = static_cast<uint8_t*>(workspace_ptr) +
                        direction_dx_layout.span().dist_byte();
    }
    TensorNDArray L0_direction_dx_arr;
    for (size_t i = 0; i < D; ++i) {
        TensorLayout direction_dx_layout{{seq_len, batch_size, input_size}, dtype};
        L0_direction_dx_arr.push_back(TensorND(workspace_ptr, direction_dx_layout));
        workspace_ptr = static_cast<uint8_t*>(workspace_ptr) +
                        direction_dx_layout.span().dist_byte();
    }
    std::vector<TensorNDArray> dstates_arr;
    for (size_t layer = 0; layer < num_layers; ++layer) {
        for (size_t d = 0; d < D; ++d) {
            TensorNDArray cell_states;
            cell_states.reserve(dstates.size());
            for (size_t i = 0; i < dstates.size(); ++i) {
                size_t offset = (layer * D + d) * cell_y_layout.span().dist_byte();
                TensorND dhx_cell{
                        static_cast<uint8_t*>(dstates[i].raw_ptr()) + offset,
                        cell_y_layout};
                memcpy(dhx_cell.raw_ptr(),
                       static_cast<uint8_t*>(dhy[i].raw_ptr()) + offset,
                       cell_y_layout.span().dist_byte());
                cell_states.emplace_back(dhx_cell);
            }
            dstates_arr.push_back(cell_states);
        }
    }
    memset(dw.raw_ptr(), 0, dw.layout.span().dist_byte());
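    // dw is zero-initialized and then wrapped by get_cells() below, so every entry of
    // cell_grads is a view into dw; the per-step gradients computed in the time loop
    // are accumulated into these views with elementwise ADD.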
    std::vector<Cell> cell_grads;
    size_t used_workspace_size = static_cast<uint8_t*>(workspace_ptr) -
                                 static_cast<uint8_t*>((void*)(workspace.raw_ptr));
    workspace_ptr =
            static_cast<uint8_t*>(workspace_ptr) +
            get_cells(
                    D, num_layers, input_size, hidden_size, bias, cell_grads, dw,
                    Workspace(
                            workspace.raw_ptr + used_workspace_size,
                            workspace.size - used_workspace_size));
    auto add_opr = handle->create_operator<ElemwiseForward>();
    add_opr->param().mode = Elemwise::Mode::ADD;
    auto copy_opr = handle->create_operator<TypeCvtForward>();
    // initialize dx to zero
    memset(dx.raw_ptr(), 0, dx.layout.span().dist_byte());
    // calculate grads
    for (int layer = (int)num_layers - 1; layer >= 0; --layer) {
        for (int d = (int)D - 1; d >= 0; --d) {
            Cell& cell = cells[layer * D + d];
            Cell& cell_grad = cell_grads[layer * D + d];
            size_t input_size = layer_inputs[layer].layout.shape[2];
            const TensorND& x_arr = layer_inputs[layer];
            const TensorND& y_arr = layer_outputs[layer];
            TensorLayout x_layout = {{batch_size, input_size}, dtype};
            // tmp tensors
            void* tmp_workspace_ptr = workspace_ptr;
            TensorND dwi_tmp{tmp_workspace_ptr, cell_grad.weight_ih.layout};
            tmp_workspace_ptr = static_cast<uint8_t*>(tmp_workspace_ptr) +
                                dwi_tmp.layout.span().dist_byte();
            TensorND dwh_tmp{tmp_workspace_ptr, cell_grad.weight_hh.layout};
            tmp_workspace_ptr = static_cast<uint8_t*>(tmp_workspace_ptr) +
                                dwh_tmp.layout.span().dist_byte();
            TensorND dbias_tmp{tmp_workspace_ptr, cell_grad.bias_ih.layout};
            tmp_workspace_ptr = static_cast<uint8_t*>(tmp_workspace_ptr) +
                                dbias_tmp.layout.span().dist_byte();
            size_t used_workspace_size =
                    static_cast<uint8_t*>(tmp_workspace_ptr) -
                    static_cast<uint8_t*>((void*)(workspace.raw_ptr));
            for (size_t i = 0; i < seq_len; ++i) {
                // backward walks the sequence opposite to the forward direction
                size_t step = i;
                if (d == 0)
                    step = seq_len - i - 1;
                TensorND x{static_cast<uint8_t*>(x_arr.raw_ptr()) +
                                   step * x_layout.span().dist_byte(),
                           x_layout},
                        y{static_cast<uint8_t*>(y_arr.raw_ptr()) +
                                  (step * D + d) * cell_y_layout.span().dist_byte(),
                          cell_y_layout};
                const TensorNDArray& cell_states =
                        cell_seq_states[layer * D + d][step];
                TensorNDArray& dstates_new = dstates_arr[layer * D + d];
                TensorND dy_t{
                        static_cast<uint8_t*>(layer_output_grad.raw_ptr()) +
                                (step * D + d) * cell_y_layout.span().dist_byte(),
                        cell_y_layout};
                // dy_t accumulates the gradient from the upper layer and the gradient
                // carried back through the hidden state
                add_opr->exec({dstates_new[0], dy_t}, dy_t);
                TensorND dx_t;
                if (layer == 0)
                    dx_t = {static_cast<uint8_t*>(L0_direction_dx_arr[d].raw_ptr()) +
                                    step * x_layout.span().dist_byte(),
                            x_layout};
                else
                    dx_t = {static_cast<uint8_t*>(direction_dx_arr[d].raw_ptr()) +
                                    step * x_layout.span().dist_byte(),
                            x_layout};
                TensorNDArray douts = {dy_t};
                for (size_t s = 1; s < dstates_new.size(); ++s)
                    douts.push_back(dstates_new[s]);
                cell.backward(
                        handle, nonlineMode, x, cell_states, y, douts, dx_t,
                        dstates_new, dwi_tmp, dwh_tmp, dbias_tmp,
                        Workspace(
                                workspace.raw_ptr + used_workspace_size,
                                workspace.size - used_workspace_size));
                // add step gradient to overall gradient
                add_opr->exec({dwi_tmp, cell_grad.weight_ih}, cell_grad.weight_ih);
                add_opr->exec({dwh_tmp, cell_grad.weight_hh}, cell_grad.weight_hh);
                add_opr->exec({dbias_tmp, cell_grad.bias_ih}, cell_grad.bias_ih);
                add_opr->exec({dbias_tmp, cell_grad.bias_hh}, cell_grad.bias_hh);
            }
        }
        // add gradient of different directions to layer_output_grad.
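        // Layer 0 sums the per-direction input gradients straight into dx; deeper
        // layers write them back into layer_output_grad, which becomes dy for the
        // layer below on the next iteration of the outer loop.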
        if (layer == 0) {
            for (size_t i = 0; i < D; ++i)
                add_opr->exec({L0_direction_dx_arr[i], dx}, dx);
        } else {
            if (D == 1)
                copy_opr->exec(direction_dx_arr[0], layer_output_grad);
            else {
                for (size_t t = 0; t < seq_len; ++t) {
                    size_t offset = t * D * cell_y_layout.span().dist_byte();
                    for (size_t d = 0; d < D; ++d) {
                        TensorND src{
                                static_cast<uint8_t*>(
                                        direction_dx_arr[d].raw_ptr()) +
                                        offset,
                                cell_y_layout};
                        TensorND dst{
                                static_cast<uint8_t*>(
                                        layer_output_grad.raw_ptr()) +
                                        offset + d * cell_y_layout.span().dist_byte(),
                                cell_y_layout};
                        copy_opr->exec(src, dst);
                    }
                }
            }
        }
    }
}

}  // namespace rnn
}  // namespace naive
}  // namespace megdnn

// #include "funcs.tpp"
// #endif