Browse Source

refactor(dnn): remove unused costmodel in cuda

GitOrigin-RevId: b15f0607b9
release-1.1
Megvii Engine Team 4 years ago
parent
commit
18ec5341f2
4 changed files with 0 additions and 1076 deletions
  1. +0
    -235
      dnn/src/cuda/convolution/cudnn_heuristic.cpp
  2. +0
    -86
      dnn/src/cuda/convolution/cudnn_heuristic.h
  3. +0
    -754
      dnn/src/cuda/convolution/get_params.cpp
  4. +0
    -1
      dnn/src/x86/conv_bias/f32/winograd_algo.cpp

+ 0
- 235
dnn/src/cuda/convolution/cudnn_heuristic.cpp View File

@@ -1,235 +0,0 @@
/**
* \file dnn/src/cuda/convolution/cudnn_heuristic.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "./cudnn_heuristic.h"
#include "megdnn.h"
#include "src/cuda/utils.h"

using namespace megdnn;
using namespace cuda;
using namespace convolution;

/**
 * Decide whether the performance model may be used for this convolution.
 *
 * The configuration is accepted only when the input dtype is Float32 and the
 * filter is NCHW, not flipped, has two spatial dimensions, and uses unit
 * stride and unit dilation along both spatial axes.
 *
 * \param x_layout layout whose dtype is inspected
 * \param filter canonized filter description
 * \return true when every condition above holds
 */
bool convolution::PerformanceModelBase::args_is_proper(
        const TensorLayout* x_layout,
        const ConvolutionBase<param::Convolution>::CanonizedFilterMeta& filter) {
    // Checks are performed in a fixed order; the first failing one rejects.
    if (!(x_layout->dtype == dtype::Float32())) {
        return false;
    }
    if (!(filter.format == param::Convolution::Format::NCHW)) {
        return false;
    }
    if (!(filter.should_flip == 0)) {
        return false;
    }
    if (!(filter.stride[0] == 1) || !(filter.stride[1] == 1)) {
        return false;
    }
    if (!(filter.spatial_ndim == 2)) {
        return false;
    }
    if (!(filter.dilation[0] == 1) || !(filter.dilation[1] == 1)) {
        return false;
    }
    return true;
}

/**
 * Run the performance model for one convolution configuration.
 *
 * \param x_layout layout providing the four leading shape entries fed to the
 *        model
 * \param filter canonized filter description (ocpg, spatial size, padding)
 * \param conv_type which parameter set to request
 * \param[out] mask_p receives the mask buffer supplied by
 *        heuristic_params_available()
 * \param[out] time_pred_p receives the buffer holding the predicted times
 * \param[out] output_dim_p receives the width of the last network layer
 * \return false when the arguments are not supported or no parameters are
 *         available for the current device; true otherwise
 */
bool convolution::PerformanceModelBase::predict_time_success(
        const TensorLayout* x_layout,
        const ConvolutionBase<param::Convolution>::CanonizedFilterMeta& filter,
        const ConvolutionType& conv_type, float** mask_p, float** time_pred_p,
        size_t* output_dim_p) {
    if (!args_is_proper(x_layout, filter)) {
        return false;
    }

    // Network description filled in by heuristic_params_available().
    size_t num_layers;
    const size_t* dims;
    const float* weights;
    const float* offsets;
    const float* scale_alpha;
    const float* scale_beta;
    float* scratch;

    const bool have_params = convolution::heuristic_params_available(
            cuda::current_device_prop().major,
            cuda::current_device_prop().minor, &num_layers, &dims, &weights,
            &offsets, &scale_alpha, &scale_beta, conv_type, &scratch,
            time_pred_p, mask_p);
    if (!have_params) {
        return false;
    }

    // Feature vector: shape[0..3], output channels per group, filter spatial
    // size and padding, in that order.
    size_t features[9];
    for (size_t axis = 0; axis < 4; ++axis) {
        features[axis] = x_layout->shape[axis];
    }
    features[4] = filter.ocpg;
    for (size_t axis = 0; axis < 2; ++axis) {
        features[5 + axis] = filter.spatial[axis];
        features[7 + axis] = filter.padding[axis];
    }

    predict_time(num_layers, dims, features, weights, offsets, scale_alpha,
                 scale_beta, scratch, *time_pred_p);

    *output_dim_p = dims[num_layers - 1];
    return true;
}

/**
 * Evaluate the fully connected network that predicts per-algorithm run time.
 *
 * The network has \p layer_num layers whose widths are given by
 * \p layers_dim. Every layer except the last applies ReLU. The weights of
 * consecutive layers are stored back to back in \p matrices, each as a
 * row-major (output x input) matrix, and the biases likewise in \p biases.
 * The structure requires layer_num >= 3 (one input layer, at least one
 * hidden layer, one output layer).
 *
 * \param layer_num number of entries in \p layers_dim
 * \param layers_dim width of each layer; layers_dim[0] is the input width
 * \param input_params input features, layers_dim[0] entries; indices
 *        0-8 are read when computing the problem size below
 * \param matrices concatenated weight matrices
 * \param biases concatenated bias vectors
 * \param alpha per-output divisor applied to the problem size
 * \param beta per-output multiplier applied to the exponent
 * \param hidden_units scratch buffer for hidden activations; it must hold
 *        the sum of all hidden layer widths and is overwritten
 * \param time_pred output buffer with layers_dim[layer_num - 1] entries; it
 *        is overwritten and does not need to be zeroed by the caller
 */
void convolution::PerformanceModelBase::predict_time(
        const size_t layer_num, const size_t* layers_dim,
        const size_t* input_params, const float* matrices, const float* biases,
        const float* alpha, const float* beta, float* hidden_units,
        float* time_pred) {
    const float* matrix_entry = matrices;
    const float* bias_entry = biases;
    float* prev_entry = nullptr;
    float* next_entry = hidden_units;

    // Input layer: consumes the integer feature vector directly.
    for (size_t j = 0; j < layers_dim[1]; ++j) {
        // A local accumulator is used so that the result does not depend on
        // the previous contents of the scratch buffer.
        float acc = 0.0f;
        for (size_t i = 0; i < layers_dim[0]; ++i) {
            acc += matrix_entry[j * layers_dim[0] + i] * input_params[i];
        }
        acc += bias_entry[j];
        next_entry[j] = element_ReLU(acc);
    }
    prev_entry = next_entry;
    next_entry += layers_dim[1];
    matrix_entry += layers_dim[0] * layers_dim[1];
    bias_entry += layers_dim[1];

    // Hidden layers: each consumes the activations of the previous layer.
    for (size_t layer_ind = 1; layer_ind < layer_num - 2; ++layer_ind) {
        const size_t in_dim = layers_dim[layer_ind];
        const size_t out_dim = layers_dim[layer_ind + 1];
        for (size_t j = 0; j < out_dim; ++j) {
            float acc = 0.0f;
            for (size_t i = 0; i < in_dim; ++i) {
                acc += matrix_entry[j * in_dim + i] * prev_entry[i];
            }
            acc += bias_entry[j];
            next_entry[j] = element_ReLU(acc);
        }
        prev_entry = next_entry;
        next_entry += out_dim;
        matrix_entry += in_dim * out_dim;
        bias_entry += out_dim;
    }

    // Output layer (no ReLU): one value per output, computed from the last
    // hidden layer. The outer loop runs over the output width so that only
    // layers_dim[layer_num - 1] entries of time_pred are written.
    const size_t last_in = layers_dim[layer_num - 2];
    const size_t last_out = layers_dim[layer_num - 1];
    for (size_t j = 0; j < last_out; ++j) {
        float acc = 0.0f;
        for (size_t i = 0; i < last_in; ++i) {
            acc += matrix_entry[j * last_in + i] * prev_entry[i];
        }
        acc += bias_entry[j];
        time_pred[j] = acc;
    }

    // Problem size derived from the input features: the product of
    // input_params[0], [1], [4], the two stride-1 output extents
    // (input + 2 * padding - filter + 1) and the filter extents [5], [6].
    const size_t shape =
            input_params[0] * input_params[1] * input_params[4] *
            (input_params[2] + input_params[7] * 2 - input_params[5] + 1) *
            (input_params[3] + input_params[8] * 2 - input_params[6] + 1) *
            input_params[5] * input_params[6];

    // Turn the raw network output into a time estimate.
    for (size_t i = 0; i < last_out; ++i) {
        time_pred[i] = std::exp2f(time_pred[i] * beta[i]) * (shape / alpha[i]);
    }
}

/* backward filter */
/**
 * Mark which backward-filter algorithms can run within the workspace limit.
 *
 * For every index below \p output_dim the index is cast to a
 * cudnnConvolutionBwdFilterAlgo_t and its workspace requirement is queried.
 * mask[idx] becomes 1.0f when the query succeeds and the reported size is
 * strictly smaller than \p workspace_size_limit_in_bytes, otherwise -1.0f.
 *
 * \param[out] mask buffer with at least \p output_dim entries
 * \param output_dim number of algorithm indices to probe
 * \param args supplies the cuDNN handle
 * \param D descriptors passed to the workspace query
 * \param workspace_size_limit_in_bytes exclusive upper bound on the workspace
 */
void convolution::PerformanceModelBackwardFilter::gen_mask_backward_filter(
        float* mask, const size_t output_dim,
        const ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs& args,
        const CUDNNBwdFilterDescs& D,
        const size_t workspace_size_limit_in_bytes) {
    for (size_t idx = 0; idx < output_dim; ++idx) {
        size_t required_bytes;
        const auto status = cudnnGetConvolutionBackwardFilterWorkspaceSize(
                args.handle->cudnn_handle(), D.src_desc.desc, D.diff_desc.desc,
                D.conv_desc.desc, D.grad_desc.desc,
                static_cast<cudnnConvolutionBwdFilterAlgo_t>(idx),
                &required_bytes);
        // required_bytes is only inspected when the query succeeded.
        const bool usable = status == CUDNN_STATUS_SUCCESS &&
                            required_bytes < workspace_size_limit_in_bytes;
        mask[idx] = usable ? 1.0f : -1.0f;
    }
}

/**
 * Choose a cuDNN backward-filter algorithm using the performance model.
 *
 * Among the algorithms that gen_mask_backward_filter() marks as usable, the
 * one with the smallest predicted time is selected (the lowest index wins
 * ties).
 *
 * \param args convolution arguments (source layout, filter meta, handle)
 * \param D cuDNN descriptors for the workspace queries
 * \param workspace_limit_in_bytes workspace budget
 * \param[out] algo receives the selected algorithm; it is left untouched when
 *        the model cannot be evaluated and holds algorithm 0 when no
 *        algorithm is usable
 * \return true when \p algo holds a usable algorithm, false otherwise
 */
bool convolution::PerformanceModelBackwardFilter::
        get_algo_backward_filter_success(
                const ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs& args,
                const CUDNNBwdFilterDescs& D,
                const size_t workspace_limit_in_bytes,
                cudnnConvolutionBwdFilterAlgo_t* algo) {
    float* mask;
    size_t output_dim;
    float* time_pred;

    if (!predict_time_success(args.src_layout, args.grad_filter_meta,
                              ConvolutionType::BACKWARD_FILTER, &mask,
                              &time_pred, &output_dim)) {
        return false;
    }

    gen_mask_backward_filter(mask, output_dim, args, D,
                             workspace_limit_in_bytes);

    // Only usable algorithms may become the running best. Seeding the search
    // with index 0 unconditionally would let an unusable algorithm 0 with a
    // low predicted time hide every usable candidate.
    bool found = false;
    size_t selected = 0;
    for (size_t i = 0; i < output_dim; ++i) {
        if (mask[i] > 0 &&
            (!found || time_pred[i] < time_pred[selected])) {
            selected = i;
            found = true;
        }
    }
    *algo = static_cast<cudnnConvolutionBwdFilterAlgo_t>(selected);

    return found;
}

/* backward data */
/**
 * Mark which backward-data algorithms can run within the workspace limit.
 *
 * For every index below \p output_dim the index is cast to a
 * cudnnConvolutionBwdDataAlgo_t and its workspace requirement is queried.
 * mask[idx] becomes 1.0f when the query succeeds and the reported size is
 * strictly smaller than \p workspace_size_limit_in_bytes, otherwise -1.0f.
 *
 * \param[out] mask buffer with at least \p output_dim entries
 * \param output_dim number of algorithm indices to probe
 * \param args supplies the cuDNN handle
 * \param D descriptors passed to the workspace query
 * \param workspace_size_limit_in_bytes exclusive upper bound on the workspace
 */
void convolution::PerformanceModelBackwardData::gen_mask_backward_data(
        float* mask, const size_t output_dim,
        const ConvolutionBackwardDataImpl::AlgoBase::SizeArgs& args,
        const CUDNNBwdDataDescs& D,
        const size_t workspace_size_limit_in_bytes) {
    for (size_t idx = 0; idx < output_dim; ++idx) {
        size_t required_bytes;
        const auto status = cudnnGetConvolutionBackwardDataWorkspaceSize(
                args.handle->cudnn_handle(), D.filter_desc.desc,
                D.diff_desc.desc, D.conv_desc.desc, D.grad_desc.desc,
                static_cast<cudnnConvolutionBwdDataAlgo_t>(idx),
                &required_bytes);
        // required_bytes is only inspected when the query succeeded.
        const bool usable = status == CUDNN_STATUS_SUCCESS &&
                            required_bytes < workspace_size_limit_in_bytes;
        mask[idx] = usable ? 1.0f : -1.0f;
    }
}

/**
 * Choose a cuDNN backward-data algorithm using the performance model.
 *
 * Among the algorithms that gen_mask_backward_data() marks as usable, the one
 * with the smallest predicted time is selected (the lowest index wins ties).
 * Strongly asymmetric filters always fall back to algorithm 0.
 *
 * \param args convolution arguments (grad layout, filter meta, handle)
 * \param D cuDNN descriptors for the workspace queries
 * \param workspace_limit_in_bytes workspace budget
 * \param[out] algo receives the selected algorithm; it is left untouched when
 *        the model cannot be evaluated and holds algorithm 0 when no
 *        algorithm is usable
 * \return true when \p algo holds a usable algorithm, false otherwise
 */
bool convolution::PerformanceModelBackwardData::get_algo_backward_data_success(
        const ConvolutionBackwardDataImpl::AlgoBase::SizeArgs& args,
        const CUDNNBwdDataDescs& D, const size_t workspace_limit_in_bytes,
        cudnnConvolutionBwdDataAlgo_t* algo) {
    float* mask;
    size_t output_dim;
    float* time_pred;

    if (!predict_time_success(args.grad_layout, args.filter_meta,
                              ConvolutionType::BACKWARD_DATA, &mask, &time_pred,
                              &output_dim)) {
        return false;
    }

    gen_mask_backward_data(mask, output_dim, args, D, workspace_limit_in_bytes);

    // Only usable algorithms may become the running best. Seeding the search
    // with index 0 unconditionally would let an unusable algorithm 0 with a
    // low predicted time hide every usable candidate.
    bool found = false;
    size_t selected = 0;
    for (size_t i = 0; i < output_dim; ++i) {
        if (mask[i] > 0 &&
            (!found || time_pred[i] < time_pred[selected])) {
            selected = i;
            found = true;
        }
    }

    // special case:
    // if the filter shape in cudnnConvolutionBackwardData is too asymmetric,
    // the performance of algo1 is dramatically reduced,
    // we temporarily choose algo0.
    if (args.filter_meta.spatial[0] / args.filter_meta.spatial[1] > 32 ||
        args.filter_meta.spatial[1] / args.filter_meta.spatial[0] > 32) {
        selected = 0;
        // Algorithm 0 is only acceptable if it is itself usable.
        found = output_dim > 0 && mask[0] > 0;
    }
    *algo = static_cast<cudnnConvolutionBwdDataAlgo_t>(selected);

    return found;
}

+ 0
- 86
dnn/src/cuda/convolution/cudnn_heuristic.h View File

@@ -1,86 +0,0 @@
/**
* \file dnn/src/cuda/convolution/cudnn_heuristic.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once

#include "src/cuda/convolution/backward_data/algo.h"
#include "src/cuda/convolution/backward_filter/algo.h"

namespace megdnn {
namespace cuda {
namespace convolution {

/**
 * Kind of convolution pass for which performance-model parameters are
 * requested; passed to heuristic_params_available() and
 * PerformanceModelBase::predict_time_success().
 */
enum class ConvolutionType {
FORWARD = 0,
BACKWARD_FILTER = 1,
BACKWARD_DATA = 2
};

/**
 * Look up the performance-model parameters for a device and convolution type.
 *
 * On success the output pointers describe a small fully connected network
 * (layer count, layer widths, weights, biases), the per-output alpha/beta
 * scaling constants, and the scratch buffers used to evaluate it.
 *
 * \param cuda_major, cuda_minor compute capability of the device
 * \param conv_type which parameter set is requested
 * \return true when a parameter set exists for the given device and
 *         convolution type, false otherwise
 *
 * NOTE(review): in the implementation visible in get_params.cpp the
 * hidden_units / time_pred / mask buffers are non-static arrays local to the
 * function, so the pointers stored through hidden_units_p, time_pred_p and
 * mask_p appear to outlive their storage -- confirm.
 */
bool heuristic_params_available(
int cuda_major, int cuda_minor, size_t* layer_num_p,
const size_t** layers_dim_p, const float** matrices_p,
const float** biases_p, const float** alpha_p, const float** beta_p,
const ConvolutionType& conv_type, float** hidden_units_p,
float** time_pred_p, float** mask_p);

/**
 * Shared machinery of the cuDNN algorithm performance model: argument
 * validation and evaluation of the prediction network.
 */
class PerformanceModelBase {
public:
    //! Rectified linear unit: positive inputs pass through, everything else
    //! (including values that do not compare greater than zero) maps to zero.
    static float element_ReLU(float element) {
        if (element > 0.0f) {
            return element;
        }
        return 0.0f;
    }

    //! Validate the arguments, fetch the model parameters for the current
    //! device and evaluate the model. Returns false when the model cannot be
    //! applied; on success the three output pointers are filled in.
    static bool predict_time_success(
            const TensorLayout* x_layout,
            const ConvolutionBase<param::Convolution>::CanonizedFilterMeta&
                    filter,
            const ConvolutionType& conv_type, float** mask_p,
            float** time_pred_p, size_t* output_dim_p);

private:
    //! Whether the layout / filter combination is supported by the model.
    static bool args_is_proper(
            const TensorLayout* x_layout,
            const ConvolutionBase<param::Convolution>::CanonizedFilterMeta&
                    filter);

    //! Evaluate the network described by the layer/weight arguments on
    //! \p input_params and store the predicted times in \p time_pred.
    static void predict_time(const size_t layer_num, const size_t* layers_dim,
                             const size_t* input_params, const float* matrices,
                             const float* biases, const float* alpha,
                             const float* beta, float* hidden_units,
                             float* time_pred);
};

/**
 * Performance-model based algorithm selection for convolution backward
 * filter.
 */
class PerformanceModelBackwardFilter : public PerformanceModelBase {
public:
// Select a cuDNN backward-filter algorithm from the predicted times,
// restricted to algorithms whose workspace fits the limit. Returns true when
// *algo holds a usable algorithm, false otherwise.
static bool get_algo_backward_filter_success(
const ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs& args,
const CUDNNBwdFilterDescs& D, const size_t workspace_limit_in_bytes,
cudnnConvolutionBwdFilterAlgo_t* algo);

private:
// Fill mask[0 .. output_dim) with 1.0f for algorithms whose workspace query
// succeeds below the limit and -1.0f for all others.
static void gen_mask_backward_filter(
float* mask, const size_t output_dim,
const ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs& args,
const CUDNNBwdFilterDescs& D,
const size_t workspace_limit_in_bytes);
};

/**
 * Performance-model based algorithm selection for convolution backward data.
 */
class PerformanceModelBackwardData : public PerformanceModelBase {
public:
// Select a cuDNN backward-data algorithm from the predicted times,
// restricted to algorithms whose workspace fits the limit. Returns true when
// *algo holds a usable algorithm, false otherwise.
static bool get_algo_backward_data_success(
const ConvolutionBackwardDataImpl::AlgoBase::SizeArgs& args,
const CUDNNBwdDataDescs& D, const size_t workspace_limit_in_bytes,
cudnnConvolutionBwdDataAlgo_t* algo);

private:
// Fill mask[0 .. output_dim) with 1.0f for algorithms whose workspace query
// succeeds below the limit and -1.0f for all others.
static void gen_mask_backward_data(
float* mask, const size_t output_dim,
const ConvolutionBackwardDataImpl::AlgoBase::SizeArgs& args,
const CUDNNBwdDataDescs& D, const size_t workspace_limit_in_bytes);
};

} // namespace convolution
} // namespace cuda
} // namespace megdnn

+ 0
- 754
dnn/src/cuda/convolution/get_params.cpp View File

@@ -1,754 +0,0 @@
/**
* \file dnn/src/cuda/convolution/get_params.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "src/cuda/convolution/cudnn_heuristic.h"
#include "megdnn.h"

using namespace megdnn;
using namespace cuda;
using namespace convolution;

bool convolution::heuristic_params_available(
int cuda_major, int cuda_minor, size_t* layer_num_p,
const size_t** layers_dim_p, const float** matrices_p,
const float** biases_p, const float** alpha_p, const float** beta_p,
const ConvolutionType& conv_type, float** hidden_units_p,
float** time_pred_p, float** mask_p) {
MEGDNN_MARK_USED_VAR(cuda_major);
MEGDNN_MARK_USED_VAR(cuda_minor);
MEGDNN_MARK_USED_VAR(layer_num_p);
MEGDNN_MARK_USED_VAR(layers_dim_p);
MEGDNN_MARK_USED_VAR(matrices_p);
MEGDNN_MARK_USED_VAR(biases_p);
MEGDNN_MARK_USED_VAR(alpha_p);
MEGDNN_MARK_USED_VAR(beta_p);
MEGDNN_MARK_USED_VAR(conv_type);
MEGDNN_MARK_USED_VAR(hidden_units_p);
MEGDNN_MARK_USED_VAR(time_pred_p);
MEGDNN_MARK_USED_VAR(mask_p);

#if CUDNN_MAJOR == 6 && CUDNN_MINOR == 0
float cuda5_2_BACKWARD_FILTER_time_pred[7] = {0.0f};
float cuda5_2_BACKWARD_FILTER_mask[7] = {0.0f};
float cuda5_2_BACKWARD_FILTER_hidden_units[24] = {0.0f};
const static size_t cuda5_2_BACKWARD_FILTER_layers_dim[4] = {9, 12, 12, 7};
const static float cuda5_2_BACKWARD_FILTER_matrices[336] = {
3.499478e-03, 1.353932e-02, -1.316529e-01, 1.006798e-01,
1.249662e-02, -3.591197e-01, -4.299506e-01, -3.613592e-01,
-3.783917e-01, -4.249511e-01, 6.287370e-03, -2.861480e-03,
3.128614e-03, 8.496360e-03, 5.568272e-01, 1.965293e-01,
-6.205962e-02, -1.999864e-01, 9.333656e-03, -6.377945e-02,
6.122595e-02, 1.122032e-01, -1.683744e-02, -9.395520e-02,
-2.953549e-02, -2.772853e-02, -2.892097e-02, 3.200796e-03,
5.553298e-03, 6.707606e-01, 3.111190e-01, -5.293804e-01,
-8.127835e-02, -5.839296e-02, 9.633666e-02, 5.957389e-02,
-7.131222e-02, 4.057650e-02, 4.311656e-02, -1.456163e-02,
5.683148e-02, 6.175192e-02, 9.331264e-02, 9.957494e-02,
5.202487e-02, 0.0, 0.0, -7.725500e-14,
-8.058319e-17, 0.0, 0.0, 0.0,
0.0, 0.0, 1.988015e-04, -1.530555e-01,
3.629641e-03, -1.238047e-03, 1.692593e-02, 3.404703e-01,
5.441420e-01, -3.275000e-01, -3.742920e-01, -1.714999e-01,
1.979161e-02, 5.019676e-02, 1.406423e-02, -4.360787e-02,
-5.948093e-03, 1.522342e-01, 1.012455e-02, 5.666151e-02,
-7.033888e-05, 1.519375e-02, -2.360136e-02, -5.682724e-04,
-2.552732e-02, 2.329080e-01, 3.437024e-01, 4.054402e-01,
3.379739e-01, 1.566344e-03, 3.172801e-02, -1.336258e-02,
1.401075e-02, 2.876163e-02, 1.293039e+00, 7.118387e-01,
2.966451e-01, 4.372724e-01, -2.286311e-02, -6.896693e-03,
3.156468e-02, 3.829155e-02, -9.890525e-04, 1.836302e-02,
2.394343e-02, 4.963258e-02, 4.368515e-02, 2.950634e-03,
1.129842e-02, 7.078686e-01, 3.193808e-01, 9.759862e-03,
2.906150e-01, 1.806232e-01, 1.396071e-01, 2.047469e-01,
-2.561368e-01, -3.322504e-01, 7.250011e-02, -3.389789e-02,
-1.372720e-02, 0.0, -1.690562e-01, -1.013354e-01,
-1.920926e-02, 1.018956e-01, 2.467915e-02, 4.451101e-02,
-4.139300e-02, -1.031867e-02, -5.686982e-03, 2.993172e-01,
1.746564e-02, -3.393853e-20, 1.905611e-02, -5.220098e-02,
4.550828e-02, 8.211702e-02, -2.850403e-03, -2.816908e-01,
6.826700e-02, -1.102444e-02, 7.373374e-03, 9.173237e-03,
-6.144243e-03, 0.0, -1.675391e-02, 2.949211e-02,
-1.925736e-02, 2.259453e-02, 6.339108e-02, -1.233638e-01,
-1.239254e-02, -9.204817e-03, -6.979109e-02, -2.015045e-02,
-1.624232e-02, 0.0, 8.557694e-02, -2.066801e-02,
2.876340e-01, -1.265177e-01, 7.225822e-03, 7.337274e-02,
-4.342360e-02, -1.974944e-01, -6.721890e-03, -4.495411e-02,
-3.655335e-02, 0.0, -4.551398e-01, 8.440251e-02,
-2.404170e-01, 1.250752e-01, 1.646416e-03, 9.063166e-02,
2.506036e-02, 8.455078e-03, -1.908465e-02, 6.791655e-03,
2.511951e-02, 0.0, 7.265597e-03, -1.285137e-03,
-3.404747e-04, 8.924944e-03, 4.234224e-03, -1.186513e-02,
2.454471e-02, 9.120111e-04, 2.120904e-02, -5.555666e-03,
-1.493565e-02, 0.0, 2.764972e-03, -6.132948e-04,
6.180623e-03, 3.238724e-03, -1.073131e-02, -1.342798e-04,
8.969568e-02, 1.010931e-01, -1.038349e-02, -9.198243e-02,
4.724314e-02, 0.0, 1.175188e-02, -6.051729e-02,
-2.525244e-03, -1.566657e-01, -1.447370e-02, 1.747005e-01,
1.078679e-01, 2.556116e-01, 3.880575e-02, 9.777729e-03,
1.078563e-01, 0.0, 4.525005e-01, 8.311278e-03,
8.198996e-02, -2.884443e-01, -1.808732e-02, -3.114621e-02,
1.732809e-02, 2.442103e-01, 3.329617e-02, 8.462872e-03,
6.775563e-02, -7.453864e-19, 1.846050e-01, 2.739331e-02,
1.029433e-01, -2.251960e-01, 3.331415e-02, -2.261097e-02,
3.815529e-02, -5.755350e-02, -8.908589e-03, -4.526101e-02,
1.555560e-02, 0.0, 2.347023e-02, -1.399980e-01,
-2.699343e-02, 2.168779e-02, 2.629133e-03, 3.232189e-02,
3.693172e-02, -9.767429e-02, 2.461806e-02, 1.045579e-01,
5.808600e-02, 0.0, -1.331031e-02, 3.555656e-03,
-9.530113e-02, -1.961061e-02, -1.579800e-02, -7.582582e-02,
-3.099381e-02, 9.698432e-01, 7.805698e-01, 1.542833e-01,
-1.025053e-01, -7.509316e-04, -1.675225e-02, -7.818724e-03,
-2.718012e-01, 8.506276e-01, 3.869322e-02, 2.732933e-02,
-4.932691e-02, 7.077541e-01, 4.385699e-01, 8.550947e-02,
-1.737943e-01, -1.007005e-02, 1.884576e-02, 6.328513e-02,
-2.711761e-01, 1.054725e+00, -1.001195e-02, 6.876359e-02,
-4.647969e-01, 7.618478e-01, 1.170148e+00, 5.507177e-02,
-1.284761e-01, 2.255174e-02, 5.041638e-03, 2.431494e-01,
-2.259419e-01, 6.318219e-01, 4.526694e-02, -1.068190e-01,
9.181661e-05, 7.900977e-01, 5.499427e-01, 2.147153e-02,
-1.855706e-01, -6.816355e-03, 2.600182e-02, 7.784649e-02,
-2.902775e-01, 9.821153e-01, -1.705817e-02, 9.162355e-02,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
4.378970e-02, 7.106545e-01, 5.512720e-01, 1.076883e-01,
-3.036375e-01, 4.190212e-02, -1.192542e-02, 1.002918e-01,
-2.498885e-01, 6.789825e-01, -1.278644e-01, 8.962566e-02,
-4.231460e-02, 2.334089e-01, 3.083326e-03, 2.404322e-02,
-2.668908e-01, 3.057625e-03, -1.283901e-03, 1.349618e-02,
-4.993697e-02, 6.061308e-01, -9.689163e-02, 1.609056e-01};
const static float cuda5_2_BACKWARD_FILTER_biases[31] = {
3.927711e-02, 4.658543e-01, 3.737917e-02, -4.173907e-02,
6.516264e-04, 0.0, 1.141180e+00, 5.656777e-03,
-1.466774e-01, -3.637813e-01, 3.348432e-02, -2.374912e-01,
1.856181e-01, 1.458818e+00, 1.436140e+00, 1.708800e-01,
3.663654e-02, 2.147604e-02, 5.249544e-02, 9.389526e-02,
-7.182905e-02, 2.513293e+00, -6.255527e-02, -1.452608e-01,
-7.379941e-01, -5.884537e-01, -6.324590e-01, -6.180407e-01,
0.0, -1.712828e-01, -2.353933e-01};
const static float cuda5_2_BACKWARD_FILTER_alpha[7] = {
2.189385e+08, 1.987406e+08, 6.368552e+07, 2.164986e+08,
2.000000e+08, 3.611623e+08, 8.509315e+06};
const static float cuda5_2_BACKWARD_FILTER_beta[7] = {
1.558573e+00, 1.825239e+00, 1.782366e+00, 1.772095e+00,
2.000000e+00, 1.856787e+00, 1.625270e+00};
float cuda5_2_BACKWARD_DATA_time_pred[6] = {0.0f};
float cuda5_2_BACKWARD_DATA_mask[6] = {0.0f};
float cuda5_2_BACKWARD_DATA_hidden_units[24] = {0.0f};
const static size_t cuda5_2_BACKWARD_DATA_layers_dim[4] = {9, 12, 12, 6};
const static float cuda5_2_BACKWARD_DATA_matrices[324] = {
1.090385e-03, -9.525486e-02, 2.116694e-02, 8.324536e-03,
2.443915e-03, -1.486993e-03, 1.996945e-01, -3.490458e-02,
-2.909729e-01, -4.403929e-01, 3.302580e-03, 8.758115e-03,
2.016278e-03, 5.139519e-03, 6.631530e-01, 4.163170e-01,
-2.275565e-01, -1.927734e-01, 4.901680e-02, 3.499708e-02,
4.430823e-02, -6.245822e-01, 2.489910e-02, 2.943479e-01,
3.011928e-01, -6.154800e-02, -6.945755e-02, 2.156114e-02,
-2.706529e-02, 2.254039e-02, -2.130969e-01, -1.711698e-03,
3.185264e-01, 3.669779e-01, 2.366176e-01, 2.016553e-01,
1.742197e-04, 8.993217e-04, -3.757331e-01, -1.517802e-01,
1.150989e-03, 4.397022e-01, 2.472478e-01, -5.120142e-01,
-5.310764e-01, -2.185705e-02, -1.019608e-02, -1.484592e-01,
-1.720972e-01, 3.073631e-02, 1.679189e-02, 9.030435e-03,
-4.171251e-03, -7.412981e-03, 3.670006e-02, 2.704583e-02,
1.162922e-01, 8.629673e-02, -1.661878e-01, -1.722751e-01,
-2.494859e-01, 6.303188e-02, 2.379866e-03, -9.154570e-02,
-8.703206e-02, 3.478937e-02, 2.733189e-02, -6.598901e-02,
-2.212522e-02, -3.853705e-02, 2.827537e-02, 2.944724e-02,
1.588451e-02, 2.663488e-02, 1.933236e-02, 3.978135e-02,
1.509624e-02, 1.144023e+00, 7.680039e-01, 4.072323e-01,
3.243737e-01, 4.177893e-02, 4.054888e-02, 1.758260e-01,
1.351026e-01, 2.773634e-02, 8.728213e-02, 1.938261e-01,
-1.641249e-02, 7.889663e-02, 4.266707e-04, 6.022587e-04,
6.884130e-04, 2.244700e-04, -3.188357e-01, 1.903596e-01,
3.979538e-01, -2.875198e-01, -5.881550e-01, -1.732513e-02,
2.107770e-02, -2.415357e-02, 5.184836e-02, 2.633666e-03,
-4.351313e-01, -3.523280e-01, -1.124865e-01, -5.509025e-02,
-2.874137e-03, -2.260433e-03, 5.087418e-03, 2.825674e-03,
4.565214e-03, 1.520132e-03, -1.722531e-03, -1.287867e-04,
1.223576e-03, -5.230475e-04, -2.300250e-03, -6.684309e-03,
-7.956048e-03, -3.028432e-03, 2.238011e-02, -1.166453e-02,
6.994838e-02, 5.585106e-03, -9.814836e-03, -4.010206e-03,
-3.232308e-03, -1.020571e-02, -1.587651e-02, 6.942352e-02,
6.370817e-01, 5.906755e-02, -3.062441e-03, 9.914325e-02,
2.335527e-01, -4.718621e-03, -2.132248e-02, 3.841487e-02,
7.563891e-02, -7.599686e-02, 1.408871e-01, 5.740594e-02,
1.902002e-01, 2.145507e-01, 3.427162e-02, 3.367433e-02,
2.967569e-01, 2.863470e-02, 3.392174e-02, 3.514072e-02,
-1.441963e-01, -4.797359e-02, -5.965770e-03, 1.214167e-01,
0.0, 0.0, -4.498340e-06, -1.828862e-07,
0.0, 1.093948e-12, -2.601859e-06, 0.0,
-9.811162e-09, -2.785148e-06, 0.0, -2.360134e-27,
-1.110723e-01, -1.570218e-01, -4.062234e-02, -7.606770e-02,
5.144730e-01, 9.398572e-02, 1.906881e-01, 1.747961e-02,
1.106279e-01, -1.254419e-01, 6.205062e-01, -5.617496e-02,
-1.629532e-01, -1.042091e-01, -1.413646e-01, 1.433934e-01,
1.425548e-01, 2.505819e-02, 5.484238e-04, -9.254320e-02,
1.448994e-01, 3.132954e-02, -1.425708e-01, -1.685494e-02,
-3.513211e-01, -1.992232e-01, -1.081804e-01, 4.960524e-02,
-5.546688e-01, -1.675645e-02, -3.610602e-02, 2.780567e-02,
2.227647e-01, 4.038066e-02, -6.002745e-01, -1.275032e-01,
-1.026016e-01, -2.635376e-01, 2.059869e-02, -8.100250e-02,
8.695480e-02, -4.293829e-02, -1.870224e-02, 7.269356e-02,
3.979762e-02, 3.270284e-02, 1.190808e-01, -1.059370e-01,
1.286611e-02, 3.927987e-02, 7.228687e-03, 2.264480e-02,
-1.119717e-01, 8.701903e-02, 2.064170e-02, 5.297894e-02,
9.965703e-03, 1.206108e-02, -5.411500e-02, -5.476563e-02,
-1.837980e-01, -7.351980e-01, -1.781217e-01, 1.473823e-01,
-4.530039e-01, -3.604104e-02, 2.418269e-02, 2.903621e-02,
4.367216e-01, -5.112789e-02, -3.706729e-01, -2.049569e-01,
-9.153855e-02, -1.008104e-01, -1.009935e-02, -1.033947e-01,
5.495172e-02, 1.323372e-02, -5.191914e-02, -1.545710e-02,
3.271207e-02, 1.939050e-02, -3.092350e-02, 7.518642e-02,
-5.528467e-03, 8.568556e-02, 1.924936e-02, 1.007434e+00,
-6.850208e-07, 5.599304e-01, 3.076834e-01, -4.312680e-01,
7.534813e-02, -3.293671e-02, 5.830373e-03, -2.450454e-02,
-3.698347e-04, -8.712796e-03, 4.009782e-01, 1.215293e+00,
-5.273760e-07, 2.344936e-01, 1.927198e-01, -3.006327e-01,
-2.927265e-02, -8.696410e-03, -2.446414e-02, 1.890189e-02,
3.553152e-03, -1.651816e-02, 2.438239e-01, 6.245783e-01,
1.809883e-07, 3.264363e-01, 7.772639e-01, -2.954962e-01,
2.704587e-02, -3.836469e-02, -4.457633e-01, 1.726713e-02,
5.172309e-03, 1.289187e-02, 5.472647e-01, 6.243305e-01,
-4.123602e-08, 4.334479e-01, 7.573158e-02, -2.572208e-01,
5.492910e-02, -9.502222e-03, -2.104075e-01, -3.131663e-02,
2.312713e-03, 3.963990e-02, 4.713630e-01, 8.256559e-01,
-2.583514e-08, 4.528451e-01, 7.318445e-02, -2.987004e-01,
8.577114e-02, -2.907754e-02, -5.389895e-02, 8.495960e-02,
-1.558219e-04, 3.880079e-02, 4.180317e-01, 5.884213e-01,
3.963620e-07, 4.769594e-01, 3.800152e-01, -3.191836e-01,
-1.669163e-01, 8.362461e-04, -1.668053e-01, -9.146041e-02};
const static float cuda5_2_BACKWARD_DATA_biases[30] = {
1.238052e+00, 7.745910e-01, 3.356679e-01, -7.175566e-02,
1.497247e+00, 3.300638e-03, 2.789130e-01, -8.312362e-02,
-7.829870e-02, -3.456568e-01, 1.328189e+00, -2.689771e-01,
9.444705e-03, -1.149580e-01, 4.422197e-01, 2.072980e+00,
0.0, 4.782698e-01, -1.116326e+00, 7.193607e-01,
2.938375e-02, 1.465170e-02, 8.513468e-02, 6.830001e-02,
4.035618e-01, 1.607704e-01, 9.502214e-01, 6.022118e-01,
2.584324e-01, 7.981322e-01};
const static float cuda5_2_BACKWARD_DATA_alpha[6] = {
1.997689e+08, 3.799992e+08, 6.843723e+07, 1.140762e+08,
5.562133e+08, 3.324116e+08};
const static float cuda5_2_BACKWARD_DATA_beta[6] = {
1.537834e+00, 1.587649e+00, 1.844705e+00, 1.671656e+00,
1.672516e+00, 1.705950e+00};
float cuda5_2_FORWARD_time_pred[8] = {0.0f};
float cuda5_2_FORWARD_mask[8] = {0.0f};
float cuda5_2_FORWARD_hidden_units[24] = {0.0f};
const static size_t cuda5_2_FORWARD_layers_dim[4] = {9, 12, 12, 8};
const static float cuda5_2_FORWARD_matrices[348] = {
-9.209032e-02, -1.659105e-01, -5.965192e-02, -2.153863e-02,
8.719379e-02, -3.499233e-02, 7.201853e-03, -1.419160e-02,
-1.818457e-04, -3.145495e-01, 1.526620e-03, -3.928741e-03,
-2.569693e-03, 3.410484e-03, 2.167806e-01, 1.747067e-01,
-2.598841e-01, -3.055519e-01, 5.274500e-04, -9.025287e-03,
-2.483256e-02, 4.541647e-02, 7.308841e-02, -4.819591e-01,
-4.753071e-01, -1.471946e-02, 5.257137e-03, 2.392092e-03,
-1.222254e-02, 1.609546e-02, -3.770980e-03, 1.646060e-02,
1.753314e-02, 1.508273e-02, 9.316003e-03, -5.777596e-04,
-2.694935e-05, 1.604315e-03, -1.762570e-02, -4.887820e-01,
4.957791e-03, 2.363977e-01, 3.638881e-01, -4.731908e-01,
-5.269557e-01, -1.159047e-03, 1.838379e-02, -1.427773e-01,
-1.495254e-01, 1.330812e-01, 3.283872e-01, 3.582126e-01,
-1.175109e-01, -1.454948e-01, 2.369200e-02, 1.493328e-02,
3.108240e-02, 3.270133e-02, -6.615507e-01, 3.380858e-01,
3.704230e-01, 8.769190e-02, -6.377754e-02, 4.325379e-02,
-2.027540e-03, -1.402376e-01, -9.008316e-02, -2.559709e-03,
-8.711295e-02, -9.627704e-02, -1.539383e-01, -1.632525e-01,
3.015039e-02, 3.144164e-02, 6.656437e-02, 5.488716e-02,
1.877632e-01, 5.748791e-01, 3.917130e-01, 2.071713e-01,
2.771358e-01, -5.960735e-02, 1.106716e-02, 5.781374e-02,
6.840285e-03, 2.902341e-02, -3.347534e-01, -1.212164e-01,
-8.089989e-02, -1.384973e-01, 1.251527e-02, -2.644526e-01,
6.949010e-02, 2.681785e-02, 1.081700e-01, -3.502952e-02,
3.512865e-01, -9.033766e-02, 2.017496e-02, 2.095562e-02,
1.330583e-02, 2.582395e-02, -2.550245e-03, -1.596605e-03,
-4.966798e-01, -5.384876e-01, -3.006902e-01, -2.735094e-01,
2.044184e-02, 3.490414e-01, 1.717040e-02, 6.914880e-03,
1.496788e-02, -7.078647e-02, 6.652176e-02, 6.768194e-03,
-3.086404e-02, 1.317981e-01, -5.902661e-02, -8.681632e-02,
-6.622906e-02, 1.597742e-01, 3.700355e-03, 1.707309e-02,
-5.229016e-02, 2.836531e-02, 9.072421e-03, -1.104825e-01,
1.009224e-02, -1.915519e-02, -2.592222e-02, -9.112109e-02,
-2.824950e-02, 5.274639e-01, 1.052709e-01, 1.325189e-02,
3.486569e-01, 1.155336e-01, 7.854062e-02, 1.637263e-02,
-1.599528e-01, 1.090762e-01, 2.625560e-02, 8.724683e-02,
3.858089e-02, -5.696925e-01, -2.280933e-01, -3.096054e-02,
-5.547203e-01, -6.229282e-02, -1.009606e-01, 5.365341e-02,
1.673071e-01, -1.734997e-01, -2.949879e-02, -2.640804e-01,
4.783161e-02, -4.411741e-01, -1.495569e-01, -1.043236e-02,
-2.952088e-01, -2.866718e-02, 4.253592e-02, 3.828135e-02,
7.448777e-02, -2.757399e-02, -6.067163e-02, -2.007495e-01,
-3.468005e-02, -1.678551e-01, -2.086982e-02, -2.114448e-02,
-2.844830e-02, 3.823385e-03, 8.453450e-03, 1.447659e-03,
5.760803e-02, 7.803936e-02, -7.363023e-02, -1.894736e-03,
6.325649e-02, 1.527100e-02, -4.378622e-02, 3.171223e-03,
8.858634e-01, 7.191087e-02, 2.045580e-01, -3.890414e-03,
-7.661989e-02, 2.667563e-02, -2.549908e-02, -9.384236e-02,
-4.146666e-02, 2.281848e-01, 7.052436e-02, 1.180828e-03,
1.976338e-01, 1.647339e-02, -2.741527e-02, 1.641885e-02,
-1.197201e-01, -3.670282e-02, 1.672286e-01, 5.267144e-02,
8.803396e-02, 4.463083e-01, -8.939818e-03, 4.523633e-03,
-1.554685e-01, -1.392173e-02, 4.290194e-03, -9.498623e-03,
-2.200229e-02, -1.022839e-01, 1.553784e-02, 4.006403e-02,
-8.901481e-02, 1.353742e-01, -6.176645e-02, 2.818892e-03,
4.842044e-02, 1.031219e-02, 4.689164e-03, 2.677023e-01,
-1.331718e-02, 2.130043e-01, 7.004514e-03, -5.422973e-01,
7.450043e-03, 4.017003e-01, -9.216257e-03, -2.551504e-02,
-2.416791e-01, -1.451814e-01, -1.796521e-01, -1.749250e-02,
9.023457e-02, 9.444007e-02, -5.293583e-03, -1.027239e-01,
1.017421e-02, 1.213706e-01, -3.460192e-02, 8.999067e-03,
-1.110771e-01, 2.168397e-01, -4.417743e-02, 8.891370e-02,
-1.271863e-01, -7.239018e-02, -1.346174e-02, 5.777563e-02,
7.088694e-02, 6.467304e-02, 7.867605e-01, -2.014701e-01,
1.461604e-01, -6.571004e-02, 6.528026e-01, 6.720600e-01,
4.151264e-01, -6.271046e-03, -1.568682e-02, 2.438027e-01,
6.112317e-02, 8.257028e-02, 8.817917e-01, -1.995129e-01,
9.260281e-02, -6.511735e-02, 6.132895e-01, 5.789503e-01,
3.354024e-01, 1.621681e-04, -1.380093e-02, 2.710598e-01,
1.104726e-01, 5.625401e-02, 7.417016e-01, -2.523506e-01,
1.436054e-01, -7.903862e-02, 5.858161e-01, 4.287509e-01,
5.370684e-01, -9.449220e-02, -9.393471e-03, 3.037375e-01,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
4.773019e-01, -2.101818e-02, 2.934896e-01, -4.207794e-01,
-2.892000e-01, -1.383682e-02, 3.842597e-01, 5.408122e-01,
-1.901669e-01, -5.255229e-02, 3.103573e-01, 7.447528e-01,
1.010295e-01, 5.580491e-03, 4.166604e-01, -2.997382e-01,
-3.115629e-01, -2.585651e-02, 5.481771e-01, 6.307158e-01,
4.869811e-01, 6.668988e-01, -8.661555e-02, 6.073793e-01,
6.002924e-03, 1.855917e-02, 5.628079e-01, -1.967446e-01,
-1.365761e-01, -3.095432e-02, 6.461580e-01, 7.712716e-01,
4.082011e-01, 8.834770e-02, -1.021050e-01, 4.353123e-01,
2.292985e-01, -6.493770e-02, 2.730630e-01, -3.267927e-01,
-3.408634e-01, -6.609171e-02, 5.608538e-01, 7.108021e-01,
3.760323e-01, 3.335001e-01, 8.168215e-02, 2.858790e-01};
const static float cuda5_2_FORWARD_biases[32] = {
-1.021053e-02, 1.398318e+00, -2.447664e-01, 2.701163e-02,
1.148165e+00, 6.030037e-01, 2.089586e-01, 5.609234e-02,
-4.842668e-01, 1.262153e-01, 2.643087e-01, 3.539835e-01,
4.792117e-01, 4.310244e-02, 1.993983e+00, 2.597207e-01,
-2.811204e-01, 7.933383e-02, 1.056050e+00, 1.234862e+00,
7.894841e-01, 2.019784e-01, -1.216166e-01, 8.840314e-01,
-3.542692e-01, -3.693904e-01, -2.181383e-01, 0.0,
-2.216420e-01, -1.602890e-01, 8.500483e-03, 2.072607e-01};
const static float cuda5_2_FORWARD_alpha[8] = {
2.549612e+08, 3.579459e+08, 1.927015e+08, 2.000000e+08,
3.222185e+07, 8.748824e+07, 6.676129e+08, 2.775480e+08};
const static float cuda5_2_FORWARD_beta[8] = {
1.463412e+00, 1.553222e+00, 1.515109e+00, 2.000000e+00,
2.117807e+00, 1.622262e+00, 1.626601e+00, 1.669380e+00};
if (conv_type == ConvolutionType::BACKWARD_FILTER && cuda_major == 5 &&
cuda_minor == 2) {
*layer_num_p = 4;
*hidden_units_p = cuda5_2_BACKWARD_FILTER_hidden_units;
*layers_dim_p = cuda5_2_BACKWARD_FILTER_layers_dim;
*matrices_p = cuda5_2_BACKWARD_FILTER_matrices;
*biases_p = cuda5_2_BACKWARD_FILTER_biases;
*alpha_p = cuda5_2_BACKWARD_FILTER_alpha;
*beta_p = cuda5_2_BACKWARD_FILTER_beta;
*time_pred_p = cuda5_2_BACKWARD_FILTER_time_pred;
*mask_p = cuda5_2_BACKWARD_FILTER_mask;
} else if (conv_type == ConvolutionType::BACKWARD_DATA && cuda_major == 5 &&
cuda_minor == 2) {
*layer_num_p = 4;
*hidden_units_p = cuda5_2_BACKWARD_DATA_hidden_units;
*layers_dim_p = cuda5_2_BACKWARD_DATA_layers_dim;
*matrices_p = cuda5_2_BACKWARD_DATA_matrices;
*biases_p = cuda5_2_BACKWARD_DATA_biases;
*alpha_p = cuda5_2_BACKWARD_DATA_alpha;
*beta_p = cuda5_2_BACKWARD_DATA_beta;
*time_pred_p = cuda5_2_BACKWARD_DATA_time_pred;
*mask_p = cuda5_2_BACKWARD_DATA_mask;
} else if (conv_type == ConvolutionType::FORWARD && cuda_major == 5 &&
cuda_minor == 2) {
*layer_num_p = 4;
*hidden_units_p = cuda5_2_FORWARD_hidden_units;
*layers_dim_p = cuda5_2_FORWARD_layers_dim;
*matrices_p = cuda5_2_FORWARD_matrices;
*biases_p = cuda5_2_FORWARD_biases;
*alpha_p = cuda5_2_FORWARD_alpha;
*beta_p = cuda5_2_FORWARD_beta;
*time_pred_p = cuda5_2_FORWARD_time_pred;
*mask_p = cuda5_2_FORWARD_mask;
} else {
return false;
}
return true;
#endif
#if CUDNN_MAJOR == 5 && CUDNN_MINOR == 1
float cuda5_2_FORWARD_time_pred[9] = {0.0f};
float cuda5_2_FORWARD_mask[9] = {0.0f};
float cuda5_2_FORWARD_hidden_units[24] = {0.0f};
const static size_t cuda5_2_FORWARD_layers_dim[4] = {9, 12, 12, 9};
const static float cuda5_2_FORWARD_matrices[360] = {
3.087359e-03, -2.629997e-01, 9.492566e-02, 4.831330e-02,
4.493726e-02, -3.714851e-04, 8.981445e-02, -4.888808e-02,
-7.350665e-02, -7.113249e-01, 2.111573e-02, 6.259846e-02,
2.931650e-02, 1.313162e-01, 1.926165e-02, 3.785147e-01,
1.765169e-01, 6.096475e-02, 4.104461e-03, 8.656193e-03,
1.102456e-02, 7.944959e-03, 4.644261e-02, -5.927094e-01,
-6.180425e-01, -4.314502e-01, -4.073743e-01, 3.077646e-02,
-1.029431e-01, 5.112506e-02, -8.541957e-02, 2.589677e-02,
-5.164597e-02, 1.186986e-01, -4.672555e-02, -6.755380e-02,
-2.806628e-04, 1.056535e-02, -1.438679e-01, -1.122842e-01,
5.779694e-02, 1.705828e-01, 3.862250e-01, -1.106681e-01,
-5.471609e-02, -2.316525e-02, -4.610147e-02, 2.021985e-03,
-5.761939e-03, 1.209045e-01, -7.279532e-02, 9.754839e-02,
-6.032932e-02, -1.589997e-02, 1.985070e-03, 2.788936e-03,
-2.104690e-01, -2.731634e-01, 1.189841e-02, 2.144678e-01,
1.771111e-01, -3.730702e-01, -3.886393e-01, -4.719765e-06,
-2.289832e-22, 0.0, 0.0, -7.619362e-33,
0.0, 0.0, 0.0, 0.0,
1.652513e-02, 2.785243e-02, 6.713332e-02, 3.292293e-02,
-7.087571e-01, 2.954406e-01, 2.942279e-01, 2.148153e-01,
9.042904e-02, 3.337476e-02, 5.262762e-02, 1.355991e-01,
6.802084e-02, 3.188081e-01, 1.053071e+00, 5.648708e-01,
3.254285e-01, 3.829584e-01, -3.902937e-02, 8.569189e-04,
-6.860779e-03, -1.342737e-02, 9.002463e-04, 2.672171e-01,
1.833601e-02, -4.791870e-02, -4.673452e-01, -5.951233e-04,
1.327156e-02, 4.884608e-04, -6.395956e-04, -1.247312e-02,
2.616015e-03, 2.045540e-02, 1.826517e-02, 2.752957e-02,
4.864566e-03, 1.974226e-01, 8.022508e-02, 8.533795e-02,
7.867660e-02, 1.206522e-02, 1.408663e-01, 8.814420e-29,
2.803104e-02, -1.190598e-01, 4.397753e-01, 2.351956e-03,
2.934275e-02, 1.909389e-02, -1.119068e-01, -5.117084e-02,
6.178805e-03, -1.955722e-03, -4.881141e-02, 0.0,
-5.396824e-02, 1.768444e-02, -1.764243e-01, -1.029730e-02,
3.943393e-02, -1.397969e-02, 9.628724e-02, -4.312754e-02,
-1.602866e-01, -1.405657e-02, 1.331697e-01, 0.0,
-2.396953e-02, 1.866630e-02, 3.267511e-02, -6.928004e-03,
7.034376e-02, -6.569391e-02, -1.199368e-01, 2.414189e-02,
3.878685e-02, 1.612695e-02, -9.410737e-02, 2.452490e-33,
-3.085373e-02, 1.452446e-02, 5.175281e-02, -2.379139e-02,
-5.039049e-02, 1.873454e-02, 9.242059e-02, -1.805802e-02,
-4.347714e-02, -3.853900e-02, 1.008241e-01, 0.0,
-9.480388e-03, 2.023331e-02, -6.792901e-03, -8.394149e-03,
-7.546303e-02, 6.270129e-03, -3.894017e-01, -4.973264e-02,
-1.555514e-01, -1.105092e-02, -1.950841e-01, -1.148950e-25,
-2.661943e-02, 9.485362e-02, -4.270326e-01, 7.918665e-03,
-1.816450e-01, -4.379404e-02, -3.889270e-02, -1.432468e-02,
1.501353e-02, -3.272457e-02, -1.477906e-01, 0.0,
-1.104928e-01, 3.061369e-02, -1.783103e-01, -4.144012e-03,
-1.341517e-02, -8.905338e-02, -2.880624e-01, -4.843873e-02,
-8.718476e-02, -4.244976e-02, -5.811334e-02, 8.169911e-07,
3.018601e-01, -6.359625e-02, -6.384093e-02, -2.376516e-03,
1.381678e-01, 5.480919e-03, -1.754923e-02, 1.902135e-02,
1.838670e-01, 1.829514e-02, 9.986089e-01, 0.0,
-3.820317e-02, -8.010733e-02, 2.023727e-01, -8.899641e-03,
-6.265503e-02, 2.848809e-01, -6.972601e-02, 9.673467e-02,
-6.779978e-02, -1.749464e-02, -1.618047e-01, 0.0,
5.618134e-03, -7.931516e-02, -7.710180e-01, -5.023658e-03,
2.721053e-02, 2.372581e-03, 1.131147e-01, 3.923619e-02,
1.188756e-01, 6.569220e-02, 3.954504e-02, 4.407177e-06,
3.772899e-02, -7.408679e-02, 2.722764e-01, 9.289873e-03,
-1.720112e-08, -1.111527e-10, -3.223340e-33, 0.0,
0.0, 0.0, 0.0, 0.0,
-3.947499e-10, -1.125618e-07, 0.0, 0.0,
7.252669e-01, -2.573835e-02, -3.086479e-03, 1.373577e-02,
-2.595616e-02, -1.071919e-01, -1.039699e-01, 4.686809e-01,
6.939601e-01, 5.092673e-02, 8.983605e-01, 7.748492e-12,
7.637465e-01, -5.160391e-02, 4.367014e-03, 5.456513e-03,
-1.755392e-02, -1.141231e-01, -9.624086e-02, 4.324957e-01,
7.202701e-01, 5.805269e-02, 8.917692e-01, 5.552060e-13,
6.970178e-01, -1.570065e-01, 3.382218e-02, -2.513156e-02,
-1.520863e-02, -1.164639e-01, -1.687423e-01, 4.522114e-01,
5.808989e-01, 5.248518e-02, 8.544105e-01, 9.402750e-15,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
4.326442e-01, -5.917080e-02, 2.801385e-01, -2.795843e-02,
1.264143e-02, -3.693263e-01, -1.749216e-01, 2.439530e-01,
5.274415e-01, 6.522411e-01, 2.642505e-01, -1.186306e-22,
4.592337e-01, -3.818674e-02, 1.983223e-02, -3.099717e-02,
3.941813e-02, -5.257453e-01, -3.692166e-02, 2.670639e-01,
6.403314e-01, 5.740828e-01, 2.307071e-01, -6.111520e-19,
5.923415e-01, -1.620244e-01, -6.315269e-03, 1.360147e-01,
3.776298e-02, -2.748910e-01, -9.679949e-02, 3.612375e-01,
6.582589e-01, 1.544350e-01, 8.423274e-01, 0.0,
4.770435e-01, -3.441220e-02, 7.110235e-02, 1.750984e-01,
-1.088923e-01, -3.269669e-01, -3.097497e-01, 3.498318e-01,
6.162855e-01, 5.070065e-01, 4.478149e-01, 0.0,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0};
const static float cuda5_2_FORWARD_biases[33] = {
2.354680e-01, 4.575782e-01, 6.988282e-01, 2.040031e-01,
8.584012e-01, 8.249553e-02, 1.267146e+00, 0.0,
3.549752e-01, -4.857582e-01, 1.279055e+00, 6.212520e-03,
1.735605e+00, 1.737882e-01, 9.513135e-02, 1.042232e-01,
2.587379e-02, 1.125817e-01, 4.899196e-01, 8.571400e-01,
1.188120e+00, 1.079335e+00, 1.945481e+00, 0.0,
-4.535237e-01, -4.646283e-01, -2.796752e-01, 0.0,
-1.881813e-01, 6.431429e-02, 1.600823e-01, 3.773381e-01,
0.0};
const static float cuda5_2_FORWARD_alpha[9] = {
2.371974e+08, 3.625653e+08, 1.961586e+08, 2.000000e+08,
2.259449e+07, 1.865459e+07, 6.657476e+08, 2.487226e+08,
2.000000e+08};
const static float cuda5_2_FORWARD_beta[9] = {
1.575003e+00, 1.656241e+00, 1.577959e+00, 2.000000e+00,
2.396584e+00, 2.221534e+00, 1.692119e+00, 1.879424e+00,
2.000000e+00};
float cuda5_2_BACKWARD_DATA_time_pred[6] = {0.0f};
float cuda5_2_BACKWARD_DATA_mask[6] = {0.0f};
float cuda5_2_BACKWARD_DATA_hidden_units[24] = {0.0f};
const static size_t cuda5_2_BACKWARD_DATA_layers_dim[4] = {9, 12, 12, 6};
const static float cuda5_2_BACKWARD_DATA_matrices[324] = {
8.340252e-04, -7.066309e-02, 6.012942e-03, -8.961015e-04,
5.308781e-02, 8.890389e-03, -1.695608e-02, -2.008141e-01,
-2.327795e-01, 1.816323e-03, 1.741969e-03, -4.547063e-01,
-3.278293e-01, 3.194534e-03, 5.590135e-01, 5.038606e-01,
-6.899682e-01, -6.846661e-01, 1.296691e-02, 9.286657e-03,
6.076815e-02, 9.537656e-03, -1.845960e-01, 2.334390e-01,
6.584574e-02, -1.502425e-01, -1.464556e-01, 2.582188e-02,
-2.801069e-01, 2.606461e-01, 5.094615e-02, 9.973006e-03,
-2.273075e-01, 1.013311e-01, -2.977537e-01, -3.584019e-01,
1.550467e-02, -2.365348e-02, -2.361028e-01, -4.535604e-01,
-1.099842e-01, 3.337491e-02, 3.386805e-02, 5.759778e-02,
5.773445e-02, -6.057084e-03, -5.215100e-03, -2.488342e-02,
4.550520e-01, -6.358563e-03, -4.111410e-01, -2.748287e-01,
6.576765e-01, 6.735925e-01, 1.382121e-02, 1.599379e-02,
2.175570e-01, 4.235858e-01, -4.743209e-03, 8.406488e-01,
5.463328e-01, 5.315352e-01, 5.759005e-01, -3.956826e-01,
1.770215e-03, 4.242290e-03, 5.961310e-03, 2.629623e-03,
3.968062e-01, 2.857247e-01, -3.694852e-01, -4.826791e-01,
-1.361759e-01, 1.741970e-02, 2.067235e-01, -3.166322e-02,
1.676094e-02, 1.222352e-01, 3.594849e-01, 5.646787e-02,
9.237770e-02, 2.705673e-02, 3.022863e-02, 2.661669e-01,
1.342065e-01, 9.685011e-02, -4.619106e-01, -4.885407e-01,
-1.207667e-01, -3.344076e-02, 1.247313e-03, 9.397045e-04,
3.326222e-03, 2.384325e-03, -5.191239e-01, 3.588830e-01,
5.642326e-01, -2.458584e-01, -6.050721e-01, -5.983715e-04,
-3.112906e-04, -8.002273e-02, 2.754113e-01, 1.347607e-01,
2.869407e-01, 3.228108e-01, 2.589051e-01, 2.689373e-01,
2.097373e-03, -1.213292e-03, 2.289704e-02, 2.260412e-02,
-4.001153e-03, -3.886382e-02, 1.744227e-02, 1.228004e-03,
5.637321e-02, 5.326664e-03, 5.775909e-02, -7.129682e-02,
2.957929e-02, -3.619472e-02, -7.687800e-02, 2.551496e-01,
2.791522e-02, -1.290575e-01, 7.948833e-02, 9.349618e-02,
4.568452e-03, -2.620651e-01, 9.037835e-03, 1.652229e-01,
-1.035363e-02, -4.924298e-01, -1.359403e-01, -2.509044e-02,
6.072426e-02, -1.067680e-01, 9.075266e-02, -5.669300e-01,
-5.016208e-02, -4.982992e-03, -4.493951e-01, 2.403491e-02,
-5.795595e-03, 8.214971e-02, 1.994753e-03, 2.271867e-03,
8.008438e-03, -1.517102e-01, -2.790549e-02, 7.735109e-02,
-1.794875e-02, 1.122736e-02, -4.320173e-02, -9.230874e-03,
-4.703557e-02, -3.043727e-02, -1.645634e-01, -6.124438e-02,
2.416326e-01, -2.548371e-01, 2.711228e-01, 2.171408e-01,
-1.613229e-02, -1.133995e-01, -5.881115e-01, 1.196182e-01,
-1.574013e-02, -2.309249e-02, -9.163861e-02, -1.243609e-03,
2.755058e-03, -8.981592e-02, 4.023712e-02, 1.447185e-01,
1.773491e-02, -4.728686e-02, 4.132702e-02, 4.325303e-02,
9.868489e-02, -2.594438e-01, 1.111406e-02, 5.278649e-02,
-5.842348e-02, 7.532353e-02, -3.890866e-02, 7.389170e-03,
-8.200553e-02, -2.977651e-04, 2.846818e-01, -2.641009e-02,
-3.923972e-06, 1.683590e-06, 4.231356e-06, -1.460619e-05,
1.480699e-05, -4.800242e-05, -3.605007e-05, 4.642337e-06,
-1.237117e-05, -6.065346e-05, 1.122525e-07, -4.718931e-05,
-4.836941e-02, 2.925190e-02, 5.125062e-02, -8.673830e-02,
4.049347e-02, -1.281789e-01, 4.054615e-02, -1.102404e-01,
1.797214e-02, 8.068577e-03, 9.849558e-02, 2.462221e-02,
-3.952334e-02, 7.078841e-02, 5.095275e-03, -5.172743e-03,
1.358633e-01, -4.528875e-01, 4.454420e-01, -5.941349e-01,
-8.203693e-02, -2.733144e-01, -4.668098e-01, 2.087940e-01,
2.732850e-01, 1.967585e-01, -1.648116e-02, -4.675763e-02,
-2.471467e-02, -3.507713e-02, 1.268763e-01, -4.777270e-04,
-6.884494e-02, -4.142293e-02, 4.568305e-01, -1.171813e-01,
4.104385e-02, 4.123072e-01, 1.201161e-01, 5.688429e-02,
-6.769225e-02, 1.879334e-01, -1.869847e-01, 2.116578e-01,
1.023851e-01, -7.956885e-03, 3.125194e-02, -3.698255e-02,
-1.742767e-02, 8.019327e-02, -2.414790e-01, 1.692867e-01,
-1.363161e-01, -2.991336e-02, 1.571377e-01, -4.675832e-05,
3.410926e-02, -2.423313e-02, 3.784683e-01, 8.980562e-01,
1.445573e-02, 3.742977e-01, -1.449231e-01, 5.250753e-02,
-9.320556e-02, 1.881413e-01, 1.525415e-01, 1.516415e-05,
-2.865472e-02, -4.090607e-02, 1.368707e-01, 1.152067e+00,
3.926153e-02, 3.892255e-01, -3.988812e-01, 2.768721e-01,
1.682807e-01, -8.165011e-02, 2.984257e-01, -2.310482e-05,
-1.301168e-01, -3.295192e-01, 1.955211e-01, 6.782165e-01,
-1.859493e-02, 5.047321e-01, -3.545281e-01, 6.802614e-01,
-2.701511e-02, 5.938844e-02, 1.288360e-01, 6.412582e-05,
6.354152e-02, -2.929806e-01, 1.172161e-01, 5.812020e-01,
-1.526828e-03, 4.311178e-01, -1.572772e-01, 3.847064e-01,
-1.406437e-01, -8.771673e-02, 1.723672e-01, -2.926565e-05,
1.170990e-01, -1.168602e-01, 2.353766e-01, 8.977429e-01,
1.029375e-02, 4.529134e-01, -3.884215e-01, 2.041353e-01,
-2.684749e-02, 9.474846e-02, 1.718571e-01, 9.999280e-06,
-9.272413e-02, -1.050809e-01, 2.637663e-01, 6.296775e-01};
const static float cuda5_2_BACKWARD_DATA_biases[30] = {
2.758991e-01, 9.040871e-01, 6.578859e-01, 3.464146e-01,
-1.074793e-01, -1.111640e+00, -4.436951e-03, 1.027522e+00,
5.782945e-02, -6.986979e-02, 1.183250e+00, -9.289587e-02,
2.339573e-03, 2.321955e-01, 6.579675e-01, 9.597613e-01,
4.900812e-02, 1.206250e-01, 1.320550e-01, 1.839768e-17,
1.678722e-01, -3.203184e-03, 7.736452e-01, 2.727852e+00,
1.589646e-01, -3.824490e-02, 5.180550e-01, 7.756407e-01,
4.521459e-01, 4.122442e-01};
const static float cuda5_2_BACKWARD_DATA_alpha[6] = {
1.933176e+08, 4.558126e+08, 6.040167e+07, 4.608431e+07,
6.338093e+08, 3.281159e+08};
const static float cuda5_2_BACKWARD_DATA_beta[6] = {
1.608048e+00, 1.659768e+00, 1.943038e+00, 1.953083e+00,
1.738348e+00, 1.891296e+00};
float cuda5_2_BACKWARD_FILTER_time_pred[6] = {0.0f};
float cuda5_2_BACKWARD_FILTER_mask[6] = {0.0f};
float cuda5_2_BACKWARD_FILTER_hidden_units[24] = {0.0f};
const static size_t cuda5_2_BACKWARD_FILTER_layers_dim[4] = {9, 12, 12, 6};
const static float cuda5_2_BACKWARD_FILTER_matrices[324] = {
4.047185e-03, 3.388695e-04, 1.210363e-04, -6.148457e-06,
-3.252271e-03, 8.122424e-04, 1.075851e-03, 3.066259e-03,
1.921126e-03, -1.042791e-04, -3.275821e-01, 4.278608e-03,
-2.106100e-01, 8.295547e-02, 2.430674e-01, -2.748593e-02,
-2.065240e-02, -1.395731e-02, -3.491511e-02, 3.520847e-03,
1.790237e-02, 1.188376e-02, 5.372314e-02, 1.494784e-02,
5.035055e-02, 6.581915e-02, 6.861494e-02, -2.199881e-03,
-2.281682e-02, -9.687833e-02, 3.909182e-03, 1.024575e-01,
3.948949e-02, -4.566963e-02, -1.375550e-01, -6.794923e-02,
6.135985e-04, -4.608163e-01, 2.404660e-01, 6.274750e-03,
1.059302e-01, 1.676516e-01, -5.104349e-02, 9.925397e-02,
-1.470984e-02, 1.031084e-04, 4.374801e-02, -5.167035e-01,
-3.632444e-01, 8.170792e-02, 3.783056e-01, 3.212413e-01,
-4.803373e-01, -4.874209e-01, 2.615676e-04, 3.406848e-02,
8.674651e-02, 3.508870e-03, -6.156053e-01, 3.270718e-01,
3.457363e-01, 1.898023e-01, -1.473479e-01, -2.987293e-01,
1.315816e-03, -5.991638e-03, 1.428707e-03, 1.580944e-03,
6.320467e-01, 2.342933e-01, -7.387988e-01, -4.437208e-01,
-7.261886e-02, 5.008508e-03, 4.693171e-02, -5.879956e-02,
1.677305e-02, 1.845511e-01, 3.830231e-01, 4.003870e-02,
9.888364e-02, 7.434040e-04, 7.895462e-02, 2.310843e-01,
1.044731e-02, 1.716935e-01, 1.390186e-01, -3.862206e-01,
-1.001334e-01, 1.338546e-02, -1.354914e-02, 5.464492e-02,
3.437773e-03, -2.069449e-03, -3.513253e-02, 1.837639e-02,
-1.552736e-01, -1.349904e-02, -1.025307e-01, -4.804826e-06,
3.284197e-02, 5.086832e-02, 5.690669e-03, 7.154379e-02,
1.094594e+00, 1.068281e+00, 3.653902e-01, 3.107198e-01,
-7.299128e-03, -3.042033e-04, 5.593516e-03, 3.541658e-03,
5.810616e-04, 8.030201e-03, -1.622678e-02, 1.400076e-04,
2.819623e-03, 4.108455e-03, 5.561182e-03, 2.512096e-03,
-8.622734e-04, 5.333219e-02, 3.076694e-02, 1.795766e-01,
-2.318845e-02, -3.202521e-02, 3.119619e-01, -1.606582e-01,
-1.085588e-01, -9.067213e-02, -1.422861e-02, -3.444208e-02,
-1.635176e-04, -2.596654e-01, 1.995525e-02, 2.055750e-02,
2.022944e-01, 4.327365e-01, -1.619481e-02, 1.125397e-01,
7.984060e-03, -2.073076e-01, -1.761664e-02, -4.832107e-02,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
1.638518e-03, -1.793951e-02, 5.772194e-02, 2.851987e-02,
6.163087e-02, 2.867437e-02, 5.545961e-02, -1.660824e-01,
9.789789e-02, -1.159482e-01, 5.385513e-02, 6.836513e-02,
5.594874e-04, -2.741018e-02, 4.838353e-02, 4.298405e-02,
1.854298e-02, 3.633871e-02, 9.942706e-03, 3.490340e-01,
8.440907e-02, 2.376168e-02, 4.866724e-02, -2.214078e-01,
-5.650432e-03, -8.008064e-04, 1.477945e-03, 9.983850e-04,
2.346494e-04, 2.069148e-03, -4.035380e-03, -5.895875e-03,
-2.146410e-04, 8.988932e-04, 7.378523e-05, -3.107871e-05,
3.014900e-03, -2.577113e-01, 8.653076e-03, -2.681585e-02,
-5.089819e-02, -2.550743e-02, -3.467115e-02, 3.631677e-01,
-5.167207e-02, 9.202915e-02, -2.041105e-02, -1.355488e-01,
-4.411176e-03, 1.459578e-01, -1.287185e-02, -5.766148e-03,
-1.725510e-01, 1.716040e-01, -1.324064e-01, -1.831788e-01,
-4.434610e-02, -7.823753e-02, -2.463202e-02, 2.183346e-02,
5.483676e-04, -7.481821e-02, -8.179377e-03, -3.340281e-02,
-2.679154e-03, -3.484565e-02, -4.761697e-02, -7.778479e-01,
-9.353197e-02, -1.011887e-01, -3.653892e-02, 3.624209e-01,
-2.063141e-03, -1.785554e-03, 5.357111e-02, -4.105966e-02,
4.269572e-02, -1.532830e-01, 2.175374e-02, 1.304753e-01,
5.400207e-02, -4.020891e-02, -2.284152e-02, 1.153921e-01,
2.909448e-03, -1.312913e-02, -1.562593e-01, -1.018874e-01,
7.121818e-03, -1.468466e-01, 3.900497e-03, -2.249627e-02,
-5.684932e-02, 2.612863e-02, -1.410081e-01, 2.298795e-02,
6.794739e-04, 7.064358e-01, 7.429705e-01, 0.0,
3.578874e-01, 7.372183e-01, -2.632545e-04, -1.001730e-01,
4.224807e-01, -1.673518e-01, 9.987204e-04, -7.437595e-02,
4.765817e-05, 5.283366e-01, 5.804700e-01, 0.0,
5.304079e-01, 8.826514e-01, 1.191588e-04, -2.403303e-02,
8.384521e-02, -1.913135e-01, -2.046290e-04, -4.905949e-02,
-5.695952e-03, 4.907159e-01, 8.468218e-01, 0.0,
3.835697e-01, 4.161280e-01, -1.292199e-03, 2.594048e-01,
4.049456e-01, -4.400651e-01, 4.166223e-01, -1.978285e-01,
2.546945e-04, 5.691357e-01, 7.418897e-01, 0.0,
5.059269e-01, 8.695604e-01, -4.737849e-05, -1.666739e-02,
1.190503e-01, -1.528916e-01, -1.769190e-04, -4.045478e-02,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
3.728615e-02, 3.964641e-01, 6.608990e-01, 0.0,
6.230336e-01, 5.074117e-01, 8.405939e-03, -1.422498e-01,
4.705996e-02, -2.407855e-01, -2.056813e-02, 2.624028e-01};
const static float cuda5_2_BACKWARD_FILTER_biases[30] = {
2.526327e-03, 1.731556e-02, 7.836947e-02, 6.594411e-02,
1.693102e-01, 7.814206e-01, 6.354987e-01, 9.766987e-01,
2.128775e-01, -4.894586e-01, -3.741650e-02, -1.046441e-01,
-2.802074e-02, 1.076976e+00, 1.484343e+00, 0.0,
1.488592e+00, 2.316875e+00, -5.133961e-03, 3.100583e-01,
6.346995e-01, 3.810246e-01, -2.523698e-01, 3.231826e-01,
-4.245956e-01, -4.564983e-01, 4.500998e-03, -5.841292e-01,
0.0, -1.728347e-01};
const static float cuda5_2_BACKWARD_FILTER_alpha[6] = {
2.066506e+08, 2.177061e+08, 5.654493e+07, 2.368001e+08,
2.000000e+08, 2.537848e+08};
const static float cuda5_2_BACKWARD_FILTER_beta[6] = {
1.610186e+00, 1.844894e+00, 1.895551e+00, 1.816587e+00,
2.000000e+00, 2.252824e+00};
if (conv_type == ConvolutionType::FORWARD && cuda_major == 5 &&
cuda_minor == 2) {
*layer_num_p = 4;
*hidden_units_p = cuda5_2_FORWARD_hidden_units;
*layers_dim_p = cuda5_2_FORWARD_layers_dim;
*matrices_p = cuda5_2_FORWARD_matrices;
*biases_p = cuda5_2_FORWARD_biases;
*alpha_p = cuda5_2_FORWARD_alpha;
*beta_p = cuda5_2_FORWARD_beta;
*time_pred_p = cuda5_2_FORWARD_time_pred;
*mask_p = cuda5_2_FORWARD_mask;
} else if (conv_type == ConvolutionType::BACKWARD_DATA && cuda_major == 5 &&
cuda_minor == 2) {
*layer_num_p = 4;
*hidden_units_p = cuda5_2_BACKWARD_DATA_hidden_units;
*layers_dim_p = cuda5_2_BACKWARD_DATA_layers_dim;
*matrices_p = cuda5_2_BACKWARD_DATA_matrices;
*biases_p = cuda5_2_BACKWARD_DATA_biases;
*alpha_p = cuda5_2_BACKWARD_DATA_alpha;
*beta_p = cuda5_2_BACKWARD_DATA_beta;
*time_pred_p = cuda5_2_BACKWARD_DATA_time_pred;
*mask_p = cuda5_2_BACKWARD_DATA_mask;
} else if (conv_type == ConvolutionType::BACKWARD_FILTER && cuda_major == 5 &&
cuda_minor == 2) {
*layer_num_p = 4;
*hidden_units_p = cuda5_2_BACKWARD_FILTER_hidden_units;
*layers_dim_p = cuda5_2_BACKWARD_FILTER_layers_dim;
*matrices_p = cuda5_2_BACKWARD_FILTER_matrices;
*biases_p = cuda5_2_BACKWARD_FILTER_biases;
*alpha_p = cuda5_2_BACKWARD_FILTER_alpha;
*beta_p = cuda5_2_BACKWARD_FILTER_beta;
*time_pred_p = cuda5_2_BACKWARD_FILTER_time_pred;
*mask_p = cuda5_2_BACKWARD_FILTER_mask;
} else {
return false;
}
return true;
#endif

return false;
}

// vim: syntax=cpp.doxygen

+ 0
- 1
dnn/src/x86/conv_bias/f32/winograd_algo.cpp View File

@@ -16,7 +16,6 @@
#include "src/x86/conv_bias/opr_impl.h"
#include "src/x86/conv_bias/postprocess_helper.h"
#include "src/x86/handle.h"
#include "src/x86/profile.h"

#include "midout.h"



Loading…
Cancel
Save