GitOrigin-RevId: 323bf6073a
release-1.2
@@ -6,12 +6,13 @@ | |||||
* | * | ||||
* Unless required by applicable law or agreed to in writing, | * Unless required by applicable law or agreed to in writing, | ||||
* software distributed under the License is distributed on an | * software distributed under the License is distributed on an | ||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
* implied. | |||||
*/ | */ | ||||
#include "src/cuda/conv_bias/opr_impl.h" | |||||
#include "megdnn/dtype.h" | #include "megdnn/dtype.h" | ||||
#include "src/cuda/conv_bias/helper.h" | |||||
#include "src/cuda/conv_bias/algo.h" | #include "src/cuda/conv_bias/algo.h" | ||||
#include "src/cuda/conv_bias/helper.h" | |||||
#include "src/cuda/conv_bias/opr_impl.h" | |||||
#include "src/cuda/handle.h" | #include "src/cuda/handle.h" | ||||
#include "src/cuda/utils.h" | #include "src/cuda/utils.h" | ||||
@@ -124,6 +125,44 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic( | |||||
return nullptr; | return nullptr; | ||||
}; | }; | ||||
const bool is_chanwise = | |||||
(args.filter_meta.format == Param::Format::NCHW && | |||||
args.filter_meta.group == src[1]) || | |||||
(args.filter_meta.format == Param::Format::NCHW4 && | |||||
args.filter_meta.group == src[1] * 4) || | |||||
(args.filter_meta.format == Param::Format::NCHW32 && | |||||
args.filter_meta.group == src[1] * 32); | |||||
// prefer special chanwise impl since as the group conv of cudnn | |||||
// whose version is lower than v7.5.0 is still slower than our | |||||
// implementation in many channel-wise cases | |||||
const bool slow_cudnn_chanwise_impl = | |||||
CUDNN_MAJOR < 7 || (CUDNN_MAJOR == 7 && CUDNN_MINOR < 5); | |||||
//! choose CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM default for large image | |||||
const int hw_size = src[2] * src[3]; | |||||
//! choose dnn when stride != 1, may need calibrate for different cudnn | |||||
//! version | |||||
const bool prefer_dnn_chanwise = | |||||
slow_cudnn_chanwise_impl || args.filter_meta.stride[0] != 1 || | |||||
args.filter_meta.stride[1] != 1 || hw_size < 512; | |||||
//! avoid bad case in cudnn, check dnn chanwise impl first | |||||
if (is_chanwise) { | |||||
if (prefer_dnn_chanwise) { | |||||
if (sm_algo_pack.chanwise.is_available_reproducible( | |||||
args, reproducible, workspace_limit_in_bytes)) | |||||
return &sm_algo_pack.chanwise; | |||||
if (sm_algo_pack.chanwise8x8x32.is_available_reproducible( | |||||
args, reproducible, workspace_limit_in_bytes)) | |||||
return &sm_algo_pack.chanwise8x8x32; | |||||
} else { | |||||
conv_args.dst_layout = &dst_layout; | |||||
if (is_cudnn_supported(conv_args)) { | |||||
if (auto algo = get_cudnn_algo(cudnn_conv_from_enum_wrapper)) { | |||||
return algo; | |||||
} | |||||
} | |||||
} | |||||
} | |||||
//! Prefer CUDNN CONVBIAS. | //! Prefer CUDNN CONVBIAS. | ||||
bool cudnn_conv_bias_act_supported = false; | bool cudnn_conv_bias_act_supported = false; | ||||
for (auto&& algo : sm_algo_pack.cudnn_conv_bias_activations) { | for (auto&& algo : sm_algo_pack.cudnn_conv_bias_activations) { | ||||
@@ -139,22 +178,10 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic( | |||||
return algo; | return algo; | ||||
} | } | ||||
if (args.filter_meta.group > 1) { | |||||
#if CUDNN_MAJOR < 7 || (CUDNN_MAJOR == 7 && CUDNN_MINOR < 5) | |||||
// prefer special chanwise impl since as the group conv of cudnn whose | |||||
// version is lower than v7.5.0 is still slower than our implementation | |||||
// in many channel-wise cases | |||||
if (sm_algo_pack.chanwise.is_available_reproducible( | |||||
args, reproducible, workspace_limit_in_bytes)) | |||||
return &sm_algo_pack.chanwise; | |||||
if (sm_algo_pack.chanwise8x8x32.is_available_reproducible( | |||||
args, reproducible, workspace_limit_in_bytes)) | |||||
return &sm_algo_pack.chanwise8x8x32; | |||||
#endif | |||||
} | |||||
if (auto algo = get_1x1_algo(args)) { | |||||
return algo; | |||||
int batch = src[0]; | |||||
if (batch == 1 && sm_algo_pack.a1x1.is_available_reproducible( | |||||
args, reproducible, workspace_limit_in_bytes)) { | |||||
return &sm_algo_pack.a1x1; | |||||
} | } | ||||
// modify conv_args dst_layout | // modify conv_args dst_layout | ||||
@@ -179,6 +206,10 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic( | |||||
conv_args = orig_args; | conv_args = orig_args; | ||||
} | } | ||||
if (auto algo = get_1x1_algo(args)) { | |||||
return algo; | |||||
} | |||||
if (args.src_layout->dtype.enumv() != DTypeTrait<dtype::BFloat16>::enumv) { | if (args.src_layout->dtype.enumv() != DTypeTrait<dtype::BFloat16>::enumv) { | ||||
if (reproducible) { | if (reproducible) { | ||||
return megdnn::get_reproducible_algo<ConvBiasForwardImpl>( | return megdnn::get_reproducible_algo<ConvBiasForwardImpl>( | ||||
@@ -839,6 +839,88 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT_SMALL) { | |||||
} | } | ||||
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_CUDNN_DNN) { | |||||
CUBenchmarker<ConvBiasForward> bencher(handle_cuda()); | |||||
size_t RUNS = 1; | |||||
bencher.set_display(false).set_times(RUNS); | |||||
ConvBias::Param param; | |||||
param.format = ConvBias::Param::Format::NCHW; | |||||
param.sparse = ConvBias::Param::Sparse::GROUP; | |||||
NormalRNG rng; | |||||
auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, | |||||
size_t s) { | |||||
param.pad_h = f / 2; | |||||
param.pad_w = f / 2; | |||||
param.stride_h = s; | |||||
param.stride_w = s; | |||||
param.compute_mode = param::ConvBias::ComputeMode::DEFAULT; | |||||
TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f}, | |||||
bias = {1, c, 1, 1}; | |||||
TensorLayout dst_layout; | |||||
auto opr = handle_cuda()->create_operator<ConvBias>(); | |||||
opr->param() = param; | |||||
opr->deduce_layout({src, dtype::Float32()}, {filter, dtype::Float32()}, | |||||
{bias, dtype::Float32()}, {}, dst_layout); | |||||
float computation_mops = | |||||
static_cast<float>(dst_layout.total_nr_elems() * f * f * 2) * | |||||
1e-6; | |||||
bencher.set_param(param) | |||||
.set_dtype(0, dtype::Float32()) | |||||
.set_dtype(1, dtype::Float32()) | |||||
.set_dtype(2, dtype::Float32()) | |||||
.set_rng(0, &rng) | |||||
.set_rng(1, &rng); | |||||
bencher.set_before_exec_callback( | |||||
AlgoChecker<ConvBiasForward>(".+CHANNEL_WISE.+")); | |||||
auto time_in_ms_dnn = bencher.execs({src, filter, bias, {}, {}}) / RUNS; | |||||
bencher.set_param(param) | |||||
.set_dtype(0, dtype::Float32()) | |||||
.set_dtype(1, dtype::Float32()) | |||||
.set_dtype(2, dtype::Float32()) | |||||
.set_rng(0, &rng) | |||||
.set_rng(1, &rng); | |||||
bencher.set_before_exec_callback(AlgoChecker<ConvBiasForward>( | |||||
".+CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM.+")); | |||||
auto time_in_ms_cudnn = | |||||
bencher.execs({src, filter, bias, {}, {}}) / RUNS; | |||||
printf("stride=%zu src=%s, filter=%s, dst=%s, dnn: %.2fms %.2fGB/s " | |||||
"cudnn: %.2fms %.2fGB/s " | |||||
"speedup: " | |||||
"%0.2f (dnn/cudnn)\n", | |||||
s, src.to_string().c_str(), filter.to_string().c_str(), | |||||
dst_layout.to_string().c_str(), time_in_ms_dnn, | |||||
computation_mops / time_in_ms_dnn, time_in_ms_cudnn, | |||||
computation_mops / time_in_ms_cudnn, | |||||
time_in_ms_cudnn / time_in_ms_dnn); | |||||
}; | |||||
// clang-format off | |||||
for(size_t batch:{1, 16, 32, 64, 128}){ | |||||
run(batch, 32, 112, 112, 3, 1); | |||||
run(batch, 96, 112, 112, 3, 2); | |||||
run(batch, 96, 112, 112, 3, 1); | |||||
run(batch, 144, 56, 56, 3, 2); | |||||
run(batch, 144, 56, 56, 3, 1); | |||||
run(batch, 192, 28, 28, 3, 1); | |||||
run(batch, 384, 14, 14, 3, 1); | |||||
run(batch, 576, 14, 14, 3, 1); | |||||
run(batch, 960, 7, 7, 3, 1); | |||||
//! calibrate heu algo policy hw_size param | |||||
run(batch, 144, 24, 24, 3, 1); | |||||
run(batch, 144, 22, 22, 3, 1); | |||||
run(batch, 144, 20, 20, 3, 1); | |||||
run(batch, 144, 18, 18, 3, 1); | |||||
} | |||||
// clang-format on | |||||
} | |||||
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_DATA_FLOAT_SMALL) { | TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_DATA_FLOAT_SMALL) { | ||||
CUBenchmarker<ConvolutionBackwardData> bencher(handle_cuda()); | CUBenchmarker<ConvolutionBackwardData> bencher(handle_cuda()); | ||||
size_t RUNS = 1; | size_t RUNS = 1; | ||||