@@ -1,113 +0,0 @@ | |||||
/** | |||||
* \file dnn/src/cuda/conv_bias/1x1.cpp | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/ | |||||
#include "src/common/conv_bias.h" | |||||
#include "src/cuda/conv_bias/algo.h" | |||||
#include "src/cuda/handle.h" | |||||
#include "src/cuda/utils.cuh" | |||||
using namespace megdnn; | |||||
using namespace cuda; | |||||
using namespace conv_bias; | |||||
bool ConvBiasForwardImpl::Algo1x1::is_available(const SizeArgs& args) const { | |||||
if (args.z_layout->ndim > 0) | |||||
return false; | |||||
auto&& fm = args.filter_meta; | |||||
return fm.format == Param::Format::NCHW && | |||||
(fm.dtype.enumv() == DTypeEnum::Float32 || | |||||
fm.dtype.enumv() == DTypeEnum::Float16) && | |||||
fm.spatial_ndim == 2 && fm.group == 1 && fm.dilation[0] == 1 && | |||||
fm.dilation[1] == 1 && fm.spatial[0] == 1 && fm.spatial[1] == 1 && | |||||
fm.padding[0] == 0 && fm.padding[1] == 0 && fm.stride[0] == 1 && | |||||
fm.stride[1] == 1; | |||||
} | |||||
void ConvBiasForwardImpl::Algo1x1::extract_matmul_layouts(const SizeArgs& args, | |||||
TensorLayout& A, | |||||
TensorLayout& B, | |||||
TensorLayout& C) { | |||||
auto&& fm = args.filter_meta; | |||||
A = {{fm.ocpg, fm.icpg}, fm.dtype}; | |||||
B.ndim = 2; | |||||
B.shape[0] = args.src_layout->shape[1]; | |||||
B.shape[1] = args.src_layout->shape[2] * args.src_layout->shape[3]; | |||||
B.stride[0] = args.src_layout->stride[1]; | |||||
B.stride[1] = 1; | |||||
B.dtype = args.src_layout->dtype; | |||||
C = {{args.dst_layout->shape[1], B.shape[1]}, args.dst_layout->dtype}; | |||||
} | |||||
WorkspaceBundle ConvBiasForwardImpl::Algo1x1::get_workspace_bundle(
        void* ptr, const SizeArgs& args) const {
    SmallVector<size_t> sizes;
    auto dst_layout = *args.dst_layout;
    //! when bias dtype differs from dst dtype, the conv result is first
    //! produced in the deduced conv dtype and needs a buffer of its own
    if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) {
        dst_layout.dtype = DType();
        args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype,
                                            args.filter_layout->dtype,
                                            dst_layout.dtype);
        sizes.push_back(dst_layout.span().dist_byte());
    }
    SizeArgs conv_args = args;
    conv_args.dst_layout = &dst_layout;
    TensorLayout A, B, C;
    extract_matmul_layouts(conv_args, A, B, C);
    //! matmul workspace must occupy slot 0; the optional conversion buffer
    //! (if pushed above) ends up in slot 1
    size_t matmul_workspace =
            args.handle->matmul_opr()->get_workspace_in_bytes(A, B, C);
    sizes.insert(sizes.begin(), matmul_workspace);
    return {ptr, std::move(sizes)};
}
size_t ConvBiasForwardImpl::Algo1x1::get_workspace_in_bytes(
        const SizeArgs& args) const {
    //! total of the matmul workspace plus the optional dtype-conversion buffer
    auto bundle = get_workspace_bundle(nullptr, args);
    return bundle.total_size_in_bytes();
}
void ConvBiasForwardImpl::Algo1x1::exec(const ExecArgs& args) const {
    auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args);
    //! if bias dtype differs from dst dtype, run the convolution into a
    //! temporary tensor (workspace slot 1) with the deduced conv dtype;
    //! handle_bias_and_nonlinear below writes the final result into dst
    auto conv_dst_tensor = *args.dst_tensor;
    if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) {
        conv_dst_tensor.raw_ptr = bundle.get(1);
        conv_dst_tensor.layout.dtype = DType();
        args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype,
                                            args.filter_layout->dtype,
                                            conv_dst_tensor.layout.dtype);
    }
    ExecArgs conv_args = args;
    conv_args.dst_tensor = &conv_dst_tensor;
    conv_args.dst_layout = &conv_dst_tensor.layout;
    {
        //! one GEMM per batch image: A = (OC, IC) filter, B = (IC, H*W)
        //! input slice, C = (OC, H*W) output slice; B/C pointers advance by
        //! the batch stride converted to bytes after each iteration
        TensorND A, B, C;
        extract_matmul_layouts(conv_args, A.layout, B.layout, C.layout);
        A.raw_ptr = conv_args.filter_tensor->raw_ptr;
        B.raw_ptr = conv_args.src_tensor->raw_ptr;
        C.raw_ptr = conv_args.dst_tensor->raw_ptr;
        size_t batch = conv_args.src_layout->shape[0];
        auto mm = conv_args.handle->matmul_opr();
        //! per-batch pointer step in bytes (element stride * element size)
        auto strd_B = conv_args.src_layout->stride[0] *
                              conv_args.src_layout->dtype.size(),
             strd_C = conv_args.dst_layout->stride[0] *
                              conv_args.dst_layout->dtype.size();
        for (size_t i = 0; i < batch; ++i) {
            //! matmul workspace lives in slot 0 of the bundle
            mm->exec(A, B, C, bundle.get_workspace(0));
            incr_voidp(B.raw_ptr, strd_B);
            incr_voidp(C.raw_ptr, strd_C);
        }
    }
    //! epilogue: add bias and apply the nonlinearity, storing into
    //! args.dst_tensor (converting from conv_dst_tensor's dtype if needed)
    handle_bias_and_nonlinear(args.handle, args.nonlinear_mode,
                              &conv_dst_tensor, args.dst_tensor,
                              args.bias_tensor);
}
// vim: syntax=cpp.doxygen |
@@ -24,7 +24,6 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() { | |||||
non_cudnn_algos.push_back(&matmul); | non_cudnn_algos.push_back(&matmul); | ||||
non_cudnn_algos.push_back(&matmul8x8x32); | non_cudnn_algos.push_back(&matmul8x8x32); | ||||
non_cudnn_algos.push_back(&batched_matmul); | non_cudnn_algos.push_back(&batched_matmul); | ||||
non_cudnn_algos.push_back(&a1x1); | |||||
fill_cudnn_algos(); | fill_cudnn_algos(); | ||||
for (auto&& algo : cudnn_conv_bias_activations) { | for (auto&& algo : cudnn_conv_bias_activations) { | ||||
@@ -43,7 +42,6 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() { | |||||
conv_algos.push_back(&matmul); | conv_algos.push_back(&matmul); | ||||
conv_algos.push_back(&matmul8x8x32); | conv_algos.push_back(&matmul8x8x32); | ||||
conv_algos.push_back(&batched_matmul); | conv_algos.push_back(&batched_matmul); | ||||
conv_algos.push_back(&a1x1); | |||||
conv_algos.reserve(conv_algos.size() * 2); | conv_algos.reserve(conv_algos.size() * 2); | ||||
//! add gconv algos by AlgoGroupConvGeneral | //! add gconv algos by AlgoGroupConvGeneral | ||||
@@ -51,7 +51,6 @@ public: | |||||
CUDA_INPLACE_MATMUL, | CUDA_INPLACE_MATMUL, | ||||
CUDA_MATMUL, | CUDA_MATMUL, | ||||
CUDA_MATMUL_INT8X8X32, | CUDA_MATMUL_INT8X8X32, | ||||
CUDA_1X1, | |||||
CUDA_BATCHED_MATMUL, | CUDA_BATCHED_MATMUL, | ||||
CUDA_GROUP_CONV_GENERAL, | CUDA_GROUP_CONV_GENERAL, | ||||
CUDA_WMMA_UINT4X4X32, | CUDA_WMMA_UINT4X4X32, | ||||
@@ -358,31 +357,6 @@ private: | |||||
mutable std::string m_name; | mutable std::string m_name; | ||||
}; | }; | ||||
//! optimized 1x1 conv | |||||
class ConvBiasForwardImpl::Algo1x1 final : public AlgoBase { | |||||
static void extract_matmul_layouts(const SizeArgs& args, TensorLayout& A, | |||||
TensorLayout& B, TensorLayout& C); | |||||
public: | |||||
bool is_available(const SizeArgs& args) const override; | |||||
size_t get_workspace_in_bytes(const SizeArgs& args) const override; | |||||
void exec(const ExecArgs& args) const override; | |||||
const char* name() const override { | |||||
if (m_name.empty()) { | |||||
m_name = ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>( | |||||
"MATMUL1X1", {}); | |||||
} | |||||
return m_name.c_str(); | |||||
} | |||||
bool is_reproducible() const override { return true; } | |||||
MEGDNN_DECL_ALGO_TYPE(CUDA_1X1) | |||||
private: | |||||
WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const; | |||||
mutable std::string m_name; | |||||
}; | |||||
class ConvBiasForwardImpl::AlgoBatchedMatmul final : public AlgoBase { | class ConvBiasForwardImpl::AlgoBatchedMatmul final : public AlgoBase { | ||||
static void extract_matmul_layouts(const SizeArgs& args, TensorLayout& A, | static void extract_matmul_layouts(const SizeArgs& args, TensorLayout& A, | ||||
TensorLayout& B, TensorLayout& C); | TensorLayout& B, TensorLayout& C); | ||||
@@ -738,7 +712,6 @@ public: | |||||
AlgoMatmul matmul; | AlgoMatmul matmul; | ||||
AlgoMatmul8x8x32 matmul8x8x32; | AlgoMatmul8x8x32 matmul8x8x32; | ||||
AlgoBatchedMatmul batched_matmul; | AlgoBatchedMatmul batched_matmul; | ||||
Algo1x1 a1x1; | |||||
std::vector<AlgoInt8NCHW4DotProdImplicitGemm> int8_nchw4_dotprod; | std::vector<AlgoInt8NCHW4DotProdImplicitGemm> int8_nchw4_dotprod; | ||||
AlgoInt8CHWN4DotProdImplicitGemm int8_chwn4_dotprod; | AlgoInt8CHWN4DotProdImplicitGemm int8_chwn4_dotprod; | ||||
#if CUDA_VERSION >= 10000 | #if CUDA_VERSION >= 10000 | ||||
@@ -118,9 +118,6 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic( | |||||
if (sm_algo_pack.batched_matmul.is_available_reproducible( | if (sm_algo_pack.batched_matmul.is_available_reproducible( | ||||
size_arg, reproducible, workspace_limit_in_bytes)) { | size_arg, reproducible, workspace_limit_in_bytes)) { | ||||
return &sm_algo_pack.batched_matmul; | return &sm_algo_pack.batched_matmul; | ||||
} else if (sm_algo_pack.a1x1.is_available_reproducible( | |||||
size_arg, reproducible, workspace_limit_in_bytes)) { | |||||
return &sm_algo_pack.a1x1; | |||||
} | } | ||||
return nullptr; | return nullptr; | ||||
}; | }; | ||||
@@ -178,12 +175,6 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic( | |||||
return algo; | return algo; | ||||
} | } | ||||
int batch = src[0]; | |||||
if (batch == 1 && sm_algo_pack.a1x1.is_available_reproducible( | |||||
args, reproducible, workspace_limit_in_bytes)) { | |||||
return &sm_algo_pack.a1x1; | |||||
} | |||||
// modify conv_args dst_layout | // modify conv_args dst_layout | ||||
conv_args.dst_layout = &dst_layout; | conv_args.dst_layout = &dst_layout; | ||||
if (is_cudnn_supported(conv_args)) { | if (is_cudnn_supported(conv_args)) { | ||||
@@ -834,12 +834,6 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_MATMUL_1x1) { | |||||
.set_epsilon(1e-3); | .set_epsilon(1e-3); | ||||
for (auto&& arg : args) { | for (auto&& arg : args) { | ||||
checker.set_param(arg.param); | checker.set_param(arg.param); | ||||
checker.set_before_exec_callback( | |||||
conv_bias::ConvBiasAlgoChecker<ConvBias>( | |||||
ConvBiasForward::algo_name< | |||||
ConvBiasForward::MatmulParam>("MATMUL1X1", {}) | |||||
.c_str())); | |||||
checker.execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||||
checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker< | checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker< | ||||
ConvBias>( | ConvBias>( | ||||
ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>( | ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>( | ||||
@@ -115,7 +115,7 @@ TEST_F(CUDA, GROUP_CONV_FORWARD_1x1) { | |||||
#if CUDNN_MAJOR <= 6 | #if CUDNN_MAJOR <= 6 | ||||
std::string conv1x1_name = | std::string conv1x1_name = | ||||
ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>( | ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>( | ||||
"MATMUL1X1", {}); | |||||
"BATCHEDMATMUL", {}); | |||||
checker.set_before_exec_callback( | checker.set_before_exec_callback( | ||||
AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{ | AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{ | ||||
"DEFAULT", | "DEFAULT", | ||||