@@ -42,24 +42,11 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() { | |||||
conv_algos.push_back(&matmul); | conv_algos.push_back(&matmul); | ||||
conv_algos.push_back(&matmul8x8x32); | conv_algos.push_back(&matmul8x8x32); | ||||
conv_algos.push_back(&batched_matmul); | conv_algos.push_back(&batched_matmul); | ||||
conv_algos.reserve(conv_algos.size() * 2); | |||||
//! add gconv algos by AlgoGroupConvGeneral | |||||
size_t algo_size = conv_algos.size(); | |||||
for (size_t i = 3; i < algo_size; ++i) { | |||||
gconv_refhold.emplace_back(new AlgoGroupConvGeneral(conv_algos[i])); | |||||
algo2gconv[conv_algos[i]] = gconv_refhold.back().get(); | |||||
conv_algos.push_back(gconv_refhold.back().get()); | |||||
} | |||||
conv_algos.push_back(&group); | |||||
for (auto&& algo : conv_algos) { | for (auto&& algo : conv_algos) { | ||||
all_algos.push_back(algo); | all_algos.push_back(algo); | ||||
} | } | ||||
non_cudnn_algos.push_back(all_algos.rbegin()[4]); // group inplace_matmul | |||||
non_cudnn_algos.push_back(all_algos.rbegin()[3]); // group matmul | |||||
non_cudnn_algos.push_back(all_algos.rbegin()[2]); // group matmul_8x8x32 | |||||
non_cudnn_algos.push_back(all_algos.rbegin()[1]); // group batched_matmul | |||||
non_cudnn_algos.push_back(all_algos.rbegin()[0]); // group 1x1 | |||||
all_algos.push_back(&bfloat16); | all_algos.push_back(&bfloat16); | ||||
bfloat16_algos.push_back(&bfloat16); | bfloat16_algos.push_back(&bfloat16); | ||||
@@ -118,7 +105,7 @@ ConvBiasForwardImpl::AlgoPack ConvBiasForwardImpl::sm_algo_pack; | |||||
MEGDNN_DEF_GET_ALGO_FROM_DESC(ConvBiasForwardImpl) | MEGDNN_DEF_GET_ALGO_FROM_DESC(ConvBiasForwardImpl) | ||||
ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs( | ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs( | ||||
ConvBiasForwardImpl* o, const TensorLayout& src, | |||||
const ConvBiasForwardImpl* o, const TensorLayout& src, | |||||
const TensorLayout& filter, const TensorLayout& bias, | const TensorLayout& filter, const TensorLayout& bias, | ||||
const TensorLayout& z, const TensorLayout& dst, | const TensorLayout& z, const TensorLayout& dst, | ||||
const PreprocessedFilter* preprocessed_filter) | const PreprocessedFilter* preprocessed_filter) | ||||
@@ -127,7 +114,7 @@ ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs( | |||||
dst, preprocessed_filter) {} | dst, preprocessed_filter) {} | ||||
ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs( | ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs( | ||||
ConvBiasForwardImpl* o, const TensorLayout& src, | |||||
const ConvBiasForwardImpl* o, const TensorLayout& src, | |||||
const TensorLayout& filter, const CanonizedFilterMeta& filter_meta, | const TensorLayout& filter, const CanonizedFilterMeta& filter_meta, | ||||
const TensorLayout& bias, const TensorLayout& z, | const TensorLayout& bias, const TensorLayout& z, | ||||
const TensorLayout& dst, const PreprocessedFilter* preprocessed_filter) | const TensorLayout& dst, const PreprocessedFilter* preprocessed_filter) | ||||
@@ -78,15 +78,15 @@ public: | |||||
AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; } | AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; } | ||||
struct SizeArgs : public conv_bias::BiasForwardSizeArgs { | struct SizeArgs : public conv_bias::BiasForwardSizeArgs { | ||||
ConvBiasForwardImpl* opr; | |||||
const ConvBiasForwardImpl* opr; | |||||
const PreprocessedFilter* preprocessed_filter; | const PreprocessedFilter* preprocessed_filter; | ||||
std::string to_string() const; | std::string to_string() const; | ||||
SizeArgs(ConvBiasForwardImpl* opr, const TensorLayout& src, | |||||
SizeArgs(const ConvBiasForwardImpl* opr, const TensorLayout& src, | |||||
const TensorLayout& filter, const TensorLayout& bias, | const TensorLayout& filter, const TensorLayout& bias, | ||||
const TensorLayout& z, const TensorLayout& dst, | const TensorLayout& z, const TensorLayout& dst, | ||||
const PreprocessedFilter* preprocessed_filter = nullptr); | const PreprocessedFilter* preprocessed_filter = nullptr); | ||||
SizeArgs(ConvBiasForwardImpl* opr, const TensorLayout& src, | |||||
SizeArgs(const ConvBiasForwardImpl* opr, const TensorLayout& src, | |||||
const TensorLayout& filter, | const TensorLayout& filter, | ||||
const CanonizedFilterMeta& filter_meta, | const CanonizedFilterMeta& filter_meta, | ||||
const TensorLayout& bias, const TensorLayout& z, | const TensorLayout& bias, const TensorLayout& z, | ||||
@@ -434,27 +434,24 @@ private: | |||||
//! implement group conv by another algo | //! implement group conv by another algo | ||||
class ConvBiasForwardImpl::AlgoGroupConvGeneral final : public AlgoBase { | class ConvBiasForwardImpl::AlgoGroupConvGeneral final : public AlgoBase { | ||||
public: | public: | ||||
AlgoGroupConvGeneral(AlgoBase* impl); | |||||
bool is_available(const SizeArgs& args) const override; | bool is_available(const SizeArgs& args) const override; | ||||
size_t get_workspace_in_bytes(const SizeArgs& args) const override; | size_t get_workspace_in_bytes(const SizeArgs& args) const override; | ||||
void exec(const ExecArgs& args) const override; | void exec(const ExecArgs& args) const override; | ||||
const char* name() const override { return m_name.c_str(); } | |||||
std::vector<SearchItem> get_subopr_list( | |||||
const TensorLayoutArray& layouts, | |||||
const OperatorBase* opr) const override; | |||||
AlgoAttribute attribute() const override { | |||||
auto ret = AlgoAttribute::DEFAULT; | |||||
#define cb(attr) \ | |||||
if (m_impl->contain_attribute_all(attr)) { \ | |||||
ret |= attr; \ | |||||
const char* name() const override { | |||||
if (m_name.empty()) { | |||||
m_name = ConvBiasForward::algo_name<DirectParam>("CUDA:GROUP_CONV", | |||||
{}); | |||||
} | |||||
return m_name.c_str(); | |||||
} | } | ||||
MEGDNN_FOREACH_ALGO_ATTRIBUTE_INHERITABLE(cb) | |||||
#undef cb | |||||
if (m_impl->contain_attribute_all(AlgoAttribute::REPRODUCIBLE)) { | |||||
ret |= AlgoAttribute::REPRODUCIBLE; | |||||
} | |||||
return ret; | |||||
AlgoAttribute attribute() const override { | |||||
return AlgoAttribute::REPRODUCIBLE; | |||||
} | } | ||||
static void modify_size_args(SizeArgs& args, TensorLayout& src_pg, | static void modify_size_args(SizeArgs& args, TensorLayout& src_pg, | ||||
@@ -463,8 +460,7 @@ public: | |||||
private: | private: | ||||
WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const; | WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const; | ||||
AlgoBase* m_impl; | |||||
std::string m_name; | |||||
mutable std::string m_name; | |||||
}; | }; | ||||
#if CUDA_VERSION >= 10000 | #if CUDA_VERSION >= 10000 | ||||
@@ -1087,9 +1083,8 @@ public: | |||||
std::vector<AlgoInt4Int4NHWCIMMAImplicitGemm> int4_int4_nhwc_imma; | std::vector<AlgoInt4Int4NHWCIMMAImplicitGemm> int4_int4_nhwc_imma; | ||||
std::vector<AlgoUInt4Int4NHWCIMMAImplicitGemm> uint4_int4_nhwc_imma; | std::vector<AlgoUInt4Int4NHWCIMMAImplicitGemm> uint4_int4_nhwc_imma; | ||||
#endif | #endif | ||||
std::vector<std::unique_ptr<AlgoGroupConvGeneral>> gconv_refhold; | |||||
AlgoGroupConvGeneral group; | |||||
AlgoBFloat16 bfloat16; | AlgoBFloat16 bfloat16; | ||||
std::unordered_map<AlgoBase*, AlgoGroupConvGeneral*> algo2gconv; | |||||
AlgoBase* cudnn_conv_bias_act_from_enum(cudnnConvolutionFwdAlgo_t algo); | AlgoBase* cudnn_conv_bias_act_from_enum(cudnnConvolutionFwdAlgo_t algo); | ||||
@@ -9,6 +9,7 @@ | |||||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
*/ | */ | ||||
#include <utility> | |||||
#include "src/common/conv_bias.h" | #include "src/common/conv_bias.h" | ||||
#include "src/cuda/conv_bias/algo.h" | #include "src/cuda/conv_bias/algo.h" | ||||
@@ -16,36 +17,80 @@ using namespace megdnn; | |||||
using namespace cuda; | using namespace cuda; | ||||
using namespace conv_bias; | using namespace conv_bias; | ||||
void ConvBiasForwardImpl::AlgoGroupConvGeneral::modify_size_args( | |||||
ConvBiasForwardImpl::AlgoBase::SizeArgs& args, TensorLayout& src_pg, | |||||
TensorLayout& dst_pg, TensorLayout& bias_pg) { | |||||
src_pg = *args.src_layout; | |||||
dst_pg = *args.dst_layout; | |||||
bias_pg = *args.bias_layout; | |||||
namespace { | |||||
std::pair<TensorLayoutArray, ConvBiasForwardImpl::Param> sub_opr_config( | |||||
const ConvBiasForwardImpl::AlgoBase::SizeArgs& args) { | |||||
TensorLayout src_pg = *args.src_layout; | |||||
SmallVector<size_t> flt_shape(0); | |||||
std::vector<ptrdiff_t> flt_stride(0); | |||||
size_t idx = 0; | |||||
// check if the first dim is group | |||||
if (args.filter_layout->ndim > args.src_layout->ndim) | |||||
++idx; | |||||
for (; idx < args.filter_layout->ndim; ++idx) { | |||||
flt_shape.push_back(args.filter_layout->shape[idx]); | |||||
flt_stride.push_back(args.filter_layout->stride[idx]); | |||||
} | |||||
TensorLayout filter_pg(flt_shape, flt_stride, | |||||
args.filter_layout->dtype, | |||||
args.filter_layout->format); | |||||
TensorLayout bias_pg = *args.bias_layout; | |||||
TensorLayout z_pg = *args.z_layout; | |||||
TensorLayout dst_pg = *args.dst_layout; | |||||
auto nr_grp = args.filter_meta.group; | auto nr_grp = args.filter_meta.group; | ||||
args.filter_meta.group = 1; | |||||
size_t c_pos; | size_t c_pos; | ||||
if (args.filter_meta.format == Param::Format::NCHW || | |||||
args.filter_meta.format == Param::Format::NCHW4) { | |||||
if (args.filter_meta.format == megdnn::param::ConvBias::Format::NCHW || | |||||
args.filter_meta.format == megdnn::param::ConvBias::Format::NCHW4) { | |||||
c_pos = 1; | c_pos = 1; | ||||
} else { | } else { | ||||
megdnn_assert(args.filter_meta.format == Param::Format::NHWC, | |||||
megdnn_assert(args.filter_meta.format == | |||||
megdnn::param::ConvBias::Format::NHWC, | |||||
"invalid conv format"); | "invalid conv format"); | ||||
c_pos = 3; | c_pos = 3; | ||||
} | } | ||||
src_pg.shape[c_pos] /= nr_grp; | src_pg.shape[c_pos] /= nr_grp; | ||||
dst_pg.shape[c_pos] /= nr_grp; | |||||
bias_pg.ndim = 0; | bias_pg.ndim = 0; | ||||
args.src_layout = &src_pg; | |||||
args.dst_layout = &dst_pg; | |||||
args.bias_layout = &bias_pg; | |||||
args.nonlinear_mode = Param::NonlineMode::IDENTITY; | |||||
dst_pg.shape[c_pos] /= nr_grp; | |||||
megdnn::param::ConvBias param = args.opr->param(); | |||||
param.sparse = megdnn::param::ConvBias::Sparse::DENSE; | |||||
param.nonlineMode = | |||||
megdnn::param::ConvBias::NonlineMode::IDENTITY; | |||||
std::pair<TensorLayoutArray, ConvBiasForwardImpl::Param> ret; | |||||
ret.first = {src_pg, filter_pg, bias_pg, z_pg, dst_pg}; | |||||
ret.second = param; | |||||
return ret; | |||||
} | } | ||||
ConvBiasForwardImpl::AlgoGroupConvGeneral::AlgoGroupConvGeneral(AlgoBase* impl) | |||||
: m_impl{impl} { | |||||
m_name = ConvBiasForward::algo_name<DirectParam>( | |||||
ssprintf("%s:%s", "CUDA:GROUP_CONV", impl->name()), {}); | |||||
std::pair<TensorLayoutArray, std::unique_ptr<ConvBiasForward>> prepare_sub_opr( | |||||
const ConvBiasForwardImpl::AlgoBase::SizeArgs& args) { | |||||
auto convbias_opr = args.handle->create_operator<ConvBias>(); | |||||
set_execution_policy<ConvBiasForward, ConvBiasForward*>( | |||||
args.opr, convbias_opr.get()); | |||||
auto&& config = sub_opr_config(args); | |||||
convbias_opr->param() = config.second; | |||||
return {config.first, std::move(convbias_opr)}; | |||||
} | |||||
} // namespace | |||||
std::vector<Algorithm::SearchItem> | |||||
ConvBiasForwardImpl::AlgoGroupConvGeneral::get_subopr_list( | |||||
const TensorLayoutArray& layouts, const OperatorBase* opr) const { | |||||
AlgoBase::SizeArgs args{static_cast<const ConvBiasForwardImpl*>(opr), | |||||
layouts[0], | |||||
layouts[1], | |||||
layouts[2], | |||||
layouts[3], | |||||
layouts[4]}; | |||||
auto&& config = sub_opr_config(args); | |||||
std::string param_str; | |||||
Algorithm::serialize_write_pod(config.second, param_str); | |||||
return {{Algorithm::OprType::CONVBIAS_FORWARD, param_str, config.first}}; | |||||
} | } | ||||
bool ConvBiasForwardImpl::AlgoGroupConvGeneral::is_available( | bool ConvBiasForwardImpl::AlgoGroupConvGeneral::is_available( | ||||
@@ -62,10 +107,10 @@ bool ConvBiasForwardImpl::AlgoGroupConvGeneral::is_available( | |||||
param.format == param::ConvBias::Format::NCHW32) | param.format == param::ConvBias::Format::NCHW32) | ||||
return false; | return false; | ||||
auto sub_args = args; | |||||
TensorLayout src_pg, dst_pg, bias_pg; | |||||
modify_size_args(sub_args, src_pg, dst_pg, bias_pg); | |||||
return m_impl->is_available(sub_args); | |||||
auto config = prepare_sub_opr(args); | |||||
return get_algorithm(static_cast<ConvBiasForwardImpl*>(config.second.get()), | |||||
config.first[0], config.first[1], config.first[2], | |||||
config.first[3], config.first[4]); | |||||
} | } | ||||
WorkspaceBundle ConvBiasForwardImpl::AlgoGroupConvGeneral::get_workspace_bundle( | WorkspaceBundle ConvBiasForwardImpl::AlgoGroupConvGeneral::get_workspace_bundle( | ||||
@@ -80,12 +125,12 @@ WorkspaceBundle ConvBiasForwardImpl::AlgoGroupConvGeneral::get_workspace_bundle( | |||||
sizes.push_back(dst_layout.span().dist_byte()); | sizes.push_back(dst_layout.span().dist_byte()); | ||||
} | } | ||||
auto sub_args = args; | |||||
sub_args.dst_layout = &dst_layout; | |||||
TensorLayout src_pg, dst_pg, bias_pg; | |||||
modify_size_args(sub_args, src_pg, dst_pg, bias_pg); | |||||
sizes.insert(sizes.begin(), | |||||
m_impl->get_workspace_in_bytes(sub_args)); | |||||
auto config = prepare_sub_opr(args); | |||||
size_t mm_ws = config.second->get_workspace_in_bytes( | |||||
config.first[0], config.first[1], config.first[2], | |||||
config.first[3], config.first[4], nullptr); | |||||
sizes.insert(sizes.begin(), mm_ws); | |||||
return {ptr, std::move(sizes)}; | return {ptr, std::move(sizes)}; | ||||
} | } | ||||
@@ -109,28 +154,13 @@ void ConvBiasForwardImpl::AlgoGroupConvGeneral::exec( | |||||
auto sub_args = args; | auto sub_args = args; | ||||
sub_args.dst_tensor = &conv_dst_tensor; | sub_args.dst_tensor = &conv_dst_tensor; | ||||
sub_args.dst_layout = &conv_dst_tensor.layout; | sub_args.dst_layout = &conv_dst_tensor.layout; | ||||
TensorND tsrc{*args.src_tensor}, tdst{conv_dst_tensor}, | |||||
tbias{*args.bias_tensor}; | |||||
SmallVector<size_t> flt_shape(0); | |||||
std::vector<ptrdiff_t> flt_stride(0); | |||||
size_t idx = 0; | |||||
// check if the first dim is group | |||||
if (args.filter_tensor->layout.ndim > args.src_layout->ndim) | |||||
++idx; | |||||
for (; idx < args.filter_tensor->layout.ndim; ++idx) { | |||||
flt_shape.push_back(args.filter_tensor->layout[idx]); | |||||
flt_stride.push_back(args.filter_tensor->layout.stride[idx]); | |||||
} | |||||
TensorND tflt{args.filter_tensor->raw_ptr, | |||||
TensorLayout{flt_shape, flt_stride, | |||||
args.filter_tensor->layout.dtype, | |||||
args.filter_tensor->layout.format}}; | |||||
modify_size_args(sub_args, tsrc.layout, tdst.layout, tbias.layout); | |||||
sub_args.src_tensor = &tsrc; | |||||
sub_args.dst_tensor = &tdst; | |||||
sub_args.filter_tensor = &tflt; | |||||
sub_args.bias_tensor = &tbias; | |||||
auto config = prepare_sub_opr(sub_args); | |||||
TensorND tsrc{args.src_tensor->raw_ptr, config.first[0]}; | |||||
TensorND tfilter{args.filter_tensor->raw_ptr, config.first[1]}; | |||||
TensorND tbias{args.bias_tensor->raw_ptr, config.first[2]}; | |||||
TensorND tz{args.z_tensor->raw_ptr, config.first[3]}; | |||||
TensorND tdst{conv_dst_tensor.raw_ptr, config.first[4]}; | |||||
size_t c_pos; | size_t c_pos; | ||||
if (args.filter_meta.format == Param::Format::NCHW || | if (args.filter_meta.format == Param::Format::NCHW || | ||||
@@ -150,16 +180,17 @@ void ConvBiasForwardImpl::AlgoGroupConvGeneral::exec( | |||||
strd_dst = tdst.layout.stride[c_pos] * fm.ocpg * | strd_dst = tdst.layout.stride[c_pos] * fm.ocpg * | ||||
tdst.layout.dtype.size(), | tdst.layout.dtype.size(), | ||||
strd_flt = fm.icpg * fm.ocpg * fm.spatial[0] * fm.spatial[1] * | strd_flt = fm.icpg * fm.ocpg * fm.spatial[0] * fm.spatial[1] * | ||||
tflt.layout.dtype.size(); | |||||
tfilter.layout.dtype.size(); | |||||
if (args.filter_meta.format == Param::Format::NCHW4) { | if (args.filter_meta.format == Param::Format::NCHW4) { | ||||
strd_src >>= 2; | strd_src >>= 2; | ||||
strd_dst >>= 2; | strd_dst >>= 2; | ||||
} | } | ||||
for (uint32_t g = 0; g < grp; ++g) { | for (uint32_t g = 0; g < grp; ++g) { | ||||
m_impl->exec(sub_args); | |||||
config.second->exec(tsrc, tfilter, tbias, | |||||
tz, tdst, nullptr, bundle.get_workspace(0)); | |||||
incr_voidp(tsrc.raw_ptr, strd_src); | incr_voidp(tsrc.raw_ptr, strd_src); | ||||
incr_voidp(tdst.raw_ptr, strd_dst); | incr_voidp(tdst.raw_ptr, strd_dst); | ||||
incr_voidp(tflt.raw_ptr, strd_flt); | |||||
incr_voidp(tfilter.raw_ptr, strd_flt); | |||||
} | } | ||||
} | } | ||||
handle_bias_and_nonlinear(args.handle, args.nonlinear_mode, | handle_bias_and_nonlinear(args.handle, args.nonlinear_mode, | ||||
@@ -193,25 +193,17 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic( | |||||
return algo; | return algo; | ||||
} | } | ||||
if (args.filter_meta.group > 1) { | |||||
auto orig_args = conv_args; | |||||
TensorLayout src, dst, bias; | |||||
AlgoGroupConvGeneral::modify_size_args(conv_args, src, dst, bias); | |||||
if (auto algo = get_1x1_algo(conv_args)) { | |||||
return sm_algo_pack.algo2gconv.at(algo); | |||||
} | |||||
if (is_cudnn_supported(conv_args)) { | |||||
if (auto algo = get_cudnn_algo(cudnn_conv_from_enum_wrapper)) { | |||||
return sm_algo_pack.algo2gconv.at(algo); | |||||
} | |||||
} | |||||
conv_args = orig_args; | |||||
} | |||||
if (auto algo = get_1x1_algo(args)) { | if (auto algo = get_1x1_algo(args)) { | ||||
return algo; | return algo; | ||||
} | } | ||||
if (args.filter_meta.group > 1) { | |||||
if (auto algo = megdnn::get_algo_match_attribute<ConvBiasForwardImpl>( | |||||
&sm_algo_pack.group, positive_attr, negative_attr)){ | |||||
return algo; | |||||
} | |||||
} | |||||
if (sm_algo_pack.fallback_nchw_qs8.is_available_attribute( | if (sm_algo_pack.fallback_nchw_qs8.is_available_attribute( | ||||
args, positive_attr, negative_attr, workspace_limit_in_bytes)) { | args, positive_attr, negative_attr, workspace_limit_in_bytes)) { | ||||
return &sm_algo_pack.fallback_nchw_qs8; | return &sm_algo_pack.fallback_nchw_qs8; | ||||
@@ -39,25 +39,9 @@ ConvolutionBackwardDataImpl::AlgoPack::AlgoPack() { | |||||
int8_algos.push_back(&int8_nchw_dotprod); | int8_algos.push_back(&int8_nchw_dotprod); | ||||
all_algos.push_back(&int8_nchw_dotprod); | all_algos.push_back(&int8_nchw_dotprod); | ||||
all_algos.reserve(all_algos.size() * 2); | |||||
// add gconv algos by AlgoGroupConvGeneral | |||||
auto all_algos_data = all_algos.data(); | |||||
size_t group_algo_start = 2; | |||||
for (size_t i = group_algo_start; i < all_algos.size(); ++i) { | |||||
gconv.push_back({all_algos[i]}); | |||||
} | |||||
for (size_t i = group_algo_start; i < all_algos.size(); ++i) { | |||||
algo2gconv[all_algos[i]] = &gconv[i - group_algo_start]; | |||||
} | |||||
for (auto&& i : gconv) { | |||||
all_algos.push_back(&i); | |||||
} | |||||
megdnn_assert(all_algos_data == all_algos.data()); | |||||
non_cudnn_algos.push_back(all_algos.rbegin()[0]); // group matmul | |||||
all_algos.push_back(&bfloat16); | all_algos.push_back(&bfloat16); | ||||
bfloat16_algos.push_back(&bfloat16); | bfloat16_algos.push_back(&bfloat16); | ||||
all_algos.push_back(&group); | |||||
for (auto&& algo : all_algos) { | for (auto&& algo : all_algos) { | ||||
m_all_algos_map.emplace(algo->info().desc, algo); | m_all_algos_map.emplace(algo->info().desc, algo); | ||||
@@ -80,13 +64,13 @@ ConvolutionBackwardDataImpl::AlgoPack::cudnn_from_enum( | |||||
ConvolutionBackwardDataImpl::AlgoPack ConvolutionBackwardDataImpl::sm_algo_pack; | ConvolutionBackwardDataImpl::AlgoPack ConvolutionBackwardDataImpl::sm_algo_pack; | ||||
ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::SizeArgs( | ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::SizeArgs( | ||||
ConvolutionBackwardDataImpl* o, const TensorLayout& filter, | |||||
const ConvolutionBackwardDataImpl* o, const TensorLayout& filter, | |||||
const TensorLayout& diff, const TensorLayout& grad) | const TensorLayout& diff, const TensorLayout& grad) | ||||
: SizeArgs(o, filter, o->make_canonized_filter_meta(grad.ndim, filter), | : SizeArgs(o, filter, o->make_canonized_filter_meta(grad.ndim, filter), | ||||
diff, grad) {} | diff, grad) {} | ||||
ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::SizeArgs( | ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::SizeArgs( | ||||
ConvolutionBackwardDataImpl* o, const TensorLayout& filter, | |||||
const ConvolutionBackwardDataImpl* o, const TensorLayout& filter, | |||||
const CanonizedFilterMeta& filter_meta, const TensorLayout& diff, | const CanonizedFilterMeta& filter_meta, const TensorLayout& diff, | ||||
const TensorLayout& grad) | const TensorLayout& grad) | ||||
: handle{concrete_handle(o->handle())}, | : handle{concrete_handle(o->handle())}, | ||||
@@ -97,7 +81,7 @@ ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::SizeArgs( | |||||
opr{o} {} | opr{o} {} | ||||
ConvolutionBackwardDataImpl::AlgoBase::ExecArgs::ExecArgs( | ConvolutionBackwardDataImpl::AlgoBase::ExecArgs::ExecArgs( | ||||
ConvolutionBackwardDataImpl* opr, _megdnn_tensor_in filter, | |||||
const ConvolutionBackwardDataImpl* opr, _megdnn_tensor_in filter, | |||||
_megdnn_tensor_in diff, _megdnn_tensor_out grad, | _megdnn_tensor_in diff, _megdnn_tensor_out grad, | ||||
_megdnn_workspace workspace) | _megdnn_workspace workspace) | ||||
: SizeArgs(opr, filter.layout, diff.layout, grad.layout), | : SizeArgs(opr, filter.layout, diff.layout, grad.layout), | ||||
@@ -49,15 +49,17 @@ public: | |||||
HandleImpl* handle; | HandleImpl* handle; | ||||
CanonizedFilterMeta filter_meta; | CanonizedFilterMeta filter_meta; | ||||
const TensorLayout *diff_layout, *grad_layout, *filter_layout; | const TensorLayout *diff_layout, *grad_layout, *filter_layout; | ||||
ConvolutionBackwardDataImpl* opr; | |||||
const ConvolutionBackwardDataImpl* opr; | |||||
std::string to_string() const; | std::string to_string() const; | ||||
void init_desc(convolution::CUDNNBwdDataDescs& desc) const { | void init_desc(convolution::CUDNNBwdDataDescs& desc) const { | ||||
desc.set(filter_meta, *diff_layout, *grad_layout, opr->param()); | desc.set(filter_meta, *diff_layout, *grad_layout, opr->param()); | ||||
} | } | ||||
SizeArgs(ConvolutionBackwardDataImpl* opr, const TensorLayout& filter, | |||||
const TensorLayout& diff, const TensorLayout& grad); | |||||
SizeArgs(ConvolutionBackwardDataImpl* opr, const TensorLayout& filter, | |||||
SizeArgs(const ConvolutionBackwardDataImpl* opr, | |||||
const TensorLayout& filter, const TensorLayout& diff, | |||||
const TensorLayout& grad); | |||||
SizeArgs(const ConvolutionBackwardDataImpl* opr, | |||||
const TensorLayout& filter, | |||||
const CanonizedFilterMeta& filter_meta, | const CanonizedFilterMeta& filter_meta, | ||||
const TensorLayout& diff, const TensorLayout& grad); | const TensorLayout& diff, const TensorLayout& grad); | ||||
@@ -70,7 +72,7 @@ public: | |||||
const TensorND *filter_tensor, *diff_tensor, *grad_tensor; | const TensorND *filter_tensor, *diff_tensor, *grad_tensor; | ||||
Workspace workspace; | Workspace workspace; | ||||
ExecArgs(ConvolutionBackwardDataImpl* opr, _megdnn_tensor_in filter, | |||||
ExecArgs(const ConvolutionBackwardDataImpl* opr, _megdnn_tensor_in filter, | |||||
_megdnn_tensor_in diff, _megdnn_tensor_out grad, | _megdnn_tensor_in diff, _megdnn_tensor_out grad, | ||||
_megdnn_workspace workspace); | _megdnn_workspace workspace); | ||||
}; | }; | ||||
@@ -219,35 +221,26 @@ private: | |||||
//! implement group conv by another algo | //! implement group conv by another algo | ||||
class ConvolutionBackwardDataImpl::AlgoGroupConvGeneral final | class ConvolutionBackwardDataImpl::AlgoGroupConvGeneral final | ||||
: public AlgoBase { | : public AlgoBase { | ||||
AlgoBase* m_impl; | |||||
std::string m_name; | |||||
public: | public: | ||||
AlgoGroupConvGeneral(AlgoBase* impl); | |||||
bool is_available(const SizeArgs& args) const override; | bool is_available(const SizeArgs& args) const override; | ||||
size_t get_workspace_in_bytes(const SizeArgs& args) const override; | size_t get_workspace_in_bytes(const SizeArgs& args) const override; | ||||
void exec(const ExecArgs& args) const override; | void exec(const ExecArgs& args) const override; | ||||
const char* name() const override { return m_name.c_str(); } | |||||
std::vector<SearchItem> get_subopr_list( | |||||
const TensorLayoutArray& layouts, | |||||
const OperatorBase* opr) const override; | |||||
const char* name() const override { | |||||
return "CUDA:GROUP_CONV_BACKWARD_DATA"; | |||||
} | |||||
static void modify_size_args(SizeArgs& args, TensorLayout& diff_pg, | |||||
TensorLayout& grad_pg); | |||||
MEGDNN_DECL_ALGO_TYPE(CUDA_GROUP_CONV_GENERAL) | MEGDNN_DECL_ALGO_TYPE(CUDA_GROUP_CONV_GENERAL) | ||||
AlgoAttribute attribute() const override { | AlgoAttribute attribute() const override { | ||||
auto ret = AlgoAttribute::DEFAULT; | |||||
#define cb(attr) \ | |||||
if (m_impl->contain_attribute_all(attr)) { \ | |||||
ret |= attr; \ | |||||
} | |||||
MEGDNN_FOREACH_ALGO_ATTRIBUTE_INHERITABLE(cb) | |||||
#undef cb | |||||
if (m_impl->contain_attribute_all(AlgoAttribute::REPRODUCIBLE)) { | |||||
ret |= AlgoAttribute::REPRODUCIBLE; | |||||
} | |||||
return ret; | |||||
return AlgoAttribute::REPRODUCIBLE; | |||||
} | } | ||||
private: | |||||
WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const; | |||||
}; | }; | ||||
class ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm final | class ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm final | ||||
@@ -319,9 +312,8 @@ public: | |||||
AlgoMatmul matmul; | AlgoMatmul matmul; | ||||
AlgoChanwise chanwise; | AlgoChanwise chanwise; | ||||
AlgoChanwiseSmall chanwise_small; | AlgoChanwiseSmall chanwise_small; | ||||
std::vector<AlgoGroupConvGeneral> gconv; | |||||
std::unordered_map<AlgoBase*, AlgoGroupConvGeneral*> algo2gconv; | |||||
AlgoBFloat16 bfloat16; | AlgoBFloat16 bfloat16; | ||||
AlgoGroupConvGeneral group; | |||||
std::vector<AlgoInt8NCHW4DotProdImplicitGemm> int8_nchw4_dotprod; | std::vector<AlgoInt8NCHW4DotProdImplicitGemm> int8_nchw4_dotprod; | ||||
AlgoInt8NCHWDotProdImplicitGemm int8_nchw_dotprod; | AlgoInt8NCHWDotProdImplicitGemm int8_nchw_dotprod; | ||||
@@ -16,24 +16,63 @@ using namespace megdnn; | |||||
using namespace cuda; | using namespace cuda; | ||||
using namespace convolution; | using namespace convolution; | ||||
void ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::modify_size_args( | |||||
ConvolutionBackwardDataImpl::AlgoBase::SizeArgs& args, | |||||
TensorLayout& diff_pg, TensorLayout& grad_pg) { | |||||
diff_pg = *args.diff_layout; | |||||
grad_pg = *args.grad_layout; | |||||
namespace { | |||||
std::pair<TensorLayoutArray, Convolution::Param> sub_opr_config( | |||||
const ConvolutionBackwardDataImpl::AlgoBase::SizeArgs& args) { | |||||
SmallVector<size_t> flt_shape(0); | |||||
std::vector<ptrdiff_t> flt_stride(0); | |||||
size_t idx = 0; | |||||
// check if the first dim is group | |||||
if (args.filter_layout->ndim > args.diff_layout->ndim) | |||||
++idx; | |||||
for (; idx < args.filter_layout->ndim; ++idx) { | |||||
flt_shape.push_back(args.filter_layout->shape[idx]); | |||||
flt_stride.push_back(args.filter_layout->stride[idx]); | |||||
} | |||||
TensorLayout filter_pg(flt_shape, flt_stride, args.filter_layout->dtype, | |||||
args.filter_layout->format); | |||||
TensorLayout diff_pg = *args.diff_layout; | |||||
TensorLayout grad_pg = *args.grad_layout; | |||||
auto nr_grp = args.filter_meta.group; | auto nr_grp = args.filter_meta.group; | ||||
args.filter_meta.group = 1; | |||||
diff_pg.shape[1] /= nr_grp; | |||||
grad_pg.shape[1] /= nr_grp; | |||||
args.diff_layout = &diff_pg; | |||||
args.grad_layout = &grad_pg; | |||||
size_t c_pos = 1; | |||||
diff_pg.shape[c_pos] /= nr_grp; | |||||
grad_pg.shape[c_pos] /= nr_grp; | |||||
megdnn::param::Convolution param = args.opr->param(); | |||||
param.sparse = megdnn::param::ConvBias::Sparse::DENSE; | |||||
std::pair<TensorLayoutArray, ConvolutionBackwardDataImpl::Param> ret; | |||||
ret.first = {filter_pg, diff_pg, grad_pg}; | |||||
ret.second = param; | |||||
return ret; | |||||
} | } | ||||
ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::AlgoGroupConvGeneral( | |||||
AlgoBase* impl) | |||||
: m_impl{impl} { | |||||
m_name = "group_conv:"; | |||||
m_name += impl->name(); | |||||
std::pair<TensorLayoutArray, std::unique_ptr<ConvolutionBackwardData>> | |||||
prepare_sub_opr(const ConvolutionBackwardDataImpl::AlgoBase::SizeArgs& args) { | |||||
auto conv_bwd_data_opr = | |||||
args.handle->create_operator<ConvolutionBackwardData>(); | |||||
set_execution_policy<ConvolutionBackwardData, ConvolutionBackwardData*>( | |||||
args.opr, conv_bwd_data_opr.get()); | |||||
auto&& config = sub_opr_config(args); | |||||
conv_bwd_data_opr->param() = config.second; | |||||
return {config.first, std::move(conv_bwd_data_opr)}; | |||||
} | |||||
} // namespace | |||||
std::vector<Algorithm::SearchItem> | |||||
ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::get_subopr_list( | |||||
const TensorLayoutArray& layouts, const OperatorBase* opr) const { | |||||
AlgoBase::SizeArgs args{ | |||||
static_cast<const ConvolutionBackwardDataImpl*>(opr), layouts[0], | |||||
layouts[1], layouts[2]}; | |||||
auto&& config = sub_opr_config(args); | |||||
std::string param_str; | |||||
Algorithm::serialize_write_pod(config.second, param_str); | |||||
return {{Algorithm::OprType::CONVOLUTION_BACKWARD_DATA, param_str, | |||||
config.first}}; | |||||
} | } | ||||
bool ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::is_available( | bool ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::is_available( | ||||
@@ -46,44 +85,60 @@ bool ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::is_available( | |||||
} | } | ||||
if (args.filter_meta.group <= 1) | if (args.filter_meta.group <= 1) | ||||
return false; | return false; | ||||
auto sub_args = args; | |||||
TensorLayout diff_pg, grad_pg; | |||||
modify_size_args(sub_args, diff_pg, grad_pg); | |||||
return m_impl->is_available(sub_args); | |||||
if (args.filter_meta.format != | |||||
megdnn::param::Convolution::Format::NCHW) { | |||||
return false; | |||||
} | |||||
auto config = prepare_sub_opr(args); | |||||
return get_algorithm( | |||||
static_cast<ConvolutionBackwardDataImpl*>(config.second.get()), | |||||
config.first[0], config.first[1], config.first[2]); | |||||
} | |||||
WorkspaceBundle | |||||
ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::get_workspace_bundle( | |||||
void* ptr, const SizeArgs& args) const { | |||||
auto config = prepare_sub_opr(args); | |||||
size_t sizes = config.second->get_workspace_in_bytes( | |||||
config.first[0], config.first[1], config.first[2]); | |||||
return {ptr, {sizes}}; | |||||
} | } | ||||
size_t | size_t | ||||
ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::get_workspace_in_bytes( | ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::get_workspace_in_bytes( | ||||
const SizeArgs& args) const { | const SizeArgs& args) const { | ||||
auto sub_args = args; | |||||
TensorLayout diff_pg, grad_pg; | |||||
modify_size_args(sub_args, diff_pg, grad_pg); | |||||
return m_impl->get_workspace_in_bytes(sub_args); | |||||
return get_workspace_bundle(nullptr, args).total_size_in_bytes(); | |||||
} | } | ||||
void ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::exec( | void ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::exec( | ||||
const ExecArgs& args) const { | const ExecArgs& args) const { | ||||
auto sub_args = args; | |||||
TensorND tflt{*args.filter_tensor}, tdiff{*args.diff_tensor}, | |||||
tgrad{*args.grad_tensor}; | |||||
modify_size_args(sub_args, tdiff.layout, tgrad.layout); | |||||
sub_args.filter_tensor = &tflt; | |||||
sub_args.diff_tensor = &tdiff; | |||||
sub_args.grad_tensor = &tgrad; | |||||
auto grp = args.filter_meta.group; | |||||
auto&& fm = args.filter_meta; | |||||
auto strd_flt = (fm.icpg * fm.ocpg * fm.spatial[0] * fm.spatial[1] * | |||||
tflt.layout.dtype.size()), | |||||
strd_diff = | |||||
(tdiff.layout.stride[1] * fm.ocpg * tdiff.layout.dtype.size()), | |||||
strd_grad = | |||||
(tgrad.layout.stride[1] * fm.icpg * tgrad.layout.dtype.size()); | |||||
for (uint32_t g = 0; g < grp; ++g) { | |||||
m_impl->exec(sub_args); | |||||
incr_voidp(tflt.raw_ptr, strd_flt); | |||||
incr_voidp(tdiff.raw_ptr, strd_diff); | |||||
incr_voidp(tgrad.raw_ptr, strd_grad); | |||||
auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args); | |||||
{ | |||||
auto config = prepare_sub_opr(args); | |||||
TensorND tfilter{args.filter_tensor->raw_ptr, config.first[0]}; | |||||
TensorND tdiff{args.diff_tensor->raw_ptr, config.first[1]}; | |||||
TensorND tgrad{args.grad_tensor->raw_ptr, config.first[2]}; | |||||
size_t c_pos = 1; | |||||
auto&& fm = args.filter_meta; | |||||
auto strd_flt = fm.icpg * fm.ocpg * fm.spatial[0] * fm.spatial[1] * | |||||
tfilter.layout.dtype.size(), | |||||
strd_diff = tdiff.layout.stride[c_pos] * fm.ocpg * | |||||
tdiff.layout.dtype.size(), | |||||
strd_grad = (tgrad.layout.stride[c_pos] * fm.icpg * | |||||
tgrad.layout.dtype.size()); | |||||
auto grp = args.filter_meta.group; | |||||
for (uint32_t g = 0; g < grp; ++g) { | |||||
config.second->exec(tfilter, tdiff, tgrad, bundle.get_workspace(0)); | |||||
incr_voidp(tfilter.raw_ptr, strd_flt); | |||||
incr_voidp(tdiff.raw_ptr, strd_diff); | |||||
incr_voidp(tgrad.raw_ptr, strd_grad); | |||||
} | |||||
} | } | ||||
} | } | ||||
@@ -26,23 +26,8 @@ ConvolutionBackwardFilterImpl::AlgoPack::AlgoPack() { | |||||
all_algos.push_back(&i); | all_algos.push_back(&i); | ||||
} | } | ||||
all_algos.push_back(&matmul); | all_algos.push_back(&matmul); | ||||
all_algos.push_back(&group); | |||||
all_algos.reserve(all_algos.size() * 2); | |||||
// add gconv algos by AlgoGroupConvGeneral | |||||
auto all_algos_data = all_algos.data(); | |||||
for (size_t i = 1; i < all_algos.size(); ++ i) { | |||||
gconv.push_back({all_algos[i]}); | |||||
} | |||||
for (size_t i = 1; i < all_algos.size(); ++ i) { | |||||
algo2gconv[all_algos[i]] = &gconv[i - 1]; | |||||
} | |||||
for (auto &&i: gconv) { | |||||
all_algos.push_back(&i); | |||||
} | |||||
megdnn_assert(all_algos_data == all_algos.data()); | |||||
non_cudnn_algos.push_back(all_algos.rbegin()[0]); // group matmul | |||||
all_algos.push_back(&bfloat16); | all_algos.push_back(&bfloat16); | ||||
bfloat16_algos.push_back(&bfloat16); | bfloat16_algos.push_back(&bfloat16); | ||||
@@ -68,7 +53,7 @@ ConvolutionBackwardFilterImpl::AlgoPack | |||||
ConvolutionBackwardFilterImpl::sm_algo_pack; | ConvolutionBackwardFilterImpl::sm_algo_pack; | ||||
ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs( | ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs( | ||||
ConvolutionBackwardFilterImpl *o, | |||||
const ConvolutionBackwardFilterImpl *o, | |||||
const TensorLayout &src, const TensorLayout &diff, | const TensorLayout &src, const TensorLayout &diff, | ||||
const TensorLayout &grad): | const TensorLayout &grad): | ||||
SizeArgs(o, src, diff, grad, o->make_canonized_filter_meta(src.ndim, grad)) | SizeArgs(o, src, diff, grad, o->make_canonized_filter_meta(src.ndim, grad)) | ||||
@@ -76,7 +61,7 @@ ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs( | |||||
} | } | ||||
ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs( | ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs( | ||||
ConvolutionBackwardFilterImpl* o, const TensorLayout& src, | |||||
const ConvolutionBackwardFilterImpl* o, const TensorLayout& src, | |||||
const TensorLayout& diff, const TensorLayout& grad, | const TensorLayout& diff, const TensorLayout& grad, | ||||
const CanonizedFilterMeta& grad_meta) | const CanonizedFilterMeta& grad_meta) | ||||
: handle{concrete_handle(o->handle())}, | : handle{concrete_handle(o->handle())}, | ||||
@@ -87,7 +72,7 @@ ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs( | |||||
opr{o} {} | opr{o} {} | ||||
ConvolutionBackwardFilterImpl::AlgoBase::ExecArgs::ExecArgs( | ConvolutionBackwardFilterImpl::AlgoBase::ExecArgs::ExecArgs( | ||||
ConvolutionBackwardFilterImpl *opr, | |||||
const ConvolutionBackwardFilterImpl *opr, | |||||
_megdnn_tensor_in src, | _megdnn_tensor_in src, | ||||
_megdnn_tensor_in diff, | _megdnn_tensor_in diff, | ||||
_megdnn_tensor_out grad, | _megdnn_tensor_out grad, | ||||
@@ -45,16 +45,18 @@ public: | |||||
HandleImpl* handle; | HandleImpl* handle; | ||||
const TensorLayout *src_layout, *diff_layout, *grad_layout; | const TensorLayout *src_layout, *diff_layout, *grad_layout; | ||||
CanonizedFilterMeta grad_filter_meta; | CanonizedFilterMeta grad_filter_meta; | ||||
ConvolutionBackwardFilterImpl* opr; | |||||
const ConvolutionBackwardFilterImpl* opr; | |||||
std::string to_string() const; | std::string to_string() const; | ||||
void init_desc(convolution::CUDNNBwdFilterDescs& desc) const { | void init_desc(convolution::CUDNNBwdFilterDescs& desc) const { | ||||
desc.set(*src_layout, *diff_layout, grad_filter_meta, opr->param()); | desc.set(*src_layout, *diff_layout, grad_filter_meta, opr->param()); | ||||
} | } | ||||
SizeArgs(ConvolutionBackwardFilterImpl* opr, const TensorLayout& src, | |||||
const TensorLayout& diff, const TensorLayout& grad); | |||||
SizeArgs(ConvolutionBackwardFilterImpl* opr, const TensorLayout& src, | |||||
const TensorLayout& diff, const TensorLayout& grad, | |||||
SizeArgs(const ConvolutionBackwardFilterImpl* opr, | |||||
const TensorLayout& src, const TensorLayout& diff, | |||||
const TensorLayout& grad); | |||||
SizeArgs(const ConvolutionBackwardFilterImpl* opr, | |||||
const TensorLayout& src, const TensorLayout& diff, | |||||
const TensorLayout& grad, | |||||
const CanonizedFilterMeta& grad_meta); | const CanonizedFilterMeta& grad_meta); | ||||
convolution::ForwardSizeArgs as_fwd_args() const { | convolution::ForwardSizeArgs as_fwd_args() const { | ||||
@@ -66,9 +68,9 @@ public: | |||||
const TensorND *src_tensor, *diff_tensor, *grad_tensor; | const TensorND *src_tensor, *diff_tensor, *grad_tensor; | ||||
Workspace workspace; | Workspace workspace; | ||||
ExecArgs(ConvolutionBackwardFilterImpl* opr, _megdnn_tensor_in src, | |||||
_megdnn_tensor_in diff, _megdnn_tensor_out grad, | |||||
_megdnn_workspace workspace); | |||||
ExecArgs(const ConvolutionBackwardFilterImpl* opr, | |||||
_megdnn_tensor_in src, _megdnn_tensor_in diff, | |||||
_megdnn_tensor_out grad, _megdnn_workspace workspace); | |||||
}; | }; | ||||
virtual bool is_available(const SizeArgs& args) const = 0; | virtual bool is_available(const SizeArgs& args) const = 0; | ||||
virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0; | virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0; | ||||
@@ -203,29 +205,25 @@ private: | |||||
//! implement group conv by another algo | //! implement group conv by another algo | ||||
class ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral final | class ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral final | ||||
: public AlgoBase { | : public AlgoBase { | ||||
AlgoBase* m_impl; | |||||
std::string m_name; | |||||
public: | public: | ||||
AlgoGroupConvGeneral(AlgoBase* impl); | |||||
bool is_available(const SizeArgs& args) const override; | bool is_available(const SizeArgs& args) const override; | ||||
size_t get_workspace_in_bytes(const SizeArgs& args) const override; | size_t get_workspace_in_bytes(const SizeArgs& args) const override; | ||||
void exec(const ExecArgs& args) const override; | void exec(const ExecArgs& args) const override; | ||||
std::vector<SearchItem> get_subopr_list( | |||||
const TensorLayoutArray& layouts, | |||||
const OperatorBase* opr) const override; | |||||
const char* name() const override { return m_name.c_str(); } | |||||
static void modify_size_args(SizeArgs& args, TensorLayout& src_pg, | |||||
TensorLayout& diff_pg); | |||||
const char* name() const override { | |||||
return "CUDA:GROUP_CONV_BACKWARD_FILTER"; | |||||
} | |||||
MEGDNN_DECL_ALGO_TYPE(CUDA_GROUP_CONV_GENERAL) | MEGDNN_DECL_ALGO_TYPE(CUDA_GROUP_CONV_GENERAL) | ||||
AlgoAttribute attribute() const override { | AlgoAttribute attribute() const override { | ||||
auto ret = static_cast<AlgoAttribute>(0); | |||||
if (m_impl->contain_attribute_all(AlgoAttribute::REPRODUCIBLE)) { | |||||
ret |= AlgoAttribute::REPRODUCIBLE; | |||||
} | |||||
return ret; | |||||
return AlgoAttribute::REPRODUCIBLE; | |||||
} | } | ||||
private: | |||||
WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const; | |||||
}; | }; | ||||
class ConvolutionBackwardFilterImpl::AlgoPack : NonCopyableObj { | class ConvolutionBackwardFilterImpl::AlgoPack : NonCopyableObj { | ||||
@@ -240,8 +238,7 @@ public: | |||||
std::vector<AlgoCUDNN> cudnn; | std::vector<AlgoCUDNN> cudnn; | ||||
AlgoMatmul matmul; | AlgoMatmul matmul; | ||||
AlgoChanwise chanwise; | AlgoChanwise chanwise; | ||||
std::vector<AlgoGroupConvGeneral> gconv; | |||||
std::unordered_map<AlgoBase*, AlgoGroupConvGeneral*> algo2gconv; | |||||
AlgoGroupConvGeneral group; | |||||
AlgoBFloat16 bfloat16; | AlgoBFloat16 bfloat16; | ||||
std::vector<AlgoBase*> | std::vector<AlgoBase*> | ||||
@@ -15,25 +15,63 @@ using namespace megdnn; | |||||
using namespace cuda; | using namespace cuda; | ||||
using namespace convolution; | using namespace convolution; | ||||
void ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::modify_size_args( | |||||
ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs &args, | |||||
TensorLayout &src_pg, TensorLayout &diff_pg) { | |||||
src_pg = *args.src_layout; | |||||
diff_pg = *args.diff_layout; | |||||
namespace { | |||||
std::pair<TensorLayoutArray, Convolution::Param> sub_opr_config( | |||||
const ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs& args) { | |||||
SmallVector<size_t> flt_shape(0); | |||||
std::vector<ptrdiff_t> flt_stride(0); | |||||
size_t idx = 0; | |||||
// check if the first dim is group | |||||
if (args.grad_layout->ndim > args.diff_layout->ndim) | |||||
++idx; | |||||
for (; idx < args.grad_layout->ndim; ++idx) { | |||||
flt_shape.push_back(args.grad_layout->shape[idx]); | |||||
flt_stride.push_back(args.grad_layout->stride[idx]); | |||||
} | |||||
TensorLayout filter_pg(flt_shape, flt_stride, args.grad_layout->dtype, | |||||
args.grad_layout->format); | |||||
TensorLayout src_pg = *args.src_layout; | |||||
TensorLayout diff_pg = *args.diff_layout; | |||||
auto nr_grp = args.grad_filter_meta.group; | auto nr_grp = args.grad_filter_meta.group; | ||||
args.grad_filter_meta.group = 1; | |||||
src_pg.shape[1] /= nr_grp; | |||||
diff_pg.shape[1] /= nr_grp; | |||||
args.src_layout = &src_pg; | |||||
args.diff_layout = &diff_pg; | |||||
size_t c_pos = 1; | |||||
src_pg.shape[c_pos] /= nr_grp; | |||||
diff_pg.shape[c_pos] /= nr_grp; | |||||
megdnn::param::Convolution param = args.opr->param(); | |||||
param.sparse = megdnn::param::ConvBias::Sparse::DENSE; | |||||
std::pair<TensorLayoutArray, ConvolutionBackwardFilterImpl::Param> ret; | |||||
ret.first = {src_pg, diff_pg, filter_pg}; | |||||
ret.second = param; | |||||
return ret; | |||||
} | } | ||||
ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::AlgoGroupConvGeneral( | |||||
AlgoBase *impl): | |||||
m_impl{impl} | |||||
{ | |||||
m_name = "group_conv:"; | |||||
m_name += impl->name(); | |||||
std::pair<TensorLayoutArray, std::unique_ptr<ConvolutionBackwardFilter>> | |||||
prepare_sub_opr(const ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs& args) { | |||||
auto conv_bwd_filter_opr = | |||||
args.handle->create_operator<ConvolutionBackwardFilter>(); | |||||
set_execution_policy<ConvolutionBackwardFilter, ConvolutionBackwardFilter*>( | |||||
args.opr, conv_bwd_filter_opr.get()); | |||||
auto&& config = sub_opr_config(args); | |||||
conv_bwd_filter_opr->param() = config.second; | |||||
return {config.first, std::move(conv_bwd_filter_opr)}; | |||||
} | |||||
} // namespace | |||||
std::vector<Algorithm::SearchItem> | |||||
ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::get_subopr_list( | |||||
const TensorLayoutArray& layouts, const OperatorBase* opr) const { | |||||
AlgoBase::SizeArgs args{ | |||||
static_cast<const ConvolutionBackwardFilterImpl*>(opr), layouts[0], | |||||
layouts[1], layouts[2]}; | |||||
auto&& config = sub_opr_config(args); | |||||
std::string param_str; | |||||
Algorithm::serialize_write_pod(config.second, param_str); | |||||
return {{Algorithm::OprType::CONVOLUTION_BACKWARD_FILTER, param_str, | |||||
config.first}}; | |||||
} | } | ||||
bool ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::is_available( | bool ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::is_available( | ||||
@@ -44,44 +82,60 @@ bool ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::is_available( | |||||
} | } | ||||
if (args.grad_filter_meta.group <= 1) | if (args.grad_filter_meta.group <= 1) | ||||
return false; | return false; | ||||
auto sub_args = args; | |||||
TensorLayout src_pg, diff_pg; | |||||
modify_size_args(sub_args, src_pg, diff_pg); | |||||
return m_impl->is_available(sub_args); | |||||
if (args.grad_filter_meta.format != | |||||
megdnn::param::Convolution::Format::NCHW) { | |||||
return false; | |||||
} | |||||
auto config = prepare_sub_opr(args); | |||||
return get_algorithm( | |||||
static_cast<ConvolutionBackwardFilterImpl*>(config.second.get()), | |||||
config.first[0], config.first[1], config.first[2]); | |||||
} | } | ||||
size_t ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral:: | |||||
get_workspace_in_bytes(const SizeArgs &args) const { | |||||
auto sub_args = args; | |||||
TensorLayout src_pg, diff_pg; | |||||
modify_size_args(sub_args, src_pg, diff_pg); | |||||
return m_impl->get_workspace_in_bytes(sub_args); | |||||
WorkspaceBundle | |||||
ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::get_workspace_bundle( | |||||
void* ptr, const SizeArgs& args) const { | |||||
auto config = prepare_sub_opr(args); | |||||
size_t sizes = config.second->get_workspace_in_bytes( | |||||
config.first[0], config.first[1], config.first[2]); | |||||
return {ptr, {sizes}}; | |||||
} | |||||
size_t | |||||
ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::get_workspace_in_bytes( | |||||
const SizeArgs& args) const { | |||||
return get_workspace_bundle(nullptr, args).total_size_in_bytes(); | |||||
} | } | ||||
void ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::exec( | void ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::exec( | ||||
const ExecArgs &args) const { | |||||
auto sub_args = args; | |||||
TensorND tsrc{*args.src_tensor}, tdiff{*args.diff_tensor}, | |||||
tgrad{*args.grad_tensor}; | |||||
modify_size_args(sub_args, tsrc.layout, tdiff.layout); | |||||
sub_args.src_tensor = &tsrc; | |||||
sub_args.diff_tensor = &tdiff; | |||||
sub_args.grad_tensor = &tgrad; | |||||
auto &&fm = args.grad_filter_meta; | |||||
auto grp = fm.group; | |||||
auto strd_src = ( | |||||
tsrc.layout.stride[1] * fm.icpg * tsrc.layout.dtype.size()), | |||||
strd_diff = ( | |||||
tdiff.layout.stride[1] * fm.ocpg * tdiff.layout.dtype.size()), | |||||
strd_grad = (fm.icpg * fm.ocpg * | |||||
fm.spatial[0] * fm.spatial[1] * tgrad.layout.dtype.size()); | |||||
for (uint32_t g = 0; g < grp; ++ g) { | |||||
m_impl->exec(sub_args); | |||||
incr_voidp(tsrc.raw_ptr, strd_src); | |||||
incr_voidp(tdiff.raw_ptr, strd_diff); | |||||
incr_voidp(tgrad.raw_ptr, strd_grad); | |||||
const ExecArgs& args) const { | |||||
auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args); | |||||
{ | |||||
auto config = prepare_sub_opr(args); | |||||
TensorND tsrc{args.src_tensor->raw_ptr, config.first[0]}; | |||||
TensorND tdiff{args.diff_tensor->raw_ptr, config.first[1]}; | |||||
TensorND tgrad{args.grad_tensor->raw_ptr, config.first[2]}; | |||||
size_t c_pos = 1; | |||||
auto&& fm = args.grad_filter_meta; | |||||
auto strd_src = tsrc.layout.stride[c_pos] * fm.icpg * | |||||
tsrc.layout.dtype.size(), | |||||
strd_diff = tdiff.layout.stride[c_pos] * fm.ocpg * | |||||
tdiff.layout.dtype.size(), | |||||
strd_grad = fm.icpg * fm.ocpg * fm.spatial[0] * fm.spatial[1] * | |||||
tgrad.layout.dtype.size(); | |||||
auto grp = fm.group; | |||||
for (uint32_t g = 0; g < grp; ++g) { | |||||
config.second->exec(tsrc, tdiff, tgrad, bundle.get_workspace(0)); | |||||
incr_voidp(tsrc.raw_ptr, strd_src); | |||||
incr_voidp(tdiff.raw_ptr, strd_diff); | |||||
incr_voidp(tgrad.raw_ptr, strd_grad); | |||||
} | |||||
} | } | ||||
} | } | ||||
@@ -104,19 +104,7 @@ ConvolutionBackwardDataImpl::get_algorithm_heuristic( | |||||
const TensorLayout& grad, size_t workspace_limit_in_bytes, | const TensorLayout& grad, size_t workspace_limit_in_bytes, | ||||
const AlgoAttribute& positive_attr, | const AlgoAttribute& positive_attr, | ||||
const AlgoAttribute& negative_attr) { | const AlgoAttribute& negative_attr) { | ||||
auto fm = check_layout_fwd(grad, filter, diff); | |||||
return get_algorithm_heuristic(filter, fm, diff, grad, | |||||
workspace_limit_in_bytes, positive_attr, | |||||
negative_attr); | |||||
} | |||||
ConvolutionBackwardDataImpl::Algorithm* | |||||
ConvolutionBackwardDataImpl::get_algorithm_heuristic(const TensorLayout& filter, | |||||
const CanonizedFilterMeta& filter_meta, const TensorLayout& diff, | |||||
const TensorLayout& grad, size_t workspace_limit_in_bytes, | |||||
const AlgoAttribute& positive_attr, | |||||
const AlgoAttribute& negative_attr) { | |||||
AlgoBase::SizeArgs args(this, filter, filter_meta, diff, grad); | |||||
AlgoBase::SizeArgs args(this, filter, diff, grad); | |||||
if (args.filter_meta.group > 1 && | if (args.filter_meta.group > 1 && | ||||
sm_algo_pack.chanwise.is_available_attribute( | sm_algo_pack.chanwise.is_available_attribute( | ||||
@@ -186,14 +174,11 @@ ConvolutionBackwardDataImpl::get_algorithm_heuristic(const TensorLayout& filter, | |||||
} | } | ||||
if (args.filter_meta.group > 1) { | if (args.filter_meta.group > 1) { | ||||
auto orig_args = args; | |||||
TensorLayout a, b; | |||||
AlgoGroupConvGeneral::modify_size_args(args, a, b); | |||||
if (is_cudnn_supported(args.as_fwd_args())) { | |||||
if (auto algo = get_cudnn_algo()) | |||||
return sm_algo_pack.algo2gconv.at(algo); | |||||
if (auto algo = megdnn::get_algo_match_attribute< | |||||
ConvolutionBackwardDataImpl>( | |||||
&sm_algo_pack.group, positive_attr, negative_attr)) { | |||||
return algo; | |||||
} | } | ||||
args = orig_args; | |||||
} | } | ||||
if (args.filter_layout->dtype.enumv() != | if (args.filter_layout->dtype.enumv() != | ||||
@@ -212,7 +197,7 @@ size_t ConvolutionBackwardDataImpl::get_workspace_in_bytes( | |||||
const TensorLayout& filter, const TensorLayout& diff, | const TensorLayout& filter, const TensorLayout& diff, | ||||
const TensorLayout& grad) { | const TensorLayout& grad) { | ||||
AlgoBase::SizeArgs args(this, filter, diff, grad); | AlgoBase::SizeArgs args(this, filter, diff, grad); | ||||
return get_algorithm(this, filter, args.filter_meta, diff, grad) | |||||
return get_algorithm(this, filter, diff, grad) | |||||
->get_workspace_in_bytes(args); | ->get_workspace_in_bytes(args); | ||||
} | } | ||||
@@ -227,8 +212,7 @@ void ConvolutionBackwardFilterImpl::exec(_megdnn_tensor_in src, | |||||
_megdnn_tensor_out grad, | _megdnn_tensor_out grad, | ||||
_megdnn_workspace workspace) { | _megdnn_workspace workspace) { | ||||
AlgoBase::ExecArgs args(this, src, diff, grad, workspace); | AlgoBase::ExecArgs args(this, src, diff, grad, workspace); | ||||
auto algo = get_algorithm(this, src.layout, diff.layout, grad.layout, | |||||
args.grad_filter_meta); | |||||
auto algo = get_algorithm(this, src.layout, diff.layout, grad.layout); | |||||
algo->check_workspace(args, workspace).exec(args); | algo->check_workspace(args, workspace).exec(args); | ||||
} | } | ||||
@@ -246,20 +230,7 @@ ConvolutionBackwardFilterImpl::get_algorithm_heuristic( | |||||
const TensorLayout& grad, size_t workspace_limit_in_bytes, | const TensorLayout& grad, size_t workspace_limit_in_bytes, | ||||
const AlgoAttribute& positive_attr, | const AlgoAttribute& positive_attr, | ||||
const AlgoAttribute& negative_attr) { | const AlgoAttribute& negative_attr) { | ||||
auto fm = check_layout_fwd(src, grad, diff); | |||||
return get_algorithm_heuristic(src, diff, grad, fm, | |||||
workspace_limit_in_bytes, positive_attr, | |||||
negative_attr); | |||||
} | |||||
ConvolutionBackwardFilterImpl::Algorithm* | |||||
ConvolutionBackwardFilterImpl::get_algorithm_heuristic( | |||||
const TensorLayout& src, const TensorLayout& diff, | |||||
const TensorLayout& grad, const CanonizedFilterMeta& grad_meta, | |||||
size_t workspace_limit_in_bytes, | |||||
const AlgoAttribute& positive_attr, | |||||
const AlgoAttribute& negative_attr) { | |||||
AlgoBase::SizeArgs args(this, src, diff, grad, grad_meta); | |||||
AlgoBase::SizeArgs args(this, src, diff, grad); | |||||
if (args.grad_filter_meta.group > 1 && | if (args.grad_filter_meta.group > 1 && | ||||
sm_algo_pack.chanwise.is_available_attribute( | sm_algo_pack.chanwise.is_available_attribute( | ||||
@@ -332,14 +303,11 @@ ConvolutionBackwardFilterImpl::get_algorithm_heuristic( | |||||
} | } | ||||
if (args.grad_filter_meta.group > 1) { | if (args.grad_filter_meta.group > 1) { | ||||
auto orig_args = args; | |||||
TensorLayout a, b; | |||||
AlgoGroupConvGeneral::modify_size_args(args, a, b); | |||||
if (is_cudnn_supported(args.as_fwd_args())) { | |||||
if (auto algo = get_cudnn_algo()) | |||||
return sm_algo_pack.algo2gconv.at(algo); | |||||
if (auto algo = megdnn::get_algo_match_attribute< | |||||
ConvolutionBackwardFilterImpl>( | |||||
&sm_algo_pack.group, positive_attr, negative_attr)) { | |||||
return algo; | |||||
} | } | ||||
args = orig_args; | |||||
} | } | ||||
if (args.src_layout->dtype.enumv() != DTypeTrait<dtype::BFloat16>::enumv) { | if (args.src_layout->dtype.enumv() != DTypeTrait<dtype::BFloat16>::enumv) { | ||||
@@ -357,7 +325,7 @@ size_t ConvolutionBackwardFilterImpl::get_workspace_in_bytes( | |||||
const TensorLayout& src, const TensorLayout& diff, | const TensorLayout& src, const TensorLayout& diff, | ||||
const TensorLayout& grad) { | const TensorLayout& grad) { | ||||
AlgoBase::SizeArgs args(this, src, diff, grad); | AlgoBase::SizeArgs args(this, src, diff, grad); | ||||
return get_algorithm(this, src, diff, grad, args.grad_filter_meta) | |||||
return get_algorithm(this, src, diff, grad) | |||||
->get_workspace_in_bytes(args); | ->get_workspace_in_bytes(args); | ||||
} | } | ||||
@@ -75,17 +75,6 @@ public: | |||||
void exec(_megdnn_tensor_in filter, _megdnn_tensor_in diff, | void exec(_megdnn_tensor_in filter, _megdnn_tensor_in diff, | ||||
_megdnn_tensor_out grad, _megdnn_workspace workspace) override; | _megdnn_tensor_out grad, _megdnn_workspace workspace) override; | ||||
AlgorithmInfo get_algorithm_info_heuristic( | AlgorithmInfo get_algorithm_info_heuristic( | ||||
const TensorLayout& filter, const CanonizedFilterMeta& filter_meta, | |||||
const TensorLayout& diff, const TensorLayout& grad, | |||||
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, | |||||
const AlgoAttribute& negative_attr) { | |||||
return get_algorithm_heuristic(filter, filter_meta, diff, grad, | |||||
workspace_limit_in_bytes, positive_attr, | |||||
negative_attr) | |||||
->info(); | |||||
} | |||||
AlgorithmInfo get_algorithm_info_heuristic( | |||||
const TensorLayout& filter, const TensorLayout& diff, | const TensorLayout& filter, const TensorLayout& diff, | ||||
const TensorLayout& grad, size_t workspace_limit_in_bytes, | const TensorLayout& grad, size_t workspace_limit_in_bytes, | ||||
const AlgoAttribute& positive_attr, | const AlgoAttribute& positive_attr, | ||||
@@ -128,14 +117,6 @@ protected: | |||||
const AlgoAttribute& negative_attr) override; | const AlgoAttribute& negative_attr) override; | ||||
private: | private: | ||||
Algorithm* get_algorithm_heuristic(const TensorLayout& filter, | |||||
const CanonizedFilterMeta& filter_meta, | |||||
const TensorLayout& diff, | |||||
const TensorLayout& grad, | |||||
size_t workspace_limit_in_bytes, | |||||
const AlgoAttribute& positive_attr, | |||||
const AlgoAttribute& negative_attr); | |||||
static AlgoPack sm_algo_pack; | static AlgoPack sm_algo_pack; | ||||
}; | }; | ||||
@@ -148,17 +129,6 @@ public: | |||||
const TensorLayout& diff, | const TensorLayout& diff, | ||||
const TensorLayout& grad) override; | const TensorLayout& grad) override; | ||||
AlgorithmInfo get_algorithm_info_heuristic( | AlgorithmInfo get_algorithm_info_heuristic( | ||||
const TensorLayout& src, const TensorLayout& diff, | |||||
const TensorLayout& grad, const CanonizedFilterMeta& grad_meta, | |||||
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, | |||||
const AlgoAttribute& negative_attr) { | |||||
return get_algorithm_heuristic(src, diff, grad, grad_meta, | |||||
workspace_limit_in_bytes, positive_attr, | |||||
negative_attr) | |||||
->info(); | |||||
} | |||||
AlgorithmInfo get_algorithm_info_heuristic( | |||||
const TensorLayout& filter, const TensorLayout& diff, | const TensorLayout& filter, const TensorLayout& diff, | ||||
const TensorLayout& grad, size_t workspace_limit_in_bytes, | const TensorLayout& grad, size_t workspace_limit_in_bytes, | ||||
const AlgoAttribute& positive_attr, | const AlgoAttribute& positive_attr, | ||||
@@ -195,14 +165,6 @@ protected: | |||||
const AlgoAttribute& negative_attr) override; | const AlgoAttribute& negative_attr) override; | ||||
private: | private: | ||||
Algorithm* get_algorithm_heuristic(const TensorLayout& src, | |||||
const TensorLayout& diff, | |||||
const TensorLayout& grad, | |||||
const CanonizedFilterMeta& grad_meta, | |||||
size_t workspace_limit_in_bytes, | |||||
const AlgoAttribute& positive_attr, | |||||
const AlgoAttribute& negative_attr); | |||||
static AlgoPack sm_algo_pack; | static AlgoPack sm_algo_pack; | ||||
}; | }; | ||||
@@ -1034,10 +1034,11 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_GROUP) { | |||||
// float case | // float case | ||||
Checker<ConvBiasForward> checker(handle_cuda()); | Checker<ConvBiasForward> checker(handle_cuda()); | ||||
checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker< | checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker< | ||||
ConvBias>( | |||||
ConvBias>(ExecutionPolicyAlgoName{ | |||||
ConvBiasForward::algo_name<ConvBiasForward::DirectParam>( | ConvBiasForward::algo_name<ConvBiasForward::DirectParam>( | ||||
"CUDA:GROUP_CONV", {}) | "CUDA:GROUP_CONV", {}) | ||||
.c_str())); | |||||
.c_str(), | |||||
{{"CUDNN", {}}}})); | |||||
ConvBias::Param param; | ConvBias::Param param; | ||||
param.sparse = ConvBias::Param::Sparse::GROUP; | param.sparse = ConvBias::Param::Sparse::GROUP; | ||||
param.nonlineMode = mode; | param.nonlineMode = mode; | ||||
@@ -108,39 +108,33 @@ TEST_F(CUDA, GROUP_CONV_FORWARD) | |||||
} | } | ||||
TEST_F(CUDA, GROUP_CONV_FORWARD_1x1) { | TEST_F(CUDA, GROUP_CONV_FORWARD_1x1) { | ||||
auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, | |||||
size_t FH, size_t FW, | |||||
size_t OC, size_t group) { | |||||
auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t FH, | |||||
size_t FW, size_t OC, size_t group) { | |||||
Checker<Convolution> checker(handle_cuda()); | Checker<Convolution> checker(handle_cuda()); | ||||
#if CUDNN_MAJOR <= 6 | |||||
std::string conv1x1_name = | std::string conv1x1_name = | ||||
ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>( | |||||
"BATCHEDMATMUL", {}); | |||||
checker.set_before_exec_callback( | |||||
AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{ | |||||
"DEFAULT", | |||||
{{ConvBiasForward::algo_name< | |||||
ConvBiasForward::DirectParam>( | |||||
ssprintf("%s:%s", "CUDA:GROUP_CONV", | |||||
conv1x1_name.c_str()) | |||||
.c_str(), | |||||
{}) | |||||
.c_str(), | |||||
{}}}})); | |||||
#endif | |||||
ConvBiasForward::algo_name<ConvBias::MatmulParam>( | |||||
"INPLACE_MATMUL", {}); | |||||
checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>( | |||||
ExecutionPolicyAlgoName{"DEFAULT", | |||||
{{ConvBiasForward::algo_name< | |||||
ConvBiasForward::DirectParam>( | |||||
"CUDA:GROUP_CONV", {}) | |||||
.c_str(), | |||||
{{conv1x1_name.c_str(), {}}}}}})); | |||||
Convolution::Param param; | Convolution::Param param; | ||||
param.sparse = Convolution::Param::Sparse::GROUP; | param.sparse = Convolution::Param::Sparse::GROUP; | ||||
auto ICg = IC / group; | auto ICg = IC / group; | ||||
auto OCg = OC / group; | auto OCg = OC / group; | ||||
checker.set_param(param).exec({{N, IC, IH, IW}, | |||||
{group, OCg, ICg, FH, FW}, {}}); | |||||
checker.set_param(param).exec( | |||||
{{N, IC, IH, IW}, {group, OCg, ICg, FH, FW}, {}}); | |||||
}; | }; | ||||
size_t ic = 192; | size_t ic = 192; | ||||
for (size_t g = 2; g <= 3; g += 1) { | for (size_t g = 2; g <= 3; g += 1) { | ||||
for (size_t ih = 8; ih <= 128; ih *= 4) { | for (size_t ih = 8; ih <= 128; ih *= 4) { | ||||
size_t iw = ih; | size_t iw = ih; | ||||
run(2, ic, ih, iw, 1, 1, ic / g, g); | run(2, ic, ih, iw, 1, 1, ic / g, g); | ||||
run(2, ic, ih+1, iw+1, 1, 1, ic / g, g); | |||||
run(2, ic, ih + 1, iw + 1, 1, 1, ic / g, g); | |||||
} | } | ||||
} | } | ||||
} | } | ||||
@@ -189,6 +183,54 @@ TEST_F(CUDA, GROUP_CONV_BACKWARD_DATA) | |||||
8); | 8); | ||||
} | } | ||||
TEST_F(CUDA, GROUP_CONV_BACKWARD_DATA_CUDNN) | |||||
{ | |||||
auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, | |||||
size_t FH, size_t FW, | |||||
size_t OC, size_t OH, size_t OW, | |||||
size_t PH, size_t PW, | |||||
size_t SH, size_t SW, | |||||
size_t group) | |||||
{ | |||||
Checker<ConvolutionBackwardData> checker(handle_cuda()); | |||||
checker.set_before_exec_callback( | |||||
AlgoChecker<ConvolutionBackwardData>(ExecutionPolicyAlgoName{ | |||||
"CUDA:GROUP_CONV_BACKWARD_DATA", {{"CUDNN", {}}}})); | |||||
ConvolutionBackwardData::Param param; | |||||
param.sparse = Convolution::Param::Sparse::GROUP; | |||||
param.pad_h = PH; | |||||
param.pad_w = PW; | |||||
param.stride_h = SH; | |||||
param.stride_w = SW; | |||||
auto ICg = IC / group; | |||||
auto OCg = OC / group; | |||||
checker.set_param(param).exec({{group, OCg, ICg, FH, FW}, | |||||
{N, OC, OH, OW}, {N, IC, IH, IW}}); | |||||
}; | |||||
// normal case | |||||
run(2, 64, 7, 7, | |||||
3, 3, | |||||
32, 5, 5, | |||||
0, 0, | |||||
1, 1, | |||||
2); | |||||
// padded case | |||||
run(2, 32, 7, 7, | |||||
3, 3, | |||||
64, 7, 7, | |||||
1, 1, | |||||
1, 1, | |||||
4); | |||||
// strided case | |||||
run(2, 32, 7, 7, | |||||
3, 3, | |||||
64, 3, 3, | |||||
0, 0, | |||||
2, 2, | |||||
8); | |||||
} | |||||
TEST_F(CUDA, GROUP_CONV_BACKWARD_FILTER) | TEST_F(CUDA, GROUP_CONV_BACKWARD_FILTER) | ||||
{ | { | ||||
auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, | auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, | ||||
@@ -233,6 +275,52 @@ TEST_F(CUDA, GROUP_CONV_BACKWARD_FILTER) | |||||
8); | 8); | ||||
} | } | ||||
TEST_F(CUDA, GROUP_CONV_BACKWARD_FILTER_CUDNN) | |||||
{ | |||||
auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, | |||||
size_t FH, size_t FW, | |||||
size_t OC, size_t OH, size_t OW, | |||||
size_t PH, size_t PW, | |||||
size_t SH, size_t SW, | |||||
size_t group) | |||||
{ | |||||
Checker<ConvolutionBackwardFilter> checker(handle_cuda()); | |||||
checker.set_before_exec_callback( | |||||
AlgoChecker<ConvolutionBackwardFilter>(ExecutionPolicyAlgoName{ | |||||
"CUDA:GROUP_CONV_BACKWARD_FILTER", {{"CUDNN", {}}}})); | |||||
ConvolutionBackwardFilter::Param param; | |||||
param.sparse = Convolution::Param::Sparse::GROUP; | |||||
param.pad_h = PH; | |||||
param.pad_w = PW; | |||||
param.stride_h = SH; | |||||
param.stride_w = SW; | |||||
auto ICg = IC / group; | |||||
auto OCg = OC / group; | |||||
checker.set_param(param).exec({{N, IC, IH, IW}, | |||||
{N, OC, OH, OW}, {group, OCg, ICg, FH, FW}}); | |||||
}; | |||||
// normal case | |||||
run(2, 64, 7, 7, | |||||
3, 3, | |||||
32, 5, 5, | |||||
0, 0, | |||||
1, 1, | |||||
2); | |||||
// padded case | |||||
run(2, 32, 7, 7, | |||||
3, 3, | |||||
64, 7, 7, | |||||
1, 1, | |||||
1, 1, | |||||
4); | |||||
// strided case | |||||
run(2, 32, 7, 7, | |||||
3, 3, | |||||
64, 3, 3, | |||||
0, 0, | |||||
2, 2, | |||||
8); | |||||
} | |||||
} // namespace test | } // namespace test | ||||
} // namespace megdnn | } // namespace megdnn | ||||