diff --git a/dnn/src/x86/pooling/algo.cpp b/dnn/src/x86/pooling/algo.cpp
new file mode 100644
index 00000000..30929b08
--- /dev/null
+++ b/dnn/src/x86/pooling/algo.cpp
@@ -0,0 +1,365 @@
+/**
+ * \file dnn/src/x86/pooling/algo.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
+#include "src/x86/pooling/algo.h"
+#include "megdnn/opr_param_defs.h"
+#include "src/common/opr_delegate.h"
+#include "src/common/utils.h"
+#include "src/fallback/pooling/opr_impl.h"
+#include "src/naive/handle.h"
+#include "src/x86/handle.h"
+#include "src/x86/pooling/do_max_pooling_3x3_s2x2_float_sse.h"
+#include "src/x86/pooling/pooling_special_cases.h"
+#include "src/x86/utils.h"
+
+using namespace megdnn;
+using namespace x86;
+
+namespace {
+
+#if MEGDNN_X86_WITH_MKL_DNN
+template <dnnl::memory::format_tag format_tag, bool use_mkl_mem = false>
+dnnl::memory tensor_to_mkl_memory(_megdnn_tensor_in src,
+                                  const dnnl::engine& mkldnn_eng,
+                                  dnnl::memory::data_type mkldnn_datatype) {
+    megdnn_assert(format_tag == dnnl::memory::format_tag::nChw8c ||
+                          format_tag == dnnl::memory::format_tag::nchw ||
+                          format_tag == dnnl::memory::format_tag::nhwc,
+                  "not support format");
+
+    dnnl::memory::dims src_shape = {
+            static_cast<long>(src.layout[0]), static_cast<long>(src.layout[1]),
+            static_cast<long>(src.layout[2]), static_cast<long>(src.layout[3])};
+    if (format_tag == dnnl::memory::format_tag::nChw8c) {
+        src_shape = {static_cast<long>(src.layout[0]),
+                     static_cast<long>(src.layout[1] * 8),
+                     static_cast<long>(src.layout[2]),
+                     static_cast<long>(src.layout[3])};
+    }
+    auto megdnn_src_md =
+            dnnl::memory::desc({src_shape}, mkldnn_datatype, format_tag);
+    if (use_mkl_mem) {
+        auto megdnn_src_memory = dnnl::memory(megdnn_src_md, mkldnn_eng);
+        return megdnn_src_memory;
+    } else {
+        auto megdnn_src_memory = dnnl::memory(megdnn_src_md, mkldnn_eng,
+                                              const_cast<void*>(src.raw_ptr));
+        return megdnn_src_memory;
+    }
+}
+
+#endif
+
+} // namespace
+
+PoolingImpl::AlgoPack::AlgoPack() {
+    all_algos.push_back(&algo_mean_w2s2_avx);
+    all_algos.push_back(&algo_mean_w2s2_sse3);
+    all_algos.push_back(&algo_max_w2s2_sse);
+    all_algos.push_back(&algo_max_w3s3_sse);
+#if MEGDNN_X86_WITH_MKL_DNN
+    all_algos.push_back(&algo_mkldnn_nchw);
+    all_algos.push_back(&algo_mkldnn_nchw88);
+#endif
+    all_algos.push_back(&algo_fallback);
+
+    for (auto&& algo : all_algos) {
+        m_all_algos_map.emplace(algo->info().desc, algo);
+    }
+}
+
+PoolingImpl::AlgoPack PoolingImpl::sm_algo_pack;
+MEGDNN_DEF_GET_ALGO_FROM_DESC(PoolingImpl)
+
+PoolingImpl::AlgoBase::SizeArgs::SizeArgs(PoolingImpl* o,
+                                          const TensorLayout& src,
+                                          const TensorLayout& dst)
+        : handle{static_cast<HandleImpl*>(o->handle())},
+          opr{o},
+          layout_src{src},
+          layout_dst{dst} {}
+
+PoolingImpl::AlgoBase::ExecArgs::ExecArgs(PoolingImpl* opr,
+                                          _megdnn_tensor_in src,
+                                          _megdnn_tensor_out dst,
+                                          _megdnn_workspace workspace)
+        : SizeArgs(opr, src.layout, dst.layout),
+          src_tensor{&src},
+          dst_tensor{&dst},
+          workspace{workspace} {}
+
+std::string PoolingImpl::AlgoBase::SizeArgs::to_string() const {
+    return ssprintf("src=%s, dst=%s", layout_src.to_string().c_str(),
+                    layout_dst.to_string().c_str());
+}
+
+bool PoolingImpl::AlgoMeanW2S2AVX::is_available(const SizeArgs& args) const {
+    auto SH = args.opr->param().stride_h;
+    auto SW = args.opr->param().stride_w;
+    auto FH = args.opr->param().window_h;
+    auto FW = args.opr->param().window_w;
+
+    return (is_supported(SIMDType::AVX) &&
+            args.opr->param().mode == Mode::AVERAGE &&
+            args.opr->param().format == Param::Format::NCHW &&
+            args.layout_src.dtype == dtype::Float32() && FH == 2 && FW == 2 &&
+            SH == 2 && SW == 2);
+}
+
+void PoolingImpl::AlgoMeanW2S2AVX::exec(const ExecArgs& args) const {
+    auto N = args.layout_src.shape[0];
+    auto C = args.layout_src.shape[1];
+    auto IH = args.layout_src.shape[2];
+    auto IW = args.layout_src.shape[3];
+    auto OH = args.layout_dst.shape[2];
+    auto OW = args.layout_dst.shape[3];
+    auto PH = args.opr->param().pad_h;
+    auto PW = args.opr->param().pad_w;
+    auto sptr = reinterpret_cast<const float*>(args.src_tensor->raw_ptr);
+    auto dptr = reinterpret_cast<float*>(args.dst_tensor->raw_ptr);
+    auto handle = [=]() { return args.handle; };
+    MEGDNN_DISPATCH_CPU_KERN_OPR(rep(n, N) rep(c, C) {
+        mean_pooling_w2x2_s2x2_avx(sptr + n * C * IH * IW + c * IH * IW, IH, IW,
+                                   dptr + n * C * OH * OW + c * OH * OW, OH, OW,
+                                   PH, PW, true);
+    });
+}
+
+bool PoolingImpl::AlgoMeanW2S2SSE3::is_available(const SizeArgs& args) const {
+    auto SH = args.opr->param().stride_h;
+    auto SW = args.opr->param().stride_w;
+    auto FH = args.opr->param().window_h;
+    auto FW = args.opr->param().window_w;
+
+    return (is_supported(SIMDType::SSE3) &&
+            args.opr->param().mode == Mode::AVERAGE &&
+            args.layout_src.dtype == dtype::Float32() &&
+            args.opr->param().format == Param::Format::NCHW && FH == 2 &&
+            FW == 2 && SH == 2 && SW == 2);
+}
+
+void PoolingImpl::AlgoMeanW2S2SSE3::exec(const ExecArgs& args) const {
+    auto N = args.layout_src.shape[0];
+    auto C = args.layout_src.shape[1];
+    auto IH = args.layout_src.shape[2];
+    auto IW = args.layout_src.shape[3];
+    auto OH = args.layout_dst.shape[2];
+    auto OW = args.layout_dst.shape[3];
+    auto PH = args.opr->param().pad_h;
+    auto PW = args.opr->param().pad_w;
+    auto sptr = reinterpret_cast<const float*>(args.src_tensor->raw_ptr);
+    auto dptr = reinterpret_cast<float*>(args.dst_tensor->raw_ptr);
+    auto handle = [=]() { return args.handle; };
+    MEGDNN_DISPATCH_CPU_KERN_OPR(rep(n, N) rep(c, C) {
+        mean_pooling_w2x2_s2x2_sse3(sptr + n * C * IH * IW + c * IH * IW, IH,
+                                    IW, dptr + n * C * OH * OW + c * OH * OW,
+                                    OH, OW, PH, PW, true);
+    });
+}
+
+bool PoolingImpl::AlgoMaxW2S2SSE::is_available(const SizeArgs& args) const {
+    auto SH = args.opr->param().stride_h;
+    auto SW = args.opr->param().stride_w;
+    auto FH = args.opr->param().window_h;
+    auto FW = args.opr->param().window_w;
+
+    return (is_supported(SIMDType::SSE) &&
+            args.layout_src.dtype == dtype::Float32() &&
+            args.opr->param().mode == Mode::MAX &&
+            args.opr->param().format == Param::Format::NCHW && FH == 2 &&
+            FW == 2 && SH == 2 && SW == 2);
+}
+
+void PoolingImpl::AlgoMaxW2S2SSE::exec(const ExecArgs& args) const {
+    auto N = args.layout_src.shape[0];
+    auto C = args.layout_src.shape[1];
+    auto IH = args.layout_src.shape[2];
+    auto IW = args.layout_src.shape[3];
+    auto OH = args.layout_dst.shape[2];
+    auto OW = args.layout_dst.shape[3];
+    auto PH = args.opr->param().pad_h;
+    auto PW = args.opr->param().pad_w;
+    auto sptr = reinterpret_cast<const float*>(args.src_tensor->raw_ptr);
+    auto dptr = reinterpret_cast<float*>(args.dst_tensor->raw_ptr);
+    auto handle = [=]() { return args.handle; };
+    MEGDNN_DISPATCH_CPU_KERN_OPR(rep(n, N) rep(c, C) {
+        max_pooling_w2x2_s2x2_sse(sptr + n * C * IH * IW + c * IH * IW, IH, IW,
+                                  dptr + n * C * OH * OW + c * OH * OW, OH, OW,
+                                  PH, PW);
+    });
+}
+
+bool PoolingImpl::AlgoMaxW3S3SSE::is_available(const SizeArgs& args) const {
+    auto SH = args.opr->param().stride_h;
+    auto SW = args.opr->param().stride_w;
+    auto FH = args.opr->param().window_h;
+    auto FW = args.opr->param().window_w;
+
+    return (is_supported(SIMDType::SSE) &&
+            args.layout_src.dtype == dtype::Float32() &&
+            args.opr->param().mode == Mode::MAX &&
+            args.opr->param().format == Param::Format::NCHW && FH == 3 &&
+            FW == 3 && SH == 2 && SW == 2);
+}
+
+void PoolingImpl::AlgoMaxW3S3SSE::exec(const ExecArgs& args) const {
+    auto N = args.layout_src.shape[0];
+    auto C = args.layout_src.shape[1];
+    auto IH = args.layout_src.shape[2];
+    auto IW = args.layout_src.shape[3];
+    auto OH = args.layout_dst.shape[2];
+    auto OW = args.layout_dst.shape[3];
+    auto PH = args.opr->param().pad_h;
+    auto PW = args.opr->param().pad_w;
+    auto sptr = reinterpret_cast<const float*>(args.src_tensor->raw_ptr);
+    auto dptr = reinterpret_cast<float*>(args.dst_tensor->raw_ptr);
+    auto handle = [=]() { return args.handle; };
+    MEGDNN_DISPATCH_CPU_KERN_OPR(
+            WorkspaceBundle ws = get_bundle(args.layout_src, args.layout_dst,
+                                            args.opr->param());
+            ws.set(args.workspace.raw_ptr); rep(n, N) rep(c, C) {
+                do_max_pooling_3x3_s2x2_float_SSE(
+                        sptr + n * C * IH * IW + c * IH * IW,
+                        dptr + n * C * OH * OW + c * OH * OW, IH, IW, OH, OW,
+                        PH, PW, ws);
+            });
+}
+
+#if MEGDNN_X86_WITH_MKL_DNN
+bool PoolingImpl::AlgoMKLDNNNCHW::is_available(const SizeArgs& args) const {
+    return ((args.layout_src.dtype.enumv() == DTypeEnum::QuantizedS8 ||
+             args.layout_src.dtype.enumv() == DTypeEnum::Int8) &&
+            args.opr->param().mode == Mode::MAX &&
+            args.opr->param().format == Param::Format::NCHW);
+}
+
+void PoolingImpl::AlgoMKLDNNNCHW::exec(const ExecArgs& args) const {
+    auto PH = args.opr->param().pad_h;
+    auto PW = args.opr->param().pad_w;
+    auto FH = args.opr->param().window_h;
+    auto FW = args.opr->param().window_w;
+    auto SH = args.opr->param().stride_h;
+    auto SW = args.opr->param().stride_w;
+    auto handle = [=]() { return args.handle; };
+
+    auto x86_handle = static_cast<HandleImpl*>(inplace_cpu_handle().get());
+    auto mkldnn_eng = x86_handle->mkldnn_engine();
+    auto mkldnn_stream = x86_handle->mkldnn_stream();
+    auto mkldnn_pooling_mode = dnnl::algorithm::pooling_max;
+    dnnl::memory::dims pool_strides = {SH, SW};
+    dnnl::memory::dims pool_padding = {PH, PW};
+    dnnl::memory::dims pool_kernel = {FH, FW};
+
+    dnnl::memory&& megdnn_src_memory_ori =
+            tensor_to_mkl_memory<dnnl::memory::format_tag::nchw>(
+                    *args.src_tensor, mkldnn_eng, dnnl::memory::data_type::s8);
+    dnnl::memory&& megdnn_dst_memory_ori =
+            tensor_to_mkl_memory<dnnl::memory::format_tag::nchw>(
+                    *args.dst_tensor, mkldnn_eng, dnnl::memory::data_type::s8);
+
+    dnnl::memory&& megdnn_src_memory =
+            tensor_to_mkl_memory<dnnl::memory::format_tag::nhwc, true>(
+                    *args.src_tensor, mkldnn_eng, dnnl::memory::data_type::s8);
+    dnnl::memory&& megdnn_dst_memory =
+            tensor_to_mkl_memory<dnnl::memory::format_tag::nhwc, true>(
+                    *args.dst_tensor, mkldnn_eng, dnnl::memory::data_type::s8);
+
+    auto reorder_src = dnnl::reorder(megdnn_src_memory_ori, megdnn_src_memory);
+    auto reorder_dst = dnnl::reorder(megdnn_dst_memory, megdnn_dst_memory_ori);
+    auto pool1_desc = dnnl::pooling_forward::desc(
+            dnnl::prop_kind::forward_inference, mkldnn_pooling_mode,
+            megdnn_src_memory.get_desc(), megdnn_dst_memory.get_desc(),
+            pool_strides, pool_kernel, pool_padding, pool_padding);
+    auto pool_pd =
+            dnnl::pooling_forward::primitive_desc(pool1_desc, mkldnn_eng);
+    auto pool = dnnl::pooling_forward(pool_pd);
+
+    auto run = [mkldnn_stream, mkldnn_eng, reorder_src, pool, reorder_dst,
+                megdnn_src_memory_ori, megdnn_src_memory, megdnn_dst_memory,
+                megdnn_dst_memory_ori](void) {
+        MEGDNN_MARK_USED_VAR(mkldnn_eng);
+        auto mkl_stream = mkldnn_stream;
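+        // MKL-DNN provides its optimized int8 pooling kernel for NHWC only
+        // and cannot change layouts automatically, so reorder the NCHW input
+        // to NHWC, pool, then reorder the NHWC result back to NCHW.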
+        reorder_src.execute(mkl_stream, {{DNNL_ARG_FROM, megdnn_src_memory_ori},
+                                         {DNNL_ARG_TO, megdnn_src_memory}});
+        pool.execute(mkl_stream, {{DNNL_ARG_SRC, megdnn_src_memory},
+                                  {DNNL_ARG_DST, megdnn_dst_memory}});
+        reorder_dst.execute(mkl_stream, {{DNNL_ARG_FROM, megdnn_dst_memory},
+                                         {DNNL_ARG_TO, megdnn_dst_memory_ori}});
+        mkl_stream.wait();
+    };
+    MEGDNN_DISPATCH_CPU_KERN_OPR(run());
+}
+
+#endif
+
+#if MEGDNN_X86_WITH_MKL_DNN
+bool PoolingImpl::AlgoMKLDNNNCHW88::is_available(const SizeArgs& args) const {
+    return (args.layout_src.dtype == dtype::Float32() &&
+            args.opr->param().mode == Mode::MAX &&
+            args.opr->param().format == Param::Format::NCHW88);
+}
+
+void PoolingImpl::AlgoMKLDNNNCHW88::exec(const ExecArgs& args) const {
+    auto PH = args.opr->param().pad_h;
+    auto PW = args.opr->param().pad_w;
+    auto FH = args.opr->param().window_h;
+    auto FW = args.opr->param().window_w;
+    auto SH = args.opr->param().stride_h;
+    auto SW = args.opr->param().stride_w;
+    auto handle = [=]() { return args.handle; };
+
+    auto x86_handle = static_cast<HandleImpl*>(inplace_cpu_handle().get());
+    auto mkldnn_eng = x86_handle->mkldnn_engine();
+    auto mkldnn_stream = x86_handle->mkldnn_stream();
+    auto mkldnn_pooling_mode = dnnl::algorithm::pooling_max;
+    switch (args.opr->param().mode) {
+        case Mode::MAX:
+            mkldnn_pooling_mode = dnnl::algorithm::pooling_max;
+            break;
+        case Mode::AVERAGE:
+            mkldnn_pooling_mode = dnnl::algorithm::pooling_avg_include_padding;
+            break;
+        case Mode::AVERAGE_COUNT_EXCLUDE_PADDING:
+            mkldnn_pooling_mode = dnnl::algorithm::pooling_avg_exclude_padding;
+            break;
+        default:
+            megdnn_throw("not supported pooling mode\n");
+    };
+
+    dnnl::memory::dims pool_strides = {SH, SW};
+    dnnl::memory::dims pool_padding = {PH, PW};
+    dnnl::memory::dims pool_kernel = {FH, FW};
+    dnnl::memory&& megdnn_src_memory_ori =
+            tensor_to_mkl_memory<dnnl::memory::format_tag::nChw8c>(
+                    *args.src_tensor, mkldnn_eng, dnnl::memory::data_type::f32);
+    dnnl::memory&& megdnn_dst_memory_ori =
+            tensor_to_mkl_memory<dnnl::memory::format_tag::nChw8c>(
+                    *args.dst_tensor, mkldnn_eng, dnnl::memory::data_type::f32);
+    auto pool_desc = dnnl::pooling_forward::desc(
+            dnnl::prop_kind::forward_inference, mkldnn_pooling_mode,
+            megdnn_src_memory_ori.get_desc(), megdnn_dst_memory_ori.get_desc(),
+            pool_strides, pool_kernel, pool_padding, pool_padding);
+    auto pool_pd = dnnl::pooling_forward::primitive_desc(pool_desc, mkldnn_eng);
+    auto pool = dnnl::pooling_forward(pool_pd);
+
+    auto run = [mkldnn_stream, pool, mkldnn_eng, megdnn_src_memory_ori,
+                megdnn_dst_memory_ori](void) {
+        MEGDNN_MARK_USED_VAR(mkldnn_eng);
+        auto mkl_stream = mkldnn_stream;
+
+        pool.execute(mkl_stream, {{DNNL_ARG_SRC, megdnn_src_memory_ori},
+                                  {DNNL_ARG_DST, megdnn_dst_memory_ori}});
+        mkl_stream.wait();
+    };
+    MEGDNN_DISPATCH_CPU_KERN_OPR(run());
+}
+
+#endif
\ No newline at end of file
diff --git a/dnn/src/x86/pooling/algo.h b/dnn/src/x86/pooling/algo.h
new file mode 100644
index 00000000..8a16f2b8
--- /dev/null
+++ b/dnn/src/x86/pooling/algo.h
@@ -0,0 +1,132 @@
+/**
+ * \file dnn/src/x86/pooling/algo.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
+
+#pragma once
+
+#include <unordered_map>
+#include "src/common/algo_base.h"
+#include "src/common/metahelper.h"
+#include "src/x86/pooling/opr_impl.h"
+#include "src/x86/handle.h"
+
+namespace megdnn {
+namespace x86 {
+using AlgoBase = PoolingImpl::AlgoBase;
+
+class PoolingImpl::AlgoBase : public Algorithm {
+public:
+    enum class AlgoType : uint32_t {
+        X86_MeanW2S2AVX,
+        X86_MeanW2S2SSE3,
+        X86_MaxW2S2SSE,
+        X86_MaxW3S3SSE,
+#if MEGDNN_X86_WITH_MKL_DNN
+        X86_MKLDNNNCHW,
+        X86_MKLDNNNCHW88,
+#endif
+        X86_Fallback
+    };
+    using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;
+    AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::X86; }
+    virtual ~AlgoBase() = default;
+    struct SizeArgs {
+        HandleImpl* handle;
+        PoolingImpl* opr;
+        const TensorLayout layout_src, layout_dst;
+
+        std::string to_string() const;
+        SizeArgs(PoolingImpl* opr, const TensorLayout& src,
+                 const TensorLayout& dst);
+    };
+    struct ExecArgs : public SizeArgs {
+        const TensorND *src_tensor, *dst_tensor;
+        Workspace workspace;
+
+        ExecArgs(PoolingImpl* opr, _megdnn_tensor_in src,
+                 _megdnn_tensor_out dst, _megdnn_workspace workspace);
+    };
+
+    virtual bool is_available(const SizeArgs& args) const = 0;
+    virtual void exec(const ExecArgs& args) const = 0;
+
+    uint32_t type() const override { return INVALID_ALGO_TYPE; };
+    bool is_available_attribute(
+            const SizeArgs& args,
+            const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE,
+            const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) {
+        return contain_attribute_all(positive_attr) &&
+               !contain_attribute_any(negative_attr) && is_available(args);
+    }
+};
+
+#define ALGO_IMPL(_name)                                                       \
+    class PoolingImpl::Algo##_name final : public AlgoBase {                   \
+        std::string m_algo_name;                                               \
+                                                                               \
+    public:                                                                    \
+        Algo##_name() : m_algo_name(std::string(#_name).append("_POOLING")) {} \
+        AlgoAttribute attribute() const override {                             \
+            return AlgoAttribute::REPRODUCIBLE;                                \
+        };                                                                     \
+        const char* name() const override { return m_algo_name.c_str(); }      \
+        bool is_available(const SizeArgs& args) const override;                \
+        void exec(const ExecArgs& args) const override;                        \
+        MEGDNN_DECL_ALGO_TYPE(X86_##_name)                                     \
+    };
+
+ALGO_IMPL(MeanW2S2AVX)
+ALGO_IMPL(MeanW2S2SSE3)
+ALGO_IMPL(MaxW2S2SSE)
+ALGO_IMPL(MaxW3S3SSE)
+#if MEGDNN_X86_WITH_MKL_DNN
+ALGO_IMPL(MKLDNNNCHW)
+ALGO_IMPL(MKLDNNNCHW88)
+#endif
+
+#undef ALGO_IMPL
+
+class PoolingImpl::AlgoFallback final : public AlgoBase {
+    std::string m_algo_name;
+
+public:
+    AlgoFallback() : m_algo_name("FALLBACK_POOLING") {}
+    AlgoAttribute attribute() const override {
+        return AlgoAttribute::REPRODUCIBLE;
+    };
+    const char* name() const override { return m_algo_name.c_str(); }
+    bool is_available(const SizeArgs&) const override { return true; }
+    void exec(const ExecArgs&) const override {}
+    MEGDNN_DECL_ALGO_TYPE(X86_Fallback)
+};
+
+class PoolingImpl::AlgoPack : NonCopyableObj {
+private:
+    AlgoBase::Mapper m_all_algos_map;
+    AlgoMeanW2S2AVX algo_mean_w2s2_avx;
+    AlgoMeanW2S2SSE3 algo_mean_w2s2_sse3;
+    AlgoMaxW2S2SSE algo_max_w2s2_sse;
+    AlgoMaxW3S3SSE algo_max_w3s3_sse;
+#if MEGDNN_X86_WITH_MKL_DNN
+    AlgoMKLDNNNCHW algo_mkldnn_nchw;
+    AlgoMKLDNNNCHW88 algo_mkldnn_nchw88;
+#endif
+    AlgoFallback algo_fallback;
+
+public:
+    AlgoPack();
+
+    std::vector<AlgoBase*> all_algos;
+
+    const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; }
+};
+
+} // namespace x86
+} // namespace megdnn
diff --git a/dnn/src/x86/pooling/opr_impl.cpp b/dnn/src/x86/pooling/opr_impl.cpp
index bad5a9bd..c6601d2b 100644
--- a/dnn/src/x86/pooling/opr_impl.cpp
+++ b/dnn/src/x86/pooling/opr_impl.cpp
@@ -13,9 +13,9 @@
 #include "src/common/utils.h"
 #include "src/naive/handle.h"
 #include "src/x86/handle.h"
-#include "src/x86/pooling/do_max_pooling_3x3_s2x2_float_sse.h"
-#include "src/x86/pooling/pooling_special_cases.h"
 #include "src/x86/utils.h"
+#include "src/x86/pooling/algo.h"
+#include "src/common/algo_chooser.h"
 
 #if MEGDNN_X86_WITH_MKL_DNN
 #include "mkldnn.hpp"
@@ -24,10 +24,9 @@
 using namespace megdnn;
 using namespace x86;
 
-namespace {
-
-WorkspaceBundle get_bundle(const TensorLayout& src, const TensorLayout& dst,
-                           const param::Pooling& param) {
+WorkspaceBundle megdnn::x86::get_bundle(const TensorLayout& src,
+                                        const TensorLayout& dst,
+                                        const param::Pooling& param) {
     megdnn_assert(
             is_supported(SIMDType::SSE) && src.dtype == dtype::Float32() &&
             param.format == param::Pooling::Format::NCHW &&
@@ -45,242 +44,63 @@
     return ws;
 }
 
-#if MEGDNN_X86_WITH_MKL_DNN
-template <dnnl::memory::format_tag format_tag, bool use_mkl_mem = false>
-dnnl::memory tensor_to_mkl_memory(_megdnn_tensor_in src,
-                                  const dnnl::engine& mkldnn_eng,
-                                  dnnl::memory::data_type mkldnn_datatype) {
-    megdnn_assert(format_tag == dnnl::memory::format_tag::nChw8c ||
-                          format_tag == dnnl::memory::format_tag::nchw ||
-                          format_tag == dnnl::memory::format_tag::nhwc,
-                  "not support format");
-
-    dnnl::memory::dims src_shape = {
-            static_cast<long>(src.layout[0]), static_cast<long>(src.layout[1]),
-            static_cast<long>(src.layout[2]), static_cast<long>(src.layout[3])};
-    if (format_tag == dnnl::memory::format_tag::nChw8c) {
-        src_shape = {static_cast<long>(src.layout[0]),
-                     static_cast<long>(src.layout[1] * 8),
-                     static_cast<long>(src.layout[2]),
-                     static_cast<long>(src.layout[3])};
-    }
-    auto megdnn_src_md =
-            dnnl::memory::desc({src_shape}, mkldnn_datatype, format_tag);
-    if (use_mkl_mem) {
-        auto megdnn_src_memory = dnnl::memory(megdnn_src_md, mkldnn_eng);
-        return megdnn_src_memory;
+size_t PoolingImpl::get_workspace_in_bytes(const TensorLayout& src,
+                                           const TensorLayout& dst) {
+    auto algo = get_algorithm(this, src, dst);
+    if (!is_fallback_algo(algo)) {
+        if (is_supported(SIMDType::SSE) && src.dtype == dtype::Float32() &&
+            param().mode == Mode::MAX &&
+            param().format == Param::Format::NCHW && param().window_h == 3 &&
+            param().window_w == 3 && param().stride_h == 2 &&
+            param().stride_w == 2) {
+            WorkspaceBundle ws = get_bundle(src, dst, param());
+
+            return ws.total_size_in_bytes();
+        } else {
+            return 0;
+        }
     } else {
-        auto megdnn_src_memory = dnnl::memory(megdnn_src_md, mkldnn_eng,
-                                              const_cast<void*>(src.raw_ptr));
-        return megdnn_src_memory;
+        auto fallback_workspace =
+                fallback::PoolingImpl::get_workspace_in_bytes(src, dst);
+        return fallback_workspace;
     }
 }
 
-#endif
-
-} // namespace
-
-size_t PoolingImpl::get_workspace_in_bytes(const TensorLayout& src,
-                                           const TensorLayout& dst) {
-    if (is_supported(SIMDType::SSE) && src.dtype == dtype::Float32() &&
-        param().mode == Mode::MAX && param().format == Param::Format::NCHW &&
-        param().window_h == 3 && param().window_w == 3 &&
-        param().stride_h == 2 && param().stride_w == 2) {
-        WorkspaceBundle ws = get_bundle(src, dst, param());
+std::vector<Algorithm*> PoolingImpl::get_all_algorithms(
+        const TensorLayout& src, const TensorLayout& dst) {
+    return megdnn::get_all_algorithms<PoolingImpl>({this, src, dst});
+}
 
-        return ws.total_size_in_bytes();
-    } else {
-        return 0;
+Algorithm* PoolingImpl::get_algorithm_heuristic(
+        const TensorLayout& src, const TensorLayout& dst,
+        size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
+        const AlgoAttribute& negative_attr) {
+    MEGDNN_MARK_USED_VAR(workspace_limit_in_bytes);
+
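+    // Return the first algorithm that is available for these layouts and
+    // matches the requested attributes; all_algos is ordered so that the
+    // specialized SIMD algorithms are tried before the generic fallback.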
+    AlgoBase::SizeArgs args(this, src, dst);
+    for (auto iter : algo_pack().all_algos) {
+        if (iter->is_available_attribute(args, positive_attr, negative_attr)) {
+            return iter;
+        }
     }
+    megdnn_throw(
+            ssprintf("require algorithm with attribute(%s) and without "
+                     "attribute(%s), but can't get suitable algo.\n",
+                     Algorithm::attribute_str(positive_attr).c_str(),
+                     Algorithm::attribute_str(negative_attr).c_str()));
+    return nullptr;
 }
 
 void PoolingImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
                        _megdnn_workspace workspace) {
     check_exec(src.layout, dst.layout, workspace.size);
-    size_t N = src.layout.shape[0], C = src.layout.shape[1],
-           IH = src.layout.shape[2], IW = src.layout.shape[3];
-    size_t OH = dst.layout.shape[2], OW = dst.layout.shape[3];
-
-    auto mode = param().mode;
-    auto FH = param().window_h, FW = param().window_w;
-    auto SH = param().stride_h, SW = param().stride_w;
-    auto PH = param().pad_h, PW = param().pad_w;
-    bool is_average = (mode == Mode::AVERAGE);
-    bool is_include = true;
-    if (is_supported(SIMDType::AVX) && is_average &&
-        param().format == Param::Format::NCHW &&
-        src.layout.dtype == dtype::Float32() && FH == 2 && FW == 2 && SH == 2 &&
-        SW == 2) {
-        auto sptr = src.ptr<dt_float32>();
-        auto dptr = dst.ptr<dt_float32>();
-        MEGDNN_DISPATCH_CPU_KERN_OPR(rep(n, N) rep(c, C) {
-            mean_pooling_w2x2_s2x2_avx(sptr + n * C * IH * IW + c * IH * IW, IH,
-                                       IW, dptr + n * C * OH * OW + c * OH * OW,
-                                       OH, OW, PH, PW, is_include);
-        });
-        return;
-    }
-    if (is_supported(SIMDType::SSE3) && is_average &&
-        src.layout.dtype == dtype::Float32() &&
-        param().format == Param::Format::NCHW && FH == 2 && FW == 2 &&
-        SH == 2 && SW == 2) {
-        auto sptr = src.ptr<dt_float32>();
-        auto dptr = dst.ptr<dt_float32>();
-        MEGDNN_DISPATCH_CPU_KERN_OPR(rep(n, N) rep(c, C) {
-            mean_pooling_w2x2_s2x2_sse3(sptr + n * C * IH * IW + c * IH * IW,
-                                        IH, IW,
-                                        dptr + n * C * OH * OW + c * OH * OW,
-                                        OH, OW, PH, PW, is_include);
-        });
-        return;
-    }
-    if (is_supported(SIMDType::SSE) && src.layout.dtype == dtype::Float32() &&
-        mode == Mode::MAX && param().format == Param::Format::NCHW && FH == 2 &&
-        FW == 2 && SH == 2 && SW == 2) {
-        auto sptr = src.ptr<dt_float32>();
-        auto dptr = dst.ptr<dt_float32>();
-        MEGDNN_DISPATCH_CPU_KERN_OPR(rep(n, N) rep(c, C) {
-            max_pooling_w2x2_s2x2_sse(sptr + n * C * IH * IW + c * IH * IW, IH,
-                                      IW, dptr + n * C * OH * OW + c * OH * OW,
-                                      OH, OW, PH, PW);
-        });
-        return;
-    }
-    if (is_supported(SIMDType::SSE) && src.layout.dtype == dtype::Float32() &&
-        mode == Mode::MAX && param().format == Param::Format::NCHW && FH == 3 &&
-        FW == 3 && SH == 2 && SW == 2) {
-        auto sptr = src.ptr<dt_float32>();
-        auto dptr = dst.ptr<dt_float32>();
-        MEGDNN_DISPATCH_CPU_KERN_OPR(
-
-                WorkspaceBundle ws =
-                        get_bundle(src.layout, dst.layout, param());
-                ws.set(workspace.raw_ptr); rep(n, N) rep(c, C) {
-                    do_max_pooling_3x3_s2x2_float_SSE(
-                            sptr + n * C * IH * IW + c * IH * IW,
-                            dptr + n * C * OH * OW + c * OH * OW, IH, IW, OH,
-                            OW, PH, PW, ws);
-                });
-        return;
-    }
-
-#if MEGDNN_X86_WITH_MKL_DNN
-
-    // Mkldnn provides optimized code for nhwc int8 pooling now.
-    // Mkldnn can not change the layout automatically.
-    // Reorder nchw input to nhwc, do pooling, reorder nhwc result to nchw
-    if ((src.layout.dtype.enumv() == DTypeEnum::QuantizedS8 ||
-         src.layout.dtype.enumv() == DTypeEnum::Int8) &&
-        mode == Mode::MAX && param().format == Param::Format::NCHW) {
-        auto x86_handle = static_cast<HandleImpl*>(inplace_cpu_handle().get());
-
-        auto mkldnn_eng = x86_handle->mkldnn_engine();
-        auto mkldnn_stream = x86_handle->mkldnn_stream();
-        auto mkldnn_pooling_mode = dnnl::algorithm::pooling_max;
-        dnnl::memory::dims pool_strides = {SH, SW};
-        dnnl::memory::dims pool_padding = {PH, PW};
-        dnnl::memory::dims pool_kernel = {FH, FW};
-
-        dnnl::memory&& megdnn_src_memory_ori =
-                tensor_to_mkl_memory<dnnl::memory::format_tag::nchw>(
-                        src, mkldnn_eng, dnnl::memory::data_type::s8);
-        dnnl::memory&& megdnn_dst_memory_ori =
-                tensor_to_mkl_memory<dnnl::memory::format_tag::nchw>(
-                        dst, mkldnn_eng, dnnl::memory::data_type::s8);
-
-        dnnl::memory&& megdnn_src_memory =
-                tensor_to_mkl_memory<dnnl::memory::format_tag::nhwc, true>(
-                        src, mkldnn_eng, dnnl::memory::data_type::s8);
-        dnnl::memory&& megdnn_dst_memory =
-                tensor_to_mkl_memory<dnnl::memory::format_tag::nhwc, true>(
-                        dst, mkldnn_eng, dnnl::memory::data_type::s8);
-
-        auto reorder_src =
-                dnnl::reorder(megdnn_src_memory_ori, megdnn_src_memory);
-        auto reorder_dst =
-                dnnl::reorder(megdnn_dst_memory, megdnn_dst_memory_ori);
-        auto pool1_desc = dnnl::pooling_forward::desc(
-                dnnl::prop_kind::forward_inference, mkldnn_pooling_mode,
-                megdnn_src_memory.get_desc(), megdnn_dst_memory.get_desc(),
-                pool_strides, pool_kernel, pool_padding, pool_padding);
-        auto pool_pd =
-                dnnl::pooling_forward::primitive_desc(pool1_desc, mkldnn_eng);
-        auto pool = dnnl::pooling_forward(pool_pd);
-
-        auto run = [mkldnn_stream, mkldnn_eng, reorder_src, pool, reorder_dst,
-                    megdnn_src_memory_ori, megdnn_src_memory, megdnn_dst_memory,
-                    megdnn_dst_memory_ori](void) {
-            MEGDNN_MARK_USED_VAR(mkldnn_eng);
-            auto mkl_stream = mkldnn_stream;
-            reorder_src.execute(mkl_stream,
-                                {{DNNL_ARG_FROM, megdnn_src_memory_ori},
-                                 {DNNL_ARG_TO, megdnn_src_memory}});
-            pool.execute(mkl_stream, {{DNNL_ARG_SRC, megdnn_src_memory},
-                                      {DNNL_ARG_DST, megdnn_dst_memory}});
-            reorder_dst.execute(mkl_stream,
-                                {{DNNL_ARG_FROM, megdnn_dst_memory},
-                                 {DNNL_ARG_TO, megdnn_dst_memory_ori}});
-            mkl_stream.wait();
-        };
-        MEGDNN_DISPATCH_CPU_KERN_OPR(run());
-        return;
-    }
-
-    if (src.layout.dtype == dtype::Float32() && mode == Mode::MAX &&
-        param().format == Param::Format::NCHW88) {
-        auto x86_handle = static_cast<HandleImpl*>(inplace_cpu_handle().get());
-        auto mkldnn_eng = x86_handle->mkldnn_engine();
-        auto mkldnn_stream = x86_handle->mkldnn_stream();
-        auto mkldnn_pooling_mode = dnnl::algorithm::pooling_max;
-        switch (mode) {
-            case Mode::MAX:
-                mkldnn_pooling_mode = dnnl::algorithm::pooling_max;
-                break;
-            case Mode::AVERAGE:
-                mkldnn_pooling_mode =
-                        dnnl::algorithm::pooling_avg_include_padding;
-                break;
-            case Mode::AVERAGE_COUNT_EXCLUDE_PADDING:
-                mkldnn_pooling_mode =
-                        dnnl::algorithm::pooling_avg_exclude_padding;
-                break;
-            default:
-                megdnn_assert(0, "not supported pooling mode\n");
-        };
-
-        dnnl::memory::dims pool_strides = {SH, SW};
-        dnnl::memory::dims pool_padding = {PH, PW};
-        dnnl::memory::dims pool_kernel = {FH, FW};
-        dnnl::memory&& megdnn_src_memory_ori =
-                tensor_to_mkl_memory<dnnl::memory::format_tag::nChw8c>(
-                        src, mkldnn_eng, dnnl::memory::data_type::f32);
-        dnnl::memory&& megdnn_dst_memory_ori =
-                tensor_to_mkl_memory<dnnl::memory::format_tag::nChw8c>(
-                        dst, mkldnn_eng, dnnl::memory::data_type::f32);
-        auto pool_desc = dnnl::pooling_forward::desc(
-                dnnl::prop_kind::forward_inference, mkldnn_pooling_mode,
-                megdnn_src_memory_ori.get_desc(),
-                megdnn_dst_memory_ori.get_desc(), pool_strides, pool_kernel,
-                pool_padding, pool_padding);
-        auto pool_pd =
-                dnnl::pooling_forward::primitive_desc(pool_desc, mkldnn_eng);
-        auto pool = dnnl::pooling_forward(pool_pd);
-
-        auto run = [mkldnn_stream, pool, mkldnn_eng, megdnn_src_memory_ori,
-                    megdnn_dst_memory_ori](void) {
-            MEGDNN_MARK_USED_VAR(mkldnn_eng);
-            auto mkl_stream = mkldnn_stream;
-
-            pool.execute(mkl_stream, {{DNNL_ARG_SRC, megdnn_src_memory_ori},
-                                      {DNNL_ARG_DST, megdnn_dst_memory_ori}});
-            mkl_stream.wait();
-        };
-        MEGDNN_DISPATCH_CPU_KERN_OPR(run());
-        return;
+    AlgoBase::ExecArgs args(this, src, dst, workspace);
+    auto algo = get_algorithm(this, src.layout, dst.layout);
+    if (!is_fallback_algo(algo)) {
+        algo->exec(args);
+    } else {
+        fallback::PoolingImpl::exec(src, dst, Workspace());
     }
-#endif
-
-    fallback::PoolingImpl::exec(src, dst, Workspace());
 }
 
 // vim: syntax=cpp.doxygen
diff --git a/dnn/src/x86/pooling/opr_impl.h b/dnn/src/x86/pooling/opr_impl.h
index b97a01df..764c9895 100644
--- a/dnn/src/x86/pooling/opr_impl.h
+++ b/dnn/src/x86/pooling/opr_impl.h
@@ -6,7 +6,8 @@
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
  */
 #pragma once
 #include "src/fallback/pooling/opr_impl.h"
@@ -14,17 +15,62 @@
 namespace megdnn {
 namespace x86 {
 
-class PoolingImpl: public fallback::PoolingImpl {
-    public:
-        using fallback::PoolingImpl::PoolingImpl;
-        void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
-                  _megdnn_workspace) override;
-        size_t get_workspace_in_bytes(const TensorLayout &,
-                                      const TensorLayout &) override;
-};
+class PoolingImpl : public fallback::PoolingImpl {
+private:
+    class AlgoMeanW2S2AVX;
+    class AlgoMeanW2S2SSE3;
+    class AlgoMaxW2S2SSE;
+    class AlgoMaxW3S3SSE;
+#if MEGDNN_X86_WITH_MKL_DNN
+    class AlgoMKLDNNNCHW;
+    class AlgoMKLDNNNCHW88;
+#endif
+    class AlgoFallback;
+    class AlgoPack;
+    static AlgoPack sm_algo_pack;
 
-} // namespace x86
-} // namespace megdnn
-
-// vim: syntax=cpp.doxygen
+public:
+    using fallback::PoolingImpl::PoolingImpl;
+    class AlgoBase;
+    void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
+              _megdnn_workspace) override;
+    size_t get_workspace_in_bytes(const TensorLayout&,
+                                  const TensorLayout&) override;
+
+    static size_t constexpr MAX_SPATIAL_DIM = 2;
+
+    const char* get_algorithm_set_name() const override {
+        return "X86_POOLING_FORWARD";
+    }
+    Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override;
+
+    AlgorithmInfo get_algorithm_info_heuristic(
+            const TensorLayout& src, const TensorLayout& dst,
+            size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
+            const AlgoAttribute& negative_attr) {
+        return get_algorithm_heuristic(src, dst, workspace_limit_in_bytes,
+                                       positive_attr, negative_attr)
+                ->info();
+    }
+    static const AlgoPack& algo_pack() { return sm_algo_pack; }
+    bool is_fallback_algo(Algorithm* algo) {
+        return strcmp(algo->name(), "FALLBACK_POOLING") == 0;
+    }
+
+protected:
+    std::vector<Algorithm*> get_all_algorithms(
+            const TensorLayout& src, const TensorLayout& dst) override;
+    Algorithm* get_algorithm_heuristic(
+            const TensorLayout& src, const TensorLayout& dst,
+            size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
+            const AlgoAttribute& negative_attr) override;
+};
+
+WorkspaceBundle get_bundle(const TensorLayout& src, const TensorLayout& dst,
+                           const param::Pooling& param);
+
+} // namespace x86
+} // namespace megdnn
+
+// vim: syntax=cpp.doxygen
diff --git a/src/opr/test/dnn/pooling.cpp b/src/opr/test/dnn/pooling.cpp
index 023fd4c8..251c2d48 100644
--- a/src/opr/test/dnn/pooling.cpp
+++ b/src/opr/test/dnn/pooling.cpp
@@ -159,6 +159,42 @@ TEST(TestOprDNN, PoolingExePolicy) {
             "cudnnReproducible") != std::string::npos);
 }
 
+TEST(TestOprDNN, PoolingForwardFastrun) {
+    using Param = opr::Pooling::Param;
+    Param param;
+    using Policy = opr::Pooling::ExecutionPolicy;
+    using S = Policy::Strategy;
+
+    auto cn = CompNode::load("xpux");
+    cn.activate();
+
+    auto orig_impl = PersistentCache::set_impl(
+            std::make_shared<InMemoryPersistentCache>());
+
+    HostTensorND host_y;
+    S strategy = S::PROFILE | S::REPRODUCIBLE;
+
+    auto graph = ComputingGraph::make();
+
+    HostTensorGenerator<> gen;
+    TensorShape shape = {1, 20, 24, 24};
+    auto input = opr::Host2DeviceCopy::make(*graph, gen(shape, cn));
+
+    param.mode = Param::Mode::MAX;
+    param.window_h = param.window_w = 2;
+    param.stride_h = param.stride_w = 2;
+    param.pad_h = param.pad_w = 0;
+    param.format = Param::Format::NCHW;
+
+    Policy policy;
+    policy.strategy = strategy;
+
+    auto pooling = opr::PoolingForward::make(input, param, {}, policy);
+
+    auto func = graph->compile({make_callback_copy(pooling, host_y)});
+    func->execute().wait();
+}
+
 }  // anonymous namespace
 
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}