GitOrigin-RevId: 96cdc57180
release-1.5
@@ -0,0 +1,365 @@ | |||
/** | |||
 * \file dnn/src/x86/pooling/algo.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#include "src/x86/pooling/algo.h" | |||
#include "megdnn/opr_param_defs.h" | |||
#include "src/common/opr_delegate.h" | |||
#include "src/common/utils.h" | |||
#include "src/fallback/pooling/opr_impl.h" | |||
#include "src/naive/handle.h" | |||
#include "src/x86/handle.h" | |||
#include "src/x86/pooling/do_max_pooling_3x3_s2x2_float_sse.h" | |||
#include "src/x86/pooling/pooling_special_cases.h" | |||
#include "src/x86/utils.h" | |||
using namespace megdnn; | |||
using namespace x86; | |||
namespace { | |||
#if MEGDNN_X86_WITH_MKL_DNN | |||
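// Wrap a megdnn tensor as a dnnl::memory with the given format tag. For
// nChw8c the channel dimension is multiplied by 8 so the oneDNN shape covers
// the full channel count. When use_mkl_mem is true oneDNN allocates and owns
// the buffer; otherwise the memory aliases the tensor's raw pointer.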
template <dnnl::memory::format_tag format_tag, bool use_mkl_mem> | |||
dnnl::memory tensor_to_mkl_memory(_megdnn_tensor_in src, | |||
const dnnl::engine& mkldnn_eng, | |||
dnnl::memory::data_type mkldnn_datatype) { | |||
megdnn_assert(format_tag == dnnl::memory::format_tag::nChw8c || | |||
format_tag == dnnl::memory::format_tag::nchw || | |||
format_tag == dnnl::memory::format_tag::nhwc, | |||
"not support format"); | |||
dnnl::memory::dims src_shape = { | |||
static_cast<long>(src.layout[0]), static_cast<long>(src.layout[1]), | |||
static_cast<long>(src.layout[2]), static_cast<long>(src.layout[3])}; | |||
if (format_tag == dnnl::memory::format_tag::nChw8c) { | |||
src_shape = {static_cast<long>(src.layout[0]), | |||
static_cast<long>(src.layout[1] * 8), | |||
static_cast<long>(src.layout[2]), | |||
static_cast<long>(src.layout[3])}; | |||
} | |||
auto megdnn_src_md = | |||
dnnl::memory::desc({src_shape}, mkldnn_datatype, format_tag); | |||
if (use_mkl_mem) { | |||
auto megdnn_src_memory = dnnl::memory(megdnn_src_md, mkldnn_eng); | |||
return megdnn_src_memory; | |||
} else { | |||
auto megdnn_src_memory = dnnl::memory(megdnn_src_md, mkldnn_eng, | |||
const_cast<void*>(src.raw_ptr)); | |||
return megdnn_src_memory; | |||
} | |||
} | |||
#endif | |||
} // namespace | |||
PoolingImpl::AlgoPack::AlgoPack() { | |||
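    // Registration order defines heuristic priority: specialized SIMD and
    // MKL-DNN algorithms come first and the generic fallback last, since
    // get_algorithm_heuristic() returns the first available algorithm.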
all_algos.push_back(&algo_mean_w2s2_avx); | |||
all_algos.push_back(&algo_mean_w2s2_sse3); | |||
all_algos.push_back(&algo_max_w2s2_sse); | |||
all_algos.push_back(&algo_max_w3s3_sse); | |||
#if MEGDNN_X86_WITH_MKL_DNN | |||
all_algos.push_back(&algo_mkldnn_nchw); | |||
all_algos.push_back(&algo_mkldnn_nchw88); | |||
#endif | |||
all_algos.push_back(&algo_fallback); | |||
for (auto&& algo : all_algos) { | |||
m_all_algos_map.emplace(algo->info().desc, algo); | |||
} | |||
} | |||
PoolingImpl::AlgoPack PoolingImpl::sm_algo_pack; | |||
MEGDNN_DEF_GET_ALGO_FROM_DESC(PoolingImpl) | |||
PoolingImpl::AlgoBase::SizeArgs::SizeArgs(PoolingImpl* o, | |||
const TensorLayout& src, | |||
const TensorLayout& dst) | |||
: handle{static_cast<x86::HandleImpl*>(o->handle())}, | |||
opr{o}, | |||
layout_src{src}, | |||
layout_dst{dst} {} | |||
PoolingImpl::AlgoBase::ExecArgs::ExecArgs(PoolingImpl* opr, | |||
_megdnn_tensor_in src, | |||
_megdnn_tensor_out dst, | |||
_megdnn_workspace workspace) | |||
: SizeArgs(opr, src.layout, dst.layout), | |||
src_tensor{&src}, | |||
dst_tensor{&dst}, | |||
workspace{workspace} {} | |||
std::string PoolingImpl::AlgoBase::SizeArgs::to_string() const { | |||
return ssprintf("src=%s, dst=%s", layout_src.to_string().c_str(), | |||
layout_dst.to_string().c_str()); | |||
} | |||
bool PoolingImpl::AlgoMeanW2S2AVX::is_available(const SizeArgs& args) const { | |||
auto SH = args.opr->param().stride_h; | |||
auto SW = args.opr->param().stride_w; | |||
auto FH = args.opr->param().window_h; | |||
auto FW = args.opr->param().window_w; | |||
return (is_supported(SIMDType::AVX) && | |||
args.opr->param().mode == Mode::AVERAGE && | |||
args.opr->param().format == Param::Format::NCHW && | |||
args.layout_src.dtype == dtype::Float32() && FH == 2 && FW == 2 && | |||
SH == 2 && SW == 2); | |||
} | |||
void PoolingImpl::AlgoMeanW2S2AVX::exec(const ExecArgs& args) const { | |||
auto N = args.layout_src.shape[0]; | |||
auto C = args.layout_src.shape[1]; | |||
auto IH = args.layout_src.shape[2]; | |||
auto IW = args.layout_src.shape[3]; | |||
auto OH = args.layout_dst.shape[2]; | |||
auto OW = args.layout_dst.shape[3]; | |||
auto PH = args.opr->param().pad_h; | |||
auto PW = args.opr->param().pad_w; | |||
auto sptr = reinterpret_cast<dt_float32*>(args.src_tensor->raw_ptr); | |||
auto dptr = reinterpret_cast<dt_float32*>(args.dst_tensor->raw_ptr); | |||
auto handle = [=]() { return args.handle; }; | |||
MEGDNN_DISPATCH_CPU_KERN_OPR(rep(n, N) rep(c, C) { | |||
mean_pooling_w2x2_s2x2_avx(sptr + n * C * IH * IW + c * IH * IW, IH, IW, | |||
dptr + n * C * OH * OW + c * OH * OW, OH, OW, | |||
PH, PW, true); | |||
}); | |||
} | |||
bool PoolingImpl::AlgoMeanW2S2SSE3::is_available(const SizeArgs& args) const { | |||
auto SH = args.opr->param().stride_h; | |||
auto SW = args.opr->param().stride_w; | |||
auto FH = args.opr->param().window_h; | |||
auto FW = args.opr->param().window_w; | |||
return (is_supported(SIMDType::SSE3) && | |||
args.opr->param().mode == Mode::AVERAGE && | |||
args.layout_src.dtype == dtype::Float32() && | |||
args.opr->param().format == Param::Format::NCHW && FH == 2 && | |||
FW == 2 && SH == 2 && SW == 2); | |||
} | |||
void PoolingImpl::AlgoMeanW2S2SSE3::exec(const ExecArgs& args) const { | |||
auto N = args.layout_src.shape[0]; | |||
auto C = args.layout_src.shape[1]; | |||
auto IH = args.layout_src.shape[2]; | |||
auto IW = args.layout_src.shape[3]; | |||
auto OH = args.layout_dst.shape[2]; | |||
auto OW = args.layout_dst.shape[3]; | |||
auto PH = args.opr->param().pad_h; | |||
auto PW = args.opr->param().pad_w; | |||
auto sptr = reinterpret_cast<dt_float32*>(args.src_tensor->raw_ptr); | |||
auto dptr = reinterpret_cast<dt_float32*>(args.dst_tensor->raw_ptr); | |||
auto handle = [=]() { return args.handle; }; | |||
MEGDNN_DISPATCH_CPU_KERN_OPR(rep(n, N) rep(c, C) { | |||
mean_pooling_w2x2_s2x2_sse3(sptr + n * C * IH * IW + c * IH * IW, IH, | |||
IW, dptr + n * C * OH * OW + c * OH * OW, | |||
OH, OW, PH, PW, true); | |||
}); | |||
} | |||
bool PoolingImpl::AlgoMaxW2S2SSE::is_available(const SizeArgs& args) const { | |||
auto SH = args.opr->param().stride_h; | |||
auto SW = args.opr->param().stride_w; | |||
auto FH = args.opr->param().window_h; | |||
auto FW = args.opr->param().window_w; | |||
return (is_supported(SIMDType::SSE) && | |||
args.layout_src.dtype == dtype::Float32() && | |||
args.opr->param().mode == Mode::MAX && | |||
args.opr->param().format == Param::Format::NCHW && FH == 2 && | |||
FW == 2 && SH == 2 && SW == 2); | |||
} | |||
void PoolingImpl::AlgoMaxW2S2SSE::exec(const ExecArgs& args) const { | |||
auto N = args.layout_src.shape[0]; | |||
auto C = args.layout_src.shape[1]; | |||
auto IH = args.layout_src.shape[2]; | |||
auto IW = args.layout_src.shape[3]; | |||
auto OH = args.layout_dst.shape[2]; | |||
auto OW = args.layout_dst.shape[3]; | |||
auto PH = args.opr->param().pad_h; | |||
auto PW = args.opr->param().pad_w; | |||
auto sptr = reinterpret_cast<dt_float32*>(args.src_tensor->raw_ptr); | |||
auto dptr = reinterpret_cast<dt_float32*>(args.dst_tensor->raw_ptr); | |||
auto handle = [=]() { return args.handle; }; | |||
MEGDNN_DISPATCH_CPU_KERN_OPR(rep(n, N) rep(c, C) { | |||
max_pooling_w2x2_s2x2_sse(sptr + n * C * IH * IW + c * IH * IW, IH, IW, | |||
dptr + n * C * OH * OW + c * OH * OW, OH, OW, | |||
PH, PW); | |||
}); | |||
} | |||
bool PoolingImpl::AlgoMaxW3S3SSE::is_available(const SizeArgs& args) const { | |||
auto SH = args.opr->param().stride_h; | |||
auto SW = args.opr->param().stride_w; | |||
auto FH = args.opr->param().window_h; | |||
auto FW = args.opr->param().window_w; | |||
return (is_supported(SIMDType::SSE) && | |||
args.layout_src.dtype == dtype::Float32() && | |||
args.opr->param().mode == Mode::MAX && | |||
args.opr->param().format == Param::Format::NCHW && FH == 3 && | |||
FW == 3 && SH == 2 && SW == 2); | |||
} | |||
void PoolingImpl::AlgoMaxW3S3SSE::exec(const ExecArgs& args) const { | |||
auto N = args.layout_src.shape[0]; | |||
auto C = args.layout_src.shape[1]; | |||
auto IH = args.layout_src.shape[2]; | |||
auto IW = args.layout_src.shape[3]; | |||
auto OH = args.layout_dst.shape[2]; | |||
auto OW = args.layout_dst.shape[3]; | |||
auto PH = args.opr->param().pad_h; | |||
auto PW = args.opr->param().pad_w; | |||
auto sptr = reinterpret_cast<dt_float32*>(args.src_tensor->raw_ptr); | |||
auto dptr = reinterpret_cast<dt_float32*>(args.dst_tensor->raw_ptr); | |||
auto handle = [=]() { return args.handle; }; | |||
MEGDNN_DISPATCH_CPU_KERN_OPR( | |||
WorkspaceBundle ws = get_bundle(args.layout_src, args.layout_dst, | |||
args.opr->param()); | |||
ws.set(args.workspace.raw_ptr); rep(n, N) rep(c, C) { | |||
do_max_pooling_3x3_s2x2_float_SSE( | |||
sptr + n * C * IH * IW + c * IH * IW, | |||
dptr + n * C * OH * OW + c * OH * OW, IH, IW, OH, OW, | |||
PH, PW, ws); | |||
}); | |||
} | |||
#if MEGDNN_X86_WITH_MKL_DNN | |||
bool PoolingImpl::AlgoMKLDNNNCHW::is_available(const SizeArgs& args) const { | |||
return ((args.layout_src.dtype.enumv() == DTypeEnum::QuantizedS8 || | |||
args.layout_src.dtype.enumv() == DTypeEnum::Int8) && | |||
args.opr->param().mode == Mode::MAX && | |||
args.opr->param().format == Param::Format::NCHW); | |||
} | |||
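// oneDNN provides optimized int8 max pooling for nhwc only and does not
// change layouts by itself, so this algorithm reorders the nchw input to
// nhwc, runs the pooling primitive, then reorders the nhwc result back to nchw.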
void PoolingImpl::AlgoMKLDNNNCHW::exec(const ExecArgs& args) const { | |||
auto PH = args.opr->param().pad_h; | |||
auto PW = args.opr->param().pad_w; | |||
auto FH = args.opr->param().window_h; | |||
auto FW = args.opr->param().window_w; | |||
auto SH = args.opr->param().stride_h; | |||
auto SW = args.opr->param().stride_w; | |||
auto handle = [=]() { return args.handle; }; | |||
auto x86_handle = static_cast<HandleImpl*>(inplace_cpu_handle().get()); | |||
auto mkldnn_eng = x86_handle->mkldnn_engine(); | |||
auto mkldnn_stream = x86_handle->mkldnn_stream(); | |||
auto mkldnn_pooling_mode = dnnl::algorithm::pooling_max; | |||
dnnl::memory::dims pool_strides = {SH, SW}; | |||
dnnl::memory::dims pool_padding = {PH, PW}; | |||
dnnl::memory::dims pool_kernel = {FH, FW}; | |||
dnnl::memory&& megdnn_src_memory_ori = | |||
tensor_to_mkl_memory<dnnl::memory::format_tag::nchw, false>( | |||
*args.src_tensor, mkldnn_eng, dnnl::memory::data_type::s8); | |||
dnnl::memory&& megdnn_dst_memory_ori = | |||
tensor_to_mkl_memory<dnnl::memory::format_tag::nchw, false>( | |||
*args.dst_tensor, mkldnn_eng, dnnl::memory::data_type::s8); | |||
dnnl::memory&& megdnn_src_memory = | |||
tensor_to_mkl_memory<dnnl::memory::format_tag::nhwc, true>( | |||
*args.src_tensor, mkldnn_eng, dnnl::memory::data_type::s8); | |||
dnnl::memory&& megdnn_dst_memory = | |||
tensor_to_mkl_memory<dnnl::memory::format_tag::nhwc, true>( | |||
*args.dst_tensor, mkldnn_eng, dnnl::memory::data_type::s8); | |||
auto reorder_src = dnnl::reorder(megdnn_src_memory_ori, megdnn_src_memory); | |||
auto reorder_dst = dnnl::reorder(megdnn_dst_memory, megdnn_dst_memory_ori); | |||
auto pool1_desc = dnnl::pooling_forward::desc( | |||
dnnl::prop_kind::forward_inference, mkldnn_pooling_mode, | |||
megdnn_src_memory.get_desc(), megdnn_dst_memory.get_desc(), | |||
pool_strides, pool_kernel, pool_padding, pool_padding); | |||
auto pool_pd = | |||
dnnl::pooling_forward::primitive_desc(pool1_desc, mkldnn_eng); | |||
auto pool = dnnl::pooling_forward(pool_pd); | |||
auto run = [mkldnn_stream, mkldnn_eng, reorder_src, pool, reorder_dst, | |||
megdnn_src_memory_ori, megdnn_src_memory, megdnn_dst_memory, | |||
megdnn_dst_memory_ori](void) { | |||
MEGDNN_MARK_USED_VAR(mkldnn_eng); | |||
auto mkl_stream = mkldnn_stream; | |||
reorder_src.execute(mkl_stream, {{DNNL_ARG_FROM, megdnn_src_memory_ori}, | |||
{DNNL_ARG_TO, megdnn_src_memory}}); | |||
pool.execute(mkl_stream, {{DNNL_ARG_SRC, megdnn_src_memory}, | |||
{DNNL_ARG_DST, megdnn_dst_memory}}); | |||
reorder_dst.execute(mkl_stream, {{DNNL_ARG_FROM, megdnn_dst_memory}, | |||
{DNNL_ARG_TO, megdnn_dst_memory_ori}}); | |||
mkl_stream.wait(); | |||
}; | |||
MEGDNN_DISPATCH_CPU_KERN_OPR(run()); | |||
} | |||
#endif | |||
#if MEGDNN_X86_WITH_MKL_DNN | |||
bool PoolingImpl::AlgoMKLDNNNCHW88::is_available(const SizeArgs& args) const { | |||
return (args.layout_src.dtype == dtype::Float32() && | |||
args.opr->param().mode == Mode::MAX && | |||
args.opr->param().format == Param::Format::NCHW88); | |||
} | |||
void PoolingImpl::AlgoMKLDNNNCHW88::exec(const ExecArgs& args) const { | |||
auto PH = args.opr->param().pad_h; | |||
auto PW = args.opr->param().pad_w; | |||
auto FH = args.opr->param().window_h; | |||
auto FW = args.opr->param().window_w; | |||
auto SH = args.opr->param().stride_h; | |||
auto SW = args.opr->param().stride_w; | |||
auto handle = [=]() { return args.handle; }; | |||
auto x86_handle = static_cast<HandleImpl*>(inplace_cpu_handle().get()); | |||
auto mkldnn_eng = x86_handle->mkldnn_engine(); | |||
auto mkldnn_stream = x86_handle->mkldnn_stream(); | |||
auto mkldnn_pooling_mode = dnnl::algorithm::pooling_max; | |||
switch (args.opr->param().mode) { | |||
case Mode::MAX: | |||
mkldnn_pooling_mode = dnnl::algorithm::pooling_max; | |||
break; | |||
case Mode::AVERAGE: | |||
mkldnn_pooling_mode = dnnl::algorithm::pooling_avg_include_padding; | |||
break; | |||
case Mode::AVERAGE_COUNT_EXCLUDE_PADDING: | |||
mkldnn_pooling_mode = dnnl::algorithm::pooling_avg_exclude_padding; | |||
break; | |||
default: | |||
megdnn_throw("not supported pooling mode\n"); | |||
}; | |||
dnnl::memory::dims pool_strides = {SH, SW}; | |||
dnnl::memory::dims pool_padding = {PH, PW}; | |||
dnnl::memory::dims pool_kernel = {FH, FW}; | |||
dnnl::memory&& megdnn_src_memory_ori = | |||
tensor_to_mkl_memory<dnnl::memory::format_tag::nChw8c, false>( | |||
*args.src_tensor, mkldnn_eng, dnnl::memory::data_type::f32); | |||
dnnl::memory&& megdnn_dst_memory_ori = | |||
tensor_to_mkl_memory<dnnl::memory::format_tag::nChw8c, false>( | |||
*args.dst_tensor, mkldnn_eng, dnnl::memory::data_type::f32); | |||
auto pool_desc = dnnl::pooling_forward::desc( | |||
dnnl::prop_kind::forward_inference, mkldnn_pooling_mode, | |||
megdnn_src_memory_ori.get_desc(), megdnn_dst_memory_ori.get_desc(), | |||
pool_strides, pool_kernel, pool_padding, pool_padding); | |||
auto pool_pd = dnnl::pooling_forward::primitive_desc(pool_desc, mkldnn_eng); | |||
auto pool = dnnl::pooling_forward(pool_pd); | |||
auto run = [mkldnn_stream, pool, mkldnn_eng, megdnn_src_memory_ori, | |||
megdnn_dst_memory_ori](void) { | |||
MEGDNN_MARK_USED_VAR(mkldnn_eng); | |||
auto mkl_stream = mkldnn_stream; | |||
pool.execute(mkl_stream, {{DNNL_ARG_SRC, megdnn_src_memory_ori}, | |||
{DNNL_ARG_DST, megdnn_dst_memory_ori}}); | |||
mkl_stream.wait(); | |||
}; | |||
MEGDNN_DISPATCH_CPU_KERN_OPR(run()); | |||
} | |||
#endif |
@@ -0,0 +1,132 @@ | |||
/** | |||
* \file dnn/src/x86/pooling/algo.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#pragma once | |||
#include <unordered_map> | |||
#include "src/common/algo_base.h" | |||
#include "src/common/metahelper.h" | |||
#include "src/x86/pooling/opr_impl.h" | |||
#include "src/x86/handle.h" | |||
namespace megdnn { | |||
namespace x86 { | |||
using AlgoBase = PoolingImpl::AlgoBase; | |||
class PoolingImpl::AlgoBase : public Algorithm { | |||
public: | |||
enum class AlgoType : uint32_t { | |||
X86_MeanW2S2AVX, | |||
X86_MeanW2S2SSE3, | |||
X86_MaxW2S2SSE, | |||
X86_MaxW3S3SSE, | |||
#if MEGDNN_X86_WITH_MKL_DNN | |||
X86_MKLDNNNCHW, | |||
X86_MKLDNNNCHW88, | |||
#endif | |||
X86_Fallback | |||
}; | |||
using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>; | |||
AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::X86; } | |||
virtual ~AlgoBase() = default; | |||
struct SizeArgs { | |||
HandleImpl* handle; | |||
PoolingImpl* opr; | |||
const TensorLayout layout_src, layout_dst; | |||
std::string to_string() const; | |||
SizeArgs(PoolingImpl* opr, const TensorLayout& src, | |||
const TensorLayout& dst); | |||
}; | |||
struct ExecArgs : public SizeArgs { | |||
const TensorND *src_tensor, *dst_tensor; | |||
Workspace workspace; | |||
ExecArgs(PoolingImpl* opr, _megdnn_tensor_in src, | |||
_megdnn_tensor_out dst, _megdnn_workspace workspace); | |||
}; | |||
virtual bool is_available(const SizeArgs& args) const = 0; | |||
virtual void exec(const ExecArgs& args) const = 0; | |||
uint32_t type() const override { return INVALID_ALGO_TYPE; }; | |||
bool is_available_attribute( | |||
const SizeArgs& args, | |||
const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE, | |||
const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) { | |||
return contain_attribute_all(positive_attr) && | |||
!contain_attribute_any(negative_attr) && is_available(args); | |||
} | |||
}; | |||
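// Declares one final algorithm class per specialized kernel; each exposes a
// "<NAME>_POOLING" name, the REPRODUCIBLE attribute, and its own
// is_available()/exec(), defined out of line in the corresponding .cpp.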
#define ALGO_IMPL(_name) \ | |||
class PoolingImpl::Algo##_name final : public AlgoBase { \ | |||
std::string m_algo_name; \ | |||
\ | |||
public: \ | |||
Algo##_name() : m_algo_name(std::string(#_name).append("_POOLING")) {} \ | |||
AlgoAttribute attribute() const override { \ | |||
return AlgoAttribute::REPRODUCIBLE; \ | |||
}; \ | |||
const char* name() const override { return m_algo_name.c_str(); } \ | |||
bool is_available(const SizeArgs& args) const override; \ | |||
void exec(const ExecArgs& args) const override; \ | |||
MEGDNN_DECL_ALGO_TYPE(X86_##_name) \ | |||
}; | |||
ALGO_IMPL(MeanW2S2AVX) | |||
ALGO_IMPL(MeanW2S2SSE3) | |||
ALGO_IMPL(MaxW2S2SSE) | |||
ALGO_IMPL(MaxW3S3SSE) | |||
#if MEGDNN_X86_WITH_MKL_DNN | |||
ALGO_IMPL(MKLDNNNCHW) | |||
ALGO_IMPL(MKLDNNNCHW88) | |||
#endif | |||
#undef ALGO_IMPL | |||
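// AlgoFallback is a sentinel: its exec() is empty because PoolingImpl::exec()
// dispatches to fallback::PoolingImpl directly when this algorithm is chosen.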
class PoolingImpl::AlgoFallback final : public AlgoBase { | |||
std::string m_algo_name; | |||
public: | |||
AlgoFallback() : m_algo_name("FALLBACK_POOLING") {} | |||
AlgoAttribute attribute() const override { | |||
return AlgoAttribute::REPRODUCIBLE; | |||
}; | |||
const char* name() const override { return m_algo_name.c_str(); } | |||
bool is_available(const SizeArgs&) const override { return true; } | |||
void exec(const ExecArgs&) const override {} | |||
MEGDNN_DECL_ALGO_TYPE(X86_Fallback) | |||
}; | |||
class PoolingImpl::AlgoPack : NonCopyableObj { | |||
private: | |||
AlgoBase::Mapper m_all_algos_map; | |||
AlgoMeanW2S2AVX algo_mean_w2s2_avx; | |||
AlgoMeanW2S2SSE3 algo_mean_w2s2_sse3; | |||
AlgoMaxW2S2SSE algo_max_w2s2_sse; | |||
AlgoMaxW3S3SSE algo_max_w3s3_sse; | |||
#if MEGDNN_X86_WITH_MKL_DNN | |||
AlgoMKLDNNNCHW algo_mkldnn_nchw; | |||
AlgoMKLDNNNCHW88 algo_mkldnn_nchw88; | |||
#endif | |||
AlgoFallback algo_fallback; | |||
public: | |||
AlgoPack(); | |||
std::vector<AlgoBase*> all_algos; | |||
const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; } | |||
}; | |||
} // namespace x86 | |||
} // namespace megdnn |
@@ -13,9 +13,9 @@ | |||
#include "src/common/utils.h" | |||
#include "src/naive/handle.h" | |||
#include "src/x86/handle.h" | |||
#include "src/x86/pooling/do_max_pooling_3x3_s2x2_float_sse.h" | |||
#include "src/x86/pooling/pooling_special_cases.h" | |||
#include "src/x86/utils.h" | |||
#include "src/x86/pooling/algo.h" | |||
#include "src/common/algo_chooser.h" | |||
#if MEGDNN_X86_WITH_MKL_DNN | |||
#include "mkldnn.hpp" | |||
@@ -24,10 +24,9 @@ | |||
using namespace megdnn; | |||
using namespace x86; | |||
namespace { | |||
WorkspaceBundle get_bundle(const TensorLayout& src, const TensorLayout& dst, | |||
const param::Pooling& param) { | |||
WorkspaceBundle megdnn::x86::get_bundle(const TensorLayout& src, | |||
const TensorLayout& dst, | |||
const param::Pooling& param) { | |||
megdnn_assert( | |||
is_supported(SIMDType::SSE) && src.dtype == dtype::Float32() && | |||
param.format == param::Pooling::Format::NCHW && | |||
@@ -45,242 +44,63 @@ WorkspaceBundle get_bundle(const TensorLayout& src, const TensorLayout& dst, | |||
return ws; | |||
} | |||
#if MEGDNN_X86_WITH_MKL_DNN | |||
template <dnnl::memory::format_tag format_tag, bool use_mkl_mem> | |||
dnnl::memory tensor_to_mkl_memory(_megdnn_tensor_in src, | |||
const dnnl::engine& mkldnn_eng, | |||
dnnl::memory::data_type mkldnn_datatype) { | |||
megdnn_assert(format_tag == dnnl::memory::format_tag::nChw8c || | |||
format_tag == dnnl::memory::format_tag::nchw || | |||
format_tag == dnnl::memory::format_tag::nhwc, | |||
"not support format"); | |||
dnnl::memory::dims src_shape = { | |||
static_cast<long>(src.layout[0]), static_cast<long>(src.layout[1]), | |||
static_cast<long>(src.layout[2]), static_cast<long>(src.layout[3])}; | |||
if (format_tag == dnnl::memory::format_tag::nChw8c) { | |||
src_shape = {static_cast<long>(src.layout[0]), | |||
static_cast<long>(src.layout[1] * 8), | |||
static_cast<long>(src.layout[2]), | |||
static_cast<long>(src.layout[3])}; | |||
} | |||
auto megdnn_src_md = | |||
dnnl::memory::desc({src_shape}, mkldnn_datatype, format_tag); | |||
if (use_mkl_mem) { | |||
auto megdnn_src_memory = dnnl::memory(megdnn_src_md, mkldnn_eng); | |||
return megdnn_src_memory; | |||
size_t PoolingImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& dst) { | |||
auto algo = get_algorithm(this, src, dst); | |||
if (!is_fallback_algo(algo)) { | |||
if (is_supported(SIMDType::SSE) && src.dtype == dtype::Float32() && | |||
param().mode == Mode::MAX && | |||
param().format == Param::Format::NCHW && param().window_h == 3 && | |||
param().window_w == 3 && param().stride_h == 2 && | |||
param().stride_w == 2) { | |||
WorkspaceBundle ws = get_bundle(src, dst, param()); | |||
return ws.total_size_in_bytes(); | |||
} else { | |||
return 0; | |||
} | |||
} else { | |||
auto megdnn_src_memory = dnnl::memory(megdnn_src_md, mkldnn_eng, | |||
const_cast<void*>(src.raw_ptr)); | |||
return megdnn_src_memory; | |||
        auto fallback_workspace =
                fallback::PoolingImpl::get_workspace_in_bytes(src, dst);
        return fallback_workspace;
} | |||
} | |||
#endif | |||
} // namespace | |||
size_t PoolingImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& dst) { | |||
if (is_supported(SIMDType::SSE) && src.dtype == dtype::Float32() && | |||
param().mode == Mode::MAX && param().format == Param::Format::NCHW && | |||
param().window_h == 3 && param().window_w == 3 && | |||
param().stride_h == 2 && param().stride_w == 2) { | |||
WorkspaceBundle ws = get_bundle(src, dst, param()); | |||
std::vector<Algorithm*> PoolingImpl::get_all_algorithms( | |||
const TensorLayout& src, const TensorLayout& dst) { | |||
return megdnn::get_all_algorithms<PoolingImpl>({this, src, dst}); | |||
} | |||
return ws.total_size_in_bytes(); | |||
} else { | |||
return 0; | |||
Algorithm* PoolingImpl::get_algorithm_heuristic( | |||
const TensorLayout& src, const TensorLayout& dst, | |||
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, | |||
const AlgoAttribute& negative_attr) { | |||
MEGDNN_MARK_USED_VAR(workspace_limit_in_bytes); | |||
AlgoBase::SizeArgs args(this, src, dst); | |||
for (auto iter : algo_pack().all_algos) { | |||
if (iter->is_available_attribute(args, positive_attr, negative_attr)) { | |||
return iter; | |||
} | |||
} | |||
megdnn_throw( | |||
ssprintf("require algorithm with attribute(%s) and without " | |||
"attribute(%s), but can't get suitable algo.\n", | |||
Algorithm::attribute_str(positive_attr).c_str(), | |||
Algorithm::attribute_str(negative_attr).c_str())); | |||
return nullptr; | |||
} | |||
void PoolingImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, | |||
_megdnn_workspace workspace) { | |||
check_exec(src.layout, dst.layout, workspace.size); | |||
size_t N = src.layout.shape[0], C = src.layout.shape[1], | |||
IH = src.layout.shape[2], IW = src.layout.shape[3]; | |||
size_t OH = dst.layout.shape[2], OW = dst.layout.shape[3]; | |||
auto mode = param().mode; | |||
auto FH = param().window_h, FW = param().window_w; | |||
auto SH = param().stride_h, SW = param().stride_w; | |||
auto PH = param().pad_h, PW = param().pad_w; | |||
bool is_average = (mode == Mode::AVERAGE); | |||
bool is_include = true; | |||
if (is_supported(SIMDType::AVX) && is_average && | |||
param().format == Param::Format::NCHW && | |||
src.layout.dtype == dtype::Float32() && FH == 2 && FW == 2 && SH == 2 && | |||
SW == 2) { | |||
auto sptr = src.ptr<dt_float32>(); | |||
auto dptr = dst.ptr<dt_float32>(); | |||
MEGDNN_DISPATCH_CPU_KERN_OPR(rep(n, N) rep(c, C) { | |||
mean_pooling_w2x2_s2x2_avx(sptr + n * C * IH * IW + c * IH * IW, IH, | |||
IW, dptr + n * C * OH * OW + c * OH * OW, | |||
OH, OW, PH, PW, is_include); | |||
}); | |||
return; | |||
} | |||
if (is_supported(SIMDType::SSE3) && is_average && | |||
src.layout.dtype == dtype::Float32() && | |||
param().format == Param::Format::NCHW && FH == 2 && FW == 2 && | |||
SH == 2 && SW == 2) { | |||
auto sptr = src.ptr<dt_float32>(); | |||
auto dptr = dst.ptr<dt_float32>(); | |||
MEGDNN_DISPATCH_CPU_KERN_OPR(rep(n, N) rep(c, C) { | |||
mean_pooling_w2x2_s2x2_sse3(sptr + n * C * IH * IW + c * IH * IW, | |||
IH, IW, | |||
dptr + n * C * OH * OW + c * OH * OW, | |||
OH, OW, PH, PW, is_include); | |||
}); | |||
return; | |||
} | |||
if (is_supported(SIMDType::SSE) && src.layout.dtype == dtype::Float32() && | |||
mode == Mode::MAX && param().format == Param::Format::NCHW && FH == 2 && | |||
FW == 2 && SH == 2 && SW == 2) { | |||
auto sptr = src.ptr<dt_float32>(); | |||
auto dptr = dst.ptr<dt_float32>(); | |||
MEGDNN_DISPATCH_CPU_KERN_OPR(rep(n, N) rep(c, C) { | |||
max_pooling_w2x2_s2x2_sse(sptr + n * C * IH * IW + c * IH * IW, IH, | |||
IW, dptr + n * C * OH * OW + c * OH * OW, | |||
OH, OW, PH, PW); | |||
}); | |||
return; | |||
} | |||
if (is_supported(SIMDType::SSE) && src.layout.dtype == dtype::Float32() && | |||
mode == Mode::MAX && param().format == Param::Format::NCHW && FH == 3 && | |||
FW == 3 && SH == 2 && SW == 2) { | |||
auto sptr = src.ptr<dt_float32>(); | |||
auto dptr = dst.ptr<dt_float32>(); | |||
MEGDNN_DISPATCH_CPU_KERN_OPR( | |||
WorkspaceBundle ws = | |||
get_bundle(src.layout, dst.layout, param()); | |||
ws.set(workspace.raw_ptr); rep(n, N) rep(c, C) { | |||
do_max_pooling_3x3_s2x2_float_SSE( | |||
sptr + n * C * IH * IW + c * IH * IW, | |||
dptr + n * C * OH * OW + c * OH * OW, IH, IW, OH, | |||
OW, PH, PW, ws); | |||
}); | |||
return; | |||
} | |||
#if MEGDNN_X86_WITH_MKL_DNN | |||
    // MKL-DNN provides optimized code for nhwc int8 pooling.
    // MKL-DNN cannot change the layout automatically.
    // Reorder the nchw input to nhwc, do the pooling, then reorder the nhwc result back to nchw.
if ((src.layout.dtype.enumv() == DTypeEnum::QuantizedS8 || | |||
src.layout.dtype.enumv() == DTypeEnum::Int8) && | |||
mode == Mode::MAX && param().format == Param::Format::NCHW) { | |||
auto x86_handle = static_cast<HandleImpl*>(inplace_cpu_handle().get()); | |||
auto mkldnn_eng = x86_handle->mkldnn_engine(); | |||
auto mkldnn_stream = x86_handle->mkldnn_stream(); | |||
auto mkldnn_pooling_mode = dnnl::algorithm::pooling_max; | |||
dnnl::memory::dims pool_strides = {SH, SW}; | |||
dnnl::memory::dims pool_padding = {PH, PW}; | |||
dnnl::memory::dims pool_kernel = {FH, FW}; | |||
dnnl::memory&& megdnn_src_memory_ori = | |||
tensor_to_mkl_memory<dnnl::memory::format_tag::nchw, false>( | |||
src, mkldnn_eng, dnnl::memory::data_type::s8); | |||
dnnl::memory&& megdnn_dst_memory_ori = | |||
tensor_to_mkl_memory<dnnl::memory::format_tag::nchw, false>( | |||
dst, mkldnn_eng, dnnl::memory::data_type::s8); | |||
dnnl::memory&& megdnn_src_memory = | |||
tensor_to_mkl_memory<dnnl::memory::format_tag::nhwc, true>( | |||
src, mkldnn_eng, dnnl::memory::data_type::s8); | |||
dnnl::memory&& megdnn_dst_memory = | |||
tensor_to_mkl_memory<dnnl::memory::format_tag::nhwc, true>( | |||
dst, mkldnn_eng, dnnl::memory::data_type::s8); | |||
auto reorder_src = | |||
dnnl::reorder(megdnn_src_memory_ori, megdnn_src_memory); | |||
auto reorder_dst = | |||
dnnl::reorder(megdnn_dst_memory, megdnn_dst_memory_ori); | |||
auto pool1_desc = dnnl::pooling_forward::desc( | |||
dnnl::prop_kind::forward_inference, mkldnn_pooling_mode, | |||
megdnn_src_memory.get_desc(), megdnn_dst_memory.get_desc(), | |||
pool_strides, pool_kernel, pool_padding, pool_padding); | |||
auto pool_pd = | |||
dnnl::pooling_forward::primitive_desc(pool1_desc, mkldnn_eng); | |||
auto pool = dnnl::pooling_forward(pool_pd); | |||
auto run = [mkldnn_stream, mkldnn_eng, reorder_src, pool, reorder_dst, | |||
megdnn_src_memory_ori, megdnn_src_memory, megdnn_dst_memory, | |||
megdnn_dst_memory_ori](void) { | |||
MEGDNN_MARK_USED_VAR(mkldnn_eng); | |||
auto mkl_stream = mkldnn_stream; | |||
reorder_src.execute(mkl_stream, | |||
{{DNNL_ARG_FROM, megdnn_src_memory_ori}, | |||
{DNNL_ARG_TO, megdnn_src_memory}}); | |||
pool.execute(mkl_stream, {{DNNL_ARG_SRC, megdnn_src_memory}, | |||
{DNNL_ARG_DST, megdnn_dst_memory}}); | |||
reorder_dst.execute(mkl_stream, | |||
{{DNNL_ARG_FROM, megdnn_dst_memory}, | |||
{DNNL_ARG_TO, megdnn_dst_memory_ori}}); | |||
mkl_stream.wait(); | |||
}; | |||
MEGDNN_DISPATCH_CPU_KERN_OPR(run()); | |||
return; | |||
} | |||
if (src.layout.dtype == dtype::Float32() && mode == Mode::MAX && | |||
param().format == Param::Format::NCHW88) { | |||
auto x86_handle = static_cast<HandleImpl*>(inplace_cpu_handle().get()); | |||
auto mkldnn_eng = x86_handle->mkldnn_engine(); | |||
auto mkldnn_stream = x86_handle->mkldnn_stream(); | |||
auto mkldnn_pooling_mode = dnnl::algorithm::pooling_max; | |||
switch (mode) { | |||
case Mode::MAX: | |||
mkldnn_pooling_mode = dnnl::algorithm::pooling_max; | |||
break; | |||
case Mode::AVERAGE: | |||
mkldnn_pooling_mode = | |||
dnnl::algorithm::pooling_avg_include_padding; | |||
break; | |||
case Mode::AVERAGE_COUNT_EXCLUDE_PADDING: | |||
mkldnn_pooling_mode = | |||
dnnl::algorithm::pooling_avg_exclude_padding; | |||
break; | |||
default: | |||
megdnn_assert(0, "not supported pooling mode\n"); | |||
}; | |||
dnnl::memory::dims pool_strides = {SH, SW}; | |||
dnnl::memory::dims pool_padding = {PH, PW}; | |||
dnnl::memory::dims pool_kernel = {FH, FW}; | |||
dnnl::memory&& megdnn_src_memory_ori = | |||
tensor_to_mkl_memory<dnnl::memory::format_tag::nChw8c, false>( | |||
src, mkldnn_eng, dnnl::memory::data_type::f32); | |||
dnnl::memory&& megdnn_dst_memory_ori = | |||
tensor_to_mkl_memory<dnnl::memory::format_tag::nChw8c, false>( | |||
dst, mkldnn_eng, dnnl::memory::data_type::f32); | |||
auto pool_desc = dnnl::pooling_forward::desc( | |||
dnnl::prop_kind::forward_inference, mkldnn_pooling_mode, | |||
megdnn_src_memory_ori.get_desc(), | |||
megdnn_dst_memory_ori.get_desc(), pool_strides, pool_kernel, | |||
pool_padding, pool_padding); | |||
auto pool_pd = | |||
dnnl::pooling_forward::primitive_desc(pool_desc, mkldnn_eng); | |||
auto pool = dnnl::pooling_forward(pool_pd); | |||
auto run = [mkldnn_stream, pool, mkldnn_eng, megdnn_src_memory_ori, | |||
megdnn_dst_memory_ori](void) { | |||
MEGDNN_MARK_USED_VAR(mkldnn_eng); | |||
auto mkl_stream = mkldnn_stream; | |||
pool.execute(mkl_stream, {{DNNL_ARG_SRC, megdnn_src_memory_ori}, | |||
{DNNL_ARG_DST, megdnn_dst_memory_ori}}); | |||
mkl_stream.wait(); | |||
}; | |||
MEGDNN_DISPATCH_CPU_KERN_OPR(run()); | |||
return; | |||
AlgoBase::ExecArgs args(this, src, dst, workspace); | |||
auto algo = get_algorithm(this, src.layout, dst.layout); | |||
if (!is_fallback_algo(algo)) { | |||
algo->exec(args); | |||
} else { | |||
fallback::PoolingImpl::exec(src, dst, Workspace()); | |||
} | |||
#endif | |||
fallback::PoolingImpl::exec(src, dst, Workspace()); | |||
} | |||
// vim: syntax=cpp.doxygen |
@@ -6,7 +6,8 @@ | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#pragma once | |||
#include "src/fallback/pooling/opr_impl.h" | |||
@@ -14,17 +15,62 @@ | |||
namespace megdnn { | |||
namespace x86 { | |||
class PoolingImpl: public fallback::PoolingImpl { | |||
public: | |||
using fallback::PoolingImpl::PoolingImpl; | |||
void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, | |||
_megdnn_workspace) override; | |||
size_t get_workspace_in_bytes(const TensorLayout &, | |||
const TensorLayout &) override; | |||
}; | |||
class PoolingImpl : public fallback::PoolingImpl { | |||
private: | |||
class AlgoMeanW2S2AVX; | |||
class AlgoMeanW2S2SSE3; | |||
class AlgoMaxW2S2SSE; | |||
class AlgoMaxW3S3SSE; | |||
#if MEGDNN_X86_WITH_MKL_DNN | |||
class AlgoMKLDNNNCHW; | |||
class AlgoMKLDNNNCHW88; | |||
#endif | |||
class AlgoFallback; | |||
class AlgoPack; | |||
static AlgoPack sm_algo_pack; | |||
} // namespace x86 | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen | |||
public: | |||
using fallback::PoolingImpl::PoolingImpl; | |||
class AlgoBase; | |||
void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, | |||
_megdnn_workspace) override; | |||
size_t get_workspace_in_bytes(const TensorLayout&, | |||
const TensorLayout&) override; | |||
static size_t constexpr MAX_SPATIAL_DIM = 2; | |||
const char* get_algorithm_set_name() const override { | |||
return "X86_POOLING_FORWARD"; | |||
} | |||
Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override; | |||
AlgorithmInfo get_algorithm_info_heuristic( | |||
const TensorLayout& src, const TensorLayout& dst, | |||
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, | |||
const AlgoAttribute& negative_attr) { | |||
return get_algorithm_heuristic(src, dst, workspace_limit_in_bytes, | |||
positive_attr, negative_attr) | |||
->info(); | |||
} | |||
static const AlgoPack& algo_pack() { return sm_algo_pack; } | |||
bool is_fallback_algo(Algorithm* algo) { | |||
return strcmp(algo->name(), "FALLBACK_POOLING") == 0; | |||
} | |||
protected: | |||
std::vector<Algorithm*> get_all_algorithms( | |||
const TensorLayout& src, const TensorLayout& dst) override; | |||
Algorithm* get_algorithm_heuristic( | |||
const TensorLayout& src, const TensorLayout& dst, | |||
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, | |||
const AlgoAttribute& negative_attr) override; | |||
}; | |||
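// Workspace bundle for the 3x3 stride-2 max-pooling SSE kernel; shared by
// get_workspace_in_bytes() and the AlgoMaxW3S3SSE implementation.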
WorkspaceBundle get_bundle(const TensorLayout& src, const TensorLayout& dst, | |||
const param::Pooling& param); | |||
} // namespace x86 | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen |
@@ -159,6 +159,42 @@ TEST(TestOprDNN, PoolingExePolicy) { | |||
"cudnnReproducible") != std::string::npos); | |||
} | |||
TEST(TestOprDNN, PoolingForwardFastrun) { | |||
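    // Exercises profile-based (fastrun) algorithm selection for PoolingForward:
    // the strategy requires REPRODUCIBLE algorithms, and profiling results go
    // into an in-memory cache so they do not persist across test runs.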
using Param = opr::Pooling::Param; | |||
Param param; | |||
using Policy = opr::Pooling::ExecutionPolicy; | |||
using S = Policy::Strategy; | |||
auto cn = CompNode::load("xpux"); | |||
cn.activate(); | |||
auto orig_impl = PersistentCache::set_impl( | |||
std::make_shared<InMemoryPersistentCache>()); | |||
HostTensorND host_y; | |||
S strategy = S::PROFILE | S::REPRODUCIBLE; | |||
auto graph = ComputingGraph::make(); | |||
HostTensorGenerator<> gen; | |||
TensorShape shape = {1, 20, 24, 24}; | |||
auto input = opr::Host2DeviceCopy::make(*graph, gen(shape, cn)); | |||
param.mode = Param::Mode::MAX; | |||
param.window_h = param.window_w = 2; | |||
param.stride_h = param.stride_w = 2; | |||
param.pad_h = param.pad_w = 0; | |||
param.format = Param::Format::NCHW; | |||
Policy policy; | |||
policy.strategy = strategy; | |||
auto pooling = opr::PoolingForward::make(input, param, {}, policy); | |||
auto func = graph->compile({make_callback_copy(pooling, host_y)}); | |||
func->execute().wait(); | |||
} | |||
} // anonymous namespace | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |