GitOrigin-RevId: 55a700d747
release-1.1
@@ -20,7 +20,6 @@ | |||||
#include "src/x86/conv_bias/postprocess_helper.h" | #include "src/x86/conv_bias/postprocess_helper.h" | ||||
#include "src/x86/convolution/convolution_direct_special_cases.h" | #include "src/x86/convolution/convolution_direct_special_cases.h" | ||||
#include "src/x86/handle.h" | #include "src/x86/handle.h" | ||||
#include "src/x86/profile.h" | |||||
#include "midout.h" | #include "midout.h" | ||||
@@ -487,153 +486,6 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoDirectStride2::get_kimpls( | |||||
const NCBKernSizeParam& param) const { | const NCBKernSizeParam& param) const { | ||||
GET_KERN; | GET_KERN; | ||||
} | } | ||||
/* ===================== matmul algo ===================== */ | |||||
/*!
 * \brief Describe the workspace needed by the im2col + matmul kernel.
 *
 * The bundle has three parts:
 *   - part 0: zero-padded copy of the source (with 4 extra floats)
 *   - part 1: unrolled (im2col) matrix (with 4 extra floats)
 *   - part 2: workspace requested by the matrix mul operator
 *
 * Parts 0 and 1 are empty when the convolution is 1x1 with unit stride and
 * no padding, because the source is then handed to the matmul directly.
 *
 * \param param kernel size parameters; the dimension variables used below
 *      are introduced by UNPACK_CONV_F32_NCB_KERN_SIZES.
 * \return a bundle with a null base pointer and the three part sizes.
 */
WorkspaceBundle ConvBiasImpl::AlgoMatrixMul::get_bundle(
        const NCBKernSizeParam& param) {
    UNPACK_CONV_F32_NCB_KERN_SIZES(param);
    MEGDNN_MARK_USED_VAR(N);
    MEGDNN_MARK_USED_VAR(OC);
    // Padded source extents. The original named these the other way round
    // (IW2 from IH/PH); only their product is used, so sizes are unchanged,
    // but the names now agree with kimpl().
    const auto IH2 = IH + 2 * PH;
    const auto IW2 = IW + 2 * PW;
    const bool can_matrix_mul_direct =
            (FH == 1 && FW == 1 && SH == 1 && SW == 1 && PH == 0 && PW == 0);

    size_t part0 = 0, part1 = 0, part2 = 0;
    if (!can_matrix_mul_direct) {
        // temp space to store the padded src (with 4 extra floats)
        part0 = (IC * IH2 * IW2 + 4) * sizeof(float);
        // temp space to store the unrolled matrix (with 4 extra floats)
        part1 = (IC * FH * FW * OH * OW + 4) * sizeof(float);
    }
    {
        // workspace for the matrix mul opr: (OC x K) * (K x OH*OW),
        // with K = IC * FH * FW
        TensorLayout A_, B_, C_;
        A_ = TensorLayout({OC, IC * FH * FW}, dtype::Float32());
        B_ = TensorLayout({IC * FH * FW, OH * OW}, dtype::Float32());
        C_ = TensorLayout({OC, OH * OW}, dtype::Float32());
        part2 = get_matmul_opr()->get_workspace_in_bytes(A_, B_, C_);
    }
    return {nullptr, {part0, part1, part2}};
}
bool ConvBiasImpl::AlgoMatrixMul::is_preferred( | |||||
const NCBKernSizeParam& param) const { | |||||
auto&& fm = param.filter_meta; | |||||
if (fm.dilation[0] != 1 || fm.dilation[1] != 1) { | |||||
return false; | |||||
} | |||||
// single channel conv should never use matrix mul | |||||
if (fm.ocpg == 1 || fm.icpg == 1) | |||||
return false; | |||||
// 1x1 conv should always use matrix mul | |||||
if (fm.spatial[0] == 1 && fm.spatial[1] == 1) | |||||
return true; | |||||
// if stride is not 1x1, always use matrix mul | |||||
if (fm.stride[0] != 1 || fm.stride[1] != 1) | |||||
return true; | |||||
int f = find_nearest_elem<int>( | |||||
std::round(geometric_mean(fm.spatial[0], fm.spatial[1])), | |||||
{2, 3, 4, 5, 6, 7}); | |||||
int oc = find_nearest_elem<int>(fm.ocpg, {4, 8, 16, 32, 64, 96, 128}); | |||||
int ic = find_nearest_elem<int>(fm.icpg, {4, 8, 16, 32, 64, 96, 128}); | |||||
int on = std::round(geometric_mean(param.osz[0], param.osz[1])); | |||||
ProfileElement cur(f, oc, ic, on); | |||||
auto H = static_cast<HandleImpl*>(inplace_cpu_handle().get()); | |||||
auto&& target = std::lower_bound(H->profile_cache().begin(), | |||||
H->profile_cache().end(), cur); | |||||
megdnn_assert_internal(target->f == cur.f); | |||||
megdnn_assert_internal(target->oc == cur.oc); | |||||
megdnn_assert_internal(target->ic == cur.ic); | |||||
return on < target->on_threshold; | |||||
} | |||||
//! Return the MatrixMul operator obtained from a function-local static
//! CpuOprDelegationStorage, so every call goes through the same storage.
MatrixMul* ConvBiasImpl::AlgoMatrixMul::get_matmul_opr() {
    static CpuOprDelegationStorage<> s_opr_storage;
    MatrixMul* matmul = s_opr_storage.get<MatrixMul>();
    return matmul;
}
void ConvBiasImpl::AlgoMatrixMul::kimpl(const NCBKernParam& param, | |||||
const NCBKernIndex& ncb_index) { | |||||
UNPACK_CONV_F32_NCB_KERN_SIZES(param); | |||||
auto IH2 = IH + 2 * PH; | |||||
auto IW2 = IW + 2 * PW; | |||||
size_t group_id = ncb_index.ndrange_id[0]; | |||||
bool is_xcorr = !param.filter_meta.should_flip; | |||||
auto bundle = get_bundle(param); | |||||
bundle.set(param.workspace_ptr); | |||||
// workspace = tmp..src2 | |||||
for (size_t n = 0; n < N; ++n) { | |||||
float* src = const_cast<float*>(param.src<float>(n, group_id)); | |||||
float* dst = param.dst<float>(n, group_id); | |||||
float* bias_ptr = static_cast<float*>( | |||||
const_cast<void*>(param.bias<void>(n, group_id))); | |||||
float *B, *src2; | |||||
if (FH == 1 && FW == 1 && SH == 1 && SW == 1 && PH == 0 && PW == 0) { | |||||
// special case: 1x1 | |||||
B = src; | |||||
} else { | |||||
src2 = static_cast<float*>(bundle.get(0)); | |||||
// copy src to src2; | |||||
float* src2_ptr = src2; | |||||
const float* src_ptr = src; | |||||
rep(ic, IC) { | |||||
if (PH != 0) { | |||||
std::memset(src2_ptr, 0, sizeof(float) * PH * IW2); | |||||
src2_ptr += PH * IW2; | |||||
} | |||||
rep(ih, IH) { | |||||
if (PW != 0) | |||||
rep(pw, PW) * (src2_ptr++) = 0.0f; | |||||
std::memcpy(src2_ptr, src_ptr, sizeof(float) * IW); | |||||
src2_ptr += IW; | |||||
src_ptr += IW; | |||||
if (PW != 0) | |||||
rep(pw, PW) * (src2_ptr++) = 0.0f; | |||||
} | |||||
if (PH != 0) { | |||||
std::memset(src2_ptr, 0, sizeof(float) * PH * IW2); | |||||
src2_ptr += PH * IW2; | |||||
} | |||||
} | |||||
B = static_cast<float*>(bundle.get(1)); | |||||
if (SH == 1 && SW == 1) { | |||||
if (is_xcorr) { | |||||
img2col<true>(src2, B, OC, OH, OW, IC, IH2, IW2, FH, FW); | |||||
} else { | |||||
img2col<false>(src2, B, OC, OH, OW, IC, IH2, IW2, FH, FW); | |||||
} | |||||
} else { | |||||
if (is_xcorr) { | |||||
img2col_stride<true>(src2, B, OC, OH, OW, IC, IH2, IW2, FH, | |||||
FW, SH, SW); | |||||
} else { | |||||
img2col_stride<false>(src2, B, OC, OH, OW, IC, IH2, IW2, FH, | |||||
FW, SH, SW); | |||||
} | |||||
} | |||||
} | |||||
{ | |||||
TensorND A_, B_, C_; | |||||
A_.layout = TensorLayout({OC, IC * FH * FW}, dtype::Float32()); | |||||
A_.raw_ptr = const_cast<float*>(param.filter<float>(group_id)); | |||||
B_.layout = TensorLayout({IC * FH * FW, OH * OW}, dtype::Float32()); | |||||
B_.raw_ptr = B; | |||||
C_.layout = TensorLayout({OC, OH * OW}, dtype::Float32()); | |||||
C_.raw_ptr = dst; | |||||
Workspace workspace(static_cast<dt_byte*>(bundle.get(2)), | |||||
bundle.get_size(2)); | |||||
get_matmul_opr()->exec(A_, B_, C_, workspace); | |||||
} | |||||
PostProcess<float>::run(dst, bias_ptr, dst, param.bias_mode, | |||||
param.nonlineMode, param.bias_type, | |||||
param.dst_type, 1_z, OC, OH, OW); | |||||
} | |||||
} | |||||
#if MEGDNN_X86_WITH_MKL_DNN | #if MEGDNN_X86_WITH_MKL_DNN | ||||
static inline void mkldnn_fp32_conv_instance( | static inline void mkldnn_fp32_conv_instance( | ||||
@@ -123,47 +123,6 @@ public: | |||||
MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(AlgoDataType::FLOAT32); | MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(AlgoDataType::FLOAT32); | ||||
}; | }; | ||||
/* ===================== matmul algo ===================== */ | |||||
//! Float32 NCHW convolution implemented as im2col + matrix multiplication.
class ConvBiasImpl::AlgoMatrixMul final : public AlgoBase {
    static MatrixMul* get_matmul_opr();
    static WorkspaceBundle get_bundle(const NCBKernSizeParam& param);
    static void kimpl(const NCBKernParam& param, const NCBKernIndex&);

public:
    bool is_reproducible() const override { return true; }

    const char* name() const override { return "X86_CONV_BIAS_MATMUL"; }

    //! Usable for 2D NCHW, all-Float32, non-dilated convolution when running
    //! on a single thread.
    bool usable(const NCBKernSizeParam& param,
                AlgoSelectionStrategy) const override {
        auto&& meta = param.filter_meta;
        if (!(meta.format == Param::Format::NCHW))
            return false;
        if (!(meta.spatial_ndim == 2))
            return false;
        if (!(param.src_type.enumv() == DTypeEnum::Float32))
            return false;
        if (!(param.filter_type.enumv() == DTypeEnum::Float32))
            return false;
        if (!(param.dst_type.enumv() == DTypeEnum::Float32))
            return false;
        if (!(meta.dilation[0] == 1))
            return false;
        if (!(meta.dilation[1] == 1))
            return false;
        //! The matmul opr is only used in single thread
        //! TODO:support the no pack matmul algo in fallback im2col +
        //! matmul
        return param.nr_threads == 1_z;
    }

    bool is_preferred(const NCBKernSizeParam&) const override;

    size_t get_workspace(const NCBKernSizeParam& param) const override {
        auto bundle = get_bundle(param);
        return bundle.total_size_in_bytes();
    }

    //! One kernel, dispatched over an ndrange of {group, 1, 1}.
    SmallVector<NCBKern> dispatch_kerns(
            const NCBKernSizeParam& param) const override {
        size_t nr_group = param.filter_meta.group;
        return {{kimpl, {nr_group, 1_z, 1_z}}};
    }

    void* type() const override;

    ConvAlgoTypePack get_algo_type() const override {
        return {AlgoDataType::FLOAT32, AlgoCategory::IM2COL};
    }
};
#if MEGDNN_X86_WITH_MKL_DNN | #if MEGDNN_X86_WITH_MKL_DNN | ||||
class ConvBiasImpl::AlgoMkldnnConv final : public AlgoBase { | class ConvBiasImpl::AlgoMkldnnConv final : public AlgoBase { | ||||
static void kern_mkldnn_fp32(const NCBKernParam& param, | static void kern_mkldnn_fp32(const NCBKernParam& param, | ||||
@@ -47,10 +47,6 @@ void* ConvBiasImpl::AlgoDirectStride2::type() const { | |||||
return x86_algo_type; | return x86_algo_type; | ||||
} | } | ||||
void* ConvBiasImpl::AlgoMatrixMul::type() const { | |||||
return x86_algo_type; | |||||
} | |||||
void* ConvBiasImpl::AlgoDirectAvx2Stride1Int8::type() const { | void* ConvBiasImpl::AlgoDirectAvx2Stride1Int8::type() const { | ||||
return x86_algo_type; | return x86_algo_type; | ||||
} | } | ||||
@@ -82,7 +78,6 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj { | |||||
AlgoAVX2DirectConvStride2 avx2_stride2_direct; | AlgoAVX2DirectConvStride2 avx2_stride2_direct; | ||||
AlgoChanWiseAvx2Stride1Qint8 avx2_stride1_chanwsie_qint8; | AlgoChanWiseAvx2Stride1Qint8 avx2_stride1_chanwsie_qint8; | ||||
AlgoChanWiseAvx2Stride2Qint8 avx2_stride2_chanwsie_qint8; | AlgoChanWiseAvx2Stride2Qint8 avx2_stride2_chanwsie_qint8; | ||||
AlgoMatrixMul matmul; | |||||
#if MEGDNN_X86_WITH_MKL_DNN | #if MEGDNN_X86_WITH_MKL_DNN | ||||
AlgoMkldnnMatmulQint8 mkldnn_matmul_qint8; | AlgoMkldnnMatmulQint8 mkldnn_matmul_qint8; | ||||
//! Because the mkldnnconv need handle | //! Because the mkldnnconv need handle | ||||
@@ -107,7 +102,6 @@ public: | |||||
all_algos.emplace_back(&avx2_stride2_chanwsie_qint8); | all_algos.emplace_back(&avx2_stride2_chanwsie_qint8); | ||||
all_algos.emplace_back(&avx2_stride1_direct_int8); | all_algos.emplace_back(&avx2_stride1_direct_int8); | ||||
all_algos.emplace_back(&avx2_stride2_direct); | all_algos.emplace_back(&avx2_stride2_direct); | ||||
all_algos.emplace_back(&matmul); | |||||
static CpuOprDelegationStorage<> storage; | static CpuOprDelegationStorage<> storage; | ||||
auto matmul_opr = storage.get<MatrixMul>(); | auto matmul_opr = storage.get<MatrixMul>(); | ||||
@@ -31,7 +31,6 @@ public: | |||||
class AlgoDirectStride2; | class AlgoDirectStride2; | ||||
class AlgoFP32WinogradF63_8x8; | class AlgoFP32WinogradF63_8x8; | ||||
class AlgoFP32WinogradF23_8x8; | class AlgoFP32WinogradF23_8x8; | ||||
class AlgoMatrixMul; | |||||
class AlgoDirectAvx2Stride1Int8; | class AlgoDirectAvx2Stride1Int8; | ||||
class AlgoAVX2DirectConvStride2; | class AlgoAVX2DirectConvStride2; | ||||
class AlgoChanWiseAvx2Stride1Qint8; | class AlgoChanWiseAvx2Stride1Qint8; | ||||
@@ -11,8 +11,6 @@ | |||||
#pragma once | #pragma once | ||||
#include "src/fallback/handle.h" | #include "src/fallback/handle.h" | ||||
#include "src/x86/profile.h" | |||||
#if MEGDNN_X86_WITH_MKL_DNN | #if MEGDNN_X86_WITH_MKL_DNN | ||||
#include <mkldnn.hpp> | #include <mkldnn.hpp> | ||||
#endif | #endif | ||||
@@ -22,8 +20,6 @@ namespace x86 { | |||||
class HandleImpl : public fallback::HandleImpl { | class HandleImpl : public fallback::HandleImpl { | ||||
public: | public: | ||||
const ProfileCache& profile_cache() { return m_profile_cache; } | |||||
HandleImpl(megcoreComputingHandle_t computing_handle, | HandleImpl(megcoreComputingHandle_t computing_handle, | ||||
HandleType type = HandleType::X86); | HandleType type = HandleType::X86); | ||||
@@ -37,7 +33,6 @@ public: | |||||
#endif | #endif | ||||
private: | private: | ||||
ProfileCache m_profile_cache = get_profile_cache(); | |||||
#if MEGDNN_X86_WITH_MKL_DNN | #if MEGDNN_X86_WITH_MKL_DNN | ||||
dnnl::engine m_mkldnn_engine; | dnnl::engine m_mkldnn_engine; | ||||
dnnl::stream m_mkldnn_stream; | dnnl::stream m_mkldnn_stream; | ||||
@@ -1,324 +0,0 @@ | |||||
/** | |||||
* \file dnn/src/x86/profile.cpp | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
*/ | |||||
#include "src/x86/profile.h" | |||||
namespace megdnn { | |||||
namespace x86 { | |||||
/*!
 * \brief Build the table of profiled thresholds.
 *
 * The result holds 6 * 7 * 7 = 294 ProfileElement entries, ordered by the
 * first, then second, then third constructor argument (ascending), i.e. the
 * same order in which they are pushed here.
 *
 * \return the freshly built cache.
 */
ProfileCache get_profile_cache() {
    // Sentinel threshold used by many entries; it is far larger than the
    // other thresholds in the table.
    constexpr int kHuge = 1000000000;

    // Values taken by the first constructor argument.
    static const int kFirst[6] = {2, 3, 4, 5, 6, 7};
    // Values taken by both the second and the third constructor argument.
    static const int kChannels[7] = {4, 8, 16, 32, 64, 96, 128};

    // kThreshold[i][j][k] is the threshold for
    // ProfileElement(kFirst[i], kChannels[j], kChannels[k], ...).
    static const int kThreshold[6][7][7] = {
            // first argument == 2
            {{49, 25, 19, 14, 13, 15, 15},
             {241, 121, 57, 29, 17, 39, 29},
             {kHuge, 273, 177, 137, kHuge, kHuge, kHuge},
             {kHuge, kHuge, kHuge, kHuge, kHuge, kHuge, kHuge},
             {kHuge, kHuge, kHuge, kHuge, kHuge, kHuge, kHuge},
             {kHuge, kHuge, kHuge, kHuge, kHuge, kHuge, kHuge},
             {kHuge, kHuge, kHuge, kHuge, kHuge, kHuge, kHuge}},
            // first argument == 3
            {{10, 5, 7, 7, 6, 5, 5},
             {14, 13, 13, 13, 11, 11, 12},
             {37, 29, 21, 19, 14, 13, 13},
             {69, 105, 105, 49, 29, 27, 39},
             {193, 161, 137, 113, kHuge, kHuge, kHuge},
             {kHuge, 305, kHuge, kHuge, kHuge, kHuge, kHuge},
             {kHuge, kHuge, kHuge, kHuge, kHuge, kHuge, kHuge}},
            // first argument == 4
            {{7, 7, 5, 6, 5, 5, 5},
             {14, 12, 5, 6, 6, 6, 5},
             {14, 14, 13, 13, 13, 13, 13},
             {37, 31, 29, 21, 21, 29, 21},
             {137, 113, 89, 69, 45, 37, 35},
             {137, 113, 105, 77, 53, 45, 39},
             {137, 121, 153, 97, kHuge, kHuge, kHuge}},
            // first argument == 5
            {{8, 9, 5, 5, 5, 5, 5},
             {7, 6, 5, 5, 5, 5, 5},
             {21, 12, 12, 11, 11, 11, 11},
             {23, 14, 14, 13, 13, 13, 13},
             {77, 39, 37, 29, 29, 21, 21},
             {113, 77, 61, 39, 37, 31, 29},
             {113, 97, 69, 53, 39, 31, 31}},
            // first argument == 6
            {{7, 3, 5, 4, 5, 4, 4},
             {11, 5, 5, 5, 5, 5, 5},
             {13, 11, 11, 5, 5, 5, 11},
             {21, 14, 13, 13, 13, 13, 13},
             {39, 29, 29, 21, 21, 21, 21},
             {97, 61, 39, 37, 29, 29, 21},
             {77, 61, 39, 37, 29, 29, 23}},
            // first argument == 7
            {{5, 4, 4, 4, 4, 4, 3},
             {5, 5, 5, 5, 5, 5, 5},
             {13, 11, 5, 5, 5, 5, 5},
             {21, 13, 13, 13, 13, 13, 12},
             {37, 21, 14, 14, 14, 13, 14},
             {61, 39, 37, 31, 21, 21, 21},
             {61, 31, 37, 11, 13, 23, 21}},
    };

    ProfileCache cache;
    cache.reserve(294);
    for (int i = 0; i < 6; ++i) {
        for (int j = 0; j < 7; ++j) {
            for (int k = 0; k < 7; ++k) {
                cache.push_back(ProfileElement(kFirst[i], kChannels[j],
                                               kChannels[k],
                                               kThreshold[i][j][k]));
            }
        }
    }
    return cache;
}
} // namespace x86 | |||||
} // namespace megdnn | |||||
// vim: syntax=cpp.doxygen | |||||
@@ -1,45 +0,0 @@ | |||||
/** | |||||
* \file dnn/src/x86/profile.h | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
*/ | |||||
#pragma once | |||||
#include <vector> | |||||
namespace megdnn { | |||||
namespace x86 { | |||||
//! One profiled configuration together with its output-size threshold.
struct ProfileElement {
    // when output_size > on_threshold, DIRECT is faster,
    // otherwise MATRIX_MUL is faster
    int f, ic, oc, on_threshold;

    ProfileElement(int f, int ic, int oc, int on_threshold)
            : f(f), ic(ic), oc(oc), on_threshold(on_threshold) {}

    //! Lexicographic order on (f, ic, oc); on_threshold does not take part
    //! in the comparison.
    bool operator<(const ProfileElement& rhs) const {
        if (f != rhs.f)
            return f < rhs.f;
        if (ic != rhs.ic)
            return ic < rhs.ic;
        return oc < rhs.oc;
    }
};
using ProfileCache = std::vector<ProfileElement>; | |||||
ProfileCache get_profile_cache(); | |||||
} // namespace x86 | |||||
} // namespace megdnn | |||||
// vim: syntax=cpp.doxygen | |||||
@@ -63,7 +63,6 @@ | |||||
#include "./sep_conv_filter.h" | #include "./sep_conv_filter.h" | ||||
#include "src/common/utils.h" | #include "src/common/utils.h" | ||||
#include "src/x86/utils.h" | #include "src/x86/utils.h" | ||||
#include "src/x86/profile.h" | |||||
#include "src/x86/handle.h" | #include "src/x86/handle.h" | ||||
#include <cstring> | #include <cstring> | ||||
@@ -14,7 +14,6 @@ | |||||
#include "src/common/cv/helper.h" | #include "src/common/cv/helper.h" | ||||
#include "src/common/utils.h" | #include "src/common/utils.h" | ||||
#include "src/x86/utils.h" | #include "src/x86/utils.h" | ||||
#include "src/x86/profile.h" | |||||
#include "src/x86/handle.h" | #include "src/x86/handle.h" | ||||
#include <cstring> | #include <cstring> | ||||
@@ -1599,73 +1599,6 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8_FILTER_PREPROCESS) { | |||||
#undef cb | #undef cb | ||||
} | } | ||||
// Checks the "X86_CONV_BIAS_MATMUL" algorithm on stride-1 float32 conv-bias
// cases: dense without bias, dense with per-channel bias, dense with full
// bias, and grouped with full bias.
TEST_F(X86, CONV_BIAS_MATMUL) {
    using namespace conv_bias;
    std::vector<TestArg> args;

    // Append the four cases for one (oc, ic, size, kernel, pad, nonline)
    // combination; combinations where the kernel exceeds the padded input
    // are skipped.
    auto add_cases = [&args](size_t oc, size_t ic, size_t w, size_t h,
                             size_t kernel, size_t p,
                             NonlineMode nonline_mode) {
        const bool kernel_fits = !(w + 2 * p < kernel || h + 2 * p < kernel);
        if (!kernel_fits)
            return;

        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;

        //! no bias
        param.sparse = param::ConvBias::Sparse::DENSE;
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
        //! bias channel
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel},
                          TensorShape{1, oc, 1, 1});
        //! bias
        args.emplace_back(
                param, TensorShape{2, ic, h, w},
                TensorShape{oc, ic, kernel, kernel},
                TensorShape{2, oc, (h + param.pad_h * 2 - kernel) + 1,
                            (w + param.pad_w * 2 - kernel) + 1});

        //! group
        param.sparse = param::ConvBias::Sparse::GROUP;
        args.emplace_back(
                param, TensorShape{2, 2 * ic, h, w},
                TensorShape{2, oc, ic, kernel, kernel},
                TensorShape{2, 2 * oc, (h + param.pad_h * 2 - kernel) + 1,
                            (w + param.pad_w * 2 - kernel) + 1});
    };

    for (size_t kernel : {2, 3, 5, 7}) {
        for (size_t ic : {1, 2, 3, 4}) {
            for (size_t oc : {1, 2, 3, 4}) {
                for (size_t p : {0, 2}) {
                    for (size_t size : {20, 21, 22, 23, 24}) {
                        for (NonlineMode nonline_mode :
                             {NonlineMode::RELU, NonlineMode::SIGMOID,
                              NonlineMode::H_SWISH, NonlineMode::IDENTITY}) {
                            add_cases(oc, ic, size, size, kernel, p,
                                      nonline_mode);
                        }
                    }
                }
            }
        }
    }

    Checker<ConvBias> checker(handle());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "X86_CONV_BIAS_MATMUL"));
    checker.set_epsilon(1);

    UniformIntRNG rng{-50, 50};
    checker.set_dtype(0, dtype::Float32());
    checker.set_dtype(1, dtype::Float32());
    checker.set_dtype(2, dtype::Float32());
    checker.set_rng(0, &rng);
    checker.set_rng(1, &rng);
    checker.set_rng(2, &rng);

    for (auto&& arg : args) {
        checker.set_param(arg.param).exec(
                {arg.src, arg.filter, arg.bias, {}, {}});
    }
}
#if MEGDNN_WITH_BENCHMARK | #if MEGDNN_WITH_BENCHMARK | ||||
#if MEGDNN_X86_WITH_MKL_DNN | #if MEGDNN_X86_WITH_MKL_DNN | ||||
static void x86_benchmark_fp32_mkldnn(Handle* handle) { | static void x86_benchmark_fp32_mkldnn(Handle* handle) { | ||||
@@ -182,49 +182,6 @@ TEST_F(X86, DEFAULT_CONV_DIRECT_STRIDE2) { | |||||
} | } | ||||
} | } | ||||
// Checks plain float32 stride-1 convolution when it is served by the
// "CONVOLUTION_DEFAULT_X86_CONV_BIAS_MATMUL" algorithm.
TEST_F(X86, DEFAULT_CONV_MATMUL) {
    using namespace convolution;
    std::vector<TestArg> args;

    // Append one dense, bias-free case; combinations where the kernel
    // exceeds the padded input are skipped.
    auto add_case = [&args](size_t oc, size_t ic, size_t w, size_t h,
                            size_t kernel, size_t p) {
        const bool kernel_fits = !(w + 2 * p < kernel || h + 2 * p < kernel);
        if (!kernel_fits)
            return;

        param::Convolution param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;

        //! no bias
        args.emplace_back(param, TensorShape{1, ic, h, w},
                          TensorShape{oc, ic, kernel, kernel});
    };

    for (size_t kernel : {2, 3, 5, 7}) {
        for (size_t ic : {1, 2, 3, 4}) {
            for (size_t oc : {1, 2, 3, 4}) {
                for (size_t p : {0, 2}) {
                    for (size_t size : {20, 21, 22, 23, 24}) {
                        add_case(oc, ic, size, size, kernel, p);
                    }
                }
            }
        }
    }

    Checker<ConvolutionForward> checker(handle());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            "CONVOLUTION_DEFAULT_X86_CONV_BIAS_MATMUL"));

    UniformIntRNG rng{-50, 50};
    checker.set_dtype(0, dtype::Float32());
    checker.set_dtype(1, dtype::Float32());
    checker.set_dtype(2, dtype::Float32());
    checker.set_rng(0, &rng);
    checker.set_rng(1, &rng);
    checker.set_rng(2, &rng);

    for (auto&& arg : args) {
        checker.set_param(arg.param).exec({arg.src, arg.filter, {}});
    }
}
#if MEGDNN_X86_WITH_MKL_DNN | #if MEGDNN_X86_WITH_MKL_DNN | ||||
TEST_F(X86, CONVOLUTION_FORWARD_INT8) { | TEST_F(X86, CONVOLUTION_FORWARD_INT8) { | ||||
Checker<ConvolutionForward> checker(handle()); | Checker<ConvolutionForward> checker(handle()); | ||||