Browse Source

Merge pull request #207 from togetherwhenyouwant:feat-x86-matmul-6x16x2

GitOrigin-RevId: 148ae44ba0
release-1.7
liuke Megvii Engine Team 3 years ago
parent
commit
b0ba6d3201
13 changed files with 1599 additions and 6 deletions
  1. +1
    -0
      CMakeLists.txt
  2. +6
    -6
      dnn/src/fallback/conv_bias/im2col/algos.cpp
  3. +1
    -0
      dnn/src/fallback/matrix_mul/opr_impl.h
  4. +9
    -0
      dnn/src/x86/avx_helper.h
  5. +66
    -0
      dnn/src/x86/matrix_mul/algos.cpp
  6. +11
    -0
      dnn/src/x86/matrix_mul/algos.h
  7. +3
    -0
      dnn/src/x86/matrix_mul/f32/strategy.h
  8. +1278
    -0
      dnn/src/x86/matrix_mul/f32/strategy_6x16.cpp
  9. +2
    -0
      dnn/src/x86/matrix_mul/opr_impl.cpp
  10. +1
    -0
      dnn/src/x86/matrix_mul/opr_impl.h
  11. +9
    -0
      dnn/test/x86/accuracy_shake.cpp
  12. +198
    -0
      dnn/test/x86/conv_bias.cpp
  13. +14
    -0
      dnn/test/x86/matrix_mul.cpp

+ 1
- 0
CMakeLists.txt View File

@@ -10,6 +10,7 @@ project(MegEngine LANGUAGES C CXX VERSION ${MGB_VER_STRING})
set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
set(CMAKE_POLICY_DEFAULT_CMP0048 NEW) set(CMAKE_POLICY_DEFAULT_CMP0048 NEW)


+ 6
- 6
dnn/src/fallback/conv_bias/im2col/algos.cpp View File

@@ -8,11 +8,11 @@
* software distributed under the License is distributed on an * software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/ */

#include "src/fallback/conv_bias/im2col/algos.h"
#include "megdnn/opr_param_defs.h" #include "megdnn/opr_param_defs.h"

#include "src/common/opr_delegate.h" #include "src/common/opr_delegate.h"
#include "src/fallback/conv_bias/common.h" #include "src/fallback/conv_bias/common.h"
#include "src/fallback/conv_bias/im2col/algos.h"
#include "src/fallback/conv_bias/im2col/factory.h" #include "src/fallback/conv_bias/im2col/factory.h"
#include "src/fallback/conv_bias/im2col/im2col_kerns.h" #include "src/fallback/conv_bias/im2col/im2col_kerns.h"
#include "src/fallback/conv_bias/opr_impl.h" #include "src/fallback/conv_bias/opr_impl.h"
@@ -68,16 +68,16 @@ static void choice_ohw_oc_block(
fallback::MatrixMulImpl::AlgoBase::PackMode pack_mode) { fallback::MatrixMulImpl::AlgoBase::PackMode pack_mode) {
//! calculate m_oc_tile_size in choice_ohw_oc_block() function, //! calculate m_oc_tile_size in choice_ohw_oc_block() function,
//! when ohw_tile_size < this value ohw_tile_size = ohw //! when ohw_tile_size < this value ohw_tile_size = ohw
static constexpr size_t DEFAULT_OHW_MIN_TILE_SIZE = 32;
size_t DEFAULT_OHW_MIN_TILE_SIZE = round_up(static_cast<size_t>(32), block_n);
//! when nr_threads > 1 and round(ohw,nr_threads)>nr_threads, //! when nr_threads > 1 and round(ohw,nr_threads)>nr_threads,
//! oc_tile_size = DEFAULT_OC_TILE_SIZE //! oc_tile_size = DEFAULT_OC_TILE_SIZE
static constexpr size_t DEFAULT_OC_TILE_SIZE = 512;
size_t DEFAULT_OC_TILE_SIZE = round_up(static_cast<size_t>(512), block_m);
//! when oc_tile_size > this value m_oc_tile_size = //! when oc_tile_size > this value m_oc_tile_size =
//! DEFAULT_OC_MAX_TILE_SIZE //! DEFAULT_OC_MAX_TILE_SIZE
static constexpr size_t DEFAULT_OC_MAX_TILE_SIZE = 1024;
size_t DEFAULT_OC_MAX_TILE_SIZE = round_up(static_cast<size_t>(1024), block_m);
//! when oc_tile_size < this value oc_tile_size = //! when oc_tile_size < this value oc_tile_size =
//! DEFAULT_OC_MIN_TILE_SIZE the purpose is aligning the calculation //! DEFAULT_OC_MIN_TILE_SIZE the purpose is aligning the calculation
static constexpr size_t DEFAULT_OC_MIN_TILE_SIZE = 128;
size_t DEFAULT_OC_MIN_TILE_SIZE = round_up(static_cast<size_t>(128), block_m);
size_t nr_threads = param.nr_threads; size_t nr_threads = param.nr_threads;
size_t OC = param.filter_meta.ocpg; size_t OC = param.filter_meta.ocpg;
size_t ohw = param.osz[0] * param.osz[1]; size_t ohw = param.osz[0] * param.osz[1];


+ 1
- 0
dnn/src/fallback/matrix_mul/opr_impl.h View File

@@ -123,6 +123,7 @@ public:
X86_INT8X8X16_SSE, X86_INT8X8X16_SSE,
X86_INT8X8X32_SSE_4X8X2, X86_INT8X8X32_SSE_4X8X2,
X86_F32_MK8_8X8, X86_F32_MK8_8X8,
X86_F32_6x16,
X86_INT8X8X32_VNNI, X86_INT8X8X32_VNNI,
X86_INT8X8X32_MKLDNN, X86_INT8X8X32_MKLDNN,
#elif MEGDNN_AARCH64 || MEGDNN_ARMV7 #elif MEGDNN_AARCH64 || MEGDNN_ARMV7


+ 9
- 0
dnn/src/x86/avx_helper.h View File

@@ -33,6 +33,15 @@ static inline __m256 _mm256_loadu2_m128_emulate(
_mm256_castps128_ps256(_mm_loadu_ps(loaddr)), _mm_loadu_ps(hiaddr), 1); _mm256_castps128_ps256(_mm_loadu_ps(loaddr)), _mm_loadu_ps(hiaddr), 1);
} }


//! Split a 256-bit float register into its two 128-bit halves and store them
//! to two independent, possibly unaligned addresses: the low half (lane 0) is
//! written to \p loaddr, the high half (lane 1) to \p hiaddr.
MEGDNN_ATTRIBUTE_TARGET("avx")
static inline void _mm256_storeu2_m128_emulate(
        float* hiaddr, float* loaddr, __m256 reg) {
    //! the low half is stored before the high half
    _mm_storeu_ps(loaddr, _mm256_extractf128_ps(reg, 0));
    _mm_storeu_ps(hiaddr, _mm256_extractf128_ps(reg, 1));
}

template <typename ctype, size_t len> template <typename ctype, size_t len>
struct Vector; struct Vector;




+ 66
- 0
dnn/src/x86/matrix_mul/algos.cpp View File

@@ -309,6 +309,33 @@ void gemm_s8s8s32_sse_4x8x2(const MatrixMulImpl::KernParam& kern_param) {
MIDOUT_END(); MIDOUT_END();
} }


//! Kernel entry for the fp32 6x16 AVX2 matmul: builds a sgemm_pack_6x16_avx2
//! strategy from the sizes and dtypes in \p kern_param and executes it through
//! GemmInterleaved, using the workspace pointer carried by \p kern_param.
void gemm_f32_avx2_6x16(const MatrixMulImpl::KernParam& kern_param) {
    MEGDNN_MARK_USED_VAR(kern_param);
    MIDOUT_BEGIN(megdnn_x86_matmul_kern_avx2_6x16x2, midout_iv(0)) {
        using Strategy = x86::matmul::sgemm_pack_6x16_avx2;
        //! alignment value handed to GemmInterleaved
        constexpr int cacheline = 64;

        //! problem sizes and transpose flags
        const size_t rows = kern_param.M;
        const size_t cols = kern_param.N;
        const size_t depth = kern_param.K;
        const bool a_transposed = kern_param.trA;
        const bool b_transposed = kern_param.trB;

        //! leading dimensions of A, B and C
        const size_t stride_a = kern_param.LDA;
        const size_t stride_b = kern_param.LDB;
        const size_t stride_c = kern_param.LDC;

        auto dtype_a = kern_param.A_type;
        auto dtype_b = kern_param.B_type;
        auto dtype_c = kern_param.C_type;
        Strategy strategy(rows, cols, depth, dtype_a, dtype_b, dtype_c);

        const auto lhs = kern_param.A<float>();
        const auto rhs = kern_param.B<float>();
        auto out = kern_param.C<float>();

        megdnn::matmul::GemmInterleaved<Strategy> gemm(
                rows, cols, depth, a_transposed, b_transposed, strategy, cacheline);
        gemm.execute(
                lhs, stride_a, rhs, stride_b, out, stride_c, kern_param.workspace_ptr);
    }
    MIDOUT_END();
}

} // namespace } // namespace


/*************************AlgoInt8x8x16AVX2********************/ /*************************AlgoInt8x8x16AVX2********************/
@@ -625,4 +652,43 @@ size_t MatrixMulImpl::AlgoF32MK8_8x8::get_workspace(
MIDOUT_END(); MIDOUT_END();
} }


/*************************AlgoFloatAVX2M6N16********************/
//! Return the kernel entry point of this algorithm. The size parameter is
//! unused: gemm_f32_avx2_6x16 is returned for every problem.
MatrixMulImpl::kern_t MatrixMulImpl::AlgoFloatAVX2M6N16::get_kern(
const KernSizeParam&) const {
return gemm_f32_avx2_6x16;
}
bool MatrixMulImpl::AlgoFloatAVX2M6N16::usable(
const KernSizeParam& kern_size_param) const {
bool is_param_ok =
kern_size_param.A_type.enumv() == kern_size_param.B_type.enumv() &&
((kern_size_param.A_type.enumv() == DTypeEnum::Float32 &&
kern_size_param.C_type.enumv() == DTypeEnum::Float32)) &&
kern_size_param.compute_mode == Param::ComputeMode::DEFAULT &&
kern_size_param.format == Param::Format::DEFAULT &&
is_supported(SIMDType::AVX2);
return is_param_ok;
}
size_t MatrixMulImpl::AlgoFloatAVX2M6N16::get_workspace(
const KernSizeParam& kern_param) const {
constexpr int cacheline = 64;
const size_t m = kern_param.M;
const size_t n = kern_param.N;
const size_t k = kern_param.K;
const bool trans_a = kern_param.trA;
const bool trans_b = kern_param.trB;
auto a_type = kern_param.A_type;
auto b_type = kern_param.B_type;
auto c_type = kern_param.C_type;
x86::matmul::sgemm_pack_6x16_avx2 strategy(m, n, k, a_type, b_type, c_type);

return megdnn::matmul::GemmInterleaved<x86::matmul::sgemm_pack_6x16_avx2>(
m, n, k, trans_a, trans_b, strategy, cacheline)
.get_workspace_size();
}

//! Generate the im2col-related member definitions of AlgoFloatAVX2M6N16 for
//! the sgemm_pack_6x16_avx2 strategy with float operands.
//! NOTE(review): the exact members emitted, and the meaning of the midout tag,
//! hash and trailing DEFAULT argument, depend on the macro definition, which
//! is not visible here -- confirm there.
MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL(
AlgoFloatAVX2M6N16, megdnn_x86_matmul_kern, "AlgoFloatAVX2M6N16"_hash,
x86::matmul::sgemm_pack_6x16_avx2, float, float, float, AlgoDataType::FLOAT32,
DEFAULT);

// vim: syntax=cpp.doxygen // vim: syntax=cpp.doxygen

+ 11
- 0
dnn/src/x86/matrix_mul/algos.h View File

@@ -134,6 +134,17 @@ public:
MEGDNN_DECL_ALGO_TYPE(X86_F32_MK8_8X8) MEGDNN_DECL_ALGO_TYPE(X86_F32_MK8_8X8)
}; };


//! fp32 matrix-mul algorithm exposed under the name "X86_F32_6x16".
class MatrixMulImpl::AlgoFloatAVX2M6N16 : public AlgoBase {
public:
//! this algorithm reports the REPRODUCIBLE attribute
AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
//! name by which the algorithm is selected (the tests refer to it by this string)
const char* name() const override { return "X86_F32_6x16"; }
//! whether the algorithm can handle the given problem description
bool usable(const KernSizeParam&) const override;
//! workspace size required for the given problem description
size_t get_workspace(const KernSizeParam&) const override;
//! kernel entry point used to execute the given problem
kern_t get_kern(const KernSizeParam&) const override;
//! declares the members needed for use from im2col
//! (NOTE(review): exact members come from the macro -- confirm there)
MEGDNN_REG_GEMM_FUNC_FOR_IM2COL();
//! ties this class to the X86_F32_6x16 algorithm type enumerator
MEGDNN_DECL_ALGO_TYPE(X86_F32_6x16)
};

#if MEGDNN_X86_WITH_VNNI #if MEGDNN_X86_WITH_VNNI
class MatrixMulImpl::AlgoInt8x8x32Vnni : public AlgoBase { class MatrixMulImpl::AlgoInt8x8x32Vnni : public AlgoBase {
public: public:


+ 3
- 0
dnn/src/x86/matrix_mul/f32/strategy.h View File

@@ -19,6 +19,9 @@ namespace matmul {
MEGDNN_REG_GEMM_STRATEGY_NOPACK( MEGDNN_REG_GEMM_STRATEGY_NOPACK(
float, float, float, 8, 8, 8, false, true, sgemm_nopack_8x8_avx2); float, float, float, 8, 8, 8, false, true, sgemm_nopack_8x8_avx2);


//! Register the packed fp32 GEMM strategy sgemm_pack_6x16_avx2.
//! NOTE(review): the numeric arguments 6, 16, 1 are presumably the M/N/K block
//! sizes, and the two `false` flags presumably the pack-A/pack-B transpose
//! settings -- confirm against the macro definition.
MEGDNN_REG_GEMM_STRATEGY_WITH_PACK_A_TYPE(
float, float, float, float, 6, 16, 1, false, false, sgemm_pack_6x16_avx2);

} // namespace matmul } // namespace matmul
} // namespace x86 } // namespace x86
} // namespace megdnn } // namespace megdnn

+ 1278
- 0
dnn/src/x86/matrix_mul/f32/strategy_6x16.cpp
File diff suppressed because it is too large
View File


+ 2
- 0
dnn/src/x86/matrix_mul/opr_impl.cpp View File

@@ -34,6 +34,7 @@ class MatrixMulImpl::AlgoPack : NonCopyableObj {
AlgoInt8x8x16AVX2 algoint8x8x16avx2_m4n16k2; AlgoInt8x8x16AVX2 algoint8x8x16avx2_m4n16k2;
AlgoInt8x8x16SSE algoint8x8x16sse_m4n8k2; AlgoInt8x8x16SSE algoint8x8x16sse_m4n8k2;
AlgoF32MK8_8x8 algof32mk8_8x8; AlgoF32MK8_8x8 algof32mk8_8x8;
AlgoFloatAVX2M6N16 algof32_6x16;


SmallVector<fallback::MatrixMulImpl::AlgoBase*> m_all_algos; SmallVector<fallback::MatrixMulImpl::AlgoBase*> m_all_algos;
fallback::MatrixMulImpl::AlgoBase::Mapper m_all_algos_map; fallback::MatrixMulImpl::AlgoBase::Mapper m_all_algos_map;
@@ -51,6 +52,7 @@ public:
m_all_algos.emplace_back(&algoint8x8x32sse_m4n8k2); m_all_algos.emplace_back(&algoint8x8x32sse_m4n8k2);
m_all_algos.emplace_back(&algoint8x8x16sse_m4n8k2); m_all_algos.emplace_back(&algoint8x8x16sse_m4n8k2);
m_all_algos.emplace_back(&algof32mk8_8x8); m_all_algos.emplace_back(&algof32mk8_8x8);
m_all_algos.emplace_back(&algof32_6x16);
#if MEGDNN_X86_WITH_MKL_DNN #if MEGDNN_X86_WITH_MKL_DNN
m_all_algos.emplace_back(&algoint8x8x32mkldnn); m_all_algos.emplace_back(&algoint8x8x32mkldnn);
#endif #endif


+ 1
- 0
dnn/src/x86/matrix_mul/opr_impl.h View File

@@ -67,6 +67,7 @@ private:
class AlgoInt8x8x16SSE; class AlgoInt8x8x16SSE;
class AlgoPack; class AlgoPack;
class AlgoF32MK8_8x8; class AlgoF32MK8_8x8;
class AlgoFloatAVX2M6N16;


public: public:
static const AlgoPack& algo_pack(); static const AlgoPack& algo_pack();


+ 9
- 0
dnn/test/x86/accuracy_shake.cpp View File

@@ -84,6 +84,15 @@ TEST_F(X86, SHAKE_MATRIX_MUL_FORWARD) {
.exec({{20, 100}, {100, 60}, {}}); .exec({{20, 100}, {100, 60}, {}});
} }


//! Accuracy-shake check of a {20, 100} x {100, 60} fp32 matrix multiplication,
//! with AlgoGenerator<MatrixMul>("X86_F32_6x16") installed as the before-exec
//! callback. The output shape is left empty.
TEST_F(X86, SHAKE_MATRIX_MUL_6x16_FORWARD) {
    AccuracyShakeChecker<MatrixMul> shake_checker(handle());
    shake_checker.set_before_exec_callback(AlgoGenerator<MatrixMul>("X86_F32_6x16"));
    //! all three tensors (A, B, C) are Float32
    shake_checker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_dtype(2, dtype::Float32())
            .exec({{20, 100}, {100, 60}, {}});
}

} // namespace test } // namespace test
} // namespace megdnn } // namespace megdnn




+ 198
- 0
dnn/test/x86/conv_bias.cpp View File

@@ -1150,6 +1150,56 @@ TEST_F(X86, CONV_BIAS_IM2COLMATMUL_FP32_NOPACK_PREPROCESS) {


#endif #endif


//! Checks conv_bias through the "IM2COLMATMUL:X86_F32_6x16:192" algorithm over
//! a grid of kernel sizes, channel counts, paddings, input sizes and
//! nonlinearities (stride fixed to 1), plus one case with 2046 output channels.
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_6x16) {
    using namespace conv_bias;
    std::vector<TestArg> args;
    //! append the three bias variants of one convolution shape to `args`
    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
                   NonlineMode nonline_mode) {
        //! skip shapes whose padded input is smaller than the kernel
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;

        //! no bias
        args.emplace_back(
                param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
                TensorShape{});
        //! one bias value per output channel
        args.emplace_back(
                param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
                TensorShape{1, oc, 1, 1});
        //! one bias value per output element: {1, oc, OH, OW}
        args.emplace_back(
                param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
                TensorShape{
                        1, oc, (h + 2 * p - kernel) / param.stride_h + 1,
                        (w + 2 * p - kernel) / param.stride_w + 1});
    };

    for (size_t kernel : {2, 3, 4, 5, 6, 7})
        for (size_t ic : {1, 4, 8, 16})
            for (size_t oc : {1, 4, 8, 16, 300})
                for (size_t p : {0, 2})
                    for (size_t size : {8, 24})
                        for (NonlineMode nonline_mode :
                             {NonlineMode::IDENTITY, NonlineMode::RELU}) {
                            run(oc, ic, size, size, kernel, p, nonline_mode);
                        }

    //! one additional case with a large number of output channels
    run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
    Checker<ConvBias> checker(handle());

#define cb(algo_name)                                                                \
    checker.set_before_exec_callback(                                                \
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));                    \
    for (auto&& arg : args) {                                                        \
        checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}}); \
    }
    cb("IM2COLMATMUL:X86_F32_6x16:192");
    //! keep the helper macro local to this test so it cannot clash with a
    //! `cb` defined later in the translation unit
#undef cb
}

#if MEGDNN_X86_WITH_MKL && SUPPORT_MKL_PACKED_GEMM #if MEGDNN_X86_WITH_MKL && SUPPORT_MKL_PACKED_GEMM
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA) { TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA) {
using namespace conv_bias; using namespace conv_bias;
@@ -1435,6 +1485,12 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_INT8X8X32_PREPROCESS) {


#endif #endif


//! Runs check_conv_bias on the argument set produced by
//! get_conv_bias_1x1_args(false, false), using the "CONV1x1:X86_F32_6x16:48"
//! algorithm.
TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_FP32_6x16) {
    using namespace conv_bias;
    std::vector<conv_bias::TestArg> conv1x1_args =
            get_conv_bias_1x1_args(false, false);
    check_conv_bias(conv1x1_args, handle(), "CONV1x1:X86_F32_6x16:48");
}

TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8) { TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8) {
using namespace conv_bias; using namespace conv_bias;
std::vector<TestArg> args; std::vector<TestArg> args;
@@ -2651,6 +2707,148 @@ TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_F32_single_thread)
shapes_and_computation.clear(); shapes_and_computation.clear();
} }


//! Benchmarks the "IM2COLMATMUL:X86_F32_6x16:192" conv_bias algorithm on a
//! fixed list of 3x3, stride-1, pad-1 fp32 convolutions with RELU, passing
//! three different configuration pairs to benchmark_impl.
TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_F32_6x16) {
    constexpr size_t RUNS = 50;

    param::ConvBias conv_param;
    conv_param.stride_h = 1;
    conv_param.stride_w = 1;
    conv_param.pad_h = 1;
    conv_param.pad_w = 1;
    conv_param.nonlineMode = param::ConvBias::NonlineMode::RELU;

    std::vector<DType> dtypes = {
            dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;

    //! record one convolution: its five tensor shapes plus the operation count
    //! (multiply-adds counted as 2, plus one op per output element) scaled by 1e-6
    auto bench_case = [&](size_t batch, size_t in_ch, size_t out_ch, size_t height,
                          size_t width, size_t filter, size_t groups) {
        SmallVector<TensorShape> shapes{
                {batch, in_ch, height, width},
                {out_ch / groups, in_ch / groups, filter, filter},
                {1, out_ch, 1, 1},
                {},
                {batch, out_ch, height, width}};
        TensorShape dst{batch, out_ch, height, width};
        const auto out_elems = dst.total_nr_elems();
        float computations =
                ((in_ch / groups) * filter * filter * out_elems * 2 + out_elems) *
                1e-6;
        shapes_and_computation.emplace_back(shapes, computations);
    };

    //! {IC, OC, H == W}; batch 1, 3x3 filter, group 1 throughout.
    //! Each of the 32-channel shapes appears twice and is therefore recorded twice.
    const size_t cases[][3] = {
            {32, 32, 200},  {32, 32, 200},  {32, 32, 128},  {32, 32, 128},
            {32, 32, 100},  {32, 32, 100},  {32, 32, 80},   {32, 32, 80},

            {64, 32, 7},    {64, 64, 7},    {64, 128, 7},   {64, 256, 7},
            {64, 512, 7},   {64, 1024, 7},

            {64, 32, 14},   {64, 64, 14},   {64, 128, 14},  {64, 256, 14},
            {64, 512, 14},

            {64, 1024, 14}, {128, 128, 14}, {128, 256, 14}, {512, 512, 14},
            {256, 512, 14}, {512, 1024, 14}, {1024, 1024, 14}};
    for (const auto& c : cases) {
        bench_case(1, c[0], c[1], c[2], c[2], 3, 1);
    }

    std::string im2col_algo = "IM2COLMATMUL:X86_F32_6x16:192";
    printf("Benchmark IM2COLMATMUL:X86_F32_6x16 algo\n");
    benchmark_impl(
            conv_param, shapes_and_computation, im2col_algo, RUNS, {4, {4, 5, 6, 7}},
            {1, {4}}, dtypes);
    benchmark_impl(
            conv_param, shapes_and_computation, im2col_algo, RUNS, {4, {4, 5, 6, 7}},
            {1, {7}}, dtypes);
    benchmark_impl(
            conv_param, shapes_and_computation, im2col_algo, RUNS, {2, {4, 5}},
            {1, {4}}, dtypes);
    shapes_and_computation.clear();
}

//! Runs benchmark_impl_comp with "IM2COLMATMUL:X86_F32_MKL_PACKA:192" and
//! "IM2COLMATMUL:X86_F32_6x16:192" on a fixed list of 3x3, stride-1, pad-1 fp32
//! convolutions with RELU, once with {1, {4}} and once with {1, {7}}.
//! NOTE(review): the MKL_PACKA conv_bias test in this file is guarded by
//! `MEGDNN_X86_WITH_MKL && SUPPORT_MKL_PACKED_GEMM`; confirm this benchmark is
//! only run on builds where that algorithm exists.
TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_F32_6X16_single_thread) {
    constexpr size_t RUNS = 50;

    param::ConvBias conv_param;
    conv_param.stride_h = 1;
    conv_param.stride_w = 1;
    conv_param.pad_h = 1;
    conv_param.pad_w = 1;
    conv_param.nonlineMode = param::ConvBias::NonlineMode::RELU;

    std::vector<DType> dtypes = {
            dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;

    //! record one convolution: its five tensor shapes plus the operation count
    //! (multiply-adds counted as 2, plus one op per output element) scaled by 1e-6
    auto bench_case = [&](size_t batch, size_t in_ch, size_t out_ch, size_t height,
                          size_t width, size_t filter, size_t groups) {
        SmallVector<TensorShape> shapes{
                {batch, in_ch, height, width},
                {out_ch / groups, in_ch / groups, filter, filter},
                {1, out_ch, 1, 1},
                {},
                {batch, out_ch, height, width}};
        TensorShape dst{batch, out_ch, height, width};
        const auto out_elems = dst.total_nr_elems();
        float computations =
                ((in_ch / groups) * filter * filter * out_elems * 2 + out_elems) *
                1e-6;
        shapes_and_computation.emplace_back(shapes, computations);
    };

    //! {IC, OC, H == W}; batch 1, 3x3 filter, group 1 throughout.
    //! Each of the 32-channel shapes appears twice and is therefore recorded twice.
    const size_t cases[][3] = {
            {32, 32, 200},  {32, 32, 200},  {32, 32, 128},  {32, 32, 128},
            {32, 32, 100},  {32, 32, 100},  {32, 32, 80},   {32, 32, 80},

            {64, 32, 7},    {64, 64, 7},    {64, 128, 7},   {64, 256, 7},
            {64, 512, 7},   {64, 1024, 7},

            {64, 32, 14},   {64, 64, 14},   {64, 128, 14},  {64, 256, 14},
            {64, 512, 14},

            {64, 1024, 14}, {128, 128, 14}, {128, 256, 14}, {512, 512, 14},
            {256, 512, 14}, {512, 1024, 14}, {1024, 1024, 14}};
    for (const auto& c : cases) {
        bench_case(1, c[0], c[1], c[2], c[2], 3, 1);
    }

    std::string reference_algo = "IM2COLMATMUL:X86_F32_MKL_PACKA:192";
    std::string im2col_algo = "IM2COLMATMUL:X86_F32_6x16:192";
    printf("Benchmark IM2COLMATMUL:X86_F32_6x16 algo\n");
    benchmark_impl_comp(
            conv_param, shapes_and_computation, reference_algo, im2col_algo, RUNS,
            {1, {4}}, {1, {4}}, dtypes);
    benchmark_impl_comp(
            conv_param, shapes_and_computation, reference_algo, im2col_algo, RUNS,
            {1, {7}}, {1, {7}}, dtypes);
    shapes_and_computation.clear();
}

TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_INT8X8X32) { TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_INT8X8X32) {
constexpr size_t RUNS = 50; constexpr size_t RUNS = 50;




+ 14
- 0
dnn/test/x86/matrix_mul.cpp View File

@@ -83,6 +83,12 @@ TEST_F(X86, MATRIX_MUL_AVX2_MK8_8X8) {
"X86_F32MK8_8X8", param::MatrixMul::Format::MK8, 1, 1e-3, false); "X86_F32MK8_8X8", param::MatrixMul::Format::MK8, 1, 1e-3, false);
} }


//! Runs the shared matrix_mul::check_matrix_mul helper with Float32 for all
//! three dtypes, the "X86_F32_6x16" algorithm and the DEFAULT format.
//! NOTE(review): the trailing arguments (1, 1e-3, false) are presumably a
//! block/pack size, an error tolerance and a flag -- confirm against the
//! helper's signature.
TEST_F(X86, MATRIX_MUL_AVX2_6x16) {
matrix_mul::check_matrix_mul(
dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
"X86_F32_6x16", param::MatrixMul::Format::DEFAULT, 1, 1e-3, false);
}

#if MEGDNN_WITH_BENCHMARK #if MEGDNN_WITH_BENCHMARK


TEST_F(X86, BENCHMARK_MATRIX_MUL_AVX2_MK8_8X8) { TEST_F(X86, BENCHMARK_MATRIX_MUL_AVX2_MK8_8X8) {
@@ -93,6 +99,14 @@ TEST_F(X86, BENCHMARK_MATRIX_MUL_AVX2_MK8_8X8) {
dtype::Float32{}, dtype::Float32{}, "X86_F32_BLAS"); dtype::Float32{}, dtype::Float32{}, "X86_F32_BLAS");
} }


//! Calls matrix_mul::benchmark_with_contrast with the "X86_F32_6x16" algorithm
//! (DEFAULT format, Float32 dtypes) and "X86_F32_BLAS" as the second algorithm.
//! NOTE(review): the shapes come from get_benchmark_matmul_mk_packed_args(8),
//! the same generator name used for MK-packed layouts; confirm that these
//! shapes are the intended workload for a DEFAULT-format algorithm.
TEST_F(X86, BENCHMARK_MATRIX_MUL_AVX2_6x16) {
auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
matrix_mul::benchmark_with_contrast(
handle(), args, dtype::Float32{}, dtype::Float32{}, dtype::Float32{},
"X86_F32_6x16", param::MatrixMul::Format::DEFAULT, dtype::Float32{},
dtype::Float32{}, dtype::Float32{}, "X86_F32_BLAS");
}

TEST_F(X86, BENCHMARK_MATRIX_MUL_8X8X32) { TEST_F(X86, BENCHMARK_MATRIX_MUL_8X8X32) {
constexpr size_t RUNS = 50; constexpr size_t RUNS = 50;
auto rng = std::make_unique<UniformIntRNG>(-127, 127); auto rng = std::make_unique<UniformIntRNG>(-127, 127);


Loading…
Cancel
Save