
strategy.cpp

/**
 * \file dnn/src/armv7/conv_bias/int8/strategy.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#include "src/armv7/conv_bias/int8/strategy.h"
#include "src/arm_common/simd_macro/marm_neon.h"
#include "src/armv7/matrix_mul/asm/common.h"
#include "src/common/utils.h"
#include "src/fallback/conv_bias/common.h"

#include "src/arm_common/conv_bias/matmul_postprocess.h"
#include "src/armv7/matrix_mul/int8/kernel_4x2x16.h"

using namespace megdnn;
using namespace armv7;
using namespace armv7::matmul;

namespace impl {
template <BiasMode bmode, typename Op, int block_m, int block_n>
struct KernCaller;

template <BiasMode bmode, typename Op>
struct KernCaller<bmode, Op, 4, 2> {
    static void run(const dt_int8* packA, const dt_int8* packB, size_t M,
                    size_t N, size_t K, dt_int8* C, size_t LDC,
                    bool is_first_k, Op op, const dt_int32* bias,
                    dt_int32* workspace) {
        megdnn_assert(is_first_k);

        constexpr size_t A_INTERLEAVE = 4;
        constexpr size_t B_INTERLEAVE = 2;
        //! K is packed to times of 16
        K = round_up<size_t>(K, 16);
        const int K4 = K * 4;
        const int K2 = K * 2;

        //! full 4-row blocks of A
        size_t m = 0;
        for (; m + A_INTERLEAVE - 1 < M; m += A_INTERLEAVE) {
            int8_t* output = C + (m * LDC);

            size_t n = 0;
            const dt_int8* cur_packB = packB;
            //! full 4x2 output tiles
            for (; n + B_INTERLEAVE - 1 < N; n += B_INTERLEAVE) {
                matmul_4x2x16::kern_4x2(packA, cur_packB, K, workspace, 2,
                                        is_first_k, 4, 2);
                arm_common::ConvBiasMatmul<bmode, Op, dt_int8, 4, 2, 4,
                                           2>::postprocess(bias, workspace,
                                                           output, LDC, op);
                output += B_INTERLEAVE;
                cur_packB += K2;
            }

            //! remaining columns (N not a multiple of 2)
            for (; n < N; n += B_INTERLEAVE) {
                matmul_4x2x16::kern_4x2(packA, cur_packB, K, workspace, 2,
                                        is_first_k, 4,
                                        std::min<size_t>(N - n, 2));
#define cb(m, n)                                                              \
    arm_common::ConvBiasMatmul<bmode, Op, dt_int8, 4, 2, 4, n>::postprocess( \
            bias, workspace, output, LDC, op);
                DISPATCH_N(cb, 4, std::min<size_t>(N - n, 2));
#undef cb

                output += B_INTERLEAVE;
                cur_packB += K2;
            }

            packA += K4;
            if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) {
                bias += A_INTERLEAVE;
            }
        }

        //! remaining rows (M not a multiple of 4)
        for (; m < M; m += A_INTERLEAVE) {
            int8_t* output = C + (m * LDC);

            size_t n = 0;
            const dt_int8* cur_packB = packB;
            for (; n < N; n += B_INTERLEAVE) {
                matmul_4x2x16::kern_4x2(packA, cur_packB, K, workspace, 2,
                                        is_first_k, std::min<size_t>(M - m, 4),
                                        std::min<size_t>(N - n, 2));
#define cb(m, n)                                                              \
    arm_common::ConvBiasMatmul<bmode, Op, dt_int8, 4, 2, m, n>::postprocess( \
            bias, workspace, output, LDC, op);
                DISPATCH_M(cb, std::min<size_t>(M - m, 4),
                           std::min<size_t>(N - n, 2));
#undef cb

                output += B_INTERLEAVE;
                cur_packB += K2;
            }

            packA += K4;
            if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) {
                bias += A_INTERLEAVE;
            }
        }
    }
};

}  // namespace impl
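
The remainder handling above relies on DISPATCH_N and DISPATCH_M (defined in src/arm_common/conv_bias/matmul_postprocess.h, not shown here) to turn a runtime tail size into a compile-time template argument for ConvBiasMatmul::postprocess. The following is a minimal standalone sketch of that dispatch pattern, assuming a switch-on-remainder implementation; postprocess_tile and postprocess_dispatch are hypothetical stand-ins for illustration, not the actual MegEngine macros.

// Sketch only: maps a runtime tail width onto a compile-time template
// parameter, the idea behind the DISPATCH_N-style macros used above.
#include <cassert>
#include <cstdio>

// Hypothetical stand-in for ConvBiasMatmul<...>::postprocess: the tile width
// is a template parameter, so the compiler can fully unroll each case.
template <int TILE_N>
void postprocess_tile(const int* workspace, int* output) {
    for (int j = 0; j < TILE_N; ++j) {
        output[j] = workspace[j];  // real code would also apply bias/scale/op
    }
}

// Dispatch the runtime remainder (1 or 2 columns for a 4x2 block) to the
// matching template instantiation.
void postprocess_dispatch(int runtime_n, const int* workspace, int* output) {
    switch (runtime_n) {
        case 1:
            postprocess_tile<1>(workspace, output);
            break;
        case 2:
            postprocess_tile<2>(workspace, output);
            break;
        default:
            assert(!"remainder must be 1 or 2 for a 4x2 block");
    }
}

int main() {
    int workspace[2] = {11, 22};
    int output[2] = {0, 0};
    postprocess_dispatch(1, workspace, output);  // tail column: only output[0] is written
    std::printf("%d %d\n", output[0], output[1]);
    return 0;
}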

MEGDNN_REG_GEMM_STRATEGY_IMPL(gemm_s8_4x2_nobias_identity)

void gemm_s8_4x2_nobias_identity::pack_A(dt_int8* outptr, const dt_int8* inptr,
                                         int ldin, int y0, int ymax, int k0,
                                         int kmax, bool /*transpose*/) const {
    MEGDNN_MARK_USED_VAR(matmul_4x2x16::gemm_s8_4x2_pack_A_t);
    matmul_4x2x16::gemm_s8_4x2_pack_A_n(outptr, inptr, ldin, y0, ymax, k0,
                                        kmax);
}

void gemm_s8_4x2_nobias_identity::pack_B(dt_int8* out, const dt_int8* in,
                                         int ldin, int x0, int xmax, int k0,
                                         int kmax, bool /*transpose*/) const {
    MEGDNN_MARK_USED_VAR(matmul_4x2x16::gemm_s8_4x2_pack_B_t);
    matmul_4x2x16::gemm_s8_4x2_pack_B_n(out, in, ldin, x0, xmax, k0, kmax);
}

size_t gemm_s8_4x2_nobias_identity::get_workspace_size() const {
    return 4 * 2 * sizeof(dt_int32);
}

#define KERN(_bias, _BIAS, _nonline, _OP)                                   \
    void gemm_s8_4x2_##_bias##_##_nonline::kern(                            \
            const dt_int8* packA, const dt_int8* packB, size_t M, size_t N, \
            size_t K, dt_int8* C, size_t LDC, bool is_first_k,              \
            const dt_int32* bias, dt_int32* workspace) const {              \
        float scale_A = A_dtype.param<dtype::QuantizedS8>().scale;          \
        float scale_B = B_dtype.param<dtype::QuantizedS8>().scale;          \
        float scale_C = C_dtype.param<dtype::QuantizedS8>().scale;          \
        DEFINE_OP(_OP);                                                     \
        impl::KernCaller<_BIAS, decltype(op), 4, 2>::run(                   \
                packA, packB, M, N, K, C, LDC, is_first_k, op, bias,        \
                workspace);                                                 \
    }

#define DEFINE_OP(_Op) \
    arm_common::_Op<dt_qint32, dt_qint8> op(scale_A* scale_B, scale_C);
KERN(nobias, BiasMode::NO_BIAS, identity, TypeCvtOp)
KERN(nobias, BiasMode::NO_BIAS, relu, ReluOp)
KERN(nobias, BiasMode::NO_BIAS, hswish, HSwishOp)
#undef DEFINE_OP

#define DEFINE_OP(_Op)                                               \
    arm_common::_Op<dt_qint32, dt_qint8, true> op(scale_A* scale_B, \
                                                  scale_A* scale_B, scale_C);
KERN(bias_channel, BiasMode::BROADCAST_CHANNEL_BIAS, identity, AddOp)
KERN(bias_channel, BiasMode::BROADCAST_CHANNEL_BIAS, relu, FuseAddReluOp)
KERN(bias_channel, BiasMode::BROADCAST_CHANNEL_BIAS, hswish, FuseAddHSwishOp)
#undef DEFINE_OP
#undef KERN

// vim: syntax=cpp.doxygen
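
As a readability aid, this is approximately what one KERN instantiation above, KERN(nobias, BiasMode::NO_BIAS, relu, ReluOp), expands to after preprocessing, using the first DEFINE_OP form (A_dtype, B_dtype and C_dtype are members of the strategy type declared in strategy.h):

void gemm_s8_4x2_nobias_relu::kern(
        const dt_int8* packA, const dt_int8* packB, size_t M, size_t N,
        size_t K, dt_int8* C, size_t LDC, bool is_first_k,
        const dt_int32* bias, dt_int32* workspace) const {
    float scale_A = A_dtype.param<dtype::QuantizedS8>().scale;
    float scale_B = B_dtype.param<dtype::QuantizedS8>().scale;
    float scale_C = C_dtype.param<dtype::QuantizedS8>().scale;
    arm_common::ReluOp<dt_qint32, dt_qint8> op(scale_A * scale_B, scale_C);
    impl::KernCaller<BiasMode::NO_BIAS, decltype(op), 4, 2>::run(
            packA, packB, M, N, K, C, LDC, is_first_k, op, bias, workspace);
}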
