You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

batched_matrix_mul.cpp 7.9 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257
  1. /**
  2. * \file dnn/test/aarch64/batched_matrix_mul.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "test/common/benchmarker.h"
  12. #include "test/common/checker.h"
  13. #include "test/common/rng.h"
  14. #include "test/common/matrix_mul.h"
  15. #include "test/aarch64/fixture.h"
  16. namespace megdnn {
  17. namespace test {
  18. TEST_F(AARCH64, BATCHED_MATRIX_MUL) {
  19. Checker<BatchedMatrixMul> checker(handle());
  20. checker.set_epsilon(1e-2);
  21. using Param = MatrixMul::Param;
  22. // auto args = get_batch_matmul_args();
  23. auto args = matrix_mul::get_batched_matmul_args();
  24. for (DType dtype : std::vector<DType>{dtype::Float32()}) {
  25. for (unsigned mask = 0; mask < 4; ++mask) {
  26. for (auto& arg : args) {
  27. size_t b = arg.b, m = arg.m, n = arg.n, k = arg.k;
  28. //! if test all batch sizes, the test case will time out.
  29. if (b != 2) {
  30. continue;
  31. }
  32. Param param;
  33. param.transposeA = mask & 1;
  34. param.transposeB = mask & 2;
  35. TensorShape A, B;
  36. if (param.transposeA)
  37. A = TensorShape{b, k, m};
  38. else
  39. A = TensorShape{b, m, k};
  40. if (param.transposeB)
  41. B = TensorShape{b, n, k};
  42. else
  43. B = TensorShape{b, k, n};
  44. checker.set_param(param)
  45. .set_dtype(0, dtype)
  46. .set_dtype(1, dtype)
  47. .execs({A, B, {}});
  48. }
  49. }
  50. }
  51. }
  52. #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  53. TEST_F(AARCH64, BATCHED_MATRIX_MUL_FP16) {
  54. Checker<BatchedMatrixMul> checker(handle());
  55. using Param = MatrixMul::Param;
  56. auto args = matrix_mul::get_batched_matmul_args();
  57. NormalRNG rng(1.f);
  58. checker.set_rng(0, &rng).set_rng(1, &rng).set_epsilon(1e-2);
  59. for (DType dtype : std::vector<DType>{dtype::Float16()}) {
  60. for (unsigned mask = 0; mask < 4; ++mask) {
  61. for (auto& arg : args) {
  62. size_t b = arg.b, m = arg.m, n = arg.n, k = arg.k;
  63. //! if test all batch sizes, the test case will time out on
  64. //! sdm855
  65. if (b != 1) {
  66. continue;
  67. }
  68. Param param;
  69. param.transposeA = mask & 1;
  70. param.transposeB = mask & 2;
  71. TensorShape A, B;
  72. if (param.transposeA)
  73. A = TensorShape{b, k, m};
  74. else
  75. A = TensorShape{b, m, k};
  76. if (param.transposeB)
  77. B = TensorShape{b, n, k};
  78. else
  79. B = TensorShape{b, k, n};
  80. checker.set_param(param)
  81. .set_dtype(0, dtype)
  82. .set_dtype(1, dtype)
  83. .set_dtype(2, dtype)
  84. .execs({A, B, {}});
  85. }
  86. }
  87. }
  88. }
  89. #if MEGDNN_WITH_BENCHMARK
  90. TEST_F(AARCH64, BENCHMARK_TRANSPOSED_MATRIX_MUL_QUICK_FP16) {
  91. int exec_times = 10;
  92. Benchmarker<MatrixMul> benchmarker_gemm(handle());
  93. benchmarker_gemm.set_times(exec_times);
  94. float mod = 1000 * exec_times / 1e9;
  95. using Param = MatrixMul::Param;
  96. auto run = [&](size_t M, size_t K, size_t N) {
  97. float time = 1.f, perf = 1.f;
  98. std::cout << "GEMM: (" << M << ", " << K << ", " << N << ")"
  99. << std::endl;
  100. Param param;
  101. param.transposeA = true;
  102. param.transposeB = true;
  103. benchmarker_gemm.set_param(param)
  104. .set_dtype(0, dtype::Float32())
  105. .set_dtype(1, dtype::Float32());
  106. time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
  107. perf = 2.f * M * K * N / time * mod;
  108. std::cout << "gemm fp32, Performance is " << perf << " Gflops"
  109. << std::endl;
  110. benchmarker_gemm.set_param(param)
  111. .set_dtype(0, dtype::Float16())
  112. .set_dtype(1, dtype::Float16());
  113. time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
  114. perf = 2.f * M * K * N / time * mod;
  115. std::cout << "gemm fp16, Performance is " << perf << " Gflops"
  116. << std::endl;
  117. };
  118. // run M = K = N
  119. run(32, 32, 32);
  120. run(64, 64, 64);
  121. run(128, 128, 128);
  122. run(256, 256, 256);
  123. run(512, 512, 512);
  124. run(1024, 1024, 1024);
  125. run(2048, 2048, 2048);
  126. }
  127. TEST_F(AARCH64, BENCHMARK_TRANSPOSED_MATRIX_MUL_ALL_SIZES_FP16) {
  128. int exec_times = 50;
  129. Benchmarker<MatrixMul> benchmarker_gemm(handle());
  130. benchmarker_gemm.set_times(exec_times);
  131. float mod = 1000 * exec_times / 1e9;
  132. using Param = MatrixMul::Param;
  133. auto run = [&](size_t M, size_t K, size_t N) {
  134. float time = 1.f, perf = 1.f;
  135. std::cout << "GEMM: (" << M << ", " << K << ", " << N << ")"
  136. << std::endl;
  137. Param param;
  138. param.transposeA = param.transposeB = true;
  139. benchmarker_gemm.set_param(param)
  140. .set_dtype(0, dtype::Float32())
  141. .set_dtype(1, dtype::Float32());
  142. time = benchmarker_gemm.exec({{K, M}, {N, K}, {}});
  143. perf = 2.f * M * K * N / time * mod;
  144. std::cout << "gemm fp32, Performance is " << perf << " Gflops"
  145. << std::endl;
  146. benchmarker_gemm.set_param(param)
  147. .set_dtype(0, dtype::Float16())
  148. .set_dtype(1, dtype::Float16());
  149. time = benchmarker_gemm.exec({{K, M}, {N, K}, {}});
  150. perf = 2.f * M * K * N / time * mod;
  151. std::cout << "gemm fp16, Performance is " << perf << " Gflops"
  152. << std::endl;
  153. };
  154. std::cout << "warm up:\n";
  155. for (int i = 0; i < 50; i++) {
  156. benchmarker_gemm.set_dtype(0, dtype::Float32())
  157. .set_dtype(1, dtype::Float32())
  158. .set_display(false)
  159. .exec({{256, 256}, {256, 256}, {}});
  160. benchmarker_gemm.set_display(true);
  161. }
  162. // run M = K = N
  163. run(8, 8, 8);
  164. run(16, 16, 16);
  165. run(32, 32, 32);
  166. run(64, 64, 64);
  167. run(128, 128, 128);
  168. run(256, 256, 256);
  169. run(512, 512, 512);
  170. run(1024, 1024, 1024);
  171. run(2048, 2048, 2048);
  172. // run sgmev like
  173. run(32, 32, 1);
  174. run(64, 64, 1);
  175. run(128, 128, 1);
  176. run(256, 256, 1);
  177. run(512, 512, 1);
  178. // run M, N >> K
  179. run(32, 16, 32);
  180. run(64, 16, 64);
  181. run(128, 16, 128);
  182. run(256, 16, 256);
  183. run(512, 16, 512);
  184. // run N, K >> M
  185. run(16, 32, 32);
  186. run(16, 64, 64);
  187. run(16, 128, 128);
  188. run(16, 256, 256);
  189. run(16, 512, 512);
  190. // run M >> K, N
  191. run(32, 16, 16);
  192. run(64, 16, 16);
  193. run(128, 16, 16);
  194. run(256, 16, 16);
  195. run(512, 16, 16);
  196. // run K >> M, N
  197. run(16, 32, 16);
  198. run(16, 64, 16);
  199. run(16, 128, 16);
  200. run(16, 256, 16);
  201. run(16, 512, 16);
  202. // run N >> M, K
  203. run(16, 16, 32);
  204. run(16, 16, 64);
  205. run(16, 16, 128);
  206. run(16, 16, 256);
  207. run(16, 16, 512);
  208. // run VGG
  209. // conv 1.1
  210. run(64, 3 * 3 * 3, 224 * 224);
  211. // conv 1.2
  212. run(128, 64 * 3 * 3, 112 * 112);
  213. // conv 2.1
  214. run(128, 128 * 3 * 3, 112 * 112);
  215. // conv 2.2
  216. run(128, 128 * 3 * 3, 56 * 56);
  217. // conv 3.1
  218. run(256, 128 * 3 * 3, 56 * 56);
  219. // conv 3.2
  220. run(256, 256 * 3 * 3, 28 * 28);
  221. // conv 4.1
  222. run(512, 256 * 3 * 3, 28 * 28);
  223. // conv 4.2
  224. run(512, 512 * 3 * 3, 14 * 14);
  225. }
  226. #endif
  227. #endif
  228. } // namespace test
  229. } // namespace megdnn
  230. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台