You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or number, can include dashes ('-') and can be up to 35 characters long.

batched_matrix_mul.cpp 7.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  1. /**
  2. * \file dnn/test/aarch64/batched_matrix_mul.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "test/common/benchmarker.h"
  12. #include "test/common/checker.h"
  13. #include "test/common/matrix_mul.h"
  14. #include "test/common/rng.h"
  15. #include "test/aarch64/fixture.h"
  16. namespace megdnn {
  17. namespace test {
  18. TEST_F(AARCH64, BATCHED_MATRIX_MUL) {
  19. Checker<BatchedMatrixMul> checker(handle());
  20. checker.set_epsilon(1e-2);
  21. using Param = MatrixMul::Param;
  22. // auto args = get_batch_matmul_args();
  23. auto args = matrix_mul::get_batched_matmul_args();
  24. for (DType dtype : std::vector<DType>{dtype::Float32()}) {
  25. for (unsigned mask = 0; mask < 4; ++mask) {
  26. for (auto& arg : args) {
  27. size_t b = arg.b, m = arg.m, n = arg.n, k = arg.k;
  28. //! if test all batch sizes, the test case will time out.
  29. if (b != 2) {
  30. continue;
  31. }
  32. Param param;
  33. param.transposeA = mask & 1;
  34. param.transposeB = mask & 2;
  35. TensorShape A, B;
  36. if (param.transposeA)
  37. A = TensorShape{b, k, m};
  38. else
  39. A = TensorShape{b, m, k};
  40. if (param.transposeB)
  41. B = TensorShape{b, n, k};
  42. else
  43. B = TensorShape{b, k, n};
  44. checker.set_param(param).set_dtype(0, dtype).set_dtype(1, dtype).execs(
  45. {A, B, {}});
  46. }
  47. }
  48. }
  49. }
  50. #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  51. TEST_F(AARCH64, BATCHED_MATRIX_MUL_FP16) {
  52. Checker<BatchedMatrixMul> checker(handle());
  53. using Param = MatrixMul::Param;
  54. auto args = matrix_mul::get_batched_matmul_args();
  55. NormalRNG rng(1.f);
  56. checker.set_rng(0, &rng).set_rng(1, &rng).set_epsilon(1e-2);
  57. for (DType dtype : std::vector<DType>{dtype::Float16()}) {
  58. for (unsigned mask = 0; mask < 4; ++mask) {
  59. for (auto& arg : args) {
  60. size_t b = arg.b, m = arg.m, n = arg.n, k = arg.k;
  61. //! if test all batch sizes, the test case will time out on
  62. //! sdm855
  63. if (b != 1) {
  64. continue;
  65. }
  66. Param param;
  67. param.transposeA = mask & 1;
  68. param.transposeB = mask & 2;
  69. TensorShape A, B;
  70. if (param.transposeA)
  71. A = TensorShape{b, k, m};
  72. else
  73. A = TensorShape{b, m, k};
  74. if (param.transposeB)
  75. B = TensorShape{b, n, k};
  76. else
  77. B = TensorShape{b, k, n};
  78. checker.set_param(param)
  79. .set_dtype(0, dtype)
  80. .set_dtype(1, dtype)
  81. .set_dtype(2, dtype)
  82. .execs({A, B, {}});
  83. }
  84. }
  85. }
  86. }
  87. #if MEGDNN_WITH_BENCHMARK
  88. TEST_F(AARCH64, BENCHMARK_TRANSPOSED_MATRIX_MUL_QUICK_FP16) {
  89. int exec_times = 10;
  90. Benchmarker<MatrixMul> benchmarker_gemm(handle());
  91. benchmarker_gemm.set_times(exec_times);
  92. float mod = 1000 * exec_times / 1e9;
  93. using Param = MatrixMul::Param;
  94. auto run = [&](size_t M, size_t K, size_t N) {
  95. float time = 1.f, perf = 1.f;
  96. std::cout << "GEMM: (" << M << ", " << K << ", " << N << ")" << std::endl;
  97. Param param;
  98. param.transposeA = true;
  99. param.transposeB = true;
  100. benchmarker_gemm.set_param(param)
  101. .set_dtype(0, dtype::Float32())
  102. .set_dtype(1, dtype::Float32());
  103. time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
  104. perf = 2.f * M * K * N / time * mod;
  105. std::cout << "gemm fp32, Performance is " << perf << " Gflops" << std::endl;
  106. benchmarker_gemm.set_param(param)
  107. .set_dtype(0, dtype::Float16())
  108. .set_dtype(1, dtype::Float16());
  109. time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
  110. perf = 2.f * M * K * N / time * mod;
  111. std::cout << "gemm fp16, Performance is " << perf << " Gflops" << std::endl;
  112. };
  113. // run M = K = N
  114. run(32, 32, 32);
  115. run(64, 64, 64);
  116. run(128, 128, 128);
  117. run(256, 256, 256);
  118. run(512, 512, 512);
  119. run(1024, 1024, 1024);
  120. run(2048, 2048, 2048);
  121. }
TEST_F(AARCH64, BENCHMARK_TRANSPOSED_MATRIX_MUL_ALL_SIZES_FP16) {
    // Broad fp32-vs-fp16 GEMM throughput sweep with both operands
    // transposed: square sizes, skewed sizes (one dimension much larger
    // than the others), and VGG-style convolution-as-GEMM shapes.
    int exec_times = 50;
    Benchmarker<MatrixMul> benchmarker_gemm(handle());
    benchmarker_gemm.set_times(exec_times);
    // Scale factor so that `perf` below comes out in Gflops (the exact
    // time unit comes from Benchmarker::exec — presumably ms; confirm there).
    float mod = 1000 * exec_times / 1e9;
    using Param = MatrixMul::Param;
    auto run = [&](size_t M, size_t K, size_t N) {
        float time = 1.f, perf = 1.f;
        std::cout << "GEMM: (" << M << ", " << K << ", " << N << ")" << std::endl;
        Param param;
        param.transposeA = param.transposeB = true;
        // fp32 pass; A is stored K x M and B is stored N x K because both
        // transpose flags are set.
        benchmarker_gemm.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32());
        time = benchmarker_gemm.exec({{K, M}, {N, K}, {}});
        perf = 2.f * M * K * N / time * mod;
        std::cout << "gemm fp32, Performance is " << perf << " Gflops" << std::endl;
        // fp16 pass over the same shapes.
        benchmarker_gemm.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16());
        time = benchmarker_gemm.exec({{K, M}, {N, K}, {}});
        perf = 2.f * M * K * N / time * mod;
        std::cout << "gemm fp16, Performance is " << perf << " Gflops" << std::endl;
    };
    // Warm-up iterations before measuring; their output is suppressed via
    // set_display(false) and display is restored afterwards.
    std::cout << "warm up:\n";
    for (int i = 0; i < 50; i++) {
        benchmarker_gemm.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_display(false)
                .exec({{256, 256}, {256, 256}, {}});
        benchmarker_gemm.set_display(true);
    }
    // run M = K = N
    run(8, 8, 8);
    run(16, 16, 16);
    run(32, 32, 32);
    run(64, 64, 64);
    run(128, 128, 128);
    run(256, 256, 256);
    run(512, 512, 512);
    run(1024, 1024, 1024);
    run(2048, 2048, 2048);
    // run sgemv-like (matrix-vector: N == 1)
    run(32, 32, 1);
    run(64, 64, 1);
    run(128, 128, 1);
    run(256, 256, 1);
    run(512, 512, 1);
    // run M, N >> K
    run(32, 16, 32);
    run(64, 16, 64);
    run(128, 16, 128);
    run(256, 16, 256);
    run(512, 16, 512);
    // run N, K >> M
    run(16, 32, 32);
    run(16, 64, 64);
    run(16, 128, 128);
    run(16, 256, 256);
    run(16, 512, 512);
    // run M >> K, N
    run(32, 16, 16);
    run(64, 16, 16);
    run(128, 16, 16);
    run(256, 16, 16);
    run(512, 16, 16);
    // run K >> M, N
    run(16, 32, 16);
    run(16, 64, 16);
    run(16, 128, 16);
    run(16, 256, 16);
    run(16, 512, 16);
    // run N >> M, K
    run(16, 16, 32);
    run(16, 16, 64);
    run(16, 16, 128);
    run(16, 16, 256);
    run(16, 16, 512);
    // run VGG (im2col GEMM shapes: M = out channels, K = in_c * kh * kw,
    // N = out spatial size)
    // conv 1.1
    run(64, 3 * 3 * 3, 224 * 224);
    // conv 1.2
    run(128, 64 * 3 * 3, 112 * 112);
    // conv 2.1
    run(128, 128 * 3 * 3, 112 * 112);
    // conv 2.2
    run(128, 128 * 3 * 3, 56 * 56);
    // conv 3.1
    run(256, 128 * 3 * 3, 56 * 56);
    // conv 3.2
    run(256, 256 * 3 * 3, 28 * 28);
    // conv 4.1
    run(512, 256 * 3 * 3, 28 * 28);
    // conv 4.2
    run(512, 512 * 3 * 3, 14 * 14);
}
  218. #endif
  219. #endif
  220. } // namespace test
  221. } // namespace megdnn
  222. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台