
matrix_mul.cpp

/**
 * \file dnn/test/naive/matrix_mul.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "test/naive/fixture.h"

#include "megdnn/oprs/linalg.h"
#include "test/common/checker.h"
#include "test/common/matrix_mul.h"
#include "test/common/random_state.h"
#include "test/common/extra_impl_helper.h"

using namespace megdnn;
using namespace test;
namespace {

// Checks a packed-format (MK4 / MK8 / MK4_DOT) MatrixMul against a reference
// that relayouts the packed operands back to the DEFAULT format and runs a
// plain matmul on them.
void run_matmul_mk_format(Handle* handle, param::MatrixMul::Format format,
                          DType Atype, DType Btype, DType Ctype) {
    using namespace matrix_mul;
    std::vector<TestArg> args = get_matmul_args();
    Checker<MatrixMul> checker(handle);
    auto extra_impl = [](const TensorNDArray& tensors, param::MatrixMul param,
                         Handle* handle, size_t pack_size) {
        megdnn_assert((param.format == param::MatrixMul::Format::MK4 ||
                       param.format == param::MatrixMul::Format::MK4_DOT ||
                       param.format == param::MatrixMul::Format::MK8) &&
                      tensors.size() == 3);
        param::MatrixMul new_param = param;
        new_param.format = param::MatrixMul::Format::DEFAULT;
        size_t M = tensors[2].layout[0] * pack_size;
        size_t N = tensors[2].layout[1];
        size_t K = tensors[0].layout[1 - param.transposeA] * pack_size;
        TensorLayoutArray default_layouts, mk4_layouts;
        if (param.transposeA) {
            default_layouts.emplace_back(tensors[0].layout.reshape({K, M}));
            if (param.format == param::MatrixMul::Format::MK4_DOT) {
                mk4_layouts.emplace_back(
                        default_layouts.back()
                                .reshape({K / pack_size, M / pack_size,
                                          pack_size, pack_size})
                                .dimshuffle({0, 3, 1, 2}));
            } else {
                mk4_layouts.emplace_back(
                        default_layouts.back()
                                .reshape({K / pack_size, M / pack_size,
                                          pack_size, pack_size})
                                .dimshuffle({0, 2, 1, 3}));
            }
        } else {
            default_layouts.emplace_back(tensors[0].layout.reshape({M, K}));
            if (param.format == param::MatrixMul::Format::MK4_DOT) {
                mk4_layouts.emplace_back(
                        default_layouts.back()
                                .reshape({M / pack_size, K / pack_size,
                                          pack_size, pack_size})
                                .dimshuffle({0, 2, 1, 3}));
            } else {
                mk4_layouts.emplace_back(
                        default_layouts.back()
                                .reshape({M / pack_size, K / pack_size,
                                          pack_size, pack_size})
                                .dimshuffle({0, 3, 1, 2}));
            }
        }
        if (param.transposeB) {
            default_layouts.emplace_back(tensors[1].layout.reshape({N, K}));
            mk4_layouts.emplace_back(
                    default_layouts.back()
                            .reshape({N, K / pack_size, pack_size})
                            .dimshuffle({0, 1, 2}));
        } else {
            default_layouts.emplace_back(tensors[1].layout.reshape({K, N}));
            mk4_layouts.emplace_back(
                    default_layouts.back()
                            .reshape({K / pack_size, N, pack_size})
                            .dimshuffle({0, 2, 1}));
        }
        default_layouts.emplace_back(tensors[2].layout.reshape({M, N}));
        mk4_layouts.emplace_back(default_layouts.back()
                                         .reshape({M / pack_size, N, pack_size})
                                         .dimshuffle({0, 2, 1}));

        auto matmul_opr = handle->create_operator<MatrixMul>();
        matmul_opr->param() = new_param;
        size_t matmul_workspace = matmul_opr->get_workspace_in_bytes(
                default_layouts[0], default_layouts[1], default_layouts[2]);
        auto relayout_opr = handle->create_operator<Relayout>();
        WorkspaceBundle wb(nullptr, {default_layouts[0].span().dist_byte(),
                                     default_layouts[1].span().dist_byte(),
                                     default_layouts[2].span().dist_byte(),
                                     matmul_workspace});
        wb.set(malloc(wb.total_size_in_bytes()));
        TensorNDArray default_tensors, mk4_tensors;
        for (size_t i = 0; i < 3; i++) {
            default_tensors.emplace_back(wb.get(i), default_layouts[i]);
            mk4_tensors.emplace_back(tensors[i].raw_ptr, mk4_layouts[i]);
        }
        // Unpack A and B, run the DEFAULT-format matmul, then pack C back.
        relayout_opr->exec(mk4_tensors[0], default_tensors[0]);
        relayout_opr->exec(mk4_tensors[1], default_tensors[1]);
        matmul_opr->exec(default_tensors[0], default_tensors[1],
                         default_tensors[2], wb.get_workspace(3));
        relayout_opr->exec(default_tensors[2], mk4_tensors[2]);
        free(wb.ptr());
    };

    size_t pack_size = MatrixMulForward::pack_size(format);
    for (auto&& arg : args) {
        // Packed formats require M and K to be multiples of the pack size.
        if (arg.m % pack_size != 0 || arg.k % pack_size != 0)
            continue;
        param::MatrixMul param;
        param.transposeA = arg.mask & 0x1;
        param.transposeB = arg.mask & 0x2;
        param.format = format;
        size_t m = arg.m, n = arg.n, k = arg.k;
        TensorShape A, B;
        if (param.transposeA) {
            A = TensorShape{k / pack_size, m / pack_size, pack_size, pack_size};
        } else {
            A = TensorShape{m / pack_size, k / pack_size, pack_size, pack_size};
        }
        if (param.transposeB) {
            B = TensorShape{n, k / pack_size, pack_size};
        } else {
            B = TensorShape{k / pack_size, n, pack_size};
        }
        checker.set_extra_opr_impl(std::bind(extra_impl, std::placeholders::_1,
                                             param, handle, pack_size));
        checker.set_dtype(0, Atype)
                .set_dtype(1, Btype)
                .set_dtype(2, Ctype)
                .set_epsilon(1e-3)
                .set_param(param)
                .execs({A, B, {}});
    }
}

}  // namespace
TEST_F(NAIVE, MATRIX_MUL_QUANTIZED4x4x32) {
    Checker<MatrixMul> checker(handle(), /* check_dispatch */ false);
    // Builds a Quantized4Asymm tensor: every two 4-bit values are packed into
    // one byte, low nibble first.
    auto GenTensorValueQuint4 = [](const TensorShape& shape,
                                   dtype::Quantized4Asymm dtype,
                                   const std::vector<int>& values) {
        TensorND tensor;
        tensor.layout = {shape, dtype};
        tensor.raw_ptr =
                static_cast<dt_byte*>(malloc(tensor.layout.span().dist_byte()));
        uint8_t* ptr = static_cast<uint8_t*>(tensor.raw_ptr);
        megdnn_assert(values.size() == tensor.layout.span().dist_elem());
        for (size_t i = 0; i < tensor.layout.span().dist_elem(); i += 2) {
            int val0 = values[i], val1 = values[i + 1];
            ptr[i / 2] = val0 | (val1 << 4);
        }
        return tensor;
    };
    using Param = MatrixMul::Param;
    Param param;
    checker.set_param(param);
    checker.set_dtype(2, dtype::QuantizedS32(0.3f * 0.3f));
    checker.exect(
            Testcase{
                    GenTensorValueQuint4(
                            {8, 8}, dtype::Quantized4Asymm(0.3f, (uint8_t)8),
                            {13, 2, 4, 13, 9, 3, 14, 14, 14, 5, 3, 3, 15,
                             11, 8, 8, 5, 7, 14, 15, 8, 2, 11, 1, 15, 9,
                             13, 14, 2, 3, 11, 11, 15, 10, 11, 0, 13, 12, 3,
                             11, 9, 9, 10, 5, 2, 5, 8, 4, 6, 9, 0, 0,
                             3, 9, 9, 8, 8, 15, 7, 5, 0, 3, 9, 10}),
                    GenTensorValueQuint4(
                            {8, 8}, dtype::Quantized4Asymm(0.3f, (uint8_t)8),
                            {5, 14, 13, 11, 4, 7, 12, 12, 11, 7, 13, 10, 5,
                             6, 4, 2, 3, 12, 2, 2, 13, 3, 14, 0, 15, 15,
                             0, 2, 2, 13, 3, 14, 10, 8, 9, 11, 0, 14, 15,
                             4, 14, 7, 1, 6, 13, 2, 12, 5, 2, 15, 7, 11,
                             13, 9, 8, 10, 0, 11, 6, 10, 12, 2, 2, 12}),
                    {}},
            Testcase{
                    {},
                    {},
                    TensorValue(
                            {8, 8}, dtype::QuantizedS32(0.3f * 0.3f),
                            {-90, 120, -3, 40, -31, 58, -54, 165, -5, -19,
                             71, 87, -51, 24, 92, 15, 27, 62, -59, -82,
                             -40, 91, 11, -16, -85, 138, -18, -36, 8, -25,
                             -56, 75, -46, -34, 67, 53, -4, -83, 111, -86,
                             -29, -17, 45, -9, 38, -22, -3, -19, -17, -95,
                             94, 78, 63, -35, -51, 21, -63, -14, 87, 31,
                             44, -53, -107, 5}),
            });
}
TEST_F(NAIVE, MATRIX_MUL_QUANTIZEDS4_4x4x16) {
    Checker<MatrixMul> checker(handle(), /* check_dispatch */ false);
    // Builds a QuantizedS4 tensor: two signed 4-bit values per byte; the low
    // nibble is masked so negative values do not spill into the high nibble.
    auto GenTensorValueQuint4 = [](const TensorShape& shape,
                                   dtype::QuantizedS4 dtype,
                                   const std::vector<int>& values) {
        TensorND tensor;
        tensor.layout = {shape, dtype};
        tensor.raw_ptr =
                static_cast<dt_byte*>(malloc(tensor.layout.span().dist_byte()));
        uint8_t* ptr = static_cast<uint8_t*>(tensor.raw_ptr);
        megdnn_assert(values.size() == tensor.layout.span().dist_elem());
        for (size_t i = 0; i < tensor.layout.span().dist_elem(); i += 2) {
            int val0 = values[i], val1 = values[i + 1];
            ptr[i / 2] = (val0 & 0xF) | (val1 << 4);
        }
        return tensor;
    };
    using Param = MatrixMul::Param;
    Param param;
    checker.set_param(param);
    checker.set_dtype(2, dtype::QuantizedS16(0.3f * 0.3f));
    checker.exect(
            Testcase{
                    GenTensorValueQuint4(
                            {8, 8}, dtype::QuantizedS4(0.3f),
                            {-8, 7, 2, 1, 2, 3, 2, 7,
                             2, 5, 3, 3, 7, 4, -7, 1,
                             -5, 7, -4, -1, -1, 2, 4, 1,
                             7, 2, -6, -2, -6, 3, 4, 4,
                             -2, 2, 3, 0, 6, 5, 3, 4,
                             -1, -1, -5, 5, 2, 5, 1, 4,
                             6, 2, 0, 0, 3, 2, 2, 1,
                             -4, -3, 7, 5, 0, 3, 2, 3}),
                    GenTensorValueQuint4(
                            {8, 8}, dtype::QuantizedS4(0.3f),
                            {5, -8, -7, -6, 4, 7, -5, -5,
                             -4, 7, -3, -2, 5, 6, 4, 2,
                             3, -1, 2, 2, 7, 3, 6, 0,
                             5, 4, 0, 2, 2, 3, 3, 2,
                             1, -8, -7, -6, 0, -5, -4, 4,
                             -3, 7, 1, 6, -2, 2, -1, 5,
                             2, 0, 7, 6, 5, 4, 3, 2,
                             0, 0, 1, 0, 5, 2, 2, 6}),
                    {}},
            Testcase{
                    {},
                    {},
                    TensorValue(
                            {8, 8}, dtype::QuantizedS16(0.3f * 0.3f),
                            {-60, 120, 49, 58, 58, 13, 92, 125,
                             -5, 0, -116, -70, 22, 9, -14, 46,
                             -69, 111, 44, 48, 6, 19, 42, 57,
                             -8, 25, 10, 16, 26, 97, -28, -12,
                             -12, 14, 2, 26, 48, 7, 24, 93,
                             -2, 45, 2, 32, -19, -1, -16, 72,
                             23, -44, -52, -34, 45, 53, -28, 6,
                             33, 45, 71, 84, 47, 10, 74, 61})
            });
}
TEST_F(NAIVE, MATRIX_MUL_QUANTIZED8x8x32) {
    Checker<MatrixMul> checker(handle(), /* check_dispatch */ false);
    MatrixMul::Param param;
    param.transposeA = false;
    param.transposeB = false;
    checker.set_param(param).exect(
            Testcase{TensorValue(
                             {4, 7}, dtype::Quantized8Asymm(0.1f, (uint8_t)128),
                             {6, 97, 210, 47, 213, 246, 92, 121, 132, 133,
                              37, 31, 87, 71, 0, 5, 198, 11, 97, 141,
                              222, 166, 76, 212, 190, 108, 245, 143}),
                     TensorValue({7, 5},
                                 dtype::Quantized8Asymm(0.2f, (uint8_t)233),
                                 {89, 207, 79, 135, 43, 29, 235, 171, 40,
                                  78, 119, 145, 254, 162, 184, 139, 248, 214,
                                  201, 183, 127, 75, 48, 200, 96, 109, 63,
                                  60, 100, 120, 111, 182, 150, 227, 92}),
                     {}},
            Testcase{{},
                     {},
                     TensorValue({4, 5}, dtype::QuantizedS32(0.1f * 0.2f),
                                 {2908, -36975, -9180, -3574, 8114,
                                  30496, 23588, 32433, 11467, 30974,
                                  36748, -6939, 26715, 33787, 35329,
                                  -24486, -25049, -19828, -16627, -18972})});

    param.transposeA = true;
    checker.set_param(param).exect(
            Testcase{TensorValue({2, 1},
                                 dtype::Quantized8Asymm(0.7f, (uint8_t)128),
                                 {129, 129}),
                     TensorValue({2, 1},
                                 dtype::Quantized8Asymm(0.4f, (uint8_t)128),
                                 {129, 129}),
                     {}},
            Testcase{{},
                     {},
                     TensorValue({1, 1}, dtype::QuantizedS32(0.7f * 0.4f),
                                 {2})});
}
TEST_F(NAIVE, MATRIX_MUL_MK4) {
    run_matmul_mk_format(handle(), param::MatrixMul::Format::MK4,
                         dtype::Float32(), dtype::Float32(), dtype::Float32());
}

TEST_F(NAIVE, MATRIX_MUL_MK8) {
    run_matmul_mk_format(handle(), param::MatrixMul::Format::MK8,
                         dtype::Int16(), dtype::Int16(), dtype::Int32());
}

TEST_F(NAIVE, MATRIX_MUL_MK4_DOT) {
    run_matmul_mk_format(handle(), param::MatrixMul::Format::MK4_DOT,
                         dtype::Int8(), dtype::Int8(), dtype::Int32());
}
TEST_F(NAIVE, MATRIX_MUL_BFLOAT16) {
    Checker<MatrixMul> checker(handle(), /* check_dispatch */ false);
    MatrixMul::Param param, fp32_param;
    fp32_param = param;
    param.compute_mode = param::MatrixMul::ComputeMode::FLOAT32;
    checker.set_param(param);
    checker.set_dtype(0, dtype::BFloat16());
    checker.set_dtype(1, dtype::BFloat16());
    checker.set_dtype(2, dtype::BFloat16());
    // Reference result is computed in float32 via the extra-impl helper.
    auto extra_impl = extra_impl_helper<MatrixMul>(handle(), fp32_param);
    checker.set_extra_opr_impl(extra_impl);
    checker.set_epsilon(1.5e-2);
    UniformFloatRNG frng{1e-2, 5.f};
    checker.set_rng(0, &frng);
    checker.set_rng(1, &frng);
    checker.execs({{8, 8}, {8, 8}, {}});

    param.compute_mode = param::MatrixMul::ComputeMode::DEFAULT;
    checker.set_param(param);
    checker.execs({{8, 8}, {8, 8}, {}});
}

// vim: syntax=cpp.doxygen

The MegEngine installation package already bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build to choose between. To run GPU programs, make sure the machine actually has a GPU and that its driver is installed. If you would like to try deep-learning development on cloud GPU resources, you are welcome to visit the MegStudio platform.
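As a quick sanity check before running GPU code, a minimal sketch along these lines can confirm whether MegEngine sees a CUDA device; it assumes megengine.is_cuda_available(), which recent MegEngine releases expose.

# Minimal sketch: check whether this MegEngine install can use a GPU.
import megengine as mge

if mge.is_cuda_available():
    print("CUDA device detected; GPU code can run on this machine.")
else:
    print("No usable GPU/driver found; MegEngine will run on CPU.")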