/**
 * \file dnn/test/cuda/matrix_mul.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "test/cuda/fixture.h"

#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/matrix_mul.h"
#include "test/cuda/utils.h"

#if defined(cuda_check)
#undef cuda_check
#endif
#include "src/cuda/utils.h"

namespace megdnn {
namespace test {

#if CUDA_VERSION >= 10000
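// The quantized 4x4x32 (int4) matmul tests below are only meaningful with
// CUDA >= 10.0 and a compute capability 7.5 device, since they rely on wmma
// intrinsics; the EXCEPTION test verifies that older devices reject the op.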
TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION) {
    if (cuda::current_device_prop().major > 7 ||
        (cuda::current_device_prop().major == 7 &&
         cuda::current_device_prop().minor >= 5)) {
        printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION test as current "
               "device supports wmma intrinsics\n");
        return;
    }
    Checker<MatrixMul> checker(handle_cuda(), false);
    using Param = MatrixMul::Param;
    Param param;
    param.transposeB = true;
    checker.set_param(param);
    checker.set_dtype(0, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
    checker.set_dtype(1, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
    checker.set_dtype(2, dtype::QuantizedS32(1.3f * 1.3f));
    ASSERT_THROW(checker.exec({{256, 256}, {256, 256}, {256, 256}}), MegDNNError);
}

TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32) {
    require_compute_capability(7, 5);
    Checker<MatrixMul> checker(handle_cuda(), false);
    using Param = MatrixMul::Param;
    Param param;
    param.transposeB = true;
    checker.set_param(param);
    checker.set_dtype(0, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
    checker.set_dtype(1, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
    checker.set_dtype(2, dtype::QuantizedS32(1.3f * 1.3f));
    checker.exec({{256, 256}, {256, 256}, {256, 256}});
    auto args = matrix_mul::get_matmul_args();
    for (auto arg : args) {
        // round m, n up to multiples of 8 and k up to a multiple of 32
        // (presumably the alignment required by the 4-bit wmma kernel)
        size_t m = (arg.m + 7) / 8 * 8, n = (arg.n + 7) / 8 * 8,
               k = (arg.k + 31) / 32 * 32;
        checker.exec({{m, k}, {n, k}, {m, n}});
    }
}

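// Benchmarks are compiled only when MEGDNN_WITH_BENCHMARK is enabled; each one
// reports throughput in TFLOPS derived from 2 * m * n * k operations per matmul.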
#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
    require_compute_capability(7, 5);
    Benchmarker<MatrixMul> bencher(handle_cuda());
    using Param = MatrixMul::Param;
    Param param;
    param.transposeB = true;
    bencher.set_param(param);
    bencher.set_dtype(0, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
    bencher.set_dtype(1, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
    bencher.set_dtype(2, dtype::QuantizedS32(1.0f));
    for (size_t m : {256, 1024, 4096, 10240, 40960}) {
        for (size_t n : {256, 1024, 4096}) {
            for (size_t k : {512, 1024, 2048}) {
                bencher.set_times(400);
                auto time_in_ms = bencher.exec({{m, k}, {n, k}, {m, n}}) / 400;
                // 2 * m * k * n flops per matmul; 1e-12 converts flop/s to TFLOPS
                auto tflops = 2.0 * m * k * n / (time_in_ms * 1e-3) * 1e-12;
                printf("m=%zu, k=%zu, n=%zu, time: %fms, perf: %f TFlops\n", m, k, n,
                       time_in_ms, tflops);
            }
        }
    }
}

TEST_F(CUDA, PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
    require_compute_capability(7, 5);
    Benchmarker<MatrixMul> bencher(handle_cuda());
    using Param = MatrixMul::Param;
    Param param;
    param.transposeB = true;
    bencher.set_param(param);
    bencher.set_dtype(0, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
    bencher.set_dtype(1, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
    bencher.set_dtype(2, dtype::QuantizedS32(1.0f));
    bencher.set_times(400);
    size_t m = 4096, n = 4096, k = 81920;
    auto time_in_ms = bencher.exec({{m, k}, {n, k}, {m, n}}) / 400;
    auto tflops = 2.0 * m * k * n / (time_in_ms * 1e-3) * 1e-12;
    printf("m=%zu, k=%zu, n=%zu, time: %fms, perf: %f TFlops\n", m, k, n, time_in_ms,
           tflops);
}
#endif
#endif

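// The int8x8x32 matmul tests below require compute capability 6.1 (the first
// architecture providing the dp4a int8 dot-product instruction).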
TEST_F(CUDA, MATRIX_MUL_INT8x8x32_WITH_SPETIAL_STRIDES) {
    require_compute_capability(6, 1);
    Checker<MatrixMul> checker(handle_cuda());
    using Param = MatrixMul::Param;
    Param param;
    DType stype = dtype::Int8();
    checker.set_param(param)
            .set_dtype(0, stype)
            .set_dtype(1, stype)
            .set_dtype(2, dtype::Int32())
            .set_epsilon(5e-3);
    size_t m = 1024, n = 1024, k = 1024;
    {
        TensorLayout A{{m, k}, {2048, 1}, dtype::Int8()},
                B{{k, n}, {2048, 1}, dtype::Int8()}, C{{m, n}, dtype::Int32()};
        checker.execl({A, B, {}});
    }
}

TEST_F(CUDA, MATRIX_MUL_INT8x8x32_NAIVE) {
    require_compute_capability(6, 1);
    using Param = MatrixMul::Param;
    UniformIntRNG rng{-128, 127};
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_rng(0, &rng).set_rng(1, &rng);
    size_t m = 1007, n = 1003, k = 129;
    for (unsigned mask = 0; mask < 4; ++mask) {
        Param param;
        param.transposeA = mask & 1;
        param.transposeB = mask & 2;
        TensorShape A, B;
        if (param.transposeA)
            A = TensorShape{k, m};
        else
            A = TensorShape{m, k};
        if (param.transposeB)
            B = TensorShape{n, k};
        else
            B = TensorShape{k, n};
        checker.set_param(param)
                .set_dtype(0, dtype::Int8())
                .set_dtype(1, dtype::Int8())
                .set_dtype(2, dtype::Int32())
                .set_epsilon(0)
                .execs({A, B, {}});
    }
}

TEST_F(CUDA, MATRIX_MUL_FLOAT_NAIVE) {
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>("NAIVE"));
    using Param = MatrixMul::Param;
    size_t m = 12, n = 16, k = 20;
    std::vector<DType> dtype_array;
    dtype_array.push_back(dtype::Float32());
    dtype_array.push_back(dtype::Float16());
    for (DType dtype : dtype_array) {
        for (unsigned mask = 0; mask < 4; ++mask) {
            Param param;
            param.transposeA = mask & 1;
            param.transposeB = mask & 2;
            DType stype = dtype;
            TensorShape A, B;
            if (param.transposeA)
                A = TensorShape{k, m};
            else
                A = TensorShape{m, k};
            if (param.transposeB)
                B = TensorShape{n, k};
            else
                B = TensorShape{k, n};
            if (dtype == dtype::Float16()) {
                param.compute_mode = param::MatrixMul::ComputeMode::FLOAT32;
            }
            checker.set_param(param)
                    .set_dtype(0, stype)
                    .set_dtype(1, stype)
                    .set_dtype(2, dtype)
                    .set_epsilon(dtype == dtype::Float16() ? 5e-2 : 5e-3)
                    .execs({A, B, {}});
        }
    }
}

TEST_F(CUDA, MATRIX_MUL) {
    Checker<MatrixMul> checker(handle_cuda());
    using Param = MatrixMul::Param;
    size_t m = 12, n = 16, k = 20;
    bool is_int_available = check_compute_capability(6, 1);
    std::vector<DType> dtype_array;
    dtype_array.push_back(dtype::Float32());
    dtype_array.push_back(dtype::Float16());
    dtype_array.push_back(dtype::BFloat16());
    if (is_int_available)
        dtype_array.push_back(dtype::Int32());
    for (DType dtype : dtype_array) {
        for (unsigned mask = 0; mask < 4; ++mask) {
            Param param;
            param.transposeA = mask & 1;
            param.transposeB = mask & 2;
            DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
            TensorShape A, B;
            if (param.transposeA)
                A = TensorShape{k, m};
            else
                A = TensorShape{m, k};
            if (param.transposeB)
                B = TensorShape{n, k};
            else
                B = TensorShape{k, n};
            if (dtype == dtype::BFloat16()) {
                param.compute_mode = param::MatrixMul::ComputeMode::FLOAT32;
                checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>(
                        ExecutionPolicyAlgoName{"MATMUL_BFLOAT16", {{"CUBLAS", {}}}}));
            }
            checker.set_param(param)
                    .set_dtype(0, stype)
                    .set_dtype(1, stype)
                    .set_dtype(2, dtype)
                    .set_epsilon(
                            dtype == dtype::Float16() || dtype == dtype::BFloat16()
                                    ? 5e-2
                                    : 5e-3)
                    .execs({A, B, {}});
            if (dtype == dtype::BFloat16()) {
                checker.reset_before_exec_callback();
                checker.opr()->execution_policy() = {};
            }
        }
    }
    // general tests
    auto args = matrix_mul::get_matmul_args();
    for (auto arg : args) {
        auto m = arg.m, n = arg.n, k = arg.k;
        auto mask = arg.mask;
        Param param;
        param.transposeA = mask & 1;
        param.transposeB = mask & 2;
        TensorShape AS, BS, CS;
        if (param.transposeA)
            AS = TensorShape{k, m};
        else
            AS = TensorShape{m, k};
        if (param.transposeB)
            BS = TensorShape{n, k};
        else
            BS = TensorShape{k, n};
        CS = TensorShape{m, n};
        TensorLayout AL, BL, CL;
        if (arg.A_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            AL = TensorLayout(AS, dtype::Float32());
        } else {
            AL = TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1}, dtype::Float32());
        }
        if (arg.B_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            BL = TensorLayout(BS, dtype::Float32());
        } else {
            BL = TensorLayout(BS, {ptrdiff_t(arg.B_stride), 1}, dtype::Float32());
        }
        if (arg.C_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            CL = TensorLayout(CS, dtype::Float32());
        } else {
            CL = TensorLayout(CS, {ptrdiff_t(arg.C_stride), 1}, dtype::Float32());
        }
        checker.set_param(param).execl({AL, BL, CL});
    }
}

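// The remaining tests pin a specific backend via AlgoChecker ("CUBLAS_LT" or
// "MATMUL_CONV1X1") so that each algorithm is exercised directly rather than
// whichever implementation the dispatcher would otherwise pick.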
TEST_F(CUDA, MATRIX_MUL_CUBLASLT) {
    require_compute_capability(7, 5);
    NormalRNG normal_rng;
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_rng(0, &normal_rng)
            .set_rng(1, &normal_rng)
            .set_before_exec_callback(AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
    using Param = MatrixMul::Param;
    size_t m = 32, n = 32, k = 32;
    // test Int8 matmul
    {
        DType dtype = dtype::Int32();
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
        TensorShape A, B;
        A = TensorShape{m, k};
        B = TensorShape{k, n};
        checker.set_param(param)
                .set_dtype(0, stype)
                .set_dtype(1, stype)
                .set_dtype(2, dtype)
                .set_epsilon(dtype == dtype::Float16() ? 5e-2 : 5e-3)
                .execs({A, B, {}});
    }
    // test floating-point matmul
    for (DType dtype : std::array<DType, 2>{{dtype::Float32(), dtype::Float16()}}) {
        for (unsigned mask = 0; mask < 4; ++mask) {
            Param param;
            param.transposeA = mask & 1;
            param.transposeB = mask & 2;
            DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
            TensorShape A, B;
            if (param.transposeA)
                A = TensorShape{k, m};
            else
                A = TensorShape{m, k};
            if (param.transposeB)
                B = TensorShape{n, k};
            else
                B = TensorShape{k, n};
            checker.set_param(param)
                    .set_dtype(0, stype)
                    .set_dtype(1, stype)
                    .set_dtype(2, dtype)
                    .set_epsilon(dtype == dtype::Float16() ? 5e-2 : 8e-3)
                    .execs({A, B, {}});
        }
    }
    // general tests
    auto args = matrix_mul::get_matmul_args();
    for (auto arg : args) {
        auto m = arg.m, n = arg.n, k = arg.k;
        auto mask = arg.mask;
        Param param;
        param.transposeA = mask & 1;
        param.transposeB = mask & 2;
        TensorShape AS, BS, CS;
        if (param.transposeA)
            AS = TensorShape{k, m};
        else
            AS = TensorShape{m, k};
        if (param.transposeB)
            BS = TensorShape{n, k};
        else
            BS = TensorShape{k, n};
        CS = TensorShape{m, n};
        TensorLayout AL, BL, CL;
        if (arg.A_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            AL = TensorLayout(AS, dtype::Float32());
        } else {
            AL = TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1}, dtype::Float32());
        }
        if (arg.B_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            BL = TensorLayout(BS, dtype::Float32());
        } else {
            BL = TensorLayout(BS, {ptrdiff_t(arg.B_stride), 1}, dtype::Float32());
        }
        if (arg.C_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            CL = TensorLayout(CS, dtype::Float32());
        } else {
            CL = TensorLayout(CS, {ptrdiff_t(arg.C_stride), 1}, dtype::Float32());
        }
        checker.set_param(param).execl({AL, BL, CL});
    }
}

TEST_F(CUDA, MATRIX_MUL_CUBLASLT_SPECIAL_CASE) {
    require_compute_capability(7, 5);
    size_t m = 12, n = 16, k = 20;
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
    using Param = MatrixMul::Param;
    Param param;
    DType stype = dtype::Float32();
    DType dtype = dtype::Float32();
    TensorShape A, B;
    param.transposeA = param.transposeB = 1;
    if (param.transposeA)
        A = TensorShape{k, m};
    else
        A = TensorShape{m, k};
    if (param.transposeB)
        B = TensorShape{n, k};
    else
        B = TensorShape{k, n};
    checker.set_param(param)
            .set_dtype(0, stype)
            .set_dtype(1, stype)
            .set_dtype(2, dtype)
            .set_epsilon(dtype == dtype::Float16() ? 5e-1 : 5e-2)
            .execs({A, B, {}});
}

TEST_F(CUDA, MATRIX_MUL_CUBLASLT_INT8) {
    require_compute_capability(7, 5);
    NormalRNG normal_rng;
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_rng(0, &normal_rng)
            .set_rng(1, &normal_rng)
            .set_before_exec_callback(AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
    using Param = MatrixMul::Param;
    // test Int8 matmul
    for (size_t m = 8; m <= 64; m += 4)
        for (size_t n = 8; n <= 64; n += 4)
            for (size_t k = 8; k <= 64; k += 4) {
                DType dtype = dtype::Int32();
                Param param;
                param.transposeA = false;
                param.transposeB = false;
                DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
                TensorShape A, B;
                A = TensorShape{m, k};
                B = TensorShape{k, n};
                checker.set_param(param)
                        .set_dtype(0, stype)
                        .set_dtype(1, stype)
                        .set_dtype(2, dtype)
                        .set_epsilon(dtype == dtype::Float16() ? 5e-2 : 5e-3)
                        .execs({A, B, {}});
            }
}

TEST_F(CUDA, MATRIX_MUL_CUBLASLT_F32) {
    require_compute_capability(7, 5);
    size_t m = 128, n = 1024, k = 18432;
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
    using Param = MatrixMul::Param;
    Param param;
    DType stype = dtype::Float32();
    DType dtype = dtype::Float32();
    TensorShape A, B;
    param.transposeA = param.transposeB = 0;
    if (param.transposeA)
        A = TensorShape{k, m};
    else
        A = TensorShape{m, k};
    if (param.transposeB)
        B = TensorShape{n, k};
    else
        B = TensorShape{k, n};
    checker.set_param(param)
            .set_dtype(0, stype)
            .set_dtype(1, stype)
            .set_dtype(2, dtype)
            .execs({A, B, {}});
}

TEST_F(CUDA, MATRIX_MUL_CUDNN_F32_uncont) {
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>("MATMUL_CONV1X1"));
    using Param = MatrixMul::Param;
    size_t m = 100, n = 100, k = 100;
    Param param;
    param.transposeA = 1;
    param.transposeB = 1;
    TensorLayout A{{m, k}, {128, 1}, dtype::Float32()},
            B{{k, n}, {128, 1}, dtype::Float32()}, C{{m, n}, dtype::Float32()};
    DType stype = dtype::Float32();
    DType dtype = dtype::Float32();
    checker.set_param(param)
            .set_dtype(0, stype)
            .set_dtype(1, stype)
            .set_dtype(2, dtype)
            .execl({A, B, {}});
}

TEST_F(CUDA, MATRIX_MUL_CUDNN_F32) {
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>("MATMUL_CONV1X1"));
    using Param = MatrixMul::Param;
    for (size_t m = 8; m <= 64; m += 4) {
        for (size_t n = 8; n <= 64; n += 4) {
            for (size_t k = 8; k <= 64; k += 4) {
                for (unsigned mask = 0; mask < 4; ++mask) {
                    Param param;
                    param.transposeA = mask & 1;
                    param.transposeB = mask & 2;
                    DType stype = dtype::Float32();
                    DType dtype = dtype::Float32();
                    TensorShape A, B;
                    if (param.transposeA)
                        A = TensorShape{k, m};
                    else
                        A = TensorShape{m, k};
                    if (param.transposeB)
                        B = TensorShape{n, k};
                    else
                        B = TensorShape{k, n};
                    checker.set_param(param)
                            .set_dtype(0, stype)
                            .set_dtype(1, stype)
                            .set_dtype(2, dtype)
                            .execs({A, B, {}});
                }
            }
        }
    }
}

TEST_F(CUDA, MATRIX_MUL_CUDNN_F16) {
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>("MATMUL_CONV1X1"));
    using Param = MatrixMul::Param;
    for (size_t m = 8; m <= 64; m += 4) {
        for (size_t n = 8; n <= 64; n += 4) {
            for (size_t k = 8; k <= 64; k += 4) {
                for (unsigned mask = 0; mask < 4; ++mask) {
                    Param param;
                    param.transposeA = mask & 1;
                    param.transposeB = mask & 2;
                    DType stype = dtype::Float16();
                    DType dtype = dtype::Float16();
                    TensorShape A, B;
                    if (param.transposeA)
                        A = TensorShape{k, m};
                    else
                        A = TensorShape{m, k};
                    if (param.transposeB)
                        B = TensorShape{n, k};
                    else
                        B = TensorShape{k, n};
                    checker.set_param(param)
                            .set_dtype(0, stype)
                            .set_dtype(1, stype)
                            .set_dtype(2, dtype)
                            .set_epsilon(6e-2)
                            .execs({A, B, {}});
                }
            }
        }
    }
}

} // namespace test
} // namespace megdnn

// vim: syntax=cpp.doxygen