/**
 * \file dnn/test/cuda/convolution.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "test/common/convolution.h"
#include "megdnn/dtype.h"
#include "megdnn/opr_param_defs.h"
#include "megdnn/oprs.h"
#include "src/cuda/utils.h"
#include "test/common/accuracy_shake_checker.h"
#include "test/common/checker.h"
#include "test/common/rng.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h"

#define V1(x) #x
#define V(x) V1(x)
#define CUDNN_VERSION_STRING \
    "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
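// V/V1 form the standard two-level stringization idiom: the extra level of
// indirection lets CUDNN_MAJOR/CUDNN_MINOR/CUDNN_PATCHLEVEL expand to their
// numeric values before # stringizes them, so with cuDNN 7.6.3 the macro
// yields "v7.6.3"; a single-level "#x" would produce "CUDNN_MAJOR" instead.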

namespace megdnn {
namespace test {

TEST_F(CUDA, CONVOLUTION_8X8X32) {
    if (!cuda::is_compute_capability_required(6, 1)) {
        printf("Skip CUDA.CONVOLUTION_8X8X32 test as current device "
               "doesn't support it\n");
        return;
    }

    using namespace convolution;
    std::vector<TestArg> args;
    {
        auto v = get_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    {
        auto v = get_dilated_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    {
        auto v = get_chanwise_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    Checker<ConvolutionForward> checker(handle_cuda());
    UniformIntRNG rng(-4, 4);
    for (auto arg : args) {
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        checker.set_dtype(0, dtype::Int8())
                .set_dtype(1, dtype::Int8())
                .set_dtype(2, dtype::Int32())
                .set_param(arg.param)
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .execs({arg.src, arg.filter, {}});
    }
}

TEST_F(CUDA, CONVOLUTION_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionForward> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        checker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        checker.set_dtype(0, dtype::BFloat16())
                .set_dtype(1, dtype::BFloat16())
                .set_dtype(2, dtype::BFloat16())
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}

TEST_F(CUDA, CONV_FORWARD_MATMUL_NCHW4) {
    if (!cuda::is_compute_capability_required(6, 1))
        return;
    using namespace convolution;
    Checker<Convolution> checker(handle_cuda());
    UniformIntRNG int_rng{-127, 127};
    Convolution::Param param;
    param.format = Convolution::Param::Format::NCHW4;
    checker.set_dtype(0, dtype::QuantizedS8(0.132f))
            .set_dtype(1, dtype::QuantizedS8(0.0239f))
            .set_dtype(2, dtype::QuantizedS32(0.132f * 0.0239f))
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_param(param);
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
                              "MATMUL8X8X32", {})
                              .c_str(),
                      {}}}}));
    param.sparse = Convolution::Param::Sparse::DENSE;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    checker.set_param(param);
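    // NCHW4 packs channels in groups of four: src is {N, C/4, H, W, 4} and the
    // filter {OC, IC/4, FH, FW, 4}, so the first case below is a dense 3x3
    // convolution with N=8, IC=16, OC=16 on 10x10 inputs.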
    checker.exec({{8, 4, 10, 10, 4}, {16, 4, 3, 3, 4}, {}});
    checker.exec({{1, 4, 2, 2, 4}, {16, 4, 3, 3, 4}, {}});
    checker.exec({{8, 64, 12, 12, 4}, {256, 64, 3, 3, 4}, {}});
}

TEST_F(CUDA, CONVOLUTION_1X1_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_1x1_args();
    Checker<ConvolutionForward> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_1x1_args();
    Benchmarker<ConvolutionForward> marker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        marker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
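        // ConvolutionBackwardData takes its inputs as (filter, diff, grad):
        // the forward output layout `dst` deduced above acts as the incoming
        // gradient, and `src` is the layout being reconstructed, hence the
        // {filter, dst, src} ordering in the exec calls below.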
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        if (!cuda::is_compute_capability_required(6, 0)) {
            src.dtype = dst.dtype = filter.dtype = dtype::Float16();
            checker.set_rng(0, &rng)
                    .set_rng(1, &rng)
                    .set_epsilon(1e-1)
                    .set_param(arg.param)
                    .exec(TensorLayoutArray{filter, dst, src});
            arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
            checker.set_rng(0, &rng)
                    .set_rng(1, &rng)
                    .set_epsilon(1e-1)
                    .set_param(arg.param)
                    .exec(TensorLayoutArray{filter, dst, src});
        }
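        // The algo name string below must match the backward-data BFloat16
        // algorithm name registered by the CUDA backend verbatim; the "DATD"
        // spelling appears to be the registered name, so "correcting" it here
        // would make the AlgoChecker lookup fail.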
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(ExecutionPolicyAlgoName{
                        "CONVOLUTION_BACKWARD_DATD_BFLOAT16",
                        {{"MATMUL", {{"CUBLAS", {}}}}}}));
        src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        checker.reset_before_exec_callback();
        checker.opr()->execution_policy() = {};
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FP16_CUDNN7_5) {
    // Algo CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 with TensorCore operations
    // produced incorrect results (NVIDIA may have fixed the issue by now).
    // The case that triggered the bug:
    // inp={2x8x18x18}, kern={8x8x2x2}, pad_h=pad_w=2, stride_h=stride_w=2,
    // dtype=float16
    using namespace convolution;
    std::vector<TestArg> args = get_args_cudnn_5_1_backward();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        scale = std::max(scale, 1.f);
        UniformFloatRNG rng(scale, 2 * scale);
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_NHWC) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_CUDNN) {
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("CUDNN_CONVOLUTION"));
    //! noncontiguous case
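    // The src/dst layouts below use a batch stride of 1568 = 2 * (16*7*7),
    // twice the contiguous value, so every other batch worth of memory is
    // skipped; the filter layout itself stays contiguous.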
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_param(param).execl(TensorLayoutArray{
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
        });
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_MATMUL) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
    //! noncontiguous case
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_param(param).execl(TensorLayoutArray{
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
        });
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW4_DP4A) {
    if (!cuda::is_compute_capability_required(6, 1)) {
        printf("Skip CUDA.CONVOLUTION_BACKWARD_DATA_INT8_NCHW4_DP4A test as "
               "current device doesn't support it\n");
        return;
    }
    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nchw4_conv_bwd_data();
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        std::string to_string() {
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m, threadblock_n,
                    threadblock_k, warp_m, warp_n, warp_k, stage);
        }
    };
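    // AlgoParam::to_string() reproduces the suffix naming convention of the
    // generated kernels; e.g. {16, 64, 8, 16, 64, 8, 2} selects the algorithm
    // "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_16X64X8_16X64X8_2stage".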
    std::vector<AlgoParam> all_params;
    all_params.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8, 2});
    all_params.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2});
    all_params.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1});
    all_params.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2});
    for (auto algo_param : all_params) {
        Checker<ConvolutionBackwardData> checker(handle_cuda());
        std::string algo_name(ssprintf(
                "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM%s", algo_param.to_string().c_str()));
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(algo_name.c_str()));
        checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
        for (auto&& arg : args) {
            UniformIntRNG rng(-3, 3);
            auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
            auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
            TensorLayout dst;
            dst.dtype = dtype::QuantizedS8{1.2f};
            {
                auto opr = handle_cuda()->create_operator<Convolution>();
                opr->param() = arg.param;
                opr->deduce_layout(src, filter, dst);
            }
            checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                    TensorLayoutArray{filter, dst, src});
        }
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW_DP4A) {
    if (!cuda::is_compute_capability_required(6, 1)) {
        printf("Skip CUDA.CONVOLUTION_BACKWARD_DATA_INT8_NCHW_DP4A test as "
               "current device doesn't support it\n");
        return;
    }
    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nchw_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("INT8_NCHW_DOTPROD_IMPLICIT_GEMM"));
    checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
    for (auto&& arg : args) {
        UniformIntRNG rng(-3, 3);
        auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
        auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
        TensorLayout dst;
        dst.dtype = dtype::QuantizedS8{1.2f};
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                TensorLayoutArray{filter, dst, src});
    }
}

#if CUDA_VERSION >= 10020
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NHWC_IMMA) {
    if (!cuda::is_compute_capability_required(7, 5)) {
        printf("Skip CUDA.CONVOLUTION_BACKWARD_DATA_INT8_NHWC_IMMA test as "
               "current device doesn't support it\n");
        return;
    }
    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nhwc_conv_bwd_data();
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        int access_size;
        std::string to_string() {
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_%dstage_%d", threadblock_m, threadblock_n,
                    threadblock_k, warp_m, warp_n, warp_k, stage, access_size);
        }
    };
    std::vector<AlgoParam> all_params;
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 4});
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 8});
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 16});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 4});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 8});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 16});
    for (auto algo_param : all_params) {
        Checker<ConvolutionBackwardData> checker(handle_cuda());
        std::string algo_name(ssprintf(
                "INT8_NHWC_IMMA_IMPLICIT_GEMM%s", algo_param.to_string().c_str()));
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(algo_name.c_str()));
        checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
        for (auto&& arg : args) {
            UniformIntRNG rng(-3, 3);
            auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
            auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
            TensorLayout dst;
            dst.dtype = dtype::QuantizedS8{1.2f};
            {
                auto opr = handle_cuda()->create_operator<Convolution>();
                opr->param() = arg.param;
                opr->deduce_layout(src, filter, dst);
            }
            checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                    TensorLayoutArray{filter, dst, src});
        }
    }
}
#endif

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5) {
    // BRAIN-481: fails on compute capability 7.0; remove the following if
    // statement once cuDNN fixes the problem.
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    std::vector<TestArg> args = get_args_cudnn_7_5_failures();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        scale = std::max(scale, 1.f);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    bool f16_checked = false;
    for (auto&& arg : args) {
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        float scale = 1.0f / sqrt(dst[2] * dst[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        // reducing over a large f16 array may introduce significant error, so
        // run the f16 checks at most once for large outputs
        if (dst.total_nr_elems() >= 1000 && f16_checked)
            continue;
        f16_checked = true;
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardFilter>(ExecutionPolicyAlgoName{
                        "CONVOLUTION_BACKWARD_FILTER_BFLOAT16",
                        {{"MATMUL", {{"CUBLAS", {}}}}}}));
        src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        checker.reset_before_exec_callback();
        checker.opr()->execution_policy() = {};
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_MATMUL) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
    for (auto&& arg : args) {
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        float scale = 1.0f / sqrt(dst[2] * dst[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
    }
    //! noncontiguous case
    {
        NormalRNG default_rng;
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_param(param)
                .execl(TensorLayoutArray{
                        {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                        {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                        {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()}});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_CUDNN) {
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardFilter>("CUDNN_CONVOLUTION"));
    //! noncontiguous case
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_param(param).execl(TensorLayoutArray{
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()}});
    }
}

TEST_F(CUDA, CONV_CONFIG_COMBINATIONS) {
    auto eps_getter = [](bool f16, int stage, const char* name) -> float {
        if (f16) {
            return stage == 2 ? 0.5 : 0.2;
        }
        if (strstr(name, "WINOGRAD_NONFUSED"))
            return 0.3;
        return 1e-3;
    };
    convolution::test_conv_config_combinations(
            2, handle_cuda(), false, true, true, eps_getter, true);
    convolution::test_conv_config_combinations(
            3, handle_cuda(), false, true, true, eps_getter, true);
    convolution::test_conv_config_combinations(
            5, handle_cuda(), false, true, true, eps_getter, true);
}

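// The next test pins CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 on the same
// inp={2x8x18x18}, kern={8x8x2x2}, pad=2, stride=2 float16 case documented in
// CONVOLUTION_BACKWARD_DATA_FP16_CUDNN7_5 above as having produced wrong
// results with TensorCore operations.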
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_1) {
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            "CUDNN_CONVOLUTION_BWD_DATA_ALGO_1" CUDNN_VERSION_STRING));
    NormalRNG default_rng;
    TensorShape s_filter = TensorShape{8, 8, 2, 2}, s_src = TensorShape{2, 8, 18, 18};
    float scale = 1.0f / sqrt(s_filter[0] * s_filter[2] * s_filter[3]);
    UniformFloatRNG rng(scale, 2 * scale);
    auto src = TensorLayout(s_src, dtype::Float16());
    auto filter = TensorLayout(s_filter, dtype::Float16());
    TensorLayout dst;
    param::Convolution param;
    param.pad_h = param.pad_w = 2;
    param.stride_h = param.stride_w = 2;
    {
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(src, filter, dst);
    }
    src.dtype = dst.dtype = filter.dtype = dtype::Float16();
    param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
    checker.set_rng(0, &rng).set_rng(1, &rng).set_epsilon(0.2).set_param(param).exec(
            TensorLayoutArray{filter, dst, src});
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DEPTHWISE_LARGE_FILTER) {
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
    for (auto dtype : std::vector<DType>{dtype::Float16()}) {
        auto run = [&checker, &dtype](size_t n, size_t g, size_t h, size_t fh) {
            param::Convolution param;
            param.stride_h = param.stride_w = 1;
            param.pad_h = param.pad_w = fh / 2;
            param.mode = Convolution::Mode::CROSS_CORRELATION;
            param.sparse = param::Convolution::Sparse::GROUP;
            checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
            checker.set_param(param).execs(
                    {{g, 1, 1, fh, fh}, {n, g, h, h}, {n, g, h, h}});
        };
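        // run(n, g, h, fh) is a depthwise (GROUP-sparse, one channel per
        // group) backward-data pass on n x g x h x h tensors with an fh x fh
        // filter; fh/2 padding at stride 1 keeps the spatial size unchanged
        // for the odd fh values used below.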
        run(4, 8, 32, 5);
        run(4, 8, 32, 7);
        run(4, 8, 32, 9);
        run(4, 8, 32, 11);
        run(4, 8, 32, 13);
        run(4, 8, 32, 15);
        run(4, 8, 32, 17);
        run(4, 8, 32, 19);
        run(4, 8, 32, 21);
        run(4, 8, 32, 23);
        run(4, 8, 32, 25);
        run(4, 8, 32, 27);
        run(4, 8, 32, 29);
        run(4, 8, 32, 31);
        run(4, 8, 64, 7);
        run(4, 8, 64, 5);
        run(4, 8, 64, 9);
        run(4, 8, 64, 11);
        run(4, 8, 64, 13);
        run(4, 8, 64, 15);
        run(4, 8, 64, 17);
        run(4, 8, 64, 19);
        run(4, 8, 64, 21);
        run(4, 8, 64, 23);
        run(4, 8, 64, 25);
        run(4, 8, 64, 27);
        run(4, 8, 64, 29);
        run(4, 8, 64, 31);
        run(1, 2, 128, 31);
        run(1, 2, 256, 31);
    }
}

#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, CONV_FWD_BENCHMARK) {
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t SH = 1,
                   size_t SW = 1, size_t FH = 1, size_t FW = 1, size_t PH = 0,
                   size_t PW = 0, bool fp16io_c32 = false) {
        auto benchmarker = Benchmarker<ConvolutionForward>(handle_cuda());
        benchmarker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        ConvolutionForward::Param param;
        param.stride_h = SH;
        param.stride_w = SW;
        param.pad_h = PH;
        param.pad_w = PW;
        if (fp16io_c32) {
            param.compute_mode = ConvolutionForward::Param::ComputeMode::FLOAT32;
        }
        benchmarker.set_param(param);
        std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
                new OprProxy<ConvolutionForward>{true}};
        benchmarker.set_proxy(proxy);
        size_t OH = (IH - FH + 2 * PH) / SH + 1;
        size_t OW = (IW - FW + 2 * PW) / SW + 1;
        auto time =
                benchmarker.execs({{N, IC, IH, IW}, {OC, IC, FH, FW}, {N, OC, OH, OW}});
        time /= 1000.0 * 10.0;
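        // Each output element takes IC*FH*FW multiply-accumulates, i.e. two
        // floating-point operations, giving flo = 2*N*OC*OH*OW*IC*FH*FW in
        // total; with `time` converted to seconds per run above (presumably
        // 10 timed runs reported in milliseconds), flo / time / 1e12 is TFLOPS.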
        auto flo = (double)N * OC * IC * OH * OW * FH * FW * 2;
        auto flops = flo / time / 1e12;
        printf("comp_type %s: ", fp16io_c32 ? "32" : "16");
        printf("%.3fG FLO, flops %.3fTFLOPS\n", flo / 1e9, flops);
    };
    run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, false);
    run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, true);
}

TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) {
    CUBenchmarker<ConvolutionForward> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({src, filter, dst}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({src, filter, dst}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({src, filter, dst}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
               "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fTFLOPS\n",
               time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
               (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
               (flo / (time_ms_pseudo_fp16 * 1e9)));
        printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
               time_ms_fp32 / time_ms_true_fp16,
               time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
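    // "true fp16" above means fp16 tensors with fp16 accumulation, while
    // "pseudo fp16" keeps fp16 tensors but switches ComputeMode to FLOAT32 so
    // accumulation runs in fp32; the shapes below appear to roughly follow
    // ResNet-50-style layers.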
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}

TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({filter, dst, src}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({filter, dst, src}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
               "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fTFLOPS\n",
               time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
               (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
               (flo / (time_ms_pseudo_fp16 * 1e9)));
        printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
               time_ms_fp32 / time_ms_true_fp16,
               time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_DEPTHWISE_LARGE_FILTER) {
    CUBenchmarker<ConvolutionBackwardData> bencher{handle_cuda()};
    bencher.set_display(false);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
    auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t nr_times) {
        bencher.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = FH / 2;
        param.sparse = param::Convolution::Sparse::GROUP;
        bencher.set_param(param);
        bencher.set_times(nr_times);
        TensorLayout src{{N, g, IH, IW}, dtype::Float32()},
                filter{{g, 1, 1, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bencher.execl({filter, dst, src}) / nr_times;
        float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp32,
               (flo / (time_ms_fp32 * 1e9)));
    };
    run(64, 384, 384, 32, 32, 3, 1, 10);
    run(64, 384, 384, 32, 32, 5, 1, 10);
    run(64, 384, 384, 32, 32, 7, 1, 10);
    run(64, 384, 384, 32, 32, 9, 1, 10);
    run(64, 384, 384, 32, 32, 11, 1, 10);
    run(64, 384, 384, 32, 32, 13, 1, 10);
    run(64, 384, 384, 32, 32, 15, 1, 10);
    run(64, 384, 384, 32, 32, 17, 1, 10);
    run(64, 384, 384, 32, 32, 19, 1, 10);
    run(64, 384, 384, 32, 32, 21, 1, 10);
    run(64, 384, 384, 32, 32, 23, 1, 10);
    run(64, 384, 384, 32, 32, 25, 1, 10);
    run(64, 384, 384, 32, 32, 27, 1, 10);
    run(64, 384, 384, 32, 32, 29, 1, 10);
    run(64, 384, 384, 32, 32, 31, 1, 10);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_DEPTHWISE_LARGE_FILTER_FP16) {
    CUBenchmarker<ConvolutionBackwardData> bencher{handle_cuda()};
    bencher.set_display(false);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
    auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t nr_times) {
        bencher.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = FH / 2;
        param.sparse = param::Convolution::Sparse::GROUP;
        bencher.set_param(param);
        bencher.set_times(nr_times);
        TensorLayout src{{N, g, IH, IW}, dtype::Float16()},
                filter{{g, 1, 1, FH, FH}, dtype::Float16()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp16 = bencher.execl({filter, dst, src}) / nr_times;
        float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp16=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp16,
               (flo / (time_ms_fp16 * 1e9)));
    };
    run(64, 384, 384, 32, 32, 3, 1, 10);
    run(64, 384, 384, 32, 32, 5, 1, 10);
    run(64, 384, 384, 32, 32, 7, 1, 10);
    run(64, 384, 384, 32, 32, 9, 1, 10);
    run(64, 384, 384, 32, 32, 11, 1, 10);
    run(64, 384, 384, 32, 32, 13, 1, 10);
    run(64, 384, 384, 32, 32, 15, 1, 10);
    run(64, 384, 384, 32, 32, 17, 1, 10);
    run(64, 384, 384, 32, 32, 19, 1, 10);
    run(64, 384, 384, 32, 32, 21, 1, 10);
    run(64, 384, 384, 32, 32, 23, 1, 10);
    run(64, 384, 384, 32, 32, 25, 1, 10);
    run(64, 384, 384, 32, 32, 27, 1, 10);
    run(64, 384, 384, 32, 32, 29, 1, 10);
    run(64, 384, 384, 32, 32, 31, 1, 10);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_BF16) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::BFloat16())
                .set_dtype(1, dtype::BFloat16())
                .set_dtype(2, dtype::BFloat16());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy = {};
        TensorLayout src{{N, IC, IH, IW}, dtype::BFloat16()},
                filter{{OC, IC, FH, FH}, dtype::BFloat16()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto used = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_bf16=%.2fms, flops=%.3fTFLOPS\n", used, (flo / (used * 1e9)));
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_INT8_DP4A) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::QuantizedS8{1.0f})
                .set_dtype(1, dtype::QuantizedS8{1.0f})
                .set_dtype(2, dtype::QuantizedS8{1.0f});
        param::Convolution param;
        param.format = param::Convolution::Format::NCHW4;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy = {};
        TensorLayout src{{N, IC / 4, IH, IW, 4}, dtype::QuantizedS8{1.0f}},
                filter{{OC, IC / 4, FH, FH, 4}, dtype::QuantizedS8{1.0f}};
        TensorLayout dst;
        dst.dtype = dtype::QuantizedS8{1.0f};
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto used = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_int8=%.2fms, flops=%.3fTFLOPS\n", used, (flo / (used * 1e9)));
    };
    run(64, 32, 32, 92, 180, 4, 2, 2);
    run(64, 32, 32, 46, 80, 4, 2, 2);
    run(16, 16, 16, 92, 180, 4, 2, 2);
    run(16, 16, 16, 46, 80, 4, 2, 2);
}

TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) {
    CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({src, dst, filter}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({src, dst, filter}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
               "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fTFLOPS\n",
               time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
               (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
               (flo / (time_ms_pseudo_fp16 * 1e9)));
        printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
               time_ms_fp32 / time_ms_true_fp16,
               time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_FILTER_DEPTHWISE_LARGE_FILTER) {
    CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    bench.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
            "CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFTv7.6.3"));
    auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = FH / 2;
        param.sparse = param::Convolution::Sparse::GROUP;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, g, IH, IW}, dtype::Float32()},
                filter{{g, 1, 1, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS;
        float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp32,
               (flo / (time_ms_fp32 * 1e9)));
    };
    run(64, 384, 384, 32, 32, 31, 1, 15);
}

#endif

#undef CUDNN_VERSION_STRING
#undef V
#undef V1

}  // namespace test
}  // namespace megdnn

// vim: syntax=cpp.doxygen