/**
 * \file dnn/test/cuda/convolution.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "test/common/convolution.h"

#include "megdnn/dtype.h"
#include "megdnn/opr_param_defs.h"
#include "megdnn/oprs.h"
#include "src/cuda/utils.h"
#include "test/common/accuracy_shake_checker.h"
#include "test/common/checker.h"
#include "test/common/rng.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h"

#define V1(x) #x
#define V(x) V1(x)
#define CUDNN_VERSION_STRING \
    "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)

namespace megdnn {
namespace test {

TEST_F(CUDA, CONVOLUTION_8X8X32) {
    if (!cuda::is_compute_capability_required(6, 1)) {
        printf("Skip CUDA.CONVOLUTION_8X8X32 test as current device "
               "doesn't support\n");
        return;
    }
    using namespace convolution;
    std::vector<TestArg> args;
    {
        auto v = get_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    {
        auto v = get_dilated_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    {
        auto v = get_chanwise_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    Checker<ConvolutionForward> checker(handle_cuda());
    UniformIntRNG rng(-4, 4);
    for (auto arg : args) {
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        checker.set_dtype(0, dtype::Int8())
                .set_dtype(1, dtype::Int8())
                .set_dtype(2, dtype::Int32())
                .set_param(arg.param)
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .execs({arg.src, arg.filter, {}});
    }
}
TEST_F(CUDA, CONVOLUTION_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionForward> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        checker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        checker.set_dtype(0, dtype::BFloat16())
                .set_dtype(1, dtype::BFloat16())
                .set_dtype(2, dtype::BFloat16())
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}
TEST_F(CUDA, CONV_FORWARD_MATMUL_NCHW4) {
    if (!cuda::is_compute_capability_required(6, 1))
        return;
    using namespace convolution;
    Checker<Convolution> checker(handle_cuda());
    UniformIntRNG int_rng{-127, 127};
    Convolution::Param param;
    param.format = Convolution::Param::Format::NCHW4;
    checker.set_dtype(0, dtype::QuantizedS8(0.132f))
            .set_dtype(1, dtype::QuantizedS8(0.0239f))
            .set_dtype(2, dtype::QuantizedS32(0.132f * 0.0239f))
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_param(param);
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
                              "MATMUL8X8X32", {})
                              .c_str(),
                      {}}}}));
    param.sparse = Convolution::Param::Sparse::DENSE;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    checker.set_param(param);
    checker.exec({{8, 4, 10, 10, 4}, {16, 4, 3, 3, 4}, {}});
    checker.exec({{1, 4, 2, 2, 4}, {16, 4, 3, 3, 4}, {}});
    checker.exec({{8, 64, 12, 12, 4}, {256, 64, 3, 3, 4}, {}});
}
TEST_F(CUDA, CONVOLUTION_1X1_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_1x1_args();
    Checker<ConvolutionForward> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}
TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_1x1_args();
    Benchmarker<ConvolutionForward> marker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        marker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        if (!cuda::is_compute_capability_required(6, 0)) {
            src.dtype = dst.dtype = filter.dtype = dtype::Float16();
            checker.set_rng(0, &rng)
                    .set_rng(1, &rng)
                    .set_epsilon(1e-1)
                    .set_param(arg.param)
                    .exec(TensorLayoutArray{filter, dst, src});
            arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
            checker.set_rng(0, &rng)
                    .set_rng(1, &rng)
                    .set_epsilon(1e-1)
                    .set_param(arg.param)
                    .exec(TensorLayoutArray{filter, dst, src});
        }
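        // "DATD" in the algorithm name below is kept as-is: it appears to be
        // a historical spelling preserved in the registered algorithm name,
        // and AlgoChecker matches by exact string.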
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(ExecutionPolicyAlgoName{
                        "CONVOLUTION_BACKWARD_DATD_BFLOAT16",
                        {{"MATMUL", {{"CUBLAS", {}}}}}}));
        src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        checker.reset_before_exec_callback();
        checker.opr()->execution_policy() = {};
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FP16_CUDNN7_5) {
    // algo CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 with TensorCore operations
    // produces incorrect results (possibly fixed in later NVIDIA releases).
    // The failing case is:
    // inp={2x8x18x18}, kern={8x8x2x2}, pad_h=pad_w=2, stride_h=stride_w=2,
    // dtype=float16
    using namespace convolution;
    std::vector<TestArg> args = get_args_cudnn_5_1_backward();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        scale = std::max(scale, 1.f);
        UniformFloatRNG rng(scale, 2 * scale);
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_NHWC) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_CUDNN) {
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("CUDNN_CONVOLUTION"));
    //! noncontiguous case
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_param(param).execl(TensorLayoutArray{
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
        });
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_MATMUL) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
    //! noncontiguous case
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_param(param).execl(TensorLayoutArray{
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
        });
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW4_DP4A) {
    if (!cuda::is_compute_capability_required(6, 1)) {
        printf("Skip CUDA.CONVOLUTION_BACKWARD_DATA_INT8_NCHW4_DP4A test as "
               "current device doesn't support\n");
        return;
    }
    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nchw4_conv_bwd_data();
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        std::string to_string() {
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m, threadblock_n,
                    threadblock_k, warp_m, warp_n, warp_k, stage);
        }
    };
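    // For example, AlgoParam{16, 64, 8, 16, 64, 8, 2} selects the algorithm
    // named "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_16X64X8_16X64X8_2stage".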
    std::vector<AlgoParam> all_params;
    all_params.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8, 2});
    all_params.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2});
    all_params.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1});
    all_params.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2});
    for (auto algo_param : all_params) {
        Checker<ConvolutionBackwardData> checker(handle_cuda());
        std::string algo_name(ssprintf(
                "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM%s", algo_param.to_string().c_str()));
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(algo_name.c_str()));
        checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
        for (auto&& arg : args) {
            UniformIntRNG rng(-3, 3);
            auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
            auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
            TensorLayout dst;
            dst.dtype = dtype::QuantizedS8{1.2f};
            {
                auto opr = handle_cuda()->create_operator<Convolution>();
                opr->param() = arg.param;
                opr->deduce_layout(src, filter, dst);
            }
            checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                    TensorLayoutArray{filter, dst, src});
        }
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW_DP4A) {
    if (!cuda::is_compute_capability_required(6, 1)) {
        printf("Skip CUDA.CONVOLUTION_BACKWARD_DATA_INT8_NCHW_DP4A test as "
               "current device doesn't support\n");
        return;
    }
    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nchw_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("INT8_NCHW_DOTPROD_IMPLICIT_GEMM"));
    checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
    for (auto&& arg : args) {
        UniformIntRNG rng(-3, 3);
        auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
        auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
        TensorLayout dst;
        dst.dtype = dtype::QuantizedS8{1.2f};
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                TensorLayoutArray{filter, dst, src});
    }
}
#if CUDA_VERSION >= 10020
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NHWC_IMMA) {
    if (!cuda::is_compute_capability_required(7, 5)) {
        printf("Skip CUDA.CONVOLUTION_BACKWARD_DATA_INT8_NHWC_IMMA test as "
               "current device doesn't support\n");
        return;
    }
    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nhwc_conv_bwd_data();
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        int access_size;
        std::string to_string() {
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_%dstage_%d", threadblock_m, threadblock_n,
                    threadblock_k, warp_m, warp_n, warp_k, stage, access_size);
        }
    };
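    // Same naming scheme as the NCHW4 DP4A test above, with a trailing access
    // size: e.g. AlgoParam{64, 16, 32, 64, 16, 32, 2, 4} selects
    // "INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2stage_4".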
    std::vector<AlgoParam> all_params;
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 4});
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 8});
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 16});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 4});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 8});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 16});
    for (auto algo_param : all_params) {
        Checker<ConvolutionBackwardData> checker(handle_cuda());
        std::string algo_name(ssprintf(
                "INT8_NHWC_IMMA_IMPLICIT_GEMM%s", algo_param.to_string().c_str()));
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(algo_name.c_str()));
        checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
        for (auto&& arg : args) {
            UniformIntRNG rng(-3, 3);
            auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
            auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
            TensorLayout dst;
            dst.dtype = dtype::QuantizedS8{1.2f};
            {
                auto opr = handle_cuda()->create_operator<Convolution>();
                opr->param() = arg.param;
                opr->deduce_layout(src, filter, dst);
            }
            checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                    TensorLayoutArray{filter, dst, src});
        }
    }
}
#endif
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5) {
    // BRAIN-481: fails on compute capability 7.0; remove the following if
    // statement once cuDNN fixes the problem.
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    std::vector<TestArg> args = get_args_cudnn_7_5_failures();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        scale = std::max(scale, 1.f);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    bool f16_checked = false;
    for (auto&& arg : args) {
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        float scale = 1.0f / sqrt(dst[2] * dst[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        // a reduction over a large f16 array may introduce significant error,
        // so only one large f16 case is checked
        if (dst.total_nr_elems() >= 1000 && f16_checked)
            continue;
        f16_checked = true;
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardFilter>(ExecutionPolicyAlgoName{
                        "CONVOLUTION_BACKWARD_FILTER_BFLOAT16",
                        {{"MATMUL", {{"CUBLAS", {}}}}}}));
        src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        checker.reset_before_exec_callback();
        checker.opr()->execution_policy() = {};
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_MATMUL) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
    for (auto&& arg : args) {
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        float scale = 1.0f / sqrt(dst[2] * dst[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
    }
    //! noncontiguous case
    {
        NormalRNG default_rng;
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_param(param)
                .execl(TensorLayoutArray{
                        {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                        {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                        {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()}});
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_CUDNN) {
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardFilter>("CUDNN_CONVOLUTION"));
    //! noncontiguous case
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_param(param).execl(TensorLayoutArray{
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()}});
    }
}
TEST_F(CUDA, CONV_CONFIG_COMBINATIONS) {
    auto eps_getter = [](bool f16, int stage, const char* name) -> float {
        if (f16) {
            return stage == 2 ? 0.5 : 0.2;
        }
        if (strstr(name, "WINOGRAD_NONFUSED"))
            return 0.3;
        return 1e-3;
    };
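    // Tolerances are loosened where precision is inherently lower: f16 runs
    // get 0.5 (stage 2) or 0.2, WINOGRAD_NONFUSED float runs get 0.3, and
    // everything else is checked at 1e-3.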
    convolution::test_conv_config_combinations(
            2, handle_cuda(), false, true, true, eps_getter, true);
    convolution::test_conv_config_combinations(
            3, handle_cuda(), false, true, true, eps_getter, true);
    convolution::test_conv_config_combinations(
            5, handle_cuda(), false, true, true, eps_getter, true);
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_1) {
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            "CUDNN_CONVOLUTION_BWD_DATA_ALGO_1" CUDNN_VERSION_STRING));
    NormalRNG default_rng;
    TensorShape s_filter = TensorShape{8, 8, 2, 2}, s_src = TensorShape{2, 8, 18, 18};
    float scale = 1.0f / sqrt(s_filter[0] * s_filter[2] * s_filter[3]);
    UniformFloatRNG rng(scale, 2 * scale);
    auto src = TensorLayout(s_src, dtype::Float16());
    auto filter = TensorLayout(s_filter, dtype::Float16());
    TensorLayout dst;
    param::Convolution param;
    param.pad_h = param.pad_w = 2;
    param.stride_h = param.stride_w = 2;
    {
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(src, filter, dst);
    }
    src.dtype = dst.dtype = filter.dtype = dtype::Float16();
    param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
    checker.set_rng(0, &rng).set_rng(1, &rng).set_epsilon(0.2).set_param(param).exec(
            TensorLayoutArray{filter, dst, src});
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DEPTHWISE_LARGE_FILTER) {
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
    for (auto dtype : std::vector<DType>{
                 dtype::Float32(),
#if CUDA_VERSION >= 9000
                 dtype::Float16()
#endif
         }) {
        auto run = [&checker, &dtype](
                           size_t n, size_t g, size_t h, size_t fh, size_t padding,
                           size_t stride) {
            param::Convolution param;
            param.stride_h = param.stride_w = stride;
            param.pad_h = param.pad_w = padding;
            param.mode = Convolution::Mode::CROSS_CORRELATION;
            param.sparse = param::Convolution::Sparse::GROUP;
            checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
            float scale = 64.f / sqrt(fh * fh);
            UniformFloatRNG rng(scale, scale * 2);
            checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &rng);
            if (dtype.enumv() == DTypeEnum::Float16)
                checker.set_epsilon(1e-1);
            checker.set_param(param).execs(
                    {{g, 1, 1, fh, fh},
                     {n, g, (h + 2 * padding - fh + 1) / stride,
                      (h + 2 * padding - fh + 1) / stride},
                     {n, g, h, h}});
        };
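        // run(n, g, h, fh, padding, stride): odd filter sizes from 5 to 31;
        // the stride-1 cases use SAME-style fh / 2 padding, while most of the
        // stride-2 cases use a tighter fh / 3 padding on larger inputs.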
        run(4, 8, 32, 5, 5 / 2, 1);
        run(4, 8, 32, 7, 7 / 2, 1);
        run(4, 8, 32, 9, 9 / 2, 1);
        run(4, 8, 32, 11, 11 / 2, 1);
        run(4, 8, 32, 13, 13 / 2, 1);
        run(4, 8, 32, 15, 15 / 2, 1);
        run(4, 8, 32, 17, 17 / 2, 1);
        run(4, 8, 32, 19, 19 / 2, 1);
        run(4, 8, 32, 21, 21 / 2, 1);
        run(4, 8, 32, 23, 23 / 2, 1);
        run(4, 8, 32, 25, 25 / 2, 1);
        run(4, 8, 32, 27, 27 / 2, 1);
        run(4, 8, 32, 29, 29 / 2, 1);
        run(4, 8, 32, 31, 31 / 2, 1);
        run(4, 8, 64, 5, 5 / 2, 2);
        run(4, 8, 64, 7, 7 / 3, 2);
        run(4, 8, 64, 9, 9 / 3, 2);
        run(4, 8, 64, 11, 11 / 3, 2);
        run(4, 8, 64, 13, 13 / 3, 2);
        run(4, 8, 64, 15, 15 / 3, 2);
        run(4, 8, 64, 17, 17 / 3, 2);
        run(4, 8, 64, 19, 19 / 3, 2);
        run(4, 8, 64, 21, 21 / 3, 2);
        run(4, 8, 64, 23, 23 / 3, 2);
        run(4, 8, 64, 25, 25 / 3, 2);
        run(4, 8, 64, 27, 27 / 3, 2);
        run(4, 8, 64, 29, 29 / 3, 2);
        run(4, 8, 64, 31, 31 / 3, 2);
        run(1, 2, 128, 31, 31 / 3, 2);
        run(1, 2, 256, 31, 31 / 3, 2);
    }
}
#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, CONV_FWD_BENCHMARK) {
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t SH = 1,
                   size_t SW = 1, size_t FH = 1, size_t FW = 1, size_t PH = 0,
                   size_t PW = 0, bool fp16io_c32 = false) {
        auto benchmarker = Benchmarker<ConvolutionForward>(handle_cuda());
        benchmarker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        ConvolutionForward::Param param;
        param.stride_h = SH;
        param.stride_w = SW;
        param.pad_h = PH;
        param.pad_w = PW;
        if (fp16io_c32) {
            param.compute_mode = ConvolutionForward::Param::ComputeMode::FLOAT32;
        }
        benchmarker.set_param(param);
        std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
                new OprProxy<ConvolutionForward>{true}};
        benchmarker.set_proxy(proxy);
        size_t OH = (IH - FH + 2 * PH) / SH + 1;
        size_t OW = (IW - FW + 2 * PW) / SW + 1;
        auto time =
                benchmarker.execs({{N, IC, IH, IW}, {OC, IC, FH, FW}, {N, OC, OH, OW}});
        time /= 1000.0 * 10.0;
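        // flo counts 2 FLOPs (one multiply, one add) per output MAC; the
        // division above presumably averages over 10 runs and converts ms to
        // seconds, so the / 1e12 below reports TFLOPS.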
        auto flo = (double)N * OC * IC * OH * OW * FH * FW * 2;
        auto flops = flo / time / 1e12;
        printf("comp_type %s: ", fp16io_c32 ? "32" : "16");
        printf("%.3fG FLO, flops %.3fTFLOPS\n", flo / 1e9, flops);
    };
    run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, false);
    run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, true);
}
TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) {
    CUBenchmarker<ConvolutionForward> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({src, filter, dst}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({src, filter, dst}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({src, filter, dst}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
               "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fTFLOPS\n",
               time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
               (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
               (flo / (time_ms_pseudo_fp16 * 1e9)));
        printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
               time_ms_fp32 / time_ms_true_fp16,
               time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
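    // The layer shapes below roughly follow a ResNet-style workload: a 7x7
    // stem plus 3x3 and 1x1 bottleneck convolutions at various strides.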
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}
TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({filter, dst, src}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({filter, dst, src}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
               "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fTFLOPS\n",
               time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
               (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
               (flo / (time_ms_pseudo_fp16 * 1e9)));
        printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
               time_ms_fp32 / time_ms_true_fp16,
               time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}
TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_DEPTHWISE_LARGE_FILTER_FP32) {
    CUBenchmarker<ConvolutionBackwardData> bencher{handle_cuda()};
    bencher.set_display(false);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
    auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t nr_times) {
        bencher.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = FH / 2;
        param.sparse = param::Convolution::Sparse::GROUP;
        bencher.set_param(param);
        bencher.set_times(nr_times);
        TensorLayout src{{N, g, IH, IW}, dtype::Float32()},
                filter{{g, 1, 1, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bencher.execl({filter, dst, src}) / nr_times;
        float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp32,
               (flo / (time_ms_fp32 * 1e9)));
    };
    run(64, 384, 384, 32, 32, 3, 1, 10);
    run(64, 384, 384, 32, 32, 5, 1, 10);
    run(64, 384, 384, 32, 32, 7, 1, 10);
    run(64, 384, 384, 32, 32, 9, 1, 10);
    run(64, 384, 384, 32, 32, 11, 1, 10);
    run(64, 384, 384, 32, 32, 13, 1, 10);
    run(64, 384, 384, 32, 32, 15, 1, 10);
    run(64, 384, 384, 32, 32, 17, 1, 10);
    run(64, 384, 384, 32, 32, 19, 1, 10);
    run(64, 384, 384, 32, 32, 21, 1, 10);
    run(64, 384, 384, 32, 32, 23, 1, 10);
    run(64, 384, 384, 32, 32, 25, 1, 10);
    run(64, 384, 384, 32, 32, 27, 1, 10);
    run(64, 384, 384, 32, 32, 29, 1, 10);
    run(64, 384, 384, 32, 32, 31, 1, 10);
}
TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_DEPTHWISE_LARGE_FILTER_FP16) {
    CUBenchmarker<ConvolutionBackwardData> bencher{handle_cuda()};
    bencher.set_display(false);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
    auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t nr_times) {
        bencher.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = FH / 2;
        param.sparse = param::Convolution::Sparse::GROUP;
        bencher.set_param(param);
        bencher.set_times(nr_times);
        TensorLayout src{{N, g, IH, IW}, dtype::Float16()},
                filter{{g, 1, 1, FH, FH}, dtype::Float16()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp16 = bencher.execl({filter, dst, src}) / nr_times;
        float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp16=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp16,
               (flo / (time_ms_fp16 * 1e9)));
    };
    run(64, 384, 384, 32, 32, 3, 1, 10);
    run(64, 384, 384, 32, 32, 5, 1, 10);
    run(64, 384, 384, 32, 32, 7, 1, 10);
    run(64, 384, 384, 32, 32, 9, 1, 10);
    run(64, 384, 384, 32, 32, 11, 1, 10);
    run(64, 384, 384, 32, 32, 13, 1, 10);
    run(64, 384, 384, 32, 32, 15, 1, 10);
    run(64, 384, 384, 32, 32, 17, 1, 10);
    run(64, 384, 384, 32, 32, 19, 1, 10);
    run(64, 384, 384, 32, 32, 21, 1, 10);
    run(64, 384, 384, 32, 32, 23, 1, 10);
    run(64, 384, 384, 32, 32, 25, 1, 10);
    run(64, 384, 384, 32, 32, 27, 1, 10);
    run(64, 384, 384, 32, 32, 29, 1, 10);
    run(64, 384, 384, 32, 32, 31, 1, 10);
}
TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_BF16) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::BFloat16())
                .set_dtype(1, dtype::BFloat16())
                .set_dtype(2, dtype::BFloat16());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy = {};
        TensorLayout src{{N, IC, IH, IW}, dtype::BFloat16()},
                filter{{OC, IC, FH, FH}, dtype::BFloat16()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto used = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_bf16=%.2fms, flops=%.3fTFLOPS\n", used, (flo / (used * 1e9)));
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}
TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_INT8_DP4A) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::QuantizedS8{1.0f})
                .set_dtype(1, dtype::QuantizedS8{1.0f})
                .set_dtype(2, dtype::QuantizedS8{1.0f});
        param::Convolution param;
        param.format = param::Convolution::Format::NCHW4;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy = {};
        TensorLayout src{{N, IC / 4, IH, IW, 4}, dtype::QuantizedS8{1.0f}},
                filter{{OC, IC / 4, FH, FH, 4}, dtype::QuantizedS8{1.0f}};
        TensorLayout dst;
        dst.dtype = dtype::QuantizedS8{1.0f};
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto used = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_int8=%.2fms, flops=%.3fTFLOPS\n", used, (flo / (used * 1e9)));
    };
    run(64, 32, 32, 92, 180, 4, 2, 2);
    run(64, 32, 32, 46, 80, 4, 2, 2);
    run(16, 16, 16, 92, 180, 4, 2, 2);
    run(16, 16, 16, 46, 80, 4, 2, 2);
}
TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) {
    CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({src, dst, filter}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({src, dst, filter}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
               "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fTFLOPS\n",
               time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
               (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
               (flo / (time_ms_pseudo_fp16 * 1e9)));
        printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
               time_ms_fp32 / time_ms_true_fp16,
               time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}
TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_FILTER_DEPTHWISE_LARGE_FILTER) {
    CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    bench.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
            "CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFTv7.6.3"));
    auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = FH / 2;
        param.sparse = param::Convolution::Sparse::GROUP;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, g, IH, IW}, dtype::Float32()},
                filter{{g, 1, 1, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS;
        float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp32,
               (flo / (time_ms_fp32 * 1e9)));
    };
    run(64, 384, 384, 32, 32, 31, 1, 15);
}
#endif

#undef CUDNN_VERSION_STRING
#undef V
#undef V1

} // namespace test
} // namespace megdnn

// vim: syntax=cpp.doxygen