/**
 * \file dnn/test/cuda/convolution.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "test/common/convolution.h"
#include "megdnn/dtype.h"
#include "megdnn/opr_param_defs.h"
#include "megdnn/oprs.h"
#include "src/cuda/utils.h"
#include "test/common/accuracy_shake_checker.h"
#include "test/common/checker.h"
#include "test/common/rng.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h"
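
// Two-level stringification idiom: V(x) first expands the macro argument, then
// V1 turns the expansion into a string literal, so CUDNN_VERSION_STRING yields
// e.g. "v8.0.4" for cuDNN 8.0.4.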
#define V1(x) #x
#define V(x)  V1(x)
#define CUDNN_VERSION_STRING \
    "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)

namespace megdnn {
namespace test {

TEST_F(CUDA, CONVOLUTION_8X8X32) {
    if (!cuda::is_compute_capability_required(6, 1)) {
        printf("Skip CUDA.CONVOLUTION_8X8X32 test as current device "
               "doesn't support\n");
        return;
    }

    using namespace convolution;
    std::vector<TestArg> args;
    {
        auto v = get_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    {
        auto v = get_dilated_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    {
        auto v = get_chanwise_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    Checker<ConvolutionForward> checker(handle_cuda());
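    // A small integer range keeps the inputs well inside int8 and the int32
    // accumulator far from overflow, so results can be compared exactly.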
    UniformIntRNG rng(-4, 4);
    for (auto arg : args) {
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        checker.set_dtype(0, dtype::Int8())
                .set_dtype(1, dtype::Int8())
                .set_dtype(2, dtype::Int32())
                .set_param(arg.param)
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .execs({arg.src, arg.filter, {}});
    }
}

TEST_F(CUDA, CONVOLUTION_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionForward> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
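        // Drawing filter values around 1/sqrt(fan_in) keeps output magnitudes
        // near O(1), so the fixed epsilons below stay meaningful even for fp16.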
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        checker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        checker.set_dtype(0, dtype::BFloat16())
                .set_dtype(1, dtype::BFloat16())
                .set_dtype(2, dtype::BFloat16())
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}

TEST_F(CUDA, CONV_FORWARD_MATMUL_NCHW4) {
    if (!cuda::is_compute_capability_required(6, 1))
        return;
    using namespace convolution;
    Checker<Convolution> checker(handle_cuda());
    UniformIntRNG int_rng{-127, 127};
    Convolution::Param param;
    param.format = Convolution::Param::Format::NCHW4;
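    // NCHW4 stores tensors as {N, C/4, H, W, 4}: four channels are packed into
    // the innermost dimension so dp4a can consume them as one 32-bit word.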
    checker.set_dtype(0, dtype::QuantizedS8(0.132f))
            .set_dtype(1, dtype::QuantizedS8(0.0239f))
            .set_dtype(2, dtype::QuantizedS32(0.132f * 0.0239f))
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_param(param);
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
                              "MATMUL8X8X32", {})
                              .c_str(),
                      {}}}}));
    param.sparse = Convolution::Param::Sparse::DENSE;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    checker.set_param(param);
    checker.exec({{8, 4, 10, 10, 4}, {16, 4, 3, 3, 4}, {}});
    checker.exec({{1, 4, 2, 2, 4}, {16, 4, 3, 3, 4}, {}});
    checker.exec({{8, 64, 12, 12, 4}, {256, 64, 3, 3, 4}, {}});
}

TEST_F(CUDA, CONVOLUTION_1X1_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_1x1_args();
    Checker<ConvolutionForward> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_1x1_args();
    Benchmarker<ConvolutionForward> marker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        marker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        if (!cuda::is_compute_capability_required(6, 0)) {
            src.dtype = dst.dtype = filter.dtype = dtype::Float16();
            checker.set_rng(0, &rng)
                    .set_rng(1, &rng)
                    .set_epsilon(1e-1)
                    .set_param(arg.param)
                    .exec(TensorLayoutArray{filter, dst, src});
            arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
            checker.set_rng(0, &rng)
                    .set_rng(1, &rng)
                    .set_epsilon(1e-1)
                    .set_param(arg.param)
                    .exec(TensorLayoutArray{filter, dst, src});
        }
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(ExecutionPolicyAlgoName{
                        "CONVOLUTION_BACKWARD_DATD_BFLOAT16",
                        {{"MATMUL", {{"CUBLAS", {}}}}}}));
        src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        checker.reset_before_exec_callback();
        checker.opr()->execution_policy() = {};
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FP16_CUDNN7_5) {
    // The algo CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 with TensorCore operations
    // produces incorrect results; NVIDIA may have fixed this issue since.
    // A case that triggered the bug:
    // inp={2x8x18x18}, kern={8x8x2x2}, pad_h=pad_w=2, stride_h=stride_w=2,
    // dtype=float16
    using namespace convolution;
    std::vector<TestArg> args = get_args_cudnn_5_1_backward();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        scale = std::max(scale, 1.f);
        UniformFloatRNG rng(scale, 2 * scale);
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_NHWC) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_CUDNN) {
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("CUDNN_CONVOLUTION"));
    //! noncontiguous case
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_param(param).execl(TensorLayoutArray{
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
        });
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_MATMUL) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
    //! noncontiguous case
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_param(param).execl(TensorLayoutArray{
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
        });
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW4_DP4A) {
    if (!cuda::is_compute_capability_required(6, 1)) {
        printf("Skip CUDA.CONVOLUTION_BACKWARD_DATA_INT8_NCHW4_DP4A test as "
               "current device doesn't support\n");
        return;
    }

    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nchw4_conv_bwd_data();
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        std::string to_string() {
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m, threadblock_n,
                    threadblock_k, warp_m, warp_n, warp_k, stage);
        }
    };
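    // The suffix encodes the threadblock tile, warp tile, and pipeline stage
    // count, matching the registered implicit-GEMM algorithm names, e.g.
    // INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_16X64X8_16X64X8_2stage.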
    std::vector<AlgoParam> all_params;
    all_params.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8, 2});
    all_params.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2});
    all_params.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1});
    all_params.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2});
    for (auto algo_param : all_params) {
        Checker<ConvolutionBackwardData> checker(handle_cuda());
        std::string algo_name(ssprintf(
                "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM%s", algo_param.to_string().c_str()));
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(algo_name.c_str()));
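        // Quantized outputs may legitimately differ from the reference by one
        // quantization step, so allow a per-element error just above 1 and
        // bound the average error instead.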
        checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
        for (auto&& arg : args) {
            UniformIntRNG rng(-3, 3);
            auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
            auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
            TensorLayout dst;
            dst.dtype = dtype::QuantizedS8{1.2f};
            {
                auto opr = handle_cuda()->create_operator<Convolution>();
                opr->param() = arg.param;
                opr->deduce_layout(src, filter, dst);
            }
            checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                    TensorLayoutArray{filter, dst, src});
        }
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW_DP4A) {
    if (!cuda::is_compute_capability_required(6, 1)) {
        printf("Skip CUDA.CONVOLUTION_BACKWARD_DATA_INT8_NCHW_DP4A test as "
               "current device doesn't support\n");
        return;
    }

    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nchw_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("INT8_NCHW_DOTPROD_IMPLICIT_GEMM"));
    checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
    for (auto&& arg : args) {
        UniformIntRNG rng(-3, 3);
        auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
        auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
        TensorLayout dst;
        dst.dtype = dtype::QuantizedS8{1.2f};
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                TensorLayoutArray{filter, dst, src});
    }
}

#if CUDA_VERSION >= 10020
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NHWC_IMMA) {
    if (!cuda::is_compute_capability_required(7, 5)) {
        printf("Skip CUDA.CONVOLUTION_BACKWARD_DATA_INT8_NHWC_IMMA test as "
               "current device doesn't support\n");
        return;
    }

    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nhwc_conv_bwd_data();
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        int access_size;
        std::string to_string() {
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_%dstage_%d", threadblock_m, threadblock_n,
                    threadblock_k, warp_m, warp_n, warp_k, stage, access_size);
        }
    };
    std::vector<AlgoParam> all_params;
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 4});
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 8});
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 16});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 4});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 8});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 16});
    for (auto algo_param : all_params) {
        Checker<ConvolutionBackwardData> checker(handle_cuda());
        std::string algo_name(ssprintf(
                "INT8_NHWC_IMMA_IMPLICIT_GEMM%s", algo_param.to_string().c_str()));
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(algo_name.c_str()));
        checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
        for (auto&& arg : args) {
            UniformIntRNG rng(-3, 3);
            auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
            auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
            TensorLayout dst;
            dst.dtype = dtype::QuantizedS8{1.2f};
            {
                auto opr = handle_cuda()->create_operator<Convolution>();
                opr->param() = arg.param;
                opr->deduce_layout(src, filter, dst);
            }
            checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                    TensorLayoutArray{filter, dst, src});
        }
    }
}
#endif

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5) {
    // BRAIN-481 fails on compute capability 7.0; remove the following if
    // statement once cuDNN fixes the problem.
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    std::vector<TestArg> args = get_args_cudnn_7_5_failures();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        scale = std::max(scale, 1.f);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    bool f16_checked = false;
    for (auto&& arg : args) {
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        float scale = 1.0f / sqrt(dst[2] * dst[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        // reducing over a large f16 array may introduce significant error
        if (dst.total_nr_elems() >= 1000 && f16_checked)
            continue;

        f16_checked = true;
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardFilter>(ExecutionPolicyAlgoName{
                        "CONVOLUTION_BACKWARD_FILTER_BFLOAT16",
                        {{"MATMUL", {{"CUBLAS", {}}}}}}));
        src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        checker.reset_before_exec_callback();
        checker.opr()->execution_policy() = {};
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_MATMUL) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
    for (auto&& arg : args) {
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        float scale = 1.0f / sqrt(dst[2] * dst[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
    }
    //! noncontiguous case
    {
        NormalRNG default_rng;
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_param(param)
                .execl(TensorLayoutArray{
                        {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                        {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                        {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()}});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_CUDNN) {
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardFilter>("CUDNN_CONVOLUTION"));
    //! noncontiguous case
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_param(param).execl(TensorLayoutArray{
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()}});
    }
}

TEST_F(CUDA, CONV_CONFIG_COMBINATIONS) {
    auto eps_getter = [](bool f16, int stage, const char* name) -> float {
        if (f16) {
            return stage == 2 ? 0.5 : 0.2;
        }
        if (strstr(name, "WINOGRAD_NONFUSED"))
            return 0.3;
        return 1e-3;
    };
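    // fp16 and Winograd non-fused paths accumulate noticeably more rounding
    // error than direct fp32 convolution, hence the looser tolerances above.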
    convolution::test_conv_config_combinations(
            2, handle_cuda(), false, true, true, eps_getter, true);
    convolution::test_conv_config_combinations(
            3, handle_cuda(), false, true, true, eps_getter, true);
    convolution::test_conv_config_combinations(
            5, handle_cuda(), false, true, true, eps_getter, true);
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_1) {
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            "CUDNN_CONVOLUTION_BWD_DATA_ALGO_1" CUDNN_VERSION_STRING));
    NormalRNG default_rng;
    TensorShape s_filter = TensorShape{8, 8, 2, 2}, s_src = TensorShape{2, 8, 18, 18};
    float scale = 1.0f / sqrt(s_filter[0] * s_filter[2] * s_filter[3]);
    UniformFloatRNG rng(scale, 2 * scale);
    auto src = TensorLayout(s_src, dtype::Float16());
    auto filter = TensorLayout(s_filter, dtype::Float16());
    TensorLayout dst;
    param::Convolution param;
    param.pad_h = param.pad_w = 2;
    param.stride_h = param.stride_w = 2;
    {
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(src, filter, dst);
    }
    src.dtype = dst.dtype = filter.dtype = dtype::Float16();
    param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
    checker.set_rng(0, &rng).set_rng(1, &rng).set_epsilon(0.2).set_param(param).exec(
            TensorLayoutArray{filter, dst, src});
}

#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, CONV_FWD_BENCHMARK) {
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t SH = 1,
                   size_t SW = 1, size_t FH = 1, size_t FW = 1, size_t PH = 0,
                   size_t PW = 0, bool fp16io_c32 = false) {
        auto benchmarker = Benchmarker<ConvolutionForward>(handle_cuda());
        benchmarker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        ConvolutionForward::Param param;
        param.stride_h = SH;
        param.stride_w = SW;
        param.pad_h = PH;
        param.pad_w = PW;
        if (fp16io_c32) {
            param.compute_mode = ConvolutionForward::Param::ComputeMode::FLOAT32;
        }
        benchmarker.set_param(param);
        std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
                new OprProxy<ConvolutionForward>{true}};
        benchmarker.set_proxy(proxy);
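        // Output size follows the standard convolution formula; each output
        // element costs 2 * IC * FH * FW FLOPs (a multiply and an add per
        // filter tap), which gives the FLO count computed below.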
        size_t OH = (IH - FH + 2 * PH) / SH + 1;
        size_t OW = (IW - FW + 2 * PW) / SW + 1;
        auto time =
                benchmarker.execs({{N, IC, IH, IW}, {OC, IC, FH, FW}, {N, OC, OH, OW}});
        time /= 1000.0 * 10.0;
        auto flo = (double)N * OC * IC * OH * OW * FH * FW * 2;
        auto flops = flo / time / 1e12;
        printf("comp_type %s: ", fp16io_c32 ? "32" : "16");
        printf("%.3fG FLO, flops %.3fTFLOPS\n", flo / 1e9, flops);
    };
    run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, false);
    run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, true);
}

TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) {
    CUBenchmarker<ConvolutionForward> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({src, filter, dst}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({src, filter, dst}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({src, filter, dst}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
               "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fTFLOPS\n",
               time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
               (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
               (flo / (time_ms_pseudo_fp16 * 1e9)));
        printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
               time_ms_fp32 / time_ms_true_fp16,
               time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}

TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({filter, dst, src}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({filter, dst, src}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
               "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fTFLOPS\n",
               time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
               (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
               (flo / (time_ms_pseudo_fp16 * 1e9)));
        printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
               time_ms_fp32 / time_ms_true_fp16,
               time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_BF16) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::BFloat16())
                .set_dtype(1, dtype::BFloat16())
                .set_dtype(2, dtype::BFloat16());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy = {};
        TensorLayout src{{N, IC, IH, IW}, dtype::BFloat16()},
                filter{{OC, IC, FH, FH}, dtype::BFloat16()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto used = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_bf16=%.2fms, flops=%.3fTFLOPS\n", used, (flo / (used * 1e9)));
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_INT8_DP4A) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::QuantizedS8{1.0f})
                .set_dtype(1, dtype::QuantizedS8{1.0f})
                .set_dtype(2, dtype::QuantizedS8{1.0f});
        param::Convolution param;
        param.format = param::Convolution::Format::NCHW4;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy = {};
        TensorLayout src{{N, IC / 4, IH, IW, 4}, dtype::QuantizedS8{1.0f}},
                filter{{OC, IC / 4, FH, FH, 4}, dtype::QuantizedS8{1.0f}};
        TensorLayout dst;
        dst.dtype = dtype::QuantizedS8{1.0f};
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto used = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_int8=%.2fms, flops=%.3fTFLOPS\n", used, (flo / (used * 1e9)));
    };
    run(64, 32, 32, 92, 180, 4, 2, 2);
    run(64, 32, 32, 46, 80, 4, 2, 2);
    run(16, 16, 16, 92, 180, 4, 2, 2);
    run(16, 16, 16, 46, 80, 4, 2, 2);
}

TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) {
    CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({src, dst, filter}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({src, dst, filter}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
               "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fTFLOPS\n",
               time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
               (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
               (flo / (time_ms_pseudo_fp16 * 1e9)));
        printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
               time_ms_fp32 / time_ms_true_fp16,
               time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}

#endif

#undef CUDNN_VERSION_STRING
#undef V
#undef V1

}  // namespace test
}  // namespace megdnn

// vim: syntax=cpp.doxygen