You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

convolution.cpp 52 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248
  1. #include "test/common/convolution.h"
  2. #include "megdnn/dtype.h"
  3. #include "megdnn/opr_param_defs.h"
  4. #include "megdnn/oprs.h"
  5. #include "test/common/accuracy_shake_checker.h"
  6. #include "test/common/checker.h"
  7. #include "test/common/rng.h"
  8. #include "test/common/tensor.h"
  9. #include "test/common/workspace_wrapper.h"
  10. #include "test/cuda/benchmark.h"
  11. #include "test/cuda/fixture.h"
  12. #include "test/cuda/utils.h"
  13. #include <cudnn.h>
  14. #define V1(x) #x
  15. #define V(x) V1(x)
  16. #define CUDNN_VERSION_STRING \
  17. "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
  18. namespace megdnn {
  19. namespace test {
  20. TEST_F(CUDA, CONVOLUTION_8X8X32) {
  21. require_compute_capability(6, 1);
  22. using namespace convolution;
  23. std::vector<TestArg> args;
  24. {
  25. auto v = get_args();
  26. for (auto&& a : v) {
  27. args.push_back(std::move(a));
  28. }
  29. }
  30. {
  31. auto v = get_dilated_args();
  32. for (auto&& a : v) {
  33. args.push_back(std::move(a));
  34. }
  35. }
  36. {
  37. auto v = get_chanwise_args();
  38. for (auto&& a : v) {
  39. args.push_back(std::move(a));
  40. }
  41. }
  42. Checker<ConvolutionForward> checker(handle_cuda());
  43. UniformIntRNG rng(-4, 4);
  44. for (auto arg : args) {
  45. arg.param.format = param::Convolution::Format::NHWC;
  46. arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
  47. arg.filter = cvt_filter_nchw2nhwc(arg.filter);
  48. checker.set_dtype(0, dtype::Int8())
  49. .set_dtype(1, dtype::Int8())
  50. .set_dtype(2, dtype::Int32())
  51. .set_param(arg.param)
  52. .set_rng(0, &rng)
  53. .set_rng(1, &rng)
  54. .execs({arg.src, arg.filter, {}});
  55. }
  56. }
// Forward convolution over the standard argument set, checked in fp32, fp16
// (default and fp32-compute modes) and bf16.
TEST_F(CUDA, CONVOLUTION_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionForward> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        // Scale fp16 inputs by the filter fan-in so accumulated sums stay in
        // a representable range.
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        // fp32 reference run with a tight tolerance.
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        // fp16 with the default compute mode; tolerance relaxed accordingly.
        checker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        // fp16 storage with fp32 accumulation.
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        // bf16 run; reuses the rngs installed above (compute mode is still
        // FLOAT32 from the previous step).
        checker.set_dtype(0, dtype::BFloat16())
                .set_dtype(1, dtype::BFloat16())
                .set_dtype(2, dtype::BFloat16())
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}
// Quantized-int8 NCHW4 forward convolution, pinned to the conv-bias
// "MATMUL8X8X32" algorithm wrapped by the "DEFAULT" convolution algo.
// Requires compute capability 6.1.
TEST_F(CUDA, CONV_FORWARD_MATMUL_NCHW4) {
    require_compute_capability(6, 1);
    using namespace convolution;
    Checker<Convolution> checker(handle_cuda());
    UniformIntRNG int_rng{-127, 127};
    Convolution::Param param;
    param.format = Convolution::Param::Format::NCHW4;
    // Output scale is the product of the input scales, as the int32
    // accumulator carries src_scale * filter_scale.
    checker.set_dtype(0, dtype::QuantizedS8(0.132f))
            .set_dtype(1, dtype::QuantizedS8(0.0239f))
            .set_dtype(2, dtype::QuantizedS32(0.132f * 0.0239f))
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_param(param);
    // Restrict execution to the matmul-based implementation.
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
                              "MATMUL8X8X32", {})
                              .c_str(),
                      {}}}}));
    param.sparse = Convolution::Param::Sparse::DENSE;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    checker.set_param(param);
    // Shapes are {N, C/4, H, W, 4} / {OC, IC/4, FH, FW, 4} per NCHW4.
    checker.exec({{8, 4, 10, 10, 4}, {16, 4, 3, 3, 4}, {}});
    checker.exec({{1, 4, 2, 2, 4}, {16, 4, 3, 3, 4}, {}});
    checker.exec({{8, 64, 12, 12, 4}, {256, 64, 3, 3, 4}, {}});
}
  126. TEST_F(CUDA, CONVOLUTION_1X1_FORWARD) {
  127. using namespace convolution;
  128. std::vector<TestArg> args = get_1x1_args();
  129. Checker<ConvolutionForward> checker(handle_cuda());
  130. NormalRNG default_rng;
  131. for (auto&& arg : args) {
  132. float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
  133. UniformFloatRNG rng(scale, 2 * scale);
  134. checker.set_dtype(0, dtype::Float32())
  135. .set_dtype(1, dtype::Float32())
  136. .set_rng(0, &default_rng)
  137. .set_rng(1, &default_rng)
  138. .set_epsilon(1e-3)
  139. .set_param(arg.param)
  140. .execs({arg.src, arg.filter, {}});
  141. }
  142. }
// ConvolutionBackwardData over the cuda bwd-data set in fp32, fp16 (plain
// and fp32-compute) and bf16 via the MATMUL/CUBLAS fallback.
// Backward-data exec order is {filter, diff(=dst layout), grad(=src layout)}.
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        // Scale inputs so low-precision accumulation stays in range.
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            // Deduce the forward output layout; it serves as the diff input.
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        // NOTE(review): fp16 is only exercised when compute capability is
        // BELOW 6.0 -- presumably working around a known issue on newer
        // parts; confirm the intent of the negation.
        if (!check_compute_capability(6, 0)) {
            src.dtype = dst.dtype = filter.dtype = dtype::Float16();
            checker.set_rng(0, &rng)
                    .set_rng(1, &rng)
                    .set_epsilon(1e-1)
                    .set_param(arg.param)
                    .exec(TensorLayoutArray{filter, dst, src});
            arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
            checker.set_rng(0, &rng)
                    .set_rng(1, &rng)
                    .set_epsilon(1e-1)
                    .set_param(arg.param)
                    .exec(TensorLayoutArray{filter, dst, src});
        }
        // bf16 path: pin the bf16 wrapper algo to a MATMUL/CUBLAS impl.
        // NOTE(review): "DATD" looks like a typo but likely matches the algo
        // name as registered in the library -- verify before "fixing".
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(ExecutionPolicyAlgoName{
                        "CONVOLUTION_BACKWARD_DATD_BFLOAT16",
                        {{"MATMUL", {{"CUBLAS", {}}}}}}));
        src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        // Reset algo pinning so the next iteration starts clean.
        checker.reset_before_exec_callback();
        checker.opr()->execution_policy() = {};
    }
}
// Regression cases for a cuDNN 7.5 fp16 backward-data issue, run in NHWC.
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FP16_CUDNN7_5) {
    // algo CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 with
    // TensorCore operations produces incorrect result.
    // Maybe nvidia has fixed this issue
    // There is a test using incorrect case:
    // inp={2x8x18x18}, kern={8x8x2x2}, pad_h=pad_w=2, stride_h=stride_w=2,
    // dtype=float16
    using namespace convolution;
    std::vector<TestArg> args = get_args_cudnn_5_1_backward();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        // Keep the rng range >= 1 so fp16 values don't vanish.
        float scale = 128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        scale = std::max(scale, 1.f);
        UniformFloatRNG rng(scale, 2 * scale);
        // Cases are generated in NCHW; convert to NHWC for this test.
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            // Deduce the forward output layout to use as the diff input.
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        // fp16 storage, fp32 accumulation.
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        // fp32 reference with the default compute mode.
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}
// Backward data in NHWC format, fp16 (fp32 compute) and fp32.
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_NHWC) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        // Scale inputs so fp16 accumulation stays in range.
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        // Convert the NCHW-generated cases to NHWC.
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            // Deduce the forward output layout to use as the diff input.
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        // fp16 storage with fp32 accumulation.
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        // fp32 with the default compute mode.
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}
  271. TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_CUDNN) {
  272. require_compute_capability(7, 0);
  273. using namespace convolution;
  274. Checker<ConvolutionBackwardData> checker(handle_cuda());
  275. checker.set_before_exec_callback(
  276. AlgoChecker<ConvolutionBackwardData>("CUDNN_CONVOLUTION"));
  277. //! noncontiguous case
  278. {
  279. param::Convolution param;
  280. param.pad_h = param.pad_w = 1;
  281. checker.set_param(param).execl(TensorLayoutArray{
  282. {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()},
  283. {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
  284. {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
  285. });
  286. }
  287. }
// Backward data pinned to the MATMUL algorithm (CUBLAS backend), fp32 only,
// plus a noncontiguous-layout case.
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_MATMUL) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
    NormalRNG default_rng;
    for (auto&& arg : args) {
        // rng is prepared to mirror the other bwd-data tests; fp32 below
        // actually uses default_rng.
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            // Deduce the forward output layout to use as the diff input.
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
    //! noncontiguous case
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_param(param).execl(TensorLayoutArray{
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
        });
    }
}
// int8 NCHW4 backward data through the dp4a implicit-GEMM kernels, sweeping
// several threadblock/warp tile configurations. Requires capability 6.1.
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW4_DP4A) {
    require_compute_capability(6, 1);
    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nchw4_conv_bwd_data();
    // Tile configuration; to_string() reproduces the suffix used in the
    // registered algorithm name.
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        std::string to_string() {
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m, threadblock_n,
                    threadblock_k, warp_m, warp_n, warp_k, stage);
        }
    };
    std::vector<AlgoParam> all_params;
    all_params.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8, 2});
    all_params.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2});
    all_params.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1});
    all_params.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2});
    for (auto algo_param : all_params) {
        Checker<ConvolutionBackwardData> checker(handle_cuda());
        std::string algo_name(ssprintf(
                "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM%s", algo_param.to_string().c_str()));
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(algo_name.c_str()));
        // Quantized output: allow relative error plus a bounded average error.
        checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
        for (auto&& arg : args) {
            UniformIntRNG rng(-3, 3);
            auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
            auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
            TensorLayout dst;
            dst.dtype = dtype::QuantizedS8{1.2f};
            {
                // Deduce the forward output layout to use as the diff input.
                auto opr = handle_cuda()->create_operator<Convolution>();
                opr->param() = arg.param;
                opr->deduce_layout(src, filter, dst);
            }
            checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                    TensorLayoutArray{filter, dst, src});
        }
    }
}
// int8 NCHW backward data through the dp4a implicit-GEMM algorithm.
// Requires compute capability 6.1.
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW_DP4A) {
    require_compute_capability(6, 1);
    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nchw_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("INT8_NCHW_DOTPROD_IMPLICIT_GEMM"));
    // Quantized output: allow relative error plus a bounded average error.
    checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
    for (auto&& arg : args) {
        UniformIntRNG rng(-3, 3);
        auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
        auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
        TensorLayout dst;
        dst.dtype = dtype::QuantizedS8{1.2f};
        {
            // Deduce the forward output layout to use as the diff input.
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                TensorLayoutArray{filter, dst, src});
    }
}
  393. #if CUDA_VERSION >= 10020
// int8 NHWC backward data through the IMMA (tensor-core) implicit-GEMM
// kernels, sweeping tile configurations and vector access sizes.
// Requires compute capability 7.5.
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NHWC_IMMA) {
    require_compute_capability(7, 5);
    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nhwc_conv_bwd_data();
    // Tile configuration; to_string() reproduces the suffix used in the
    // registered algorithm name (including the trailing access size).
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        int access_size;
        std::string to_string() {
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_%dstage_%d", threadblock_m, threadblock_n,
                    threadblock_k, warp_m, warp_n, warp_k, stage, access_size);
        }
    };
    std::vector<AlgoParam> all_params;
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 4});
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 8});
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 16});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 4});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 8});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 16});
    for (auto algo_param : all_params) {
        Checker<ConvolutionBackwardData> checker(handle_cuda());
        std::string algo_name(ssprintf(
                "INT8_NHWC_IMMA_IMPLICIT_GEMM%s", algo_param.to_string().c_str()));
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(algo_name.c_str()));
        // Quantized output: allow relative error plus a bounded average error.
        checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
        for (auto&& arg : args) {
            UniformIntRNG rng(-3, 3);
            auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
            auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
            TensorLayout dst;
            dst.dtype = dtype::QuantizedS8{1.2f};
            {
                // Deduce the forward output layout to use as the diff input.
                auto opr = handle_cuda()->create_operator<Convolution>();
                opr->param() = arg.param;
                opr->deduce_layout(src, filter, dst);
            }
            checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                    TensorLayoutArray{filter, dst, src});
        }
    }
}
  443. #endif
// Cases that failed on cuDNN 7.5; run in fp32, fp16 and fp16/fp32-compute.
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5) {
    // BRAIN-481 failed on architectures 7.0, remove the following if statement,
    // when cudnn fixed the problem.
    require_compute_capability(7, 0);
    using namespace convolution;
    std::vector<TestArg> args = get_args_cudnn_7_5_failures();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        // Keep the rng range >= 1 so fp16 values don't vanish.
        float scale = 128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        scale = std::max(scale, 1.f);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            // Deduce the forward output layout to use as the diff input.
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        // fp32 reference run.
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        // fp16 with the default compute mode.
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        // fp16 storage with fp32 accumulation.
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}
// ConvolutionBackwardFilter in fp32, fp16 (plain and fp32-compute) and bf16
// (via MATMUL/CUBLAS). Backward-filter exec order is {src, diff, filter-grad}.
TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    bool f16_checked = false;
    for (auto&& arg : args) {
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            // Deduce the forward output layout to use as the diff input.
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        // Scale by the spatial size of the diff so reductions stay in range.
        float scale = 1.0f / sqrt(dst[2] * dst[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        // reduce on large f16 array may introduce significant error
        // (so only the first sufficiently small case exercises the
        // low-precision paths; later large cases are skipped).
        if (dst.total_nr_elems() >= 1000 && f16_checked)
            continue;
        f16_checked = true;
        // fp16 with the default compute mode.
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        // fp16 storage with fp32 accumulation.
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        // bf16 path: pin the bf16 wrapper algo to a MATMUL/CUBLAS impl.
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardFilter>(ExecutionPolicyAlgoName{
                        "CONVOLUTION_BACKWARD_FILTER_BFLOAT16",
                        {{"MATMUL", {{"CUBLAS", {}}}}}}));
        src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        // Reset algo pinning so the next iteration starts clean.
        checker.reset_before_exec_callback();
        checker.opr()->execution_policy() = {};
    }
}
// Backward filter pinned to the MATMUL algorithm (CUBLAS backend), fp32,
// plus a noncontiguous-layout case.
TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_MATMUL) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
    for (auto&& arg : args) {
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            // Deduce the forward output layout to use as the diff input.
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        // Scale by the spatial size of the diff so reductions stay in range.
        float scale = 1.0f / sqrt(dst[2] * dst[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
    }
    //! noncontiguous case
    {
        NormalRNG default_rng;
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_param(param)
                .execl(TensorLayoutArray{
                        {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                        {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                        {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()}});
    }
}
  574. TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_CUDNN) {
  575. require_compute_capability(7, 0);
  576. using namespace convolution;
  577. Checker<ConvolutionBackwardFilter> checker(handle_cuda());
  578. checker.set_before_exec_callback(
  579. AlgoChecker<ConvolutionBackwardFilter>("CUDNN_CONVOLUTION"));
  580. //! noncontiguous case
  581. {
  582. param::Convolution param;
  583. param.pad_h = param.pad_w = 1;
  584. checker.set_param(param).execl(TensorLayoutArray{
  585. {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
  586. {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
  587. {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()}});
  588. }
  589. }
  590. TEST_F(CUDA, CONV_CONFIG_COMBINATIONS) {
  591. auto eps_getter = [](bool f16, int stage, const char* name) -> float {
  592. if (f16) {
  593. return stage == 2 ? 0.5 : 0.2;
  594. }
  595. if (strstr(name, "WINOGRAD_NONFUSED"))
  596. return 0.3;
  597. return 1e-3;
  598. };
  599. convolution::test_conv_config_combinations(
  600. 2, handle_cuda(), false, true, true, eps_getter, true);
  601. convolution::test_conv_config_combinations(
  602. 3, handle_cuda(), false, true, true, eps_getter, true);
  603. convolution::test_conv_config_combinations(
  604. 5, handle_cuda(), false, true, true, eps_getter, true);
  605. }
// Single fp16/fp32-compute backward-data case pinned to cuDNN's
// BWD_DATA_ALGO_1 (the shape that was historically problematic with
// TensorCores -- see CONVOLUTION_BACKWARD_DATA_FP16_CUDNN7_5 above).
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_1) {
    require_compute_capability(7, 0);
    using namespace convolution;
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    // The registered algo name embeds the cuDNN version string.
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            "CUDNN_CONVOLUTION_BWD_DATA_ALGO_1" CUDNN_VERSION_STRING));
    NormalRNG default_rng;
    TensorShape s_filter = TensorShape{8, 8, 2, 2}, s_src = TensorShape{2, 8, 18, 18};
    float scale = 1.0f / sqrt(s_filter[0] * s_filter[2] * s_filter[3]);
    UniformFloatRNG rng(scale, 2 * scale);
    auto src = TensorLayout(s_src, dtype::Float16());
    auto filter = TensorLayout(s_filter, dtype::Float16());
    TensorLayout dst;
    param::Convolution param;
    param.pad_h = param.pad_w = 2;
    param.stride_h = param.stride_w = 2;
    {
        // Deduce the forward output layout to use as the diff input.
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(src, filter, dst);
    }
    // fp16 storage with fp32 accumulation, relaxed tolerance.
    src.dtype = dst.dtype = filter.dtype = dtype::Float16();
    param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
    checker.set_rng(0, &rng).set_rng(1, &rng).set_epsilon(0.2).set_param(param).exec(
            TensorLayoutArray{filter, dst, src});
}
  632. TEST_F(CUDA, CONVOLUTION_BACKWARD_DEPTHWISE_LARGE_FILTER) {
  633. Checker<ConvolutionBackwardData> checker(handle_cuda());
  634. checker.set_before_exec_callback(
  635. AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
  636. for (auto dtype : std::vector<DType> {
  637. dtype::Float32(),
  638. #if CUDA_VERSION >= 9000
  639. dtype::Float16()
  640. #endif
  641. }) {
  642. auto run = [&checker, &dtype](
  643. size_t n, size_t g, size_t h, size_t fh, size_t padding,
  644. size_t stride) {
  645. param::Convolution param;
  646. param.stride_h = param.stride_w = stride;
  647. param.pad_h = param.pad_w = padding;
  648. param.mode = Convolution::Mode::CROSS_CORRELATION;
  649. param.sparse = param::Convolution::Sparse::GROUP;
  650. checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
  651. float scale = 64.f / sqrt(fh * fh);
  652. UniformFloatRNG rng(scale, scale * 2);
  653. checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &rng);
  654. if (dtype.enumv() == DTypeEnum::Float16)
  655. checker.set_epsilon(1e-1);
  656. checker.set_param(param).execs(
  657. {{g, 1, 1, fh, fh},
  658. {n, g, (h + 2 * padding - fh + 1) / stride,
  659. (h + 2 * padding - fh + 1) / stride},
  660. {n, g, h, h}});
  661. };
  662. run(4, 8, 32, 5, 5 / 2, 1);
  663. run(4, 8, 32, 7, 7 / 2, 1);
  664. run(4, 8, 32, 9, 9 / 2, 1);
  665. run(4, 8, 32, 11, 11 / 2, 1);
  666. run(4, 8, 32, 13, 13 / 2, 1);
  667. run(4, 8, 32, 15, 15 / 2, 1);
  668. run(4, 8, 32, 17, 17 / 2, 1);
  669. run(4, 8, 32, 19, 19 / 2, 1);
  670. run(4, 8, 32, 21, 21 / 2, 1);
  671. run(4, 8, 32, 23, 23 / 2, 1);
  672. run(4, 8, 32, 25, 25 / 2, 1);
  673. run(4, 8, 32, 27, 27 / 2, 1);
  674. run(4, 8, 32, 29, 29 / 2, 1);
  675. run(4, 8, 32, 31, 31 / 2, 1);
  676. run(4, 8, 64, 5, 5 / 2, 2);
  677. run(4, 8, 64, 7, 7 / 3, 2);
  678. run(4, 8, 64, 9, 9 / 3, 2);
  679. run(4, 8, 64, 11, 11 / 3, 2);
  680. run(4, 8, 64, 13, 13 / 3, 2);
  681. run(4, 8, 64, 15, 15 / 3, 2);
  682. run(4, 8, 64, 17, 17 / 3, 2);
  683. run(4, 8, 64, 19, 19 / 3, 2);
  684. run(4, 8, 64, 21, 21 / 3, 2);
  685. run(4, 8, 64, 23, 23 / 3, 2);
  686. run(4, 8, 64, 25, 25 / 3, 2);
  687. run(4, 8, 64, 27, 27 / 3, 2);
  688. run(4, 8, 64, 29, 29 / 3, 2);
  689. run(4, 8, 64, 31, 31 / 3, 2);
  690. run(1, 2, 128, 31, 31 / 3, 2);
  691. run(1, 2, 256, 31, 31 / 3, 2);
  692. }
  693. }
  694. #if MEGDNN_WITH_BENCHMARK
  695. TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) {
  696. using namespace convolution;
  697. std::vector<TestArg> args = get_1x1_args();
  698. Benchmarker<ConvolutionForward> marker(handle_cuda());
  699. NormalRNG default_rng;
  700. for (auto&& arg : args) {
  701. float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
  702. UniformFloatRNG rng(scale, 2 * scale);
  703. marker.set_dtype(0, dtype::Float32())
  704. .set_dtype(1, dtype::Float32())
  705. .set_rng(0, &default_rng)
  706. .set_rng(1, &default_rng)
  707. .set_param(arg.param)
  708. .execs({arg.src, arg.filter, {}});
  709. }
  710. }
TEST_F(CUDA, CONV_FWD_BENCHMARK) {
    // Benchmarks fp16-I/O forward convolution, comparing true fp16 compute
    // against fp16 I/O with fp32 accumulation (fp16io_c32 == true).
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t SH = 1,
                   size_t SW = 1, size_t FH = 1, size_t FW = 1, size_t PH = 0,
                   size_t PW = 0, bool fp16io_c32 = false) {
        auto benchmarker = Benchmarker<ConvolutionForward>(handle_cuda());
        // src, filter and dst are all fp16.
        benchmarker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        ConvolutionForward::Param param;
        param.stride_h = SH;
        param.stride_w = SW;
        param.pad_h = PH;
        param.pad_w = PW;
        if (fp16io_c32) {
            // Keep fp16 tensors but accumulate in fp32.
            param.compute_mode = ConvolutionForward::Param::ComputeMode::FLOAT32;
        }
        benchmarker.set_param(param);
        // Proxy with `true`: let the proxy pick/profile the execution policy.
        std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
                new OprProxy<ConvolutionForward>{true}};
        benchmarker.set_proxy(proxy);
        // Standard output-size formula for a dense convolution.
        size_t OH = (IH - FH + 2 * PH) / SH + 1;
        size_t OW = (IW - FW + 2 * PW) / SW + 1;
        auto time =
                benchmarker.execs({{N, IC, IH, IW}, {OC, IC, FH, FW}, {N, OC, OH, OW}});
        // NOTE(review): presumably converts ms -> s and averages over 10 runs;
        // confirm against Benchmarker::execs' reported unit and run count.
        time /= 1000.0 * 10.0;
        // 2 ops (multiply + add) per MAC.
        auto flo = (double)N * OC * IC * OH * OW * FH * FW * 2;
        auto flops = flo / time / 1e12;  // TFLOPS, assuming `time` is seconds
        printf("comp_type %s: ", fp16io_c32 ? "32" : "16");
        printf("%.3fG FLO, flops %.3fTFLOPS\n", flo / 1e9, flops);
    };
    run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, false);
    run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, true);
}
  744. TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) {
  745. CUBenchmarker<ConvolutionForward> bench{handle_cuda()};
  746. std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
  747. new OprProxy<ConvolutionForward>{true}};
  748. size_t RUNS = 10;
  749. bench.set_proxy(proxy).set_times(RUNS);
  750. auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
  751. size_t SH, size_t PH) {
  752. bench.set_dtype(0, dtype::Float32())
  753. .set_dtype(1, dtype::Float32())
  754. .set_dtype(2, dtype::Float32());
  755. param::Convolution param;
  756. param.stride_h = param.stride_w = SH;
  757. param.pad_h = param.pad_w = PH;
  758. param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
  759. bench.set_param(param);
  760. bench.proxy()->target_execution_policy.algo.reset();
  761. TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
  762. filter{{OC, IC, FH, FH}, dtype::Float32()};
  763. TensorLayout dst;
  764. {
  765. auto&& opr = handle_cuda()->create_operator<Convolution>();
  766. opr->param() = param;
  767. opr->deduce_layout(src, filter, dst);
  768. }
  769. auto time_ms_fp32 = bench.execl({src, filter, dst}) / RUNS;
  770. src.dtype = filter.dtype = dst.dtype = dtype::Float16();
  771. bench.proxy()->target_execution_policy.algo.reset();
  772. bench.set_dtype(0, dtype::Float16())
  773. .set_dtype(1, dtype::Float16())
  774. .set_dtype(2, dtype::Float16());
  775. auto time_ms_true_fp16 = bench.execl({src, filter, dst}) / RUNS;
  776. param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
  777. bench.proxy()->target_execution_policy.algo.reset();
  778. bench.set_param(param);
  779. auto time_ms_pseudo_fp16 = bench.execl({src, filter, dst}) / RUNS;
  780. float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
  781. printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
  782. filter.to_string().c_str(), dst.to_string().c_str());
  783. printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
  784. "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fFLOPS\n",
  785. time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
  786. (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
  787. (flo / (time_ms_pseudo_fp16 * 1e9)));
  788. printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
  789. time_ms_fp32 / time_ms_true_fp16,
  790. time_ms_pseudo_fp16 / time_ms_true_fp16);
  791. };
  792. run(32, 64, 3, 224, 224, 7, 2, 3);
  793. run(32, 128, 128, 28, 28, 3, 1, 1);
  794. run(32, 256, 256, 14, 14, 3, 1, 1);
  795. run(32, 512, 512, 7, 7, 3, 1, 1);
  796. run(32, 64, 64, 56, 56, 3, 1, 1);
  797. run(32, 512, 256, 56, 56, 1, 2, 0);
  798. run(32, 1024, 512, 28, 28, 1, 2, 0);
  799. run(32, 2048, 1024, 14, 14, 1, 2, 0);
  800. run(32, 512, 128, 28, 28, 1, 1, 0);
  801. run(32, 128, 512, 28, 28, 1, 1, 0);
  802. run(32, 1024, 256, 14, 14, 1, 1, 0);
  803. run(32, 256, 1024, 14, 14, 1, 1, 0);
  804. run(32, 2048, 512, 7, 7, 1, 1, 0);
  805. run(32, 512, 2048, 7, 7, 1, 1, 0);
  806. run(32, 256, 64, 56, 56, 1, 1, 0);
  807. run(32, 64, 256, 56, 56, 1, 1, 0);
  808. run(32, 128, 256, 56, 56, 1, 2, 0);
  809. run(32, 256, 512, 28, 28, 1, 2, 0);
  810. run(32, 512, 1024, 14, 14, 1, 2, 0);
  811. run(32, 64, 64, 56, 56, 1, 1, 0);
  812. }
  813. TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) {
  814. CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
  815. std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
  816. new OprProxy<ConvolutionBackwardData>{true}};
  817. size_t RUNS = 10;
  818. bench.set_proxy(proxy).set_times(RUNS);
  819. auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
  820. size_t SH, size_t PH) {
  821. bench.set_dtype(0, dtype::Float32())
  822. .set_dtype(1, dtype::Float32())
  823. .set_dtype(2, dtype::Float32());
  824. param::Convolution param;
  825. param.stride_h = param.stride_w = SH;
  826. param.pad_h = param.pad_w = PH;
  827. param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
  828. bench.set_param(param);
  829. bench.proxy()->target_execution_policy.algo.reset();
  830. TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
  831. filter{{OC, IC, FH, FH}, dtype::Float32()};
  832. TensorLayout dst;
  833. {
  834. auto&& opr = handle_cuda()->create_operator<Convolution>();
  835. opr->param() = param;
  836. opr->deduce_layout(src, filter, dst);
  837. }
  838. auto time_ms_fp32 = bench.execl({filter, dst, src}) / RUNS;
  839. src.dtype = filter.dtype = dst.dtype = dtype::Float16();
  840. bench.proxy()->target_execution_policy.algo.reset();
  841. bench.set_dtype(0, dtype::Float16())
  842. .set_dtype(1, dtype::Float16())
  843. .set_dtype(2, dtype::Float16());
  844. auto time_ms_true_fp16 = bench.execl({filter, dst, src}) / RUNS;
  845. param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
  846. bench.proxy()->target_execution_policy.algo.reset();
  847. bench.set_param(param);
  848. auto time_ms_pseudo_fp16 = bench.execl({filter, dst, src}) / RUNS;
  849. float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
  850. printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
  851. filter.to_string().c_str(), dst.to_string().c_str());
  852. printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
  853. "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fFLOPS\n",
  854. time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
  855. (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
  856. (flo / (time_ms_pseudo_fp16 * 1e9)));
  857. printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
  858. time_ms_fp32 / time_ms_true_fp16,
  859. time_ms_pseudo_fp16 / time_ms_true_fp16);
  860. };
  861. run(32, 64, 3, 224, 224, 7, 2, 3);
  862. run(32, 128, 128, 28, 28, 3, 1, 1);
  863. run(32, 256, 256, 14, 14, 3, 1, 1);
  864. run(32, 512, 512, 7, 7, 3, 1, 1);
  865. run(32, 64, 64, 56, 56, 3, 1, 1);
  866. run(32, 512, 256, 56, 56, 1, 2, 0);
  867. run(32, 1024, 512, 28, 28, 1, 2, 0);
  868. run(32, 2048, 1024, 14, 14, 1, 2, 0);
  869. run(32, 512, 128, 28, 28, 1, 1, 0);
  870. run(32, 128, 512, 28, 28, 1, 1, 0);
  871. run(32, 1024, 256, 14, 14, 1, 1, 0);
  872. run(32, 256, 1024, 14, 14, 1, 1, 0);
  873. run(32, 2048, 512, 7, 7, 1, 1, 0);
  874. run(32, 512, 2048, 7, 7, 1, 1, 0);
  875. run(32, 256, 64, 56, 56, 1, 1, 0);
  876. run(32, 64, 256, 56, 56, 1, 1, 0);
  877. run(32, 128, 256, 56, 56, 1, 2, 0);
  878. run(32, 256, 512, 28, 28, 1, 2, 0);
  879. run(32, 512, 1024, 14, 14, 1, 2, 0);
  880. run(32, 64, 64, 56, 56, 1, 1, 0);
  881. }
TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_DEPTHWISE_LARGE_FILTER_FP32) {
    // Benchmarks the DEPTHWISE_LARGE_FILTER backward-data algorithm in fp32
    // over odd filter sizes 3..31 (SAME padding via FH / 2).
    CUBenchmarker<ConvolutionBackwardData> bencher{handle_cuda()};
    bencher.set_display(false);
    // Fail the run if the named algorithm cannot be used.
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
    // NOTE(review): the OC parameter is never read inside the lambda (the
    // depthwise shapes are fully determined by g) — candidate for cleanup.
    auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t nr_times) {
        bencher.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = FH / 2;
        param.sparse = param::Convolution::Sparse::GROUP;
        bencher.set_param(param);
        bencher.set_times(nr_times);
        // Depthwise: one input channel and one output channel per group.
        TensorLayout src{{N, g, IH, IW}, dtype::Float32()},
                filter{{g, 1, 1, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            // Deduce the forward output layout; used as `diff` below.
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        // Backward-data tensor order: {filter, diff, grad}.
        auto time_ms_fp32 = bencher.execl({filter, dst, src}) / nr_times;
        // 2 ops per MAC; one MAC per output element per filter tap.
        float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp32,
               (flo / (time_ms_fp32 * 1e9)));
    };
    run(64, 384, 384, 32, 32, 3, 1, 10);
    run(64, 384, 384, 32, 32, 5, 1, 10);
    run(64, 384, 384, 32, 32, 7, 1, 10);
    run(64, 384, 384, 32, 32, 9, 1, 10);
    run(64, 384, 384, 32, 32, 11, 1, 10);
    run(64, 384, 384, 32, 32, 13, 1, 10);
    run(64, 384, 384, 32, 32, 15, 1, 10);
    run(64, 384, 384, 32, 32, 17, 1, 10);
    run(64, 384, 384, 32, 32, 19, 1, 10);
    run(64, 384, 384, 32, 32, 21, 1, 10);
    run(64, 384, 384, 32, 32, 23, 1, 10);
    run(64, 384, 384, 32, 32, 25, 1, 10);
    run(64, 384, 384, 32, 32, 27, 1, 10);
    run(64, 384, 384, 32, 32, 29, 1, 10);
    run(64, 384, 384, 32, 32, 31, 1, 10);
}
  929. TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_DEPTHWISE_LARGE_FILTER_FP16) {
  930. CUBenchmarker<ConvolutionBackwardData> bencher{handle_cuda()};
  931. bencher.set_display(false);
  932. bencher.set_before_exec_callback(
  933. AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
  934. auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH,
  935. size_t SH, size_t nr_times) {
  936. bencher.set_dtype(0, dtype::Float16())
  937. .set_dtype(1, dtype::Float16())
  938. .set_dtype(2, dtype::Float16());
  939. param::Convolution param;
  940. param.stride_h = param.stride_w = SH;
  941. param.pad_h = param.pad_w = FH / 2;
  942. param.sparse = param::Convolution::Sparse::GROUP;
  943. bencher.set_param(param);
  944. bencher.set_times(nr_times);
  945. TensorLayout src{{N, g, IH, IW}, dtype::Float16()},
  946. filter{{g, 1, 1, FH, FH}, dtype::Float16()};
  947. TensorLayout dst;
  948. {
  949. auto&& opr = handle_cuda()->create_operator<Convolution>();
  950. opr->param() = param;
  951. opr->deduce_layout(src, filter, dst);
  952. }
  953. auto time_ms_fp16 = bencher.execl({filter, dst, src}) / nr_times;
  954. float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH;
  955. printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
  956. filter.to_string().c_str(), dst.to_string().c_str());
  957. printf("time_fp16=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp16,
  958. (flo / (time_ms_fp16 * 1e9)));
  959. };
  960. run(64, 384, 384, 32, 32, 3, 1, 10);
  961. run(64, 384, 384, 32, 32, 5, 1, 10);
  962. run(64, 384, 384, 32, 32, 7, 1, 10);
  963. run(64, 384, 384, 32, 32, 9, 1, 10);
  964. run(64, 384, 384, 32, 32, 11, 1, 10);
  965. run(64, 384, 384, 32, 32, 13, 1, 10);
  966. run(64, 384, 384, 32, 32, 15, 1, 10);
  967. run(64, 384, 384, 32, 32, 17, 1, 10);
  968. run(64, 384, 384, 32, 32, 19, 1, 10);
  969. run(64, 384, 384, 32, 32, 21, 1, 10);
  970. run(64, 384, 384, 32, 32, 23, 1, 10);
  971. run(64, 384, 384, 32, 32, 25, 1, 10);
  972. run(64, 384, 384, 32, 32, 27, 1, 10);
  973. run(64, 384, 384, 32, 32, 29, 1, 10);
  974. run(64, 384, 384, 32, 32, 31, 1, 10);
  975. }
  976. TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_BF16) {
  977. CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
  978. std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
  979. new OprProxy<ConvolutionBackwardData>{true}};
  980. size_t RUNS = 10;
  981. bench.set_proxy(proxy).set_times(RUNS);
  982. auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
  983. size_t SH, size_t PH) {
  984. bench.set_dtype(0, dtype::BFloat16())
  985. .set_dtype(1, dtype::BFloat16())
  986. .set_dtype(2, dtype::BFloat16());
  987. param::Convolution param;
  988. param.stride_h = param.stride_w = SH;
  989. param.pad_h = param.pad_w = PH;
  990. param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
  991. bench.set_param(param);
  992. bench.proxy()->target_execution_policy = {};
  993. TensorLayout src{{N, IC, IH, IW}, dtype::BFloat16()},
  994. filter{{OC, IC, FH, FH}, dtype::BFloat16()};
  995. TensorLayout dst;
  996. {
  997. auto&& opr = handle_cuda()->create_operator<Convolution>();
  998. opr->param() = param;
  999. opr->deduce_layout(src, filter, dst);
  1000. }
  1001. auto used = bench.execl({filter, dst, src}) / RUNS;
  1002. float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
  1003. printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
  1004. filter.to_string().c_str(), dst.to_string().c_str());
  1005. printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", used, (flo / (used * 1e9)));
  1006. };
  1007. run(32, 64, 3, 224, 224, 7, 2, 3);
  1008. run(32, 128, 128, 28, 28, 3, 1, 1);
  1009. run(32, 256, 256, 14, 14, 3, 1, 1);
  1010. run(32, 512, 512, 7, 7, 3, 1, 1);
  1011. run(32, 64, 64, 56, 56, 3, 1, 1);
  1012. run(32, 512, 256, 56, 56, 1, 2, 0);
  1013. run(32, 1024, 512, 28, 28, 1, 2, 0);
  1014. run(32, 2048, 1024, 14, 14, 1, 2, 0);
  1015. run(32, 512, 128, 28, 28, 1, 1, 0);
  1016. run(32, 128, 512, 28, 28, 1, 1, 0);
  1017. run(32, 1024, 256, 14, 14, 1, 1, 0);
  1018. run(32, 256, 1024, 14, 14, 1, 1, 0);
  1019. run(32, 2048, 512, 7, 7, 1, 1, 0);
  1020. run(32, 512, 2048, 7, 7, 1, 1, 0);
  1021. run(32, 256, 64, 56, 56, 1, 1, 0);
  1022. run(32, 64, 256, 56, 56, 1, 1, 0);
  1023. run(32, 128, 256, 56, 56, 1, 2, 0);
  1024. run(32, 256, 512, 28, 28, 1, 2, 0);
  1025. run(32, 512, 1024, 14, 14, 1, 2, 0);
  1026. run(32, 64, 64, 56, 56, 1, 1, 0);
  1027. }
  1028. TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_INT8_DP4A) {
  1029. CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
  1030. std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
  1031. new OprProxy<ConvolutionBackwardData>{true}};
  1032. size_t RUNS = 10;
  1033. bench.set_proxy(proxy).set_times(RUNS);
  1034. auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
  1035. size_t SH, size_t PH) {
  1036. bench.set_dtype(0, dtype::QuantizedS8{1.0f})
  1037. .set_dtype(1, dtype::QuantizedS8{1.0f})
  1038. .set_dtype(2, dtype::QuantizedS8{1.0f});
  1039. param::Convolution param;
  1040. param.format = param::Convolution::Format::NCHW4;
  1041. param.stride_h = param.stride_w = SH;
  1042. param.pad_h = param.pad_w = PH;
  1043. param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
  1044. bench.set_param(param);
  1045. bench.proxy()->target_execution_policy = {};
  1046. TensorLayout src{{N, IC / 4, IH, IW, 4}, dtype::QuantizedS8{1.0f}},
  1047. filter{{OC, IC / 4, FH, FH, 4}, dtype::QuantizedS8{1.0f}};
  1048. TensorLayout dst;
  1049. dst.dtype = dtype::QuantizedS8{1.0f};
  1050. {
  1051. auto&& opr = handle_cuda()->create_operator<Convolution>();
  1052. opr->param() = param;
  1053. opr->deduce_layout(src, filter, dst);
  1054. }
  1055. auto used = bench.execl({filter, dst, src}) / RUNS;
  1056. float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
  1057. printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
  1058. filter.to_string().c_str(), dst.to_string().c_str());
  1059. printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", used, (flo / (used * 1e9)));
  1060. };
  1061. run(64, 32, 32, 92, 180, 4, 2, 2);
  1062. run(64, 32, 32, 46, 80, 4, 2, 2);
  1063. run(16, 16, 16, 92, 180, 4, 2, 2);
  1064. run(16, 16, 16, 46, 80, 4, 2, 2);
  1065. }
  1066. TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) {
  1067. CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()};
  1068. std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
  1069. new OprProxy<ConvolutionBackwardFilter>{true}};
  1070. size_t RUNS = 10;
  1071. bench.set_proxy(proxy).set_times(RUNS);
  1072. auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
  1073. size_t SH, size_t PH) {
  1074. bench.set_dtype(0, dtype::Float32())
  1075. .set_dtype(1, dtype::Float32())
  1076. .set_dtype(2, dtype::Float32());
  1077. param::Convolution param;
  1078. param.stride_h = param.stride_w = SH;
  1079. param.pad_h = param.pad_w = PH;
  1080. param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
  1081. bench.set_param(param);
  1082. bench.proxy()->target_execution_policy.algo.reset();
  1083. TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
  1084. filter{{OC, IC, FH, FH}, dtype::Float32()};
  1085. TensorLayout dst;
  1086. {
  1087. auto&& opr = handle_cuda()->create_operator<Convolution>();
  1088. opr->param() = param;
  1089. opr->deduce_layout(src, filter, dst);
  1090. }
  1091. auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS;
  1092. src.dtype = filter.dtype = dst.dtype = dtype::Float16();
  1093. bench.proxy()->target_execution_policy.algo.reset();
  1094. bench.set_dtype(0, dtype::Float16())
  1095. .set_dtype(1, dtype::Float16())
  1096. .set_dtype(2, dtype::Float16());
  1097. auto time_ms_true_fp16 = bench.execl({src, dst, filter}) / RUNS;
  1098. param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
  1099. bench.proxy()->target_execution_policy.algo.reset();
  1100. bench.set_param(param);
  1101. auto time_ms_pseudo_fp16 = bench.execl({src, dst, filter}) / RUNS;
  1102. float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
  1103. printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
  1104. filter.to_string().c_str(), dst.to_string().c_str());
  1105. printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
  1106. "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fFLOPS\n",
  1107. time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
  1108. (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
  1109. (flo / (time_ms_pseudo_fp16 * 1e9)));
  1110. printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
  1111. time_ms_fp32 / time_ms_true_fp16,
  1112. time_ms_pseudo_fp16 / time_ms_true_fp16);
  1113. };
  1114. run(32, 64, 3, 224, 224, 7, 2, 3);
  1115. run(32, 128, 128, 28, 28, 3, 1, 1);
  1116. run(32, 256, 256, 14, 14, 3, 1, 1);
  1117. run(32, 512, 512, 7, 7, 3, 1, 1);
  1118. run(32, 64, 64, 56, 56, 3, 1, 1);
  1119. run(32, 512, 256, 56, 56, 1, 2, 0);
  1120. run(32, 1024, 512, 28, 28, 1, 2, 0);
  1121. run(32, 2048, 1024, 14, 14, 1, 2, 0);
  1122. run(32, 512, 128, 28, 28, 1, 1, 0);
  1123. run(32, 128, 512, 28, 28, 1, 1, 0);
  1124. run(32, 1024, 256, 14, 14, 1, 1, 0);
  1125. run(32, 256, 1024, 14, 14, 1, 1, 0);
  1126. run(32, 2048, 512, 7, 7, 1, 1, 0);
  1127. run(32, 512, 2048, 7, 7, 1, 1, 0);
  1128. run(32, 256, 64, 56, 56, 1, 1, 0);
  1129. run(32, 64, 256, 56, 56, 1, 1, 0);
  1130. run(32, 128, 256, 56, 56, 1, 2, 0);
  1131. run(32, 256, 512, 28, 28, 1, 2, 0);
  1132. run(32, 512, 1024, 14, 14, 1, 2, 0);
  1133. run(32, 64, 64, 56, 56, 1, 1, 0);
  1134. }
TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_FILTER_DEPTHWISE_LARGE_FILTER) {
    // Benchmarks backward-filter for a depthwise convolution with a large
    // (31x31) kernel, pinned to cuDNN's FFT backward-filter algorithm.
    CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    // Require this exact cuDNN algo; the trailing version suffix must match
    // the algo-name convention used elsewhere in the test framework.
    bench.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
            "CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFTv7.6.3"));
    // NOTE(review): the OC parameter is never read inside the lambda (the
    // depthwise shapes are fully determined by g) — candidate for cleanup.
    auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        // SAME-style padding from the filter size; the PH argument itself is
        // not used here.
        param.pad_h = param.pad_w = FH / 2;
        param.sparse = param::Convolution::Sparse::GROUP;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        // Depthwise: one input channel and one output channel per group.
        TensorLayout src{{N, g, IH, IW}, dtype::Float32()},
                filter{{g, 1, 1, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            // Deduce the forward output layout; used as `diff` below.
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        // Backward-filter tensor order: {src, diff, grad}.
        auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS;
        // 2 ops per MAC; one MAC per output element per filter tap.
        float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp32,
               (flo / (time_ms_fp32 * 1e9)));
    };
    run(64, 384, 384, 32, 32, 31, 1, 15);
}
  1171. #endif
  1172. #undef CUDNN_VERSION_STRING
  1173. #undef V
  1174. #undef V1
  1175. } // namespace test
  1176. } // namespace megdnn
  1177. // vim: syntax=cpp.doxygen