/**
 * \file dnn/test/cuda/convolution.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "test/common/convolution.h"
#include "megdnn/dtype.h"
#include "megdnn/opr_param_defs.h"
#include "megdnn/oprs.h"
#include "src/cuda/utils.h"
#include "test/common/accuracy_shake_checker.h"
#include "test/common/checker.h"
#include "test/common/rng.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h"
#define V1(x) #x
#define V(x)  V1(x)
#define CUDNN_VERSION_STRING \
    "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
namespace megdnn {
namespace test {

TEST_F(CUDA, CONVOLUTION_8X8X32) {
    if (!cuda::is_compute_capability_required(6, 1)) {
        printf("Skip CUDA.CONVOLUTION_8X8X32 test as current device "
               "doesn't support\n");
        return;
    }
    using namespace convolution;
    std::vector<TestArg> args;
    {
        auto v = get_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    {
        auto v = get_dilated_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    {
        auto v = get_chanwise_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    Checker<ConvolutionForward> checker(handle_cuda());
    UniformIntRNG rng(-4, 4);
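    // Small int8 inputs keep the int32 accumulator far from saturation, so a
    // mismatch indicates a kernel bug rather than overflow.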
    for (auto arg : args) {
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        checker.set_dtype(0, dtype::Int8())
                .set_dtype(1, dtype::Int8())
                .set_dtype(2, dtype::Int32())
                .set_param(arg.param)
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .execs({arg.src, arg.filter, {}});
    }
}

TEST_F(CUDA, CONVOLUTION_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionForward> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
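        // Drawing inputs around 1/sqrt(fan-in) keeps each accumulated output
        // near O(1), so the reduced-precision runs below stay well inside
        // fp16/bf16 range.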
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        checker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        checker.set_dtype(0, dtype::BFloat16())
                .set_dtype(1, dtype::BFloat16())
                .set_dtype(2, dtype::BFloat16())
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}

TEST_F(CUDA, CONV_FORWARD_MATMUL_NCHW4) {
    if (!cuda::is_compute_capability_required(6, 1))
        return;
    using namespace convolution;
    Checker<Convolution> checker(handle_cuda());
    UniformIntRNG int_rng{-127, 127};
    Convolution::Param param;
    param.format = Convolution::Param::Format::NCHW4;
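    // NCHW4 packs channels four at a time into the innermost dimension
    // (N, C/4, H, W, 4), the layout the int8 CUDA kernels operate on.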
    checker.set_dtype(0, dtype::QuantizedS8(0.132f))
            .set_dtype(1, dtype::QuantizedS8(0.0239f))
            .set_dtype(2, dtype::QuantizedS32(0.132f * 0.0239f))
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_param(param);
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
                              "MATMUL8X8X32", {})
                              .c_str(),
                      {}}}}));
    param.sparse = Convolution::Param::Sparse::DENSE;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    checker.set_param(param);
    checker.exec({{8, 4, 10, 10, 4}, {16, 4, 3, 3, 4}, {}});
    checker.exec({{1, 4, 2, 2, 4}, {16, 4, 3, 3, 4}, {}});
    checker.exec({{8, 64, 12, 12, 4}, {256, 64, 3, 3, 4}, {}});
}

TEST_F(CUDA, CONVOLUTION_1X1_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_1x1_args();
    Checker<ConvolutionForward> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_1x1_args();
    Benchmarker<ConvolutionForward> marker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        marker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        if (!cuda::is_compute_capability_required(6, 0)) {
            src.dtype = dst.dtype = filter.dtype = dtype::Float16();
            checker.set_rng(0, &rng)
                    .set_rng(1, &rng)
                    .set_epsilon(1e-1)
                    .set_param(arg.param)
                    .exec(TensorLayoutArray{filter, dst, src});
            arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
            checker.set_rng(0, &rng)
                    .set_rng(1, &rng)
                    .set_epsilon(1e-1)
                    .set_param(arg.param)
                    .exec(TensorLayoutArray{filter, dst, src});
        }
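        // The "DATD" spelling below is the literal name this bfloat16
        // algorithm is registered under; AlgoChecker looks it up as-is.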
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(ExecutionPolicyAlgoName{
                        "CONVOLUTION_BACKWARD_DATD_BFLOAT16",
                        {{"MATMUL", {{"CUBLAS", {}}}}}}));
        src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        checker.reset_before_exec_callback();
        checker.opr()->execution_policy() = {};
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FP16_CUDNN7_5) {
    // Algo CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 with TensorCore operations
    // produces incorrect results (nvidia may have fixed this in later
    // releases). A failing case:
    // inp={2x8x18x18}, kern={8x8x2x2}, pad_h=pad_w=2, stride_h=stride_w=2,
    // dtype=float16
    using namespace convolution;
    std::vector<TestArg> args = get_args_cudnn_5_1_backward();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        scale = std::max(scale, 1.f);
        UniformFloatRNG rng(scale, 2 * scale);
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_NHWC) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_CUDNN) {
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("CUDNN_CONVOLUTION"));
    //! noncontiguous case
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_param(param).execl(TensorLayoutArray{
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
        });
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_MATMUL) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
    //! noncontiguous case
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_param(param).execl(TensorLayoutArray{
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
        });
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW4_DP4A) {
    if (!cuda::is_compute_capability_required(6, 1)) {
        printf("Skip CUDA.CONVOLUTION_BACKWARD_DATA_INT8_NCHW4_DP4A test as "
               "current device doesn't support\n");
        return;
    }
    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nchw4_conv_bwd_data();
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        std::string to_string() {
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m, threadblock_n,
                    threadblock_k, warp_m, warp_n, warp_k, stage);
        }
    };
    std::vector<AlgoParam> all_params;
    all_params.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8, 2});
    all_params.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2});
    all_params.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1});
    all_params.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2});
    for (auto algo_param : all_params) {
        Checker<ConvolutionBackwardData> checker(handle_cuda());
        std::string algo_name(ssprintf(
                "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM%s", algo_param.to_string().c_str()));
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(algo_name.c_str()));
        checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
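        // An epsilon just above 1 tolerates off-by-one requantized outputs,
        // while max_avg_error still bounds the aggregate deviation.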
        for (auto&& arg : args) {
            UniformIntRNG rng(-3, 3);
            auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
            auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
            TensorLayout dst;
            dst.dtype = dtype::QuantizedS8{1.2f};
            {
                auto opr = handle_cuda()->create_operator<Convolution>();
                opr->param() = arg.param;
                opr->deduce_layout(src, filter, dst);
            }
            checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                    TensorLayoutArray{filter, dst, src});
        }
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW_DP4A) {
    if (!cuda::is_compute_capability_required(6, 1)) {
        printf("Skip CUDA.CONVOLUTION_BACKWARD_DATA_INT8_NCHW_DP4A test as "
               "current device doesn't support\n");
        return;
    }
    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nchw_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("INT8_NCHW_DOTPROD_IMPLICIT_GEMM"));
    checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
    for (auto&& arg : args) {
        UniformIntRNG rng(-3, 3);
        auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
        auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
        TensorLayout dst;
        dst.dtype = dtype::QuantizedS8{1.2f};
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                TensorLayoutArray{filter, dst, src});
    }
}

#if CUDA_VERSION >= 10020
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NHWC_IMMA) {
    if (!cuda::is_compute_capability_required(7, 5)) {
        printf("Skip CUDA.CONVOLUTION_BACKWARD_DATA_INT8_NHWC_IMMA test as "
               "current device doesn't support\n");
        return;
    }
    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nhwc_conv_bwd_data();
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        int access_size;
        std::string to_string() {
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_%dstage_%d", threadblock_m, threadblock_n,
                    threadblock_k, warp_m, warp_n, warp_k, stage, access_size);
        }
    };
    std::vector<AlgoParam> all_params;
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 4});
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 8});
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 16});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 4});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 8});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 16});
    for (auto algo_param : all_params) {
        Checker<ConvolutionBackwardData> checker(handle_cuda());
        std::string algo_name(ssprintf(
                "INT8_NHWC_IMMA_IMPLICIT_GEMM%s", algo_param.to_string().c_str()));
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(algo_name.c_str()));
        checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
        for (auto&& arg : args) {
            UniformIntRNG rng(-3, 3);
            auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
            auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
            TensorLayout dst;
            dst.dtype = dtype::QuantizedS8{1.2f};
            {
                auto opr = handle_cuda()->create_operator<Convolution>();
                opr->param() = arg.param;
                opr->deduce_layout(src, filter, dst);
            }
            checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                    TensorLayoutArray{filter, dst, src});
        }
    }
}
#endif  // CUDA_VERSION >= 10020

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5) {
    // BRAIN-481 fails on compute capability 7.0; remove the following if
    // statement once cudnn fixes the problem.
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    std::vector<TestArg> args = get_args_cudnn_7_5_failures();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        scale = std::max(scale, 1.f);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    bool f16_checked = false;
    for (auto&& arg : args) {
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        float scale = 1.0f / sqrt(dst[2] * dst[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        // reducing over a large f16 array may introduce significant error
        if (dst.total_nr_elems() >= 1000 && f16_checked)
            continue;
        f16_checked = true;
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardFilter>(ExecutionPolicyAlgoName{
                        "CONVOLUTION_BACKWARD_FILTER_BFLOAT16",
                        {{"MATMUL", {{"CUBLAS", {}}}}}}));
        src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        checker.reset_before_exec_callback();
        checker.opr()->execution_policy() = {};
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_MATMUL) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
    for (auto&& arg : args) {
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        float scale = 1.0f / sqrt(dst[2] * dst[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
    }
    //! noncontiguous case
    {
        NormalRNG default_rng;
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_param(param)
                .execl(TensorLayoutArray{
                        {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                        {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                        {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()}});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_CUDNN) {
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardFilter>("CUDNN_CONVOLUTION"));
    //! noncontiguous case
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_param(param).execl(TensorLayoutArray{
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()}});
    }
}

TEST_F(CUDA, CONV_CONFIG_COMBINATIONS) {
    auto eps_getter = [](bool f16, int stage, const char* name) -> float {
        if (f16) {
            return stage == 2 ? 0.5 : 0.2;
        }
        if (strstr(name, "WINOGRAD_NONFUSED"))
            return 0.3;
        return 1e-3;
    };
    convolution::test_conv_config_combinations(
            2, handle_cuda(), false, true, true, eps_getter, true);
    convolution::test_conv_config_combinations(
            3, handle_cuda(), false, true, true, eps_getter, true);
    convolution::test_conv_config_combinations(
            5, handle_cuda(), false, true, true, eps_getter, true);
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_1) {
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            "CUDNN_CONVOLUTION_BWD_DATA_ALGO_1" CUDNN_VERSION_STRING));
    NormalRNG default_rng;
    TensorShape s_filter = TensorShape{8, 8, 2, 2}, s_src = TensorShape{2, 8, 18, 18};
    float scale = 1.0f / sqrt(s_filter[0] * s_filter[2] * s_filter[3]);
    UniformFloatRNG rng(scale, 2 * scale);
    auto src = TensorLayout(s_src, dtype::Float16());
    auto filter = TensorLayout(s_filter, dtype::Float16());
    TensorLayout dst;
    param::Convolution param;
    param.pad_h = param.pad_w = 2;
    param.stride_h = param.stride_w = 2;
    {
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(src, filter, dst);
    }
    src.dtype = dst.dtype = filter.dtype = dtype::Float16();
    param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
    checker.set_rng(0, &rng).set_rng(1, &rng).set_epsilon(0.2).set_param(param).exec(
            TensorLayoutArray{filter, dst, src});
}

#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, CONV_FWD_BENCHMARK) {
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t SH = 1,
                   size_t SW = 1, size_t FH = 1, size_t FW = 1, size_t PH = 0,
                   size_t PW = 0, bool fp16io_c32 = false) {
        auto benchmarker = Benchmarker<ConvolutionForward>(handle_cuda());
        benchmarker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        ConvolutionForward::Param param;
        param.stride_h = SH;
        param.stride_w = SW;
        param.pad_h = PH;
        param.pad_w = PW;
        if (fp16io_c32) {
            param.compute_mode = ConvolutionForward::Param::ComputeMode::FLOAT32;
        }
        benchmarker.set_param(param);
        std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
                new OprProxy<ConvolutionForward>{true}};
        benchmarker.set_proxy(proxy);
        size_t OH = (IH - FH + 2 * PH) / SH + 1;
        size_t OW = (IW - FW + 2 * PW) / SW + 1;
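        // Standard convolution output size: O = (I - F + 2 * P) / S + 1.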
        auto time =
                benchmarker.execs({{N, IC, IH, IW}, {OC, IC, FH, FW}, {N, OC, OH, OW}});
        time /= 1000.0 * 10.0;
        auto flo = (double)N * OC * IC * OH * OW * FH * FW * 2;
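        // Each output element costs IC * FH * FW multiply-accumulates; a MAC
        // is counted as 2 FLOPs, hence the trailing factor of 2.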
        auto flops = flo / time / 1e12;
        printf("comp_type %s: ", fp16io_c32 ? "32" : "16");
        printf("%.3fG FLO, flops %.3fTFLOPS\n", flo / 1e9, flops);
    };
    run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, false);
    run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, true);
}

TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) {
    CUBenchmarker<ConvolutionForward> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({src, filter, dst}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({src, filter, dst}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({src, filter, dst}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
               "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fTFLOPS\n",
               time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
               (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
               (flo / (time_ms_pseudo_fp16 * 1e9)));
        printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
               time_ms_fp32 / time_ms_true_fp16,
               time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}

TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({filter, dst, src}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({filter, dst, src}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
               "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fTFLOPS\n",
               time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
               (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
               (flo / (time_ms_pseudo_fp16 * 1e9)));
        printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
               time_ms_fp32 / time_ms_true_fp16,
               time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_CHANWISE_SMALL_FEAT_LARGE_FILTER) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = FH / 2;
        param.sparse = param::Convolution::Sparse::GROUP;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, g, IH, IW}, dtype::Float32()},
                filter{{g, 1, 1, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH;
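        // Channelwise convolution: each output channel reads exactly one input
        // channel, so the MAC count scales with g rather than with OC * IC.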
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp32,
               (flo / (time_ms_fp32 * 1e9)));
    };
    run(64, 384, 384, 32, 32, 31, 1, 15);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_BF16) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::BFloat16())
                .set_dtype(1, dtype::BFloat16())
                .set_dtype(2, dtype::BFloat16());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy = {};
        TensorLayout src{{N, IC, IH, IW}, dtype::BFloat16()},
                filter{{OC, IC, FH, FH}, dtype::BFloat16()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto used = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_bf16=%.2fms, flops=%.3fTFLOPS\n", used, (flo / (used * 1e9)));
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_INT8_DP4A) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::QuantizedS8{1.0f})
                .set_dtype(1, dtype::QuantizedS8{1.0f})
                .set_dtype(2, dtype::QuantizedS8{1.0f});
        param::Convolution param;
        param.format = param::Convolution::Format::NCHW4;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy = {};
        TensorLayout src{{N, IC / 4, IH, IW, 4}, dtype::QuantizedS8{1.0f}},
                filter{{OC, IC / 4, FH, FH, 4}, dtype::QuantizedS8{1.0f}};
        TensorLayout dst;
        dst.dtype = dtype::QuantizedS8{1.0f};
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto used = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_int8=%.2fms, flops=%.3fTFLOPS\n", used, (flo / (used * 1e9)));
    };
    run(64, 32, 32, 92, 180, 4, 2, 2);
    run(64, 32, 32, 46, 80, 4, 2, 2);
    run(16, 16, 16, 92, 180, 4, 2, 2);
    run(16, 16, 16, 46, 80, 4, 2, 2);
}

TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) {
    CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({src, dst, filter}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({src, dst, filter}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
               "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fTFLOPS\n",
               time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
               (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
               (flo / (time_ms_pseudo_fp16 * 1e9)));
        printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
               time_ms_fp32 / time_ms_true_fp16,
               time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_FILTER_CHANWISE_SMALL_FEAT_LARGE_FILTER) {
    CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    bench.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
            "CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFTv7.6.3"));
    auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = FH / 2;
        param.sparse = param::Convolution::Sparse::GROUP;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, g, IH, IW}, dtype::Float32()},
                filter{{g, 1, 1, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS;
        float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp32,
               (flo / (time_ms_fp32 * 1e9)));
    };
    run(64, 384, 384, 32, 32, 31, 1, 15);
}

#endif  // MEGDNN_WITH_BENCHMARK

#undef CUDNN_VERSION_STRING
#undef V
#undef V1

}  // namespace test
}  // namespace megdnn

// vim: syntax=cpp.doxygen