

/**
 * \file dnn/test/cuda/convolution.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "megdnn/dtype.h"
#include "megdnn/oprs.h"
#include "megdnn/opr_param_defs.h"
#include "test/cuda/fixture.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/common/checker.h"
#include "test/common/convolution.h"
#include "test/common/rng.h"
#include "test/cuda/benchmark.h"
#include "src/cuda/utils.h"
#include "test/common/accuracy_shake_checker.h"
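// Stringify the cuDNN version, e.g. "v7.6.5"; appended to cuDNN algorithm
// names below to select the version-specific variant.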
#define V1(x) #x
#define V(x) V1(x)
#define CUDNN_VERSION_STRING \
    "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
namespace megdnn {
namespace test {
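// Int8 NHWC (8x8x32) convolution requires dp4a, i.e. compute capability >= 6.1.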
TEST_F(CUDA, CONVOLUTION_8X8X32) {
    if (!cuda::is_compute_capability_required(6, 1)) {
        printf("Skip CUDA.CONVOLUTION_8X8X32 test as current device "
               "doesn't support it\n");
        return;
    }
    using namespace convolution;
    std::vector<TestArg> args;
    {
        auto v = get_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    {
        auto v = get_dilated_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    {
        auto v = get_chanwise_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    Checker<ConvolutionForward> checker(handle_cuda());
    UniformIntRNG rng(-4, 4);
    for (auto arg : args) {
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        checker.set_dtype(0, dtype::Int8())
                .set_dtype(1, dtype::Int8())
                .set_dtype(2, dtype::Int32())
                .set_param(arg.param)
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .execs({arg.src, arg.filter, {}});
    }
}
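// Forward accuracy in fp32, true fp16, fp16 with fp32 accumulation
// (ComputeMode::FLOAT32) and bf16, each with its own epsilon.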
TEST_F(CUDA, CONVOLUTION_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionForward> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
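        // Draw inputs from [scale, 2*scale) with scale ~ 1/sqrt(fan_in), so
        // accumulated outputs stay O(1) and the epsilons remain meaningful.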
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        checker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        checker.set_dtype(0, dtype::BFloat16())
                .set_dtype(1, dtype::BFloat16())
                .set_dtype(2, dtype::BFloat16())
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}
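// Quantized int8 NCHW4 forward; the DEFAULT convolution algo is forced to run
// through the MATMUL8X8X32 conv-bias kernel underneath.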
TEST_F(CUDA, CONV_FORWARD_MATMUL_NCHW4) {
    if (!cuda::is_compute_capability_required(6, 1))
        return;
    using namespace convolution;
    Checker<Convolution> checker(handle_cuda());
    UniformIntRNG int_rng{-127, 127};
    Convolution::Param param;
    param.format = Convolution::Param::Format::NCHW4;
    checker.set_dtype(0, dtype::QuantizedS8(0.132f))
            .set_dtype(1, dtype::QuantizedS8(0.0239f))
            .set_dtype(2, dtype::QuantizedS32(0.132f * 0.0239f))
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_param(param);
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
                              "MATMUL8X8X32", {})
                              .c_str(),
                      {}}}}));
    param.sparse = Convolution::Param::Sparse::DENSE;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    checker.set_param(param);
    checker.exec({{8, 4, 10, 10, 4}, {16, 4, 3, 3, 4}, {}});
    checker.exec({{1, 4, 2, 2, 4}, {16, 4, 3, 3, 4}, {}});
    checker.exec({{8, 64, 12, 12, 4}, {256, 64, 3, 3, 4}, {}});
}
TEST_F(CUDA, CONVOLUTION_1X1_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_1x1_args();
    Checker<ConvolutionForward> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}
TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_1x1_args();
    Benchmarker<ConvolutionForward> marker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        marker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}
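// Backward data: the output layout is deduced from the forward shapes, then
// the checker runs with (filter, dst, src), i.e. dst acts as the gradient of
// the forward output and src receives the gradient w.r.t. the input.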
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        if (!cuda::is_compute_capability_required(6, 0)) {
            src.dtype = dst.dtype = filter.dtype = dtype::Float16();
            checker.set_rng(0, &rng)
                    .set_rng(1, &rng)
                    .set_epsilon(1e-1)
                    .set_param(arg.param)
                    .exec(TensorLayoutArray{filter, dst, src});
            arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
            checker.set_rng(0, &rng)
                    .set_rng(1, &rng)
                    .set_epsilon(1e-1)
                    .set_param(arg.param)
                    .exec(TensorLayoutArray{filter, dst, src});
        }
        // note: "DATD" (sic) matches the algorithm name registered by the
        // library
        checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
                ExecutionPolicyAlgoName{"CONVOLUTION_BACKWARD_DATD_BFLOAT16",
                                        {{"MATMUL", {{"CUBLAS", {}}}}}}));
        src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        checker.reset_before_exec_callback();
        checker.opr()->execution_policy() = {};
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FP16_CUDNN7_5) {
    // On cuDNN 7.5, algo CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 with TensorCore
    // operations produced incorrect results; NVIDIA may have fixed this since.
    // The failing case was:
    // inp={2x8x18x18}, kern={8x8x2x2}, pad_h=pad_w=2, stride_h=stride_w=2,
    // dtype=float16
    using namespace convolution;
    std::vector<TestArg> args = get_args_cudnn_5_1_backward();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        scale = std::max(scale, 1.f);
        UniformFloatRNG rng(scale, 2 * scale);
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_NHWC) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}
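// In the "noncontiguous" cases below only the batch stride is noncontiguous:
// 1568 = 2 * (16 * 7 * 7), i.e. each sample is followed by a gap in memory.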
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_CUDNN) {
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("CUDNN_CONVOLUTION"));
    //! noncontiguous case
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_param(param).execl(TensorLayoutArray{
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
        });
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_MATMUL) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
    //! noncontiguous case
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_param(param).execl(TensorLayoutArray{
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
        });
    }
}
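// Int8 NCHW4 backward data via dp4a implicit GEMM; each AlgoParam selects a
// threadblock/warp tiling, which is also encoded in the algorithm name.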
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW4_DP4A) {
    if (!cuda::is_compute_capability_required(6, 1)) {
        printf("Skip CUDA.CONVOLUTION_BACKWARD_DATA_INT8_NCHW4_DP4A test as "
               "current device doesn't support it\n");
        return;
    }
    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nchw4_conv_bwd_data();
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        std::string to_string() {
            return ssprintf("_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m,
                            threadblock_n, threadblock_k, warp_m, warp_n,
                            warp_k, stage);
        }
    };
    std::vector<AlgoParam> all_params;
    all_params.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8, 2});
    all_params.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2});
    all_params.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1});
    all_params.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2});
    for (auto algo_param : all_params) {
        Checker<ConvolutionBackwardData> checker(handle_cuda());
        std::string algo_name(ssprintf("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM%s",
                                       algo_param.to_string().c_str()));
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(algo_name.c_str()));
        checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
        for (auto&& arg : args) {
            UniformIntRNG rng(-3, 3);
            auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
            auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
            TensorLayout dst;
            dst.dtype = dtype::QuantizedS8{1.2f};
            {
                auto opr = handle_cuda()->create_operator<Convolution>();
                opr->param() = arg.param;
                opr->deduce_layout(src, filter, dst);
            }
            checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                    TensorLayoutArray{filter, dst, src});
        }
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW_DP4A) {
    if (!cuda::is_compute_capability_required(6, 1)) {
        printf("Skip CUDA.CONVOLUTION_BACKWARD_DATA_INT8_NCHW_DP4A test as "
               "current device doesn't support it\n");
        return;
    }
    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nchw_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            "INT8_NCHW_DOTPROD_IMPLICIT_GEMM"));
    checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
    for (auto&& arg : args) {
        UniformIntRNG rng(-3, 3);
        auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
        auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
        TensorLayout dst;
        dst.dtype = dtype::QuantizedS8{1.2f};
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                TensorLayoutArray{filter, dst, src});
    }
}
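// The NHWC IMMA algos need CUDA >= 10.2 at build time and tensor cores
// (compute capability >= 7.5) at runtime.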
#if CUDA_VERSION >= 10020
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NHWC_IMMA) {
    if (!cuda::is_compute_capability_required(7, 5)) {
        printf("Skip CUDA.CONVOLUTION_BACKWARD_DATA_INT8_NHWC_IMMA test as "
               "current device doesn't support it\n");
        return;
    }
    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nhwc_conv_bwd_data();
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        int access_size;
        std::string to_string() {
            return ssprintf("_%dX%dX%d_%dX%dX%d_%dstage_%d", threadblock_m,
                            threadblock_n, threadblock_k, warp_m, warp_n,
                            warp_k, stage, access_size);
        }
    };
    std::vector<AlgoParam> all_params;
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 4});
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 8});
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 16});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 4});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 8});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 16});
    for (auto algo_param : all_params) {
        Checker<ConvolutionBackwardData> checker(handle_cuda());
        std::string algo_name(ssprintf("INT8_NHWC_IMMA_IMPLICIT_GEMM%s",
                                       algo_param.to_string().c_str()));
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(algo_name.c_str()));
        checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
        for (auto&& arg : args) {
            UniformIntRNG rng(-3, 3);
            auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
            auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
            TensorLayout dst;
            dst.dtype = dtype::QuantizedS8{1.2f};
            {
                auto opr = handle_cuda()->create_operator<Convolution>();
                opr->param() = arg.param;
                opr->deduce_layout(src, filter, dst);
            }
            checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                    TensorLayoutArray{filter, dst, src});
        }
    }
}
#endif
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5) {
    // BRAIN-481: fails on compute capability 7.0; remove the following if
    // statement once cuDNN fixes the problem.
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    std::vector<TestArg> args = get_args_cudnn_7_5_failures();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        scale = std::max(scale, 1.f);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}
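// Backward filter: the checker runs with (src, dst, filter), i.e. the
// gradient w.r.t. the filter.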
TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    bool f16_checked = false;
    for (auto&& arg : args) {
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        float scale = 1.0f / sqrt(dst[2] * dst[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        // reducing over a large f16 array may introduce significant error, so
        // check f16 only once on a small case
        if (dst.total_nr_elems() >= 1000 && f16_checked)
            continue;
        f16_checked = true;
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
                ExecutionPolicyAlgoName{"CONVOLUTION_BACKWARD_FILTER_BFLOAT16",
                                        {{"MATMUL", {{"CUBLAS", {}}}}}}));
        src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        checker.reset_before_exec_callback();
        checker.opr()->execution_policy() = {};
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_MATMUL) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
    for (auto&& arg : args) {
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        float scale = 1.0f / sqrt(dst[2] * dst[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
    }
    //! noncontiguous case
    {
        NormalRNG default_rng;
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_param(param)
                .execl(TensorLayoutArray{
                        {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                        {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                        {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()}});
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_CUDNN) {
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardFilter>("CUDNN_CONVOLUTION"));
    //! noncontiguous case
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_param(param).execl(TensorLayoutArray{
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()}});
    }
}
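// Sweep over conv configurations; epsilons are loosened for fp16 (0.2, or 0.5
// for two-stage runs) and for WINOGRAD_NONFUSED algorithms (0.3).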
TEST_F(CUDA, CONV_CONFIG_COMBINATIONS) {
    auto eps_getter = [](bool f16, int stage, const char* name) -> float {
        if (f16) {
            return stage == 2 ? 0.5 : 0.2;
        }
        if (strstr(name, "WINOGRAD_NONFUSED"))
            return 0.3;
        return 1e-3;
    };
    convolution::test_conv_config_combinations(2, handle_cuda(), false, true,
                                               true, eps_getter, true);
    convolution::test_conv_config_combinations(3, handle_cuda(), false, true,
                                               true, eps_getter, true);
    convolution::test_conv_config_combinations(5, handle_cuda(), false, true,
                                               true, eps_getter, true);
}
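// Regression test for the fp16 CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 case
// documented above; the cuDNN version suffix selects the matching algo name.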
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_1) {
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            "CUDNN_CONVOLUTION_BWD_DATA_ALGO_1" CUDNN_VERSION_STRING));
    NormalRNG default_rng;
    TensorShape s_filter = TensorShape{8, 8, 2, 2},
                s_src = TensorShape{2, 8, 18, 18};
    float scale = 1.0f / sqrt(s_filter[0] * s_filter[2] * s_filter[3]);
    UniformFloatRNG rng(scale, 2 * scale);
    auto src = TensorLayout(s_src, dtype::Float16());
    auto filter = TensorLayout(s_filter, dtype::Float16());
    TensorLayout dst;
    param::Convolution param;
    param.pad_h = param.pad_w = 2;
    param.stride_h = param.stride_w = 2;
    {
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(src, filter, dst);
    }
    src.dtype = dst.dtype = filter.dtype = dtype::Float16();
    param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_epsilon(0.2)
            .set_param(param)
            .exec(TensorLayoutArray{filter, dst, src});
}
#if MEGDNN_WITH_BENCHMARK
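// The benchmarks below report TFLOPS computed from
//   flo = 2 * N * OC * IC * OH * OW * FH * FW
// divided by the measured time per run.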
TEST_F(CUDA, CONV_FWD_BENCHMARK) {
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
                   size_t SH = 1, size_t SW = 1, size_t FH = 1, size_t FW = 1,
                   size_t PH = 0, size_t PW = 0, bool fp16io_c32 = false) {
        auto benchmarker = Benchmarker<ConvolutionForward>(handle_cuda());
        benchmarker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        ConvolutionForward::Param param;
        param.stride_h = SH;
        param.stride_w = SW;
        param.pad_h = PH;
        param.pad_w = PW;
        if (fp16io_c32) {
            param.compute_mode = ConvolutionForward::Param::ComputeMode::FLOAT32;
        }
        benchmarker.set_param(param);
        std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
                new OprProxy<ConvolutionForward>{true}};
        benchmarker.set_proxy(proxy);
        size_t OH = (IH - FH + 2 * PH) / SH + 1;
        size_t OW = (IW - FW + 2 * PW) / SW + 1;
        auto time = benchmarker.execs(
                {{N, IC, IH, IW}, {OC, IC, FH, FW}, {N, OC, OH, OW}});
        time /= 1000.0 * 10.0;
        auto flo = (double)N * OC * IC * OH * OW * FH * FW * 2;
        auto flops = flo / time / 1e12;
        printf("comp_type %s: ", fp16io_c32 ? "32" : "16");
        printf("%.3fG FLO, flops %.3fTFLOPS\n", flo / 1e9, flops);
    };
    run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, false);
    run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, true);
}
TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) {
    CUBenchmarker<ConvolutionForward> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
                   size_t FH, size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({src, filter, dst}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({src, filter, dst}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({src, filter, dst}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
               "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fTFLOPS\n",
               time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
               (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
               (flo / (time_ms_pseudo_fp16 * 1e9)));
        printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
               time_ms_fp32 / time_ms_true_fp16,
               time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}
TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
                   size_t FH, size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({filter, dst, src}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({filter, dst, src}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
               "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fTFLOPS\n",
               time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
               (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
               (flo / (time_ms_pseudo_fp16 * 1e9)));
        printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
               time_ms_fp32 / time_ms_true_fp16,
               time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}
TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_BF16) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
                   size_t FH, size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::BFloat16())
                .set_dtype(1, dtype::BFloat16())
                .set_dtype(2, dtype::BFloat16());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy = {};
        TensorLayout src{{N, IC, IH, IW}, dtype::BFloat16()},
                filter{{OC, IC, FH, FH}, dtype::BFloat16()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto used = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_bf16=%.2fms, flops=%.3fTFLOPS\n", used,
               (flo / (used * 1e9)));
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}
TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_INT8_DP4A) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
                   size_t FH, size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::QuantizedS8{1.0f})
                .set_dtype(1, dtype::QuantizedS8{1.0f})
                .set_dtype(2, dtype::QuantizedS8{1.0f});
        param::Convolution param;
        param.format = param::Convolution::Format::NCHW4;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy = {};
        TensorLayout src{{N, IC / 4, IH, IW, 4}, dtype::QuantizedS8{1.0f}},
                filter{{OC, IC / 4, FH, FH, 4}, dtype::QuantizedS8{1.0f}};
        TensorLayout dst;
        dst.dtype = dtype::QuantizedS8{1.0f};
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto used = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_int8=%.2fms, flops=%.3fTFLOPS\n", used,
               (flo / (used * 1e9)));
    };
    run(64, 32, 32, 92, 180, 4, 2, 2);
    run(64, 32, 32, 46, 80, 4, 2, 2);
    run(16, 16, 16, 92, 180, 4, 2, 2);
    run(16, 16, 16, 46, 80, 4, 2, 2);
}
TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) {
    CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
                   size_t FH, size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({src, dst, filter}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({src, dst, filter}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
               "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fTFLOPS\n",
               time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
               (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
               (flo / (time_ms_pseudo_fp16 * 1e9)));
        printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
               time_ms_fp32 / time_ms_true_fp16,
               time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}
#endif
#undef CUDNN_VERSION_STRING
#undef V
#undef V1
} // namespace test
} // namespace megdnn
// vim: syntax=cpp.doxygen
