
convolution.cpp

/**
 * \file dnn/test/cuda/convolution.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "megdnn/dtype.h"
#include "megdnn/oprs.h"
#include "megdnn/opr_param_defs.h"
#include "test/cuda/fixture.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/common/checker.h"
#include "test/common/convolution.h"
#include "test/common/rng.h"
#include "test/cuda/benchmark.h"
#include "src/cuda/utils.h"
#include "test/common/accuracy_shake_checker.h"
#define V1(x) #x
#define V(x) V1(x)
#define CUDNN_VERSION_STRING \
    "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
namespace megdnn {
namespace test {
TEST_F(CUDA, CONVOLUTION_8X8X32) {
    if (!cuda::is_compute_capability_required(6, 1)) {
        printf("Skip CUDA.CONVOLUTION_8X8X32 test as current device "
               "doesn't support\n");
        return;
    }
    using namespace convolution;
    std::vector<TestArg> args;
    {
        auto v = get_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    {
        auto v = get_dilated_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    {
        auto v = get_chanwise_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    Checker<ConvolutionForward> checker(handle_cuda());
    UniformIntRNG rng(-4, 4);
    for (auto arg : args) {
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        checker.set_dtype(0, dtype::Int8())
                .set_dtype(1, dtype::Int8())
                .set_dtype(2, dtype::Int32())
                .set_param(arg.param)
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .execs({arg.src, arg.filter, {}});
    }
}
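// The forward test below runs every case four ways: true fp32, true fp16,
// fp16 I/O with FLOAT32 compute ("pseudo fp16"), and bf16; the epsilon is
// loosened from 1e-3 to 1e-1 for the reduced-precision runs.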
TEST_F(CUDA, CONVOLUTION_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionForward> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        checker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        checker.set_dtype(0, dtype::BFloat16())
                .set_dtype(1, dtype::BFloat16())
                .set_dtype(2, dtype::BFloat16())
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}
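// NCHW4 packs channels in groups of four, so both src and filter in the test
// below are 5-D ({N, C/4, H, W, 4}); this is the layout consumed by the int8
// dp4a kernels.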
TEST_F(CUDA, CONV_FORWARD_MATMUL_NCHW4) {
    if (!cuda::is_compute_capability_required(6, 1))
        return;
    using namespace convolution;
    Checker<Convolution> checker(handle_cuda());
    UniformIntRNG int_rng{-127, 127};
    Convolution::Param param;
    param.format = Convolution::Param::Format::NCHW4;
    checker.set_dtype(0, dtype::QuantizedS8(0.132f))
            .set_dtype(1, dtype::QuantizedS8(0.0239f))
            .set_dtype(2, dtype::QuantizedS32(0.132f * 0.0239f))
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_param(param);
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
                              "MATMUL8X8X32", {})
                              .c_str(),
                      {}}}}));
    param.sparse = Convolution::Param::Sparse::DENSE;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    checker.set_param(param);
    checker.exec({{8, 4, 10, 10, 4}, {16, 4, 3, 3, 4}, {}});
    checker.exec({{1, 4, 2, 2, 4}, {16, 4, 3, 3, 4}, {}});
    checker.exec({{8, 64, 12, 12, 4}, {256, 64, 3, 3, 4}, {}});
}
TEST_F(CUDA, CONVOLUTION_1X1_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_1x1_args();
    Checker<ConvolutionForward> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}
TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_1x1_args();
    Benchmarker<ConvolutionForward> marker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        marker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}
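// ConvolutionBackwardData takes its operands as {filter, diff, grad}: the
// forward dst layout is deduced first and then passed back in as the gradient
// of the output, with arg.src acting as the expected input gradient.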
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        if (!cuda::is_compute_capability_required(6, 0)) {
            src.dtype = dst.dtype = filter.dtype = dtype::Float16();
            checker.set_rng(0, &rng)
                    .set_rng(1, &rng)
                    .set_epsilon(1e-1)
                    .set_param(arg.param)
                    .exec(TensorLayoutArray{filter, dst, src});
            arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
            checker.set_rng(0, &rng)
                    .set_rng(1, &rng)
                    .set_epsilon(1e-1)
                    .set_param(arg.param)
                    .exec(TensorLayoutArray{filter, dst, src});
        }
        checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
                ExecutionPolicyAlgoName{"CONVOLUTION_BACKWARD_DATD_BFLOAT16",
                                        {{"MATMUL", {{"CUBLAS", {}}}}}}));
        src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        checker.reset_before_exec_callback();
        checker.opr()->execution_policy() = {};
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_CUDNN) {
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("CUDNN_CONVOLUTION"));
    //! noncontiguous case
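    // A contiguous {2, 16, 7, 7} layout would have strides {784, 49, 7, 1};
    // the doubled batch stride (1568) below makes the tensors strided views,
    // exercising the noncontiguous path.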
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_param(param).execl(TensorLayoutArray{
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
        });
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_MATMUL) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
    //! noncontiguous case
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_param(param).execl(TensorLayoutArray{
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
        });
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW4_DP4A) {
    if (!cuda::is_compute_capability_required(6, 1)) {
        printf("Skip CUDA.CONVOLUTION_BACKWARD_DATA_INT8_NCHW4_DP4A test as "
               "current device doesn't support\n");
        return;
    }
    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nchw4_conv_bwd_data();
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        std::string to_string() {
            return ssprintf("_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m,
                            threadblock_n, threadblock_k, warp_m, warp_n,
                            warp_k, stage);
        }
    };
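    // Each AlgoParam names one threadblock/warp tile configuration; its
    // string suffix (e.g. "_16X64X8_16X64X8_2stage") is appended to the base
    // algorithm name to select that exact kernel. The epsilon of 1 + 1e-3
    // tolerates off-by-one rounding in individual quantized outputs, while
    // set_max_avg_error keeps the aggregate error small.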
    std::vector<AlgoParam> all_params;
    all_params.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8, 2});
    all_params.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2});
    all_params.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1});
    all_params.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2});
    for (auto algo_param : all_params) {
        Checker<ConvolutionBackwardData> checker(handle_cuda());
        std::string algo_name(ssprintf("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM%s",
                                       algo_param.to_string().c_str()));
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(algo_name.c_str()));
        checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
        for (auto&& arg : args) {
            UniformIntRNG rng(-3, 3);
            auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
            auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
            TensorLayout dst;
            dst.dtype = dtype::QuantizedS8{1.2f};
            {
                auto opr = handle_cuda()->create_operator<Convolution>();
                opr->param() = arg.param;
                opr->deduce_layout(src, filter, dst);
            }
            checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                    TensorLayoutArray{filter, dst, src});
        }
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW_DP4A) {
    if (!cuda::is_compute_capability_required(6, 1)) {
        printf("Skip CUDA.CONVOLUTION_BACKWARD_DATA_INT8_NCHW_DP4A test as "
               "current device doesn't support\n");
        return;
    }
    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nchw_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            "INT8_NCHW_DOTPROD_IMPLICIT_GEMM"));
    checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
    for (auto&& arg : args) {
        UniformIntRNG rng(-3, 3);
        auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
        auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
        TensorLayout dst;
        dst.dtype = dtype::QuantizedS8{1.2f};
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                TensorLayoutArray{filter, dst, src});
    }
}
#if CUDA_VERSION >= 10020
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NHWC_IMMA) {
    if (!cuda::is_compute_capability_required(7, 5)) {
        printf("Skip CUDA.CONVOLUTION_BACKWARD_DATA_INT8_NHWC_IMMA test as "
               "current device doesn't support\n");
        return;
    }
    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nhwc_conv_bwd_data();
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        int access_size;
        std::string to_string() {
            return ssprintf("_%dX%dX%d_%dX%dX%d_%dstage_%d", threadblock_m,
                            threadblock_n, threadblock_k, warp_m, warp_n,
                            warp_k, stage, access_size);
        }
    };
    std::vector<AlgoParam> all_params;
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 4});
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 8});
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 16});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 4});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 8});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 16});
    for (auto algo_param : all_params) {
        Checker<ConvolutionBackwardData> checker(handle_cuda());
        std::string algo_name(ssprintf("INT8_NHWC_IMMA_IMPLICIT_GEMM%s",
                                       algo_param.to_string().c_str()));
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(algo_name.c_str()));
        checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
        for (auto&& arg : args) {
            UniformIntRNG rng(-3, 3);
            auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
            auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
            TensorLayout dst;
            dst.dtype = dtype::QuantizedS8{1.2f};
            {
                auto opr = handle_cuda()->create_operator<Convolution>();
                opr->param() = arg.param;
                opr->deduce_layout(src, filter, dst);
            }
            checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                    TensorLayoutArray{filter, dst, src});
        }
    }
}
#endif
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5) {
    // BRAIN-481: failed on architecture 7.0; remove the following if
    // statement once cuDNN fixes the problem.
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    std::vector<TestArg> args = get_args_cudnn_7_5_failures();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        scale = std::max(scale, 1.f);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}
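// ConvolutionBackwardFilter orders its operands as {src, diff, grad}: the
// forward input and the output gradient come first, and the filter gradient
// is the computed result.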
TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    bool f16_checked = false;
    for (auto&& arg : args) {
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        float scale = 1.0f / sqrt(dst[2] * dst[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        // reduce on large f16 array may introduce significant error
        if (dst.total_nr_elems() >= 1000 && f16_checked)
            continue;
        f16_checked = true;
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
                ExecutionPolicyAlgoName{"CONVOLUTION_BACKWARD_FILTER_BFLOAT16",
                                        {{"MATMUL", {{"CUBLAS", {}}}}}}));
        src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        checker.reset_before_exec_callback();
        checker.opr()->execution_policy() = {};
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_MATMUL) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
    for (auto&& arg : args) {
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        float scale = 1.0f / sqrt(dst[2] * dst[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
    }
    //! noncontiguous case
    {
        NormalRNG default_rng;
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_param(param)
                .execl(TensorLayoutArray{
                        {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                        {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                        {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()}});
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_CUDNN) {
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardFilter>("CUDNN_CONVOLUTION"));
    //! noncontiguous case
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_param(param).execl(TensorLayoutArray{
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()}});
    }
}
TEST_F(CUDA, CONV_CONFIG_COMBINATIONS) {
    auto eps_getter = [](bool f16, int stage, const char* name) -> float {
        if (f16) {
            return stage == 2 ? 0.5 : 0.2;
        }
        if (strstr(name, "WINOGRAD_NONFUSED"))
            return 0.3;
        return 1e-3;
    };
    convolution::test_conv_config_combinations(2, handle_cuda(), false, true,
                                               true, eps_getter, true);
    convolution::test_conv_config_combinations(3, handle_cuda(), false, true,
                                               true, eps_getter, true);
    convolution::test_conv_config_combinations(5, handle_cuda(), false, true,
                                               true, eps_getter, true);
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_1) {
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            "CUDNN_CONVOLUTION_BWD_DATA_ALGO_1" CUDNN_VERSION_STRING));
    NormalRNG default_rng;
    TensorShape s_filter = TensorShape{8, 8, 2, 2},
                s_src = TensorShape{2, 8, 18, 18};
    float scale = 1.0f / sqrt(s_filter[0] * s_filter[2] * s_filter[3]);
    UniformFloatRNG rng(scale, 2 * scale);
    auto src = TensorLayout(s_src, dtype::Float16());
    auto filter = TensorLayout(s_filter, dtype::Float16());
    TensorLayout dst;
    param::Convolution param;
    param.pad_h = param.pad_w = 2;
    param.stride_h = param.stride_w = 2;
    {
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(src, filter, dst);
    }
    src.dtype = dst.dtype = filter.dtype = dtype::Float16();
    param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_epsilon(0.2)
            .set_param(param)
            .exec(TensorLayoutArray{filter, dst, src});
}
#if MEGDNN_WITH_BENCHMARK
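// The benchmarks below report TFLOPS by counting
// 2 * N * OC * IC * OH * OW * FH * FW floating-point operations (one multiply
// and one add per MAC) and dividing by the measured time.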
TEST_F(CUDA, CONV_FWD_BENCHMARK) {
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
                   size_t SH = 1, size_t SW = 1, size_t FH = 1, size_t FW = 1,
                   size_t PH = 0, size_t PW = 0, bool fp16io_c32 = false) {
        auto benchmarker = Benchmarker<ConvolutionForward>(handle_cuda());
        benchmarker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        ConvolutionForward::Param param;
        param.stride_h = SH;
        param.stride_w = SW;
        param.pad_h = PH;
        param.pad_w = PW;
        if (fp16io_c32) {
            param.compute_mode = ConvolutionForward::Param::ComputeMode::FLOAT32;
        }
        benchmarker.set_param(param);
        std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
                new OprProxy<ConvolutionForward>{true}};
        benchmarker.set_proxy(proxy);
        size_t OH = (IH - FH + 2 * PH) / SH + 1;
        size_t OW = (IW - FW + 2 * PW) / SW + 1;
        auto time = benchmarker.execs(
                {{N, IC, IH, IW}, {OC, IC, FH, FW}, {N, OC, OH, OW}});
        time /= 1000.0 * 10.0;
        auto flo = (double)N * OC * IC * OH * OW * FH * FW * 2;
        auto flops = flo / time / 1e12;
        printf("comp_type %s: ", fp16io_c32 ? "32" : "16");
        printf("%.3fG FLO, flops %.3fTFLOPS\n", flo / 1e9, flops);
    };
    run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, false);
    run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, true);
}
TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) {
    CUBenchmarker<ConvolutionForward> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
                   size_t FH, size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({src, filter, dst}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({src, filter, dst}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({src, filter, dst}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
               "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fTFLOPS\n",
               time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
               (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
               (flo / (time_ms_pseudo_fp16 * 1e9)));
        printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
               time_ms_fp32 / time_ms_true_fp16,
               time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}
TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
                   size_t FH, size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({filter, dst, src}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({filter, dst, src}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
               "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fTFLOPS\n",
               time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
               (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
               (flo / (time_ms_pseudo_fp16 * 1e9)));
        printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
               time_ms_fp32 / time_ms_true_fp16,
               time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}
TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_BF16) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
                   size_t FH, size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::BFloat16())
                .set_dtype(1, dtype::BFloat16())
                .set_dtype(2, dtype::BFloat16());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy = {};
        TensorLayout src{{N, IC, IH, IW}, dtype::BFloat16()},
                filter{{OC, IC, FH, FH}, dtype::BFloat16()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto used = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_bf16=%.2fms, flops=%.3fTFLOPS\n", used,
               (flo / (used * 1e9)));
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}
TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_INT8_DP4A) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
                   size_t FH, size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::QuantizedS8{1.0f})
                .set_dtype(1, dtype::QuantizedS8{1.0f})
                .set_dtype(2, dtype::QuantizedS8{1.0f});
        param::Convolution param;
        param.format = param::Convolution::Format::NCHW4;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy = {};
        TensorLayout src{{N, IC / 4, IH, IW, 4}, dtype::QuantizedS8{1.0f}},
                filter{{OC, IC / 4, FH, FH, 4}, dtype::QuantizedS8{1.0f}};
        TensorLayout dst;
        dst.dtype = dtype::QuantizedS8{1.0f};
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto used = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_int8=%.2fms, flops=%.3fTFLOPS\n", used,
               (flo / (used * 1e9)));
    };
    run(64, 32, 32, 92, 180, 4, 2, 2);
    run(64, 32, 32, 46, 80, 4, 2, 2);
    run(16, 16, 16, 92, 180, 4, 2, 2);
    run(16, 16, 16, 46, 80, 4, 2, 2);
}
TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) {
    CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
                   size_t FH, size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({src, dst, filter}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({src, dst, filter}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
               "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fTFLOPS\n",
               time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
               (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
               (flo / (time_ms_pseudo_fp16 * 1e9)));
        printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
               time_ms_fp32 / time_ms_true_fp16,
               time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}
#endif
#undef CUDNN_VERSION_STRING
#undef V
#undef V1
} // namespace test
} // namespace megdnn
// vim: syntax=cpp.doxygen

The MegEngine installation package bundles the CUDA environment needed to run code on a GPU, so there are no separate CPU and GPU builds. To run GPU programs, make sure the machine has GPU hardware and a properly installed driver. If you would like to try deep-learning development on a cloud GPU computing platform, you are welcome to visit the MegStudio platform.
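As a quick post-install sanity check, a minimal sketch along these lines (assuming the standard MegEngine Python API, i.e. that megengine.is_cuda_available() and megengine.get_default_device() exist in your installed version) reports whether the bundled CUDA runtime found a usable GPU:

    import megengine as mge

    # The same package runs on CPU and GPU, so the only runtime question is
    # whether a CUDA device was detected (assumed API; verify against your
    # installed MegEngine version).
    print("CUDA available:", mge.is_cuda_available())
    print("default device:", mge.get_default_device())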