/**
 * \file dnn/test/cuda/convolution.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "test/common/convolution.h"
#include "megdnn/dtype.h"
#include "megdnn/opr_param_defs.h"
#include "megdnn/oprs.h"
#include "test/common/accuracy_shake_checker.h"
#include "test/common/checker.h"
#include "test/common/rng.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h"
#include "test/cuda/utils.h"

#include <cudnn.h>
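
// Two-level stringification: V(x) expands its argument before V1 turns it
// into a string literal, so CUDNN_VERSION_STRING becomes e.g. "v7.6.3" and
// can be appended to version-suffixed algorithm names such as
// "CUDNN_CONVOLUTION_BWD_DATA_ALGO_1" below.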
#define V1(x) #x
#define V(x) V1(x)
#define CUDNN_VERSION_STRING \
    "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)

namespace megdnn {
namespace test {
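
// Int8 forward with 32-bit accumulation: requires compute capability 6.1
// (dp4a-capable GPUs), and converts every arg to NHWC, the layout this
// int8x8x32 path runs in.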
TEST_F(CUDA, CONVOLUTION_8X8X32) {
    require_compute_capability(6, 1);
    using namespace convolution;
    std::vector<TestArg> args;
    {
        auto v = get_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    {
        auto v = get_dilated_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    {
        auto v = get_chanwise_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    Checker<ConvolutionForward> checker(handle_cuda());
    UniformIntRNG rng(-4, 4);
    for (auto arg : args) {
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        checker.set_dtype(0, dtype::Int8())
                .set_dtype(1, dtype::Int8())
                .set_dtype(2, dtype::Int32())
                .set_param(arg.param)
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .execs({arg.src, arg.filter, {}});
    }
}
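
// Forward correctness across dtypes: fp32 within 1e-3; fp16 in both DEFAULT
// and FLOAT32 compute modes within 1e-1; bf16 within 1e-1. The RNG scale
// 1 / sqrt(fan_in) keeps output magnitudes bounded.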
TEST_F(CUDA, CONVOLUTION_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionForward> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        checker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        checker.set_dtype(0, dtype::BFloat16())
                .set_dtype(1, dtype::BFloat16())
                .set_dtype(2, dtype::BFloat16())
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}
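
// Quantized NCHW4 forward pinned to the DEFAULT wrapper around the
// MATMUL8X8X32 conv-bias algorithm; the S32 output scale is the product of
// the two input scales (0.132f * 0.0239f).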
TEST_F(CUDA, CONV_FORWARD_MATMUL_NCHW4) {
    require_compute_capability(6, 1);
    using namespace convolution;
    Checker<Convolution> checker(handle_cuda());
    UniformIntRNG int_rng{-127, 127};
    Convolution::Param param;
    param.format = Convolution::Param::Format::NCHW4;
    checker.set_dtype(0, dtype::QuantizedS8(0.132f))
            .set_dtype(1, dtype::QuantizedS8(0.0239f))
            .set_dtype(2, dtype::QuantizedS32(0.132f * 0.0239f))
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_param(param);
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
                              "MATMUL8X8X32", {})
                              .c_str(),
                      {}}}}));
    param.sparse = Convolution::Param::Sparse::DENSE;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    checker.set_param(param);
    checker.exec({{8, 4, 10, 10, 4}, {16, 4, 3, 3, 4}, {}});
    checker.exec({{1, 4, 2, 2, 4}, {16, 4, 3, 3, 4}, {}});
    checker.exec({{8, 64, 12, 12, 4}, {256, 64, 3, 3, 4}, {}});
}

TEST_F(CUDA, CONVOLUTION_1X1_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_1x1_args();
    Checker<ConvolutionForward> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        if (!check_compute_capability(6, 0)) {
            src.dtype = dst.dtype = filter.dtype = dtype::Float16();
            checker.set_rng(0, &rng)
                    .set_rng(1, &rng)
                    .set_epsilon(1e-1)
                    .set_param(arg.param)
                    .exec(TensorLayoutArray{filter, dst, src});
            arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
            checker.set_rng(0, &rng)
                    .set_rng(1, &rng)
                    .set_epsilon(1e-1)
                    .set_param(arg.param)
                    .exec(TensorLayoutArray{filter, dst, src});
        }
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(ExecutionPolicyAlgoName{
                        "CONVOLUTION_BACKWARD_DATD_BFLOAT16",
                        {{"MATMUL", {{"CUBLAS", {}}}}}}));
        src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        checker.reset_before_exec_callback();
        checker.opr()->execution_policy() = {};
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FP16_CUDNN7_5) {
    // Algo CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 with TensorCore operations
    // produces incorrect results; NVIDIA may have fixed the issue by now.
    // A case that reproduced the wrong result:
    // inp={2x8x18x18}, kern={8x8x2x2}, pad_h=pad_w=2, stride_h=stride_w=2,
    // dtype=float16
    using namespace convolution;
    std::vector<TestArg> args = get_args_cudnn_5_1_backward();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        scale = std::max(scale, 1.f);
        UniformFloatRNG rng(scale, 2 * scale);
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_NHWC) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_CUDNN) {
    require_compute_capability(7, 0);
    using namespace convolution;
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("CUDNN_CONVOLUTION"));
    //! noncontiguous case
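    //! {2, 16, 7, 7} with batch stride 1568 doubles the contiguous stride
    //! (16 * 7 * 7 = 784): the layouts behave like every other sample of a
    //! batch-4 tensor, so the algo must honor strided descriptors.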
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_param(param).execl(TensorLayoutArray{
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
        });
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_MATMUL) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
    //! noncontiguous case
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_param(param).execl(TensorLayoutArray{
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
        });
    }
}
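
// Each AlgoParam below names one pre-instantiated implicit-GEMM kernel:
// to_string() mirrors the registered suffix, e.g. {16, 64, 8, 16, 64, 8, 2}
// selects "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_16X64X8_16X64X8_2stage".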
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW4_DP4A) {
    require_compute_capability(6, 1);
    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nchw4_conv_bwd_data();
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        std::string to_string() {
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m, threadblock_n,
                    threadblock_k, warp_m, warp_n, warp_k, stage);
        }
    };
    std::vector<AlgoParam> all_params;
    all_params.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8, 2});
    all_params.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2});
    all_params.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1});
    all_params.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2});
    for (auto algo_param : all_params) {
        Checker<ConvolutionBackwardData> checker(handle_cuda());
        std::string algo_name(ssprintf(
                "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM%s", algo_param.to_string().c_str()));
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(algo_name.c_str()));
        checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
        for (auto&& arg : args) {
            UniformIntRNG rng(-3, 3);
            auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
            auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
            TensorLayout dst;
            dst.dtype = dtype::QuantizedS8{1.2f};
            {
                auto opr = handle_cuda()->create_operator<Convolution>();
                opr->param() = arg.param;
                opr->deduce_layout(src, filter, dst);
            }
            checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                    TensorLayoutArray{filter, dst, src});
        }
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW_DP4A) {
    require_compute_capability(6, 1);
    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nchw_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("INT8_NCHW_DOTPROD_IMPLICIT_GEMM"));
    checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
    for (auto&& arg : args) {
        UniformIntRNG rng(-3, 3);
        auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
        auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
        TensorLayout dst;
        dst.dtype = dtype::QuantizedS8{1.2f};
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                TensorLayoutArray{filter, dst, src});
    }
}

#if CUDA_VERSION >= 10020
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NHWC_IMMA) {
    require_compute_capability(7, 5);
    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nhwc_conv_bwd_data();
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        int access_size;
        std::string to_string() {
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_%dstage_%d", threadblock_m, threadblock_n,
                    threadblock_k, warp_m, warp_n, warp_k, stage, access_size);
        }
    };
    std::vector<AlgoParam> all_params;
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 4});
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 8});
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 16});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 4});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 8});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 16});
    for (auto algo_param : all_params) {
        Checker<ConvolutionBackwardData> checker(handle_cuda());
        std::string algo_name(ssprintf(
                "INT8_NHWC_IMMA_IMPLICIT_GEMM%s", algo_param.to_string().c_str()));
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(algo_name.c_str()));
        checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
        for (auto&& arg : args) {
            UniformIntRNG rng(-3, 3);
            auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
            auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
            TensorLayout dst;
            dst.dtype = dtype::QuantizedS8{1.2f};
            {
                auto opr = handle_cuda()->create_operator<Convolution>();
                opr->param() = arg.param;
                opr->deduce_layout(src, filter, dst);
            }
            checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                    TensorLayoutArray{filter, dst, src});
        }
    }
}
#endif

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5) {
    // BRAIN-481: these cases failed on compute capability 7.0; remove the
    // require_compute_capability guard below once cuDNN fixes the problem.
    require_compute_capability(7, 0);
    using namespace convolution;
    std::vector<TestArg> args = get_args_cudnn_7_5_failures();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        scale = std::max(scale, 1.f);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    bool f16_checked = false;
    for (auto&& arg : args) {
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        float scale = 1.0f / sqrt(dst[2] * dst[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        // reducing over a large f16 array may introduce significant error,
        // so large cases are checked in f16 at most once
        if (dst.total_nr_elems() >= 1000 && f16_checked)
            continue;
        f16_checked = true;
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardFilter>(ExecutionPolicyAlgoName{
                        "CONVOLUTION_BACKWARD_FILTER_BFLOAT16",
                        {{"MATMUL", {{"CUBLAS", {}}}}}}));
        src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        checker.reset_before_exec_callback();
        checker.opr()->execution_policy() = {};
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_MATMUL) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
    for (auto&& arg : args) {
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        float scale = 1.0f / sqrt(dst[2] * dst[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
    }
    //! noncontiguous case
    {
        NormalRNG default_rng;
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_param(param)
                .execl(TensorLayoutArray{
                        {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                        {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                        {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()}});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_CUDNN) {
    require_compute_capability(7, 0);
    using namespace convolution;
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardFilter>("CUDNN_CONVOLUTION"));
    //! noncontiguous case
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_param(param).execl(TensorLayoutArray{
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()}});
    }
}
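
// Sweep conv configurations with per-case tolerances from eps_getter: fp16
// accepts 0.5 (stage 2) or 0.2, WINOGRAD_NONFUSED accepts 0.3, and
// everything else must match within 1e-3.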
TEST_F(CUDA, CONV_CONFIG_COMBINATIONS) {
    auto eps_getter = [](bool f16, int stage, const char* name) -> float {
        if (f16) {
            return stage == 2 ? 0.5 : 0.2;
        }
        if (strstr(name, "WINOGRAD_NONFUSED"))
            return 0.3;
        return 1e-3;
    };
    convolution::test_conv_config_combinations(
            2, handle_cuda(), false, true, true, eps_getter, true);
    convolution::test_conv_config_combinations(
            3, handle_cuda(), false, true, true, eps_getter, true);
    convolution::test_conv_config_combinations(
            5, handle_cuda(), false, true, true, eps_getter, true);
}
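
// Regression case for the shape called out in
// CONVOLUTION_BACKWARD_DATA_FP16_CUDNN7_5 above: inp={2x8x18x18},
// kern={8x8x2x2}, pad=2, stride=2, fp16 data with fp32 accumulation, pinned
// to cuDNN's BWD_DATA_ALGO_1.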
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_1) {
    require_compute_capability(7, 0);
    using namespace convolution;
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            "CUDNN_CONVOLUTION_BWD_DATA_ALGO_1" CUDNN_VERSION_STRING));
    NormalRNG default_rng;
    TensorShape s_filter = TensorShape{8, 8, 2, 2}, s_src = TensorShape{2, 8, 18, 18};
    float scale = 1.0f / sqrt(s_filter[0] * s_filter[2] * s_filter[3]);
    UniformFloatRNG rng(scale, 2 * scale);
    auto src = TensorLayout(s_src, dtype::Float16());
    auto filter = TensorLayout(s_filter, dtype::Float16());
    TensorLayout dst;
    param::Convolution param;
    param.pad_h = param.pad_w = 2;
    param.stride_h = param.stride_w = 2;
    {
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(src, filter, dst);
    }
    src.dtype = dst.dtype = filter.dtype = dtype::Float16();
    param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
    checker.set_rng(0, &rng).set_rng(1, &rng).set_epsilon(0.2).set_param(param).exec(
            TensorLayoutArray{filter, dst, src});
}
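
// Grouped (channelwise) backward data with odd filter sizes from 5x5 up to
// 31x31. At stride 1 the padding fh/2 keeps the deduced output at the
// input's spatial size; the stride-2 cases use padding fh/3.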
TEST_F(CUDA, CONVOLUTION_BACKWARD_DEPTHWISE_LARGE_FILTER) {
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
    for (auto dtype : std::vector<DType>{
                 dtype::Float32(),
#if CUDA_VERSION >= 9000
                 dtype::Float16()
#endif
         }) {
        auto run = [&checker, &dtype](
                           size_t n, size_t g, size_t h, size_t fh, size_t padding,
                           size_t stride) {
            param::Convolution param;
            param.stride_h = param.stride_w = stride;
            param.pad_h = param.pad_w = padding;
            param.mode = Convolution::Mode::CROSS_CORRELATION;
            param.sparse = param::Convolution::Sparse::GROUP;
            checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
            float scale = 64.f / sqrt(fh * fh);
            UniformFloatRNG rng(scale, scale * 2);
            checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &rng);
            if (dtype.enumv() == DTypeEnum::Float16)
                checker.set_epsilon(1e-1);
            checker.set_param(param).execs(
                    {{g, 1, 1, fh, fh},
                     {n, g, (h + 2 * padding - fh + 1) / stride,
                      (h + 2 * padding - fh + 1) / stride},
                     {n, g, h, h}});
        };
        run(4, 8, 32, 5, 5 / 2, 1);
        run(4, 8, 32, 7, 7 / 2, 1);
        run(4, 8, 32, 9, 9 / 2, 1);
        run(4, 8, 32, 11, 11 / 2, 1);
        run(4, 8, 32, 13, 13 / 2, 1);
        run(4, 8, 32, 15, 15 / 2, 1);
        run(4, 8, 32, 17, 17 / 2, 1);
        run(4, 8, 32, 19, 19 / 2, 1);
        run(4, 8, 32, 21, 21 / 2, 1);
        run(4, 8, 32, 23, 23 / 2, 1);
        run(4, 8, 32, 25, 25 / 2, 1);
        run(4, 8, 32, 27, 27 / 2, 1);
        run(4, 8, 32, 29, 29 / 2, 1);
        run(4, 8, 32, 31, 31 / 2, 1);
        run(4, 8, 64, 5, 5 / 2, 2);
        run(4, 8, 64, 7, 7 / 3, 2);
        run(4, 8, 64, 9, 9 / 3, 2);
        run(4, 8, 64, 11, 11 / 3, 2);
        run(4, 8, 64, 13, 13 / 3, 2);
        run(4, 8, 64, 15, 15 / 3, 2);
        run(4, 8, 64, 17, 17 / 3, 2);
        run(4, 8, 64, 19, 19 / 3, 2);
        run(4, 8, 64, 21, 21 / 3, 2);
        run(4, 8, 64, 23, 23 / 3, 2);
        run(4, 8, 64, 25, 25 / 3, 2);
        run(4, 8, 64, 27, 27 / 3, 2);
        run(4, 8, 64, 29, 29 / 3, 2);
        run(4, 8, 64, 31, 31 / 3, 2);
        run(1, 2, 128, 31, 31 / 3, 2);
        run(1, 2, 256, 31, 31 / 3, 2);
    }
}

#if MEGDNN_WITH_BENCHMARK
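
// The benchmarks below derive TFLOPS from flo = 2 * N * OC * IC * OH * OW *
// FH * FW (a multiply-accumulate counted as two ops) divided by the averaged
// per-run time; they are compiled only when MEGDNN_WITH_BENCHMARK is set.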
TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_1x1_args();
    Benchmarker<ConvolutionForward> marker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        marker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}

TEST_F(CUDA, CONV_FWD_BENCHMARK) {
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t SH = 1,
                   size_t SW = 1, size_t FH = 1, size_t FW = 1, size_t PH = 0,
                   size_t PW = 0, bool fp16io_c32 = false) {
        auto benchmarker = Benchmarker<ConvolutionForward>(handle_cuda());
        benchmarker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        ConvolutionForward::Param param;
        param.stride_h = SH;
        param.stride_w = SW;
        param.pad_h = PH;
        param.pad_w = PW;
        if (fp16io_c32) {
            param.compute_mode = ConvolutionForward::Param::ComputeMode::FLOAT32;
        }
        benchmarker.set_param(param);
        std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
                new OprProxy<ConvolutionForward>{true}};
        benchmarker.set_proxy(proxy);
        size_t OH = (IH - FH + 2 * PH) / SH + 1;
        size_t OW = (IW - FW + 2 * PW) / SW + 1;
        auto time =
                benchmarker.execs({{N, IC, IH, IW}, {OC, IC, FH, FW}, {N, OC, OH, OW}});
        time /= 1000.0 * 10.0;
        auto flo = (double)N * OC * IC * OH * OW * FH * FW * 2;
        auto flops = flo / time / 1e12;
        printf("comp_type %s: ", fp16io_c32 ? "32" : "16");
        printf("%.3fG FLO, flops %.3fTFLOPS\n", flo / 1e9, flops);
    };
    run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, false);
    run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, true);
}

TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) {
    CUBenchmarker<ConvolutionForward> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({src, filter, dst}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({src, filter, dst}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({src, filter, dst}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
               "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fTFLOPS\n",
               time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
               (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
               (flo / (time_ms_pseudo_fp16 * 1e9)));
        printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
               time_ms_fp32 / time_ms_true_fp16,
               time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}

TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({filter, dst, src}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({filter, dst, src}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
               "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fTFLOPS\n",
               time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
               (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
               (flo / (time_ms_pseudo_fp16 * 1e9)));
        printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
               time_ms_fp32 / time_ms_true_fp16,
               time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_DEPTHWISE_LARGE_FILTER_FP32) {
    CUBenchmarker<ConvolutionBackwardData> bencher{handle_cuda()};
    bencher.set_display(false);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
    auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t nr_times) {
        bencher.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = FH / 2;
        param.sparse = param::Convolution::Sparse::GROUP;
        bencher.set_param(param);
        bencher.set_times(nr_times);
        TensorLayout src{{N, g, IH, IW}, dtype::Float32()},
                filter{{g, 1, 1, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bencher.execl({filter, dst, src}) / nr_times;
        float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp32,
               (flo / (time_ms_fp32 * 1e9)));
    };
    run(64, 384, 384, 32, 32, 3, 1, 10);
    run(64, 384, 384, 32, 32, 5, 1, 10);
    run(64, 384, 384, 32, 32, 7, 1, 10);
    run(64, 384, 384, 32, 32, 9, 1, 10);
    run(64, 384, 384, 32, 32, 11, 1, 10);
    run(64, 384, 384, 32, 32, 13, 1, 10);
    run(64, 384, 384, 32, 32, 15, 1, 10);
    run(64, 384, 384, 32, 32, 17, 1, 10);
    run(64, 384, 384, 32, 32, 19, 1, 10);
    run(64, 384, 384, 32, 32, 21, 1, 10);
    run(64, 384, 384, 32, 32, 23, 1, 10);
    run(64, 384, 384, 32, 32, 25, 1, 10);
    run(64, 384, 384, 32, 32, 27, 1, 10);
    run(64, 384, 384, 32, 32, 29, 1, 10);
    run(64, 384, 384, 32, 32, 31, 1, 10);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_DEPTHWISE_LARGE_FILTER_FP16) {
    CUBenchmarker<ConvolutionBackwardData> bencher{handle_cuda()};
    bencher.set_display(false);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
    auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t nr_times) {
        bencher.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = FH / 2;
        param.sparse = param::Convolution::Sparse::GROUP;
        bencher.set_param(param);
        bencher.set_times(nr_times);
        TensorLayout src{{N, g, IH, IW}, dtype::Float16()},
                filter{{g, 1, 1, FH, FH}, dtype::Float16()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp16 = bencher.execl({filter, dst, src}) / nr_times;
        float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp16=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp16,
               (flo / (time_ms_fp16 * 1e9)));
    };
    run(64, 384, 384, 32, 32, 3, 1, 10);
    run(64, 384, 384, 32, 32, 5, 1, 10);
    run(64, 384, 384, 32, 32, 7, 1, 10);
    run(64, 384, 384, 32, 32, 9, 1, 10);
    run(64, 384, 384, 32, 32, 11, 1, 10);
    run(64, 384, 384, 32, 32, 13, 1, 10);
    run(64, 384, 384, 32, 32, 15, 1, 10);
    run(64, 384, 384, 32, 32, 17, 1, 10);
    run(64, 384, 384, 32, 32, 19, 1, 10);
    run(64, 384, 384, 32, 32, 21, 1, 10);
    run(64, 384, 384, 32, 32, 23, 1, 10);
    run(64, 384, 384, 32, 32, 25, 1, 10);
    run(64, 384, 384, 32, 32, 27, 1, 10);
    run(64, 384, 384, 32, 32, 29, 1, 10);
    run(64, 384, 384, 32, 32, 31, 1, 10);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_BF16) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::BFloat16())
                .set_dtype(1, dtype::BFloat16())
                .set_dtype(2, dtype::BFloat16());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy = {};
        TensorLayout src{{N, IC, IH, IW}, dtype::BFloat16()},
                filter{{OC, IC, FH, FH}, dtype::BFloat16()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto used = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_bf16=%.2fms, flops=%.3fTFLOPS\n", used, (flo / (used * 1e9)));
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_INT8_DP4A) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::QuantizedS8{1.0f})
                .set_dtype(1, dtype::QuantizedS8{1.0f})
                .set_dtype(2, dtype::QuantizedS8{1.0f});
        param::Convolution param;
        param.format = param::Convolution::Format::NCHW4;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy = {};
        TensorLayout src{{N, IC / 4, IH, IW, 4}, dtype::QuantizedS8{1.0f}},
                filter{{OC, IC / 4, FH, FH, 4}, dtype::QuantizedS8{1.0f}};
        TensorLayout dst;
        dst.dtype = dtype::QuantizedS8{1.0f};
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto used = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_int8=%.2fms, flops=%.3fTFLOPS\n", used, (flo / (used * 1e9)));
    };
    run(64, 32, 32, 92, 180, 4, 2, 2);
    run(64, 32, 32, 46, 80, 4, 2, 2);
    run(16, 16, 16, 92, 180, 4, 2, 2);
    run(16, 16, 16, 46, 80, 4, 2, 2);
}

TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) {
    CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({src, dst, filter}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({src, dst, filter}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
               "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fTFLOPS\n",
               time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
               (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
               (flo / (time_ms_pseudo_fp16 * 1e9)));
        printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
               time_ms_fp32 / time_ms_true_fp16,
               time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_FILTER_DEPTHWISE_LARGE_FILTER) {
    CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    bench.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
            "CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFTv7.6.3"));
    auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = FH / 2;
        param.sparse = param::Convolution::Sparse::GROUP;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, g, IH, IW}, dtype::Float32()},
                filter{{g, 1, 1, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS;
        float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp32,
               (flo / (time_ms_fp32 * 1e9)));
    };
    run(64, 384, 384, 32, 32, 31, 1, 15);
}

#endif

#undef CUDNN_VERSION_STRING
#undef V
#undef V1

}  // namespace test
}  // namespace megdnn

// vim: syntax=cpp.doxygen