
dnn/test/cuda/chanwise_convolution.cpp

/**
 * \file dnn/test/cuda/chanwise_convolution.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "megdnn/oprs/nn.h"

#include "cuda.h"
#include "megcore_cuda.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/convolution.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h"
#include "test/cuda/utils.h"

#include <cuda_profiler_api.h>
#include <cuda_runtime_api.h>

using namespace megdnn;
using namespace test;

namespace {

#if MEGDNN_WITH_BENCHMARK
bool check_need_full_bench() {
    if (getenv("MEGDNN_CHANWISE_CONV_FULLBENCH"))
        return true;
    printf("set MEGDNN_CHANWISE_CONV_FULLBENCH to run full benchmark\n");
    return false;
}
#endif
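
//! Turn a dense convolution param into its GROUP-sparse (channel-wise)
//! counterpart; with io16xc32 set, fp16 I/O is computed in fp32
//! (ComputeMode::FLOAT32).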
Convolution::Param gconv_param(Convolution::Param p, bool io16xc32 = false) {
    p.sparse = Convolution::Param::Sparse::GROUP;
    if (io16xc32)
        p.compute_mode = Convolution::Param::ComputeMode::FLOAT32;
    return p;
}
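
//! Benchmark environment shared by the forward / bwd-data / bwd-filter bench
//! checks. P0, P1, P2 permute (src, filter, dst) into the (a, b, c) operands
//! passed to the two operators, so one class covers all three directions:
//! BenchmarkEnv<0, 1, 2> for forward, <1, 2, 0> for backward data and
//! <0, 2, 1> for backward filter (see the tests below).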
template <int P0, int P1, int P2>
class BenchmarkEnv {
    Handle *handle, *handle_cpu;
    std::unique_ptr<GaussianRNG> rng;
    TensorLayout lsrc, lflt0, lflt1, ldst;
    std::unique_ptr<Tensor<>> src0, src1, flt0, flt0_cpu, flt1, flt1_cpu, dst0, dst1;
    cudaEvent_t cuda_ev[3];
    cudaStream_t cuda_stream;
    size_t pad_h, pad_w;

    template <typename T>
    static std::tuple<T, T, T> shuffle(std::tuple<T, T, T> data) {
        return std::make_tuple(
                std::get<P0>(data), std::get<P1>(data), std::get<P2>(data));
    }

public:
    BenchmarkEnv(Handle* handle, Handle* handle_cpu) {
        this->handle = handle;
        this->handle_cpu = handle_cpu;
        rng = handle->create_operator<GaussianRNG>();
        // make cpu handle used
        handle_cpu->create_operator<Sleep>()->exec();
        for (int i = 0; i < 3; ++i)
            cudaEventCreate(&cuda_ev[i]);
        megcoreGetCUDAStream(handle->megcore_computing_handle(), &cuda_stream);
    }

    ~BenchmarkEnv() {
        for (int i = 0; i < 3; ++i)
            cudaEventDestroy(cuda_ev[i]);
    }

    void alloc(
            size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL, size_t FH,
            size_t FW, size_t PH, size_t PW) {
        pad_h = PH;
        pad_w = PW;
        auto mkly = [](const TensorShape& s) {
            return TensorLayout{s, dtype::Float32()};
        };
        lsrc = mkly({N, IC, IH, IW});
        lflt0 = mkly({CHL_MUL * IC, IC, FH, FW});
        lflt1 = mkly({IC, CHL_MUL, 1, FH, FW});
        ldst = mkly({N, IC * CHL_MUL, IH - FH + 1 + PH * 2, IW - FW + 1 + PW * 2});
        src0.reset(new Tensor<>(handle, lsrc));
        src1.reset(new Tensor<>(handle, lsrc));
        flt0.reset(new Tensor<>(handle, lflt0));
        flt0_cpu.reset(new Tensor<>(handle_cpu, lflt0));
        flt1.reset(new Tensor<>(handle, lflt1));
        flt1_cpu.reset(new Tensor<>(handle_cpu, lflt1));
        dst0.reset(new Tensor<>(handle, ldst));
        dst1.reset(new Tensor<>(handle, ldst));
    }

    void fill_src() {
        rng->exec(src0->tensornd(), {});
        megdnn_memcpy_D2D(handle, src1->ptr(), src0->ptr(), lsrc.span().dist_byte());
    }
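
    //! Fill the group-layout filter flt1 with random values, then expand it
    //! into the dense-layout filter flt0: the dense (IC * CHL_MUL, IC, FH, FW)
    //! filter is zero except on a block diagonal, where output channel
    //! i * CHL_MUL + j reads only from input channel i.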
    void fill_flt() {
        rng->exec(flt1->tensornd(), {});
        megdnn_memcpy_D2H(
                handle, flt1_cpu->ptr(), flt1->ptr(), lflt1.span().dist_byte());

        const size_t IC = lflt1[0], CHL_MUL = lflt1[1], FSIZE = lflt1[3] * lflt1[4];

        // fill flt0 from flt1
        float* src = flt1_cpu->ptr();
        float* dst = flt0_cpu->ptr();
        memset(dst, 0, lflt0.span().dist_byte());
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                memcpy(dst + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                       src + (i * CHL_MUL + j) * FSIZE, FSIZE * sizeof(float));
            }
        }

        megdnn_memcpy_H2D(handle, flt0->ptr(), dst, lflt0.span().dist_byte());
    }

    void fill_dst() {
        rng->exec(dst0->tensornd(), {});
        megdnn_memcpy_D2D(handle, dst1->ptr(), dst0->ptr(), ldst.span().dist_byte());
    }
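
    //! Run opr0 (dense layout, labeled cudnn in the printout) and opr1
    //! (GROUP-sparse layout, the channel-wise kernel, labeled megdnn) back to
    //! back on the same stream, bracketed by CUDA events. With
    //! MEGDNN_CHANWISE_CONV_VERBOSE or MEGDNN_CHANWISE_CONV_FULLBENCH set,
    //! print both timings and their ratio.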
    template <class Opr>
    void exec(Opr* opr0, Opr* opr1) {
        opr0->param().pad_h = pad_h;
        opr0->param().pad_w = pad_w;
        opr1->param() = opr0->param();
        opr1->param().sparse = param::Convolution::Sparse::GROUP;
        TensorND a0, b0, c0, a1, b1, c1;
        std::tie(a0, b0, c0) = shuffle(
                std::make_tuple(src0->tensornd(), flt0->tensornd(), dst0->tensornd()));
        std::tie(a1, b1, c1) = shuffle(
                std::make_tuple(src1->tensornd(), flt1->tensornd(), dst1->tensornd()));
        WorkspaceWrapper wk(
                handle,
                std::max(
                        opr0->get_workspace_in_bytes(a0.layout, b0.layout, c0.layout),
                        opr1->get_workspace_in_bytes(a1.layout, b1.layout, c1.layout)));
        cudaProfilerStart();
        cudaEventRecord(cuda_ev[0], cuda_stream);
        opr0->exec(a0, b0, c0, wk.workspace());
        cudaEventRecord(cuda_ev[1], cuda_stream);
        opr1->exec(a1, b1, c1, wk.workspace());
        cudaEventRecord(cuda_ev[2], cuda_stream);
        cudaProfilerStop();
        if (getenv("MEGDNN_CHANWISE_CONV_VERBOSE") ||
            getenv("MEGDNN_CHANWISE_CONV_FULLBENCH")) {
            cudaStreamSynchronize(cuda_stream);
            float t0 = -1, t1 = -1;
            cudaEventElapsedTime(&t0, cuda_ev[0], cuda_ev[1]);
            cudaEventElapsedTime(&t1, cuda_ev[1], cuda_ev[2]);
            printf("%s;%s;%s: cudnn/megdnn: %.3fms/%.3fms=%.3f\n",
                   lsrc.TensorShape::to_string().c_str(),
                   lflt1.TensorShape::to_string().c_str(),
                   ldst.TensorShape::to_string().c_str(), t0, t1, t0 / t1);
        }
    }

    //! special for weight preprocess
    void exec_convolution(ConvolutionForward* opr0, ConvolutionForward* opr1) {
        opr0->param().pad_h = pad_h;
        opr0->param().pad_w = pad_w;
        opr1->param() = opr0->param();
        opr1->param().sparse = param::Convolution::Sparse::GROUP;
        TensorND a0, b0, c0, a1, b1, c1;
        std::tie(a0, b0, c0) = shuffle(
                std::make_tuple(src0->tensornd(), flt0->tensornd(), dst0->tensornd()));
        std::tie(a1, b1, c1) = shuffle(
                std::make_tuple(src1->tensornd(), flt1->tensornd(), dst1->tensornd()));
        WorkspaceWrapper wk(
                handle, std::max(
                                opr0->get_workspace_in_bytes(
                                        a0.layout, b0.layout, c0.layout, nullptr),
                                opr1->get_workspace_in_bytes(
                                        a1.layout, b1.layout, c1.layout, nullptr)));
        cudaProfilerStart();
        cudaEventRecord(cuda_ev[0], cuda_stream);
        opr0->exec(a0, b0, c0, nullptr, wk.workspace());
        cudaEventRecord(cuda_ev[1], cuda_stream);
        opr1->exec(a1, b1, c1, nullptr, wk.workspace());
        cudaEventRecord(cuda_ev[2], cuda_stream);
        cudaProfilerStop();
        if (getenv("MEGDNN_CHANWISE_CONV_VERBOSE") ||
            getenv("MEGDNN_CHANWISE_CONV_FULLBENCH")) {
            cudaStreamSynchronize(cuda_stream);
            float t0 = -1, t1 = -1;
            cudaEventElapsedTime(&t0, cuda_ev[0], cuda_ev[1]);
            cudaEventElapsedTime(&t1, cuda_ev[1], cuda_ev[2]);
            printf("%s;%s;%s: cudnn/megdnn: %.3fms/%.3fms=%.3f\n",
                   lsrc.TensorShape::to_string().c_str(),
                   lflt1.TensorShape::to_string().c_str(),
                   ldst.TensorShape::to_string().c_str(), t0, t1, t0 / t1);
        }
    }

    void cmp_dst() {
        Tensor<> dst0_cpu(handle_cpu, ldst), dst1_cpu(handle_cpu, ldst);
        megdnn_memcpy_D2H(handle, dst0_cpu.ptr(), dst0->ptr(), ldst.span().dist_byte());
        megdnn_memcpy_D2H(handle, dst1_cpu.ptr(), dst1->ptr(), ldst.span().dist_byte());
        dst0_cpu.check_with(dst1_cpu);
    }

    void cmp_src() {
        Tensor<> src0_cpu(handle_cpu, lsrc), src1_cpu(handle_cpu, lsrc);
        megdnn_memcpy_D2H(handle, src0_cpu.ptr(), src0->ptr(), lsrc.span().dist_byte());
        megdnn_memcpy_D2H(handle, src1_cpu.ptr(), src1->ptr(), lsrc.span().dist_byte());
        src0_cpu.check_with(src1_cpu);
    }
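
    //! Compare the filter gradients by walking the block-diagonal positions
    //! of the dense result (the inverse of the mapping built in fill_flt)
    //! against the group result: each element must agree within 1e-2 and the
    //! mean absolute error must stay below 1e-4.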
    void cmp_flt() {
        Tensor<> flt0_cpu(handle_cpu, lflt0), flt1_cpu(handle_cpu, lflt1);
        float* p0 = flt0_cpu.ptr();
        float* p1 = flt1_cpu.ptr();
        megdnn_memcpy_D2H(handle, p0, flt0->ptr(), lflt0.span().dist_byte());
        megdnn_memcpy_D2H(handle, p1, flt1->ptr(), lflt1.span().dist_byte());
        size_t IC = lflt1[0], CHL_MUL = lflt1[1], FSIZE = lflt1[3] * lflt1[4];
        double tot_err = 0, tot_err_num = 0;
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                auto t0 = p0 + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                     t1 = p1 + (i * CHL_MUL + j) * FSIZE;
                for (size_t k = 0; k < FSIZE; ++k) {
                    auto err = std::abs(diff(t0[k], t1[k]));
                    tot_err += err;
                    tot_err_num += 1;
                    ASSERT_LT(err, 1e-2) << "failed at " << i << " " << j << " " << k
                                         << " vals=" << t0[k] << "," << t1[k];
                }
            }
        }
        auto avg_err = tot_err / tot_err_num;
        ASSERT_LT(avg_err, 1e-4);
    }
};

}  // anonymous namespace

constexpr auto M = Convolution::Mode::CROSS_CORRELATION;

TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD) {
    Checker<Convolution> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                              "CHANNEL_WISE", {})
                              .c_str(),
                      {}}}},
            &require_algo));
    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-2);

        // simple case
        // clang-format off
        for (uint32_t s : {1, 2})
        for (uint32_t p : {0, 1, 2, 3})
        for (size_t f : {2, 3, 5, 7})
        for (size_t ocpg : {1, 3}) {
            checker.set_param(gconv_param({M, p, p, s, s}))
                    .execs({{2, 3, 16, 16}, {3, ocpg, 1, f, f}, {}});
        }
        // clang-format on

        checker.set_param(gconv_param({M, 2, 3, 2, 1}))
                .execs({{32, 12, 20, 10}, {12, 2, 1, 4, 5}, {}});

        // padding larger than kern
        checker.set_param(gconv_param({M, 20, 30, 4, 5}))
                .execs({{32, 12, 20, 10}, {12, 2, 1, 4, 5}, {}});
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_SMALL) {
    Checker<Convolution> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                              "CHANNEL_WISE_SMALL", {})
                              .c_str(),
                      {}}}},
            &require_algo));
    for (auto dtype : std::vector<DType>{
                 dtype::Float32(),
#if CUDA_VERSION >= 9000
                 dtype::Float16()
#endif
         }) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-2);

        // clang-format off
        for (uint32_t s : {1})
        for (uint32_t f : {1, 3, 5, 7}) {
            checker.set_param(gconv_param({M, f / 2, f / 2, s, s}))
                    .execs({{2, 3, 16, 16}, {3, 1, 1, f, f}, {}});
        }
        // clang-format on

        checker.set_param(gconv_param({M, 1, 1, 1, 1}))
                .execs({{2, 3, 3, 16}, {3, 1, 1, 3, 3}, {}})
                .execs({{2, 3, 8, 3}, {3, 1, 1, 3, 3}, {}});
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA) {
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE", &require_algo));
    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(1e-1);

        // simple case
        // clang-format off
        for (uint32_t s : {1, 2})
        for (uint32_t p : {0, 1, 2, 3})
        for (size_t f : {1, 2, 3, 5, 7})
        for (size_t ocpg : {1, 3}) {
            size_t ii = infer_conv_shape(16, f, s, p, true);
            checker.set_param(gconv_param({M, p, p, s, s}))
                    .execs({{3, ocpg, 1, f, f},
                            {2, 3 * ocpg, ii, ii},
                            {2, 3, 16, 16}});
        }
        // clang-format on

        checker.set_param(gconv_param({M, 2, 3, 2, 1}))
                .execs({{12, 3, 1, 4, 5}, {32, 36, 20, 10}, {32, 12, 39, 8}});

        checker.set_param(gconv_param({M, 30, 20, 5, 4}))
                .execs({{6, 2, 1, 5, 4}, {32, 12, 12, 10}, {32, 6, 3, 2}});

        checker.set_param(gconv_param({M, 20, 30, 4, 5}))
                .execs({{6, 2, 1, 4, 5}, {32, 12, 10, 12}, {32, 6, 2, 3}});
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_SMALL) {
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE_SMALL", &require_algo));
    for (auto dtype : std::vector<DType>{
                 dtype::Float32(),
#if CUDA_VERSION >= 9000
                 dtype::Float16()
#endif
         }) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-2);

        for (uint32_t f : {1, 3, 5, 7}) {
            checker.set_param(gconv_param({M, f / 2, f / 2, 1, 1}))
                    .execs({{3, 1, 1, f, f}, {2, 3, 16, 16}, {2, 3, 16, 16}});
        }

        checker.set_param(gconv_param({M, 1, 1, 1, 1}))
                .execs({{3, 1, 1, 3, 3}, {2, 3, 3, 16}, {2, 3, 3, 16}})
                .execs({{3, 1, 1, 3, 3}, {2, 3, 8, 3}, {2, 3, 8, 3}});
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER) {
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardFilter>("CHANNEL_WISE", &require_algo));
    UniformFloatRNG rng(-0.1, 0.1);
    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
        checker.set_dtype(0, dtype)
                .set_dtype(1, dtype)
                .set_dtype(2, dtype)
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-1);

        // simple case
        // clang-format off
        for (uint32_t s : {1, 2})
        for (uint32_t p : {0, 1, 2, 3})
        for (uint32_t f : {1, 2, 3, 5, 7})
        for (uint32_t ocpg : {1, 3})
        for (uint32_t i : {8, 16, 32, 64}) {
            size_t ii = infer_conv_shape(i, f, s, p, true);
            checker.set_param(gconv_param({M, p, p, s, s}))
                    .execs({{2, 3, i, i},
                            {2, 3 * ocpg, ii, ii},
                            {3, ocpg, 1, f, f}});
        }
        // clang-format on

        // padding larger than kern
        checker.set_param(gconv_param({M, 20, 30, 4, 5}))
                .execs({{32, 6, 2, 3}, {32, 12, 10, 12}, {6, 2, 1, 4, 5}});

        // unused filter items
        checker.set_param(gconv_param({M, 2, 3, 2, 3}))
                .execs({{32, 6, 1, 1}, {32, 12, 1, 1}, {6, 2, 1, 5, 7}});
    }
}
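
//! AlgoCheckerMaker abstracts over how the algorithm name is looked up:
//! the backward ops take the name directly, while ConvolutionForward
//! dispatches through the "DEFAULT" ConvBias execution policy, so its
//! specialization wraps the name in an ExecutionPolicyAlgoName.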
namespace {

template <typename Op>
struct AlgoCheckerMaker {
    static auto make(const char* name, bool* require_algo) {
        return AlgoChecker<Op>(name, require_algo);
    }
};

template <>
struct AlgoCheckerMaker<ConvolutionForward> {
    static auto make(const char* name, bool* require_algo) {
        return AlgoChecker<ConvolutionForward>(
                ExecutionPolicyAlgoName{
                        "DEFAULT",
                        {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                                  name, {})
                                  .c_str(),
                          {}}}},
                require_algo);
    }
};

template <typename Op>
void check_chanwise(DType io_type, DType comp_type, Handle* handle, const char* name) {
    Checker<Op> checker(handle);
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoCheckerMaker<Op>::make(name, &require_algo));
    checker.set_dtype(0, io_type).set_dtype(1, io_type).set_dtype(2, io_type);
    bool io16xc32 = false;
    if (io_type == dtype::Float16()) {
        if (comp_type == dtype::Float16()) {
            checker.set_epsilon(1e-1);
        } else {
            io16xc32 = true;
        }
    }
    // dispatch testcase by operation
    if (std::is_same<Op, ConvolutionForward>::value) {
        // align 8
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}});
        // align 1
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{8, 2, 15, 15}, {2, 1, 1, 15, 15}, {}});
        // align 2
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{8, 2, 14, 14}, {2, 1, 1, 15, 15}, {}});
        // custom padding
        checker.set_param(gconv_param({M, 3, 3, 1, 1}, io16xc32))
                .execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}});
        // custom stride
        checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32))
                .execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}});
    } else if (std::is_same<Op, ConvolutionBackwardData>::value) {
        // align 8
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{2, 1, 1, 15, 15}, {8, 2, 16, 16}, {8, 2, 16, 16}});
        // align 1
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{2, 1, 1, 15, 15}, {8, 2, 15, 15}, {8, 2, 15, 15}});
        // align 2
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{2, 1, 1, 15, 15}, {8, 2, 14, 14}, {8, 2, 14, 14}});
        // custom padding
        checker.set_param(gconv_param({M, 3, 3, 1, 1}, io16xc32))
                .execs({{2, 1, 1, 15, 15}, {8, 2, 8, 8}, {8, 2, 16, 16}});
        // custom stride
        checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32))
                .execs({{2, 1, 1, 15, 15}, {8, 2, 7, 7}, {8, 2, 14, 14}});
    } else if (std::is_same<Op, ConvolutionBackwardFilter>::value) {
        // no backward-filter cases are checked here
    }
}

}  // namespace
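
//! The cb arguments are (tag, threadblock M, N, K, warp M, N, K); each entry
//! instantiates a test for the CUTLASS implicit batched GEMM kernel whose
//! algorithm name encodes those tile sizes, e.g.
//! FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_128X128X8_32X64X8_2stage.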
#define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb) \
    cb(1, 128, 128, 8, 32, 64, 8);                          \
    cb(2, 128, 64, 8, 64, 32, 8);                           \
    cb(3, 128, 32, 8, 64, 32, 8);                           \
    cb(4, 64, 128, 8, 32, 64, 8);                           \
    cb(5, 32, 128, 8, 32, 64, 8);                           \
    cb(6, 64, 64, 8, 32, 64, 8);                            \
    cb(7, 32, 64, 8, 32, 64, 8);                            \
    cb(8, 32, 32, 8, 32, 32, 8);                            \
    cb(9, 64, 32, 8, 64, 32, 8);

#define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                       \
    TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_FMA_##tag) {               \
        require_compute_capability(6, 1);                                        \
        check_chanwise<ConvolutionForward>(                                      \
                dtype::Float32(), dtype::Float32(), handle_cuda(),               \
                "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_2stage");                              \
    }

MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb)

#undef cb

#define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                       \
    TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_FMA_##tag) {         \
        require_compute_capability(6, 1);                                        \
        check_chanwise<ConvolutionBackwardData>(                                 \
                dtype::Float32(), dtype::Float32(), handle_cuda(),               \
                "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_2stage");                              \
    }

MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb)

#undef cb
#undef MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL

#define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb) \
    cb(1, 128, 128, 32, 32, 32, 32);                         \
    cb(2, 128, 256, 32, 64, 64, 32);                         \
    cb(3, 128, 64, 32, 32, 32, 32);                          \
    cb(4, 64, 128, 32, 32, 32, 32);                          \
    cb(5, 64, 64, 32, 32, 32, 32);

// check both ioc16 and io16xc32
#define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                        \
    TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_HMMA_##tag) {               \
        require_compute_capability(7, 0);                                        \
        check_chanwise<ConvolutionForward>(                                      \
                dtype::Float16(), dtype::Float16(), handle_cuda(),               \
                "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_2stage");                              \
        check_chanwise<ConvolutionForward>(                                      \
                dtype::Float16(), dtype::Float32(), handle_cuda(),               \
                "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_2stage");                              \
    }

MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb)

#undef cb

#define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                        \
    TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_HMMA_##tag) {         \
        require_compute_capability(7, 0);                                        \
        check_chanwise<ConvolutionBackwardData>(                                 \
                dtype::Float16(), dtype::Float16(), handle_cuda(),               \
                "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_mma8X8X4_2stage");                     \
        check_chanwise<ConvolutionBackwardData>(                                 \
                dtype::Float16(), dtype::Float32(), handle_cuda(),               \
                "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_mma8X8X4_2stage");                     \
    }

MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb)

#undef cb
#undef MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL

#if MEGDNN_WITH_BENCHMARK

TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionForward>();
    auto conv1 = handle->create_operator<ConvolutionForward>();
    BenchmarkEnv<0, 1, 2> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL, size_t FH,
                   size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_src();
        benv.fill_flt();
        benv.exec_convolution(conv0.get(), conv1.get());
        benv.cmp_dst();
    };

    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BWD_DATA_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionBackwardData>();
    auto conv1 = handle->create_operator<ConvolutionBackwardData>();
    BenchmarkEnv<1, 2, 0> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL, size_t FH,
                   size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_dst();
        benv.fill_flt();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_src();
    };

    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BWD_FILTER_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionBackwardFilter>();
    auto conv1 = handle->create_operator<ConvolutionBackwardFilter>();
    BenchmarkEnv<0, 2, 1> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL, size_t FH,
                   size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_src();
        benv.fill_dst();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_flt();
    };

    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}
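
//! The three BENCH_ALL_ALGO tests below construct OprProxy with profiling
//! enabled (the true flag), so the proxy times the available algorithms when
//! choosing an execution policy before the measured run.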
TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_FWD) {
    // enable profiling
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    proxy->warmup_times = 1;
    proxy->exec_times = 10;
    Benchmarker<ConvolutionForward> checker(handle_cuda());
    checker.set_times(1);
    ConvolutionForward::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.set_proxy(proxy);
    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, size_t FW) {
        checker.proxy()->target_execution_policy = {};
        checker.execs({{N, C, IH, IW}, {C, 1, 1, FH, FW}, {}});
    };
    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_DATA) {
    // enable profiling
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    proxy->warmup_times = 1;
    proxy->exec_times = 10;
    Benchmarker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_times(1);
    ConvolutionBackwardData::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.set_proxy(proxy);
    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, size_t FW) {
        checker.proxy()->target_execution_policy.algo.reset();
        checker.execs(
                {{C, 1, 1, FH, FW}, {N, C, IH - FH + 1, IW - FW + 1}, {N, C, IH, IW}});
    };
    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_FILTER) {
    // enable profiling
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    proxy->warmup_times = 1;
    proxy->exec_times = 10;
    Benchmarker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_times(1);
    ConvolutionBackwardFilter::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.set_proxy(proxy);
    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, size_t FW) {
        checker.proxy()->target_execution_policy.algo.reset();
        checker.execs(
                {{N, C, IH, IW}, {N, C, IH - FH + 1, IW - FW + 1}, {C, 1, 1, FH, FW}});
    };
    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}
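
//! The benchmarks below report achieved memory bandwidth. "bandwidth" holds
//! the element traffic of one run in GiB scaled by 1e3, so
//! bandwidth * bytes_per_element / time_in_ms yields GiB/s:
//!   (elems / 2^30) * 1e3 * bytes / ms == bytes_total / 2^30 / seconds.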
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 10;
    bencher.set_display(false).set_times(RUNS);
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    bencher.set_proxy(proxy);
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout);
        float bandwidth = static_cast<float>(
                                  src.total_nr_elems() + filter.total_nr_elems() +
                                  dst_layout.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;
        bencher.proxy()->target_execution_policy.algo.reset();
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bencher.set_param(param);
        auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "pseudo float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32) %.2f (fp16/pseudo fp16)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32,
               bandwidth * 4 / time_in_ms_fp32, time_in_ms_fp16,
               bandwidth * 2 / time_in_ms_fp16, time_in_ms_pseudo_fp16,
               bandwidth * 2 / time_in_ms_pseudo_fp16,
               time_in_ms_fp32 / time_in_ms_fp16,
               time_in_ms_pseudo_fp16 / time_in_ms_fp16);
    };

    // clang-format off
    for (size_t s : {1, 2})
    for (size_t f : {3, 5, 7})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {128, 256})
    for (size_t iw : {128, 256})
        run(batch, c, ih, iw, f, s);
    // clang-format on

    run(128, 192, 28, 28, 3, 1);
    run(128, 192, 28, 28, 3, 2);
    run(128, 576, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 32, 112, 112, 3, 1);
    run(128, 960, 7, 7, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 144, 56, 56, 3, 2);
    run(128, 384, 14, 14, 3, 1);
    run(128, 144, 56, 56, 3, 1);
    run(128, 96, 112, 112, 3, 2);
    run(128, 384, 14, 14, 3, 1);
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
    run(128, 576, 14, 14, 3, 2);
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                              "CHANNEL_WISE", {})
                              .c_str(),
                      {}}}}));
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout);
        float bandwidth = static_cast<float>(
                                  src.total_nr_elems() + filter.total_nr_elems() +
                                  dst_layout.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32,
               bandwidth * 4 / time_in_ms_fp32, time_in_ms_fp16,
               bandwidth * 2 / time_in_ms_fp16, time_in_ms_fp32 / time_in_ms_fp16);
    };

    // clang-format off
    for (size_t s : {1})
    for (size_t f : {3, 5, 7})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32, 128, 256})
    for (size_t iw : {8, 16, 32, 128, 256})
        run(batch, c, ih, iw, f, s);
    // clang-format on
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT_SMALL) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout);
        float bandwidth = static_cast<float>(
                                  src.total_nr_elems() + filter.total_nr_elems() +
                                  dst_layout.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_before_exec_callback(AlgoChecker<
                                          ConvolutionForward>(ExecutionPolicyAlgoName{
                        "DEFAULT",
                        {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                                  "CHANNEL_WISE", {})
                                  .c_str(),
                          {}}}}));
        auto time_in_ms_fp32_normal = bencher.execs({src, filter, {}}) / RUNS;
        bencher.set_before_exec_callback(
                AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
                        "DEFAULT",
                        {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                                  "CHANNEL_WISE_SMALL", {})
                                  .c_str(),
                          {}}}}));
        auto time_in_ms_fp32_small = bencher.execs({src, filter, {}}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16_small = bencher.execs({src, filter, {}}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, fp32 normal: %.2fms %.2fGB/s "
               "small: %.2fms %.2fGB/s, fp16 small: %.2fms %.2fGB/s, "
               "speedup: "
               "%0.2f (fp32 small/normal) %0.2f (small fp16/fp32)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               time_in_ms_fp32_normal, bandwidth * 4 / time_in_ms_fp32_normal,
               time_in_ms_fp32_small, bandwidth * 4 / time_in_ms_fp32_small,
               time_in_ms_fp16_small, bandwidth * 2 / time_in_ms_fp16_small,
               time_in_ms_fp32_normal / time_in_ms_fp32_small,
               time_in_ms_fp32_small / time_in_ms_fp16_small);
    };

    // clang-format off
    for (size_t s : {1})
    for (size_t f : {3, 5})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32})
    for (size_t iw : {8, 16, 32})
        run(batch, c, ih, iw, f, s);
    // clang-format on

    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 960, 7, 7, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
}
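
//! Compare MegDNN's channel-wise ConvBias kernel against cuDNN's implicit
//! GEMM. The throughput here is arithmetic (2 * f * f ops per output
//! element), so it is reported in GFLOPS rather than memory bandwidth.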
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_CUDNN_DNN) {
    CUBenchmarker<ConvBiasForward> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    ConvBias::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = ConvBias::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::ConvBias::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f},
                    bias = {1, c, 1, 1};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<ConvBias>();
        opr->param() = param;
        opr->deduce_layout(
                {src, dtype::Float32()}, {filter, dtype::Float32()},
                {bias, dtype::Float32()}, {}, dst_layout);
        // 2 * f * f FLOPs per output element, in millions of ops
        float computation_mops =
                static_cast<float>(dst_layout.total_nr_elems() * f * f * 2) * 1e-6;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.set_before_exec_callback(
                AlgoChecker<ConvBiasForward>(".+CHANNEL_WISE.+"));
        auto time_in_ms_dnn = bencher.execs({src, filter, bias, {}, {}}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.set_before_exec_callback(AlgoChecker<ConvBiasForward>(
                ".+CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM.+"));
        auto time_in_ms_cudnn = bencher.execs({src, filter, bias, {}, {}}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, dst=%s, dnn: %.2fms %.2fGFLOPS "
               "cudnn: %.2fms %.2fGFLOPS "
               "speedup: "
               "%0.2f (dnn/cudnn)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               dst_layout.to_string().c_str(), time_in_ms_dnn,
               computation_mops / time_in_ms_dnn, time_in_ms_cudnn,
               computation_mops / time_in_ms_cudnn, time_in_ms_cudnn / time_in_ms_dnn);
    };

    // clang-format off
    for (size_t batch : {1, 16, 32, 64, 128}) {
        run(batch, 32, 112, 112, 3, 1);
        run(batch, 96, 112, 112, 3, 2);
        run(batch, 96, 112, 112, 3, 1);
        run(batch, 144, 56, 56, 3, 2);
        run(batch, 144, 56, 56, 3, 1);
        run(batch, 192, 28, 28, 3, 1);
        run(batch, 384, 14, 14, 3, 1);
        run(batch, 576, 14, 14, 3, 1);
        run(batch, 960, 7, 7, 3, 1);
        //! calibrate the heuristic algo policy's hw_size param
        run(batch, 144, 24, 24, 3, 1);
        run(batch, 144, 22, 22, 3, 1);
        run(batch, 144, 20, 20, 3, 1);
        run(batch, 144, 18, 18, 3, 1);
    }
    // clang-format on
}
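
//! For backward data at stride 1 with same padding, the src shape doubles as
//! the gradient shape, so the traffic estimate below counts src twice:
//! filter + grad output + grad input.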
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_DATA_FLOAT_SMALL) {
    CUBenchmarker<ConvolutionBackwardData> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    ConvolutionBackwardData::Param param;
    param.format = Convolution::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        float bandwidth = static_cast<float>(
                                  src.total_nr_elems() + filter.total_nr_elems() +
                                  src.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_before_exec_callback(
                        AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE"));
        auto time_in_ms_fp32_normal = bencher.execs({filter, src, src}) / RUNS;
        bencher.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE_SMALL"));
        auto time_in_ms_fp32_small = bencher.execs({filter, src, src}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16_small = bencher.execs({filter, src, src}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, fp32 normal: %.2fms %.2fGB/s "
               "small: %.2fms %.2fGB/s, fp16 small: %.2fms %.2fGB/s, "
               "speedup: "
               "%0.2f (fp32 small/normal) %0.2f (small fp16/fp32)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               time_in_ms_fp32_normal, bandwidth * 4 / time_in_ms_fp32_normal,
               time_in_ms_fp32_small, bandwidth * 4 / time_in_ms_fp32_small,
               time_in_ms_fp16_small, bandwidth * 2 / time_in_ms_fp16_small,
               time_in_ms_fp32_normal / time_in_ms_fp32_small,
               time_in_ms_fp32_small / time_in_ms_fp16_small);
    };

    // clang-format off
    for (size_t s : {1})
    for (size_t f : {3, 5})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32})
    for (size_t iw : {8, 16, 32})
        run(batch, c, ih, iw, f, s);
    // clang-format on

    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 960, 7, 7, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BWD_DATA) {
    CUBenchmarker<ConvolutionBackwardData> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE"));
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t ocpg, size_t group, size_t ih, size_t iw,
                   size_t f, size_t p, size_t s) {
        param.pad_h = p;
        param.pad_w = p;
        param.stride_h = s;
        param.stride_w = s;
        size_t oh, ow;
        infer_conv_shape2d(ih, iw, f, f, s, s, p, p, oh, ow, true);
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src_grad = {batch, group, ih, iw},
                    dst_grad = {batch, group * ocpg, oh, ow},
                    flt = {group, ocpg, 1, f, f};
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        float bandwidth = static_cast<float>(
                                  flt.total_nr_elems() + dst_grad.total_nr_elems() +
                                  src_grad.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp32 = bencher.execs({flt, dst_grad, src_grad}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16 = bencher.execs({flt, dst_grad, src_grad}) / RUNS;
        printf("stride=%zu, src_grad=%s, flt=%s, "
               "float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32)\n",
               s, src_grad.to_string().c_str(), flt.to_string().c_str(),
               time_in_ms_fp32, bandwidth * 4 / time_in_ms_fp32, time_in_ms_fp16,
               bandwidth * 2 / time_in_ms_fp16, time_in_ms_fp32 / time_in_ms_fp16);
    };

    // clang-format off
    for (size_t s : {1, 2})
    for (size_t f : {3, 5, 7})
    for (size_t p : {f / 2})
    for (size_t batch : {64})
    for (size_t ocpg : {1})
    for (size_t group : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32, 128, 256})
    for (size_t iw : {8, 16, 32, 128, 256})
        run(batch, ocpg, group, ih, iw, f, p, s);
    // clang-format on
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BWD_FILTER) {
    CUBenchmarker<ConvolutionBackwardFilter> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardFilter>("CHANNEL_WISE"));
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t ocpg, size_t group, size_t i, size_t f,
                   size_t p, size_t s) {
        param.pad_h = p;
        param.pad_w = p;
        param.stride_h = s;
        param.stride_w = s;
        size_t d = infer_conv_shape(i, f, s, p, true);
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, group, i, i}, dst_grad = {batch, group * ocpg, d, d},
                    flt_grad = {group, ocpg, 1, f, f};
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        float bandwidth =
                static_cast<float>(
                        flt_grad.total_nr_elems() + dst_grad.total_nr_elems() +
                        src.total_nr_elems()) /
                (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp32 = bencher.execs({src, dst_grad, flt_grad}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16 = bencher.execs({src, dst_grad, flt_grad}) / RUNS;
        printf("stride=%zu, src=%s, flt_grad=%s, "
               "float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "speedup: "
               "%.2f (fp16/fp32)\n",
               s, src.to_string().c_str(), flt_grad.to_string().c_str(),
               time_in_ms_fp32, bandwidth * 4 / time_in_ms_fp32, time_in_ms_fp16,
               bandwidth * 2 / time_in_ms_fp16, time_in_ms_fp32 / time_in_ms_fp16);
    };

    // clang-format off
    for (size_t s : {1, 2})
    for (size_t f : {3, 5, 7})
    for (size_t p : {f / 2})
    for (size_t batch : {64})
    for (size_t ocpg : {1})
    for (size_t group : {16, 32, 64, 128})
    for (size_t i : {8, 16, 32, 64, 128})
        run(batch, ocpg, group, i, f, p, s);
    // clang-format on
}
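
//! The two LARGE_KERNEL benchmarks sweep odd filter sizes from 3 to 31 with
//! same padding (f / 2) to probe large-kernel depthwise performance; they
//! reuse the fp32 / fp16 / pseudo-fp16 reporting scheme described above.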
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_LARGE_KERNEL) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 100;
    bencher.set_display(false).set_times(RUNS);
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    bencher.set_proxy(proxy);
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout);
        float bandwidth = static_cast<float>(
                                  src.total_nr_elems() + filter.total_nr_elems() +
                                  dst_layout.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;
        bencher.proxy()->target_execution_policy.algo.reset();
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bencher.set_param(param);
        auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "pseudo float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32) %.2f (fp16/pseudo fp16)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32,
               bandwidth * 4 / time_in_ms_fp32, time_in_ms_fp16,
               bandwidth * 2 / time_in_ms_fp16, time_in_ms_pseudo_fp16,
               bandwidth * 2 / time_in_ms_pseudo_fp16,
               time_in_ms_fp32 / time_in_ms_fp16,
               time_in_ms_pseudo_fp16 / time_in_ms_fp16);
    };

    // clang-format off
    for (size_t b : {32, 64})
    for (size_t f : {3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}) {
        run(b, 384, 32, 32, f, 1);
        run(b, 384, 64, 64, f, 1);
    }
    // clang-format on
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_DATA_LARGE_KERNEL) {
    CUBenchmarker<ConvolutionBackwardData> bencher(handle_cuda());
    size_t RUNS = 100;
    bencher.set_display(false).set_times(RUNS);
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    bencher.set_proxy(proxy);
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout);
        float bandwidth = static_cast<float>(
                                  src.total_nr_elems() + filter.total_nr_elems() +
                                  dst_layout.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp32 = bencher.execs({filter, src, src}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp16 = bencher.execs({filter, src, src}) / RUNS;
        bencher.proxy()->target_execution_policy.algo.reset();
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bencher.set_param(param);
        auto time_in_ms_pseudo_fp16 = bencher.execs({filter, src, src}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "pseudo float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32) %.2f (fp16/pseudo fp16)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32,
               bandwidth * 4 / time_in_ms_fp32, time_in_ms_fp16,
               bandwidth * 2 / time_in_ms_fp16, time_in_ms_pseudo_fp16,
               bandwidth * 2 / time_in_ms_pseudo_fp16,
               time_in_ms_fp32 / time_in_ms_fp16,
               time_in_ms_pseudo_fp16 / time_in_ms_fp16);
    };

    // clang-format off
    for (size_t b : {32, 64})
    for (size_t f : {3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}) {
        run(b, 384, 32, 32, f, 1);
        run(b, 384, 64, 64, f, 1);
    }
    // clang-format on
}

#endif  // MEGDNN_WITH_BENCHMARK

// vim: syntax=cpp.doxygen