

/**
 * \file dnn/test/cuda/chanwise_convolution.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "megdnn/oprs/nn.h"

#include "cuda.h"
#include "megcore_cuda.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/convolution.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h"

#include <cuda_profiler_api.h>
#include <cuda_runtime_api.h>

using namespace megdnn;
using namespace test;

namespace {
#if MEGDNN_WITH_BENCHMARK
bool check_need_full_bench() {
    if (getenv("MEGDNN_CHANWISE_CONV_FULLBENCH"))
        return true;
    printf("set MEGDNN_CHANWISE_CONV_FULLBENCH to run full benchmark\n");
    return false;
}
#endif
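
// make a channel-wise (group) convolution param from a dense one; with
// io16xc32 the computation runs in fp32 even when the I/O tensors are fp16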
Convolution::Param gconv_param(Convolution::Param p, bool io16xc32 = false) {
    p.sparse = Convolution::Param::Sparse::GROUP;
    if (io16xc32)
        p.compute_mode = Convolution::Param::ComputeMode::FLOAT32;
    return p;
}
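
// Benchmark helper that runs the same convolution through a dense operator
// (opr0, backed by cudnn) and a channel-wise group operator (opr1, the megdnn
// kernel) on identical data. P0/P1/P2 permute (src, filter, dst) into the
// operand order expected by the operator under test: <0, 1, 2> for forward,
// <1, 2, 0> for backward data, <0, 2, 1> for backward filter.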
template <int P0, int P1, int P2>
class BenchmarkEnv {
    Handle *handle, *handle_cpu;
    std::unique_ptr<GaussianRNG> rng;
    TensorLayout lsrc, lflt0, lflt1, ldst;
    std::unique_ptr<Tensor<>> src0, src1, flt0, flt0_cpu, flt1, flt1_cpu, dst0, dst1;
    cudaEvent_t cuda_ev[3];
    cudaStream_t cuda_stream;
    size_t pad_h, pad_w;

    template <typename T>
    static std::tuple<T, T, T> shuffle(std::tuple<T, T, T> data) {
        return std::make_tuple(
                std::get<P0>(data), std::get<P1>(data), std::get<P2>(data));
    }

public:
    BenchmarkEnv(Handle* handle, Handle* handle_cpu) {
        this->handle = handle;
        this->handle_cpu = handle_cpu;
        rng = handle->create_operator<GaussianRNG>();
        // make cpu handle used
        handle_cpu->create_operator<Sleep>()->exec();

        for (int i = 0; i < 3; ++i)
            cudaEventCreate(&cuda_ev[i]);
        megcoreGetCUDAStream(handle->megcore_computing_handle(), &cuda_stream);
    }

    ~BenchmarkEnv() {
        for (int i = 0; i < 3; ++i)
            cudaEventDestroy(cuda_ev[i]);
    }
    void alloc(
            size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL, size_t FH,
            size_t FW, size_t PH, size_t PW) {
        pad_h = PH;
        pad_w = PW;
        auto mkly = [](const TensorShape& s) {
            return TensorLayout{s, dtype::Float32()};
        };
        lsrc = mkly({N, IC, IH, IW});
        lflt0 = mkly({CHL_MUL * IC, IC, FH, FW});
        lflt1 = mkly({IC, CHL_MUL, 1, FH, FW});
        ldst = mkly({N, IC * CHL_MUL, IH - FH + 1 + PH * 2, IW - FW + 1 + PW * 2});
        src0.reset(new Tensor<>(handle, lsrc));
        src1.reset(new Tensor<>(handle, lsrc));
        flt0.reset(new Tensor<>(handle, lflt0));
        flt0_cpu.reset(new Tensor<>(handle_cpu, lflt0));
        flt1.reset(new Tensor<>(handle, lflt1));
        flt1_cpu.reset(new Tensor<>(handle_cpu, lflt1));
        dst0.reset(new Tensor<>(handle, ldst));
        dst1.reset(new Tensor<>(handle, ldst));
    }

    void fill_src() {
        rng->exec(src0->tensornd(), {});
        megdnn_memcpy_D2D(handle, src1->ptr(), src0->ptr(), lsrc.span().dist_byte());
    }
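
    // flt1 holds the channel-wise filter {IC, CHL_MUL, 1, FH, FW}; build the
    // equivalent dense filter flt0 {CHL_MUL*IC, IC, FH, FW} by scattering each
    // group's weights onto the block diagonal and zeroing everything else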
    void fill_flt() {
        rng->exec(flt1->tensornd(), {});
        megdnn_memcpy_D2H(
                handle, flt1_cpu->ptr(), flt1->ptr(), lflt1.span().dist_byte());

        const size_t IC = lflt1[0], CHL_MUL = lflt1[1], FSIZE = lflt1[3] * lflt1[4];

        // fill flt0 from flt1
        float* src = flt1_cpu->ptr();
        float* dst = flt0_cpu->ptr();
        memset(dst, 0, lflt0.span().dist_byte());
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                memcpy(dst + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                       src + (i * CHL_MUL + j) * FSIZE, FSIZE * sizeof(float));
            }
        }

        megdnn_memcpy_H2D(handle, flt0->ptr(), dst, lflt0.span().dist_byte());
    }

    void fill_dst() {
        rng->exec(dst0->tensornd(), {});
        megdnn_memcpy_D2D(handle, dst1->ptr(), dst0->ptr(), ldst.span().dist_byte());
    }
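
    // run opr0 (dense) and opr1 (channel-wise group) back to back on the same
    // stream, bracketing each with CUDA events so their times can be compared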
    template <class Opr>
    void exec(Opr* opr0, Opr* opr1) {
        opr0->param().pad_h = pad_h;
        opr0->param().pad_w = pad_w;
        opr1->param() = opr0->param();
        opr1->param().sparse = param::Convolution::Sparse::GROUP;

        TensorND a0, b0, c0, a1, b1, c1;
        std::tie(a0, b0, c0) = shuffle(
                std::make_tuple(src0->tensornd(), flt0->tensornd(), dst0->tensornd()));
        std::tie(a1, b1, c1) = shuffle(
                std::make_tuple(src1->tensornd(), flt1->tensornd(), dst1->tensornd()));

        WorkspaceWrapper wk(
                handle,
                std::max(
                        opr0->get_workspace_in_bytes(a0.layout, b0.layout, c0.layout),
                        opr1->get_workspace_in_bytes(a1.layout, b1.layout, c1.layout)));

        cudaProfilerStart();
        cudaEventRecord(cuda_ev[0], cuda_stream);
        opr0->exec(a0, b0, c0, wk.workspace());
        cudaEventRecord(cuda_ev[1], cuda_stream);
        opr1->exec(a1, b1, c1, wk.workspace());
        cudaEventRecord(cuda_ev[2], cuda_stream);
        cudaProfilerStop();

        if (getenv("MEGDNN_CHANWISE_CONV_VERBOSE") ||
            getenv("MEGDNN_CHANWISE_CONV_FULLBENCH")) {
            cudaStreamSynchronize(cuda_stream);
            float t0 = -1, t1 = -1;
            cudaEventElapsedTime(&t0, cuda_ev[0], cuda_ev[1]);
            cudaEventElapsedTime(&t1, cuda_ev[1], cuda_ev[2]);
            printf("%s;%s;%s: cudnn/megdnn: %.3fms/%.3fms=%.3f\n",
                   lsrc.TensorShape::to_string().c_str(),
                   lflt1.TensorShape::to_string().c_str(),
                   ldst.TensorShape::to_string().c_str(), t0, t1, t0 / t1);
        }
    }

    //! special for weight preprocess
    void exec_convolution(ConvolutionForward* opr0, ConvolutionForward* opr1) {
        opr0->param().pad_h = pad_h;
        opr0->param().pad_w = pad_w;
        opr1->param() = opr0->param();
        opr1->param().sparse = param::Convolution::Sparse::GROUP;

        TensorND a0, b0, c0, a1, b1, c1;
        std::tie(a0, b0, c0) = shuffle(
                std::make_tuple(src0->tensornd(), flt0->tensornd(), dst0->tensornd()));
        std::tie(a1, b1, c1) = shuffle(
                std::make_tuple(src1->tensornd(), flt1->tensornd(), dst1->tensornd()));

        WorkspaceWrapper wk(
                handle, std::max(
                                opr0->get_workspace_in_bytes(
                                        a0.layout, b0.layout, c0.layout, nullptr),
                                opr1->get_workspace_in_bytes(
                                        a1.layout, b1.layout, c1.layout, nullptr)));

        cudaProfilerStart();
        cudaEventRecord(cuda_ev[0], cuda_stream);
        opr0->exec(a0, b0, c0, nullptr, wk.workspace());
        cudaEventRecord(cuda_ev[1], cuda_stream);
        opr1->exec(a1, b1, c1, nullptr, wk.workspace());
        cudaEventRecord(cuda_ev[2], cuda_stream);
        cudaProfilerStop();

        if (getenv("MEGDNN_CHANWISE_CONV_VERBOSE") ||
            getenv("MEGDNN_CHANWISE_CONV_FULLBENCH")) {
            cudaStreamSynchronize(cuda_stream);
            float t0 = -1, t1 = -1;
            cudaEventElapsedTime(&t0, cuda_ev[0], cuda_ev[1]);
            cudaEventElapsedTime(&t1, cuda_ev[1], cuda_ev[2]);
            printf("%s;%s;%s: cudnn/megdnn: %.3fms/%.3fms=%.3f\n",
                   lsrc.TensorShape::to_string().c_str(),
                   lflt1.TensorShape::to_string().c_str(),
                   ldst.TensorShape::to_string().c_str(), t0, t1, t0 / t1);
        }
    }

    void cmp_dst() {
        Tensor<> dst0_cpu(handle_cpu, ldst), dst1_cpu(handle_cpu, ldst);
        megdnn_memcpy_D2H(handle, dst0_cpu.ptr(), dst0->ptr(), ldst.span().dist_byte());
        megdnn_memcpy_D2H(handle, dst1_cpu.ptr(), dst1->ptr(), ldst.span().dist_byte());
        dst0_cpu.check_with(dst1_cpu);
    }

    void cmp_src() {
        Tensor<> src0_cpu(handle_cpu, lsrc), src1_cpu(handle_cpu, lsrc);
        megdnn_memcpy_D2H(handle, src0_cpu.ptr(), src0->ptr(), lsrc.span().dist_byte());
        megdnn_memcpy_D2H(handle, src1_cpu.ptr(), src1->ptr(), lsrc.span().dist_byte());
        src0_cpu.check_with(src1_cpu);
    }
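
    // compare filter gradients: walk the block diagonal of the dense result
    // (flt0) against the packed channel-wise result (flt1), bounding both the
    // worst-case and the average element error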
    void cmp_flt() {
        Tensor<> flt0_cpu(handle_cpu, lflt0), flt1_cpu(handle_cpu, lflt1);
        float* p0 = flt0_cpu.ptr();
        float* p1 = flt1_cpu.ptr();
        megdnn_memcpy_D2H(handle, p0, flt0->ptr(), lflt0.span().dist_byte());
        megdnn_memcpy_D2H(handle, p1, flt1->ptr(), lflt1.span().dist_byte());

        size_t IC = lflt1[0], CHL_MUL = lflt1[1], FSIZE = lflt1[3] * lflt1[4];

        double tot_err = 0, tot_err_num = 0;
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                auto t0 = p0 + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                     t1 = p1 + (i * CHL_MUL + j) * FSIZE;
                for (size_t k = 0; k < FSIZE; ++k) {
                    auto err = std::abs(diff(t0[k], t1[k]));
                    tot_err += err;
                    tot_err_num += 1;
                    ASSERT_LT(err, 1e-2) << "failed at " << i << " " << j << " " << k
                                         << " vals=" << t0[k] << "," << t1[k];
                }
            }
        }
        auto avg_err = tot_err / tot_err_num;
        ASSERT_LT(avg_err, 1e-4);
    }
};

}  // anonymous namespace
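
//! convolution mode shared by all testcases below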
constexpr auto M = Convolution::Mode::CROSS_CORRELATION;

TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD) {
    Checker<Convolution> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                              "CHANNEL_WISE", {})
                              .c_str(),
                      {}}}},
            &require_algo));

    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-2);

        // simple case
        // clang-format off
        for (uint32_t s : {1, 2})
        for (uint32_t p : {0, 1, 2, 3})
        for (size_t f : {2, 3, 5, 7})
        for (size_t ocpg : {1, 3}) {
            checker.set_param(gconv_param({M, p, p, s, s}))
                    .execs({{2, 3, 16, 16}, {3, ocpg, 1, f, f}, {}});
        }
        // clang-format on

        checker.set_param(gconv_param({M, 2, 3, 2, 1}))
                .execs({{32, 12, 20, 10}, {12, 2, 1, 4, 5}, {}});

        // padding larger than kern
        checker.set_param(gconv_param({M, 20, 30, 4, 5}))
                .execs({{32, 12, 20, 10}, {12, 2, 1, 4, 5}, {}});
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_SMALL) {
    Checker<Convolution> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                              "CHANNEL_WISE_SMALL", {})
                              .c_str(),
                      {}}}},
            &require_algo));

    for (auto dtype : std::vector<DType>{
                 dtype::Float32(),
#if CUDA_VERSION >= 9000
                 dtype::Float16()
#endif
         }) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-2);

        // clang-format off
        for (uint32_t s : {1})
        for (uint32_t f : {1, 3, 5, 7}) {
            checker.set_param(gconv_param({M, f / 2, f / 2, s, s}))
                    .execs({{2, 3, 16, 16}, {3, 1, 1, f, f}, {}});
        }
        // clang-format on

        checker.set_param(gconv_param({M, 1, 1, 1, 1}))
                .execs({{2, 3, 3, 16}, {3, 1, 1, 3, 3}, {}})
                .execs({{2, 3, 8, 3}, {3, 1, 1, 3, 3}, {}});
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA) {
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE", &require_algo));

    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(1e-1);

        // simple case
        // clang-format off
        for (uint32_t s : {1, 2})
        for (uint32_t p : {0, 1, 2, 3})
        for (size_t f : {1, 2, 3, 5, 7})
        for (size_t ocpg : {1, 3}) {
            size_t ii = infer_conv_shape(16, f, s, p, true);
            checker.set_param(gconv_param({M, p, p, s, s}))
                    .execs({{3, ocpg, 1, f, f},
                            {2, 3 * ocpg, ii, ii},
                            {2, 3, 16, 16}});
        }
        // clang-format on

        checker.set_param(gconv_param({M, 2, 3, 2, 1}))
                .execs({{12, 3, 1, 4, 5}, {32, 36, 20, 10}, {32, 12, 39, 8}});

        checker.set_param(gconv_param({M, 30, 20, 5, 4}))
                .execs({{6, 2, 1, 5, 4}, {32, 12, 12, 10}, {32, 6, 3, 2}});

        checker.set_param(gconv_param({M, 20, 30, 4, 5}))
                .execs({{6, 2, 1, 4, 5}, {32, 12, 10, 12}, {32, 6, 2, 3}});
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_SMALL) {
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE_SMALL", &require_algo));

    for (auto dtype : std::vector<DType>{
                 dtype::Float32(),
#if CUDA_VERSION >= 9000
                 dtype::Float16()
#endif
         }) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-2);

        for (uint32_t f : {1, 3, 5, 7}) {
            checker.set_param(gconv_param({M, f / 2, f / 2, 1, 1}))
                    .execs({{3, 1, 1, f, f}, {2, 3, 16, 16}, {2, 3, 16, 16}});
        }

        checker.set_param(gconv_param({M, 1, 1, 1, 1}))
                .execs({{3, 1, 1, 3, 3}, {2, 3, 3, 16}, {2, 3, 3, 16}})
                .execs({{3, 1, 1, 3, 3}, {2, 3, 8, 3}, {2, 3, 8, 3}});
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER) {
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardFilter>("CHANNEL_WISE", &require_algo));
    UniformFloatRNG rng(-0.1, 0.1);

    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
        checker.set_dtype(0, dtype)
                .set_dtype(1, dtype)
                .set_dtype(2, dtype)
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-1);

        // simple case
        // clang-format off
        for (uint32_t s : {1, 2})
        for (uint32_t p : {0, 1, 2, 3})
        for (uint32_t f : {1, 2, 3, 5, 7})
        for (uint32_t ocpg : {1, 3})
        for (uint32_t i : {8, 16, 32, 64}) {
            size_t ii = infer_conv_shape(i, f, s, p, true);
            checker.set_param(gconv_param({M, p, p, s, s}))
                    .execs({{2, 3, i, i},
                            {2, 3 * ocpg, ii, ii},
                            {3, ocpg, 1, f, f}});
        }
        // clang-format on

        // padding larger than kern
        checker.set_param(gconv_param({M, 20, 30, 4, 5}))
                .execs({{32, 6, 2, 3}, {32, 12, 10, 12}, {6, 2, 1, 4, 5}});

        // unused filter items
        checker.set_param(gconv_param({M, 2, 3, 2, 3}))
                .execs({{32, 6, 1, 1}, {32, 12, 1, 1}, {6, 2, 1, 5, 7}});
    }
}

namespace {
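
// wrap an algorithm name in the checker type expected by each operator;
// ConvolutionForward is special-cased because its channel-wise kernels are
// dispatched through the "DEFAULT" conv-bias execution policy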
template <typename Op>
struct AlgoCheckerMaker {
    static auto make(const char* name, bool* require_algo) {
        return AlgoChecker<Op>(name, require_algo);
    }
};

template <>
struct AlgoCheckerMaker<ConvolutionForward> {
    static auto make(const char* name, bool* require_algo) {
        return AlgoChecker<ConvolutionForward>(
                ExecutionPolicyAlgoName{
                        "DEFAULT",
                        {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                                  name, {})
                                  .c_str(),
                          {}}}},
                require_algo);
    }
};
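
// run a fixed set of channel-wise cases through the named algorithm, checking
// results against the reference implementation; io_type/comp_type select pure
// fp32, pure fp16 (looser epsilon), or fp16 I/O with fp32 computation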
template <typename Op>
void check_chanwise(DType io_type, DType comp_type, Handle* handle, const char* name) {
    Checker<Op> checker(handle);
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoCheckerMaker<Op>::make(name, &require_algo));
    checker.set_dtype(0, io_type).set_dtype(1, io_type).set_dtype(2, io_type);
    bool io16xc32 = false;
    if (io_type == dtype::Float16()) {
        if (comp_type == dtype::Float16()) {
            checker.set_epsilon(1e-1);
        } else {
            io16xc32 = true;
        }
    }
    // dispatch testcase by operation
    if (std::is_same<Op, ConvolutionForward>::value) {
        // align 8
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}});
        // align 1
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{8, 2, 15, 15}, {2, 1, 1, 15, 15}, {}});
        // align 2
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{8, 2, 14, 14}, {2, 1, 1, 15, 15}, {}});
        // custom padding
        checker.set_param(gconv_param({M, 3, 3, 1, 1}, io16xc32))
                .execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}});
        // custom stride
        checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32))
                .execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}});
    } else if (std::is_same<Op, ConvolutionBackwardData>::value) {
        // align 8
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{2, 1, 1, 15, 15}, {8, 2, 16, 16}, {8, 2, 16, 16}});
        // align 1
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{2, 1, 1, 15, 15}, {8, 2, 15, 15}, {8, 2, 15, 15}});
        // align 2
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{2, 1, 1, 15, 15}, {8, 2, 14, 14}, {8, 2, 14, 14}});
        // custom padding
        checker.set_param(gconv_param({M, 3, 3, 1, 1}, io16xc32))
                .execs({{2, 1, 1, 15, 15}, {8, 2, 8, 8}, {8, 2, 16, 16}});
        // custom stride
        checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32))
                .execs({{2, 1, 1, 15, 15}, {8, 2, 7, 7}, {8, 2, 14, 14}});
    } else if (std::is_same<Op, ConvolutionBackwardFilter>::value) {
    }
}

}  // namespace
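
// each cb() below instantiates one test per CUTLASS kernel configuration; the
// arguments are (tag, threadblock M/N/K, warp M/N/K) and are spliced into the
// algorithm name, e.g. FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_128X128X8_32X64X8_2stage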
#define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb) \
    cb(1, 128, 128, 8, 32, 64, 8);                          \
    cb(2, 128, 64, 8, 64, 32, 8);                           \
    cb(3, 128, 32, 8, 64, 32, 8);                           \
    cb(4, 64, 128, 8, 32, 64, 8);                           \
    cb(5, 32, 128, 8, 32, 64, 8);                           \
    cb(6, 64, 64, 8, 32, 64, 8);                            \
    cb(7, 32, 64, 8, 32, 64, 8);                            \
    cb(8, 32, 32, 8, 32, 32, 8);                            \
    cb(9, 64, 32, 8, 64, 32, 8);

#define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                       \
    TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_FMA_##tag) {               \
        check_chanwise<ConvolutionForward>(                                      \
                dtype::Float32(), dtype::Float32(), handle_cuda(),               \
                "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_2stage");                              \
    }

MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb)

#undef cb

#define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                       \
    TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_FMA_##tag) {         \
        check_chanwise<ConvolutionBackwardData>(                                 \
                dtype::Float32(), dtype::Float32(), handle_cuda(),               \
                "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_2stage");                              \
    }

MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb)

#undef cb

#undef MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL

#define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb) \
    cb(1, 128, 128, 32, 32, 32, 32);                         \
    cb(2, 128, 256, 32, 64, 64, 32);                         \
    cb(3, 128, 64, 32, 32, 32, 32);                          \
    cb(4, 64, 128, 32, 32, 32, 32);                          \
    cb(5, 64, 64, 32, 32, 32, 32);

// check both ioc16 and io16xc32
#define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                        \
    TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_HMMA_##tag) {               \
        check_chanwise<ConvolutionForward>(                                       \
                dtype::Float16(), dtype::Float16(), handle_cuda(),                \
                "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_2stage");                               \
        check_chanwise<ConvolutionForward>(                                       \
                dtype::Float16(), dtype::Float32(), handle_cuda(),                \
                "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_2stage");                               \
    }

MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb)

#undef cb

#define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                        \
    TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_HMMA_##tag) {         \
        check_chanwise<ConvolutionBackwardData>(                                  \
                dtype::Float16(), dtype::Float16(), handle_cuda(),                \
                "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_mma8X8X4_2stage");                      \
        check_chanwise<ConvolutionBackwardData>(                                  \
                dtype::Float16(), dtype::Float32(), handle_cuda(),                \
                "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_mma8X8X4_2stage");                      \
    }

MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb)

#undef cb

#undef MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL

#if MEGDNN_WITH_BENCHMARK
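
// the *_BENCH_CHECK tests drive BenchmarkEnv: run the dense (cudnn) and
// channel-wise (megdnn) operators on identical random data, time both with
// CUDA events, and verify the results agree; set MEGDNN_CHANWISE_CONV_VERBOSE
// for per-case timings and MEGDNN_CHANWISE_CONV_FULLBENCH for the large cases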
TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionForward>();
    auto conv1 = handle->create_operator<ConvolutionForward>();
    BenchmarkEnv<0, 1, 2> benv(handle, handle_cpu);

    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL, size_t FH,
                   size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_src();
        benv.fill_flt();
        benv.exec_convolution(conv0.get(), conv1.get());
        benv.cmp_dst();
    };

    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BWD_DATA_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionBackwardData>();
    auto conv1 = handle->create_operator<ConvolutionBackwardData>();
    BenchmarkEnv<1, 2, 0> benv(handle, handle_cpu);

    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL, size_t FH,
                   size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_dst();
        benv.fill_flt();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_src();
    };

    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BWD_FILTER_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionBackwardFilter>();
    auto conv1 = handle->create_operator<ConvolutionBackwardFilter>();
    BenchmarkEnv<0, 2, 1> benv(handle, handle_cpu);

    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL, size_t FH,
                   size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_src();
        benv.fill_dst();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_flt();
    };

    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_FWD) {
    // enable profiling
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    proxy->warmup_times = 1;
    proxy->exec_times = 10;
    Benchmarker<ConvolutionForward> checker(handle_cuda());
    checker.set_times(1);
    ConvolutionForward::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.set_proxy(proxy);

    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, size_t FW) {
        checker.proxy()->target_execution_policy = {};
        checker.execs({{N, C, IH, IW}, {C, 1, 1, FH, FW}, {}});
    };

    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_DATA) {
    // enable profiling
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    proxy->warmup_times = 1;
    proxy->exec_times = 10;
    Benchmarker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_times(1);
    ConvolutionBackwardData::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.set_proxy(proxy);

    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, size_t FW) {
        checker.proxy()->target_execution_policy.algo.reset();
        checker.execs(
                {{C, 1, 1, FH, FW}, {N, C, IH - FH + 1, IW - FW + 1}, {N, C, IH, IW}});
    };

    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_FILTER) {
    // enable profiling
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    proxy->warmup_times = 1;
    proxy->exec_times = 10;
    Benchmarker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_times(1);
    ConvolutionBackwardFilter::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.set_proxy(proxy);

    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, size_t FW) {
        checker.proxy()->target_execution_policy.algo.reset();
        checker.execs(
                {{N, C, IH, IW}, {N, C, IH - FH + 1, IW - FW + 1}, {C, 1, 1, FH, FW}});
    };

    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 10;
    bencher.set_display(false).set_times(RUNS);
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    bencher.set_proxy(proxy);

    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;

    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;

        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout);
        // element count scaled so that bandwidth * sizeof(elem) / time_in_ms
        // yields GB/s
        float bandwidth = static_cast<float>(
                                  src.total_nr_elems() + filter.total_nr_elems() +
                                  dst_layout.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;

        bencher.proxy()->target_execution_policy.algo.reset();
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bencher.set_param(param);
        auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS;

        printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "pseudo float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32) %.2f (fp16/pseudo fp16)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32,
               bandwidth * 4 / time_in_ms_fp32, time_in_ms_fp16,
               bandwidth * 2 / time_in_ms_fp16, time_in_ms_pseudo_fp16,
               bandwidth * 2 / time_in_ms_pseudo_fp16,
               time_in_ms_fp32 / time_in_ms_fp16,
               time_in_ms_pseudo_fp16 / time_in_ms_fp16);
    };

    // clang-format off
    for (size_t s : {1, 2})
    for (size_t f : {3, 5, 7})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {128, 256})
    for (size_t iw : {128, 256})
        run(batch, c, ih, iw, f, s);
    // clang-format on

    run(128, 192, 28, 28, 3, 1);
    run(128, 192, 28, 28, 3, 2);
    run(128, 576, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 32, 112, 112, 3, 1);
    run(128, 960, 7, 7, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 144, 56, 56, 3, 2);
    run(128, 384, 14, 14, 3, 1);
    run(128, 144, 56, 56, 3, 1);
    run(128, 96, 112, 112, 3, 2);
    run(128, 384, 14, 14, 3, 1);
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
    run(128, 576, 14, 14, 3, 2);
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                              "CHANNEL_WISE", {})
                              .c_str(),
                      {}}}}));

    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;

    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;

        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout);
        float bandwidth = static_cast<float>(
                                  src.total_nr_elems() + filter.total_nr_elems() +
                                  dst_layout.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;

        printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32,
               bandwidth * 4 / time_in_ms_fp32, time_in_ms_fp16,
               bandwidth * 2 / time_in_ms_fp16, time_in_ms_fp32 / time_in_ms_fp16);
    };

    // clang-format off
    for (size_t s : {1})
    for (size_t f : {3, 5, 7})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32, 128, 256})
    for (size_t iw : {8, 16, 32, 128, 256})
        run(batch, c, ih, iw, f, s);
    // clang-format on
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT_SMALL) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);

    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;

    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;

        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout);
        float bandwidth = static_cast<float>(
                                  src.total_nr_elems() + filter.total_nr_elems() +
                                  dst_layout.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_before_exec_callback(
                        AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
                                "DEFAULT",
                                {{ConvBiasForward::algo_name<
                                          ConvBiasForward::DirectParam>(
                                          "CHANNEL_WISE", {})
                                          .c_str(),
                                  {}}}}));
        auto time_in_ms_fp32_normal = bencher.execs({src, filter, {}}) / RUNS;

        bencher.set_before_exec_callback(
                AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
                        "DEFAULT",
                        {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                                  "CHANNEL_WISE_SMALL", {})
                                  .c_str(),
                          {}}}}));
        auto time_in_ms_fp32_small = bencher.execs({src, filter, {}}) / RUNS;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16_small = bencher.execs({src, filter, {}}) / RUNS;

        printf("stride=%zu src=%s, filter=%s, fp32 normal: %.2fms %.2fGB/s "
               "small: %.2fms %.2fGB/s, fp16 small: %.2fms %.2fGB/s, "
               "speedup: "
               "%0.2f (fp32 small/normal) %0.2f (small fp16/fp32)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               time_in_ms_fp32_normal, bandwidth * 4 / time_in_ms_fp32_normal,
               time_in_ms_fp32_small, bandwidth * 4 / time_in_ms_fp32_small,
               time_in_ms_fp16_small, bandwidth * 2 / time_in_ms_fp16_small,
               time_in_ms_fp32_normal / time_in_ms_fp32_small,
               time_in_ms_fp32_small / time_in_ms_fp16_small);
    };

    // clang-format off
    for (size_t s : {1})
    for (size_t f : {3, 5})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32})
    for (size_t iw : {8, 16, 32})
        run(batch, c, ih, iw, f, s);
    // clang-format on

    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 960, 7, 7, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_CUDNN_DNN) {
    CUBenchmarker<ConvBiasForward> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);

    ConvBias::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = ConvBias::Param::Sparse::GROUP;
    NormalRNG rng;

    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::ConvBias::ComputeMode::DEFAULT;

        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f},
                    bias = {1, c, 1, 1};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<ConvBias>();
        opr->param() = param;
        opr->deduce_layout(
                {src, dtype::Float32()}, {filter, dtype::Float32()},
                {bias, dtype::Float32()}, {}, dst_layout);
        float computation_mops =
                static_cast<float>(dst_layout.total_nr_elems() * f * f * 2) * 1e-6;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.set_before_exec_callback(
                AlgoChecker<ConvBiasForward>(".+CHANNEL_WISE.+"));
        auto time_in_ms_dnn = bencher.execs({src, filter, bias, {}, {}}) / RUNS;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.set_before_exec_callback(AlgoChecker<ConvBiasForward>(
                ".+CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM.+"));
        auto time_in_ms_cudnn = bencher.execs({src, filter, bias, {}, {}}) / RUNS;

        printf("stride=%zu src=%s, filter=%s, dst=%s, dnn: %.2fms %.2fGFLOPS "
               "cudnn: %.2fms %.2fGFLOPS "
               "speedup: "
               "%0.2f (dnn/cudnn)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               dst_layout.to_string().c_str(), time_in_ms_dnn,
               computation_mops / time_in_ms_dnn, time_in_ms_cudnn,
               computation_mops / time_in_ms_cudnn, time_in_ms_cudnn / time_in_ms_dnn);
    };

    // clang-format off
    for (size_t batch : {1, 16, 32, 64, 128}) {
        run(batch, 32, 112, 112, 3, 1);
        run(batch, 96, 112, 112, 3, 2);
        run(batch, 96, 112, 112, 3, 1);
        run(batch, 144, 56, 56, 3, 2);
        run(batch, 144, 56, 56, 3, 1);
        run(batch, 192, 28, 28, 3, 1);
        run(batch, 384, 14, 14, 3, 1);
        run(batch, 576, 14, 14, 3, 1);
        run(batch, 960, 7, 7, 3, 1);
        //! calibrate heuristic algo policy hw_size param
        run(batch, 144, 24, 24, 3, 1);
        run(batch, 144, 22, 22, 3, 1);
        run(batch, 144, 20, 20, 3, 1);
        run(batch, 144, 18, 18, 3, 1);
    }
    // clang-format on
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_DATA_FLOAT_SMALL) {
    CUBenchmarker<ConvolutionBackwardData> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);

    ConvolutionBackwardData::Param param;
    param.format = Convolution::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;

    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;

        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        float bandwidth = static_cast<float>(
                                  src.total_nr_elems() + filter.total_nr_elems() +
                                  src.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_before_exec_callback(
                        AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE"));
        auto time_in_ms_fp32_normal = bencher.execs({filter, src, src}) / RUNS;

        bencher.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE_SMALL"));
        auto time_in_ms_fp32_small = bencher.execs({filter, src, src}) / RUNS;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16_small = bencher.execs({filter, src, src}) / RUNS;

        printf("stride=%zu src=%s, filter=%s, fp32 normal: %.2fms %.2fGB/s "
               "small: %.2fms %.2fGB/s, fp16 small: %.2fms %.2fGB/s, "
               "speedup: "
               "%0.2f (fp32 small/normal) %0.2f (small fp16/fp32)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               time_in_ms_fp32_normal, bandwidth * 4 / time_in_ms_fp32_normal,
               time_in_ms_fp32_small, bandwidth * 4 / time_in_ms_fp32_small,
               time_in_ms_fp16_small, bandwidth * 2 / time_in_ms_fp16_small,
               time_in_ms_fp32_normal / time_in_ms_fp32_small,
               time_in_ms_fp32_small / time_in_ms_fp16_small);
    };

    // clang-format off
    for (size_t s : {1})
    for (size_t f : {3, 5})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32})
    for (size_t iw : {8, 16, 32})
        run(batch, c, ih, iw, f, s);
    // clang-format on

    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 960, 7, 7, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BWD_DATA) {
    CUBenchmarker<ConvolutionBackwardData> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE"));

    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;

    auto run = [&](size_t batch, size_t ocpg, size_t group, size_t ih, size_t iw,
                   size_t f, size_t p, size_t s) {
        param.pad_h = p;
        param.pad_w = p;
        param.stride_h = s;
        param.stride_w = s;
        size_t oh, ow;
        infer_conv_shape2d(ih, iw, f, f, s, s, p, p, oh, ow, true);
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;

        TensorShape src_grad = {batch, group, ih, iw},
                    dst_grad = {batch, group * ocpg, oh, ow},
                    flt = {group, ocpg, 1, f, f};
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        float bandwidth = static_cast<float>(
                                  flt.total_nr_elems() + dst_grad.total_nr_elems() +
                                  src_grad.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp32 = bencher.execs({flt, dst_grad, src_grad}) / RUNS;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16 = bencher.execs({flt, dst_grad, src_grad}) / RUNS;

        printf("stride=%zu, src_grad=%s, flt=%s, "
               "float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32)\n",
               s, src_grad.to_string().c_str(), flt.to_string().c_str(),
               time_in_ms_fp32, bandwidth * 4 / time_in_ms_fp32, time_in_ms_fp16,
               bandwidth * 2 / time_in_ms_fp16, time_in_ms_fp32 / time_in_ms_fp16);
    };

    // clang-format off
    for (size_t s : {1, 2})
    for (size_t f : {3, 5, 7})
    for (size_t p : {f / 2})
    for (size_t batch : {64})
    for (size_t ocpg : {1})
    for (size_t group : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32, 128, 256})
    for (size_t iw : {8, 16, 32, 128, 256})
        run(batch, ocpg, group, ih, iw, f, p, s);
    // clang-format on
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BWD_FILTER) {
    CUBenchmarker<ConvolutionBackwardFilter> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardFilter>("CHANNEL_WISE"));

    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;

    auto run = [&](size_t batch, size_t ocpg, size_t group, size_t i, size_t f,
                   size_t p, size_t s) {
        param.pad_h = p;
        param.pad_w = p;
        param.stride_h = s;
        param.stride_w = s;
        size_t d = infer_conv_shape(i, f, s, p, true);
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;

        TensorShape src = {batch, group, i, i}, dst_grad = {batch, group * ocpg, d, d},
                    flt_grad = {group, ocpg, 1, f, f};
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        float bandwidth = static_cast<float>(
                                  flt_grad.total_nr_elems() +
                                  dst_grad.total_nr_elems() + src.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp32 = bencher.execs({src, dst_grad, flt_grad}) / RUNS;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16 = bencher.execs({src, dst_grad, flt_grad}) / RUNS;

        printf("stride=%zu, src=%s, flt_grad=%s, "
               "float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "speedup: "
               "%.2f (fp16/fp32)\n",
               s, src.to_string().c_str(), flt_grad.to_string().c_str(),
               time_in_ms_fp32, bandwidth * 4 / time_in_ms_fp32, time_in_ms_fp16,
               bandwidth * 2 / time_in_ms_fp16, time_in_ms_fp32 / time_in_ms_fp16);
    };

    // clang-format off
    for (size_t s : {1, 2})
    for (size_t f : {3, 5, 7})
    for (size_t p : {f / 2})
    for (size_t batch : {64})
    for (size_t ocpg : {1})
    for (size_t group : {16, 32, 64, 128})
    for (size_t i : {8, 16, 32, 64, 128})
        run(batch, ocpg, group, i, f, p, s);
    // clang-format on
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_LARGE_KERNEL) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 100;
    bencher.set_display(false).set_times(RUNS);
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    bencher.set_proxy(proxy);

    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;

    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;

        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout);
        float bandwidth = static_cast<float>(
                                  src.total_nr_elems() + filter.total_nr_elems() +
                                  dst_layout.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;

        bencher.proxy()->target_execution_policy.algo.reset();
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bencher.set_param(param);
        auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS;

        printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "pseudo float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32) %.2f (fp16/pseudo fp16)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32,
               bandwidth * 4 / time_in_ms_fp32, time_in_ms_fp16,
               bandwidth * 2 / time_in_ms_fp16, time_in_ms_pseudo_fp16,
               bandwidth * 2 / time_in_ms_pseudo_fp16,
               time_in_ms_fp32 / time_in_ms_fp16,
               time_in_ms_pseudo_fp16 / time_in_ms_fp16);
    };

    // clang-format off
    for (size_t b : {32, 64})
    for (size_t f : {3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}) {
        run(b, 384, 32, 32, f, 1);
        run(b, 384, 64, 64, f, 1);
    }
    // clang-format on
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_DATA_LARGE_KERNEL) {
    CUBenchmarker<ConvolutionBackwardData> bencher(handle_cuda());
    size_t RUNS = 100;
    bencher.set_display(false).set_times(RUNS);
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    bencher.set_proxy(proxy);

    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;

    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;

        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout);
        float bandwidth = static_cast<float>(
                                  src.total_nr_elems() + filter.total_nr_elems() +
                                  dst_layout.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp32 = bencher.execs({filter, src, src}) / RUNS;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp16 = bencher.execs({filter, src, src}) / RUNS;

        bencher.proxy()->target_execution_policy.algo.reset();
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bencher.set_param(param);
        auto time_in_ms_pseudo_fp16 = bencher.execs({filter, src, src}) / RUNS;

        printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "pseudo float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32) %.2f (fp16/pseudo fp16)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32,
               bandwidth * 4 / time_in_ms_fp32, time_in_ms_fp16,
               bandwidth * 2 / time_in_ms_fp16, time_in_ms_pseudo_fp16,
               bandwidth * 2 / time_in_ms_pseudo_fp16,
               time_in_ms_fp32 / time_in_ms_fp16,
               time_in_ms_pseudo_fp16 / time_in_ms_fp16);
    };

    // clang-format off
    for (size_t b : {32, 64})
    for (size_t f : {3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}) {
        run(b, 384, 32, 32, f, 1);
        run(b, 384, 64, 64, f, 1);
    }
    // clang-format on
}
#endif

// vim: syntax=cpp.doxygen