
chanwise_convolution.cpp 44 kB

/**
 * \file dnn/test/cuda/chanwise_convolution.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "megdnn/oprs/nn.h"
#include "test/cuda/fixture.h"
#include "test/cuda/benchmark.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/common/checker.h"
#include "test/common/convolution.h"
#include "test/common/benchmarker.h"
#include "megcore_cuda.h"
#include "cuda.h"
#include <cuda_profiler_api.h>
#include <cuda_runtime_api.h>

using namespace megdnn;
using namespace test;
namespace {

#if MEGDNN_WITH_BENCHMARK
bool check_need_full_bench() {
    if (getenv("MEGDNN_CHANWISE_CONV_FULLBENCH"))
        return true;
    printf("set MEGDNN_CHANWISE_CONV_FULLBENCH to run full benchmark\n");
    return false;
}
#endif

Convolution::Param gconv_param(Convolution::Param p) {
    p.sparse = Convolution::Param::Sparse::GROUP;
    return p;
}
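
// A channel-wise (depthwise) convolution is expressed throughout this file as
// a GROUP sparse convolution whose group count equals the number of input
// channels: the grouped filter has shape {IC, CHL_MUL, 1, FH, FW}, where
// CHL_MUL is the number of output channels produced per input channel.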

template <int P0, int P1, int P2>
class BenchmarkEnv {
    Handle *handle, *handle_cpu;
    std::unique_ptr<GaussianRNG> rng;
    TensorLayout lsrc, lflt0, lflt1, ldst;
    std::unique_ptr<Tensor<>> src0, src1, flt0, flt0_cpu, flt1, flt1_cpu, dst0,
            dst1;
    cudaEvent_t cuda_ev[3];
    cudaStream_t cuda_stream;
    size_t pad_h, pad_w;

    template <typename T>
    static std::tuple<T, T, T> shuffle(std::tuple<T, T, T> data) {
        return std::make_tuple(
                std::get<P0>(data), std::get<P1>(data), std::get<P2>(data));
    }
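
    // shuffle() above permutes the (src, filter, dst) triple into the operand
    // order expected by the operator under test: BenchmarkEnv<0, 1, 2> is used
    // for forward, <1, 2, 0> for backward data and <0, 2, 1> for backward
    // filter (see the instantiations in the *_BENCH_CHECK tests below).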

public:
    BenchmarkEnv(Handle* handle, Handle* handle_cpu) {
        this->handle = handle;
        this->handle_cpu = handle_cpu;
        rng = handle->create_operator<GaussianRNG>();
        // make cpu handle used
        handle_cpu->create_operator<Sleep>()->exec();
        for (int i = 0; i < 3; ++i)
            cudaEventCreate(&cuda_ev[i]);
        megcoreGetCUDAStream(handle->megcore_computing_handle(), &cuda_stream);
    }

    ~BenchmarkEnv() {
        for (int i = 0; i < 3; ++i)
            cudaEventDestroy(cuda_ev[i]);
    }

    void alloc(size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL,
               size_t FH, size_t FW, size_t PH, size_t PW) {
        pad_h = PH;
        pad_w = PW;
        auto mkly = [](const TensorShape& s) {
            return TensorLayout{s, dtype::Float32()};
        };
        lsrc = mkly({N, IC, IH, IW});
        lflt0 = mkly({CHL_MUL * IC, IC, FH, FW});  // dense filter layout
        lflt1 = mkly({IC, CHL_MUL, 1, FH, FW});    // grouped (channel-wise) layout
        // output spatial size for a stride-1 convolution: I - F + 1 + 2 * P
        ldst = mkly({N, IC * CHL_MUL, IH - FH + 1 + PH * 2, IW - FW + 1 + PW * 2});
        src0.reset(new Tensor<>(handle, lsrc));
        src1.reset(new Tensor<>(handle, lsrc));
        flt0.reset(new Tensor<>(handle, lflt0));
        flt0_cpu.reset(new Tensor<>(handle_cpu, lflt0));
        flt1.reset(new Tensor<>(handle, lflt1));
        flt1_cpu.reset(new Tensor<>(handle_cpu, lflt1));
        dst0.reset(new Tensor<>(handle, ldst));
        dst1.reset(new Tensor<>(handle, ldst));
    }

    void fill_src() {
        rng->exec(src0->tensornd(), {});
        megdnn_memcpy_D2D(handle, src1->ptr(), src0->ptr(), lsrc.span().dist_byte());
    }

    void fill_flt() {
        rng->exec(flt1->tensornd(), {});
        megdnn_memcpy_D2H(
                handle, flt1_cpu->ptr(), flt1->ptr(), lflt1.span().dist_byte());

        const size_t IC = lflt1[0], CHL_MUL = lflt1[1], FSIZE = lflt1[3] * lflt1[4];

        // fill flt0 from flt1
        float* src = flt1_cpu->ptr();
        float* dst = flt0_cpu->ptr();
        memset(dst, 0, lflt0.span().dist_byte());
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                memcpy(dst + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                       src + (i * CHL_MUL + j) * FSIZE, FSIZE * sizeof(float));
            }
        }

        megdnn_memcpy_H2D(handle, flt0->ptr(), dst, lflt0.span().dist_byte());
    }
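
    // fill_flt() above embeds the grouped filter into the dense one: output
    // channel i * CHL_MUL + j only reads input channel i, so the dense filter
    // {CHL_MUL * IC, IC, FH, FW} is zero everywhere except a block-diagonal
    // set of FH x FW slices. This makes the dense (cuDNN) and grouped (megdnn)
    // convolutions compute the same function, so their outputs can be compared.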

    void fill_dst() {
        rng->exec(dst0->tensornd(), {});
        megdnn_memcpy_D2D(handle, dst1->ptr(), dst0->ptr(), ldst.span().dist_byte());
    }

    template <class Opr>
    void exec(Opr* opr0, Opr* opr1) {
        opr0->param().pad_h = pad_h;
        opr0->param().pad_w = pad_w;
        opr1->param() = opr0->param();
        opr1->param().sparse = param::Convolution::Sparse::GROUP;
        TensorND a0, b0, c0, a1, b1, c1;
        std::tie(a0, b0, c0) = shuffle(std::make_tuple(
                src0->tensornd(), flt0->tensornd(), dst0->tensornd()));
        std::tie(a1, b1, c1) = shuffle(std::make_tuple(
                src1->tensornd(), flt1->tensornd(), dst1->tensornd()));
        WorkspaceWrapper wk(
                handle,
                std::max(
                        opr0->get_workspace_in_bytes(a0.layout, b0.layout, c0.layout),
                        opr1->get_workspace_in_bytes(a1.layout, b1.layout, c1.layout)));
        cudaProfilerStart();
        cudaEventRecord(cuda_ev[0], cuda_stream);
        opr0->exec(a0, b0, c0, wk.workspace());
        cudaEventRecord(cuda_ev[1], cuda_stream);
        opr1->exec(a1, b1, c1, wk.workspace());
        cudaEventRecord(cuda_ev[2], cuda_stream);
        cudaProfilerStop();
        if (getenv("MEGDNN_CHANWISE_CONV_VERBOSE") ||
            getenv("MEGDNN_CHANWISE_CONV_FULLBENCH")) {
            cudaStreamSynchronize(cuda_stream);
            float t0 = -1, t1 = -1;
            cudaEventElapsedTime(&t0, cuda_ev[0], cuda_ev[1]);
            cudaEventElapsedTime(&t1, cuda_ev[1], cuda_ev[2]);
            printf("%s;%s;%s: cudnn/megdnn: %.3fms/%.3fms=%.3f\n",
                   lsrc.TensorShape::to_string().c_str(),
                   lflt1.TensorShape::to_string().c_str(),
                   ldst.TensorShape::to_string().c_str(), t0, t1, t0 / t1);
        }
    }

    //! special for weight preprocess
    void exec_convolution(ConvolutionForward* opr0, ConvolutionForward* opr1) {
        opr0->param().pad_h = pad_h;
        opr0->param().pad_w = pad_w;
        opr1->param() = opr0->param();
        opr1->param().sparse = param::Convolution::Sparse::GROUP;
        TensorND a0, b0, c0, a1, b1, c1;
        std::tie(a0, b0, c0) = shuffle(std::make_tuple(
                src0->tensornd(), flt0->tensornd(), dst0->tensornd()));
        std::tie(a1, b1, c1) = shuffle(std::make_tuple(
                src1->tensornd(), flt1->tensornd(), dst1->tensornd()));
        WorkspaceWrapper wk(
                handle,
                std::max(
                        opr0->get_workspace_in_bytes(
                                a0.layout, b0.layout, c0.layout, nullptr),
                        opr1->get_workspace_in_bytes(
                                a1.layout, b1.layout, c1.layout, nullptr)));
        cudaProfilerStart();
        cudaEventRecord(cuda_ev[0], cuda_stream);
        opr0->exec(a0, b0, c0, nullptr, wk.workspace());
        cudaEventRecord(cuda_ev[1], cuda_stream);
        opr1->exec(a1, b1, c1, nullptr, wk.workspace());
        cudaEventRecord(cuda_ev[2], cuda_stream);
        cudaProfilerStop();
        if (getenv("MEGDNN_CHANWISE_CONV_VERBOSE") ||
            getenv("MEGDNN_CHANWISE_CONV_FULLBENCH")) {
            cudaStreamSynchronize(cuda_stream);
            float t0 = -1, t1 = -1;
            cudaEventElapsedTime(&t0, cuda_ev[0], cuda_ev[1]);
            cudaEventElapsedTime(&t1, cuda_ev[1], cuda_ev[2]);
            printf("%s;%s;%s: cudnn/megdnn: %.3fms/%.3fms=%.3f\n",
                   lsrc.TensorShape::to_string().c_str(),
                   lflt1.TensorShape::to_string().c_str(),
                   ldst.TensorShape::to_string().c_str(), t0, t1, t0 / t1);
        }
    }

    void cmp_dst() {
        Tensor<> dst0_cpu(handle_cpu, ldst), dst1_cpu(handle_cpu, ldst);
        megdnn_memcpy_D2H(
                handle, dst0_cpu.ptr(), dst0->ptr(), ldst.span().dist_byte());
        megdnn_memcpy_D2H(
                handle, dst1_cpu.ptr(), dst1->ptr(), ldst.span().dist_byte());
        dst0_cpu.check_with(dst1_cpu);
    }

    void cmp_src() {
        Tensor<> src0_cpu(handle_cpu, lsrc), src1_cpu(handle_cpu, lsrc);
        megdnn_memcpy_D2H(
                handle, src0_cpu.ptr(), src0->ptr(), lsrc.span().dist_byte());
        megdnn_memcpy_D2H(
                handle, src1_cpu.ptr(), src1->ptr(), lsrc.span().dist_byte());
        src0_cpu.check_with(src1_cpu);
    }

    void cmp_flt() {
        Tensor<> flt0_cpu(handle_cpu, lflt0), flt1_cpu(handle_cpu, lflt1);
        float* p0 = flt0_cpu.ptr();
        float* p1 = flt1_cpu.ptr();
        megdnn_memcpy_D2H(handle, p0, flt0->ptr(), lflt0.span().dist_byte());
        megdnn_memcpy_D2H(handle, p1, flt1->ptr(), lflt1.span().dist_byte());
        size_t IC = lflt1[0], CHL_MUL = lflt1[1], FSIZE = lflt1[3] * lflt1[4];
        double tot_err = 0, tot_err_num = 0;
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                auto t0 = p0 + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                     t1 = p1 + (i * CHL_MUL + j) * FSIZE;
                for (size_t k = 0; k < FSIZE; ++k) {
                    auto err = std::abs(diff(t0[k], t1[k]));
                    tot_err += err;
                    tot_err_num += 1;
                    ASSERT_LT(err, 1e-2)
                            << "failed at " << i << " " << j << " " << k
                            << " vals=" << t0[k] << "," << t1[k];
                }
            }
        }
        auto avg_err = tot_err / tot_err_num;
        ASSERT_LT(avg_err, 1e-4);
    }
};

} // anonymous namespace

constexpr auto M = Convolution::Mode::CROSS_CORRELATION;

TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD) {
    Checker<Convolution> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                    "CHANNEL_WISE", {})
                    .c_str(),
            &require_algo));
    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-2);
        // simple case
        // clang-format off
        for (uint32_t s : {1, 2})
        for (uint32_t p : {0, 1, 2, 3})
        for (size_t f : {2, 3, 5, 7})
        for (size_t ocpg : {1, 3}) {
            checker.set_param(gconv_param({M, p, p, s, s}))
                    .execs({{2, 3, 16, 16}, {3, ocpg, 1, f, f}, {}});
        }
        // clang-format on
        checker.set_param(gconv_param({M, 2, 3, 2, 1}))
                .execs({{32, 12, 20, 10}, {12, 2, 1, 4, 5}, {}});
        // padding larger than kern
        checker.set_param(gconv_param({M, 20, 30, 4, 5}))
                .execs({{32, 12, 20, 10}, {12, 2, 1, 4, 5}, {}});
    }
}
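
// Checker convention used in these tests: a trailing {} in execs() leaves that
// output shape to be deduced by the operator, and the AlgoChecker callback
// restricts execution to the named algorithm, so the channel-wise kernel is
// exercised instead of whatever the heuristic would otherwise pick.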

TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_SMALL) {
    Checker<Convolution> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                    "CHANNEL_WISE_SMALL", {})
                    .c_str(),
            &require_algo));
    for (auto dtype : std::vector<DType>{
                 dtype::Float32(),
#if CUDA_VERSION >= 9000
                 dtype::Float16()
#endif
         }) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-2);
        // clang-format off
        for (uint32_t s : {1})
        for (uint32_t f : {1, 3, 5, 7}) {
            checker.set_param(gconv_param({M, f / 2, f / 2, s, s}))
                    .execs({{2, 3, 16, 16}, {3, 1, 1, f, f}, {}});
        }
        // clang-format on
        checker.set_param(gconv_param({M, 1, 1, 1, 1}))
                .execs({{2, 3, 3, 16}, {3, 1, 1, 3, 3}, {}})
                .execs({{2, 3, 8, 3}, {3, 1, 1, 3, 3}, {}});
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA) {
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE", &require_algo));
    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(1e-1);
        // simple case
        // clang-format off
        for (uint32_t s : {1, 2})
        for (uint32_t p : {0, 1, 2, 3})
        for (size_t f : {1, 2, 3, 5, 7})
        for (size_t ocpg : {1, 3}) {
            size_t ii = infer_conv_shape(16, f, s, p, true);
            checker.set_param(gconv_param({M, p, p, s, s}))
                    .execs({{3, ocpg, 1, f, f},
                            {2, 3 * ocpg, ii, ii},
                            {2, 3, 16, 16}});
        }
        // clang-format on
        checker.set_param(gconv_param({M, 2, 3, 2, 1}))
                .execs({{12, 3, 1, 4, 5}, {32, 36, 20, 10}, {32, 12, 39, 8}});
        checker.set_param(gconv_param({M, 30, 20, 5, 4}))
                .execs({{6, 2, 1, 5, 4}, {32, 12, 12, 10}, {32, 6, 3, 2}});
        checker.set_param(gconv_param({M, 20, 30, 4, 5}))
                .execs({{6, 2, 1, 4, 5}, {32, 12, 10, 12}, {32, 6, 2, 3}});
    }
}
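
// Backward-data convention: execs() takes (filter, dst_grad, src_grad), and
// infer_conv_shape(16, f, s, p, true) computes the forward output size for a
// 16x16 input so that the dst_grad shape stays consistent with src_grad.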

TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_SMALL) {
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            "CHANNEL_WISE_SMALL", &require_algo));
    for (auto dtype : std::vector<DType>{
                 dtype::Float32(),
#if CUDA_VERSION >= 9000
                 dtype::Float16()
#endif
         }) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-2);
        for (uint32_t f : {1, 3, 5, 7}) {
            checker.set_param(gconv_param({M, f / 2, f / 2, 1, 1}))
                    .execs({{3, 1, 1, f, f}, {2, 3, 16, 16}, {2, 3, 16, 16}});
        }
        checker.set_param(gconv_param({M, 1, 1, 1, 1}))
                .execs({{3, 1, 1, 3, 3}, {2, 3, 3, 16}, {2, 3, 3, 16}})
                .execs({{3, 1, 1, 3, 3}, {2, 3, 8, 3}, {2, 3, 8, 3}});
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER) {
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardFilter>("CHANNEL_WISE", &require_algo));
    UniformFloatRNG rng(-0.1, 0.1);
    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
        checker.set_dtype(0, dtype)
                .set_dtype(1, dtype)
                .set_dtype(2, dtype)
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-1);
        // simple case
        // clang-format off
        for (uint32_t s : {1, 2})
        for (uint32_t p : {0, 1, 2, 3})
        for (uint32_t f : {1, 2, 3, 5, 7})
        for (uint32_t ocpg : {1, 3})
        for (uint32_t i : {8, 16, 32, 64}) {
            size_t ii = infer_conv_shape(i, f, s, p, true);
            checker.set_param(gconv_param({M, p, p, s, s}))
                    .execs({{2, 3, i, i},
                            {2, 3 * ocpg, ii, ii},
                            {3, ocpg, 1, f, f}});
        }
        // clang-format on
        // padding larger than kern
        checker.set_param(gconv_param({M, 20, 30, 4, 5}))
                .execs({{32, 6, 2, 3}, {32, 12, 10, 12}, {6, 2, 1, 4, 5}});
        // unused filter items: with a 1x1 input, most taps of the 5x7 filter
        // only ever see zero padding
        checker.set_param(gconv_param({M, 2, 3, 2, 3}))
                .execs({{32, 6, 1, 1}, {32, 12, 1, 1}, {6, 2, 1, 5, 7}});
    }
}

#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionForward>();
    auto conv1 = handle->create_operator<ConvolutionForward>();
    BenchmarkEnv<0, 1, 2> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL,
                   size_t FH, size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_src();
        benv.fill_flt();
        benv.exec_convolution(conv0.get(), conv1.get());
        benv.cmp_dst();
    };
    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BWD_DATA_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionBackwardData>();
    auto conv1 = handle->create_operator<ConvolutionBackwardData>();
    BenchmarkEnv<1, 2, 0> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL,
                   size_t FH, size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_dst();
        benv.fill_flt();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_src();
    };
    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BWD_FILTER_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionBackwardFilter>();
    auto conv1 = handle->create_operator<ConvolutionBackwardFilter>();
    BenchmarkEnv<0, 2, 1> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL,
                   size_t FH, size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_src();
        benv.fill_dst();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_flt();
    };
    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_FWD) {
    // enable profiling
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    proxy->warmup_times = 1;
    proxy->exec_times = 10;
    Benchmarker<ConvolutionForward> checker(handle_cuda());
    checker.set_times(1);
    ConvolutionForward::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.set_proxy(proxy);
    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, size_t FW) {
        checker.proxy()->target_execution_policy.algo.reset();
        checker.execs({{N, C, IH, IW}, {C, 1, 1, FH, FW}, {}});
    };
    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_DATA) {
    // enable profiling
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    proxy->warmup_times = 1;
    proxy->exec_times = 10;
    Benchmarker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_times(1);
    ConvolutionBackwardData::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.set_proxy(proxy);
    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, size_t FW) {
        checker.proxy()->target_execution_policy.algo.reset();
        checker.execs({{C, 1, 1, FH, FW},
                       {N, C, IH - FH + 1, IW - FW + 1},
                       {N, C, IH, IW}});
    };
    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_FILTER) {
    // enable profiling
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    proxy->warmup_times = 1;
    proxy->exec_times = 10;
    Benchmarker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_times(1);
    ConvolutionBackwardFilter::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.set_proxy(proxy);
    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, size_t FW) {
        checker.proxy()->target_execution_policy.algo.reset();
        checker.execs({{N, C, IH, IW},
                       {N, C, IH - FH + 1, IW - FW + 1},
                       {C, 1, 1, FH, FW}});
    };
    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 10;
    bencher.set_display(false).set_times(RUNS);
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    bencher.set_proxy(proxy);
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f,
                   size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout({src, dtype::Float32()}, {filter, dtype::Float32()},
                           dst_layout);
        float bandwidth = static_cast<float>(src.total_nr_elems() +
                                             filter.total_nr_elems() +
                                             dst_layout.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
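        // bandwidth holds (total elements) / 2^30 * 1e3; multiplying by the
        // element size in bytes (4 for fp32, 2 for fp16) and dividing by the
        // measured time in milliseconds below yields effective memory traffic
        // in GB/s, the relevant metric for this bandwidth-bound kernel.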
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy.algo.reset();
        auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy.algo.reset();
        auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;
        bencher.proxy()->target_execution_policy.algo.reset();
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bencher.set_param(param);
        auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "pseudo float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32) %.2f (fp16/pseudo fp16)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               time_in_ms_fp32, bandwidth * 4 / time_in_ms_fp32, time_in_ms_fp16,
               bandwidth * 2 / time_in_ms_fp16, time_in_ms_pseudo_fp16,
               bandwidth * 2 / time_in_ms_pseudo_fp16,
               time_in_ms_fp32 / time_in_ms_fp16,
               time_in_ms_pseudo_fp16 / time_in_ms_fp16);
    };
    // clang-format off
    for (size_t s : {1, 2})
    for (size_t f : {3, 5, 7})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {128, 256})
    for (size_t iw : {128, 256})
        run(batch, c, ih, iw, f, s);
    // clang-format on
    run(128, 192, 28, 28, 3, 1);
    run(128, 192, 28, 28, 3, 2);
    run(128, 576, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 32, 112, 112, 3, 1);
    run(128, 960, 7, 7, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 144, 56, 56, 3, 2);
    run(128, 384, 14, 14, 3, 1);
    run(128, 144, 56, 56, 3, 1);
    run(128, 96, 112, 112, 3, 2);
    run(128, 384, 14, 14, 3, 1);
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
    run(128, 576, 14, 14, 3, 2);
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    bencher.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                    "CHANNEL_WISE", {})
                    .c_str()));
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f,
                   size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout({src, dtype::Float32()}, {filter, dtype::Float32()},
                           dst_layout);
        float bandwidth = static_cast<float>(src.total_nr_elems() +
                                             filter.total_nr_elems() +
                                             dst_layout.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               time_in_ms_fp32, bandwidth * 4 / time_in_ms_fp32, time_in_ms_fp16,
               bandwidth * 2 / time_in_ms_fp16,
               time_in_ms_fp32 / time_in_ms_fp16);
    };
    // clang-format off
    for (size_t s : {1})
    for (size_t f : {3, 5, 7})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32, 128, 256})
    for (size_t iw : {8, 16, 32, 128, 256})
        run(batch, c, ih, iw, f, s);
    // clang-format on
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT_SMALL) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f,
                   size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout({src, dtype::Float32()}, {filter, dtype::Float32()},
                           dst_layout);
        float bandwidth = static_cast<float>(src.total_nr_elems() +
                                             filter.total_nr_elems() +
                                             dst_layout.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_before_exec_callback(AlgoChecker<ConvolutionForward>(
                        ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                                "CHANNEL_WISE", {})
                                .c_str()));
        auto time_in_ms_fp32_normal = bencher.execs({src, filter, {}}) / RUNS;
        bencher.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
                ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                        "CHANNEL_WISE_SMALL", {})
                        .c_str()));
        auto time_in_ms_fp32_small = bencher.execs({src, filter, {}}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16_small = bencher.execs({src, filter, {}}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, fp32 normal: %.2fms %.2fGB/s "
               "small: %.2fms %.2fGB/s, fp16 small: %.2fms %.2fGB/s, "
               "speedup: "
               "%0.2f (fp32 small/normal) %0.2f (small fp16/fp32)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               time_in_ms_fp32_normal, bandwidth * 4 / time_in_ms_fp32_normal,
               time_in_ms_fp32_small, bandwidth * 4 / time_in_ms_fp32_small,
               time_in_ms_fp16_small, bandwidth * 2 / time_in_ms_fp16_small,
               time_in_ms_fp32_normal / time_in_ms_fp32_small,
               time_in_ms_fp32_small / time_in_ms_fp16_small);
    };
    // clang-format off
    for (size_t s : {1})
    for (size_t f : {3, 5})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32})
    for (size_t iw : {8, 16, 32})
        run(batch, c, ih, iw, f, s);
    // clang-format on
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 960, 7, 7, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_CUDNN_DNN) {
    CUBenchmarker<ConvBiasForward> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    ConvBias::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = ConvBias::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f,
                   size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::ConvBias::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f},
                    bias = {1, c, 1, 1};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<ConvBias>();
        opr->param() = param;
        opr->deduce_layout({src, dtype::Float32()}, {filter, dtype::Float32()},
                           {bias, dtype::Float32()}, {}, dst_layout);
        float computation_mops =
                static_cast<float>(dst_layout.total_nr_elems() * f * f * 2) * 1e-6;
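        // computation_mops counts 2 * f * f ops (multiply and add) per output
        // element, scaled by 1e-6 into Mops; dividing Mops by milliseconds
        // below yields throughput in Gops/s.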
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.set_before_exec_callback(
                AlgoChecker<ConvBiasForward>(".+CHANNEL_WISE.+"));
        auto time_in_ms_dnn = bencher.execs({src, filter, bias, {}, {}}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.set_before_exec_callback(AlgoChecker<ConvBiasForward>(
                ".+CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM.+"));
        auto time_in_ms_cudnn = bencher.execs({src, filter, bias, {}, {}}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, dst=%s, dnn: %.2fms %.2fGops/s "
               "cudnn: %.2fms %.2fGops/s "
               "speedup: "
               "%0.2f (dnn/cudnn)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               dst_layout.to_string().c_str(), time_in_ms_dnn,
               computation_mops / time_in_ms_dnn, time_in_ms_cudnn,
               computation_mops / time_in_ms_cudnn,
               time_in_ms_cudnn / time_in_ms_dnn);
    };
    // clang-format off
    for (size_t batch : {1, 16, 32, 64, 128}) {
        run(batch, 32, 112, 112, 3, 1);
        run(batch, 96, 112, 112, 3, 2);
        run(batch, 96, 112, 112, 3, 1);
        run(batch, 144, 56, 56, 3, 2);
        run(batch, 144, 56, 56, 3, 1);
        run(batch, 192, 28, 28, 3, 1);
        run(batch, 384, 14, 14, 3, 1);
        run(batch, 576, 14, 14, 3, 1);
        run(batch, 960, 7, 7, 3, 1);
        //! calibrate heu algo policy hw_size param
        run(batch, 144, 24, 24, 3, 1);
        run(batch, 144, 22, 22, 3, 1);
        run(batch, 144, 20, 20, 3, 1);
        run(batch, 144, 18, 18, 3, 1);
    }
    // clang-format on
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_DATA_FLOAT_SMALL) {
    CUBenchmarker<ConvolutionBackwardData> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    ConvolutionBackwardData::Param param;
    param.format = Convolution::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f,
                   size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        float bandwidth = static_cast<float>(src.total_nr_elems() +
                                             filter.total_nr_elems() +
                                             src.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_before_exec_callback(
                        AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE"));
        auto time_in_ms_fp32_normal = bencher.execs({filter, src, src}) / RUNS;
        bencher.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE_SMALL"));
        auto time_in_ms_fp32_small = bencher.execs({filter, src, src}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16_small = bencher.execs({filter, src, src}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, fp32 normal: %.2fms %.2fGB/s "
               "small: %.2fms %.2fGB/s, fp16 small: %.2fms %.2fGB/s, "
               "speedup: "
               "%0.2f (fp32 small/normal) %0.2f (small fp16/fp32)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               time_in_ms_fp32_normal, bandwidth * 4 / time_in_ms_fp32_normal,
               time_in_ms_fp32_small, bandwidth * 4 / time_in_ms_fp32_small,
               time_in_ms_fp16_small, bandwidth * 2 / time_in_ms_fp16_small,
               time_in_ms_fp32_normal / time_in_ms_fp32_small,
               time_in_ms_fp32_small / time_in_ms_fp16_small);
    };
    // clang-format off
    for (size_t s : {1})
    for (size_t f : {3, 5})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32})
    for (size_t iw : {8, 16, 32})
        run(batch, c, ih, iw, f, s);
    // clang-format on
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 960, 7, 7, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BWD_DATA) {
    CUBenchmarker<ConvolutionBackwardData> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE"));
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t ocpg, size_t group, size_t ih,
                   size_t iw, size_t f, size_t p, size_t s) {
        param.pad_h = p;
        param.pad_w = p;
        param.stride_h = s;
        param.stride_w = s;
        size_t oh, ow;
        infer_conv_shape2d(ih, iw, f, f, s, s, p, p, oh, ow, true);
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src_grad = {batch, group, ih, iw},
                    dst_grad = {batch, group * ocpg, oh, ow},
                    flt = {group, ocpg, 1, f, f};
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        float bandwidth = static_cast<float>(flt.total_nr_elems() +
                                             dst_grad.total_nr_elems() +
                                             src_grad.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp32 = bencher.execs({flt, dst_grad, src_grad}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16 = bencher.execs({flt, dst_grad, src_grad}) / RUNS;
        printf("stride=%zu, src_grad=%s, flt=%s, "
               "float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32)\n",
               s, src_grad.to_string().c_str(), flt.to_string().c_str(),
               time_in_ms_fp32, bandwidth * 4 / time_in_ms_fp32, time_in_ms_fp16,
               bandwidth * 2 / time_in_ms_fp16,
               time_in_ms_fp32 / time_in_ms_fp16);
    };
    // clang-format off
    for (size_t s : {1, 2})
    for (size_t f : {3, 5, 7})
    for (size_t p : {f / 2})
    for (size_t batch : {64})
    for (size_t ocpg : {1})
    for (size_t group : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32, 128, 256})
    for (size_t iw : {8, 16, 32, 128, 256})
        run(batch, ocpg, group, ih, iw, f, p, s);
    // clang-format on
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BWD_FILTER) {
    CUBenchmarker<ConvolutionBackwardFilter> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardFilter>("CHANNEL_WISE"));
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t ocpg, size_t group, size_t i,
                   size_t f, size_t p, size_t s) {
        param.pad_h = p;
        param.pad_w = p;
        param.stride_h = s;
        param.stride_w = s;
        size_t d = infer_conv_shape(i, f, s, p, true);
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, group, i, i},
                    dst_grad = {batch, group * ocpg, d, d},
                    flt_grad = {group, ocpg, 1, f, f};
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        float bandwidth = static_cast<float>(flt_grad.total_nr_elems() +
                                             dst_grad.total_nr_elems() +
                                             src.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp32 = bencher.execs({src, dst_grad, flt_grad}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16 = bencher.execs({src, dst_grad, flt_grad}) / RUNS;
        printf("stride=%zu, src=%s, flt_grad=%s, "
               "float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "speedup: "
               "%.2f (fp16/fp32)\n",
               s, src.to_string().c_str(), flt_grad.to_string().c_str(),
               time_in_ms_fp32, bandwidth * 4 / time_in_ms_fp32, time_in_ms_fp16,
               bandwidth * 2 / time_in_ms_fp16,
               time_in_ms_fp32 / time_in_ms_fp16);
    };
    // clang-format off
    for (size_t s : {1, 2})
    for (size_t f : {3, 5, 7})
    for (size_t p : {f / 2})
    for (size_t batch : {64})
    for (size_t ocpg : {1})
    for (size_t group : {16, 32, 64, 128})
    for (size_t i : {8, 16, 32, 64, 128})
        run(batch, ocpg, group, i, f, p, s);
    // clang-format on
}
#endif

// vim: syntax=cpp.doxygen
