
chanwise_convolution.cpp 41 kB

/**
 * \file dnn/test/cuda/chanwise_convolution.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "megdnn/oprs/nn.h"

#include "test/cuda/fixture.h"
#include "test/cuda/benchmark.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/common/checker.h"
#include "test/common/convolution.h"
#include "test/common/benchmarker.h"
#include "megcore_cuda.h"
#include "cuda.h"
#include <cuda_profiler_api.h>
#include <cuda_runtime_api.h>

using namespace megdnn;
using namespace test;
namespace {

#if MEGDNN_WITH_BENCHMARK
bool check_need_full_bench() {
    if (getenv("MEGDNN_CHANWISE_CONV_FULLBENCH"))
        return true;
    printf("set MEGDNN_CHANWISE_CONV_FULLBENCH to run full benchmark\n");
    return false;
}
#endif
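
// Channel-wise convolution is expressed through the GROUP sparse mode: with
// the group count equal to the number of input channels, each filter of shape
// {IC, CHL_MUL, 1, FH, FW} convolves exactly one input channel.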
Convolution::Param gconv_param(Convolution::Param p) {
    p.sparse = Convolution::Param::Sparse::GROUP;
    return p;
}
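
// Shared state for the cudnn-vs-megdnn benchmark tests below. The template
// parameters P0/P1/P2 permute the (src, filter, dst) tuple so the same
// environment drives forward (<0, 1, 2>), backward-data (<1, 2, 0>) and
// backward-filter (<0, 2, 1>) runs: opr0 consumes the dense-filter operands,
// opr1 the equivalent channel-wise (group) operands.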
template <int P0, int P1, int P2>
class BenchmarkEnv {
    Handle *handle, *handle_cpu;
    std::unique_ptr<GaussianRNG> rng;
    TensorLayout lsrc, lflt0, lflt1, ldst;
    std::unique_ptr<Tensor<>> src0, src1, flt0, flt0_cpu, flt1, flt1_cpu, dst0,
            dst1;
    cudaEvent_t cuda_ev[3];
    cudaStream_t cuda_stream;
    size_t pad_h, pad_w;

    template <typename T>
    static std::tuple<T, T, T> shuffle(std::tuple<T, T, T> data) {
        return std::make_tuple(
                std::get<P0>(data), std::get<P1>(data), std::get<P2>(data));
    }

public:
    BenchmarkEnv(Handle* handle, Handle* handle_cpu) {
        this->handle = handle;
        this->handle_cpu = handle_cpu;
        rng = handle->create_operator<GaussianRNG>();
        // make cpu handle used
        handle_cpu->create_operator<Sleep>()->exec();
        for (int i = 0; i < 3; ++i)
            cudaEventCreate(&cuda_ev[i]);
        megcoreGetCUDAStream(handle->megcore_computing_handle(), &cuda_stream);
    }

    ~BenchmarkEnv() {
        for (int i = 0; i < 3; ++i)
            cudaEventDestroy(cuda_ev[i]);
    }

    void alloc(size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL,
               size_t FH, size_t FW, size_t PH, size_t PW) {
        pad_h = PH;
        pad_w = PW;
        auto mkly = [](const TensorShape& s) {
            return TensorLayout{s, dtype::Float32()};
        };
        lsrc = mkly({N, IC, IH, IW});
        lflt0 = mkly({CHL_MUL * IC, IC, FH, FW});
        lflt1 = mkly({IC, CHL_MUL, 1, FH, FW});
        ldst = mkly({N, IC * CHL_MUL, IH - FH + 1 + PH * 2,
                     IW - FW + 1 + PW * 2});
        src0.reset(new Tensor<>(handle, lsrc));
        src1.reset(new Tensor<>(handle, lsrc));
        flt0.reset(new Tensor<>(handle, lflt0));
        flt0_cpu.reset(new Tensor<>(handle_cpu, lflt0));
        flt1.reset(new Tensor<>(handle, lflt1));
        flt1_cpu.reset(new Tensor<>(handle_cpu, lflt1));
        dst0.reset(new Tensor<>(handle, ldst));
        dst1.reset(new Tensor<>(handle, ldst));
    }

    void fill_src() {
        rng->exec(src0->tensornd(), {});
        megdnn_memcpy_D2D(handle, src1->ptr(), src0->ptr(),
                          lsrc.span().dist_byte());
    }
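
    // fill_flt draws a random channel-wise filter (flt1) and expands it into
    // an equivalent dense filter (flt0): the channel-wise weights land on the
    // block diagonal of the {CHL_MUL*IC, IC, FH, FW} layout and every other
    // position is zeroed, so a dense convolution produces the same output.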
    void fill_flt() {
        rng->exec(flt1->tensornd(), {});
        megdnn_memcpy_D2H(handle, flt1_cpu->ptr(), flt1->ptr(),
                          lflt1.span().dist_byte());

        const size_t IC = lflt1[0], CHL_MUL = lflt1[1],
                     FSIZE = lflt1[3] * lflt1[4];

        // fill flt0 from flt1
        float* src = flt1_cpu->ptr();
        float* dst = flt0_cpu->ptr();
        memset(dst, 0, lflt0.span().dist_byte());
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                memcpy(dst + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                       src + (i * CHL_MUL + j) * FSIZE, FSIZE * sizeof(float));
            }
        }

        megdnn_memcpy_H2D(handle, flt0->ptr(), dst, lflt0.span().dist_byte());
    }
    void fill_dst() {
        rng->exec(dst0->tensornd(), {});
        megdnn_memcpy_D2D(handle, dst1->ptr(), dst0->ptr(),
                          ldst.span().dist_byte());
    }
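
    // Run opr0 (dense filter) and opr1 (channel-wise, GROUP sparse) back to
    // back on the same stream, timing each with CUDA events. With
    // MEGDNN_CHANWISE_CONV_VERBOSE or MEGDNN_CHANWISE_CONV_FULLBENCH set, the
    // cudnn/megdnn time ratio is printed per shape.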
    template <class Opr>
    void exec(Opr* opr0, Opr* opr1) {
        opr0->param().pad_h = pad_h;
        opr0->param().pad_w = pad_w;
        opr1->param() = opr0->param();
        opr1->param().sparse = param::Convolution::Sparse::GROUP;
        TensorND a0, b0, c0, a1, b1, c1;
        std::tie(a0, b0, c0) = shuffle(std::make_tuple(
                src0->tensornd(), flt0->tensornd(), dst0->tensornd()));
        std::tie(a1, b1, c1) = shuffle(std::make_tuple(
                src1->tensornd(), flt1->tensornd(), dst1->tensornd()));
        WorkspaceWrapper wk(
                handle,
                std::max(opr0->get_workspace_in_bytes(a0.layout, b0.layout,
                                                      c0.layout),
                         opr1->get_workspace_in_bytes(a1.layout, b1.layout,
                                                      c1.layout)));
        cudaProfilerStart();
        cudaEventRecord(cuda_ev[0], cuda_stream);
        opr0->exec(a0, b0, c0, wk.workspace());
        cudaEventRecord(cuda_ev[1], cuda_stream);
        opr1->exec(a1, b1, c1, wk.workspace());
        cudaEventRecord(cuda_ev[2], cuda_stream);
        cudaProfilerStop();
        if (getenv("MEGDNN_CHANWISE_CONV_VERBOSE") ||
            getenv("MEGDNN_CHANWISE_CONV_FULLBENCH")) {
            cudaStreamSynchronize(cuda_stream);
            float t0 = -1, t1 = -1;
            cudaEventElapsedTime(&t0, cuda_ev[0], cuda_ev[1]);
            cudaEventElapsedTime(&t1, cuda_ev[1], cuda_ev[2]);
            printf("%s;%s;%s: cudnn/megdnn: %.3fms/%.3fms=%.3f\n",
                   lsrc.TensorShape::to_string().c_str(),
                   lflt1.TensorShape::to_string().c_str(),
                   ldst.TensorShape::to_string().c_str(), t0, t1, t0 / t1);
        }
    }
    //! special for weight preprocess
    void exec_convolution(ConvolutionForward* opr0, ConvolutionForward* opr1) {
        opr0->param().pad_h = pad_h;
        opr0->param().pad_w = pad_w;
        opr1->param() = opr0->param();
        opr1->param().sparse = param::Convolution::Sparse::GROUP;
        TensorND a0, b0, c0, a1, b1, c1;
        std::tie(a0, b0, c0) = shuffle(std::make_tuple(
                src0->tensornd(), flt0->tensornd(), dst0->tensornd()));
        std::tie(a1, b1, c1) = shuffle(std::make_tuple(
                src1->tensornd(), flt1->tensornd(), dst1->tensornd()));
        WorkspaceWrapper wk(
                handle,
                std::max(opr0->get_workspace_in_bytes(a0.layout, b0.layout,
                                                      c0.layout, nullptr),
                         opr1->get_workspace_in_bytes(a1.layout, b1.layout,
                                                      c1.layout, nullptr)));
        cudaProfilerStart();
        cudaEventRecord(cuda_ev[0], cuda_stream);
        opr0->exec(a0, b0, c0, nullptr, wk.workspace());
        cudaEventRecord(cuda_ev[1], cuda_stream);
        opr1->exec(a1, b1, c1, nullptr, wk.workspace());
        cudaEventRecord(cuda_ev[2], cuda_stream);
        cudaProfilerStop();
        if (getenv("MEGDNN_CHANWISE_CONV_VERBOSE") ||
            getenv("MEGDNN_CHANWISE_CONV_FULLBENCH")) {
            cudaStreamSynchronize(cuda_stream);
            float t0 = -1, t1 = -1;
            cudaEventElapsedTime(&t0, cuda_ev[0], cuda_ev[1]);
            cudaEventElapsedTime(&t1, cuda_ev[1], cuda_ev[2]);
            printf("%s;%s;%s: cudnn/megdnn: %.3fms/%.3fms=%.3f\n",
                   lsrc.TensorShape::to_string().c_str(),
                   lflt1.TensorShape::to_string().c_str(),
                   ldst.TensorShape::to_string().c_str(), t0, t1, t0 / t1);
        }
    }
    void cmp_dst() {
        Tensor<> dst0_cpu(handle_cpu, ldst), dst1_cpu(handle_cpu, ldst);
        megdnn_memcpy_D2H(handle, dst0_cpu.ptr(), dst0->ptr(),
                          ldst.span().dist_byte());
        megdnn_memcpy_D2H(handle, dst1_cpu.ptr(), dst1->ptr(),
                          ldst.span().dist_byte());
        dst0_cpu.check_with(dst1_cpu);
    }

    void cmp_src() {
        Tensor<> src0_cpu(handle_cpu, lsrc), src1_cpu(handle_cpu, lsrc);
        megdnn_memcpy_D2H(handle, src0_cpu.ptr(), src0->ptr(),
                          lsrc.span().dist_byte());
        megdnn_memcpy_D2H(handle, src1_cpu.ptr(), src1->ptr(),
                          lsrc.span().dist_byte());
        src0_cpu.check_with(src1_cpu);
    }
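
    // The two filters use different layouts, so cmp_flt walks the dense
    // filter's block diagonal and compares it element-wise against the
    // channel-wise filter, bounding both the per-element and average error.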
    void cmp_flt() {
        Tensor<> flt0_cpu(handle_cpu, lflt0), flt1_cpu(handle_cpu, lflt1);
        float* p0 = flt0_cpu.ptr();
        float* p1 = flt1_cpu.ptr();
        megdnn_memcpy_D2H(handle, p0, flt0->ptr(), lflt0.span().dist_byte());
        megdnn_memcpy_D2H(handle, p1, flt1->ptr(), lflt1.span().dist_byte());

        size_t IC = lflt1[0], CHL_MUL = lflt1[1], FSIZE = lflt1[3] * lflt1[4];

        double tot_err = 0, tot_err_num = 0;
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                auto t0 = p0 + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                     t1 = p1 + (i * CHL_MUL + j) * FSIZE;
                for (size_t k = 0; k < FSIZE; ++k) {
                    auto err = std::abs(diff(t0[k], t1[k]));
                    tot_err += err;
                    tot_err_num += 1;
                    ASSERT_LT(err, 1e-2)
                            << "failed at " << i << " " << j << " " << k
                            << " vals=" << t0[k] << "," << t1[k];
                }
            }
        }
        auto avg_err = tot_err / tot_err_num;
        ASSERT_LT(avg_err, 1e-4);
    }
};

}  // anonymous namespace
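
// Correctness tests: each Checker registers an AlgoChecker callback so the
// run exercises the intended CHANNEL_WISE / CHANNEL_WISE_SMALL algorithm.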
constexpr auto M = Convolution::Mode::CROSS_CORRELATION;

TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD) {
    Checker<Convolution> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                    "CHANNEL_WISE", {})
                    .c_str(),
            &require_algo));
    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-2);
        // simple case
        // clang-format off
        for (uint32_t s : {1, 2})
        for (uint32_t p : {0, 1, 2, 3})
        for (size_t f : {2, 3, 5, 7})
        for (size_t ocpg : {1, 3}) {
            checker.set_param(gconv_param({M, p, p, s, s}))
                    .execs({{2, 3, 16, 16}, {3, ocpg, 1, f, f}, {}});
        }
        // clang-format on

        checker.set_param(gconv_param({M, 2, 3, 2, 1}))
                .execs({{32, 12, 20, 10}, {12, 2, 1, 4, 5}, {}});

        // padding larger than kern
        checker.set_param(gconv_param({M, 20, 30, 4, 5}))
                .execs({{32, 12, 20, 10}, {12, 2, 1, 4, 5}, {}});
    }
}
TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_SMALL) {
    Checker<Convolution> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                    "CHANNEL_WISE_SMALL", {})
                    .c_str(),
            &require_algo));
    for (auto dtype : std::vector<DType>{
                 dtype::Float32(),
#if CUDA_VERSION >= 9000
                 dtype::Float16()
#endif
         }) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-2);
        // clang-format off
        for (uint32_t s : {1})
        for (uint32_t f : {1, 3, 5, 7}) {
            checker.set_param(gconv_param({M, f / 2, f / 2, s, s}))
                    .execs({{2, 3, 16, 16}, {3, 1, 1, f, f}, {}});
        }
        // clang-format on
        checker.set_param(gconv_param({M, 1, 1, 1, 1}))
                .execs({{2, 3, 3, 16}, {3, 1, 1, 3, 3}, {}})
                .execs({{2, 3, 8, 3}, {3, 1, 1, 3, 3}, {}});
    }
}
TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA) {
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            "CHANNEL_WISE", &require_algo));
    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(1e-1);
        // simple case
        // clang-format off
        for (uint32_t s : {1, 2})
        for (uint32_t p : {0, 1, 2, 3})
        for (size_t f : {1, 2, 3, 5, 7})
        for (size_t ocpg : {1, 3}) {
            size_t ii = infer_conv_shape(16, f, s, p, true);
            checker.set_param(gconv_param({M, p, p, s, s}))
                    .execs({{3, ocpg, 1, f, f},
                            {2, 3 * ocpg, ii, ii},
                            {2, 3, 16, 16}});
        }
        // clang-format on

        checker.set_param(gconv_param({M, 2, 3, 2, 1}))
                .execs({{12, 3, 1, 4, 5}, {32, 36, 20, 10}, {32, 12, 39, 8}});

        checker.set_param(gconv_param({M, 30, 20, 5, 4}))
                .execs({{6, 2, 1, 5, 4}, {32, 12, 12, 10}, {32, 6, 3, 2}});

        checker.set_param(gconv_param({M, 20, 30, 4, 5}))
                .execs({{6, 2, 1, 4, 5}, {32, 12, 10, 12}, {32, 6, 2, 3}});
    }
}
TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_SMALL) {
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            "CHANNEL_WISE_SMALL", &require_algo));
    for (auto dtype : std::vector<DType>{
                 dtype::Float32(),
#if CUDA_VERSION >= 9000
                 dtype::Float16()
#endif
         }) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-2);

        for (uint32_t f : {1, 3, 5, 7}) {
            checker.set_param(gconv_param({M, f / 2, f / 2, 1, 1}))
                    .execs({{3, 1, 1, f, f}, {2, 3, 16, 16}, {2, 3, 16, 16}});
        }
        checker.set_param(gconv_param({M, 1, 1, 1, 1}))
                .execs({{3, 1, 1, 3, 3}, {2, 3, 3, 16}, {2, 3, 3, 16}})
                .execs({{3, 1, 1, 3, 3}, {2, 3, 8, 3}, {2, 3, 8, 3}});
    }
}
TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER) {
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
            "CHANNEL_WISE", &require_algo));
    UniformFloatRNG rng(-0.1, 0.1);
    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
        checker.set_dtype(0, dtype)
                .set_dtype(1, dtype)
                .set_dtype(2, dtype)
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-1);
        // simple case
        // clang-format off
        for (uint32_t s : {1, 2})
        for (uint32_t p : {0, 1, 2, 3})
        for (uint32_t f : {1, 2, 3, 5, 7})
        for (uint32_t ocpg : {1, 3})
        for (uint32_t i : {8, 16, 32, 64}) {
            size_t ii = infer_conv_shape(i, f, s, p, true);
            checker.set_param(gconv_param({M, p, p, s, s}))
                    .execs({{2, 3, i, i},
                            {2, 3 * ocpg, ii, ii},
                            {3, ocpg, 1, f, f}});
        }
        // clang-format on

        // padding larger than kern
        checker.set_param(gconv_param({M, 20, 30, 4, 5}))
                .execs({{32, 6, 2, 3}, {32, 12, 10, 12}, {6, 2, 1, 4, 5}});

        // unused filter items
        checker.set_param(gconv_param({M, 2, 3, 2, 3}))
                .execs({{32, 6, 1, 1}, {32, 12, 1, 1}, {6, 2, 1, 5, 7}});
    }
}
#if MEGDNN_WITH_BENCHMARK
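
// The *_BENCH_CHECK tests below cross-validate the dense and channel-wise
// operators through BenchmarkEnv and print cudnn/megdnn timings; the larger
// shapes run only when MEGDNN_CHANWISE_CONV_FULLBENCH is set.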
TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionForward>();
    auto conv1 = handle->create_operator<ConvolutionForward>();
    BenchmarkEnv<0, 1, 2> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL,
                   size_t FH, size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_src();
        benv.fill_flt();
        benv.exec_convolution(conv0.get(), conv1.get());
        benv.cmp_dst();
    };

    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}
TEST_F(CUDA, CHANWISE_CONVOLUTION_BWD_DATA_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionBackwardData>();
    auto conv1 = handle->create_operator<ConvolutionBackwardData>();
    BenchmarkEnv<1, 2, 0> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL,
                   size_t FH, size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_dst();
        benv.fill_flt();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_src();
    };

    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}
TEST_F(CUDA, CHANWISE_CONVOLUTION_BWD_FILTER_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionBackwardFilter>();
    auto conv1 = handle->create_operator<ConvolutionBackwardFilter>();
    BenchmarkEnv<0, 2, 1> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL,
                   size_t FH, size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_src();
        benv.fill_dst();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_flt();
    };

    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}
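
// The *_BENCH_ALL_ALGO tests construct their OprProxy with profiling enabled
// (the `true` constructor argument), so each shape is timed across every
// available algorithm rather than a single pinned one.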
TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_FWD) {
    // enable profiling
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    proxy->warmup_times = 1;
    proxy->exec_times = 10;
    Benchmarker<ConvolutionForward> checker(handle_cuda());
    checker.set_times(1);
    ConvolutionForward::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.set_proxy(proxy);
    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH,
                   size_t FW) {
        checker.proxy()->target_algo = nullptr;
        checker.execs({{N, C, IH, IW}, {C, 1, 1, FH, FW}, {}});
    };
    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}
TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_DATA) {
    // enable profiling
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    proxy->warmup_times = 1;
    proxy->exec_times = 10;
    Benchmarker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_times(1);
    ConvolutionBackwardData::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.set_proxy(proxy);
    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH,
                   size_t FW) {
        checker.proxy()->target_algo = nullptr;
        checker.execs({{C, 1, 1, FH, FW},
                       {N, C, IH - FH + 1, IW - FW + 1},
                       {N, C, IH, IW}});
    };
    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}
TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_FILTER) {
    // enable profiling
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    proxy->warmup_times = 1;
    proxy->exec_times = 10;
    Benchmarker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_times(1);
    ConvolutionBackwardFilter::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.set_proxy(proxy);
    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH,
                   size_t FW) {
        checker.proxy()->target_algo = nullptr;
        checker.execs({{N, C, IH, IW},
                       {N, C, IH - FH + 1, IW - FW + 1},
                       {C, 1, 1, FH, FW}});
    };
    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}
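
// The remaining benchmarks report an effective-bandwidth figure: the total
// number of tensor elements moved, divided by runtime, scaled by the element
// size (4 bytes for fp32, 2 bytes for fp16), which is where the
// `bandwith * 4` and `bandwith * 2` factors in the printf calls come from.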
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 10;
    bencher.set_display(false).set_times(RUNS);
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    bencher.set_proxy(proxy);

    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;

    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f,
                   size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;

        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};

        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout({src, dtype::Float32()}, {filter, dtype::Float32()},
                           dst_layout);
        float bandwith = static_cast<float>(src.total_nr_elems() +
                                            filter.total_nr_elems() +
                                            dst_layout.total_nr_elems()) /
                         (1024 * 1024 * 1024) * 1e3;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_algo = nullptr;
        auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_algo = nullptr;
        auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;

        bencher.proxy()->target_algo = nullptr;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bencher.set_param(param);
        auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS;

        printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "pseudo float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32) %.2f (fp16/pseudo fp16)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               time_in_ms_fp32, bandwith * 4 / time_in_ms_fp32,
               time_in_ms_fp16, bandwith * 2 / time_in_ms_fp16,
               time_in_ms_pseudo_fp16, bandwith * 2 / time_in_ms_pseudo_fp16,
               time_in_ms_fp32 / time_in_ms_fp16,
               time_in_ms_pseudo_fp16 / time_in_ms_fp16);
    };

    // clang-format off
    for (size_t s : {1, 2})
    for (size_t f : {3, 5, 7})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {128, 256})
    for (size_t iw : {128, 256})
        run(batch, c, ih, iw, f, s);
    // clang-format on

    run(128, 192, 28, 28, 3, 1);
    run(128, 192, 28, 28, 3, 2);
    run(128, 576, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 32, 112, 112, 3, 1);
    run(128, 960, 7, 7, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 144, 56, 56, 3, 2);
    run(128, 384, 14, 14, 3, 1);
    run(128, 144, 56, 56, 3, 1);
    run(128, 96, 112, 112, 3, 2);
    run(128, 384, 14, 14, 3, 1);
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
    run(128, 576, 14, 14, 3, 2);
}
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    bencher.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                    "CHANNEL_WISE", {})
                    .c_str()));

    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;

    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f,
                   size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;

        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};

        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout({src, dtype::Float32()}, {filter, dtype::Float32()},
                           dst_layout);
        float bandwith = static_cast<float>(src.total_nr_elems() +
                                            filter.total_nr_elems() +
                                            dst_layout.total_nr_elems()) /
                         (1024 * 1024 * 1024) * 1e3;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;

        printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               time_in_ms_fp32, bandwith * 4 / time_in_ms_fp32,
               time_in_ms_fp16, bandwith * 2 / time_in_ms_fp16,
               time_in_ms_fp32 / time_in_ms_fp16);
    };

    // clang-format off
    for (size_t s : {1})
    for (size_t f : {3, 5, 7})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32, 128, 256})
    for (size_t iw : {8, 16, 32, 128, 256})
        run(batch, c, ih, iw, f, s);
    // clang-format on
}
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT_SMALL) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);

    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;

    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f,
                   size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;

        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};

        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout({src, dtype::Float32()}, {filter, dtype::Float32()},
                           dst_layout);
        float bandwith = static_cast<float>(src.total_nr_elems() +
                                            filter.total_nr_elems() +
                                            dst_layout.total_nr_elems()) /
                         (1024 * 1024 * 1024) * 1e3;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_before_exec_callback(AlgoChecker<ConvolutionForward>(
                        ConvBiasForward::algo_name<
                                ConvBiasForward::DirectParam>("CHANNEL_WISE",
                                                              {})
                                .c_str()));
        auto time_in_ms_fp32_normal = bencher.execs({src, filter, {}}) / RUNS;

        bencher.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
                ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                        "CHANNEL_WISE_SMALL", {})
                        .c_str()));
        auto time_in_ms_fp32_small = bencher.execs({src, filter, {}}) / RUNS;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16_small = bencher.execs({src, filter, {}}) / RUNS;

        printf("stride=%zu src=%s, filter=%s, fp32 normal: %.2fms %.2fGB/s "
               "small: %.2fms %.2fGB/s, fp16 small: %.2fms %.2fGB/s, "
               "speedup: "
               "%0.2f (fp32 small/normal) %0.2f (small fp16/fp32)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               time_in_ms_fp32_normal, bandwith * 4 / time_in_ms_fp32_normal,
               time_in_ms_fp32_small, bandwith * 4 / time_in_ms_fp32_small,
               time_in_ms_fp16_small, bandwith * 2 / time_in_ms_fp16_small,
               time_in_ms_fp32_normal / time_in_ms_fp32_small,
               time_in_ms_fp32_small / time_in_ms_fp16_small);
    };

    // clang-format off
    for (size_t s : {1})
    for (size_t f : {3, 5})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32})
    for (size_t iw : {8, 16, 32})
        run(batch, c, ih, iw, f, s);
    // clang-format on

    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 960, 7, 7, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
}
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_DATA_FLOAT_SMALL) {
    CUBenchmarker<ConvolutionBackwardData> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);

    ConvolutionBackwardData::Param param;
    param.format = Convolution::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;

    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f,
                   size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;

        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        float bandwith = static_cast<float>(src.total_nr_elems() +
                                            filter.total_nr_elems() +
                                            src.total_nr_elems()) /
                         (1024 * 1024 * 1024) * 1e3;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_before_exec_callback(
                        AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE"));
        auto time_in_ms_fp32_normal = bencher.execs({filter, src, src}) / RUNS;

        bencher.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE_SMALL"));
        auto time_in_ms_fp32_small = bencher.execs({filter, src, src}) / RUNS;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16_small = bencher.execs({filter, src, src}) / RUNS;

        printf("stride=%zu src=%s, filter=%s, fp32 normal: %.2fms %.2fGB/s "
               "small: %.2fms %.2fGB/s, fp16 small: %.2fms %.2fGB/s, "
               "speedup: "
               "%0.2f (fp32 small/normal) %0.2f (small fp16/fp32)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               time_in_ms_fp32_normal, bandwith * 4 / time_in_ms_fp32_normal,
               time_in_ms_fp32_small, bandwith * 4 / time_in_ms_fp32_small,
               time_in_ms_fp16_small, bandwith * 2 / time_in_ms_fp16_small,
               time_in_ms_fp32_normal / time_in_ms_fp32_small,
               time_in_ms_fp32_small / time_in_ms_fp16_small);
    };

    // clang-format off
    for (size_t s : {1})
    for (size_t f : {3, 5})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32})
    for (size_t iw : {8, 16, 32})
        run(batch, c, ih, iw, f, s);
    // clang-format on

    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 960, 7, 7, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
}
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BWD_DATA) {
    CUBenchmarker<ConvolutionBackwardData> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE"));

    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;

    auto run = [&](size_t batch, size_t ocpg, size_t group, size_t ih,
                   size_t iw, size_t f, size_t p, size_t s) {
        param.pad_h = p;
        param.pad_w = p;
        param.stride_h = s;
        param.stride_w = s;
        size_t oh, ow;
        infer_conv_shape2d(ih, iw, f, f, s, s, p, p, oh, ow, true);
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;

        TensorShape src_grad = {batch, group, ih, iw},
                    dst_grad = {batch, group * ocpg, oh, ow},
                    flt = {group, ocpg, 1, f, f};

        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        float bandwith = static_cast<float>(flt.total_nr_elems() +
                                            dst_grad.total_nr_elems() +
                                            src_grad.total_nr_elems()) /
                         (1024 * 1024 * 1024) * 1e3;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp32 = bencher.execs({flt, dst_grad, src_grad}) / RUNS;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16 = bencher.execs({flt, dst_grad, src_grad}) / RUNS;

        printf("stride=%zu, src_grad=%s, flt=%s, "
               "float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32)\n",
               s, src_grad.to_string().c_str(), flt.to_string().c_str(),
               time_in_ms_fp32, bandwith * 4 / time_in_ms_fp32,
               time_in_ms_fp16, bandwith * 2 / time_in_ms_fp16,
               time_in_ms_fp32 / time_in_ms_fp16);
    };

    // clang-format off
    for (size_t s : {1, 2})
    for (size_t f : {3, 5, 7})
    for (size_t p : {f / 2})
    for (size_t batch : {64})
    for (size_t ocpg : {1})
    for (size_t group : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32, 128, 256})
    for (size_t iw : {8, 16, 32, 128, 256})
        run(batch, ocpg, group, ih, iw, f, p, s);
    // clang-format on
}
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BWD_FILTER) {
    CUBenchmarker<ConvolutionBackwardFilter> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardFilter>("CHANNEL_WISE"));

    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;

    auto run = [&](size_t batch, size_t ocpg, size_t group, size_t i, size_t f,
                   size_t p, size_t s) {
        param.pad_h = p;
        param.pad_w = p;
        param.stride_h = s;
        param.stride_w = s;
        size_t d = infer_conv_shape(i, f, s, p, true);
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;

        TensorShape src = {batch, group, i, i},
                    dst_grad = {batch, group * ocpg, d, d},
                    flt_grad = {group, ocpg, 1, f, f};

        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        float bandwith = static_cast<float>(flt_grad.total_nr_elems() +
                                            dst_grad.total_nr_elems() +
                                            src.total_nr_elems()) /
                         (1024 * 1024 * 1024) * 1e3;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp32 = bencher.execs({src, dst_grad, flt_grad}) / RUNS;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16 = bencher.execs({src, dst_grad, flt_grad}) / RUNS;

        printf("stride=%zu, src=%s, flt_grad=%s, "
               "float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "speedup: "
               "%.2f (fp16/fp32)\n",
               s, src.to_string().c_str(), flt_grad.to_string().c_str(),
               time_in_ms_fp32, bandwith * 4 / time_in_ms_fp32,
               time_in_ms_fp16, bandwith * 2 / time_in_ms_fp16,
               time_in_ms_fp32 / time_in_ms_fp16);
    };

    // clang-format off
    for (size_t s : {1, 2})
    for (size_t f : {3, 5, 7})
    for (size_t p : {f / 2})
    for (size_t batch : {64})
    for (size_t ocpg : {1})
    for (size_t group : {16, 32, 64, 128})
    for (size_t i : {8, 16, 32, 64, 128})
        run(batch, ocpg, group, i, f, p, s);
    // clang-format on
}
#endif

// vim: syntax=cpp.doxygen

The MegEngine installation package bundles the CUDA environment needed to run code on a GPU, so there are no separate CPU and GPU builds. To run GPU programs, make sure the machine has GPU hardware and that the driver is installed. If you would like to try deep-learning development on a cloud GPU platform, you are welcome to visit the MegStudio platform.