
chanwise_convolution.cpp 39 kB

/**
 * \file dnn/test/cuda/chanwise_convolution.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "megdnn/oprs/nn.h"

#include "test/cuda/fixture.h"
#include "test/cuda/benchmark.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/common/checker.h"
#include "test/common/convolution.h"
#include "test/common/benchmarker.h"
#include "megcore_cuda.h"
#include "cuda.h"

#include <cuda_profiler_api.h>
#include <cuda_runtime_api.h>

using namespace megdnn;
using namespace test;
namespace {

#if MEGDNN_WITH_BENCHMARK
bool check_need_full_bench() {
    if (getenv("MEGDNN_CHANWISE_CONV_FULLBENCH"))
        return true;
    printf("set MEGDNN_CHANWISE_CONV_FULLBENCH to run full benchmark\n");
    return false;
}
#endif

Convolution::Param gconv_param(Convolution::Param p) {
    p.sparse = Convolution::Param::Sparse::GROUP;
    return p;
}

template <int P0, int P1, int P2>
class BenchmarkEnv {
    Handle *handle, *handle_cpu;
    std::unique_ptr<GaussianRNG> rng;
    TensorLayout lsrc, lflt0, lflt1, ldst;
    std::unique_ptr<Tensor<>> src0, src1, flt0, flt0_cpu, flt1, flt1_cpu, dst0,
            dst1;
    cudaEvent_t cuda_ev[3];
    cudaStream_t cuda_stream;
    size_t pad_h, pad_w;

    template <typename T>
    static std::tuple<T, T, T> shuffle(std::tuple<T, T, T> data) {
        return std::make_tuple(
                std::get<P0>(data), std::get<P1>(data), std::get<P2>(data));
    }

public:
    BenchmarkEnv(Handle* handle, Handle* handle_cpu) {
        this->handle = handle;
        this->handle_cpu = handle_cpu;
        rng = handle->create_operator<GaussianRNG>();
        // make cpu handle used
        handle_cpu->create_operator<Sleep>()->exec();
        for (int i = 0; i < 3; ++i)
            cudaEventCreate(&cuda_ev[i]);
        megcoreGetCUDAStream(handle->megcore_computing_handle(), &cuda_stream);
    }

    ~BenchmarkEnv() {
        for (int i = 0; i < 3; ++i)
            cudaEventDestroy(cuda_ev[i]);
    }

    void alloc(size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL,
               size_t FH, size_t FW, size_t PH, size_t PW) {
        pad_h = PH;
        pad_w = PW;
        auto mkly = [](const TensorShape& s) {
            return TensorLayout{s, dtype::Float32()};
        };
        lsrc = mkly({N, IC, IH, IW});
        lflt0 = mkly({CHL_MUL * IC, IC, FH, FW});
        lflt1 = mkly({IC, CHL_MUL, 1, FH, FW});
        ldst = mkly({N, IC * CHL_MUL, IH - FH + 1 + PH * 2,
                     IW - FW + 1 + PW * 2});
        src0.reset(new Tensor<>(handle, lsrc));
        src1.reset(new Tensor<>(handle, lsrc));
        flt0.reset(new Tensor<>(handle, lflt0));
        flt0_cpu.reset(new Tensor<>(handle_cpu, lflt0));
        flt1.reset(new Tensor<>(handle, lflt1));
        flt1_cpu.reset(new Tensor<>(handle_cpu, lflt1));
        dst0.reset(new Tensor<>(handle, ldst));
        dst1.reset(new Tensor<>(handle, ldst));
    }

    void fill_src() {
        rng->exec(src0->tensornd(), {});
        megdnn_memcpy_D2D(handle, src1->ptr(), src0->ptr(),
                          lsrc.span().dist_byte());
    }

    void fill_flt() {
        rng->exec(flt1->tensornd(), {});
        megdnn_memcpy_D2H(handle, flt1_cpu->ptr(), flt1->ptr(),
                          lflt1.span().dist_byte());

        const size_t IC = lflt1[0], CHL_MUL = lflt1[1],
                     FSIZE = lflt1[3] * lflt1[4];

        // fill flt0 from flt1: embed the grouped filter into a dense filter
        // that is zero outside the block diagonal
        float* src = flt1_cpu->ptr();
        float* dst = flt0_cpu->ptr();
        memset(dst, 0, lflt0.span().dist_byte());
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                memcpy(dst + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                       src + (i * CHL_MUL + j) * FSIZE, FSIZE * sizeof(float));
            }
        }

        megdnn_memcpy_H2D(handle, flt0->ptr(), dst, lflt0.span().dist_byte());
    }

    void fill_dst() {
        rng->exec(dst0->tensornd(), {});
        megdnn_memcpy_D2D(handle, dst1->ptr(), dst0->ptr(),
                          ldst.span().dist_byte());
    }

    template <class Opr>
    void exec(Opr* opr0, Opr* opr1) {
        opr0->param().pad_h = pad_h;
        opr0->param().pad_w = pad_w;
        opr1->param() = opr0->param();
        opr1->param().sparse = param::Convolution::Sparse::GROUP;
        TensorND a0, b0, c0, a1, b1, c1;
        std::tie(a0, b0, c0) = shuffle(std::make_tuple(
                src0->tensornd(), flt0->tensornd(), dst0->tensornd()));
        std::tie(a1, b1, c1) = shuffle(std::make_tuple(
                src1->tensornd(), flt1->tensornd(), dst1->tensornd()));
        WorkspaceWrapper wk(
                handle,
                std::max(opr0->get_workspace_in_bytes(a0.layout, b0.layout,
                                                      c0.layout),
                         opr1->get_workspace_in_bytes(a1.layout, b1.layout,
                                                      c1.layout)));
        cudaProfilerStart();
        cudaEventRecord(cuda_ev[0], cuda_stream);
        opr0->exec(a0, b0, c0, wk.workspace());
        cudaEventRecord(cuda_ev[1], cuda_stream);
        opr1->exec(a1, b1, c1, wk.workspace());
        cudaEventRecord(cuda_ev[2], cuda_stream);
        cudaProfilerStop();
        if (getenv("MEGDNN_CHANWISE_CONV_VERBOSE") ||
            getenv("MEGDNN_CHANWISE_CONV_FULLBENCH")) {
            cudaStreamSynchronize(cuda_stream);
            float t0 = -1, t1 = -1;
            cudaEventElapsedTime(&t0, cuda_ev[0], cuda_ev[1]);
            cudaEventElapsedTime(&t1, cuda_ev[1], cuda_ev[2]);
            printf("%s;%s;%s: cudnn/megdnn: %.3fms/%.3fms=%.3f\n",
                   lsrc.TensorShape::to_string().c_str(),
                   lflt1.TensorShape::to_string().c_str(),
                   ldst.TensorShape::to_string().c_str(), t0, t1, t0 / t1);
        }
    }

    void cmp_dst() {
        Tensor<> dst0_cpu(handle_cpu, ldst), dst1_cpu(handle_cpu, ldst);
        megdnn_memcpy_D2H(handle, dst0_cpu.ptr(), dst0->ptr(),
                          ldst.span().dist_byte());
        megdnn_memcpy_D2H(handle, dst1_cpu.ptr(), dst1->ptr(),
                          ldst.span().dist_byte());
        dst0_cpu.check_with(dst1_cpu);
    }

    void cmp_src() {
        Tensor<> src0_cpu(handle_cpu, lsrc), src1_cpu(handle_cpu, lsrc);
        megdnn_memcpy_D2H(handle, src0_cpu.ptr(), src0->ptr(),
                          lsrc.span().dist_byte());
        megdnn_memcpy_D2H(handle, src1_cpu.ptr(), src1->ptr(),
                          lsrc.span().dist_byte());
        src0_cpu.check_with(src1_cpu);
    }

    void cmp_flt() {
        Tensor<> flt0_cpu(handle_cpu, lflt0), flt1_cpu(handle_cpu, lflt1);
        float* p0 = flt0_cpu.ptr();
        float* p1 = flt1_cpu.ptr();
        megdnn_memcpy_D2H(handle, p0, flt0->ptr(), lflt0.span().dist_byte());
        megdnn_memcpy_D2H(handle, p1, flt1->ptr(), lflt1.span().dist_byte());

        size_t IC = lflt1[0], CHL_MUL = lflt1[1], FSIZE = lflt1[3] * lflt1[4];

        double tot_err = 0, tot_err_num = 0;
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                auto t0 = p0 + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                     t1 = p1 + (i * CHL_MUL + j) * FSIZE;
                for (size_t k = 0; k < FSIZE; ++k) {
                    auto err = std::abs(diff(t0[k], t1[k]));
                    tot_err += err;
                    tot_err_num += 1;
                    ASSERT_LT(err, 1e-2)
                            << "failed at " << i << " " << j << " " << k
                            << " vals=" << t0[k] << "," << t1[k];
                }
            }
        }
        auto avg_err = tot_err / tot_err_num;
        ASSERT_LT(avg_err, 1e-4);
    }
};

}  // anonymous namespace
constexpr auto M = Convolution::Mode::CROSS_CORRELATION;

TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD) {
    Checker<Convolution> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                    "CHANNEL_WISE", {})
                    .c_str(),
            &require_algo));

    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-2);
        // simple case
        // clang-format off
        for (uint32_t s : {1, 2})
        for (uint32_t p : {0, 1, 2, 3})
        for (size_t f : {2, 3, 5, 7})
        for (size_t ocpg : {1, 3}) {
            checker.set_param(gconv_param({M, p, p, s, s}))
                    .execs({{2, 3, 16, 16}, {3, ocpg, 1, f, f}, {}});
        }
        // clang-format on

        checker.set_param(gconv_param({M, 2, 3, 2, 1}))
                .execs({{32, 12, 20, 10}, {12, 2, 1, 4, 5}, {}});

        // padding larger than kern
        checker.set_param(gconv_param({M, 20, 30, 4, 5}))
                .execs({{32, 12, 20, 10}, {12, 2, 1, 4, 5}, {}});
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_SMALL) {
    Checker<Convolution> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                    "CHANNEL_WISE_SMALL", {})
                    .c_str(),
            &require_algo));
    for (auto dtype : std::vector<DType>{
                 dtype::Float32(),
#if CUDA_VERSION >= 9000
                 dtype::Float16()
#endif
         }) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-2);
        // clang-format off
        for (uint32_t s : {1})
        for (uint32_t f : {1, 3, 5, 7}) {
            checker.set_param(gconv_param({M, f / 2, f / 2, s, s}))
                    .execs({{2, 3, 16, 16}, {3, 1, 1, f, f}, {}});
        }
        // clang-format on
        checker.set_param(gconv_param({M, 1, 1, 1, 1}))
                .execs({{2, 3, 3, 16}, {3, 1, 1, 3, 3}, {}})
                .execs({{2, 3, 8, 3}, {3, 1, 1, 3, 3}, {}});
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA) {
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            "CHANNEL_WISE", &require_algo));
    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(1e-1);
        // simple case
        // clang-format off
        for (uint32_t s : {1, 2})
        for (uint32_t p : {0, 1, 2, 3})
        for (size_t f : {1, 2, 3, 5, 7})
        for (size_t ocpg : {1, 3}) {
            size_t ii = infer_conv_shape(16, f, s, p, true);
            checker.set_param(gconv_param({M, p, p, s, s}))
                    .execs({{3, ocpg, 1, f, f},
                            {2, 3 * ocpg, ii, ii},
                            {2, 3, 16, 16}});
        }
        // clang-format on
        checker.set_param(gconv_param({M, 2, 3, 2, 1}))
                .execs({{12, 3, 1, 4, 5}, {32, 36, 20, 10}, {32, 12, 39, 8}});
        checker.set_param(gconv_param({M, 30, 20, 5, 4}))
                .execs({{6, 2, 1, 5, 4}, {32, 12, 12, 10}, {32, 6, 3, 2}});
        checker.set_param(gconv_param({M, 20, 30, 4, 5}))
                .execs({{6, 2, 1, 4, 5}, {32, 12, 10, 12}, {32, 6, 2, 3}});
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_SMALL) {
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            "CHANNEL_WISE_SMALL", &require_algo));
    for (auto dtype : std::vector<DType>{
                 dtype::Float32(),
#if CUDA_VERSION >= 9000
                 dtype::Float16()
#endif
         }) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-2);
        for (uint32_t f : {1, 3, 5, 7}) {
            checker.set_param(gconv_param({M, f / 2, f / 2, 1, 1}))
                    .execs({{3, 1, 1, f, f}, {2, 3, 16, 16}, {2, 3, 16, 16}});
        }
        checker.set_param(gconv_param({M, 1, 1, 1, 1}))
                .execs({{3, 1, 1, 3, 3}, {2, 3, 3, 16}, {2, 3, 3, 16}})
                .execs({{3, 1, 1, 3, 3}, {2, 3, 8, 3}, {2, 3, 8, 3}});
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER) {
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
            "CHANNEL_WISE", &require_algo));
    UniformFloatRNG rng(-0.1, 0.1);
    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
        checker.set_dtype(0, dtype)
                .set_dtype(1, dtype)
                .set_dtype(2, dtype)
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-1);
        // simple case
        // clang-format off
        for (uint32_t s : {1, 2})
        for (uint32_t p : {0, 1, 2, 3})
        for (uint32_t f : {1, 2, 3, 5, 7})
        for (uint32_t ocpg : {1, 3})
        for (uint32_t i : {8, 16, 32, 64}) {
            size_t ii = infer_conv_shape(i, f, s, p, true);
            checker.set_param(gconv_param({M, p, p, s, s}))
                    .execs({{2, 3, i, i},
                            {2, 3 * ocpg, ii, ii},
                            {3, ocpg, 1, f, f}});
        }
        // clang-format on
        // padding larger than kern
        checker.set_param(gconv_param({M, 20, 30, 4, 5}))
                .execs({{32, 6, 2, 3}, {32, 12, 10, 12}, {6, 2, 1, 4, 5}});
        // unused filter items
        checker.set_param(gconv_param({M, 2, 3, 2, 3}))
                .execs({{32, 6, 1, 1}, {32, 12, 1, 1}, {6, 2, 1, 5, 7}});
    }
}
#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionForward>();
    auto conv1 = handle->create_operator<ConvolutionForward>();
    BenchmarkEnv<0, 1, 2> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL,
                   size_t FH, size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_src();
        benv.fill_flt();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_dst();
    };
    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BWD_DATA_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionBackwardData>();
    auto conv1 = handle->create_operator<ConvolutionBackwardData>();
    BenchmarkEnv<1, 2, 0> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL,
                   size_t FH, size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_dst();
        benv.fill_flt();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_src();
    };
    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BWD_FILTER_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionBackwardFilter>();
    auto conv1 = handle->create_operator<ConvolutionBackwardFilter>();
    BenchmarkEnv<0, 2, 1> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL,
                   size_t FH, size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_src();
        benv.fill_dst();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_flt();
    };
    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}
TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_FWD) {
    // enable profiling
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    proxy->warmup_times = 1;
    proxy->exec_times = 10;
    Benchmarker<ConvolutionForward> checker(handle_cuda());
    checker.set_times(1);
    ConvolutionForward::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.set_proxy(proxy);
    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH,
                   size_t FW) {
        checker.proxy()->target_algo = nullptr;
        checker.execs({{N, C, IH, IW}, {C, 1, 1, FH, FW}, {}});
    };
    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_DATA) {
    // enable profiling
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    proxy->warmup_times = 1;
    proxy->exec_times = 10;
    Benchmarker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_times(1);
    ConvolutionBackwardData::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.set_proxy(proxy);
    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH,
                   size_t FW) {
        checker.proxy()->target_algo = nullptr;
        checker.execs({{C, 1, 1, FH, FW},
                       {N, C, IH - FH + 1, IW - FW + 1},
                       {N, C, IH, IW}});
    };
    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_FILTER) {
    // enable profiling
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    proxy->warmup_times = 1;
    proxy->exec_times = 10;
    Benchmarker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_times(1);
    ConvolutionBackwardFilter::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.set_proxy(proxy);
    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH,
                   size_t FW) {
        checker.proxy()->target_algo = nullptr;
        checker.execs({{N, C, IH, IW},
                       {N, C, IH - FH + 1, IW - FW + 1},
                       {C, 1, 1, FH, FW}});
    };
    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 10;
    bencher.set_display(false).set_times(RUNS);
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    bencher.set_proxy(proxy);
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f,
                   size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout({src, dtype::Float32()}, {filter, dtype::Float32()},
                           dst_layout);
        // total elements in Gi, scaled by 1e3 so that
        // bandwidth * bytes_per_elem / time_in_ms yields GB/s
        float bandwidth = static_cast<float>(src.total_nr_elems() +
                                             filter.total_nr_elems() +
                                             dst_layout.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_algo = nullptr;
        auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_algo = nullptr;
        auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;
        bencher.proxy()->target_algo = nullptr;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bencher.set_param(param);
        auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "pseudo float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32) %.2f (fp16/pseudo fp16)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               time_in_ms_fp32, bandwidth * 4 / time_in_ms_fp32,
               time_in_ms_fp16, bandwidth * 2 / time_in_ms_fp16,
               time_in_ms_pseudo_fp16, bandwidth * 2 / time_in_ms_pseudo_fp16,
               time_in_ms_fp32 / time_in_ms_fp16,
               time_in_ms_pseudo_fp16 / time_in_ms_fp16);
    };
    // clang-format off
    for (size_t s : {1, 2})
    for (size_t f : {3, 5, 7})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {128, 256})
    for (size_t iw : {128, 256})
        run(batch, c, ih, iw, f, s);
    // clang-format on
    run(128, 192, 28, 28, 3, 1);
    run(128, 192, 28, 28, 3, 2);
    run(128, 576, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 32, 112, 112, 3, 1);
    run(128, 960, 7, 7, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 144, 56, 56, 3, 2);
    run(128, 384, 14, 14, 3, 1);
    run(128, 144, 56, 56, 3, 1);
    run(128, 96, 112, 112, 3, 2);
    run(128, 384, 14, 14, 3, 1);
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
    run(128, 576, 14, 14, 3, 2);
}
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    bencher.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                    "CHANNEL_WISE", {})
                    .c_str()));
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f,
                   size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout({src, dtype::Float32()}, {filter, dtype::Float32()},
                           dst_layout);
        float bandwidth = static_cast<float>(src.total_nr_elems() +
                                             filter.total_nr_elems() +
                                             dst_layout.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               time_in_ms_fp32, bandwidth * 4 / time_in_ms_fp32,
               time_in_ms_fp16, bandwidth * 2 / time_in_ms_fp16,
               time_in_ms_fp32 / time_in_ms_fp16);
    };
    // clang-format off
    for (size_t s : {1})
    for (size_t f : {3, 5, 7})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32, 128, 256})
    for (size_t iw : {8, 16, 32, 128, 256})
        run(batch, c, ih, iw, f, s);
    // clang-format on
}
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT_SMALL) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f,
                   size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout({src, dtype::Float32()}, {filter, dtype::Float32()},
                           dst_layout);
        float bandwidth = static_cast<float>(src.total_nr_elems() +
                                             filter.total_nr_elems() +
                                             dst_layout.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_before_exec_callback(AlgoChecker<ConvolutionForward>(
                        ConvBiasForward::algo_name<
                                ConvBiasForward::DirectParam>("CHANNEL_WISE",
                                                              {})
                                .c_str()));
        auto time_in_ms_fp32_normal = bencher.execs({src, filter, {}}) / RUNS;
        // time the CHANNEL_WISE_SMALL algorithm for the "small" measurement
        bencher.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
                ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                        "CHANNEL_WISE_SMALL", {})
                        .c_str()));
        auto time_in_ms_fp32_small = bencher.execs({src, filter, {}}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16_small = bencher.execs({src, filter, {}}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, fp32 normal: %.2fms %.2fGB/s "
               "small: %.2fms %.2fGB/s, fp16 small: %.2fms %.2fGB/s, "
               "speedup: "
               "%0.2f (fp32 small/normal) %0.2f (small fp16/fp32)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               time_in_ms_fp32_normal, bandwidth * 4 / time_in_ms_fp32_normal,
               time_in_ms_fp32_small, bandwidth * 4 / time_in_ms_fp32_small,
               time_in_ms_fp16_small, bandwidth * 2 / time_in_ms_fp16_small,
               time_in_ms_fp32_normal / time_in_ms_fp32_small,
               time_in_ms_fp32_small / time_in_ms_fp16_small);
    };
    // clang-format off
    for (size_t s : {1})
    for (size_t f : {3, 5})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32})
    for (size_t iw : {8, 16, 32})
        run(batch, c, ih, iw, f, s);
    // clang-format on
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 960, 7, 7, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
}
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_DATA_FLOAT_SMALL) {
    CUBenchmarker<ConvolutionBackwardData> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    ConvolutionBackwardData::Param param;
    param.format = Convolution::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f,
                   size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        float bandwidth = static_cast<float>(src.total_nr_elems() +
                                             filter.total_nr_elems() +
                                             src.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_before_exec_callback(
                        AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE"));
        auto time_in_ms_fp32_normal = bencher.execs({filter, src, src}) / RUNS;
        bencher.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE_SMALL"));
        auto time_in_ms_fp32_small = bencher.execs({filter, src, src}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16_small = bencher.execs({filter, src, src}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, fp32 normal: %.2fms %.2fGB/s "
               "small: %.2fms %.2fGB/s, fp16 small: %.2fms %.2fGB/s, "
               "speedup: "
               "%0.2f (fp32 small/normal) %0.2f (small fp16/fp32)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               time_in_ms_fp32_normal, bandwidth * 4 / time_in_ms_fp32_normal,
               time_in_ms_fp32_small, bandwidth * 4 / time_in_ms_fp32_small,
               time_in_ms_fp16_small, bandwidth * 2 / time_in_ms_fp16_small,
               time_in_ms_fp32_normal / time_in_ms_fp32_small,
               time_in_ms_fp32_small / time_in_ms_fp16_small);
    };
    // clang-format off
    for (size_t s : {1})
    for (size_t f : {3, 5})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32})
    for (size_t iw : {8, 16, 32})
        run(batch, c, ih, iw, f, s);
    // clang-format on
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 960, 7, 7, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
}
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BWD_DATA) {
    CUBenchmarker<ConvolutionBackwardData> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE"));
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t ocpg, size_t group, size_t ih,
                   size_t iw, size_t f, size_t p, size_t s) {
        param.pad_h = p;
        param.pad_w = p;
        param.stride_h = s;
        param.stride_w = s;
        size_t oh, ow;
        infer_conv_shape2d(ih, iw, f, f, s, s, p, p, oh, ow, true);
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src_grad = {batch, group, ih, iw},
                    dst_grad = {batch, group * ocpg, oh, ow},
                    flt = {group, ocpg, 1, f, f};
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        float bandwidth = static_cast<float>(flt.total_nr_elems() +
                                             dst_grad.total_nr_elems() +
                                             src_grad.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp32 = bencher.execs({flt, dst_grad, src_grad}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16 = bencher.execs({flt, dst_grad, src_grad}) / RUNS;
        printf("stride=%zu, src_grad=%s, flt=%s, "
               "float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32)\n",
               s, src_grad.to_string().c_str(), flt.to_string().c_str(),
               time_in_ms_fp32, bandwidth * 4 / time_in_ms_fp32,
               time_in_ms_fp16, bandwidth * 2 / time_in_ms_fp16,
               time_in_ms_fp32 / time_in_ms_fp16);
    };
    // clang-format off
    for (size_t s : {1, 2})
    for (size_t f : {3, 5, 7})
    for (size_t p : {f / 2})
    for (size_t batch : {64})
    for (size_t ocpg : {1})
    for (size_t group : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32, 128, 256})
    for (size_t iw : {8, 16, 32, 128, 256})
        run(batch, ocpg, group, ih, iw, f, p, s);
    // clang-format on
}
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BWD_FILTER) {
    CUBenchmarker<ConvolutionBackwardFilter> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardFilter>("CHANNEL_WISE"));
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t ocpg, size_t group, size_t i, size_t f,
                   size_t p, size_t s) {
        param.pad_h = p;
        param.pad_w = p;
        param.stride_h = s;
        param.stride_w = s;
        size_t d = infer_conv_shape(i, f, s, p, true);
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, group, i, i},
                    dst_grad = {batch, group * ocpg, d, d},
                    flt_grad = {group, ocpg, 1, f, f};
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        float bandwidth = static_cast<float>(flt_grad.total_nr_elems() +
                                             dst_grad.total_nr_elems() +
                                             src.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp32 = bencher.execs({src, dst_grad, flt_grad}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16 = bencher.execs({src, dst_grad, flt_grad}) / RUNS;
        printf("stride=%zu, src=%s, flt_grad=%s, "
               "float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "speedup: "
               "%.2f (fp16/fp32)\n",
               s, src.to_string().c_str(), flt_grad.to_string().c_str(),
               time_in_ms_fp32, bandwidth * 4 / time_in_ms_fp32,
               time_in_ms_fp16, bandwidth * 2 / time_in_ms_fp16,
               time_in_ms_fp32 / time_in_ms_fp16);
    };
    // clang-format off
    for (size_t s : {1, 2})
    for (size_t f : {3, 5, 7})
    for (size_t p : {f / 2})
    for (size_t batch : {64})
    for (size_t ocpg : {1})
    for (size_t group : {16, 32, 64, 128})
    for (size_t i : {8, 16, 32, 64, 128})
        run(batch, ocpg, group, i, f, p, s);
    // clang-format on
}
#endif

// vim: syntax=cpp.doxygen
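
A note on the correctness strategy above: BenchmarkEnv::fill_flt embeds the channel-wise filter of shape {IC, CHL_MUL, 1, FH, FW} into a dense filter of shape {IC*CHL_MUL, IC, FH, FW} that is zero outside the block diagonal, so exec can feed identical data to a plain dense convolution (opr0 with flt0) and to the grouped channel-wise algorithm (opr1 with flt1), and cmp_dst/cmp_src/cmp_flt can compare the two results. Setting MEGDNN_CHANWISE_CONV_VERBOSE or MEGDNN_CHANWISE_CONV_FULLBENCH makes exec print the cudnn/megdnn timing ratio, and MEGDNN_CHANWISE_CONV_FULLBENCH also enables the larger benchmark cases. The following standalone sketch illustrates just the index mapping; it is not part of the test file, uses small hypothetical shapes, and depends only on the C++ standard library:

    #include <cstdio>
    #include <cstring>
    #include <vector>

    // Standalone sketch: embed a channel-wise filter {IC, CHL_MUL, 1, FH, FW}
    // into an equivalent dense filter {IC * CHL_MUL, IC, FH, FW}, mirroring
    // the copy loop in fill_flt(). Shapes are illustrative values only.
    int main() {
        const size_t IC = 2, CHL_MUL = 3, FH = 2, FW = 2;
        const size_t FSIZE = FH * FW;

        std::vector<float> grouped(IC * CHL_MUL * FSIZE);
        for (size_t i = 0; i < grouped.size(); ++i)
            grouped[i] = float(i + 1);  // arbitrary nonzero weights

        // dense filter: IC*CHL_MUL output channels, each seeing all IC input
        // channels, but only the "diagonal" input channel carries weights
        std::vector<float> dense(IC * CHL_MUL * IC * FSIZE, 0.f);
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                std::memcpy(dense.data() + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                            grouped.data() + (i * CHL_MUL + j) * FSIZE,
                            FSIZE * sizeof(float));
            }
        }

        // visualize the block diagonal: row = output channel, col = input
        // channel, 'X' marks a nonzero weight block
        for (size_t oc = 0; oc < IC * CHL_MUL; ++oc) {
            for (size_t ic = 0; ic < IC; ++ic)
                printf("%c", dense[(oc * IC + ic) * FSIZE] != 0.f ? 'X' : '.');
            printf("\n");
        }
        return 0;
    }

Output channel oc = i * CHL_MUL + j of the dense filter carries weights only for input channel i, which is exactly the offset ((i * CHL_MUL + j) * IC + i) * FSIZE used by fill_flt and rechecked element-wise by cmp_flt.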

The MegEngine installation package bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build to choose between. To run GPU programs, make sure the machine has a GPU and that the driver is installed. If you want to try deep-learning development on cloud GPU compute, you are welcome to visit the MegStudio platform.