
dnn/test/cuda/chanwise_convolution.cpp

/**
 * \file dnn/test/cuda/chanwise_convolution.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "megdnn/oprs/nn.h"

#include "test/cuda/fixture.h"
#include "test/cuda/benchmark.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/common/checker.h"
#include "test/common/convolution.h"
#include "test/common/benchmarker.h"
#include "megcore_cuda.h"

#include "cuda.h"
#include <cuda_profiler_api.h>
#include <cuda_runtime_api.h>

using namespace megdnn;
using namespace test;
namespace {

#if MEGDNN_WITH_BENCHMARK
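// Full benchmarks are expensive, so they are skipped unless the
// MEGDNN_CHANWISE_CONV_FULLBENCH environment variable is set.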
bool check_need_full_bench() {
    if (getenv("MEGDNN_CHANWISE_CONV_FULLBENCH"))
        return true;
    printf("set MEGDNN_CHANWISE_CONV_FULLBENCH to run full benchmark\n");
    return false;
}
#endif
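// Return a copy of the param with sparsity set to GROUP: channel-wise
// (depthwise) convolution is expressed as a grouped convolution in megdnn.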
Convolution::Param gconv_param(Convolution::Param p) {
    p.sparse = Convolution::Param::Sparse::GROUP;
    return p;
}

template <int P0, int P1, int P2>
class BenchmarkEnv {
    Handle *handle, *handle_cpu;
    std::unique_ptr<GaussianRNG> rng;
    TensorLayout lsrc, lflt0, lflt1, ldst;
    std::unique_ptr<Tensor<>> src0, src1, flt0, flt0_cpu, flt1, flt1_cpu, dst0,
            dst1;
    cudaEvent_t cuda_ev[3];
    cudaStream_t cuda_stream;
    size_t pad_h, pad_w;
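    // Permute (src, filter, dst) into the operand order expected by the
    // operator under test; e.g. BenchmarkEnv<1, 2, 0> yields the
    // (filter, diff, grad) order used by ConvolutionBackwardData.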
    template <typename T>
    static std::tuple<T, T, T> shuffle(std::tuple<T, T, T> data) {
        return std::make_tuple(
                std::get<P0>(data), std::get<P1>(data), std::get<P2>(data));
    }

public:
    BenchmarkEnv(Handle* handle, Handle* handle_cpu) {
        this->handle = handle;
        this->handle_cpu = handle_cpu;
        rng = handle->create_operator<GaussianRNG>();
        // make cpu handle used
        handle_cpu->create_operator<Sleep>()->exec();
        for (int i = 0; i < 3; ++i)
            cudaEventCreate(&cuda_ev[i]);
        megcoreGetCUDAStream(handle->megcore_computing_handle(), &cuda_stream);
    }

    ~BenchmarkEnv() {
        for (int i = 0; i < 3; ++i)
            cudaEventDestroy(cuda_ev[i]);
    }

    void alloc(size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL,
               size_t FH, size_t FW, size_t PH, size_t PW) {
        pad_h = PH;
        pad_w = PW;
        auto mkly = [](const TensorShape& s) {
            return TensorLayout{s, dtype::Float32()};
        };
        lsrc = mkly({N, IC, IH, IW});
        lflt0 = mkly({CHL_MUL * IC, IC, FH, FW});
        lflt1 = mkly({IC, CHL_MUL, 1, FH, FW});
        ldst = mkly({N, IC * CHL_MUL, IH - FH + 1 + PH * 2,
                     IW - FW + 1 + PW * 2});
        src0.reset(new Tensor<>(handle, lsrc));
        src1.reset(new Tensor<>(handle, lsrc));
        flt0.reset(new Tensor<>(handle, lflt0));
        flt0_cpu.reset(new Tensor<>(handle_cpu, lflt0));
        flt1.reset(new Tensor<>(handle, lflt1));
        flt1_cpu.reset(new Tensor<>(handle_cpu, lflt1));
        dst0.reset(new Tensor<>(handle, ldst));
        dst1.reset(new Tensor<>(handle, ldst));
    }

    void fill_src() {
        rng->exec(src0->tensornd(), {});
        megdnn_memcpy_D2D(handle, src1->ptr(), src0->ptr(),
                          lsrc.span().dist_byte());
    }

    void fill_flt() {
        rng->exec(flt1->tensornd(), {});
        megdnn_memcpy_D2H(handle, flt1_cpu->ptr(), flt1->ptr(),
                          lflt1.span().dist_byte());
        const size_t IC = lflt1[0], CHL_MUL = lflt1[1],
                     FSIZE = lflt1[3] * lflt1[4];
        // fill flt0 from flt1
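        // flt0 is the dense filter equivalent to the channel-wise flt1: each
        // input channel's (CHL_MUL, FH, FW) block is copied onto the block
        // diagonal of a (CHL_MUL * IC, IC, FH, FW) filter, leaving all
        // cross-channel weights at zero.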
        float* src = flt1_cpu->ptr();
        float* dst = flt0_cpu->ptr();
        memset(dst, 0, lflt0.span().dist_byte());
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                memcpy(dst + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                       src + (i * CHL_MUL + j) * FSIZE, FSIZE * sizeof(float));
            }
        }
        megdnn_memcpy_H2D(handle, flt0->ptr(), dst, lflt0.span().dist_byte());
    }

    void fill_dst() {
        rng->exec(dst0->tensornd(), {});
        megdnn_memcpy_D2D(handle, dst1->ptr(), dst0->ptr(),
                          ldst.span().dist_byte());
    }
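    // Run opr0 on the dense tensors and opr1 on the channel-wise tensors with
    // identical data, recording CUDA events around each exec so the two
    // implementations can be timed individually.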
    template <class Opr>
    void exec(Opr* opr0, Opr* opr1) {
        opr0->param().pad_h = pad_h;
        opr0->param().pad_w = pad_w;
        opr1->param() = opr0->param();
        opr1->param().sparse = param::Convolution::Sparse::GROUP;
        TensorND a0, b0, c0, a1, b1, c1;
        std::tie(a0, b0, c0) = shuffle(std::make_tuple(
                src0->tensornd(), flt0->tensornd(), dst0->tensornd()));
        std::tie(a1, b1, c1) = shuffle(std::make_tuple(
                src1->tensornd(), flt1->tensornd(), dst1->tensornd()));
        WorkspaceWrapper wk(
                handle,
                std::max(opr0->get_workspace_in_bytes(a0.layout, b0.layout,
                                                      c0.layout),
                         opr1->get_workspace_in_bytes(a1.layout, b1.layout,
                                                      c1.layout)));
        cudaProfilerStart();
        cudaEventRecord(cuda_ev[0], cuda_stream);
        opr0->exec(a0, b0, c0, wk.workspace());
        cudaEventRecord(cuda_ev[1], cuda_stream);
        opr1->exec(a1, b1, c1, wk.workspace());
        cudaEventRecord(cuda_ev[2], cuda_stream);
        cudaProfilerStop();
        if (getenv("MEGDNN_CHANWISE_CONV_VERBOSE") ||
            getenv("MEGDNN_CHANWISE_CONV_FULLBENCH")) {
            cudaStreamSynchronize(cuda_stream);
            float t0 = -1, t1 = -1;
            cudaEventElapsedTime(&t0, cuda_ev[0], cuda_ev[1]);
            cudaEventElapsedTime(&t1, cuda_ev[1], cuda_ev[2]);
            printf("%s;%s;%s: cudnn/megdnn: %.3fms/%.3fms=%.3f\n",
                   lsrc.TensorShape::to_string().c_str(),
                   lflt1.TensorShape::to_string().c_str(),
                   ldst.TensorShape::to_string().c_str(), t0, t1, t0 / t1);
        }
    }
    //! special for weight preprocess
    void exec_convolution(ConvolutionForward* opr0, ConvolutionForward* opr1) {
        opr0->param().pad_h = pad_h;
        opr0->param().pad_w = pad_w;
        opr1->param() = opr0->param();
        opr1->param().sparse = param::Convolution::Sparse::GROUP;
        TensorND a0, b0, c0, a1, b1, c1;
        std::tie(a0, b0, c0) = shuffle(std::make_tuple(
                src0->tensornd(), flt0->tensornd(), dst0->tensornd()));
        std::tie(a1, b1, c1) = shuffle(std::make_tuple(
                src1->tensornd(), flt1->tensornd(), dst1->tensornd()));
        WorkspaceWrapper wk(
                handle,
                std::max(opr0->get_workspace_in_bytes(a0.layout, b0.layout,
                                                      c0.layout, nullptr),
                         opr1->get_workspace_in_bytes(a1.layout, b1.layout,
                                                      c1.layout, nullptr)));
        cudaProfilerStart();
        cudaEventRecord(cuda_ev[0], cuda_stream);
        opr0->exec(a0, b0, c0, nullptr, wk.workspace());
        cudaEventRecord(cuda_ev[1], cuda_stream);
        opr1->exec(a1, b1, c1, nullptr, wk.workspace());
        cudaEventRecord(cuda_ev[2], cuda_stream);
        cudaProfilerStop();
        if (getenv("MEGDNN_CHANWISE_CONV_VERBOSE") ||
            getenv("MEGDNN_CHANWISE_CONV_FULLBENCH")) {
            cudaStreamSynchronize(cuda_stream);
            float t0 = -1, t1 = -1;
            cudaEventElapsedTime(&t0, cuda_ev[0], cuda_ev[1]);
            cudaEventElapsedTime(&t1, cuda_ev[1], cuda_ev[2]);
            printf("%s;%s;%s: cudnn/megdnn: %.3fms/%.3fms=%.3f\n",
                   lsrc.TensorShape::to_string().c_str(),
                   lflt1.TensorShape::to_string().c_str(),
                   ldst.TensorShape::to_string().c_str(), t0, t1, t0 / t1);
        }
    }
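    // The cmp_* helpers copy both results back to the CPU and assert that the
    // dense and channel-wise runs produced matching values.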
    void cmp_dst() {
        Tensor<> dst0_cpu(handle_cpu, ldst), dst1_cpu(handle_cpu, ldst);
        megdnn_memcpy_D2H(handle, dst0_cpu.ptr(), dst0->ptr(),
                          ldst.span().dist_byte());
        megdnn_memcpy_D2H(handle, dst1_cpu.ptr(), dst1->ptr(),
                          ldst.span().dist_byte());
        dst0_cpu.check_with(dst1_cpu);
    }

    void cmp_src() {
        Tensor<> src0_cpu(handle_cpu, lsrc), src1_cpu(handle_cpu, lsrc);
        megdnn_memcpy_D2H(handle, src0_cpu.ptr(), src0->ptr(),
                          lsrc.span().dist_byte());
        megdnn_memcpy_D2H(handle, src1_cpu.ptr(), src1->ptr(),
                          lsrc.span().dist_byte());
        src0_cpu.check_with(src1_cpu);
    }

    void cmp_flt() {
        Tensor<> flt0_cpu(handle_cpu, lflt0), flt1_cpu(handle_cpu, lflt1);
        float* p0 = flt0_cpu.ptr();
        float* p1 = flt1_cpu.ptr();
        megdnn_memcpy_D2H(handle, p0, flt0->ptr(), lflt0.span().dist_byte());
        megdnn_memcpy_D2H(handle, p1, flt1->ptr(), lflt1.span().dist_byte());
        size_t IC = lflt1[0], CHL_MUL = lflt1[1], FSIZE = lflt1[3] * lflt1[4];
        double tot_err = 0, tot_err_num = 0;
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                auto t0 = p0 + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                     t1 = p1 + (i * CHL_MUL + j) * FSIZE;
                for (size_t k = 0; k < FSIZE; ++k) {
                    auto err = std::abs(diff(t0[k], t1[k]));
                    tot_err += err;
                    tot_err_num += 1;
                    ASSERT_LT(err, 1e-2)
                            << "failed at " << i << " " << j << " " << k
                            << " vals=" << t0[k] << "," << t1[k];
                }
            }
        }
        auto avg_err = tot_err / tot_err_num;
        ASSERT_LT(avg_err, 1e-4);
    }
};

}  // anonymous namespace
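// All tests below run in cross-correlation mode; M is just shorthand for it.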
constexpr auto M = Convolution::Mode::CROSS_CORRELATION;

TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD) {
    Checker<Convolution> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                              "CHANNEL_WISE", {})
                              .c_str(),
                      {}}}},
            &require_algo));
    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-2);
        // simple case
        // clang-format off
        for (uint32_t s : {1, 2})
        for (uint32_t p : {0, 1, 2, 3})
        for (size_t f : {2, 3, 5, 7})
        for (size_t ocpg : {1, 3}) {
            checker.set_param(gconv_param({M, p, p, s, s}))
                    .execs({{2, 3, 16, 16}, {3, ocpg, 1, f, f}, {}});
        }
        // clang-format on
        checker.set_param(gconv_param({M, 2, 3, 2, 1}))
                .execs({{32, 12, 20, 10}, {12, 2, 1, 4, 5}, {}});
        // padding larger than kern
        checker.set_param(gconv_param({M, 20, 30, 4, 5}))
                .execs({{32, 12, 20, 10}, {12, 2, 1, 4, 5}, {}});
    }
}
TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_SMALL) {
    Checker<Convolution> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                              "CHANNEL_WISE_SMALL", {})
                              .c_str(),
                      {}}}},
            &require_algo));
    for (auto dtype : std::vector<DType>{
                 dtype::Float32(),
#if CUDA_VERSION >= 9000
                 dtype::Float16()
#endif
         }) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-2);
        // clang-format off
        for (uint32_t s : {1})
        for (uint32_t f : {1, 3, 5, 7}) {
            checker.set_param(gconv_param({M, f / 2, f / 2, s, s}))
                    .execs({{2, 3, 16, 16}, {3, 1, 1, f, f}, {}});
        }
        // clang-format on
        checker.set_param(gconv_param({M, 1, 1, 1, 1}))
                .execs({{2, 3, 3, 16}, {3, 1, 1, 3, 3}, {}})
                .execs({{2, 3, 8, 3}, {3, 1, 1, 3, 3}, {}});
    }
}
TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA) {
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            "CHANNEL_WISE", &require_algo));
    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(1e-1);
        // simple case
        // clang-format off
        for (uint32_t s : {1, 2})
        for (uint32_t p : {0, 1, 2, 3})
        for (size_t f : {1, 2, 3, 5, 7})
        for (size_t ocpg : {1, 3}) {
            size_t ii = infer_conv_shape(16, f, s, p, true);
            checker.set_param(gconv_param({M, p, p, s, s}))
                    .execs({{3, ocpg, 1, f, f},
                            {2, 3 * ocpg, ii, ii},
                            {2, 3, 16, 16}});
        }
        // clang-format on
        checker.set_param(gconv_param({M, 2, 3, 2, 1}))
                .execs({{12, 3, 1, 4, 5}, {32, 36, 20, 10}, {32, 12, 39, 8}});
        checker.set_param(gconv_param({M, 30, 20, 5, 4}))
                .execs({{6, 2, 1, 5, 4}, {32, 12, 12, 10}, {32, 6, 3, 2}});
        checker.set_param(gconv_param({M, 20, 30, 4, 5}))
                .execs({{6, 2, 1, 4, 5}, {32, 12, 10, 12}, {32, 6, 2, 3}});
    }
}
TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_SMALL) {
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            "CHANNEL_WISE_SMALL", &require_algo));
    for (auto dtype : std::vector<DType>{
                 dtype::Float32(),
#if CUDA_VERSION >= 9000
                 dtype::Float16()
#endif
         }) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-2);
        for (uint32_t f : {1, 3, 5, 7}) {
            checker.set_param(gconv_param({M, f / 2, f / 2, 1, 1}))
                    .execs({{3, 1, 1, f, f}, {2, 3, 16, 16}, {2, 3, 16, 16}});
        }
        checker.set_param(gconv_param({M, 1, 1, 1, 1}))
                .execs({{3, 1, 1, 3, 3}, {2, 3, 3, 16}, {2, 3, 3, 16}})
                .execs({{3, 1, 1, 3, 3}, {2, 3, 8, 3}, {2, 3, 8, 3}});
    }
}
TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER) {
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
            "CHANNEL_WISE", &require_algo));
    UniformFloatRNG rng(-0.1, 0.1);
    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
        checker.set_dtype(0, dtype)
                .set_dtype(1, dtype)
                .set_dtype(2, dtype)
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-1);
        // simple case
        // clang-format off
        for (uint32_t s : {1, 2})
        for (uint32_t p : {0, 1, 2, 3})
        for (uint32_t f : {1, 2, 3, 5, 7})
        for (uint32_t ocpg : {1, 3})
        for (uint32_t i : {8, 16, 32, 64}) {
            size_t ii = infer_conv_shape(i, f, s, p, true);
            checker.set_param(gconv_param({M, p, p, s, s}))
                    .execs({{2, 3, i, i},
                            {2, 3 * ocpg, ii, ii},
                            {3, ocpg, 1, f, f}});
        }
        // clang-format on
        // padding larger than kern
        checker.set_param(gconv_param({M, 20, 30, 4, 5}))
                .execs({{32, 6, 2, 3}, {32, 12, 10, 12}, {6, 2, 1, 4, 5}});
        // unused filter items
        checker.set_param(gconv_param({M, 2, 3, 2, 3}))
                .execs({{32, 6, 1, 1}, {32, 12, 1, 1}, {6, 2, 1, 5, 7}});
    }
}
#if MEGDNN_WITH_BENCHMARK
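// The *_BENCH_CHECK tests run the same problem through a dense convolution
// (conv0) and the channel-wise implementation (conv1), then check that the
// results agree; timings are printed when the verbose env vars are set.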
TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionForward>();
    auto conv1 = handle->create_operator<ConvolutionForward>();
    BenchmarkEnv<0, 1, 2> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL,
                   size_t FH, size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_src();
        benv.fill_flt();
        benv.exec_convolution(conv0.get(), conv1.get());
        benv.cmp_dst();
    };
    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}
TEST_F(CUDA, CHANWISE_CONVOLUTION_BWD_DATA_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionBackwardData>();
    auto conv1 = handle->create_operator<ConvolutionBackwardData>();
    BenchmarkEnv<1, 2, 0> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL,
                   size_t FH, size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_dst();
        benv.fill_flt();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_src();
    };
    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}
TEST_F(CUDA, CHANWISE_CONVOLUTION_BWD_FILTER_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionBackwardFilter>();
    auto conv1 = handle->create_operator<ConvolutionBackwardFilter>();
    BenchmarkEnv<0, 2, 1> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL,
                   size_t FH, size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_src();
        benv.fill_dst();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_flt();
    };
    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}
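// The *_BENCH_ALL_ALGO tests construct OprProxy with profiling enabled; the
// proxy is expected to time the applicable algorithms and execute with the
// fastest one found.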
TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_FWD) {
    // enable profiling
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    proxy->warmup_times = 1;
    proxy->exec_times = 10;
    Benchmarker<ConvolutionForward> checker(handle_cuda());
    checker.set_times(1);
    ConvolutionForward::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.set_proxy(proxy);
    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH,
                   size_t FW) {
        checker.proxy()->target_execution_policy = {};
        checker.execs({{N, C, IH, IW}, {C, 1, 1, FH, FW}, {}});
    };
    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}
TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_DATA) {
    // enable profiling
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    proxy->warmup_times = 1;
    proxy->exec_times = 10;
    Benchmarker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_times(1);
    ConvolutionBackwardData::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.set_proxy(proxy);
    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH,
                   size_t FW) {
        checker.proxy()->target_execution_policy.algo.reset();
        checker.execs({{C, 1, 1, FH, FW},
                       {N, C, IH - FH + 1, IW - FW + 1},
                       {N, C, IH, IW}});
    };
    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}
TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_FILTER) {
    // enable profiling
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    proxy->warmup_times = 1;
    proxy->exec_times = 10;
    Benchmarker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_times(1);
    ConvolutionBackwardFilter::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.set_proxy(proxy);
    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH,
                   size_t FW) {
        checker.proxy()->target_execution_policy.algo.reset();
        checker.execs({{N, C, IH, IW},
                       {N, C, IH - FH + 1, IW - FW + 1},
                       {C, 1, 1, FH, FW}});
    };
    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 10;
    bencher.set_display(false).set_times(RUNS);
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    bencher.set_proxy(proxy);
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f,
                   size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout({src, dtype::Float32()}, {filter, dtype::Float32()},
                           dst_layout);
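        // Rough memory-traffic estimate: total elements touched, in Gi
        // elements, scaled by 1e3 so that multiplying by the element size and
        // dividing by a time in milliseconds yields GiB per second.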
        float bandwith = static_cast<float>(src.total_nr_elems() +
                                            filter.total_nr_elems() +
                                            dst_layout.total_nr_elems()) /
                         (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;
        bencher.proxy()->target_execution_policy.algo.reset();
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bencher.set_param(param);
        auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "pseudo float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32) %.2f (fp16/pseudo fp16)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               time_in_ms_fp32, bandwith * 4 / time_in_ms_fp32,
               time_in_ms_fp16, bandwith * 2 / time_in_ms_fp16,
               time_in_ms_pseudo_fp16, bandwith * 2 / time_in_ms_pseudo_fp16,
               time_in_ms_fp32 / time_in_ms_fp16,
               time_in_ms_pseudo_fp16 / time_in_ms_fp16);
    };
    // clang-format off
    for (size_t s : {1, 2})
    for (size_t f : {3, 5, 7})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {128, 256})
    for (size_t iw : {128, 256})
        run(batch, c, ih, iw, f, s);
    // clang-format on
    run(128, 192, 28, 28, 3, 1);
    run(128, 192, 28, 28, 3, 2);
    run(128, 576, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 32, 112, 112, 3, 1);
    run(128, 960, 7, 7, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 144, 56, 56, 3, 2);
    run(128, 384, 14, 14, 3, 1);
    run(128, 144, 56, 56, 3, 1);
    run(128, 96, 112, 112, 3, 2);
    run(128, 384, 14, 14, 3, 1);
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
    run(128, 576, 14, 14, 3, 2);
}
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                              "CHANNEL_WISE", {})
                              .c_str(),
                      {}}}}));
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f,
                   size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout({src, dtype::Float32()}, {filter, dtype::Float32()},
                           dst_layout);
        float bandwith = static_cast<float>(src.total_nr_elems() +
                                            filter.total_nr_elems() +
                                            dst_layout.total_nr_elems()) /
                         (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               time_in_ms_fp32, bandwith * 4 / time_in_ms_fp32,
               time_in_ms_fp16, bandwith * 2 / time_in_ms_fp16,
               time_in_ms_fp32 / time_in_ms_fp16);
    };
    // clang-format off
    for (size_t s : {1})
    for (size_t f : {3, 5, 7})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32, 128, 256})
    for (size_t iw : {8, 16, 32, 128, 256})
        run(batch, c, ih, iw, f, s);
    // clang-format on
}
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT_SMALL) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f,
                   size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout({src, dtype::Float32()}, {filter, dtype::Float32()},
                           dst_layout);
        float bandwith = static_cast<float>(src.total_nr_elems() +
                                            filter.total_nr_elems() +
                                            dst_layout.total_nr_elems()) /
                         (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_before_exec_callback(AlgoChecker<ConvolutionForward>(
                        ExecutionPolicyAlgoName{
                                "DEFAULT",
                                {{ConvBiasForward::algo_name<
                                          ConvBiasForward::DirectParam>(
                                          "CHANNEL_WISE", {})
                                          .c_str(),
                                  {}}}}));
        auto time_in_ms_fp32_normal = bencher.execs({src, filter, {}}) / RUNS;
        bencher.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
                ExecutionPolicyAlgoName{
                        "DEFAULT",
                        {{ConvBiasForward::algo_name<
                                  ConvBiasForward::DirectParam>(
                                  "CHANNEL_WISE_SMALL", {})
                                  .c_str(),
                          {}}}}));
        auto time_in_ms_fp32_small = bencher.execs({src, filter, {}}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16_small = bencher.execs({src, filter, {}}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, fp32 normal: %.2fms %.2fGB/s "
               "small: %.2fms %.2fGB/s, fp16 small: %.2fms %.2fGB/s, "
               "speedup: "
               "%0.2f (fp32 small/normal) %0.2f (small fp16/fp32)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               time_in_ms_fp32_normal, bandwith * 4 / time_in_ms_fp32_normal,
               time_in_ms_fp32_small, bandwith * 4 / time_in_ms_fp32_small,
               time_in_ms_fp16_small, bandwith * 2 / time_in_ms_fp16_small,
               time_in_ms_fp32_normal / time_in_ms_fp32_small,
               time_in_ms_fp32_small / time_in_ms_fp16_small);
    };
    // clang-format off
    for (size_t s : {1})
    for (size_t f : {3, 5})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32})
    for (size_t iw : {8, 16, 32})
        run(batch, c, ih, iw, f, s);
    // clang-format on
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 960, 7, 7, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
}
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_CUDNN_DNN) {
    CUBenchmarker<ConvBiasForward> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    ConvBias::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = ConvBias::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f,
                   size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::ConvBias::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f},
                    bias = {1, c, 1, 1};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<ConvBias>();
        opr->param() = param;
        opr->deduce_layout({src, dtype::Float32()}, {filter, dtype::Float32()},
                           {bias, dtype::Float32()}, {}, dst_layout);
        float computation_mops =
                static_cast<float>(dst_layout.total_nr_elems() * f * f * 2) *
                1e-6;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.set_before_exec_callback(
                AlgoChecker<ConvBiasForward>(".+CHANNEL_WISE.+"));
        auto time_in_ms_dnn = bencher.execs({src, filter, bias, {}, {}}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.set_before_exec_callback(AlgoChecker<ConvBiasForward>(
                ".+CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM.+"));
        auto time_in_ms_cudnn =
                bencher.execs({src, filter, bias, {}, {}}) / RUNS;
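        // computation_mops counts multiply-adds in Mops, so dividing by a
        // time in milliseconds yields Gop/s (printed below as GFLOPS).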
        printf("stride=%zu src=%s, filter=%s, dst=%s, dnn: %.2fms %.2fGFLOPS "
               "cudnn: %.2fms %.2fGFLOPS "
               "speedup: "
               "%0.2f (dnn/cudnn)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               dst_layout.to_string().c_str(), time_in_ms_dnn,
               computation_mops / time_in_ms_dnn, time_in_ms_cudnn,
               computation_mops / time_in_ms_cudnn,
               time_in_ms_cudnn / time_in_ms_dnn);
    };
    // clang-format off
    for (size_t batch : {1, 16, 32, 64, 128}) {
        run(batch, 32, 112, 112, 3, 1);
        run(batch, 96, 112, 112, 3, 2);
        run(batch, 96, 112, 112, 3, 1);
        run(batch, 144, 56, 56, 3, 2);
        run(batch, 144, 56, 56, 3, 1);
        run(batch, 192, 28, 28, 3, 1);
        run(batch, 384, 14, 14, 3, 1);
        run(batch, 576, 14, 14, 3, 1);
        run(batch, 960, 7, 7, 3, 1);
        //! calibrate heu algo policy hw_size param
        run(batch, 144, 24, 24, 3, 1);
        run(batch, 144, 22, 22, 3, 1);
        run(batch, 144, 20, 20, 3, 1);
        run(batch, 144, 18, 18, 3, 1);
    }
    // clang-format on
}
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_DATA_FLOAT_SMALL) {
    CUBenchmarker<ConvolutionBackwardData> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    ConvolutionBackwardData::Param param;
    param.format = Convolution::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f,
                   size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        float bandwith = static_cast<float>(src.total_nr_elems() +
                                            filter.total_nr_elems() +
                                            src.total_nr_elems()) /
                         (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_before_exec_callback(
                        AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE"));
        auto time_in_ms_fp32_normal = bencher.execs({filter, src, src}) / RUNS;
        bencher.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE_SMALL"));
        auto time_in_ms_fp32_small = bencher.execs({filter, src, src}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16_small = bencher.execs({filter, src, src}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, fp32 normal: %.2fms %.2fGB/s "
               "small: %.2fms %.2fGB/s, fp16 small: %.2fms %.2fGB/s, "
               "speedup: "
               "%0.2f (fp32 small/normal) %0.2f (small fp16/fp32)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               time_in_ms_fp32_normal, bandwith * 4 / time_in_ms_fp32_normal,
               time_in_ms_fp32_small, bandwith * 4 / time_in_ms_fp32_small,
               time_in_ms_fp16_small, bandwith * 2 / time_in_ms_fp16_small,
               time_in_ms_fp32_normal / time_in_ms_fp32_small,
               time_in_ms_fp32_small / time_in_ms_fp16_small);
    };
    // clang-format off
    for (size_t s : {1})
    for (size_t f : {3, 5})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32})
    for (size_t iw : {8, 16, 32})
        run(batch, c, ih, iw, f, s);
    // clang-format on
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 960, 7, 7, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
}
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BWD_DATA) {
    CUBenchmarker<ConvolutionBackwardData> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE"));
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t ocpg, size_t group, size_t ih,
                   size_t iw, size_t f, size_t p, size_t s) {
        param.pad_h = p;
        param.pad_w = p;
        param.stride_h = s;
        param.stride_w = s;
        size_t oh, ow;
        infer_conv_shape2d(ih, iw, f, f, s, s, p, p, oh, ow, true);
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src_grad = {batch, group, ih, iw},
                    dst_grad = {batch, group * ocpg, oh, ow},
                    flt = {group, ocpg, 1, f, f};
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        float bandwith = static_cast<float>(flt.total_nr_elems() +
                                            dst_grad.total_nr_elems() +
                                            src_grad.total_nr_elems()) /
                         (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp32 = bencher.execs({flt, dst_grad, src_grad}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16 = bencher.execs({flt, dst_grad, src_grad}) / RUNS;
        printf("stride=%zu, src_grad=%s, flt=%s, "
               "float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32)\n",
               s, src_grad.to_string().c_str(), flt.to_string().c_str(),
               time_in_ms_fp32, bandwith * 4 / time_in_ms_fp32,
               time_in_ms_fp16, bandwith * 2 / time_in_ms_fp16,
               time_in_ms_fp32 / time_in_ms_fp16);
    };
    // clang-format off
    for (size_t s : {1, 2})
    for (size_t f : {3, 5, 7})
    for (size_t p : {f / 2})
    for (size_t batch : {64})
    for (size_t ocpg : {1})
    for (size_t group : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32, 128, 256})
    for (size_t iw : {8, 16, 32, 128, 256})
        run(batch, ocpg, group, ih, iw, f, p, s);
    // clang-format on
}
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BWD_FILTER) {
    CUBenchmarker<ConvolutionBackwardFilter> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardFilter>("CHANNEL_WISE"));
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t ocpg, size_t group, size_t i, size_t f,
                   size_t p, size_t s) {
        param.pad_h = p;
        param.pad_w = p;
        param.stride_h = s;
        param.stride_w = s;
        size_t d = infer_conv_shape(i, f, s, p, true);
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, group, i, i},
                    dst_grad = {batch, group * ocpg, d, d},
                    flt_grad = {group, ocpg, 1, f, f};
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        float bandwith = static_cast<float>(flt_grad.total_nr_elems() +
                                            dst_grad.total_nr_elems() +
                                            src.total_nr_elems()) /
                         (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp32 = bencher.execs({src, dst_grad, flt_grad}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16 = bencher.execs({src, dst_grad, flt_grad}) / RUNS;
        printf("stride=%zu, src=%s, flt_grad=%s, "
               "float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "speedup: "
               "%.2f (fp16/fp32)\n",
               s, src.to_string().c_str(), flt_grad.to_string().c_str(),
               time_in_ms_fp32, bandwith * 4 / time_in_ms_fp32,
               time_in_ms_fp16, bandwith * 2 / time_in_ms_fp16,
               time_in_ms_fp32 / time_in_ms_fp16);
    };
    // clang-format off
    for (size_t s : {1, 2})
    for (size_t f : {3, 5, 7})
    for (size_t p : {f / 2})
    for (size_t batch : {64})
    for (size_t ocpg : {1})
    for (size_t group : {16, 32, 64, 128})
    for (size_t i : {8, 16, 32, 64, 128})
        run(batch, ocpg, group, i, f, p, s);
    // clang-format on
}
#endif

// vim: syntax=cpp.doxygen

The MegEngine installation package bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU/GPU build. To run GPU programs, make sure the machine has a GPU and that its driver is installed. If you would like to try deep-learning development on cloud GPUs, visit the MegStudio platform.