
chanwise_convolution.cpp 16 kB

/**
 * \file dnn/test/rocm/chanwise_convolution.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */

#include "include/hcc_detail/hcc_defs_prologue.h"

#include "megdnn/oprs/nn.h"

#include "megcore_rocm.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/convolution.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/rocm/fixture.h"

#include "hip_header.h"

using namespace megdnn;
using namespace test;

namespace {

#if MEGDNN_WITH_BENCHMARK
bool check_need_full_bench() {
    if (getenv("MEGDNN_CHANWISE_CONV_FULLBENCH"))
        return true;
    printf("set MEGDNN_CHANWISE_CONV_FULLBENCH to run full benchmark\n");
    return false;
}
#endif

Convolution::Param gconv_param(Convolution::Param p) {
    p.sparse = Convolution::Param::Sparse::GROUP;
    return p;
}
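
// BenchmarkEnv runs one convolution operator on the dense filter layout and a
// second one forced into GROUP (channel-wise) sparse mode, on identical data.
// The template parameters P0/P1/P2 permute the (src, filter, dst) triple into
// the (a, b, c) argument order of the operator under test: <0, 1, 2> for
// forward, <1, 2, 0> for backward data (filter, diff, grad) and <0, 2, 1> for
// backward filter (src, diff, grad), matching the instantiations in the
// benchmark tests below.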

template <int P0, int P1, int P2>
class BenchmarkEnv {
    Handle *handle, *handle_cpu;
    std::unique_ptr<GaussianRNG> rng;
    TensorLayout lsrc, lflt0, lflt1, ldst;
    std::unique_ptr<Tensor<>> src0, src1, flt0, flt0_cpu, flt1, flt1_cpu, dst0,
            dst1;
    hipEvent_t hip_ev[3];
    hipStream_t hip_stream;
    size_t pad_h, pad_w;

    template <typename T>
    static std::tuple<T, T, T> shuffle(std::tuple<T, T, T> data) {
        return std::make_tuple(
                std::get<P0>(data), std::get<P1>(data), std::get<P2>(data));
    }

public:
    BenchmarkEnv(Handle* handle, Handle* handle_cpu) {
        this->handle = handle;
        this->handle_cpu = handle_cpu;
        rng = handle->create_operator<GaussianRNG>();
        // make cpu handle used
        handle_cpu->create_operator<Sleep>()->exec();
        for (int i = 0; i < 3; ++i)
            hipEventCreate(&hip_ev[i]);
        megcoreGetROCMStream(handle->megcore_computing_handle(), &hip_stream);
    }

    ~BenchmarkEnv() {
        for (int i = 0; i < 3; ++i)
            hipEventDestroy(hip_ev[i]);
    }

    void alloc(size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL,
               size_t FH, size_t FW, size_t PH, size_t PW) {
        pad_h = PH;
        pad_w = PW;
        auto mkly = [](const TensorShape& s) {
            return TensorLayout{s, dtype::Float32()};
        };
        lsrc = mkly({N, IC, IH, IW});
        lflt0 = mkly({CHL_MUL * IC, IC, FH, FW});
        lflt1 = mkly({IC, CHL_MUL, 1, FH, FW});
        ldst = mkly({N, IC * CHL_MUL, IH - FH + 1 + PH * 2, IW - FW + 1 + PW * 2});
        src0.reset(new Tensor<>(handle, lsrc));
        src1.reset(new Tensor<>(handle, lsrc));
        flt0.reset(new Tensor<>(handle, lflt0));
        flt0_cpu.reset(new Tensor<>(handle_cpu, lflt0));
        flt1.reset(new Tensor<>(handle, lflt1));
        flt1_cpu.reset(new Tensor<>(handle_cpu, lflt1));
        dst0.reset(new Tensor<>(handle, ldst));
        dst1.reset(new Tensor<>(handle, ldst));
    }

    void fill_src() {
        rng->exec(src0->tensornd(), {});
        megdnn_memcpy_D2D(handle, src1->ptr(), src0->ptr(),
                          lsrc.span().dist_byte());
    }
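
    // fill_flt draws a random filter in the GROUP layout
    // {IC, CHL_MUL, 1, FH, FW} (flt1) and embeds it into the equivalent dense
    // layout {CHL_MUL * IC, IC, FH, FW} (flt0): each output channel of a
    // channel-wise convolution reads a single input channel, so the dense
    // filter is zero everywhere except one FHxFW block per
    // (input channel, multiplier) pair.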
    void fill_flt() {
        rng->exec(flt1->tensornd(), {});
        megdnn_memcpy_D2H(handle, flt1_cpu->ptr(), flt1->ptr(),
                          lflt1.span().dist_byte());

        const size_t IC = lflt1[0], CHL_MUL = lflt1[1],
                     FSIZE = lflt1[3] * lflt1[4];

        // fill flt0 from flt1
        float* src = flt1_cpu->ptr();
        float* dst = flt0_cpu->ptr();
        memset(dst, 0, lflt0.span().dist_byte());
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                memcpy(dst + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                       src + (i * CHL_MUL + j) * FSIZE, FSIZE * sizeof(float));
            }
        }

        megdnn_memcpy_H2D(handle, flt0->ptr(), dst, lflt0.span().dist_byte());
    }

    void fill_dst() {
        rng->exec(dst0->tensornd(), {});
        megdnn_memcpy_D2D(handle, dst1->ptr(), dst0->ptr(),
                          ldst.span().dist_byte());
    }
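
    // exec runs opr0 (dense) and opr1 (GROUP) back to back on the same HIP
    // stream, bracketed by hipEventRecord calls so each implementation can be
    // timed separately; with MEGDNN_CHANWISE_CONV_VERBOSE or
    // MEGDNN_CHANWISE_CONV_FULLBENCH set, the miopen/megdnn timing ratio is
    // printed after synchronizing the stream.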
    template <class Opr>
    void exec(Opr* opr0, Opr* opr1) {
        opr0->param().pad_h = pad_h;
        opr0->param().pad_w = pad_w;
        opr1->param() = opr0->param();
        opr1->param().sparse = param::Convolution::Sparse::GROUP;
        TensorND a0, b0, c0, a1, b1, c1;
        std::tie(a0, b0, c0) = shuffle(std::make_tuple(
                src0->tensornd(), flt0->tensornd(), dst0->tensornd()));
        std::tie(a1, b1, c1) = shuffle(std::make_tuple(
                src1->tensornd(), flt1->tensornd(), dst1->tensornd()));
        WorkspaceWrapper wk(
                handle,
                std::max(opr0->get_workspace_in_bytes(a0.layout, b0.layout,
                                                      c0.layout),
                         opr1->get_workspace_in_bytes(a1.layout, b1.layout,
                                                      c1.layout)));
        hipProfilerStart();
        hipEventRecord(hip_ev[0], hip_stream);
        opr0->exec(a0, b0, c0, wk.workspace());
        hipEventRecord(hip_ev[1], hip_stream);
        opr1->exec(a1, b1, c1, wk.workspace());
        hipEventRecord(hip_ev[2], hip_stream);
        hipProfilerStop();
        if (getenv("MEGDNN_CHANWISE_CONV_VERBOSE") ||
            getenv("MEGDNN_CHANWISE_CONV_FULLBENCH")) {
            hipStreamSynchronize(hip_stream);
            float t0 = -1, t1 = -1;
            hipEventElapsedTime(&t0, hip_ev[0], hip_ev[1]);
            hipEventElapsedTime(&t1, hip_ev[1], hip_ev[2]);
            printf("%s;%s;%s: miopen/megdnn: %.3fms/%.3fms=%.3f\n",
                   lsrc.TensorShape::to_string().c_str(),
                   lflt1.TensorShape::to_string().c_str(),
                   ldst.TensorShape::to_string().c_str(), t0, t1, t0 / t1);
        }
    }

    void cmp_dst() {
        Tensor<> dst0_cpu(handle_cpu, ldst), dst1_cpu(handle_cpu, ldst);
        megdnn_memcpy_D2H(handle, dst0_cpu.ptr(), dst0->ptr(),
                          ldst.span().dist_byte());
        megdnn_memcpy_D2H(handle, dst1_cpu.ptr(), dst1->ptr(),
                          ldst.span().dist_byte());
        dst0_cpu.check_with(dst1_cpu);
    }

    void cmp_src() {
        Tensor<> src0_cpu(handle_cpu, lsrc), src1_cpu(handle_cpu, lsrc);
        megdnn_memcpy_D2H(handle, src0_cpu.ptr(), src0->ptr(),
                          lsrc.span().dist_byte());
        megdnn_memcpy_D2H(handle, src1_cpu.ptr(), src1->ptr(),
                          lsrc.span().dist_byte());
        src0_cpu.check_with(src1_cpu);
    }

    void cmp_flt() {
        Tensor<> flt0_cpu(handle_cpu, lflt0), flt1_cpu(handle_cpu, lflt1);
        float* p0 = flt0_cpu.ptr();
        float* p1 = flt1_cpu.ptr();
        megdnn_memcpy_D2H(handle, p0, flt0->ptr(), lflt0.span().dist_byte());
        megdnn_memcpy_D2H(handle, p1, flt1->ptr(), lflt1.span().dist_byte());
        size_t IC = lflt1[0], CHL_MUL = lflt1[1], FSIZE = lflt1[3] * lflt1[4];
        double tot_err = 0, tot_err_num = 0;
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                auto t0 = p0 + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                     t1 = p1 + (i * CHL_MUL + j) * FSIZE;
                for (size_t k = 0; k < FSIZE; ++k) {
                    auto err = std::abs(diff(t0[k], t1[k]));
                    tot_err += err;
                    tot_err_num += 1;
                    ASSERT_LT(err, 1e-2)
                            << "failed at " << i << " " << j << " " << k
                            << " vals=" << t0[k] << "," << t1[k];
                }
            }
        }
        auto avg_err = tot_err / tot_err_num;
        ASSERT_LT(avg_err, 1e-4);
    }
};

}  // anonymous namespace

constexpr auto M = Convolution::Mode::CROSS_CORRELATION;
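
// Filter shapes below use megdnn's GROUP convolution layout
// {group, ocpg, icpg, fh, fw}; channel-wise convolution has one group per
// input channel, hence filters of the form {C, CHL_MUL, 1, FH, FW}. The
// positional initializer passed to gconv_param fills the leading
// Convolution::Param fields, which here are
// {mode, pad_h, pad_w, stride_h, stride_w}.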

TEST_F(ROCM, CHANWISE_CONVOLUTION_FORWARD) {
    Checker<Convolution> checker(handle_rocm());
    bool require_algo = false;
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionForward>("CHANNEL_WISE", &require_algo));

    // simple case
    checker.set_param(gconv_param({M, 0, 0, 1, 1}))
            .execs({{1, 1, 2, 2}, {1, 1, 1, 2, 2}, {}})
            .execs({{1, 1, 5, 5}, {1, 1, 1, 2, 2}, {}});

    checker.execs({{2, 2, 5, 5}, {2, 3, 1, 2, 2}, {2, 6, 4, 4}});

    checker.set_param(gconv_param({M, 1, 1, 1, 1}))
            .execs({{2, 2, 5, 5}, {2, 1, 1, 2, 2}, {}});

    checker.set_param(gconv_param({M, 2, 3, 2, 1}))
            .execs({{32, 12, 20, 10}, {12, 2, 1, 4, 5}, {}});

    // padding larger than kern
    checker.set_param(gconv_param({M, 20, 30, 4, 5}))
            .execs({{32, 12, 20, 10}, {12, 2, 1, 4, 5}, {}});
}

TEST_F(ROCM, CHANWISE_CONVOLUTION_BACKWARD_DATA) {
    Checker<ConvolutionBackwardData> checker(handle_rocm());
    checker.set_param(gconv_param({M, 0, 0, 1, 1}))
            .execs({{1, 1, 1, 2, 2}, {1, 1, 1, 1}, {1, 1, 2, 2}})
            .execs({{1, 1, 1, 2, 2}, {1, 1, 5, 5}, {1, 1, 6, 6}});

    checker.execs({{2, 1, 1, 2, 2}, {1, 2, 1, 1}, {1, 2, 2, 2}})
            .execs({{2, 1, 1, 2, 2}, {1, 2, 5, 5}, {1, 2, 6, 6}})
            .execs({{2, 3, 1, 2, 2}, {2, 6, 5, 5}, {2, 2, 6, 6}});

    checker.set_param(gconv_param({M, 1, 1, 1, 1}))
            .execs({{2, 1, 1, 2, 2}, {2, 2, 5, 5}, {2, 2, 4, 4}});

    checker.set_param(gconv_param({M, 2, 3, 2, 1}))
            .execs({{12, 3, 1, 4, 5}, {32, 36, 20, 10}, {32, 12, 39, 8}});

    // padding larger than kern
    checker.set_param(gconv_param({M, 20, 30, 4, 5}))
            .execs({{6, 2, 1, 4, 5}, {32, 12, 10, 12}, {32, 6, 2, 3}});
}

TEST_F(ROCM, CHANWISE_CONVOLUTION_BACKWARD_FILTER) {
    Checker<ConvolutionBackwardFilter> checker(handle_rocm());
    checker.set_param(gconv_param({M, 0, 0, 1, 1}))
            .execs({{1, 1, 2, 2}, {1, 1, 1, 1}, {1, 1, 1, 2, 2}})
            .execs({{1, 1, 6, 6}, {1, 1, 5, 5}, {1, 1, 1, 2, 2}})
            .execs({{256, 1, 2, 2}, {256, 1, 1, 1}, {1, 1, 1, 2, 2}});

    checker.execs({{1, 2, 2, 2}, {1, 2, 1, 1}, {2, 1, 1, 2, 2}})
            .execs({{1, 2, 6, 6}, {1, 2, 5, 5}, {2, 1, 1, 2, 2}})
            .execs({{2, 2, 6, 6}, {2, 6, 5, 5}, {2, 3, 1, 2, 2}});

    checker.set_param(gconv_param({M, 1, 1, 1, 1}))
            .execs({{2, 2, 4, 4}, {2, 2, 5, 5}, {2, 1, 1, 2, 2}});

    checker.set_param(gconv_param({M, 0, 0, 1, 1}))
            .execs({{40960, 1, 1, 1}, {40960, 1, 1, 1}, {1, 1, 1, 1, 1}});

    checker.set_param(gconv_param({M, 2, 3, 2, 1}))
            .execs({{32, 12, 39, 8}, {32, 36, 20, 10}, {12, 3, 1, 4, 5}});

    // padding larger than kern
    checker.set_param(gconv_param({M, 20, 30, 4, 5}))
            .execs({{32, 6, 2, 3}, {32, 12, 10, 12}, {6, 2, 1, 4, 5}});

    // unused filter items
    checker.set_param(gconv_param({M, 2, 3, 2, 3}))
            .execs({{32, 6, 1, 1}, {32, 12, 1, 1}, {6, 2, 1, 5, 7}});
}
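
// The *_BENCH_CHECK tests below drive BenchmarkEnv: they run the dense
// (MIOpen) and channel-wise implementations on identical random inputs,
// compare the results, and report per-implementation timings. Only the small
// case runs by default; set MEGDNN_CHANWISE_CONV_FULLBENCH for the rest.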

#if MEGDNN_WITH_BENCHMARK
TEST_F(ROCM, CHANWISE_CONVOLUTION_FORWARD_BENCH_CHECK) {
    auto handle = handle_rocm();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionForward>();
    auto conv1 = handle->create_operator<ConvolutionForward>();
    BenchmarkEnv<0, 1, 2> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL,
                   size_t FH, size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_src();
        benv.fill_flt();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_dst();
    };
    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}

TEST_F(ROCM, CHANWISE_CONVOLUTION_BWD_DATA_BENCH_CHECK) {
    auto handle = handle_rocm();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionBackwardData>();
    auto conv1 = handle->create_operator<ConvolutionBackwardData>();
    BenchmarkEnv<1, 2, 0> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL,
                   size_t FH, size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_dst();
        benv.fill_flt();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_src();
    };
    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}

TEST_F(ROCM, CHANWISE_CONVOLUTION_BWD_FILTER_BENCH_CHECK) {
    auto handle = handle_rocm();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionBackwardFilter>();
    auto conv1 = handle->create_operator<ConvolutionBackwardFilter>();
    BenchmarkEnv<0, 2, 1> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL,
                   size_t FH, size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_src();
        benv.fill_dst();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_flt();
    };
    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}
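
// The BENCH_ALL_ALGO tests construct OprProxy with profiling enabled so that,
// as far as the proxy supports it, the applicable algorithms are profiled for
// each shape rather than a single heuristic choice.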

TEST_F(ROCM, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_FWD) {
    // enable profiling
    OprProxy<ConvolutionForward> proxy(true);
    proxy.warmup_times = 1;
    proxy.exec_times = 10;
    Benchmarker<ConvolutionForward> checker(handle_rocm());
    checker.set_times(1);
    ConvolutionForward::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH,
                   size_t FW) {
        checker.set_proxy(proxy);
        checker.execs({{N, C, IH, IW}, {C, 1, 1, FH, FW}, {}});
    };
    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}

TEST_F(ROCM, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_DATA) {
    // enable profiling
    OprProxy<ConvolutionBackwardData> proxy(true);
    proxy.warmup_times = 1;
    proxy.exec_times = 10;
    Benchmarker<ConvolutionBackwardData> checker(handle_rocm());
    checker.set_times(1);
    ConvolutionBackwardData::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH,
                   size_t FW) {
        checker.set_proxy(proxy);
        checker.execs({{C, 1, 1, FH, FW},
                       {N, C, IH - FH + 1, IW - FW + 1},
                       {N, C, IH, IW}});
    };
    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}

TEST_F(ROCM, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_FILTER) {
    // enable profiling
    OprProxy<ConvolutionBackwardFilter> proxy(true);
    proxy.warmup_times = 1;
    proxy.exec_times = 10;
    Benchmarker<ConvolutionBackwardFilter> checker(handle_rocm());
    checker.set_times(1);
    ConvolutionBackwardFilter::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH,
                   size_t FW) {
        checker.set_proxy(proxy);
        checker.execs({{N, C, IH, IW},
                       {N, C, IH - FH + 1, IW - FW + 1},
                       {C, 1, 1, FH, FW}});
    };
    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}
#endif

// vim: syntax=cpp.doxygen
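
These cases are ordinary gtest tests; assuming the standard megdnn test binary (megdnn_test), a filter such as --gtest_filter='ROCM.CHANWISE_CONVOLUTION*' selects them, and the MEGDNN_CHANWISE_CONV_FULLBENCH / MEGDNN_CHANWISE_CONV_VERBOSE environment variables control the benchmark behavior described in the comments above.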
