You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they may include dashes ('-') and can be up to 35 characters long.

chanwise_convolution.cpp 16 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430
  1. /**
  2. * \file dnn/test/rocm/chanwise_convolution.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "include/hcc_detail/hcc_defs_prologue.h"
  12. #include "megdnn/oprs/nn.h"
  13. #include "megcore_rocm.h"
  14. #include "test/common/benchmarker.h"
  15. #include "test/common/checker.h"
  16. #include "test/common/convolution.h"
  17. #include "test/common/tensor.h"
  18. #include "test/common/workspace_wrapper.h"
  19. #include "test/rocm/fixture.h"
  20. #include "hip_header.h"
  21. using namespace megdnn;
  22. using namespace test;
  23. namespace {
  24. #if MEGDNN_WITH_BENCHMARK
  25. bool check_need_full_bench() {
  26. if (getenv("MEGDNN_CHANWISE_CONV_FULLBENCH"))
  27. return true;
  28. printf("set MEGDNN_CHANWISE_CONV_FULLBENCH to run full benchmark\n");
  29. return false;
  30. }
  31. #endif
  32. Convolution::Param gconv_param(Convolution::Param p) {
  33. p.sparse = Convolution::Param::Sparse::GROUP;
  34. return p;
  35. }
/*!
 * \brief environment that runs the same channel-wise convolution twice --
 * once with a dense filter (flt0, fed to opr0) and once with a group
 * (channel-wise) filter (flt1, fed to opr1) -- timing both with HIP events
 * and comparing the results.
 *
 * Template parameters P0/P1/P2 permute the (src, filter, dst) tensor triple
 * before Opr::exec, so the same class serves forward, backward-data and
 * backward-filter operators (the callers instantiate <0,1,2>, <1,2,0> and
 * <0,2,1> respectively).
 */
template <int P0, int P1, int P2>
class BenchmarkEnv {
    Handle *handle, *handle_cpu;
    std::unique_ptr<GaussianRNG> rng;
    // lflt0: dense filter layout {IC*CHL_MUL, IC, FH, FW}
    // lflt1: group (channel-wise) filter layout {IC, CHL_MUL, 1, FH, FW}
    TensorLayout lsrc, lflt0, lflt1, ldst;
    std::unique_ptr<Tensor<>> src0, src1, flt0, flt0_cpu, flt1, flt1_cpu, dst0, dst1;
    hipEvent_t hip_ev[3];  // recorded before opr0, between oprs, after opr1
    hipStream_t hip_stream;
    size_t pad_h, pad_w;

    //! reorder a (src, filter, dst) tuple according to P0/P1/P2
    template <typename T>
    static std::tuple<T, T, T> shuffle(std::tuple<T, T, T> data) {
        return std::make_tuple(
                std::get<P0>(data), std::get<P1>(data), std::get<P2>(data));
    }

public:
    BenchmarkEnv(Handle* handle, Handle* handle_cpu) {
        this->handle = handle;
        this->handle_cpu = handle_cpu;
        rng = handle->create_operator<GaussianRNG>();
        // make cpu handle used
        handle_cpu->create_operator<Sleep>()->exec();
        for (int i = 0; i < 3; ++i)
            hipEventCreate(&hip_ev[i]);
        megcoreGetROCMStream(handle->megcore_computing_handle(), &hip_stream);
    }

    ~BenchmarkEnv() {
        // hip_stream belongs to the handle; only the events are owned here
        for (int i = 0; i < 3; ++i)
            hipEventDestroy(hip_ev[i]);
    }

    //! (re)allocate all tensors for the given problem size;
    //! the dst spatial size formula assumes stride 1
    void alloc(
            size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL, size_t FH,
            size_t FW, size_t PH, size_t PW) {
        pad_h = PH;
        pad_w = PW;
        auto mkly = [](const TensorShape& s) {
            return TensorLayout{s, dtype::Float32()};
        };
        lsrc = mkly({N, IC, IH, IW});
        lflt0 = mkly({CHL_MUL * IC, IC, FH, FW});
        lflt1 = mkly({IC, CHL_MUL, 1, FH, FW});
        ldst = mkly({N, IC * CHL_MUL, IH - FH + 1 + PH * 2, IW - FW + 1 + PW * 2});
        src0.reset(new Tensor<>(handle, lsrc));
        src1.reset(new Tensor<>(handle, lsrc));
        flt0.reset(new Tensor<>(handle, lflt0));
        flt0_cpu.reset(new Tensor<>(handle_cpu, lflt0));
        flt1.reset(new Tensor<>(handle, lflt1));
        flt1_cpu.reset(new Tensor<>(handle_cpu, lflt1));
        dst0.reset(new Tensor<>(handle, ldst));
        dst1.reset(new Tensor<>(handle, ldst));
    }

    //! fill src0 with gaussian noise and copy it into src1
    void fill_src() {
        rng->exec(src0->tensornd(), {});
        megdnn_memcpy_D2D(handle, src1->ptr(), src0->ptr(), lsrc.span().dist_byte());
    }

    //! fill flt1 with gaussian noise, then build the equivalent dense filter
    //! flt0 on the cpu and upload it to the device
    void fill_flt() {
        rng->exec(flt1->tensornd(), {});
        megdnn_memcpy_D2H(
                handle, flt1_cpu->ptr(), flt1->ptr(), lflt1.span().dist_byte());
        const size_t IC = lflt1[0], CHL_MUL = lflt1[1], FSIZE = lflt1[3] * lflt1[4];
        // fill flt0 from flt1: each group's kernels land on the diagonal of
        // the dense {OC, IC, FH, FW} filter; everything else stays zero
        float* src = flt1_cpu->ptr();
        float* dst = flt0_cpu->ptr();
        memset(dst, 0, lflt0.span().dist_byte());
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                memcpy(dst + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                       src + (i * CHL_MUL + j) * FSIZE, FSIZE * sizeof(float));
            }
        }
        megdnn_memcpy_H2D(handle, flt0->ptr(), dst, lflt0.span().dist_byte());
    }

    //! fill dst0 with gaussian noise and copy it into dst1 (used as the
    //! gradient input of the backward passes)
    void fill_dst() {
        rng->exec(dst0->tensornd(), {});
        megdnn_memcpy_D2D(handle, dst1->ptr(), dst0->ptr(), ldst.span().dist_byte());
    }

    //! run opr0 on the dense tensors and opr1 on the group tensors, timing
    //! each with HIP events; prints timings when the relevant env vars are set
    template <class Opr>
    void exec(Opr* opr0, Opr* opr1) {
        opr0->param().pad_h = pad_h;
        opr0->param().pad_w = pad_w;
        opr1->param() = opr0->param();
        opr1->param().sparse = param::Convolution::Sparse::GROUP;
        TensorND a0, b0, c0, a1, b1, c1;
        std::tie(a0, b0, c0) = shuffle(
                std::make_tuple(src0->tensornd(), flt0->tensornd(), dst0->tensornd()));
        std::tie(a1, b1, c1) = shuffle(
                std::make_tuple(src1->tensornd(), flt1->tensornd(), dst1->tensornd()));
        // single workspace sized for whichever opr needs more
        WorkspaceWrapper wk(
                handle,
                std::max(
                        opr0->get_workspace_in_bytes(a0.layout, b0.layout, c0.layout),
                        opr1->get_workspace_in_bytes(a1.layout, b1.layout, c1.layout)));
        hipProfilerStart();
        hipEventRecord(hip_ev[0], hip_stream);
        opr0->exec(a0, b0, c0, wk.workspace());
        hipEventRecord(hip_ev[1], hip_stream);
        opr1->exec(a1, b1, c1, wk.workspace());
        hipEventRecord(hip_ev[2], hip_stream);
        hipProfilerStop();
        if (getenv("MEGDNN_CHANWISE_CONV_VERBOSE") ||
            getenv("MEGDNN_CHANWISE_CONV_FULLBENCH")) {
            hipStreamSynchronize(hip_stream);
            float t0 = -1, t1 = -1;
            hipEventElapsedTime(&t0, hip_ev[0], hip_ev[1]);
            hipEventElapsedTime(&t1, hip_ev[1], hip_ev[2]);
            printf("%s;%s;%s: miopen/megdnn: %.3fms/%.3fms=%.3f\n",
                   lsrc.TensorShape::to_string().c_str(),
                   lflt1.TensorShape::to_string().c_str(),
                   ldst.TensorShape::to_string().c_str(), t0, t1, t0 / t1);
        }
    }

    //! download both dst tensors and compare them elementwise
    void cmp_dst() {
        Tensor<> dst0_cpu(handle_cpu, ldst), dst1_cpu(handle_cpu, ldst);
        megdnn_memcpy_D2H(handle, dst0_cpu.ptr(), dst0->ptr(), ldst.span().dist_byte());
        megdnn_memcpy_D2H(handle, dst1_cpu.ptr(), dst1->ptr(), ldst.span().dist_byte());
        dst0_cpu.check_with(dst1_cpu);
    }

    //! download both src tensors (grads of backward-data) and compare them
    void cmp_src() {
        Tensor<> src0_cpu(handle_cpu, lsrc), src1_cpu(handle_cpu, lsrc);
        megdnn_memcpy_D2H(handle, src0_cpu.ptr(), src0->ptr(), lsrc.span().dist_byte());
        megdnn_memcpy_D2H(handle, src1_cpu.ptr(), src1->ptr(), lsrc.span().dist_byte());
        src0_cpu.check_with(src1_cpu);
    }

    //! compare filter grads: the dense grad's diagonal blocks must match the
    //! group grad, with both a per-element and an average error bound.
    //! NOTE: these locals intentionally shadow the flt0_cpu/flt1_cpu members.
    void cmp_flt() {
        Tensor<> flt0_cpu(handle_cpu, lflt0), flt1_cpu(handle_cpu, lflt1);
        float* p0 = flt0_cpu.ptr();
        float* p1 = flt1_cpu.ptr();
        megdnn_memcpy_D2H(handle, p0, flt0->ptr(), lflt0.span().dist_byte());
        megdnn_memcpy_D2H(handle, p1, flt1->ptr(), lflt1.span().dist_byte());
        size_t IC = lflt1[0], CHL_MUL = lflt1[1], FSIZE = lflt1[3] * lflt1[4];
        double tot_err = 0, tot_err_num = 0;
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                // t0: diagonal block of the dense grad; t1: group grad
                auto t0 = p0 + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                     t1 = p1 + (i * CHL_MUL + j) * FSIZE;
                for (size_t k = 0; k < FSIZE; ++k) {
                    auto err = std::abs(diff(t0[k], t1[k]));
                    tot_err += err;
                    tot_err_num += 1;
                    ASSERT_LT(err, 1e-2) << "failed at " << i << " " << j << " " << k
                                         << " vals=" << t0[k] << "," << t1[k];
                }
            }
        }
        auto avg_err = tot_err / tot_err_num;
        ASSERT_LT(avg_err, 1e-4);
    }
};
  183. } // anonymous namespace
//! all tests below run in cross-correlation mode
constexpr auto M = Convolution::Mode::CROSS_CORRELATION;
// Correctness of channel-wise convolution forward.
// execs tensor order: {src, filter(5D group layout), dst} ({} = deduced dst).
TEST_F(ROCM, CHANWISE_CONVOLUTION_FORWARD) {
    Checker<Convolution> checker(handle_rocm());
    bool require_algo = false;
    // require that the CHANNEL_WISE algorithm is actually selected
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionForward>("CHANNEL_WISE", &require_algo));
    // simple case
    checker.set_param(gconv_param({M, 0, 0, 1, 1}))
            .execs({{1, 1, 2, 2}, {1, 1, 1, 2, 2}, {}})
            .execs({{1, 1, 5, 5}, {1, 1, 1, 2, 2}, {}});
    // channel multiplier > 1 with explicit dst shape
    checker.execs({{2, 2, 5, 5}, {2, 3, 1, 2, 2}, {2, 6, 4, 4}});
    checker.set_param(gconv_param({M, 1, 1, 1, 1}))
            .execs({{2, 2, 5, 5}, {2, 1, 1, 2, 2}, {}});
    checker.set_param(gconv_param({M, 2, 3, 2, 1}))
            .execs({{32, 12, 20, 10}, {12, 2, 1, 4, 5}, {}});
    // padding larger than kern
    checker.set_param(gconv_param({M, 20, 30, 4, 5}))
            .execs({{32, 12, 20, 10}, {12, 2, 1, 4, 5}, {}});
}
// Correctness of channel-wise convolution backward-data.
// execs tensor order: {filter(5D group layout), diff, grad}.
TEST_F(ROCM, CHANWISE_CONVOLUTION_BACKWARD_DATA) {
    Checker<ConvolutionBackwardData> checker(handle_rocm());
    checker.set_param(gconv_param({M, 0, 0, 1, 1}))
            .execs({{1, 1, 1, 2, 2}, {1, 1, 1, 1}, {1, 1, 2, 2}})
            .execs({{1, 1, 1, 2, 2}, {1, 1, 5, 5}, {1, 1, 6, 6}});
    // channel multiplier > 1
    checker.execs({{2, 1, 1, 2, 2}, {1, 2, 1, 1}, {1, 2, 2, 2}})
            .execs({{2, 1, 1, 2, 2}, {1, 2, 5, 5}, {1, 2, 6, 6}})
            .execs({{2, 3, 1, 2, 2}, {2, 6, 5, 5}, {2, 2, 6, 6}});
    checker.set_param(gconv_param({M, 1, 1, 1, 1}))
            .execs({{2, 1, 1, 2, 2}, {2, 2, 5, 5}, {2, 2, 4, 4}});
    checker.set_param(gconv_param({M, 2, 3, 2, 1}))
            .execs({{12, 3, 1, 4, 5}, {32, 36, 20, 10}, {32, 12, 39, 8}});
    // padding larger than kern
    checker.set_param(gconv_param({M, 20, 30, 4, 5}))
            .execs({{6, 2, 1, 4, 5}, {32, 12, 10, 12}, {32, 6, 2, 3}});
}
// Correctness of channel-wise convolution backward-filter.
// execs tensor order: {src, diff, grad(5D group layout)}.
TEST_F(ROCM, CHANWISE_CONVOLUTION_BACKWARD_FILTER) {
    Checker<ConvolutionBackwardFilter> checker(handle_rocm());
    checker.set_param(gconv_param({M, 0, 0, 1, 1}))
            .execs({{1, 1, 2, 2}, {1, 1, 1, 1}, {1, 1, 1, 2, 2}})
            .execs({{1, 1, 6, 6}, {1, 1, 5, 5}, {1, 1, 1, 2, 2}})
            .execs({{256, 1, 2, 2}, {256, 1, 1, 1}, {1, 1, 1, 2, 2}});
    // channel multiplier > 1
    checker.execs({{1, 2, 2, 2}, {1, 2, 1, 1}, {2, 1, 1, 2, 2}})
            .execs({{1, 2, 6, 6}, {1, 2, 5, 5}, {2, 1, 1, 2, 2}})
            .execs({{2, 2, 6, 6}, {2, 6, 5, 5}, {2, 3, 1, 2, 2}});
    checker.set_param(gconv_param({M, 1, 1, 1, 1}))
            .execs({{2, 2, 4, 4}, {2, 2, 5, 5}, {2, 1, 1, 2, 2}});
    // very large batch reduced into a single filter element
    checker.set_param(gconv_param({M, 0, 0, 1, 1}))
            .execs({{40960, 1, 1, 1}, {40960, 1, 1, 1}, {1, 1, 1, 1, 1}});
    checker.set_param(gconv_param({M, 2, 3, 2, 1}))
            .execs({{32, 12, 39, 8}, {32, 36, 20, 10}, {12, 3, 1, 4, 5}});
    // padding larger than kern
    checker.set_param(gconv_param({M, 20, 30, 4, 5}))
            .execs({{32, 6, 2, 3}, {32, 12, 10, 12}, {6, 2, 1, 4, 5}});
    // unused filter items
    checker.set_param(gconv_param({M, 2, 3, 2, 3}))
            .execs({{32, 6, 1, 1}, {32, 12, 1, 1}, {6, 2, 1, 5, 7}});
}
  241. #if MEGDNN_WITH_BENCHMARK
// Benchmark forward: dense conv (opr0) vs channel-wise group conv (opr1),
// then verify both produce the same dst.
TEST_F(ROCM, CHANWISE_CONVOLUTION_FORWARD_BENCH_CHECK) {
    auto handle = handle_rocm();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionForward>();
    auto conv1 = handle->create_operator<ConvolutionForward>();
    // <0,1,2>: identity shuffle -- forward uses (src, filter, dst) as-is
    BenchmarkEnv<0, 1, 2> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL, size_t FH,
                   size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_src();
        benv.fill_flt();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_dst();
    };
    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}
// Benchmark backward-data: dense vs channel-wise, then verify the computed
// src gradients agree.
TEST_F(ROCM, CHANWISE_CONVOLUTION_BWD_DATA_BENCH_CHECK) {
    auto handle = handle_rocm();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionBackwardData>();
    auto conv1 = handle->create_operator<ConvolutionBackwardData>();
    // <1,2,0>: backward-data takes (filter, diff, grad)
    BenchmarkEnv<1, 2, 0> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL, size_t FH,
                   size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_dst();
        benv.fill_flt();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_src();
    };
    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}
// Benchmark backward-filter: dense vs channel-wise, then verify the filter
// gradients agree (diagonal blocks of the dense grad vs the group grad).
TEST_F(ROCM, CHANWISE_CONVOLUTION_BWD_FILTER_BENCH_CHECK) {
    auto handle = handle_rocm();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionBackwardFilter>();
    auto conv1 = handle->create_operator<ConvolutionBackwardFilter>();
    // <0,2,1>: backward-filter takes (src, diff, grad)
    BenchmarkEnv<0, 2, 1> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL, size_t FH,
                   size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_src();
        benv.fill_dst();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_flt();
    };
    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}
// Profile all forward algorithms on several channel-wise workloads.
TEST_F(ROCM, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_FWD) {
    // enable profiling
    OprProxy<ConvolutionForward> proxy(true);
    proxy.warmup_times = 1;
    proxy.exec_times = 10;
    Benchmarker<ConvolutionForward> checker(handle_rocm());
    checker.set_times(1);
    ConvolutionForward::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    // channel multiplier fixed at 1: filter is {C, 1, 1, FH, FW}
    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, size_t FW) {
        checker.set_proxy(proxy);
        checker.execs({{N, C, IH, IW}, {C, 1, 1, FH, FW}, {}});
    };
    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}
  323. TEST_F(ROCM, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_DATA) {
  324. // enable profiling
  325. OprProxy<ConvolutionBackwardData> proxy(true);
  326. proxy.warmup_times = 1;
  327. proxy.exec_times = 10;
  328. Benchmarker<ConvolutionBackwardData> checker(handle_rocm());
  329. checker.set_times(1);
  330. ConvolutionBackwardData::Param param;
  331. param.sparse = ConvolutionForward::Param::Sparse::GROUP;
  332. checker.set_param(param);
  333. auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, size_t FW) {
  334. checker.set_proxy(proxy);
  335. checker.execs(
  336. {{C, 1, 1, FH, FW}, {N, C, IH - FH + 1, IW - FW + 1}, {N, C, IH, IW}});
  337. };
  338. run(128, 64, 90, 80, 3, 3);
  339. run(128, 90, 100, 100, 3, 5);
  340. run(128, 32, 62, 62, 5, 5);
  341. }
  342. TEST_F(ROCM, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_FILTER) {
  343. // enable profiling
  344. OprProxy<ConvolutionBackwardFilter> proxy(true);
  345. proxy.warmup_times = 1;
  346. proxy.exec_times = 10;
  347. Benchmarker<ConvolutionBackwardFilter> checker(handle_rocm());
  348. checker.set_times(1);
  349. ConvolutionBackwardFilter::Param param;
  350. param.sparse = ConvolutionForward::Param::Sparse::GROUP;
  351. checker.set_param(param);
  352. auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, size_t FW) {
  353. checker.set_proxy(proxy);
  354. checker.execs(
  355. {{N, C, IH, IW}, {N, C, IH - FH + 1, IW - FW + 1}, {C, 1, 1, FH, FW}});
  356. };
  357. run(128, 64, 90, 80, 3, 3);
  358. run(128, 90, 100, 100, 3, 5);
  359. run(128, 32, 62, 62, 5, 5);
  360. }
  361. #endif
  362. // vim: syntax=cpp.doxygen