You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

chanwise_convolution.cpp 15 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419
  1. #include "include/hcc_detail/hcc_defs_prologue.h"
  2. #include "megdnn/oprs/nn.h"
  3. #include "megcore_rocm.h"
  4. #include "test/common/benchmarker.h"
  5. #include "test/common/checker.h"
  6. #include "test/common/convolution.h"
  7. #include "test/common/tensor.h"
  8. #include "test/common/workspace_wrapper.h"
  9. #include "test/rocm/fixture.h"
  10. #include "hip_header.h"
  11. using namespace megdnn;
  12. using namespace test;
  13. namespace {
  14. #if MEGDNN_WITH_BENCHMARK
  15. bool check_need_full_bench() {
  16. if (getenv("MEGDNN_CHANWISE_CONV_FULLBENCH"))
  17. return true;
  18. printf("set MEGDNN_CHANWISE_CONV_FULLBENCH to run full benchmark\n");
  19. return false;
  20. }
  21. #endif
  22. Convolution::Param gconv_param(Convolution::Param p) {
  23. p.sparse = Convolution::Param::Sparse::GROUP;
  24. return p;
  25. }
// Shared fixture for the *_BENCH_CHECK tests below: runs the same convolution
// problem twice on the ROCM handle -- once with a dense filter (flt0, layout
// {IC*CHL_MUL, IC, FH, FW}, executed by opr0) and once with the equivalent
// channel-wise filter (flt1, layout {IC, CHL_MUL, 1, FH, FW}, executed as a
// GROUP convolution by opr1) -- times both with hip events, and lets callers
// compare the results.
// P0/P1/P2 permute the (src, flt, dst) triple into the operator's (a, b, c)
// exec arguments, so the same class serves forward, backward-data and
// backward-filter (see the BenchmarkEnv<...> instantiations below).
template <int P0, int P1, int P2>
class BenchmarkEnv {
    Handle *handle, *handle_cpu;
    std::unique_ptr<GaussianRNG> rng;
    // lsrc = {N, IC, IH, IW}; lflt0 = dense filter; lflt1 = channel-wise
    // filter; ldst = conv output (all set by alloc()).
    TensorLayout lsrc, lflt0, lflt1, ldst;
    // *0 tensors feed the dense run, *1 tensors the channel-wise run;
    // *_cpu are host staging buffers used to build flt0 from flt1.
    std::unique_ptr<Tensor<>> src0, src1, flt0, flt0_cpu, flt1, flt1_cpu, dst0, dst1;
    // hip_ev[0..2] bracket the two exec() calls for timing.
    hipEvent_t hip_ev[3];
    hipStream_t hip_stream;
    size_t pad_h, pad_w;

    // Reorder a (src, flt, dst) triple into operator-argument order.
    template <typename T>
    static std::tuple<T, T, T> shuffle(std::tuple<T, T, T> data) {
        return std::make_tuple(
                std::get<P0>(data), std::get<P1>(data), std::get<P2>(data));
    }

public:
    BenchmarkEnv(Handle* handle, Handle* handle_cpu) {
        this->handle = handle;
        this->handle_cpu = handle_cpu;
        rng = handle->create_operator<GaussianRNG>();
        // make cpu handle used
        handle_cpu->create_operator<Sleep>()->exec();
        for (int i = 0; i < 3; ++i)
            hipEventCreate(&hip_ev[i]);
        megcoreGetROCMStream(handle->megcore_computing_handle(), &hip_stream);
    }

    ~BenchmarkEnv() {
        for (int i = 0; i < 3; ++i)
            hipEventDestroy(hip_ev[i]);
    }

    // Allocate device/host tensors for one problem size. The output spatial
    // extent is IH - FH + 1 + 2*PH (valid conv plus padding); stride is
    // implicitly 1 for all benched cases.
    void alloc(
            size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL, size_t FH,
            size_t FW, size_t PH, size_t PW) {
        pad_h = PH;
        pad_w = PW;
        auto mkly = [](const TensorShape& s) {
            return TensorLayout{s, dtype::Float32()};
        };
        lsrc = mkly({N, IC, IH, IW});
        lflt0 = mkly({CHL_MUL * IC, IC, FH, FW});  // dense filter
        lflt1 = mkly({IC, CHL_MUL, 1, FH, FW});    // channel-wise (group) filter
        ldst = mkly({N, IC * CHL_MUL, IH - FH + 1 + PH * 2, IW - FW + 1 + PW * 2});
        src0.reset(new Tensor<>(handle, lsrc));
        src1.reset(new Tensor<>(handle, lsrc));
        flt0.reset(new Tensor<>(handle, lflt0));
        flt0_cpu.reset(new Tensor<>(handle_cpu, lflt0));
        flt1.reset(new Tensor<>(handle, lflt1));
        flt1_cpu.reset(new Tensor<>(handle_cpu, lflt1));
        dst0.reset(new Tensor<>(handle, ldst));
        dst1.reset(new Tensor<>(handle, ldst));
    }

    // Fill src0 with gaussian noise and mirror it into src1 so both runs see
    // identical input.
    void fill_src() {
        rng->exec(src0->tensornd(), {});
        megdnn_memcpy_D2D(handle, src1->ptr(), src0->ptr(), lsrc.span().dist_byte());
    }

    // Randomize the channel-wise filter flt1, then expand it on the host into
    // the equivalent block-diagonal dense filter flt0: output channel
    // (i*CHL_MUL + j) convolves only input channel i; every other
    // input-channel slice stays zero.
    void fill_flt() {
        rng->exec(flt1->tensornd(), {});
        megdnn_memcpy_D2H(
                handle, flt1_cpu->ptr(), flt1->ptr(), lflt1.span().dist_byte());
        const size_t IC = lflt1[0], CHL_MUL = lflt1[1], FSIZE = lflt1[3] * lflt1[4];
        // fill flt0 from flt1
        float* src = flt1_cpu->ptr();
        float* dst = flt0_cpu->ptr();
        memset(dst, 0, lflt0.span().dist_byte());
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                memcpy(dst + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                       src + (i * CHL_MUL + j) * FSIZE, FSIZE * sizeof(float));
            }
        }
        megdnn_memcpy_H2D(handle, flt0->ptr(), dst, lflt0.span().dist_byte());
    }

    // Fill dst0 with gaussian noise and mirror it into dst1 (used as the
    // output gradient for the backward passes).
    void fill_dst() {
        rng->exec(dst0->tensornd(), {});
        megdnn_memcpy_D2D(handle, dst1->ptr(), dst0->ptr(), ldst.span().dist_byte());
    }

    // Run opr0 on the dense tensors and opr1 (forced to GROUP sparse mode) on
    // the channel-wise tensors, recording hip events around each exec for
    // timing. Timings are printed only when one of the MEGDNN_CHANWISE_CONV_*
    // environment variables is set.
    template <class Opr>
    void exec(Opr* opr0, Opr* opr1) {
        opr0->param().pad_h = pad_h;
        opr0->param().pad_w = pad_w;
        opr1->param() = opr0->param();
        opr1->param().sparse = param::Convolution::Sparse::GROUP;
        TensorND a0, b0, c0, a1, b1, c1;
        std::tie(a0, b0, c0) = shuffle(
                std::make_tuple(src0->tensornd(), flt0->tensornd(), dst0->tensornd()));
        std::tie(a1, b1, c1) = shuffle(
                std::make_tuple(src1->tensornd(), flt1->tensornd(), dst1->tensornd()));
        // One workspace sized for whichever operator needs more.
        WorkspaceWrapper wk(
                handle,
                std::max(
                        opr0->get_workspace_in_bytes(a0.layout, b0.layout, c0.layout),
                        opr1->get_workspace_in_bytes(a1.layout, b1.layout, c1.layout)));
        hipProfilerStart();
        hipEventRecord(hip_ev[0], hip_stream);
        opr0->exec(a0, b0, c0, wk.workspace());
        hipEventRecord(hip_ev[1], hip_stream);
        opr1->exec(a1, b1, c1, wk.workspace());
        hipEventRecord(hip_ev[2], hip_stream);
        hipProfilerStop();
        if (getenv("MEGDNN_CHANWISE_CONV_VERBOSE") ||
            getenv("MEGDNN_CHANWISE_CONV_FULLBENCH")) {
            hipStreamSynchronize(hip_stream);
            float t0 = -1, t1 = -1;
            hipEventElapsedTime(&t0, hip_ev[0], hip_ev[1]);
            hipEventElapsedTime(&t1, hip_ev[1], hip_ev[2]);
            // t0 = dense (miopen) time, t1 = channel-wise (megdnn) time.
            printf("%s;%s;%s: miopen/megdnn: %.3fms/%.3fms=%.3f\n",
                   lsrc.TensorShape::to_string().c_str(),
                   lflt1.TensorShape::to_string().c_str(),
                   ldst.TensorShape::to_string().c_str(), t0, t1, t0 / t1);
        }
    }

    // Copy both outputs to host and require them to match (forward check).
    void cmp_dst() {
        Tensor<> dst0_cpu(handle_cpu, ldst), dst1_cpu(handle_cpu, ldst);
        megdnn_memcpy_D2H(handle, dst0_cpu.ptr(), dst0->ptr(), ldst.span().dist_byte());
        megdnn_memcpy_D2H(handle, dst1_cpu.ptr(), dst1->ptr(), ldst.span().dist_byte());
        dst0_cpu.check_with(dst1_cpu);
    }

    // Copy both input gradients to host and require them to match
    // (backward-data check).
    void cmp_src() {
        Tensor<> src0_cpu(handle_cpu, lsrc), src1_cpu(handle_cpu, lsrc);
        megdnn_memcpy_D2H(handle, src0_cpu.ptr(), src0->ptr(), lsrc.span().dist_byte());
        megdnn_memcpy_D2H(handle, src1_cpu.ptr(), src1->ptr(), lsrc.span().dist_byte());
        src0_cpu.check_with(src1_cpu);
    }

    // Compare filter gradients (backward-filter check): walk the diagonal
    // blocks of the dense gradient and compare them element-wise against the
    // channel-wise gradient, enforcing a per-element bound (1e-2) and an
    // average bound (1e-4). Off-diagonal blocks of flt0's gradient are not
    // inspected here.
    void cmp_flt() {
        Tensor<> flt0_cpu(handle_cpu, lflt0), flt1_cpu(handle_cpu, lflt1);
        float* p0 = flt0_cpu.ptr();
        float* p1 = flt1_cpu.ptr();
        megdnn_memcpy_D2H(handle, p0, flt0->ptr(), lflt0.span().dist_byte());
        megdnn_memcpy_D2H(handle, p1, flt1->ptr(), lflt1.span().dist_byte());
        size_t IC = lflt1[0], CHL_MUL = lflt1[1], FSIZE = lflt1[3] * lflt1[4];
        double tot_err = 0, tot_err_num = 0;
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                // t0: diagonal block of the dense gradient; t1: matching
                // slice of the channel-wise gradient.
                auto t0 = p0 + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                     t1 = p1 + (i * CHL_MUL + j) * FSIZE;
                for (size_t k = 0; k < FSIZE; ++k) {
                    // diff() is presumably a test-common error helper --
                    // confirm against test/common.
                    auto err = std::abs(diff(t0[k], t1[k]));
                    tot_err += err;
                    tot_err_num += 1;
                    ASSERT_LT(err, 1e-2) << "failed at " << i << " " << j << " " << k
                                         << " vals=" << t0[k] << "," << t1[k];
                }
            }
        }
        auto avg_err = tot_err / tot_err_num;
        ASSERT_LT(avg_err, 1e-4);
    }
};
  173. } // anonymous namespace
// All tests below use cross-correlation mode (the common DNN convention).
constexpr auto M = Convolution::Mode::CROSS_CORRELATION;
  175. TEST_F(ROCM, CHANWISE_CONVOLUTION_FORWARD) {
  176. Checker<Convolution> checker(handle_rocm());
  177. bool require_algo = false;
  178. checker.set_before_exec_callback(
  179. AlgoChecker<ConvolutionForward>("CHANNEL_WISE", &require_algo));
  180. // simple case
  181. checker.set_param(gconv_param({M, 0, 0, 1, 1}))
  182. .execs({{1, 1, 2, 2}, {1, 1, 1, 2, 2}, {}})
  183. .execs({{1, 1, 5, 5}, {1, 1, 1, 2, 2}, {}});
  184. checker.execs({{2, 2, 5, 5}, {2, 3, 1, 2, 2}, {2, 6, 4, 4}});
  185. checker.set_param(gconv_param({M, 1, 1, 1, 1}))
  186. .execs({{2, 2, 5, 5}, {2, 1, 1, 2, 2}, {}});
  187. checker.set_param(gconv_param({M, 2, 3, 2, 1}))
  188. .execs({{32, 12, 20, 10}, {12, 2, 1, 4, 5}, {}});
  189. // padding larger than kern
  190. checker.set_param(gconv_param({M, 20, 30, 4, 5}))
  191. .execs({{32, 12, 20, 10}, {12, 2, 1, 4, 5}, {}});
  192. }
TEST_F(ROCM, CHANWISE_CONVOLUTION_BACKWARD_DATA) {
    // Shape triples are {filter, diff, grad}; the 5-D filter is the group
    // layout {G, CHL_MUL, 1, FH, FW}. gconv_param args are presumably
    // {mode, pad_h, pad_w, stride_h, stride_w} -- confirm against
    // param::Convolution's field order.
    Checker<ConvolutionBackwardData> checker(handle_rocm());
    checker.set_param(gconv_param({M, 0, 0, 1, 1}))
            .execs({{1, 1, 1, 2, 2}, {1, 1, 1, 1}, {1, 1, 2, 2}})
            .execs({{1, 1, 1, 2, 2}, {1, 1, 5, 5}, {1, 1, 6, 6}});
    // multiple groups / channel multipliers, same no-padding param
    checker.execs({{2, 1, 1, 2, 2}, {1, 2, 1, 1}, {1, 2, 2, 2}})
            .execs({{2, 1, 1, 2, 2}, {1, 2, 5, 5}, {1, 2, 6, 6}})
            .execs({{2, 3, 1, 2, 2}, {2, 6, 5, 5}, {2, 2, 6, 6}});
    // with padding
    checker.set_param(gconv_param({M, 1, 1, 1, 1}))
            .execs({{2, 1, 1, 2, 2}, {2, 2, 5, 5}, {2, 2, 4, 4}});
    // non-unit, non-square stride/padding
    checker.set_param(gconv_param({M, 2, 3, 2, 1}))
            .execs({{12, 3, 1, 4, 5}, {32, 36, 20, 10}, {32, 12, 39, 8}});
    // padding larger than kern
    checker.set_param(gconv_param({M, 20, 30, 4, 5}))
            .execs({{6, 2, 1, 4, 5}, {32, 12, 10, 12}, {32, 6, 2, 3}});
}
TEST_F(ROCM, CHANWISE_CONVOLUTION_BACKWARD_FILTER) {
    // Shape triples are {src, diff, grad}; the 5-D grad is the group filter
    // layout {G, CHL_MUL, 1, FH, FW}. gconv_param args are presumably
    // {mode, pad_h, pad_w, stride_h, stride_w} -- confirm against
    // param::Convolution's field order.
    Checker<ConvolutionBackwardFilter> checker(handle_rocm());
    checker.set_param(gconv_param({M, 0, 0, 1, 1}))
            .execs({{1, 1, 2, 2}, {1, 1, 1, 1}, {1, 1, 1, 2, 2}})
            .execs({{1, 1, 6, 6}, {1, 1, 5, 5}, {1, 1, 1, 2, 2}})
            .execs({{256, 1, 2, 2}, {256, 1, 1, 1}, {1, 1, 1, 2, 2}});
    // multiple groups / channel multipliers
    checker.execs({{1, 2, 2, 2}, {1, 2, 1, 1}, {2, 1, 1, 2, 2}})
            .execs({{1, 2, 6, 6}, {1, 2, 5, 5}, {2, 1, 1, 2, 2}})
            .execs({{2, 2, 6, 6}, {2, 6, 5, 5}, {2, 3, 1, 2, 2}});
    // with padding
    checker.set_param(gconv_param({M, 1, 1, 1, 1}))
            .execs({{2, 2, 4, 4}, {2, 2, 5, 5}, {2, 1, 1, 2, 2}});
    // large batch reduced into a single filter element
    checker.set_param(gconv_param({M, 0, 0, 1, 1}))
            .execs({{40960, 1, 1, 1}, {40960, 1, 1, 1}, {1, 1, 1, 1, 1}});
    // non-unit, non-square stride/padding
    checker.set_param(gconv_param({M, 2, 3, 2, 1}))
            .execs({{32, 12, 39, 8}, {32, 36, 20, 10}, {12, 3, 1, 4, 5}});
    // padding larger than kern
    checker.set_param(gconv_param({M, 20, 30, 4, 5}))
            .execs({{32, 6, 2, 3}, {32, 12, 10, 12}, {6, 2, 1, 4, 5}});
    // unused filter items
    checker.set_param(gconv_param({M, 2, 3, 2, 3}))
            .execs({{32, 6, 1, 1}, {32, 12, 1, 1}, {6, 2, 1, 5, 7}});
}
  231. #if MEGDNN_WITH_BENCHMARK
TEST_F(ROCM, CHANWISE_CONVOLUTION_FORWARD_BENCH_CHECK) {
    auto handle = handle_rocm();
    auto handle_cpu = handle_naive();
    // conv0 runs the dense-filter path, conv1 the channel-wise (group) path.
    auto conv0 = handle->create_operator<ConvolutionForward>();
    auto conv1 = handle->create_operator<ConvolutionForward>();
    // <0, 1, 2>: identity permutation -- (src, flt, dst) map directly onto
    // the forward exec arguments.
    BenchmarkEnv<0, 1, 2> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL, size_t FH,
                   size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_src();
        benv.fill_flt();
        benv.exec(conv0.get(), conv1.get());
        // forward: the two computed outputs must agree
        benv.cmp_dst();
    };
    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    // Heavy cases only when MEGDNN_CHANWISE_CONV_FULLBENCH is set.
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}
TEST_F(ROCM, CHANWISE_CONVOLUTION_BWD_DATA_BENCH_CHECK) {
    auto handle = handle_rocm();
    auto handle_cpu = handle_naive();
    // conv0 runs the dense-filter path, conv1 the channel-wise (group) path.
    auto conv0 = handle->create_operator<ConvolutionBackwardData>();
    auto conv1 = handle->create_operator<ConvolutionBackwardData>();
    // <1, 2, 0>: backward-data exec takes (flt, dst-grad, src-grad).
    BenchmarkEnv<1, 2, 0> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL, size_t FH,
                   size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        // backward-data consumes the output gradient and the filter
        benv.fill_dst();
        benv.fill_flt();
        benv.exec(conv0.get(), conv1.get());
        // the two computed input gradients must agree
        benv.cmp_src();
    };
    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    // Heavy cases only when MEGDNN_CHANWISE_CONV_FULLBENCH is set.
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}
TEST_F(ROCM, CHANWISE_CONVOLUTION_BWD_FILTER_BENCH_CHECK) {
    auto handle = handle_rocm();
    auto handle_cpu = handle_naive();
    // conv0 runs the dense-filter path, conv1 the channel-wise (group) path.
    auto conv0 = handle->create_operator<ConvolutionBackwardFilter>();
    auto conv1 = handle->create_operator<ConvolutionBackwardFilter>();
    // <0, 2, 1>: backward-filter exec takes (src, dst-grad, flt-grad).
    BenchmarkEnv<0, 2, 1> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL, size_t FH,
                   size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        // backward-filter consumes the input and the output gradient
        benv.fill_src();
        benv.fill_dst();
        benv.exec(conv0.get(), conv1.get());
        // the two computed filter gradients must agree (diagonal blocks)
        benv.cmp_flt();
    };
    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    // Heavy cases only when MEGDNN_CHANWISE_CONV_FULLBENCH is set.
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}
TEST_F(ROCM, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_FWD) {
    // enable profiling
    OprProxy<ConvolutionForward> proxy(true);
    proxy.warmup_times = 1;
    proxy.exec_times = 10;
    Benchmarker<ConvolutionForward> checker(handle_rocm());
    checker.set_times(1);
    // GROUP sparse mode with channel multiplier 1 = channel-wise conv.
    ConvolutionForward::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    // src {N, C, IH, IW}, group filter {C, 1, 1, FH, FW}, output deduced
    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, size_t FW) {
        checker.set_proxy(proxy);
        checker.execs({{N, C, IH, IW}, {C, 1, 1, FH, FW}, {}});
    };
    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}
  313. TEST_F(ROCM, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_DATA) {
  314. // enable profiling
  315. OprProxy<ConvolutionBackwardData> proxy(true);
  316. proxy.warmup_times = 1;
  317. proxy.exec_times = 10;
  318. Benchmarker<ConvolutionBackwardData> checker(handle_rocm());
  319. checker.set_times(1);
  320. ConvolutionBackwardData::Param param;
  321. param.sparse = ConvolutionForward::Param::Sparse::GROUP;
  322. checker.set_param(param);
  323. auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, size_t FW) {
  324. checker.set_proxy(proxy);
  325. checker.execs(
  326. {{C, 1, 1, FH, FW}, {N, C, IH - FH + 1, IW - FW + 1}, {N, C, IH, IW}});
  327. };
  328. run(128, 64, 90, 80, 3, 3);
  329. run(128, 90, 100, 100, 3, 5);
  330. run(128, 32, 62, 62, 5, 5);
  331. }
  332. TEST_F(ROCM, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_FILTER) {
  333. // enable profiling
  334. OprProxy<ConvolutionBackwardFilter> proxy(true);
  335. proxy.warmup_times = 1;
  336. proxy.exec_times = 10;
  337. Benchmarker<ConvolutionBackwardFilter> checker(handle_rocm());
  338. checker.set_times(1);
  339. ConvolutionBackwardFilter::Param param;
  340. param.sparse = ConvolutionForward::Param::Sparse::GROUP;
  341. checker.set_param(param);
  342. auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, size_t FW) {
  343. checker.set_proxy(proxy);
  344. checker.execs(
  345. {{N, C, IH, IW}, {N, C, IH - FH + 1, IW - FW + 1}, {C, 1, 1, FH, FW}});
  346. };
  347. run(128, 64, 90, 80, 3, 3);
  348. run(128, 90, 100, 100, 3, 5);
  349. run(128, 32, 62, 62, 5, 5);
  350. }
  351. #endif
  352. // vim: syntax=cpp.doxygen