
chanwise_convolution3d.cpp

/**
 * \file dnn/test/cuda/chanwise_convolution3d.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "megdnn/oprs/nn.h"

#include "megcore_cuda.h"
#include "test/common/checker.h"
#include "test/common/convolution3d.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/cuda/fixture.h"

#include <cuda_profiler_api.h>
#include <cuda_runtime_api.h>

using namespace megdnn;
using namespace test;

namespace {

#if MEGDNN_WITH_BENCHMARK
bool check_need_full_bench() {
    if (getenv("MEGDNN_CHANWISE_CONV3D_FULLBENCH"))
        return true;
    printf("set MEGDNN_CHANWISE_CONV3D_FULLBENCH to run full benchmark\n");
    return false;
}
#endif

// mark a convolution param as group convolution; combined with a filter
// whose in-channels-per-group is 1, this yields channel-wise convolution
Convolution3D::Param gconv_param(Convolution3D::Param p) {
    p.sparse = Convolution3D::Param::Sparse::GROUP;
    return p;
}
template <int P0, int P1, int P2>
class BenchmarkEnv {
    Handle *handle, *handle_cpu;
    std::unique_ptr<GaussianRNG> rng;
    TensorLayout lsrc, lflt0, lflt1, ldst;
    std::unique_ptr<Tensor<>> src0, src1, flt0, flt0_cpu, flt1, flt1_cpu, dst0,
            dst1;
    cudaEvent_t cuda_ev[3];
    cudaStream_t cuda_stream;
    size_t pad_d, pad_h, pad_w;

    // reorder (src, filter, dst) into the operand order expected by the
    // operator under test (forward / backward-data / backward-filter)
    template <typename T>
    static std::tuple<T, T, T> shuffle(std::tuple<T, T, T> data) {
        return std::make_tuple(std::get<P0>(data), std::get<P1>(data),
                               std::get<P2>(data));
    }

public:
    BenchmarkEnv(Handle* handle, Handle* handle_cpu) {
        this->handle = handle;
        this->handle_cpu = handle_cpu;
        rng = handle->create_operator<GaussianRNG>();
        // make cpu handle used
        handle_cpu->create_operator<Sleep>()->exec();
        for (int i = 0; i < 3; ++i)
            cudaEventCreate(&cuda_ev[i]);
        megcoreGetCUDAStream(handle->megcore_computing_handle(), &cuda_stream);
    }

    ~BenchmarkEnv() {
        for (int i = 0; i < 3; ++i)
            cudaEventDestroy(cuda_ev[i]);
    }

    void alloc(size_t N, size_t IC, size_t ID, size_t IH, size_t IW,
               size_t CHL_MUL, size_t FD, size_t FH, size_t FW, size_t PD,
               size_t PH, size_t PW) {
        pad_d = PD;
        pad_h = PH;
        pad_w = PW;
        auto mkly = [](const TensorShape& s) {
            return TensorLayout{s, dtype::Float32()};
        };
        lsrc = mkly({N, IC, ID, IH, IW});
        lflt0 = mkly({CHL_MUL * IC, IC, FD, FH, FW});       // dense filter
        lflt1 = mkly({IC, CHL_MUL, 1, FD, FH, FW});         // channel-wise filter
        ldst = mkly({N, IC * CHL_MUL, ID - FD + 1 + PD * 2,
                     IH - FH + 1 + PH * 2, IW - FW + 1 + PW * 2});
        src0.reset(new Tensor<>(handle, lsrc));
        src1.reset(new Tensor<>(handle, lsrc));
        flt0.reset(new Tensor<>(handle, lflt0));
        flt0_cpu.reset(new Tensor<>(handle_cpu, lflt0));
        flt1.reset(new Tensor<>(handle, lflt1));
        flt1_cpu.reset(new Tensor<>(handle_cpu, lflt1));
        dst0.reset(new Tensor<>(handle, ldst));
        dst1.reset(new Tensor<>(handle, ldst));
    }

    void fill_src() {
        rng->exec(src0->tensornd(), {});
        megdnn_memcpy_D2D(handle, src1->ptr(), src0->ptr(),
                          lsrc.span().dist_byte());
    }

    // generate a random channel-wise filter (flt1), then expand it into the
    // equivalent dense filter (flt0): output channel i*CHL_MUL+j reads only
    // input channel i, so the dense filter is block-diagonal
    void fill_flt() {
        rng->exec(flt1->tensornd(), {});
        megdnn_memcpy_D2H(handle, flt1_cpu->ptr(), flt1->ptr(),
                          lflt1.span().dist_byte());

        const size_t IC = lflt1[0], CHL_MUL = lflt1[1],
                     FSIZE = lflt1[3] * lflt1[4] * lflt1[5];

        // fill flt0 from flt1
        float* src = flt1_cpu->ptr();
        float* dst = flt0_cpu->ptr();
        memset(dst, 0, lflt0.span().dist_byte());
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                memcpy(dst + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                       src + (i * CHL_MUL + j) * FSIZE, FSIZE * sizeof(float));
            }
        }

        megdnn_memcpy_H2D(handle, flt0->ptr(), dst, lflt0.span().dist_byte());
    }

    void fill_dst() {
        rng->exec(dst0->tensornd(), {});
        megdnn_memcpy_D2D(handle, dst1->ptr(), dst0->ptr(),
                          ldst.span().dist_byte());
    }

    // run the dense operator (opr0) and the group/channel-wise operator
    // (opr1) back to back, timing each with CUDA events
    template <class Opr>
    void exec(Opr* opr0, Opr* opr1) {
        opr0->param().pad_d = pad_d;
        opr0->param().pad_h = pad_h;
        opr0->param().pad_w = pad_w;
        opr1->param() = opr0->param();
        opr1->param().sparse = param::Convolution3D::Sparse::GROUP;
        TensorND a0, b0, c0, a1, b1, c1;
        std::tie(a0, b0, c0) = shuffle(std::make_tuple(
                src0->tensornd(), flt0->tensornd(), dst0->tensornd()));
        std::tie(a1, b1, c1) = shuffle(std::make_tuple(
                src1->tensornd(), flt1->tensornd(), dst1->tensornd()));
        WorkspaceWrapper wk(handle,
                            std::max(opr0->get_workspace_in_bytes(
                                             a0.layout, b0.layout, c0.layout),
                                     opr1->get_workspace_in_bytes(
                                             a1.layout, b1.layout, c1.layout)));
        cudaProfilerStart();
        cudaEventRecord(cuda_ev[0], cuda_stream);
        opr0->exec(a0, b0, c0, wk.workspace());
        cudaEventRecord(cuda_ev[1], cuda_stream);
        opr1->exec(a1, b1, c1, wk.workspace());
        cudaEventRecord(cuda_ev[2], cuda_stream);
        cudaProfilerStop();
        if (getenv("MEGDNN_CHANWISE_CONV3D_VERBOSE") ||
            getenv("MEGDNN_CHANWISE_CONV3D_FULLBENCH")) {
            cudaStreamSynchronize(cuda_stream);
            float t0 = -1, t1 = -1;
            cudaEventElapsedTime(&t0, cuda_ev[0], cuda_ev[1]);
            cudaEventElapsedTime(&t1, cuda_ev[1], cuda_ev[2]);
            printf("%s;%s;%s: cudnn/megdnn: %.3fms/%.3fms=%.3f\n",
                   lsrc.TensorShape::to_string().c_str(),
                   lflt1.TensorShape::to_string().c_str(),
                   ldst.TensorShape::to_string().c_str(), t0, t1, t0 / t1);
        }
    }

    void cmp_dst() {
        Tensor<> dst0_cpu(handle_cpu, ldst), dst1_cpu(handle_cpu, ldst);
        megdnn_memcpy_D2H(handle, dst0_cpu.ptr(), dst0->ptr(),
                          ldst.span().dist_byte());
        megdnn_memcpy_D2H(handle, dst1_cpu.ptr(), dst1->ptr(),
                          ldst.span().dist_byte());
        dst0_cpu.check_with(dst1_cpu);
    }

    void cmp_src() {
        Tensor<> src0_cpu(handle_cpu, lsrc), src1_cpu(handle_cpu, lsrc);
        megdnn_memcpy_D2H(handle, src0_cpu.ptr(), src0->ptr(),
                          lsrc.span().dist_byte());
        megdnn_memcpy_D2H(handle, src1_cpu.ptr(), src1->ptr(),
                          lsrc.span().dist_byte());
        src0_cpu.check_with(src1_cpu);
    }

    // compare the dense filter gradient (flt0) against the channel-wise
    // filter gradient (flt1), visiting only the block-diagonal entries that
    // exist in the channel-wise layout
    void cmp_flt() {
        Tensor<> flt0_cpu(handle_cpu, lflt0), flt1_cpu(handle_cpu, lflt1);
        float* p0 = flt0_cpu.ptr();
        float* p1 = flt1_cpu.ptr();
        megdnn_memcpy_D2H(handle, p0, flt0->ptr(), lflt0.span().dist_byte());
        megdnn_memcpy_D2H(handle, p1, flt1->ptr(), lflt1.span().dist_byte());

        size_t IC = lflt1[0], CHL_MUL = lflt1[1],
               FSIZE = lflt1[3] * lflt1[4] * lflt1[5];
        double tot_err = 0, tot_err_num = 0;
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                auto t0 = p0 + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                     t1 = p1 + (i * CHL_MUL + j) * FSIZE;
                for (size_t k = 0; k < FSIZE; ++k) {
                    auto err = std::abs(diff(t0[k], t1[k]));
                    tot_err += err;
                    tot_err_num += 1;
                    ASSERT_LT(err, 1e-2)
                            << "failed at " << i << " " << j << " " << k
                            << " vals=" << t0[k] << "," << t1[k];
                }
            }
        }
        auto avg_err = tot_err / tot_err_num;
        ASSERT_LT(avg_err, 1e-4);
    }
};
}  // anonymous namespace

constexpr auto M = Convolution3D::Mode::CROSS_CORRELATION;
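
// Shape convention in the checker calls below: execs() takes
// {src, filter, dst} shapes. With Sparse::GROUP the filter is 6-D,
// {group, out_channels_per_group, in_channels_per_group, FD, FH, FW};
// channel-wise convolution is the in_channels_per_group == 1 case,
// i.e. {IC, CHL_MUL, 1, FD, FH, FW}, matching lflt1 above. An empty {}
// output shape lets the checker deduce it.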
TEST_F(CUDA, CHANWISE_CONVOLUTION3D_FORWARD) {
    constexpr auto M = Convolution3D::Mode::CROSS_CORRELATION;
    Checker<Convolution3D> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<Convolution3DForward>(
            "CHANNEL_WISE", &require_algo));
    checker.set_param(gconv_param({M, 0, 0, 0, 1, 1, 1}))
            .execs({{1, 1, 2, 2, 2}, {1, 1, 1, 2, 2, 2}, {}})
            .execs({{1, 1, 5, 5, 5}, {1, 1, 1, 2, 2, 2}, {}});
    checker.set_param(gconv_param({M, 0, 0, 0, 1, 1, 1}))
            .execs({{1, 2, 2, 2, 2}, {2, 1, 1, 2, 2, 2}, {}})
            .execs({{1, 2, 5, 5, 5}, {2, 1, 1, 2, 2, 2}, {}})
            .execs({{2, 2, 5, 5, 5}, {2, 3, 1, 2, 2, 2}, {2, 6, 4, 4, 4}});
    checker.set_param(gconv_param({M, 1, 1, 1, 1, 1, 1}))
            .execs({{2, 2, 5, 5, 5}, {2, 1, 1, 2, 2, 2}, {}});
    checker.set_param(gconv_param({M, 2, 3, 3, 2, 1, 1}))
            .execs({{4, 12, 10, 5, 10}, {12, 2, 1, 4, 5, 5}, {}});
    // padding larger than kern
    checker.set_param(gconv_param({M, 10, 15, 15, 4, 5, 5}))
            .execs({{4, 12, 10, 5, 10}, {12, 2, 1, 4, 5, 5}, {}});
    for (uint32_t n : {8, 12})
        for (uint32_t id : {12})
            for (uint32_t ih : {12})
                for (uint32_t iw : {16})
                    for (uint32_t ic : {4})
                        for (uint32_t oc : {4})
                            for (uint32_t fd : {2, 5})
                                for (uint32_t pd : {2})
                                    for (uint32_t sd : {1})
                                        for (uint32_t dd : {1}) {
                                            checker.set_param(gconv_param(
                                                           {M, pd, pd, pd, sd,
                                                            sd, sd, dd, dd,
                                                            dd}))
                                                    .execs({{n, ic, id, ih, iw},
                                                            {ic, oc, 1, fd, fd,
                                                             fd},
                                                            {}});
                                        }
}
TEST_F(CUDA, CHANWISE_CONVOLUTION3D_BACKWARD_DATA) {
    Checker<Convolution3DBackwardData> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<Convolution3DBackwardData>(
            "CHANNEL_WISE", &require_algo));
    checker.set_param(gconv_param({M, 0, 0, 0, 1, 1, 1}))
            .execs({{1, 1, 1, 2, 2, 2}, {1, 1, 1, 1, 1}, {1, 1, 2, 2, 2}})
            .execs({{1, 1, 1, 2, 2, 2}, {1, 1, 5, 5, 5}, {1, 1, 6, 6, 6}});
    require_algo = true;
    checker.execs({{2, 1, 1, 2, 2, 2}, {1, 2, 1, 1, 1}, {1, 2, 2, 2, 2}})
            .execs({{2, 1, 1, 2, 2, 2}, {1, 2, 5, 5, 5}, {1, 2, 6, 6, 6}})
            .execs({{2, 3, 1, 2, 2, 2}, {2, 6, 5, 5, 5}, {2, 2, 6, 6, 6}});
    checker.set_param(gconv_param({M, 1, 1, 1, 1, 1, 1}))
            .execs({{2, 1, 1, 2, 2, 2}, {2, 2, 5, 5, 5}, {2, 2, 4, 4, 4}});
    checker.set_param(gconv_param({M, 2, 3, 3, 2, 1, 1}))
            .execs({{12, 2, 1, 4, 5, 5},
                    {32, 24, 20, 10, 10},
                    {32, 12, 39, 8, 8}});
    // padding larger than kern
    checker.set_param(gconv_param({M, 20, 30, 20, 4, 5, 4}))
            .execs({{6, 2, 1, 4, 5, 4},
                    {32, 12, 10, 12, 10},
                    {32, 6, 2, 3, 2}});
}
TEST_F(CUDA, CHANWISE_CONVOLUTION3D_BACKWARD_FILTER) {
    Checker<Convolution3DBackwardFilter> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<Convolution3DBackwardFilter>(
            "CHANNEL_WISE", &require_algo));
    checker.set_param(gconv_param({M, 0, 0, 0, 1, 1, 1}))
            .execs({{1, 1, 2, 2, 2}, {1, 1, 1, 1, 1}, {1, 1, 1, 2, 2, 2}})
            .execs({{1, 1, 6, 6, 6}, {1, 1, 5, 5, 5}, {1, 1, 1, 2, 2, 2}})
            .execs({{256, 1, 2, 2, 2}, {256, 1, 1, 1, 1}, {1, 1, 1, 2, 2, 2}});
    require_algo = true;
    checker.execs({{1, 2, 2, 2, 2}, {1, 2, 1, 1, 1}, {2, 1, 1, 2, 2, 2}})
            .execs({{1, 2, 6, 6, 6}, {1, 2, 5, 5, 5}, {2, 1, 1, 2, 2, 2}})
            .execs({{2, 2, 6, 6, 6}, {2, 6, 5, 5, 5}, {2, 3, 1, 2, 2, 2}});
    checker.set_param(gconv_param({M, 1, 1, 1, 1, 1, 1}))
            .execs({{2, 2, 4, 4, 4}, {2, 2, 5, 5, 5}, {2, 1, 1, 2, 2, 2}});
    require_algo = false;
    checker.set_param(gconv_param({M, 0, 0, 0, 1, 1, 1}))
            .execs({{40960, 1, 1, 1, 1},
                    {40960, 1, 1, 1, 1},
                    {1, 1, 1, 1, 1, 1}});
    require_algo = true;
    checker.set_param(gconv_param({M, 2, 3, 2, 2, 1, 2}))
            .execs({{32, 12, 39, 8, 20},
                    {32, 36, 20, 10, 10},
                    {12, 3, 1, 4, 5, 6}});
    // padding larger than kern
    checker.set_param(gconv_param({M, 20, 30, 30, 4, 5, 5}))
            .execs({{32, 6, 2, 3, 3},
                    {32, 12, 10, 12, 12},
                    {6, 2, 1, 4, 5, 5}});
    // unused filter items
    checker.set_param(gconv_param({M, 2, 3, 3, 2, 3, 3}))
            .execs({{32, 6, 1, 1, 1}, {32, 12, 1, 1, 1}, {6, 2, 1, 5, 7, 7}});
}
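
// The benchmarks below compare a dense convolution (conv0, reported as
// "cudnn" in the printed timings) against the equivalent group /
// channel-wise convolution (conv1, "megdnn") on identical data:
// alloc() builds the layouts, fill_*() populate equivalent inputs,
// exec() times both operators with CUDA events, and cmp_*() checks the
// results agree.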
#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, CHANWISE_CONVOLUTION3D_FORWARD_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<Convolution3DForward>();
    auto conv1 = handle->create_operator<Convolution3DForward>();
    BenchmarkEnv<0, 1, 2> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t ID, size_t IH, size_t IW,
                   size_t CHL_MUL, size_t FD, size_t FH, size_t FW, size_t PD,
                   size_t PH, size_t PW) {
        benv.alloc(N, IC, ID, IH, IW, CHL_MUL, FD, FH, FW, PD, PH, PW);
        benv.fill_src();
        benv.fill_flt();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_dst();
    };
    run(64, 30, 10, 10, 10, 1, 3, 3, 3, 1, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 9, 9, 9, 2, 5, 5, 5, 2, 2, 2);
        run(64, 64, 30, 30, 30, 2, 3, 3, 3, 1, 1, 1);
        run(1, 2048, 4, 4, 4, 2, 3, 3, 3, 1, 1, 1);
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION3D_BWD_DATA_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<Convolution3DBackwardData>();
    auto conv1 = handle->create_operator<Convolution3DBackwardData>();
    BenchmarkEnv<1, 2, 0> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t ID, size_t IH, size_t IW,
                   size_t CHL_MUL, size_t FD, size_t FH, size_t FW, size_t PD,
                   size_t PH, size_t PW) {
        benv.alloc(N, IC, ID, IH, IW, CHL_MUL, FD, FH, FW, PD, PH, PW);
        benv.fill_dst();
        benv.fill_flt();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_src();
    };
    run(64, 60, 50, 50, 50, 1, 3, 3, 3, 1, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 18, 2, 5, 5, 5, 2, 2, 2);
        run(64, 64, 32, 32, 32, 2, 3, 3, 3, 1, 1, 1);
        run(1, 2048, 4, 4, 4, 2, 3, 3, 3, 1, 1, 1);
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION3D_BWD_FILTER_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<Convolution3DBackwardFilter>();
    auto conv1 = handle->create_operator<Convolution3DBackwardFilter>();
    BenchmarkEnv<0, 2, 1> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t ID, size_t IH, size_t IW,
                   size_t CHL_MUL, size_t FD, size_t FH, size_t FW, size_t PD,
                   size_t PH, size_t PW) {
        benv.alloc(N, IC, ID, IH, IW, CHL_MUL, FD, FH, FW, PD, PH, PW);
        benv.fill_src();
        benv.fill_dst();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_flt();
    };
    run(67, 729, 20, 20, 20, 1, 3, 3, 3, 1, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 18, 2, 5, 5, 5, 2, 2, 2);
        // the case below is a sample that selects the unexpected algo_1
        run(64, 64, 32, 32, 32, 2, 3, 3, 3, 1, 1, 1);
        run(1, 2048, 4, 4, 4, 2, 3, 3, 3, 1, 1, 1);
    }
}
#endif

// vim: syntax=cpp.doxygen
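
For reference, the filter-layout conversion at the heart of fill_flt() above can be reproduced in isolation. The sketch below is a minimal standalone version using only the standard library; the function name chanwise_to_dense is ours for illustration, not a MegDNN API. It scatters a channel-wise filter of shape {IC, CHL_MUL, 1, FD, FH, FW} onto the block diagonal of a dense filter of shape {IC * CHL_MUL, IC, FD, FH, FW}, so that a dense convolution with the result matches a group convolution with the original.

#include <cstddef>
#include <cstring>
#include <vector>

// Output channel oc = i * CHL_MUL + j reads only input channel i; every
// other entry of the dense filter stays zero, exactly as in fill_flt().
std::vector<float> chanwise_to_dense(const std::vector<float>& chanwise,
                                     std::size_t IC, std::size_t CHL_MUL,
                                     std::size_t FD, std::size_t FH,
                                     std::size_t FW) {
    const std::size_t FSIZE = FD * FH * FW;
    // dense layout {IC * CHL_MUL, IC, FD, FH, FW}, zero-initialized
    std::vector<float> dense(IC * CHL_MUL * IC * FSIZE, 0.f);
    for (std::size_t i = 0; i < IC; ++i) {
        for (std::size_t j = 0; j < CHL_MUL; ++j) {
            const std::size_t oc = i * CHL_MUL + j;
            std::memcpy(dense.data() + (oc * IC + i) * FSIZE,
                        chanwise.data() + oc * FSIZE, FSIZE * sizeof(float));
        }
    }
    return dense;
}

The benchmark then feeds the dense copy to opr0 and the original channel-wise filter to opr1, which is why cmp_flt() only inspects the block-diagonal entries when comparing filter gradients.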

The MegEngine package bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU and GPU build to choose between. To run GPU programs, make sure the machine has GPU hardware installed along with the proper driver. If you would like to try deep-learning development on a cloud GPU computing platform, you are welcome to visit the MegStudio platform.