
chanwise_convolution3d.cpp 16 kB

/**
 * \file dnn/test/cuda/chanwise_convolution3d.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "megdnn/oprs/nn.h"

#include "megcore_cuda.h"
#include "test/common/checker.h"
#include "test/common/convolution3d.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/cuda/fixture.h"

#include <cuda_profiler_api.h>
#include <cuda_runtime_api.h>

using namespace megdnn;
using namespace test;
namespace {

#if MEGDNN_WITH_BENCHMARK
bool check_need_full_bench() {
    if (getenv("MEGDNN_CHANWISE_CONV3D_FULLBENCH"))
        return true;
    printf("set MEGDNN_CHANWISE_CONV3D_FULLBENCH to run full benchmark\n");
    return false;
}
#endif
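
// Channel-wise 3D convolution is expressed as a grouped convolution: setting
// sparse to GROUP (together with the {IC, CHL_MUL, 1, FD, FH, FW} filter
// layout used below) makes every input channel its own group.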
Convolution3D::Param gconv_param(Convolution3D::Param p) {
    p.sparse = Convolution3D::Param::Sparse::GROUP;
    return p;
}
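
// Benchmark helper that runs the same 3D convolution twice: once as a dense
// convolution (flt0, typically dispatched to cuDNN) and once as a channel-wise
// group convolution (flt1), then checks that the two results agree. The
// template arguments P0/P1/P2 permute the (src, flt, dst) triple into the
// operand order of the operator under test.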
template <int P0, int P1, int P2>
class BenchmarkEnv {
    Handle *handle, *handle_cpu;
    std::unique_ptr<GaussianRNG> rng;
    TensorLayout lsrc, lflt0, lflt1, ldst;
    std::unique_ptr<Tensor<>> src0, src1, flt0, flt0_cpu, flt1, flt1_cpu, dst0, dst1;
    cudaEvent_t cuda_ev[3];
    cudaStream_t cuda_stream;
    size_t pad_d, pad_h, pad_w;
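    // Reorder (src, flt, dst) according to the P0/P1/P2 template arguments.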
    template <typename T>
    static std::tuple<T, T, T> shuffle(std::tuple<T, T, T> data) {
        return std::make_tuple(
                std::get<P0>(data), std::get<P1>(data), std::get<P2>(data));
    }

public:
    BenchmarkEnv(Handle* handle, Handle* handle_cpu) {
        this->handle = handle;
        this->handle_cpu = handle_cpu;
        rng = handle->create_operator<GaussianRNG>();
        // make cpu handle used
        handle_cpu->create_operator<Sleep>()->exec();
        for (int i = 0; i < 3; ++i)
            cudaEventCreate(&cuda_ev[i]);
        megcoreGetCUDAStream(handle->megcore_computing_handle(), &cuda_stream);
    }

    ~BenchmarkEnv() {
        for (int i = 0; i < 3; ++i)
            cudaEventDestroy(cuda_ev[i]);
    }
    void alloc(
            size_t N, size_t IC, size_t ID, size_t IH, size_t IW, size_t CHL_MUL,
            size_t FD, size_t FH, size_t FW, size_t PD, size_t PH, size_t PW) {
        pad_d = PD;
        pad_h = PH;
        pad_w = PW;
        auto mkly = [](const TensorShape& s) {
            return TensorLayout{s, dtype::Float32()};
        };
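        // lflt0 is the dense filter {IC*CHL_MUL, IC, FD, FH, FW}; lflt1 is the
        // group/channel-wise filter {IC, CHL_MUL, 1, FD, FH, FW}. The output
        // spatial size below assumes stride 1 and dilation 1: O = I - F + 1 + 2*P.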
        lsrc = mkly({N, IC, ID, IH, IW});
        lflt0 = mkly({CHL_MUL * IC, IC, FD, FH, FW});
        lflt1 = mkly({IC, CHL_MUL, 1, FD, FH, FW});
        ldst = mkly(
                {N, IC * CHL_MUL, ID - FD + 1 + PD * 2, IH - FH + 1 + PH * 2,
                 IW - FW + 1 + PW * 2});
        src0.reset(new Tensor<>(handle, lsrc));
        src1.reset(new Tensor<>(handle, lsrc));
        flt0.reset(new Tensor<>(handle, lflt0));
        flt0_cpu.reset(new Tensor<>(handle_cpu, lflt0));
        flt1.reset(new Tensor<>(handle, lflt1));
        flt1_cpu.reset(new Tensor<>(handle_cpu, lflt1));
        dst0.reset(new Tensor<>(handle, ldst));
        dst1.reset(new Tensor<>(handle, ldst));
    }
    void fill_src() {
        rng->exec(src0->tensornd(), {});
        megdnn_memcpy_D2D(handle, src1->ptr(), src0->ptr(), lsrc.span().dist_byte());
    }
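
    // Fill the channel-wise filter flt1 with random values, then expand it on
    // the CPU into the equivalent dense filter flt0: the dense filter is
    // block-diagonal over channels, and entries that would connect different
    // input channels are left at zero.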
    void fill_flt() {
        rng->exec(flt1->tensornd(), {});
        megdnn_memcpy_D2H(
                handle, flt1_cpu->ptr(), flt1->ptr(), lflt1.span().dist_byte());

        const size_t IC = lflt1[0], CHL_MUL = lflt1[1],
                     FSIZE = lflt1[3] * lflt1[4] * lflt1[5];

        // fill flt0 from flt1
        float* src = flt1_cpu->ptr();
        float* dst = flt0_cpu->ptr();
        memset(dst, 0, lflt0.span().dist_byte());
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                memcpy(dst + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                       src + (i * CHL_MUL + j) * FSIZE, FSIZE * sizeof(float));
            }
        }

        megdnn_memcpy_H2D(handle, flt0->ptr(), dst, lflt0.span().dist_byte());
    }
    void fill_dst() {
        rng->exec(dst0->tensornd(), {});
        megdnn_memcpy_D2D(handle, dst1->ptr(), dst0->ptr(), ldst.span().dist_byte());
    }
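
    // Run opr0 on the dense operands and opr1 on the channel-wise operands,
    // timing each with CUDA events recorded on the handle's stream. Timings
    // are printed only when one of the MEGDNN_CHANWISE_CONV3D_* environment
    // variables is set.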
    template <class Opr>
    void exec(Opr* opr0, Opr* opr1) {
        opr0->param().pad_d = pad_d;
        opr0->param().pad_h = pad_h;
        opr0->param().pad_w = pad_w;
        opr1->param() = opr0->param();
        opr1->param().sparse = param::Convolution3D::Sparse::GROUP;
        TensorND a0, b0, c0, a1, b1, c1;
        std::tie(a0, b0, c0) = shuffle(
                std::make_tuple(src0->tensornd(), flt0->tensornd(), dst0->tensornd()));
        std::tie(a1, b1, c1) = shuffle(
                std::make_tuple(src1->tensornd(), flt1->tensornd(), dst1->tensornd()));
        WorkspaceWrapper wk(
                handle,
                std::max(
                        opr0->get_workspace_in_bytes(a0.layout, b0.layout, c0.layout),
                        opr1->get_workspace_in_bytes(a1.layout, b1.layout, c1.layout)));
        cudaProfilerStart();
        cudaEventRecord(cuda_ev[0], cuda_stream);
        opr0->exec(a0, b0, c0, wk.workspace());
        cudaEventRecord(cuda_ev[1], cuda_stream);
        opr1->exec(a1, b1, c1, wk.workspace());
        cudaEventRecord(cuda_ev[2], cuda_stream);
        cudaProfilerStop();
        if (getenv("MEGDNN_CHANWISE_CONV3D_VERBOSE") ||
            getenv("MEGDNN_CHANWISE_CONV3D_FULLBENCH")) {
            cudaStreamSynchronize(cuda_stream);
            float t0 = -1, t1 = -1;
            cudaEventElapsedTime(&t0, cuda_ev[0], cuda_ev[1]);
            cudaEventElapsedTime(&t1, cuda_ev[1], cuda_ev[2]);
            printf("%s;%s;%s: cudnn/megdnn: %.3fms/%.3fms=%.3f\n",
                   lsrc.TensorShape::to_string().c_str(),
                   lflt1.TensorShape::to_string().c_str(),
                   ldst.TensorShape::to_string().c_str(), t0, t1, t0 / t1);
        }
    }
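
    // The cmp_* methods copy both results back to the CPU and verify that the
    // dense and channel-wise paths agree on dst, the input gradient, or the
    // filter gradient, respectively.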
    void cmp_dst() {
        Tensor<> dst0_cpu(handle_cpu, ldst), dst1_cpu(handle_cpu, ldst);
        megdnn_memcpy_D2H(handle, dst0_cpu.ptr(), dst0->ptr(), ldst.span().dist_byte());
        megdnn_memcpy_D2H(handle, dst1_cpu.ptr(), dst1->ptr(), ldst.span().dist_byte());
        dst0_cpu.check_with(dst1_cpu);
    }

    void cmp_src() {
        Tensor<> src0_cpu(handle_cpu, lsrc), src1_cpu(handle_cpu, lsrc);
        megdnn_memcpy_D2H(handle, src0_cpu.ptr(), src0->ptr(), lsrc.span().dist_byte());
        megdnn_memcpy_D2H(handle, src1_cpu.ptr(), src1->ptr(), lsrc.span().dist_byte());
        src0_cpu.check_with(src1_cpu);
    }
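
    // For the filter gradient, only the block-diagonal entries of the dense
    // result correspond to channel-wise filter entries; compare those
    // element-wise and also bound the mean absolute error.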
    void cmp_flt() {
        Tensor<> flt0_cpu(handle_cpu, lflt0), flt1_cpu(handle_cpu, lflt1);
        float* p0 = flt0_cpu.ptr();
        float* p1 = flt1_cpu.ptr();
        megdnn_memcpy_D2H(handle, p0, flt0->ptr(), lflt0.span().dist_byte());
        megdnn_memcpy_D2H(handle, p1, flt1->ptr(), lflt1.span().dist_byte());
        size_t IC = lflt1[0], CHL_MUL = lflt1[1],
               FSIZE = lflt1[3] * lflt1[4] * lflt1[5];
        double tot_err = 0, tot_err_num = 0;
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                auto t0 = p0 + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                     t1 = p1 + (i * CHL_MUL + j) * FSIZE;
                for (size_t k = 0; k < FSIZE; ++k) {
                    auto err = std::abs(diff(t0[k], t1[k]));
                    tot_err += err;
                    tot_err_num += 1;
                    ASSERT_LT(err, 1e-2) << "failed at " << i << " " << j << " " << k
                                         << " vals=" << t0[k] << "," << t1[k];
                }
            }
        }
        auto avg_err = tot_err / tot_err_num;
        ASSERT_LT(avg_err, 1e-4);
    }
};

}  // anonymous namespace
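
// All tests below run in cross-correlation mode; gconv_param switches each
// operator into GROUP sparse mode, and AlgoChecker pins the runs to the
// CHANNEL_WISE algorithm once require_algo is set.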
constexpr auto M = Convolution3D::Mode::CROSS_CORRELATION;

TEST_F(CUDA, CHANWISE_CONVOLUTION3D_FORWARD) {
    constexpr auto M = Convolution3D::Mode::CROSS_CORRELATION;
    Checker<Convolution3D> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(
            AlgoChecker<Convolution3DForward>("CHANNEL_WISE", &require_algo));

    checker.set_param(gconv_param({M, 0, 0, 0, 1, 1, 1}))
            .execs({{1, 1, 2, 2, 2}, {1, 1, 1, 2, 2, 2}, {}})
            .execs({{1, 1, 5, 5, 5}, {1, 1, 1, 2, 2, 2}, {}});

    checker.set_param(gconv_param({M, 0, 0, 0, 1, 1, 1}))
            .execs({{1, 2, 2, 2, 2}, {2, 1, 1, 2, 2, 2}, {}})
            .execs({{1, 2, 5, 5, 5}, {2, 1, 1, 2, 2, 2}, {}})
            .execs({{2, 2, 5, 5, 5}, {2, 3, 1, 2, 2, 2}, {2, 6, 4, 4, 4}});

    checker.set_param(gconv_param({M, 1, 1, 1, 1, 1, 1}))
            .execs({{2, 2, 5, 5, 5}, {2, 1, 1, 2, 2, 2}, {}});

    checker.set_param(gconv_param({M, 2, 3, 3, 2, 1, 1}))
            .execs({{4, 12, 10, 5, 10}, {12, 2, 1, 4, 5, 5}, {}});

    // padding larger than kern
    checker.set_param(gconv_param({M, 10, 15, 15, 4, 5, 5}))
            .execs({{4, 12, 10, 5, 10}, {12, 2, 1, 4, 5, 5}, {}});

    for (uint32_t n : {8, 12})
        for (uint32_t id : {12})
            for (uint32_t ih : {12})
                for (uint32_t iw : {16})
                    for (uint32_t ic : {4})
                        for (uint32_t oc : {4})
                            for (uint32_t fd : {2, 5})
                                for (uint32_t pd : {2})
                                    for (uint32_t sd : {1})
                                        for (uint32_t dd : {1}) {
                                            checker.set_param(gconv_param(
                                                           {M, pd, pd, pd, sd, sd,
                                                            sd, dd, dd, dd}))
                                                    .execs({{n, ic, id, ih, iw},
                                                            {ic, oc, 1, fd, fd, fd},
                                                            {}});
                                        }
}
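
// For BackwardData the execs triples are, in order, (filter, output gradient,
// input gradient); the filter keeps the {IC, CHL_MUL, 1, FD, FH, FW} group layout.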
TEST_F(CUDA, CHANWISE_CONVOLUTION3D_BACKWARD_DATA) {
    Checker<Convolution3DBackwardData> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(
            AlgoChecker<Convolution3DBackwardData>("CHANNEL_WISE", &require_algo));

    checker.set_param(gconv_param({M, 0, 0, 0, 1, 1, 1}))
            .execs({{1, 1, 1, 2, 2, 2}, {1, 1, 1, 1, 1}, {1, 1, 2, 2, 2}})
            .execs({{1, 1, 1, 2, 2, 2}, {1, 1, 5, 5, 5}, {1, 1, 6, 6, 6}});

    require_algo = true;
    checker.execs({{2, 1, 1, 2, 2, 2}, {1, 2, 1, 1, 1}, {1, 2, 2, 2, 2}})
            .execs({{2, 1, 1, 2, 2, 2}, {1, 2, 5, 5, 5}, {1, 2, 6, 6, 6}})
            .execs({{2, 3, 1, 2, 2, 2}, {2, 6, 5, 5, 5}, {2, 2, 6, 6, 6}});

    checker.set_param(gconv_param({M, 1, 1, 1, 1, 1, 1}))
            .execs({{2, 1, 1, 2, 2, 2}, {2, 2, 5, 5, 5}, {2, 2, 4, 4, 4}});

    checker.set_param(gconv_param({M, 2, 3, 3, 2, 1, 1}))
            .execs({{12, 2, 1, 4, 5, 5}, {32, 24, 20, 10, 10}, {32, 12, 39, 8, 8}});

    // padding larger than kern
    checker.set_param(gconv_param({M, 20, 30, 20, 4, 5, 4}))
            .execs({{6, 2, 1, 4, 5, 4}, {32, 12, 10, 12, 10}, {32, 6, 2, 3, 2}});
}
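
// For BackwardFilter the execs triples are (input, output gradient, filter
// gradient), again with the filter gradient in the group layout.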
TEST_F(CUDA, CHANWISE_CONVOLUTION3D_BACKWARD_FILTER) {
    Checker<Convolution3DBackwardFilter> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(
            AlgoChecker<Convolution3DBackwardFilter>("CHANNEL_WISE", &require_algo));

    checker.set_param(gconv_param({M, 0, 0, 0, 1, 1, 1}))
            .execs({{1, 1, 2, 2, 2}, {1, 1, 1, 1, 1}, {1, 1, 1, 2, 2, 2}})
            .execs({{1, 1, 6, 6, 6}, {1, 1, 5, 5, 5}, {1, 1, 1, 2, 2, 2}})
            .execs({{256, 1, 2, 2, 2}, {256, 1, 1, 1, 1}, {1, 1, 1, 2, 2, 2}});

    require_algo = true;
    checker.execs({{1, 2, 2, 2, 2}, {1, 2, 1, 1, 1}, {2, 1, 1, 2, 2, 2}})
            .execs({{1, 2, 6, 6, 6}, {1, 2, 5, 5, 5}, {2, 1, 1, 2, 2, 2}})
            .execs({{2, 2, 6, 6, 6}, {2, 6, 5, 5, 5}, {2, 3, 1, 2, 2, 2}});

    checker.set_param(gconv_param({M, 1, 1, 1, 1, 1, 1}))
            .execs({{2, 2, 4, 4, 4}, {2, 2, 5, 5, 5}, {2, 1, 1, 2, 2, 2}});

    require_algo = false;
    checker.set_param(gconv_param({M, 0, 0, 0, 1, 1, 1}))
            .execs({{40960, 1, 1, 1, 1}, {40960, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}});

    require_algo = true;
    checker.set_param(gconv_param({M, 2, 3, 2, 2, 1, 2}))
            .execs({{32, 12, 39, 8, 20}, {32, 36, 20, 10, 10}, {12, 3, 1, 4, 5, 6}});

    // padding larger than kern
    checker.set_param(gconv_param({M, 20, 30, 30, 4, 5, 5}))
            .execs({{32, 6, 2, 3, 3}, {32, 12, 10, 12, 12}, {6, 2, 1, 4, 5, 5}});

    // unused filter items
    checker.set_param(gconv_param({M, 2, 3, 3, 2, 3, 3}))
            .execs({{32, 6, 1, 1, 1}, {32, 12, 1, 1, 1}, {6, 2, 1, 5, 7, 7}});
}
#if MEGDNN_WITH_BENCHMARK
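// The BenchmarkEnv template arguments permute (src, flt, dst) into each
// operator's operand order: <0, 1, 2> for forward (src, flt, dst),
// <1, 2, 0> for backward-data (flt, diff, grad), and <0, 2, 1> for
// backward-filter (src, diff, grad).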
TEST_F(CUDA, CHANWISE_CONVOLUTION3D_FORWARD_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<Convolution3DForward>();
    auto conv1 = handle->create_operator<Convolution3DForward>();
    BenchmarkEnv<0, 1, 2> benv(handle, handle_cpu);

    auto run = [&](size_t N, size_t IC, size_t ID, size_t IH, size_t IW,
                   size_t CHL_MUL, size_t FD, size_t FH, size_t FW, size_t PD,
                   size_t PH, size_t PW) {
        benv.alloc(N, IC, ID, IH, IW, CHL_MUL, FD, FH, FW, PD, PH, PW);
        benv.fill_src();
        benv.fill_flt();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_dst();
    };

    run(64, 30, 10, 10, 10, 1, 3, 3, 3, 1, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 9, 9, 9, 2, 5, 5, 5, 2, 2, 2);
        run(64, 64, 30, 30, 30, 2, 3, 3, 3, 1, 1, 1);
        run(1, 2048, 4, 4, 4, 2, 3, 3, 3, 1, 1, 1);
    }
}
TEST_F(CUDA, CHANWISE_CONVOLUTION3D_BWD_DATA_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<Convolution3DBackwardData>();
    auto conv1 = handle->create_operator<Convolution3DBackwardData>();
    BenchmarkEnv<1, 2, 0> benv(handle, handle_cpu);

    auto run = [&](size_t N, size_t IC, size_t ID, size_t IH, size_t IW,
                   size_t CHL_MUL, size_t FD, size_t FH, size_t FW, size_t PD,
                   size_t PH, size_t PW) {
        benv.alloc(N, IC, ID, IH, IW, CHL_MUL, FD, FH, FW, PD, PH, PW);
        benv.fill_dst();
        benv.fill_flt();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_src();
    };

    run(64, 60, 50, 50, 50, 1, 3, 3, 3, 1, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 18, 2, 5, 5, 5, 2, 2, 2);
        run(64, 64, 32, 32, 32, 2, 3, 3, 3, 1, 1, 1);
        run(1, 2048, 4, 4, 4, 2, 3, 3, 3, 1, 1, 1);
    }
}
TEST_F(CUDA, CHANWISE_CONVOLUTION3D_BWD_FILTER_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<Convolution3DBackwardFilter>();
    auto conv1 = handle->create_operator<Convolution3DBackwardFilter>();
    BenchmarkEnv<0, 2, 1> benv(handle, handle_cpu);

    auto run = [&](size_t N, size_t IC, size_t ID, size_t IH, size_t IW,
                   size_t CHL_MUL, size_t FD, size_t FH, size_t FW, size_t PD,
                   size_t PH, size_t PW) {
        benv.alloc(N, IC, ID, IH, IW, CHL_MUL, FD, FH, FW, PD, PH, PW);
        benv.fill_src();
        benv.fill_dst();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_flt();
    };

    run(67, 729, 20, 20, 20, 1, 3, 3, 3, 1, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 18, 2, 5, 5, 5, 2, 2, 2);
        // the case below is a sample that selects an unexpected algo_1
        run(64, 64, 32, 32, 32, 2, 3, 3, 3, 1, 1, 1);
        run(1, 2048, 4, 4, 4, 2, 3, 3, 3, 1, 1, 1);
    }
}
#endif

// vim: syntax=cpp.doxygen

The MegEngine installation package bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU/GPU build to choose between. To run GPU programs, make sure the machine has GPU hardware with the driver installed. If you would like to try deep-learning development on a cloud GPU computing platform, you are welcome to visit the MegStudio platform.