
chanwise_convolution3d.cpp

#include "megdnn/oprs/nn.h"
#include "megcore_cuda.h"
#include "test/common/checker.h"
#include "test/common/convolution3d.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/cuda/fixture.h"

#include <cuda_profiler_api.h>
#include <cuda_runtime_api.h>

using namespace megdnn;
using namespace test;

namespace {

#if MEGDNN_WITH_BENCHMARK
bool check_need_full_bench() {
    if (getenv("MEGDNN_CHANWISE_CONV3D_FULLBENCH"))
        return true;
    printf("set MEGDNN_CHANWISE_CONV3D_FULLBENCH to run full benchmark\n");
    return false;
}
#endif
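
// Return a copy of the given param with the sparse mode forced to GROUP,
// which is how channel-wise (depthwise) 3d convolution is expressed in these
// tests.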
Convolution3D::Param gconv_param(Convolution3D::Param p) {
    p.sparse = Convolution3D::Param::Sparse::GROUP;
    return p;
}
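
// Benchmark helper: runs the same problem through two operators, opr0 with a
// dense filter (flt0) and opr1 with the equivalent group/channel-wise filter
// (flt1), and compares the results. The template parameters P0..P2 choose how
// the (src, filter, dst) triple is permuted into the operator's (a, b, c)
// arguments, so the same class drives the forward, backward-data and
// backward-filter benchmarks.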
template <int P0, int P1, int P2>
class BenchmarkEnv {
    Handle *handle, *handle_cpu;
    std::unique_ptr<GaussianRNG> rng;
    TensorLayout lsrc, lflt0, lflt1, ldst;
    std::unique_ptr<Tensor<>> src0, src1, flt0, flt0_cpu, flt1, flt1_cpu, dst0, dst1;
    cudaEvent_t cuda_ev[3];
    cudaStream_t cuda_stream;
    size_t pad_d, pad_h, pad_w;
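
    // reorder (src, filter, dst) according to P0..P2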
    template <typename T>
    static std::tuple<T, T, T> shuffle(std::tuple<T, T, T> data) {
        return std::make_tuple(
                std::get<P0>(data), std::get<P1>(data), std::get<P2>(data));
    }

public:
    BenchmarkEnv(Handle* handle, Handle* handle_cpu) {
        this->handle = handle;
        this->handle_cpu = handle_cpu;
        rng = handle->create_operator<GaussianRNG>();
        // exercise the cpu handle once so it is actually initialized
        handle_cpu->create_operator<Sleep>()->exec();
        for (int i = 0; i < 3; ++i)
            cudaEventCreate(&cuda_ev[i]);
        megcoreGetCUDAStream(handle->megcore_computing_handle(), &cuda_stream);
    }

    ~BenchmarkEnv() {
        for (int i = 0; i < 3; ++i)
            cudaEventDestroy(cuda_ev[i]);
    }
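
    // Allocate all tensors for one problem size. lflt0 is the dense filter
    // layout {CHL_MUL * IC, IC, FD, FH, FW}; lflt1 is the group filter layout
    // {IC, CHL_MUL, 1, FD, FH, FW} used by the channel-wise operator. The
    // output shape assumes stride 1.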
    void alloc(
            size_t N, size_t IC, size_t ID, size_t IH, size_t IW, size_t CHL_MUL,
            size_t FD, size_t FH, size_t FW, size_t PD, size_t PH, size_t PW) {
        pad_d = PD;
        pad_h = PH;
        pad_w = PW;
        auto mkly = [](const TensorShape& s) {
            return TensorLayout{s, dtype::Float32()};
        };
        lsrc = mkly({N, IC, ID, IH, IW});
        lflt0 = mkly({CHL_MUL * IC, IC, FD, FH, FW});
        lflt1 = mkly({IC, CHL_MUL, 1, FD, FH, FW});
        ldst =
                mkly({N, IC * CHL_MUL, ID - FD + 1 + PD * 2, IH - FH + 1 + PH * 2,
                      IW - FW + 1 + PW * 2});
        src0.reset(new Tensor<>(handle, lsrc));
        src1.reset(new Tensor<>(handle, lsrc));
        flt0.reset(new Tensor<>(handle, lflt0));
        flt0_cpu.reset(new Tensor<>(handle_cpu, lflt0));
        flt1.reset(new Tensor<>(handle, lflt1));
        flt1_cpu.reset(new Tensor<>(handle_cpu, lflt1));
        dst0.reset(new Tensor<>(handle, ldst));
        dst1.reset(new Tensor<>(handle, ldst));
    }

    void fill_src() {
        rng->exec(src0->tensornd(), {});
        megdnn_memcpy_D2D(handle, src1->ptr(), src0->ptr(), lsrc.span().dist_byte());
    }
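
    // Randomize the group filter flt1, then expand it on the cpu into the
    // equivalent dense filter flt0: every entry is zero except the diagonal
    // blocks that connect input channel i to its own CHL_MUL outputs.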
    void fill_flt() {
        rng->exec(flt1->tensornd(), {});
        megdnn_memcpy_D2H(
                handle, flt1_cpu->ptr(), flt1->ptr(), lflt1.span().dist_byte());

        const size_t IC = lflt1[0], CHL_MUL = lflt1[1],
                     FSIZE = lflt1[3] * lflt1[4] * lflt1[5];

        // fill flt0 from flt1
        float* src = flt1_cpu->ptr();
        float* dst = flt0_cpu->ptr();
        memset(dst, 0, lflt0.span().dist_byte());
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                memcpy(dst + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                       src + (i * CHL_MUL + j) * FSIZE, FSIZE * sizeof(float));
            }
        }

        megdnn_memcpy_H2D(handle, flt0->ptr(), dst, lflt0.span().dist_byte());
    }

    void fill_dst() {
        rng->exec(dst0->tensornd(), {});
        megdnn_memcpy_D2D(handle, dst1->ptr(), dst0->ptr(), ldst.span().dist_byte());
    }
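
    // Run opr0 (dense) and opr1 (group/channel-wise) back to back on the same
    // cuda stream, timing each with cuda events; the timings are printed when
    // MEGDNN_CHANWISE_CONV3D_VERBOSE or MEGDNN_CHANWISE_CONV3D_FULLBENCH is
    // set.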
    template <class Opr>
    void exec(Opr* opr0, Opr* opr1) {
        opr0->param().pad_d = pad_d;
        opr0->param().pad_h = pad_h;
        opr0->param().pad_w = pad_w;
        opr1->param() = opr0->param();
        opr1->param().sparse = param::Convolution3D::Sparse::GROUP;
        TensorND a0, b0, c0, a1, b1, c1;
        std::tie(a0, b0, c0) = shuffle(
                std::make_tuple(src0->tensornd(), flt0->tensornd(), dst0->tensornd()));
        std::tie(a1, b1, c1) = shuffle(
                std::make_tuple(src1->tensornd(), flt1->tensornd(), dst1->tensornd()));
        WorkspaceWrapper wk(
                handle,
                std::max(
                        opr0->get_workspace_in_bytes(a0.layout, b0.layout, c0.layout),
                        opr1->get_workspace_in_bytes(a1.layout, b1.layout, c1.layout)));
        cudaProfilerStart();
        cudaEventRecord(cuda_ev[0], cuda_stream);
        opr0->exec(a0, b0, c0, wk.workspace());
        cudaEventRecord(cuda_ev[1], cuda_stream);
        opr1->exec(a1, b1, c1, wk.workspace());
        cudaEventRecord(cuda_ev[2], cuda_stream);
        cudaProfilerStop();
        if (getenv("MEGDNN_CHANWISE_CONV3D_VERBOSE") ||
            getenv("MEGDNN_CHANWISE_CONV3D_FULLBENCH")) {
            cudaStreamSynchronize(cuda_stream);
            float t0 = -1, t1 = -1;
            cudaEventElapsedTime(&t0, cuda_ev[0], cuda_ev[1]);
            cudaEventElapsedTime(&t1, cuda_ev[1], cuda_ev[2]);
            printf("%s;%s;%s: cudnn/megdnn: %.3fms/%.3fms=%.3f\n",
                   lsrc.TensorShape::to_string().c_str(),
                   lflt1.TensorShape::to_string().c_str(),
                   ldst.TensorShape::to_string().c_str(), t0, t1, t0 / t1);
        }
    }
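
    // The cmp_* helpers copy both results back to the cpu handle and assert
    // that they agree.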
    void cmp_dst() {
        Tensor<> dst0_cpu(handle_cpu, ldst), dst1_cpu(handle_cpu, ldst);
        megdnn_memcpy_D2H(handle, dst0_cpu.ptr(), dst0->ptr(), ldst.span().dist_byte());
        megdnn_memcpy_D2H(handle, dst1_cpu.ptr(), dst1->ptr(), ldst.span().dist_byte());
        dst0_cpu.check_with(dst1_cpu);
    }

    void cmp_src() {
        Tensor<> src0_cpu(handle_cpu, lsrc), src1_cpu(handle_cpu, lsrc);
        megdnn_memcpy_D2H(handle, src0_cpu.ptr(), src0->ptr(), lsrc.span().dist_byte());
        megdnn_memcpy_D2H(handle, src1_cpu.ptr(), src1->ptr(), lsrc.span().dist_byte());
        src0_cpu.check_with(src1_cpu);
    }
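
    // Filter gradients are compared block by block: only the diagonal blocks
    // of the dense filter flt0 correspond to entries of the group filter flt1.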
    void cmp_flt() {
        Tensor<> flt0_cpu(handle_cpu, lflt0), flt1_cpu(handle_cpu, lflt1);
        float* p0 = flt0_cpu.ptr();
        float* p1 = flt1_cpu.ptr();
        megdnn_memcpy_D2H(handle, p0, flt0->ptr(), lflt0.span().dist_byte());
        megdnn_memcpy_D2H(handle, p1, flt1->ptr(), lflt1.span().dist_byte());

        size_t IC = lflt1[0], CHL_MUL = lflt1[1],
               FSIZE = lflt1[3] * lflt1[4] * lflt1[5];
        double tot_err = 0, tot_err_num = 0;
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                auto t0 = p0 + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                     t1 = p1 + (i * CHL_MUL + j) * FSIZE;
                for (size_t k = 0; k < FSIZE; ++k) {
                    auto err = std::abs(diff(t0[k], t1[k]));
                    tot_err += err;
                    tot_err_num += 1;
                    ASSERT_LT(err, 1e-2) << "failed at " << i << " " << j << " " << k
                                         << " vals=" << t0[k] << "," << t1[k];
                }
            }
        }
        auto avg_err = tot_err / tot_err_num;
        ASSERT_LT(avg_err, 1e-4);
    }
};

}  // anonymous namespace
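
// Correctness tests: each run pins the CHANNEL_WISE algorithm through
// AlgoChecker and lets the Checker compare the cuda result against the
// reference implementation.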
constexpr auto M = Convolution3D::Mode::CROSS_CORRELATION;

TEST_F(CUDA, CHANWISE_CONVOLUTION3D_FORWARD) {
    constexpr auto M = Convolution3D::Mode::CROSS_CORRELATION;
    Checker<Convolution3D> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(
            AlgoChecker<Convolution3DForward>("CHANNEL_WISE", &require_algo));
    checker.set_param(gconv_param({M, 0, 0, 0, 1, 1, 1}))
            .execs({{1, 1, 2, 2, 2}, {1, 1, 1, 2, 2, 2}, {}})
            .execs({{1, 1, 5, 5, 5}, {1, 1, 1, 2, 2, 2}, {}});
    checker.set_param(gconv_param({M, 0, 0, 0, 1, 1, 1}))
            .execs({{1, 2, 2, 2, 2}, {2, 1, 1, 2, 2, 2}, {}})
            .execs({{1, 2, 5, 5, 5}, {2, 1, 1, 2, 2, 2}, {}})
            .execs({{2, 2, 5, 5, 5}, {2, 3, 1, 2, 2, 2}, {2, 6, 4, 4, 4}});
    checker.set_param(gconv_param({M, 1, 1, 1, 1, 1, 1}))
            .execs({{2, 2, 5, 5, 5}, {2, 1, 1, 2, 2, 2}, {}});
    checker.set_param(gconv_param({M, 2, 3, 3, 2, 1, 1}))
            .execs({{4, 12, 10, 5, 10}, {12, 2, 1, 4, 5, 5}, {}});
    // padding larger than kern
    checker.set_param(gconv_param({M, 10, 15, 15, 4, 5, 5}))
            .execs({{4, 12, 10, 5, 10}, {12, 2, 1, 4, 5, 5}, {}});
    for (uint32_t n : {8, 12})
        for (uint32_t id : {12})
            for (uint32_t ih : {12})
                for (uint32_t iw : {16})
                    for (uint32_t ic : {4})
                        for (uint32_t oc : {4})
                            for (uint32_t fd : {2, 5})
                                for (uint32_t pd : {2})
                                    for (uint32_t sd : {1})
                                        for (uint32_t dd : {1}) {
                                            checker
                                                    .set_param(gconv_param(
                                                            {M, pd, pd, pd, sd, sd, sd,
                                                             dd, dd, dd}))
                                                    .execs({{n, ic, id, ih, iw},
                                                            {ic, oc, 1, fd, fd, fd},
                                                            {}});
                                        }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION3D_BACKWARD_DATA) {
    Checker<Convolution3DBackwardData> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(
            AlgoChecker<Convolution3DBackwardData>("CHANNEL_WISE", &require_algo));
    checker.set_param(gconv_param({M, 0, 0, 0, 1, 1, 1}))
            .execs({{1, 1, 1, 2, 2, 2}, {1, 1, 1, 1, 1}, {1, 1, 2, 2, 2}})
            .execs({{1, 1, 1, 2, 2, 2}, {1, 1, 5, 5, 5}, {1, 1, 6, 6, 6}});
    require_algo = true;
    checker.execs({{2, 1, 1, 2, 2, 2}, {1, 2, 1, 1, 1}, {1, 2, 2, 2, 2}})
            .execs({{2, 1, 1, 2, 2, 2}, {1, 2, 5, 5, 5}, {1, 2, 6, 6, 6}})
            .execs({{2, 3, 1, 2, 2, 2}, {2, 6, 5, 5, 5}, {2, 2, 6, 6, 6}});
    checker.set_param(gconv_param({M, 1, 1, 1, 1, 1, 1}))
            .execs({{2, 1, 1, 2, 2, 2}, {2, 2, 5, 5, 5}, {2, 2, 4, 4, 4}});
    checker.set_param(gconv_param({M, 2, 3, 3, 2, 1, 1}))
            .execs({{12, 2, 1, 4, 5, 5}, {32, 24, 20, 10, 10}, {32, 12, 39, 8, 8}});
    // padding larger than kern
    checker.set_param(gconv_param({M, 20, 30, 20, 4, 5, 4}))
            .execs({{6, 2, 1, 4, 5, 4}, {32, 12, 10, 12, 10}, {32, 6, 2, 3, 2}});
}

TEST_F(CUDA, CHANWISE_CONVOLUTION3D_BACKWARD_FILTER) {
    Checker<Convolution3DBackwardFilter> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(
            AlgoChecker<Convolution3DBackwardFilter>("CHANNEL_WISE", &require_algo));
    checker.set_param(gconv_param({M, 0, 0, 0, 1, 1, 1}))
            .execs({{1, 1, 2, 2, 2}, {1, 1, 1, 1, 1}, {1, 1, 1, 2, 2, 2}})
            .execs({{1, 1, 6, 6, 6}, {1, 1, 5, 5, 5}, {1, 1, 1, 2, 2, 2}})
            .execs({{256, 1, 2, 2, 2}, {256, 1, 1, 1, 1}, {1, 1, 1, 2, 2, 2}});
    require_algo = true;
    checker.execs({{1, 2, 2, 2, 2}, {1, 2, 1, 1, 1}, {2, 1, 1, 2, 2, 2}})
            .execs({{1, 2, 6, 6, 6}, {1, 2, 5, 5, 5}, {2, 1, 1, 2, 2, 2}})
            .execs({{2, 2, 6, 6, 6}, {2, 6, 5, 5, 5}, {2, 3, 1, 2, 2, 2}});
    checker.set_param(gconv_param({M, 1, 1, 1, 1, 1, 1}))
            .execs({{2, 2, 4, 4, 4}, {2, 2, 5, 5, 5}, {2, 1, 1, 2, 2, 2}});
    require_algo = false;
    checker.set_param(gconv_param({M, 0, 0, 0, 1, 1, 1}))
            .execs({{40960, 1, 1, 1, 1}, {40960, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}});
    require_algo = true;
    checker.set_param(gconv_param({M, 2, 3, 2, 2, 1, 2}))
            .execs({{32, 12, 39, 8, 20}, {32, 36, 20, 10, 10}, {12, 3, 1, 4, 5, 6}});
    // padding larger than kern
    checker.set_param(gconv_param({M, 20, 30, 30, 4, 5, 5}))
            .execs({{32, 6, 2, 3, 3}, {32, 12, 10, 12, 12}, {6, 2, 1, 4, 5, 5}});
    // unused filter items
    checker.set_param(gconv_param({M, 2, 3, 3, 2, 3, 3}))
            .execs({{32, 6, 1, 1, 1}, {32, 12, 1, 1, 1}, {6, 2, 1, 5, 7, 7}});
}
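
// Benchmark-check tests: time a dense convolution (labelled "cudnn" in the
// printout) against the channel-wise implementation on the same data and
// verify that both produce matching results. Only built with
// MEGDNN_WITH_BENCHMARK.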

#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, CHANWISE_CONVOLUTION3D_FORWARD_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<Convolution3DForward>();
    auto conv1 = handle->create_operator<Convolution3DForward>();
    BenchmarkEnv<0, 1, 2> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t ID, size_t IH, size_t IW, size_t CHL_MUL,
                   size_t FD, size_t FH, size_t FW, size_t PD, size_t PH, size_t PW) {
        benv.alloc(N, IC, ID, IH, IW, CHL_MUL, FD, FH, FW, PD, PH, PW);
        benv.fill_src();
        benv.fill_flt();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_dst();
    };
    run(64, 30, 10, 10, 10, 1, 3, 3, 3, 1, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 9, 9, 9, 2, 5, 5, 5, 2, 2, 2);
        run(64, 64, 30, 30, 30, 2, 3, 3, 3, 1, 1, 1);
        run(1, 2048, 4, 4, 4, 2, 3, 3, 3, 1, 1, 1);
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION3D_BWD_DATA_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<Convolution3DBackwardData>();
    auto conv1 = handle->create_operator<Convolution3DBackwardData>();
    BenchmarkEnv<1, 2, 0> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t ID, size_t IH, size_t IW, size_t CHL_MUL,
                   size_t FD, size_t FH, size_t FW, size_t PD, size_t PH, size_t PW) {
        benv.alloc(N, ID, IC, IH, IW, CHL_MUL, FD, FH, FW, PD, PH, PW);
        benv.fill_dst();
        benv.fill_flt();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_src();
    };
    run(64, 60, 50, 50, 50, 1, 3, 3, 3, 1, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 18, 2, 5, 5, 5, 2, 2, 2);
        run(64, 64, 32, 32, 32, 2, 3, 3, 3, 1, 1, 1);
        run(1, 2048, 4, 4, 4, 2, 3, 3, 3, 1, 1, 1);
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION3D_BWD_FILTER_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<Convolution3DBackwardFilter>();
    auto conv1 = handle->create_operator<Convolution3DBackwardFilter>();
    BenchmarkEnv<0, 2, 1> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t ID, size_t IH, size_t IW, size_t CHL_MUL,
                   size_t FD, size_t FH, size_t FW, size_t PD, size_t PH, size_t PW) {
        benv.alloc(N, IC, ID, IH, IW, CHL_MUL, FD, FH, FW, PD, PH, PW);
        benv.fill_src();
        benv.fill_dst();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_flt();
    };
    run(67, 729, 20, 20, 20, 1, 3, 3, 3, 1, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 18, 2, 5, 5, 5, 2, 2, 2);
        // the case below is a sample that selects an unexpected algo_1
        run(64, 64, 32, 32, 32, 2, 3, 3, 3, 1, 1, 1);
        run(1, 2048, 4, 4, 4, 2, 3, 3, 3, 1, 1, 1);
    }
}
#endif

// vim: syntax=cpp.doxygen