
// chanwise_convolution.cpp

#include "megdnn/oprs/nn.h"

#include "cuda.h"
#include "megcore_cuda.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/convolution.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h"
#include "test/cuda/utils.h"

#include <cuda_profiler_api.h>
#include <cuda_runtime_api.h>

using namespace megdnn;
using namespace test;

namespace {

#if MEGDNN_WITH_BENCHMARK
bool check_need_full_bench() {
    if (getenv("MEGDNN_CHANWISE_CONV_FULLBENCH"))
        return true;
    printf("set MEGDNN_CHANWISE_CONV_FULLBENCH to run full benchmark\n");
    return false;
}
#endif

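// Environment variables used by this file:
//   MEGDNN_CHANWISE_CONV_FULLBENCH -- also run the large benchmark shapes;
//   MEGDNN_CHANWISE_CONV_VERBOSE   -- print per-run cudnn/megdnn timings.
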
//! turn \p p into a group convolution param: channel-wise convolution is
//! expressed as a group convolution whose group count equals the number of
//! input channels; with \p io16xc32, fp16 I/O is computed in fp32
Convolution::Param gconv_param(Convolution::Param p, bool io16xc32 = false) {
    p.sparse = Convolution::Param::Sparse::GROUP;
    if (io16xc32)
        p.compute_mode = Convolution::Param::ComputeMode::FLOAT32;
    return p;
}

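// BenchmarkEnv runs the same problem through two operators and compares the
// results: opr0 uses the dense filter flt0 and opr1 the group (channel-wise)
// filter flt1. The template parameters P0/P1/P2 permute (src, flt, dst) into
// the operand order of the operator under test: <0, 1, 2> for forward,
// <1, 2, 0> for backward data and <0, 2, 1> for backward filter, matching
// the instantiations in the benchmark tests below.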
template <int P0, int P1, int P2>
class BenchmarkEnv {
    Handle *handle, *handle_cpu;
    std::unique_ptr<GaussianRNG> rng;
    TensorLayout lsrc, lflt0, lflt1, ldst;
    std::unique_ptr<Tensor<>> src0, src1, flt0, flt0_cpu, flt1, flt1_cpu, dst0, dst1;
    cudaEvent_t cuda_ev[3];
    cudaStream_t cuda_stream;
    size_t pad_h, pad_w;

    template <typename T>
    static std::tuple<T, T, T> shuffle(std::tuple<T, T, T> data) {
        return std::make_tuple(
                std::get<P0>(data), std::get<P1>(data), std::get<P2>(data));
    }

public:
    BenchmarkEnv(Handle* handle, Handle* handle_cpu) {
        this->handle = handle;
        this->handle_cpu = handle_cpu;
        rng = handle->create_operator<GaussianRNG>();
        // make cpu handle used
        handle_cpu->create_operator<Sleep>()->exec();
        for (int i = 0; i < 3; ++i)
            cudaEventCreate(&cuda_ev[i]);
        megcoreGetCUDAStream(handle->megcore_computing_handle(), &cuda_stream);
    }

    ~BenchmarkEnv() {
        for (int i = 0; i < 3; ++i)
            cudaEventDestroy(cuda_ev[i]);
    }

    void alloc(
            size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL, size_t FH,
            size_t FW, size_t PH, size_t PW) {
        pad_h = PH;
        pad_w = PW;
        auto mkly = [](const TensorShape& s) {
            return TensorLayout{s, dtype::Float32()};
        };
        lsrc = mkly({N, IC, IH, IW});
        lflt0 = mkly({CHL_MUL * IC, IC, FH, FW});
        lflt1 = mkly({IC, CHL_MUL, 1, FH, FW});
        ldst = mkly({N, IC * CHL_MUL, IH - FH + 1 + PH * 2, IW - FW + 1 + PW * 2});
        src0.reset(new Tensor<>(handle, lsrc));
        src1.reset(new Tensor<>(handle, lsrc));
        flt0.reset(new Tensor<>(handle, lflt0));
        flt0_cpu.reset(new Tensor<>(handle_cpu, lflt0));
        flt1.reset(new Tensor<>(handle, lflt1));
        flt1_cpu.reset(new Tensor<>(handle_cpu, lflt1));
        dst0.reset(new Tensor<>(handle, ldst));
        dst1.reset(new Tensor<>(handle, ldst));
    }

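    // lflt0 is the dense filter layout {CHL_MUL * IC, IC, FH, FW}; lflt1 is
    // the group (channel-wise) layout {IC, CHL_MUL, 1, FH, FW}. fill_flt()
    // expands flt1 into flt0 so both operators compute the same convolution.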
    void fill_src() {
        rng->exec(src0->tensornd(), {});
        megdnn_memcpy_D2D(handle, src1->ptr(), src0->ptr(), lsrc.span().dist_byte());
    }

    void fill_flt() {
        rng->exec(flt1->tensornd(), {});
        megdnn_memcpy_D2H(
                handle, flt1_cpu->ptr(), flt1->ptr(), lflt1.span().dist_byte());

        const size_t IC = lflt1[0], CHL_MUL = lflt1[1], FSIZE = lflt1[3] * lflt1[4];

        // fill flt0 from flt1: the dense filter is zero except for the block
        // that connects output channel (i * CHL_MUL + j) to input channel i
        float* src = flt1_cpu->ptr();
        float* dst = flt0_cpu->ptr();
        memset(dst, 0, lflt0.span().dist_byte());
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                memcpy(dst + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                       src + (i * CHL_MUL + j) * FSIZE, FSIZE * sizeof(float));
            }
        }

        megdnn_memcpy_H2D(handle, flt0->ptr(), dst, lflt0.span().dist_byte());
    }

    void fill_dst() {
        rng->exec(dst0->tensornd(), {});
        megdnn_memcpy_D2D(handle, dst1->ptr(), dst0->ptr(), ldst.span().dist_byte());
    }

    template <class Opr>
    void exec(Opr* opr0, Opr* opr1) {
        opr0->param().pad_h = pad_h;
        opr0->param().pad_w = pad_w;
        opr1->param() = opr0->param();
        opr1->param().sparse = param::Convolution::Sparse::GROUP;
        TensorND a0, b0, c0, a1, b1, c1;
        std::tie(a0, b0, c0) = shuffle(
                std::make_tuple(src0->tensornd(), flt0->tensornd(), dst0->tensornd()));
        std::tie(a1, b1, c1) = shuffle(
                std::make_tuple(src1->tensornd(), flt1->tensornd(), dst1->tensornd()));
        WorkspaceWrapper wk(
                handle,
                std::max(
                        opr0->get_workspace_in_bytes(a0.layout, b0.layout, c0.layout),
                        opr1->get_workspace_in_bytes(a1.layout, b1.layout, c1.layout)));
        cudaProfilerStart();
        cudaEventRecord(cuda_ev[0], cuda_stream);
        opr0->exec(a0, b0, c0, wk.workspace());
        cudaEventRecord(cuda_ev[1], cuda_stream);
        opr1->exec(a1, b1, c1, wk.workspace());
        cudaEventRecord(cuda_ev[2], cuda_stream);
        cudaProfilerStop();
        if (getenv("MEGDNN_CHANWISE_CONV_VERBOSE") ||
            getenv("MEGDNN_CHANWISE_CONV_FULLBENCH")) {
            cudaStreamSynchronize(cuda_stream);
            float t0 = -1, t1 = -1;
            cudaEventElapsedTime(&t0, cuda_ev[0], cuda_ev[1]);
            cudaEventElapsedTime(&t1, cuda_ev[1], cuda_ev[2]);
            printf("%s;%s;%s: cudnn/megdnn: %.3fms/%.3fms=%.3f\n",
                   lsrc.TensorShape::to_string().c_str(),
                   lflt1.TensorShape::to_string().c_str(),
                   ldst.TensorShape::to_string().c_str(), t0, t1, t0 / t1);
        }
    }

    //! special for weight preprocess
    void exec_convolution(ConvolutionForward* opr0, ConvolutionForward* opr1) {
        opr0->param().pad_h = pad_h;
        opr0->param().pad_w = pad_w;
        opr1->param() = opr0->param();
        opr1->param().sparse = param::Convolution::Sparse::GROUP;
        TensorND a0, b0, c0, a1, b1, c1;
        std::tie(a0, b0, c0) = shuffle(
                std::make_tuple(src0->tensornd(), flt0->tensornd(), dst0->tensornd()));
        std::tie(a1, b1, c1) = shuffle(
                std::make_tuple(src1->tensornd(), flt1->tensornd(), dst1->tensornd()));
        WorkspaceWrapper wk(
                handle, std::max(
                                opr0->get_workspace_in_bytes(
                                        a0.layout, b0.layout, c0.layout, nullptr),
                                opr1->get_workspace_in_bytes(
                                        a1.layout, b1.layout, c1.layout, nullptr)));
        cudaProfilerStart();
        cudaEventRecord(cuda_ev[0], cuda_stream);
        opr0->exec(a0, b0, c0, nullptr, wk.workspace());
        cudaEventRecord(cuda_ev[1], cuda_stream);
        opr1->exec(a1, b1, c1, nullptr, wk.workspace());
        cudaEventRecord(cuda_ev[2], cuda_stream);
        cudaProfilerStop();
        if (getenv("MEGDNN_CHANWISE_CONV_VERBOSE") ||
            getenv("MEGDNN_CHANWISE_CONV_FULLBENCH")) {
            cudaStreamSynchronize(cuda_stream);
            float t0 = -1, t1 = -1;
            cudaEventElapsedTime(&t0, cuda_ev[0], cuda_ev[1]);
            cudaEventElapsedTime(&t1, cuda_ev[1], cuda_ev[2]);
            printf("%s;%s;%s: cudnn/megdnn: %.3fms/%.3fms=%.3f\n",
                   lsrc.TensorShape::to_string().c_str(),
                   lflt1.TensorShape::to_string().c_str(),
                   ldst.TensorShape::to_string().c_str(), t0, t1, t0 / t1);
        }
    }

    void cmp_dst() {
        Tensor<> dst0_cpu(handle_cpu, ldst), dst1_cpu(handle_cpu, ldst);
        megdnn_memcpy_D2H(handle, dst0_cpu.ptr(), dst0->ptr(), ldst.span().dist_byte());
        megdnn_memcpy_D2H(handle, dst1_cpu.ptr(), dst1->ptr(), ldst.span().dist_byte());
        dst0_cpu.check_with(dst1_cpu);
    }

    void cmp_src() {
        Tensor<> src0_cpu(handle_cpu, lsrc), src1_cpu(handle_cpu, lsrc);
        megdnn_memcpy_D2H(handle, src0_cpu.ptr(), src0->ptr(), lsrc.span().dist_byte());
        megdnn_memcpy_D2H(handle, src1_cpu.ptr(), src1->ptr(), lsrc.span().dist_byte());
        src0_cpu.check_with(src1_cpu);
    }

    void cmp_flt() {
        Tensor<> flt0_cpu(handle_cpu, lflt0), flt1_cpu(handle_cpu, lflt1);
        float* p0 = flt0_cpu.ptr();
        float* p1 = flt1_cpu.ptr();
        megdnn_memcpy_D2H(handle, p0, flt0->ptr(), lflt0.span().dist_byte());
        megdnn_memcpy_D2H(handle, p1, flt1->ptr(), lflt1.span().dist_byte());

        size_t IC = lflt1[0], CHL_MUL = lflt1[1], FSIZE = lflt1[3] * lflt1[4];

        double tot_err = 0, tot_err_num = 0;
        for (size_t i = 0; i < IC; ++i) {
            for (size_t j = 0; j < CHL_MUL; ++j) {
                // compare the dense block for output channel (i * CHL_MUL + j)
                // against the corresponding channel-wise filter slice
                auto t0 = p0 + ((i * CHL_MUL + j) * IC + i) * FSIZE,
                     t1 = p1 + (i * CHL_MUL + j) * FSIZE;
                for (size_t k = 0; k < FSIZE; ++k) {
                    auto err = std::abs(diff(t0[k], t1[k]));
                    tot_err += err;
                    tot_err_num += 1;
                    ASSERT_LT(err, 1e-2) << "failed at " << i << " " << j << " " << k
                                         << " vals=" << t0[k] << "," << t1[k];
                }
            }
        }
        auto avg_err = tot_err / tot_err_num;
        ASSERT_LT(avg_err, 1e-4);
    }
};

} // anonymous namespace

constexpr auto M = Convolution::Mode::CROSS_CORRELATION;

TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD) {
    Checker<Convolution> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                              "CHANNEL_WISE", {})
                              .c_str(),
                      {}}}},
            &require_algo));
    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-2);
        // simple case
        // clang-format off
        for (uint32_t s : {1, 2})
        for (uint32_t p : {0, 1, 2, 3})
        for (size_t f : {2, 3, 5, 7})
        for (size_t ocpg : {1, 3}) {
            checker.set_param(gconv_param({M, p, p, s, s}))
                    .execs({{2, 3, 16, 16}, {3, ocpg, 1, f, f}, {}});
        }
        // clang-format on
        checker.set_param(gconv_param({M, 2, 3, 2, 1}))
                .execs({{32, 12, 20, 10}, {12, 2, 1, 4, 5}, {}});
        // padding larger than kern
        checker.set_param(gconv_param({M, 20, 30, 4, 5}))
                .execs({{32, 12, 20, 10}, {12, 2, 1, 4, 5}, {}});
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_SMALL) {
    Checker<Convolution> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                              "CHANNEL_WISE_SMALL", {})
                              .c_str(),
                      {}}}},
            &require_algo));
    for (auto dtype : std::vector<DType>{
                 dtype::Float32(),
#if CUDA_VERSION >= 9000
                 dtype::Float16()
#endif
         }) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-2);
        // clang-format off
        for (uint32_t s : {1})
        for (uint32_t f : {1, 3, 5, 7}) {
            checker.set_param(gconv_param({M, f / 2, f / 2, s, s}))
                    .execs({{2, 3, 16, 16}, {3, 1, 1, f, f}, {}});
        }
        // clang-format on
        checker.set_param(gconv_param({M, 1, 1, 1, 1}))
                .execs({{2, 3, 3, 16}, {3, 1, 1, 3, 3}, {}})
                .execs({{2, 3, 8, 3}, {3, 1, 1, 3, 3}, {}});
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA) {
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE", &require_algo));
    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(1e-1);
        // simple case
        // clang-format off
        for (uint32_t s : {1, 2})
        for (uint32_t p : {0, 1, 2, 3})
        for (size_t f : {1, 2, 3, 5, 7})
        for (size_t ocpg : {1, 3}) {
            size_t ii = infer_conv_shape(16, f, s, p, true);
            checker.set_param(gconv_param({M, p, p, s, s}))
                    .execs({{3, ocpg, 1, f, f},
                            {2, 3 * ocpg, ii, ii},
                            {2, 3, 16, 16}});
        }
        // clang-format on
        checker.set_param(gconv_param({M, 2, 3, 2, 1}))
                .execs({{12, 3, 1, 4, 5}, {32, 36, 20, 10}, {32, 12, 39, 8}});
        checker.set_param(gconv_param({M, 30, 20, 5, 4}))
                .execs({{6, 2, 1, 5, 4}, {32, 12, 12, 10}, {32, 6, 3, 2}});
        checker.set_param(gconv_param({M, 20, 30, 4, 5}))
                .execs({{6, 2, 1, 4, 5}, {32, 12, 10, 12}, {32, 6, 2, 3}});
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_SMALL) {
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE_SMALL", &require_algo));
    for (auto dtype : std::vector<DType>{
                 dtype::Float32(),
#if CUDA_VERSION >= 9000
                 dtype::Float16()
#endif
         }) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-2);
        for (uint32_t f : {1, 3, 5, 7}) {
            checker.set_param(gconv_param({M, f / 2, f / 2, 1, 1}))
                    .execs({{3, 1, 1, f, f}, {2, 3, 16, 16}, {2, 3, 16, 16}});
        }
        checker.set_param(gconv_param({M, 1, 1, 1, 1}))
                .execs({{3, 1, 1, 3, 3}, {2, 3, 3, 16}, {2, 3, 3, 16}})
                .execs({{3, 1, 1, 3, 3}, {2, 3, 8, 3}, {2, 3, 8, 3}});
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER) {
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardFilter>("CHANNEL_WISE", &require_algo));
    UniformFloatRNG rng(-0.1, 0.1);
    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
        checker.set_dtype(0, dtype)
                .set_dtype(1, dtype)
                .set_dtype(2, dtype)
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-1);
        // simple case
        // clang-format off
        for (uint32_t s : {1, 2})
        for (uint32_t p : {0, 1, 2, 3})
        for (uint32_t f : {1, 2, 3, 5, 7})
        for (uint32_t ocpg : {1, 3})
        for (uint32_t i : {8, 16, 32, 64}) {
            size_t ii = infer_conv_shape(i, f, s, p, true);
            checker.set_param(gconv_param({M, p, p, s, s}))
                    .execs({{2, 3, i, i},
                            {2, 3 * ocpg, ii, ii},
                            {3, ocpg, 1, f, f}});
        }
        // clang-format on
        // padding larger than kern
        checker.set_param(gconv_param({M, 20, 30, 4, 5}))
                .execs({{32, 6, 2, 3}, {32, 12, 10, 12}, {6, 2, 1, 4, 5}});
        // unused filter items
        checker.set_param(gconv_param({M, 2, 3, 2, 3}))
                .execs({{32, 6, 1, 1}, {32, 12, 1, 1}, {6, 2, 1, 5, 7}});
    }
}

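// For ConvolutionForward the channel-wise kernels are reached through the
// "DEFAULT" convolution algorithm, which forwards to a ConvBias algorithm,
// so the algo checker must be built from an ExecutionPolicyAlgoName (see the
// specialization below); backward data/filter algorithms are matched by name
// directly.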
namespace {

template <typename Op>
struct AlgoCheckerMaker {
    static auto make(const char* name, bool* require_algo) {
        return AlgoChecker<Op>(name, require_algo);
    }
};

template <>
struct AlgoCheckerMaker<ConvolutionForward> {
    static auto make(const char* name, bool* require_algo) {
        return AlgoChecker<ConvolutionForward>(
                ExecutionPolicyAlgoName{
                        "DEFAULT",
                        {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                                  name, {})
                                  .c_str(),
                          {}}}},
                require_algo);
    }
};

template <typename Op>
void check_chanwise(DType io_type, DType comp_type, Handle* handle, const char* name) {
    Checker<Op> checker(handle);
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoCheckerMaker<Op>::make(name, &require_algo));
    checker.set_dtype(0, io_type).set_dtype(1, io_type).set_dtype(2, io_type);
    bool io16xc32 = false;
    if (io_type == dtype::Float16()) {
        if (comp_type == dtype::Float16()) {
            checker.set_epsilon(1e-1);
        } else {
            io16xc32 = true;
        }
    }
    // dispatch testcase by operation
    if (std::is_same<Op, ConvolutionForward>::value) {
        // align 8
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}});
        // align 1
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{8, 2, 15, 15}, {2, 1, 1, 15, 15}, {}});
        // align 2
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{8, 2, 14, 14}, {2, 1, 1, 15, 15}, {}});
        // custom padding
        checker.set_param(gconv_param({M, 3, 3, 1, 1}, io16xc32))
                .execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}});
        // custom stride
        checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32))
                .execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}});
    } else if (std::is_same<Op, ConvolutionBackwardData>::value) {
        // align 8
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{2, 1, 1, 15, 15}, {8, 2, 16, 16}, {8, 2, 16, 16}});
        // align 1
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{2, 1, 1, 15, 15}, {8, 2, 15, 15}, {8, 2, 15, 15}});
        // align 2
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{2, 1, 1, 15, 15}, {8, 2, 14, 14}, {8, 2, 14, 14}});
        // custom padding
        checker.set_param(gconv_param({M, 3, 3, 1, 1}, io16xc32))
                .execs({{2, 1, 1, 15, 15}, {8, 2, 8, 8}, {8, 2, 16, 16}});
        // custom stride
        checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32))
                .execs({{2, 1, 1, 15, 15}, {8, 2, 7, 7}, {8, 2, 14, 14}});
    } else if (std::is_same<Op, ConvolutionBackwardFilter>::value) {
        // align 8
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{8, 2, 16, 16}, {8, 2, 16, 16}, {2, 1, 1, 15, 15}});
        // align 1
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{8, 2, 15, 15}, {8, 2, 15, 15}, {2, 1, 1, 15, 15}});
        // align 2
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{8, 2, 14, 14}, {8, 2, 14, 14}, {2, 1, 1, 15, 15}});
        // custom padding
        checker.set_param(gconv_param({M, 3, 3, 1, 1}, io16xc32))
                .execs({{8, 2, 16, 16}, {8, 2, 8, 8}, {2, 1, 1, 15, 15}});
        // custom stride
        checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32))
                .execs({{8, 2, 14, 14}, {8, 2, 7, 7}, {2, 1, 1, 15, 15}});
    }
}

} // namespace

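// The cb() arguments are (tag, tbm, tbn, tbk, wm, wn, wk): a test tag, the
// CUTLASS threadblock tile (tbm x tbn x tbk) and the warp tile
// (wm x wn x wk), which together select one kernel by its algorithm name,
// e.g. FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_128X128X8_32X64X8_2stage.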
#define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb) \
    cb(1, 128, 128, 8, 32, 64, 8);                          \
    cb(2, 128, 64, 8, 64, 32, 8);                           \
    cb(3, 128, 32, 8, 64, 32, 8);                           \
    cb(4, 64, 128, 8, 32, 64, 8);                           \
    cb(5, 32, 128, 8, 32, 64, 8);                           \
    cb(6, 64, 64, 8, 32, 64, 8);                            \
    cb(7, 32, 64, 8, 32, 64, 8);                            \
    cb(8, 32, 32, 8, 32, 32, 8);                            \
    cb(9, 64, 32, 8, 64, 32, 8);

#define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                       \
    TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_FMA_##tag) {               \
        require_compute_capability(6, 1);                                        \
        check_chanwise<ConvolutionForward>(                                      \
                dtype::Float32(), dtype::Float32(), handle_cuda(),               \
                "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_2stage");                              \
    }
MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb)
#undef cb

#define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                       \
    TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_FMA_##tag) {         \
        require_compute_capability(6, 1);                                        \
        check_chanwise<ConvolutionBackwardData>(                                 \
                dtype::Float32(), dtype::Float32(), handle_cuda(),               \
                "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_2stage");                              \
    }
MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb)
#undef cb

#define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                       \
    TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER_CUTLASS_FMA_##tag) {       \
        require_compute_capability(6, 1);                                        \
        check_chanwise<ConvolutionBackwardFilter>(                               \
                dtype::Float32(), dtype::Float32(), handle_cuda(),               \
                "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_2stage");                              \
    }
MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb)
#undef cb

#undef MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL

#if CUDA_VERSION >= 10010
#define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb) \
    cb(1, 128, 128, 32, 32, 32, 32);                         \
    cb(2, 128, 256, 32, 64, 64, 32);                         \
    cb(3, 128, 64, 32, 32, 32, 32);                          \
    cb(4, 64, 128, 32, 32, 32, 32);                          \
    cb(5, 64, 64, 32, 32, 32, 32);
#else
// the HMMA instructions require CUDA version >= 10.1; disable the HMMA test
// cases on this path
#define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb)
#endif

// check both ioc16 and io16xc32
#define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                        \
    TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_HMMA_##tag) {               \
        require_compute_capability(7, 0);                                         \
        check_chanwise<ConvolutionForward>(                                       \
                dtype::Float16(), dtype::Float16(), handle_cuda(),                \
                "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_2stage");                               \
        check_chanwise<ConvolutionForward>(                                       \
                dtype::Float16(), dtype::Float32(), handle_cuda(),                \
                "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_2stage");                               \
    }
MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb)
#undef cb

#define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                        \
    TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_HMMA_##tag) {         \
        require_compute_capability(7, 0);                                         \
        check_chanwise<ConvolutionBackwardData>(                                  \
                dtype::Float16(), dtype::Float16(), handle_cuda(),                \
                "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_mma8X8X4_2stage");                      \
        check_chanwise<ConvolutionBackwardData>(                                  \
                dtype::Float16(), dtype::Float32(), handle_cuda(),                \
                "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_mma8X8X4_2stage");                      \
    }
MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb)
#undef cb

#define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                        \
    TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER_CUTLASS_HMMA_##tag) {       \
        require_compute_capability(7, 0);                                         \
        check_chanwise<ConvolutionBackwardFilter>(                                \
                dtype::Float16(), dtype::Float32(), handle_cuda(),                \
                "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_mma8X8X4_2stage");                      \
    }
MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb)
#undef cb

#undef MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL

#if MEGDNN_WITH_BENCHMARK

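// The *_BENCH_CHECK tests run the same problem through a dense convolution
// (reported as "cudnn" in the timing output) and the channel-wise group
// convolution ("megdnn"), then check that both produce the same result.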
TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionForward>();
    auto conv1 = handle->create_operator<ConvolutionForward>();
    BenchmarkEnv<0, 1, 2> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL, size_t FH,
                   size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_src();
        benv.fill_flt();
        benv.exec_convolution(conv0.get(), conv1.get());
        benv.cmp_dst();
    };
    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BWD_DATA_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionBackwardData>();
    auto conv1 = handle->create_operator<ConvolutionBackwardData>();
    BenchmarkEnv<1, 2, 0> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL, size_t FH,
                   size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_dst();
        benv.fill_flt();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_src();
    };
    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BWD_FILTER_BENCH_CHECK) {
    auto handle = handle_cuda();
    auto handle_cpu = handle_naive();
    auto conv0 = handle->create_operator<ConvolutionBackwardFilter>();
    auto conv1 = handle->create_operator<ConvolutionBackwardFilter>();
    BenchmarkEnv<0, 2, 1> benv(handle, handle_cpu);
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t CHL_MUL, size_t FH,
                   size_t FW, size_t PH, size_t PW) {
        benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
        benv.fill_src();
        benv.fill_dst();
        benv.exec(conv0.get(), conv1.get());
        benv.cmp_flt();
    };
    run(64, 60, 50, 50, 1, 3, 3, 1, 1);
    if (check_need_full_bench()) {
        run(64, 728, 18, 18, 2, 5, 5, 2, 2);
        run(64, 64, 150, 150, 2, 3, 3, 1, 1);
        run(1, 2048, 4, 4, 2, 3, 3, 1, 1);
    }
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_FWD) {
    // enable profiling
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    proxy->warmup_times = 1;
    proxy->exec_times = 10;
    Benchmarker<ConvolutionForward> checker(handle_cuda());
    checker.set_times(1);
    ConvolutionForward::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.set_proxy(proxy);
    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, size_t FW) {
        checker.proxy()->target_execution_policy = {};
        checker.execs({{N, C, IH, IW}, {C, 1, 1, FH, FW}, {}});
    };
    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_DATA) {
    // enable profiling
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    proxy->warmup_times = 1;
    proxy->exec_times = 10;
    Benchmarker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_times(1);
    ConvolutionBackwardData::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.set_proxy(proxy);
    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, size_t FW) {
        checker.proxy()->target_execution_policy.algo.reset();
        checker.execs(
                {{C, 1, 1, FH, FW}, {N, C, IH - FH + 1, IW - FW + 1}, {N, C, IH, IW}});
    };
    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}

TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_FILTER) {
    // enable profiling
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    proxy->warmup_times = 1;
    proxy->exec_times = 10;
    Benchmarker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_times(1);
    ConvolutionBackwardFilter::Param param;
    param.sparse = ConvolutionForward::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.set_proxy(proxy);
    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, size_t FW) {
        checker.proxy()->target_execution_policy.algo.reset();
        checker.execs(
                {{N, C, IH, IW}, {N, C, IH - FH + 1, IW - FW + 1}, {C, 1, 1, FH, FW}});
    };
    run(128, 64, 90, 80, 3, 3);
    run(128, 90, 100, 100, 3, 5);
    run(128, 32, 62, 62, 5, 5);
}

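// In the following benchmarks, `bandwidth` is the total number of tensor
// elements divided by 2^30 and scaled by 1e3, so multiplying by the element
// size in bytes (4 for fp32, 2 for fp16) and dividing by the measured time
// in milliseconds gives memory traffic in GB/s.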
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 10;
    bencher.set_display(false).set_times(RUNS);
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    bencher.set_proxy(proxy);
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout);
        float bandwidth = static_cast<float>(
                                  src.total_nr_elems() + filter.total_nr_elems() +
                                  dst_layout.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;
        bencher.proxy()->target_execution_policy.algo.reset();
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bencher.set_param(param);
        auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "pseudo float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32) %.2f (fp16/pseudo fp16)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32,
               bandwidth * 4 / time_in_ms_fp32, time_in_ms_fp16,
               bandwidth * 2 / time_in_ms_fp16, time_in_ms_pseudo_fp16,
               bandwidth * 2 / time_in_ms_pseudo_fp16, time_in_ms_fp32 / time_in_ms_fp16,
               time_in_ms_pseudo_fp16 / time_in_ms_fp16);
    };
    // clang-format off
    for (size_t s : {1, 2})
    for (size_t f : {3, 5, 7})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {128, 256})
    for (size_t iw : {128, 256})
        run(batch, c, ih, iw, f, s);
    // clang-format on
    run(128, 192, 28, 28, 3, 1);
    run(128, 192, 28, 28, 3, 2);
    run(128, 576, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 32, 112, 112, 3, 1);
    run(128, 960, 7, 7, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 144, 56, 56, 3, 2);
    run(128, 384, 14, 14, 3, 1);
    run(128, 144, 56, 56, 3, 1);
    run(128, 96, 112, 112, 3, 2);
    run(128, 384, 14, 14, 3, 1);
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
    run(128, 576, 14, 14, 3, 2);
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                              "CHANNEL_WISE", {})
                              .c_str(),
                      {}}}}));
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout);
        float bandwidth = static_cast<float>(
                                  src.total_nr_elems() + filter.total_nr_elems() +
                                  dst_layout.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32,
               bandwidth * 4 / time_in_ms_fp32, time_in_ms_fp16,
               bandwidth * 2 / time_in_ms_fp16, time_in_ms_fp32 / time_in_ms_fp16);
    };
    // clang-format off
    for (size_t s : {1})
    for (size_t f : {3, 5, 7})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32, 128, 256})
    for (size_t iw : {8, 16, 32, 128, 256})
        run(batch, c, ih, iw, f, s);
    // clang-format on
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT_SMALL) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout);
        float bandwidth = static_cast<float>(
                                  src.total_nr_elems() + filter.total_nr_elems() +
                                  dst_layout.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_before_exec_callback(AlgoChecker<
                                          ConvolutionForward>(ExecutionPolicyAlgoName{
                        "DEFAULT",
                        {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                                  "CHANNEL_WISE", {})
                                  .c_str(),
                          {}}}}));
        auto time_in_ms_fp32_normal = bencher.execs({src, filter, {}}) / RUNS;
        // use the small-batch channel-wise kernel for the second measurement
        bencher.set_before_exec_callback(
                AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
                        "DEFAULT",
                        {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                                  "CHANNEL_WISE_SMALL", {})
                                  .c_str(),
                          {}}}}));
        auto time_in_ms_fp32_small = bencher.execs({src, filter, {}}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16_small = bencher.execs({src, filter, {}}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, fp32 normal: %.2fms %.2fGB/s "
               "small: %.2fms %.2fGB/s, fp16 small: %.2fms %.2fGB/s, "
               "speedup: "
               "%0.2f (fp32 small/normal) %0.2f (small fp16/fp32)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               time_in_ms_fp32_normal, bandwidth * 4 / time_in_ms_fp32_normal,
               time_in_ms_fp32_small, bandwidth * 4 / time_in_ms_fp32_small,
               time_in_ms_fp16_small, bandwidth * 2 / time_in_ms_fp16_small,
               time_in_ms_fp32_normal / time_in_ms_fp32_small,
               time_in_ms_fp32_small / time_in_ms_fp16_small);
    };
    // clang-format off
    for (size_t s : {1})
    for (size_t f : {3, 5})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32})
    for (size_t iw : {8, 16, 32})
        run(batch, c, ih, iw, f, s);
    // clang-format on
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 960, 7, 7, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_CUDNN_DNN) {
    CUBenchmarker<ConvBiasForward> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    ConvBias::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = ConvBias::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::ConvBias::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f},
                    bias = {1, c, 1, 1};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<ConvBias>();
        opr->param() = param;
        opr->deduce_layout(
                {src, dtype::Float32()}, {filter, dtype::Float32()},
                {bias, dtype::Float32()}, {}, dst_layout);
        // 2 * f * f flops per output element, in Mflops
        float computation_mops =
                static_cast<float>(dst_layout.total_nr_elems() * f * f * 2) * 1e-6;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.set_before_exec_callback(
                AlgoChecker<ConvBiasForward>(".+CHANNEL_WISE.+"));
        auto time_in_ms_dnn = bencher.execs({src, filter, bias, {}, {}}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.set_before_exec_callback(AlgoChecker<ConvBiasForward>(
                ".+CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM.+"));
        auto time_in_ms_cudnn = bencher.execs({src, filter, bias, {}, {}}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, dst=%s, dnn: %.2fms %.2fGFLOPS "
               "cudnn: %.2fms %.2fGFLOPS "
               "speedup: "
               "%0.2f (dnn/cudnn)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               dst_layout.to_string().c_str(), time_in_ms_dnn,
               computation_mops / time_in_ms_dnn, time_in_ms_cudnn,
               computation_mops / time_in_ms_cudnn, time_in_ms_cudnn / time_in_ms_dnn);
    };
    // clang-format off
    for (size_t batch : {1, 16, 32, 64, 128}) {
        run(batch, 32, 112, 112, 3, 1);
        run(batch, 96, 112, 112, 3, 2);
        run(batch, 96, 112, 112, 3, 1);
        run(batch, 144, 56, 56, 3, 2);
        run(batch, 144, 56, 56, 3, 1);
        run(batch, 192, 28, 28, 3, 1);
        run(batch, 384, 14, 14, 3, 1);
        run(batch, 576, 14, 14, 3, 1);
        run(batch, 960, 7, 7, 3, 1);
        //! calibrate the heuristic algo policy hw_size param
        run(batch, 144, 24, 24, 3, 1);
        run(batch, 144, 22, 22, 3, 1);
        run(batch, 144, 20, 20, 3, 1);
        run(batch, 144, 18, 18, 3, 1);
    }
    // clang-format on
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_DATA_FLOAT_SMALL) {
    CUBenchmarker<ConvolutionBackwardData> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    ConvolutionBackwardData::Param param;
    param.format = Convolution::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        float bandwidth = static_cast<float>(
                                  src.total_nr_elems() + filter.total_nr_elems() +
                                  src.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_before_exec_callback(
                        AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE"));
        auto time_in_ms_fp32_normal = bencher.execs({filter, src, src}) / RUNS;
        bencher.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE_SMALL"));
        auto time_in_ms_fp32_small = bencher.execs({filter, src, src}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16_small = bencher.execs({filter, src, src}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, fp32 normal: %.2fms %.2fGB/s "
               "small: %.2fms %.2fGB/s, fp16 small: %.2fms %.2fGB/s, "
               "speedup: "
               "%0.2f (fp32 small/normal) %0.2f (small fp16/fp32)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(),
               time_in_ms_fp32_normal, bandwidth * 4 / time_in_ms_fp32_normal,
               time_in_ms_fp32_small, bandwidth * 4 / time_in_ms_fp32_small,
               time_in_ms_fp16_small, bandwidth * 2 / time_in_ms_fp16_small,
               time_in_ms_fp32_normal / time_in_ms_fp32_small,
               time_in_ms_fp32_small / time_in_ms_fp16_small);
    };
    // clang-format off
    for (size_t s : {1})
    for (size_t f : {3, 5})
    for (size_t batch : {64})
    for (size_t c : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32})
    for (size_t iw : {8, 16, 32})
        run(batch, c, ih, iw, f, s);
    // clang-format on
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 960, 7, 7, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 384, 14, 14, 3, 1);
    run(128, 192, 28, 28, 3, 1);
    run(128, 576, 14, 14, 3, 1);
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BWD_DATA) {
    CUBenchmarker<ConvolutionBackwardData> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("CHANNEL_WISE"));
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t ocpg, size_t group, size_t ih, size_t iw,
                   size_t f, size_t p, size_t s) {
        param.pad_h = p;
        param.pad_w = p;
        param.stride_h = s;
        param.stride_w = s;
        size_t oh, ow;
        infer_conv_shape2d(ih, iw, f, f, s, s, p, p, oh, ow, true);
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src_grad = {batch, group, ih, iw},
                    dst_grad = {batch, group * ocpg, oh, ow},
                    flt = {group, ocpg, 1, f, f};
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        float bandwidth = static_cast<float>(
                                  flt.total_nr_elems() + dst_grad.total_nr_elems() +
                                  src_grad.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp32 = bencher.execs({flt, dst_grad, src_grad}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16 = bencher.execs({flt, dst_grad, src_grad}) / RUNS;
        printf("stride=%zu, src_grad=%s, flt=%s, "
               "float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32)\n",
               s, src_grad.to_string().c_str(), flt.to_string().c_str(),
               time_in_ms_fp32, bandwidth * 4 / time_in_ms_fp32, time_in_ms_fp16,
               bandwidth * 2 / time_in_ms_fp16, time_in_ms_fp32 / time_in_ms_fp16);
    };
    // clang-format off
    for (size_t s : {1, 2})
    for (size_t f : {3, 5, 7})
    for (size_t p : {f / 2})
    for (size_t batch : {64})
    for (size_t ocpg : {1})
    for (size_t group : {16, 32, 64, 128})
    for (size_t ih : {8, 16, 32, 128, 256})
    for (size_t iw : {8, 16, 32, 128, 256})
        run(batch, ocpg, group, ih, iw, f, p, s);
    // clang-format on
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BWD_FILTER) {
    CUBenchmarker<ConvolutionBackwardFilter> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardFilter>("CHANNEL_WISE"));
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t ocpg, size_t group, size_t i, size_t f,
                   size_t p, size_t s) {
        param.pad_h = p;
        param.pad_w = p;
        param.stride_h = s;
        param.stride_w = s;
        size_t d = infer_conv_shape(i, f, s, p, true);
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, group, i, i}, dst_grad = {batch, group * ocpg, d, d},
                    flt_grad = {group, ocpg, 1, f, f};
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        float bandwidth = static_cast<float>(
                                  flt_grad.total_nr_elems() + dst_grad.total_nr_elems() +
                                  src.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp32 = bencher.execs({src, dst_grad, flt_grad}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        auto time_in_ms_fp16 = bencher.execs({src, dst_grad, flt_grad}) / RUNS;
        printf("stride=%zu, src=%s, flt_grad=%s, "
               "float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "speedup: "
               "%.2f (fp16/fp32)\n",
               s, src.to_string().c_str(), flt_grad.to_string().c_str(),
               time_in_ms_fp32, bandwidth * 4 / time_in_ms_fp32, time_in_ms_fp16,
               bandwidth * 2 / time_in_ms_fp16, time_in_ms_fp32 / time_in_ms_fp16);
    };
    // clang-format off
    for (size_t s : {1, 2})
    for (size_t f : {3, 5, 7})
    for (size_t p : {f / 2})
    for (size_t batch : {64})
    for (size_t ocpg : {1})
    for (size_t group : {16, 32, 64, 128})
    for (size_t i : {8, 16, 32, 64, 128})
        run(batch, ocpg, group, i, f, p, s);
    // clang-format on
}

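// The *_LARGE_KERNEL benchmarks sweep odd filter sizes from 3 to 31 with
// same-padding (pad = f / 2, stride 1) and compare fp32 against (pseudo)
// fp16 timings.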
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_LARGE_KERNEL) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 100;
    bencher.set_display(false).set_times(RUNS);
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    bencher.set_proxy(proxy);
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout);
        float bandwidth = static_cast<float>(
                                  src.total_nr_elems() + filter.total_nr_elems() +
                                  dst_layout.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;
        bencher.proxy()->target_execution_policy.algo.reset();
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bencher.set_param(param);
        auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "pseudo float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32) %.2f (fp16/pseudo fp16)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32,
               bandwidth * 4 / time_in_ms_fp32, time_in_ms_fp16,
               bandwidth * 2 / time_in_ms_fp16, time_in_ms_pseudo_fp16,
               bandwidth * 2 / time_in_ms_pseudo_fp16, time_in_ms_fp32 / time_in_ms_fp16,
               time_in_ms_pseudo_fp16 / time_in_ms_fp16);
    };
    // clang-format off
    for (size_t b : {32, 64})
    for (size_t f : {3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}) {
        run(b, 384, 32, 32, f, 1);
        run(b, 384, 64, 64, f, 1);
    }
    // clang-format on
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_DATA_LARGE_KERNEL) {
    CUBenchmarker<ConvolutionBackwardData> bencher(handle_cuda());
    size_t RUNS = 100;
    bencher.set_display(false).set_times(RUNS);
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    bencher.set_proxy(proxy);
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout);
        float bandwidth = static_cast<float>(
                                  src.total_nr_elems() + filter.total_nr_elems() +
                                  dst_layout.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp32 = bencher.execs({filter, src, src}) / RUNS;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp16 = bencher.execs({filter, src, src}) / RUNS;
        bencher.proxy()->target_execution_policy.algo.reset();
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bencher.set_param(param);
        auto time_in_ms_pseudo_fp16 = bencher.execs({filter, src, src}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "pseudo float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32) %.2f (fp16/pseudo fp16)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32,
               bandwidth * 4 / time_in_ms_fp32, time_in_ms_fp16,
               bandwidth * 2 / time_in_ms_fp16, time_in_ms_pseudo_fp16,
               bandwidth * 2 / time_in_ms_pseudo_fp16, time_in_ms_fp32 / time_in_ms_fp16,
               time_in_ms_pseudo_fp16 / time_in_ms_fp16);
    };
    // clang-format off
    for (size_t b : {32, 64})
    for (size_t f : {3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}) {
        run(b, 384, 32, 32, f, 1);
        run(b, 384, 64, 64, f, 1);
    }
    // clang-format on
}

TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_FILTER_LARGE_KERNEL) {
    CUBenchmarker<ConvolutionBackwardFilter> bencher(handle_cuda());
    size_t RUNS = 100;
    bencher.set_display(false).set_times(RUNS);
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    bencher.set_proxy(proxy);
    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;
    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};
        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout);
        float bandwidth = static_cast<float>(
                                  src.total_nr_elems() + filter.total_nr_elems() +
                                  dst_layout.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;
        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp32 = bencher.execs({src, src, filter}) / RUNS;
        // only fp32 and pseudo fp16 (fp16 I/O with fp32 compute) are measured here
        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bencher.set_param(param);
        auto time_in_ms_pseudo_fp16 = bencher.execs({src, src, filter}) / RUNS;
        printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
               "pseudo float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (pseudo fp16/fp32)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32,
               bandwidth * 4 / time_in_ms_fp32, time_in_ms_pseudo_fp16,
               bandwidth * 2 / time_in_ms_pseudo_fp16,
               time_in_ms_fp32 / time_in_ms_pseudo_fp16);
    };
    // clang-format off
    for (size_t b : {32, 64})
    for (size_t f : {3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}) {
        run(b, 384, 32, 32, f, 1);
        run(b, 384, 64, 64, f, 1);
    }
    // clang-format on
}

#endif

// vim: syntax=cpp.doxygen