You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

algos.cpp 36 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754
  1. /**
  2. * \file dnn/src/arm_common/conv_bias/fp32/algos.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "src/arm_common/conv_bias/fp32/algos.h"
  12. #include "src/arm_common/conv_bias/direct/multi_thread_common.h"
  13. #include "src/arm_common/conv_bias/fp32/direct.h"
  14. #include "src/arm_common/conv_bias/fp32/do_conv_stride1.h"
  15. #include "src/arm_common/conv_bias/fp32/do_conv_stride2.h"
  16. #include "src/arm_common/conv_bias/fp32/strategy.h"
  17. #include "src/arm_common/conv_bias/img2col_helper.h"
  18. #include "src/arm_common/conv_bias/postprocess_helper.h"
  19. #include "src/common/opr_delegate.h"
  20. #include "src/fallback/conv_bias/common.h"
  21. #include "midout.h"
  22. MIDOUT_DECL(megdnn_arm_common_winograd_fp32)
  23. using namespace megdnn;
  24. using namespace arm_common;
  25. /* ======================= AlgoFP32WinogradF23_4x4 ======================== */
  26. bool ConvBiasImpl::AlgoFP32WinogradF23_4x4::usable(
  27. fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
  28. AlgoSelectionStrategy /*algo_selection_strategy*/) const {
  29. MEGDNN_MARK_USED_VAR(opr);
  30. MEGDNN_MARK_USED_VAR(param);
  31. MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 0, 0) {
  32. if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0)
  33. return false;
  34. using Strategy = winograd::winograd_2x3_4x4_f;
  35. Strategy strategy(param.src_type, param.filter_type, param.dst_type);
  36. auto&& matmul_param =
  37. megdnn::winograd::ConvBias<Strategy,
  38. param::MatrixMul::Format::MK4>(
  39. strategy, m_tile_size, param.nr_threads, param.osz[0],
  40. param.osz[1], param.filter_meta.ocpg)
  41. .get_matmul_kern_param(param);
  42. return m_matmul_algo->usable(matmul_param) &&
  43. (opr->param().format == param::ConvBias::Format::NCHW ||
  44. (opr->param().format ==
  45. param::ConvBias::Format::NCHW_WINOGRAD &&
  46. opr->param().output_block_size == 2 &&
  47. param.winograd_matmul_format ==
  48. param::MatrixMul::Format::MK4)) &&
  49. opr->param().mode == param::ConvBias::Mode::CROSS_CORRELATION &&
  50. (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
  51. param.filter_meta.spatial[0] == 3) &&
  52. (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
  53. param.filter_meta.stride[0] == 1) &&
  54. (param.filter_meta.dilation[0] ==
  55. param.filter_meta.dilation[1] &&
  56. param.filter_meta.dilation[0] == 1) &&
  57. param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
  58. param.src_type.enumv() == DTypeEnum::Float32;
  59. }
  60. MIDOUT_END();
  61. return false;
  62. }
  63. size_t ConvBiasImpl::AlgoFP32WinogradF23_4x4::get_workspace(
  64. fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
  65. MEGDNN_MARK_USED_VAR(param);
  66. MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 0, 1) {
  67. winograd::winograd_2x3_4x4_f strategy(param.src_type, param.filter_type,
  68. param.dst_type);
  69. return megdnn::winograd::ConvBias<winograd::winograd_2x3_4x4_f,
  70. param::MatrixMul::Format::MK4>(
  71. strategy, m_tile_size, param.nr_threads, param.osz[0],
  72. param.osz[1], param.filter_meta.ocpg)
  73. .get_workspace_size(param, m_matmul_algo);
  74. }
  75. MIDOUT_END();
  76. return 0;
  77. }
  78. SmallVector<ConvBiasImpl::NCBKern>
  79. ConvBiasImpl::AlgoFP32WinogradF23_4x4::dispatch_kerns(
  80. fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
  81. MEGDNN_MARK_USED_VAR(param);
  82. MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 0, 2) {
  83. winograd::winograd_2x3_4x4_f strategy(param.src_type, param.filter_type,
  84. param.dst_type);
  85. auto winograd_impl =
  86. megdnn::winograd::ConvBias<winograd::winograd_2x3_4x4_f,
  87. param::MatrixMul::Format::MK4>(
  88. strategy, m_tile_size, param.nr_threads, param.osz[0],
  89. param.osz[1], param.filter_meta.ocpg);
  90. return winograd_impl.get_kerns(param, m_matmul_algo);
  91. }
  92. MIDOUT_END();
  93. return {};
  94. }
  95. /* ======================= AlgoFP32WinogradF63 ======================== */
  96. bool ConvBiasImpl::AlgoFP32WinogradF63::usable(
  97. fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
  98. AlgoSelectionStrategy /*algo_selection_strategy*/) const {
  99. MEGDNN_MARK_USED_VAR(param);
  100. MEGDNN_MARK_USED_VAR(opr);
  101. MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 1, 0) {
  102. using Strategy = winograd::winograd_6x3_1x1_f;
  103. Strategy strategy(param.src_type, param.filter_type, param.dst_type);
  104. auto&& matmul_param =
  105. megdnn::winograd::ConvBias<Strategy>(
  106. strategy, m_tile_size, param.nr_threads, param.osz[0],
  107. param.osz[1], param.filter_meta.ocpg)
  108. .get_matmul_kern_param(param);
  109. return m_matmul_algo->usable(matmul_param) &&
  110. (opr->param().format == param::ConvBias::Format::NCHW ||
  111. (opr->param().format ==
  112. param::ConvBias::Format::NCHW_WINOGRAD &&
  113. opr->param().output_block_size == 6 &&
  114. param.winograd_matmul_format ==
  115. param::MatrixMul::Format::DEFAULT)) &&
  116. opr->param().mode == param::ConvBias::Mode::CROSS_CORRELATION &&
  117. (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
  118. param.filter_meta.spatial[0] == 3) &&
  119. (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
  120. param.filter_meta.stride[0] == 1) &&
  121. (param.filter_meta.dilation[0] ==
  122. param.filter_meta.dilation[1] &&
  123. param.filter_meta.dilation[0] == 1) &&
  124. param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
  125. param.src_type.enumv() == DTypeEnum::Float32;
  126. }
  127. MIDOUT_END();
  128. return false;
  129. }
  130. size_t ConvBiasImpl::AlgoFP32WinogradF63::get_workspace(
  131. fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
  132. MEGDNN_MARK_USED_VAR(param);
  133. MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 1, 1) {
  134. winograd::winograd_6x3_1x1_f strategy(param.src_type, param.filter_type,
  135. param.dst_type);
  136. return megdnn::winograd::ConvBias<winograd::winograd_6x3_1x1_f>(
  137. strategy, m_tile_size, param.nr_threads, param.osz[0],
  138. param.osz[1], param.filter_meta.ocpg)
  139. .get_workspace_size(param, m_matmul_algo);
  140. }
  141. MIDOUT_END();
  142. return 0;
  143. }
  144. SmallVector<ConvBiasImpl::NCBKern>
  145. ConvBiasImpl::AlgoFP32WinogradF63::dispatch_kerns(
  146. fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
  147. MEGDNN_MARK_USED_VAR(param);
  148. MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 1, 2) {
  149. winograd::winograd_6x3_1x1_f strategy(param.src_type, param.filter_type,
  150. param.dst_type);
  151. auto winograd_impl =
  152. megdnn::winograd::ConvBias<winograd::winograd_6x3_1x1_f>(
  153. strategy, m_tile_size, param.nr_threads, param.osz[0],
  154. param.osz[1], param.filter_meta.ocpg);
  155. return winograd_impl.get_kerns(param, m_matmul_algo);
  156. }
  157. MIDOUT_END();
  158. return {};
  159. }
  160. /* ======================= AlgoFP32WinogradF54 ======================== */
  161. bool ConvBiasImpl::AlgoFP32WinogradF54::usable(
  162. fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
  163. AlgoSelectionStrategy /*algo_selection_strategy*/) const {
  164. MEGDNN_MARK_USED_VAR(param);
  165. MEGDNN_MARK_USED_VAR(opr);
  166. MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 2, 0) {
  167. using Strategy = winograd::winograd_5x4_1x1_f;
  168. Strategy strategy(param.src_type, param.filter_type, param.dst_type);
  169. auto&& matmul_param =
  170. megdnn::winograd::ConvBias<Strategy>(
  171. strategy, m_tile_size, param.nr_threads, param.osz[0],
  172. param.osz[1], param.filter_meta.ocpg)
  173. .get_matmul_kern_param(param);
  174. return m_matmul_algo->usable(matmul_param) &&
  175. (opr->param().format == param::ConvBias::Format::NCHW ||
  176. (opr->param().format ==
  177. param::ConvBias::Format::NCHW_WINOGRAD &&
  178. opr->param().output_block_size == 5 &&
  179. param.winograd_matmul_format ==
  180. param::MatrixMul::Format::DEFAULT)) &&
  181. opr->param().mode == param::ConvBias::Mode::CROSS_CORRELATION &&
  182. (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
  183. param.filter_meta.spatial[0] == 4) &&
  184. (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
  185. param.filter_meta.stride[0] == 1) &&
  186. (param.filter_meta.dilation[0] ==
  187. param.filter_meta.dilation[1] &&
  188. param.filter_meta.dilation[0] == 1) &&
  189. param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
  190. param.src_type.enumv() == DTypeEnum::Float32;
  191. }
  192. MIDOUT_END();
  193. return false;
  194. }
  195. size_t ConvBiasImpl::AlgoFP32WinogradF54::get_workspace(
  196. fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
  197. MEGDNN_MARK_USED_VAR(param);
  198. MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 2, 1) {
  199. winograd::winograd_5x4_1x1_f strategy(param.src_type, param.filter_type,
  200. param.dst_type);
  201. return megdnn::winograd::ConvBias<winograd::winograd_5x4_1x1_f>(
  202. strategy, m_tile_size, param.nr_threads, param.osz[0],
  203. param.osz[1], param.filter_meta.ocpg)
  204. .get_workspace_size(param, m_matmul_algo);
  205. }
  206. MIDOUT_END();
  207. return 0;
  208. }
  209. SmallVector<ConvBiasImpl::NCBKern>
  210. ConvBiasImpl::AlgoFP32WinogradF54::dispatch_kerns(
  211. fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
  212. MEGDNN_MARK_USED_VAR(param);
  213. MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 2, 2) {
  214. winograd::winograd_5x4_1x1_f strategy(param.src_type, param.filter_type,
  215. param.dst_type);
  216. auto winograd_impl =
  217. megdnn::winograd::ConvBias<winograd::winograd_5x4_1x1_f>(
  218. strategy, m_tile_size, param.nr_threads, param.osz[0],
  219. param.osz[1], param.filter_meta.ocpg);
  220. return winograd_impl.get_kerns(param, m_matmul_algo);
  221. }
  222. MIDOUT_END();
  223. return {};
  224. }
  225. /* ======================= AlgoFP32WinogradF45 ======================== */
  226. bool ConvBiasImpl::AlgoFP32WinogradF45::usable(
  227. fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
  228. AlgoSelectionStrategy /*algo_selection_strategy*/) const {
  229. MEGDNN_MARK_USED_VAR(param);
  230. MEGDNN_MARK_USED_VAR(opr);
  231. MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 3, 0) {
  232. using Strategy = winograd::winograd_4x5_1x1_f;
  233. Strategy strategy(param.src_type, param.filter_type, param.dst_type);
  234. auto&& matmul_param =
  235. megdnn::winograd::ConvBias<Strategy>(
  236. strategy, m_tile_size, param.nr_threads, param.osz[0],
  237. param.osz[1], param.filter_meta.ocpg)
  238. .get_matmul_kern_param(param);
  239. return m_matmul_algo->usable(matmul_param) &&
  240. (opr->param().format == param::ConvBias::Format::NCHW ||
  241. (opr->param().format ==
  242. param::ConvBias::Format::NCHW_WINOGRAD &&
  243. opr->param().output_block_size == 4 &&
  244. param.winograd_matmul_format ==
  245. param::MatrixMul::Format::DEFAULT)) &&
  246. opr->param().mode == param::ConvBias::Mode::CROSS_CORRELATION &&
  247. (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
  248. param.filter_meta.spatial[0] == 5) &&
  249. (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
  250. param.filter_meta.stride[0] == 1) &&
  251. (param.filter_meta.dilation[0] ==
  252. param.filter_meta.dilation[1] &&
  253. param.filter_meta.dilation[0] == 1) &&
  254. param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
  255. param.src_type.enumv() == DTypeEnum::Float32;
  256. }
  257. MIDOUT_END();
  258. return false;
  259. }
  260. size_t ConvBiasImpl::AlgoFP32WinogradF45::get_workspace(
  261. fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
  262. MEGDNN_MARK_USED_VAR(param);
  263. MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 3, 1) {
  264. winograd::winograd_4x5_1x1_f strategy(param.src_type, param.filter_type,
  265. param.dst_type);
  266. return megdnn::winograd::ConvBias<winograd::winograd_4x5_1x1_f>(
  267. strategy, m_tile_size, param.nr_threads, param.osz[0],
  268. param.osz[1], param.filter_meta.ocpg)
  269. .get_workspace_size(param, m_matmul_algo);
  270. }
  271. MIDOUT_END();
  272. return 0;
  273. }
  274. SmallVector<ConvBiasImpl::NCBKern>
  275. ConvBiasImpl::AlgoFP32WinogradF45::dispatch_kerns(
  276. fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
  277. MEGDNN_MARK_USED_VAR(param);
  278. MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 3, 2) {
  279. winograd::winograd_4x5_1x1_f strategy(param.src_type, param.filter_type,
  280. param.dst_type);
  281. auto winograd_impl =
  282. megdnn::winograd::ConvBias<winograd::winograd_4x5_1x1_f>(
  283. strategy, m_tile_size, param.nr_threads, param.osz[0],
  284. param.osz[1], param.filter_meta.ocpg);
  285. return winograd_impl.get_kerns(param, m_matmul_algo);
  286. }
  287. MIDOUT_END();
  288. return {};
  289. }
  290. /* ======================= AlgoFP32WinogradF63_4x4 ======================== */
  291. bool ConvBiasImpl::AlgoFP32WinogradF63_4x4::usable(
  292. fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
  293. AlgoSelectionStrategy /*algo_selection_strategy*/) const {
  294. MEGDNN_MARK_USED_VAR(param);
  295. MEGDNN_MARK_USED_VAR(opr);
  296. MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 4, 0) {
  297. if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0)
  298. return false;
  299. using Strategy = winograd::winograd_6x3_4x4_f;
  300. Strategy strategy(param.src_type, param.filter_type, param.dst_type);
  301. auto&& matmul_param =
  302. megdnn::winograd::ConvBias<Strategy,
  303. param::MatrixMul::Format::MK4>(
  304. strategy, m_tile_size, param.nr_threads, param.osz[0],
  305. param.osz[1], param.filter_meta.ocpg)
  306. .get_matmul_kern_param(param);
  307. return m_matmul_algo->usable(matmul_param) &&
  308. (opr->param().format == param::ConvBias::Format::NCHW ||
  309. (opr->param().format ==
  310. param::ConvBias::Format::NCHW_WINOGRAD &&
  311. opr->param().output_block_size == 6 &&
  312. param.winograd_matmul_format ==
  313. param::MatrixMul::Format::MK4)) &&
  314. opr->param().mode == param::ConvBias::Mode::CROSS_CORRELATION &&
  315. (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
  316. param.filter_meta.spatial[0] == 3) &&
  317. (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
  318. param.filter_meta.stride[0] == 1) &&
  319. (param.filter_meta.dilation[0] ==
  320. param.filter_meta.dilation[1] &&
  321. param.filter_meta.dilation[0] == 1) &&
  322. param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
  323. param.src_type.enumv() == DTypeEnum::Float32 &&
  324. param.filter_meta.icpg % 4 == 0 &&
  325. param.filter_meta.ocpg % 4 == 0;
  326. }
  327. MIDOUT_END();
  328. return false;
  329. }
  330. size_t ConvBiasImpl::AlgoFP32WinogradF63_4x4::get_workspace(
  331. fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
  332. MEGDNN_MARK_USED_VAR(param);
  333. MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 4, 1) {
  334. winograd::winograd_6x3_4x4_f strategy(param.src_type, param.filter_type,
  335. param.dst_type);
  336. return megdnn::winograd::ConvBias<winograd::winograd_6x3_4x4_f,
  337. param::MatrixMul::Format::MK4>(
  338. strategy, m_tile_size, param.nr_threads, param.osz[0],
  339. param.osz[1], param.filter_meta.ocpg)
  340. .get_workspace_size(param, m_matmul_algo);
  341. }
  342. MIDOUT_END();
  343. return 0;
  344. }
  345. SmallVector<ConvBiasImpl::NCBKern>
  346. ConvBiasImpl::AlgoFP32WinogradF63_4x4::dispatch_kerns(
  347. fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
  348. MEGDNN_MARK_USED_VAR(param);
  349. MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 4, 2) {
  350. winograd::winograd_6x3_4x4_f strategy(param.src_type, param.filter_type,
  351. param.dst_type);
  352. auto winograd_impl =
  353. megdnn::winograd::ConvBias<winograd::winograd_6x3_4x4_f,
  354. param::MatrixMul::Format::MK4>(
  355. strategy, m_tile_size, param.nr_threads, param.osz[0],
  356. param.osz[1], param.filter_meta.ocpg);
  357. return winograd_impl.get_kerns(param, m_matmul_algo);
  358. }
  359. MIDOUT_END();
  360. return {};
  361. }
  362. /* ===================== direct algo ===================== */
  363. MIDOUT_DECL(megdnn_arm_common_conv_bias_f32_kimpl);
  364. bool ConvBiasImpl::AlgoF32Direct::usable(
  365. fallback::ConvBiasImpl*, const NCBKernSizeParam& param,
  366. AlgoSelectionStrategy algo_selection_strategy) const {
  367. MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 0, 0) {
  368. auto&& fm = param.filter_meta;
  369. auto FH = fm.spatial[0];
  370. auto SH = fm.stride[0], SW = fm.stride[1];
  371. // the condition ``param.isz[0]*param.isz[1] >= 4'' and
  372. // ``param.osz[0]*param.osz[1] >= 4'' comes from the fact that the
  373. // kernel may have access to up to 4 floats after the end of the memory
  374. // chunk.
  375. bool aviliable = fm.format == param::ConvBias::Format::NCHW &&
  376. param.src_type.enumv() == DTypeEnum::Float32 &&
  377. param.filter_type.enumv() == DTypeEnum::Float32 &&
  378. param.dst_type.enumv() == DTypeEnum::Float32 &&
  379. fm.spatial_ndim == 2 && fm.dilation[0] == 1 &&
  380. fm.dilation[1] == 1 &&
  381. param.isz[0] * param.isz[1] >= 4 &&
  382. param.osz[0] * param.osz[1] >= 4 && FH <= 7 &&
  383. SH == 1 && SW == 1;
  384. if (algo_selection_strategy == AlgoSelectionStrategy::HEURISTIC) {
  385. bool large_group = param.filter_meta.group >= param.nr_threads;
  386. aviliable &= (large_group == m_large_group);
  387. }
  388. return aviliable;
  389. }
  390. MIDOUT_END();
  391. return false;
  392. }
  393. size_t ConvBiasImpl::AlgoF32Direct::get_workspace(
  394. fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
  395. MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 0, 1) {
  396. auto wbundle = MultithreadDirectConvCommon<float, float>::get_bundle(
  397. param, m_large_group);
  398. return wbundle.total_size_in_bytes();
  399. }
  400. MIDOUT_END();
  401. return 0;
  402. }
  403. SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
  404. const NCBKernSizeParam& param) const {
  405. auto fm = param.filter_meta;
  406. size_t N = param.n;
  407. size_t IC = param.filter_meta.icpg;
  408. size_t OC = param.filter_meta.ocpg;
  409. size_t group = fm.group;
  410. WorkspaceBundle wbundle =
  411. MultithreadDirectConvCommon<float, float>::get_bundle(
  412. param, m_large_group);
  413. SmallVector<NCBKern> ret_kerns;
  414. //! When group >= nr_threads, treat it as large_group, each thread process
  415. //! one group for better performance
  416. if (m_large_group) {
  417. //! Channel wise conv and big groups
  418. auto exec_one_group = [wbundle](const NCBKernParam& kern_param,
  419. const NCBKernIndex& ncb_index) {
  420. auto fm = kern_param.filter_meta;
  421. size_t IC = fm.icpg;
  422. size_t OC = fm.ocpg;
  423. WorkspaceBundle bundle = wbundle;
  424. if (fm.should_flip) {
  425. for (size_t oc = 0; oc < OC; oc++) {
  426. MultithreadDirectConvCommon<float, float>::weight_flip_kern(
  427. bundle, kern_param, ncb_index,
  428. {ncb_index.thread_id, 0, oc});
  429. }
  430. }
  431. for (size_t ic = 0; ic < IC; ic++) {
  432. MultithreadDirectConvCommon<float, float>::copy_padding_kern(
  433. bundle, kern_param, ncb_index, {ncb_index.thread_id, 0, ic});
  434. }
  435. for (size_t oc = 0; oc < OC; oc++) {
  436. MultithreadDirectConvCommon<float, float>::do_conv_kern(
  437. bundle, kern_param, ncb_index,
  438. fp32::conv_bias::kern_direct,
  439. {ncb_index.thread_id, 0, oc});
  440. }
  441. };
  442. ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
  443. } else {
  444. WorkspaceBundle bundle = wbundle;
  445. if (fm.should_flip) {
  446. auto weight_flip = [bundle](const NCBKernParam& kern_param,
  447. const NCBKernIndex& ncb_index) {
  448. MultithreadDirectConvCommon<float, float>::weight_flip_kern(
  449. bundle, kern_param, ncb_index, ncb_index.ndrange_id);
  450. };
  451. ret_kerns.push_back({weight_flip, {group, 1_z, OC}});
  452. }
  453. auto copy_padding = [bundle](const NCBKernParam& kern_param,
  454. const NCBKernIndex& ncb_index) {
  455. MultithreadDirectConvCommon<float, float>::copy_padding_kern(
  456. bundle, kern_param, ncb_index, ncb_index.ndrange_id);
  457. };
  458. ret_kerns.push_back({copy_padding, {group, N, IC}});
  459. auto do_conv = [bundle](const NCBKernParam& kern_param,
  460. const NCBKernIndex& ncb_index) {
  461. MultithreadDirectConvCommon<float, float>::do_conv_kern(
  462. bundle, kern_param, ncb_index, fp32::conv_bias::kern_direct,
  463. ncb_index.ndrange_id);
  464. };
  465. ret_kerns.push_back({do_conv, {group, N, OC}});
  466. }
  467. return ret_kerns;
  468. }
  469. SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::dispatch_kerns(
  470. fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
  471. MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 0, 1) {
  472. return get_kimpls(param);
  473. }
  474. MIDOUT_END();
  475. return {};
  476. }
  477. /* ===================== stride-1 algo ===================== */
  478. bool ConvBiasImpl::AlgoF32DirectStride1::usable(
  479. fallback::ConvBiasImpl*, const NCBKernSizeParam& param,
  480. AlgoSelectionStrategy algo_selection_strategy) const {
  481. MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 1, 1) {
  482. auto&& fm = param.filter_meta;
  483. auto FH = fm.spatial[0];
  484. bool aviliable =
  485. param.filter_meta.format == param::ConvBias::Format::NCHW &&
  486. param.src_type.enumv() == DTypeEnum::Float32 &&
  487. param.filter_type.enumv() == DTypeEnum::Float32 &&
  488. param.dst_type.enumv() == DTypeEnum::Float32 &&
  489. !fm.should_flip && fm.spatial_ndim == 2 &&
  490. fm.dilation[0] == 1 && fm.dilation[1] == 1 &&
  491. fm.stride[0] == 1 && fm.stride[1] == 1 && FH == fm.spatial[1] &&
  492. (FH == 2 || FH == 3 || FH == 5 || FH == 7);
  493. if (algo_selection_strategy ==
  494. ConvBiasImpl::AlgoSelectionStrategy::HEURISTIC) {
  495. bool large_group = param.filter_meta.group >= param.nr_threads;
  496. aviliable &= (large_group == m_large_group);
  497. }
  498. return aviliable;
  499. }
  500. MIDOUT_END();
  501. return false;
  502. }
  503. size_t ConvBiasImpl::AlgoF32DirectStride1::get_workspace(
  504. fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
  505. MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 1, 1) {
  506. auto bundle =
  507. MultithreadDirectConvCommon<float, float>::get_bundle_stride(
  508. param, m_large_group);
  509. return bundle.total_size_in_bytes();
  510. }
  511. MIDOUT_END();
  512. return 0;
  513. }
  514. SmallVector<ConvBiasImpl::NCBKern>
  515. ConvBiasImpl::AlgoF32DirectStride1::get_kimpls(
  516. const NCBKernSizeParam& param) const {
  517. auto fm = param.filter_meta;
  518. auto FH = fm.spatial[0];
  519. size_t N = param.n;
  520. size_t IC = param.filter_meta.icpg;
  521. size_t OC = param.filter_meta.ocpg;
  522. size_t group = fm.group;
  523. using Func = std::function<void(const float*, const float*, float*, size_t,
  524. size_t, size_t, size_t, size_t)>;
  525. Func conv_kern_function = nullptr;
  526. #define SWITCH_KERN_STR1() \
  527. switch (FH) { \
  528. case 2: \
  529. conv_kern_function = fp32::conv_stride1::do_conv_2x2_stride1; \
  530. break; \
  531. case 3: \
  532. conv_kern_function = fp32::conv_stride1::do_conv_3x3_stride1; \
  533. break; \
  534. case 5: \
  535. conv_kern_function = fp32::conv_stride1::do_conv_5x5_stride1; \
  536. break; \
  537. case 7: \
  538. conv_kern_function = fp32::conv_stride1::do_conv_7x7_stride1; \
  539. break; \
  540. }
  541. SWITCH_KERN_STR1();
  542. WorkspaceBundle wbundle =
  543. MultithreadDirectConvCommon<float, float>::get_bundle_stride(
  544. param, m_large_group);
  545. SmallVector<NCBKern> ret_kerns;
  546. //! When group >= nr_threads, treat it as large_group, each thread process
  547. //! one group for better performance
  548. if (m_large_group) {
  549. //! Channel wise conv and big groups
  550. auto exec_one_group = [wbundle, conv_kern_function](
  551. const NCBKernParam& kern_param,
  552. const NCBKernIndex& ncb_index) {
  553. auto fm = kern_param.filter_meta;
  554. size_t IC = fm.icpg;
  555. size_t OC = fm.ocpg;
  556. WorkspaceBundle bundle = wbundle;
  557. for (size_t ic = 0; ic < IC; ic++) {
  558. MultithreadDirectConvCommon<float, float>::
  559. copy_padding_kern_stride(bundle, kern_param, ncb_index,
  560. {ncb_index.thread_id, 0, ic});
  561. }
  562. for (size_t oc = 0; oc < OC; oc++) {
  563. MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
  564. bundle, kern_param, ncb_index, conv_kern_function,
  565. {ncb_index.thread_id, 0, oc});
  566. }
  567. };
  568. ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
  569. } else {
  570. WorkspaceBundle bundle = wbundle;
  571. auto copy_padding = [bundle](const NCBKernParam& kern_param,
  572. const NCBKernIndex& ncb_index) {
  573. MultithreadDirectConvCommon<float, float>::copy_padding_kern_stride(
  574. bundle, kern_param, ncb_index, ncb_index.ndrange_id);
  575. };
  576. ret_kerns.push_back({copy_padding, {group, N, IC}});
  577. auto do_conv = [bundle, conv_kern_function](
  578. const NCBKernParam& kern_param,
  579. const NCBKernIndex& ncb_index) {
  580. MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
  581. bundle, kern_param, ncb_index, conv_kern_function,
  582. ncb_index.ndrange_id);
  583. };
  584. ret_kerns.push_back({do_conv, {group, N, OC}});
  585. }
  586. return ret_kerns;
  587. }
  588. SmallVector<ConvBiasImpl::NCBKern>
  589. ConvBiasImpl::AlgoF32DirectStride1::dispatch_kerns(
  590. fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
  591. MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 1, 2) {
  592. return get_kimpls(param);
  593. }
  594. MIDOUT_END();
  595. return {};
  596. }
  597. /* ===================== stride-2 algo ===================== */
  598. bool ConvBiasImpl::AlgoF32DirectStride2::usable(
  599. fallback::ConvBiasImpl*, const NCBKernSizeParam& param,
  600. AlgoSelectionStrategy algo_selection_strategy) const {
  601. MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 2, 0) {
  602. auto&& fm = param.filter_meta;
  603. auto FH = fm.spatial[0];
  604. bool aviliable =
  605. param.filter_meta.format == param::ConvBias::Format::NCHW &&
  606. param.src_type.enumv() == DTypeEnum::Float32 &&
  607. param.filter_type.enumv() == DTypeEnum::Float32 &&
  608. param.dst_type.enumv() == DTypeEnum::Float32 &&
  609. !fm.should_flip && fm.spatial_ndim == 2 &&
  610. fm.dilation[0] == 1 && fm.dilation[1] == 1 &&
  611. fm.stride[0] == 2 && fm.stride[1] == 2 && FH == fm.spatial[1] &&
  612. (FH == 2 || FH == 3 || FH == 5 || FH == 7);
  613. if (algo_selection_strategy ==
  614. ConvBiasImpl::AlgoSelectionStrategy::HEURISTIC) {
  615. bool large_group = param.filter_meta.group >= param.nr_threads;
  616. aviliable &= (large_group == m_large_group);
  617. }
  618. return aviliable;
  619. }
  620. MIDOUT_END();
  621. return false;
  622. }
  623. size_t ConvBiasImpl::AlgoF32DirectStride2::get_workspace(
  624. fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
  625. MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 2, 1) {
  626. auto bundle =
  627. MultithreadDirectConvCommon<float, float>::get_bundle_stride(
  628. param, m_large_group);
  629. return bundle.total_size_in_bytes();
  630. }
  631. MIDOUT_END();
  632. return 0;
  633. }
  634. SmallVector<ConvBiasImpl::NCBKern>
  635. ConvBiasImpl::AlgoF32DirectStride2::get_kimpls(
  636. const NCBKernSizeParam& param) const {
  637. auto fm = param.filter_meta;
  638. auto FH = fm.spatial[0];
  639. size_t N = param.n;
  640. size_t IC = param.filter_meta.icpg;
  641. size_t OC = param.filter_meta.ocpg;
  642. size_t group = fm.group;
  643. using Func = std::function<void(const float*, const float*, float*, size_t,
  644. size_t, size_t, size_t, size_t)>;
  645. Func conv_kern_function = nullptr;
  646. #define SWITCH_KERN_STR2() \
  647. switch (FH) { \
  648. case 2: \
  649. conv_kern_function = fp32::conv_stride2::do_conv_2x2_stride2; \
  650. break; \
  651. case 3: \
  652. conv_kern_function = fp32::conv_stride2::do_conv_3x3_stride2; \
  653. break; \
  654. case 5: \
  655. conv_kern_function = fp32::conv_stride2::do_conv_5x5_stride2; \
  656. break; \
  657. case 7: \
  658. conv_kern_function = fp32::conv_stride2::do_conv_7x7_stride2; \
  659. break; \
  660. }
  661. SWITCH_KERN_STR2();
  662. WorkspaceBundle wbundle =
  663. MultithreadDirectConvCommon<float, float>::get_bundle_stride(
  664. param, m_large_group);
  665. SmallVector<NCBKern> ret_kerns;
  666. //! When group >= nr_threads, treat it as large_group, each thread process
  667. //! one group for better performance
  668. if (m_large_group) {
  669. //! Channel wise conv and big groups
  670. auto exec_one_group = [wbundle, conv_kern_function](
  671. const NCBKernParam& kern_param,
  672. const NCBKernIndex& ncb_index) {
  673. auto fm = kern_param.filter_meta;
  674. size_t IC = fm.icpg;
  675. size_t OC = fm.ocpg;
  676. WorkspaceBundle bundle = wbundle;
  677. for (size_t ic = 0; ic < IC; ic++) {
  678. MultithreadDirectConvCommon<float, float>::
  679. copy_padding_kern_stride(bundle, kern_param, ncb_index,
  680. {ncb_index.thread_id, 0, ic});
  681. }
  682. for (size_t oc = 0; oc < OC; oc++) {
  683. MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
  684. bundle, kern_param, ncb_index, conv_kern_function,
  685. {ncb_index.thread_id, 0, oc});
  686. }
  687. };
  688. ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
  689. } else {
  690. WorkspaceBundle bundle = wbundle;
  691. auto copy_padding = [bundle](const NCBKernParam& kern_param,
  692. const NCBKernIndex& ncb_index) {
  693. MultithreadDirectConvCommon<float, float>::copy_padding_kern_stride(
  694. bundle, kern_param, ncb_index, ncb_index.ndrange_id);
  695. };
  696. ret_kerns.push_back({copy_padding, {group, N, IC}});
  697. auto do_conv = [bundle, conv_kern_function](
  698. const NCBKernParam& kern_param,
  699. const NCBKernIndex& ncb_index) {
  700. MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
  701. bundle, kern_param, ncb_index, conv_kern_function,
  702. ncb_index.ndrange_id);
  703. };
  704. ret_kerns.push_back({do_conv, {group, N, OC}});
  705. }
  706. return ret_kerns;
  707. }
  708. SmallVector<ConvBiasImpl::NCBKern>
  709. ConvBiasImpl::AlgoF32DirectStride2::dispatch_kerns(
  710. fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
  711. MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 2, 2) {
  712. return get_kimpls(param);
  713. }
  714. MIDOUT_END();
  715. return {};
  716. }
  717. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台