You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

algos.cpp 34 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706
  1. /**
  2. * \file dnn/src/arm_common/conv_bias/fp32/algos.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "src/arm_common/conv_bias/fp32/algos.h"
  13. #include "src/arm_common/conv_bias/direct/multi_thread_common.h"
  14. #include "src/arm_common/conv_bias/fp32/direct.h"
  15. #include "src/arm_common/conv_bias/fp32/do_conv_stride1.h"
  16. #include "src/arm_common/conv_bias/fp32/do_conv_stride2.h"
  17. #include "src/arm_common/conv_bias/fp32/strategy.h"
  18. #include "src/arm_common/conv_bias/img2col_helper.h"
  19. #include "src/arm_common/conv_bias/postprocess_helper.h"
  20. #include "src/common/opr_delegate.h"
  21. #include "src/fallback/conv_bias/common.h"
  22. #include "midout.h"
  23. MIDOUT_DECL(megdnn_arm_common_winograd_fp32)
  24. using namespace megdnn;
  25. using namespace arm_common;
  26. /* ======================= AlgoFP32WinogradF23_4x4 ======================== */
  27. bool ConvBiasImpl::AlgoFP32WinogradF23_4x4::usable(
  28. const NCBKernSizeParam& param,
  29. AlgoSelectionStrategy /*algo_selection_strategy*/) const {
  30. MEGDNN_MARK_USED_VAR(param);
  31. MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 0, 0) {
  32. if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0)
  33. return false;
  34. using Strategy = winograd::winograd_2x3_4x4_f;
  35. using PackMode = fallback::MatrixMulImpl::AlgoBase::PackMode;
  36. Strategy strategy(param.src_type, param.filter_type, param.dst_type);
  37. auto&& matmul_param =
  38. megdnn::winograd::ConvBias<Strategy,
  39. param::MatrixMul::Format::MK4>(
  40. strategy, m_tile_size, param)
  41. .get_matmul_kern_param(param);
  42. return m_matmul_algo->usable(matmul_param) &&
  43. m_matmul_algo->packmode() == PackMode::NO_PACK &&
  44. (param.filter_meta.format == param::ConvBias::Format::NCHW ||
  45. (param.filter_meta.format ==
  46. param::ConvBias::Format::NCHW_WINOGRAD &&
  47. param.output_block_size == 2 &&
  48. param.winograd_matmul_format ==
  49. param::MatrixMul::Format::MK4)) &&
  50. !param.filter_meta.should_flip &&
  51. (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
  52. param.filter_meta.spatial[0] == 3) &&
  53. (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
  54. param.filter_meta.stride[0] == 1) &&
  55. (param.filter_meta.dilation[0] ==
  56. param.filter_meta.dilation[1] &&
  57. param.filter_meta.dilation[0] == 1) &&
  58. param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
  59. param.src_type.enumv() == DTypeEnum::Float32;
  60. }
  61. MIDOUT_END();
  62. return false;
  63. }
  64. MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP32WinogradF23_4x4,
  65. winograd::winograd_2x3_4x4_f,
  66. megdnn_arm_common_winograd_fp32,
  67. param::MatrixMul::Format::MK4);
  68. /* ======================= AlgoFP32WinogradF63 ======================== */
  69. bool ConvBiasImpl::AlgoFP32WinogradF63::usable(
  70. const NCBKernSizeParam& param,
  71. AlgoSelectionStrategy /*algo_selection_strategy*/) const {
  72. MEGDNN_MARK_USED_VAR(param);
  73. MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 1, 0) {
  74. using Strategy = winograd::winograd_6x3_1x1_f;
  75. Strategy strategy(param.src_type, param.filter_type, param.dst_type);
  76. auto&& matmul_param = megdnn::winograd::ConvBias<Strategy>(
  77. strategy, m_tile_size, param)
  78. .get_matmul_kern_param(param);
  79. return m_matmul_algo->usable(matmul_param) &&
  80. (param.filter_meta.format == param::ConvBias::Format::NCHW ||
  81. (param.filter_meta.format ==
  82. param::ConvBias::Format::NCHW_WINOGRAD &&
  83. param.output_block_size == 6 &&
  84. param.winograd_matmul_format ==
  85. param::MatrixMul::Format::DEFAULT)) &&
  86. !param.filter_meta.should_flip &&
  87. (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
  88. param.filter_meta.spatial[0] == 3) &&
  89. (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
  90. param.filter_meta.stride[0] == 1) &&
  91. (param.filter_meta.dilation[0] ==
  92. param.filter_meta.dilation[1] &&
  93. param.filter_meta.dilation[0] == 1) &&
  94. param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
  95. param.src_type.enumv() == DTypeEnum::Float32;
  96. }
  97. MIDOUT_END();
  98. return false;
  99. }
  100. MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP32WinogradF63,
  101. winograd::winograd_6x3_1x1_f,
  102. megdnn_arm_common_winograd_fp32,
  103. param::MatrixMul::Format::DEFAULT);
  104. /* ======================= AlgoFP32WinogradF54 ======================== */
  105. bool ConvBiasImpl::AlgoFP32WinogradF54::usable(
  106. const NCBKernSizeParam& param,
  107. AlgoSelectionStrategy /*algo_selection_strategy*/) const {
  108. MEGDNN_MARK_USED_VAR(param);
  109. MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 2, 0) {
  110. using Strategy = winograd::winograd_5x4_1x1_f;
  111. Strategy strategy(param.src_type, param.filter_type, param.dst_type);
  112. auto&& matmul_param = megdnn::winograd::ConvBias<Strategy>(
  113. strategy, m_tile_size, param)
  114. .get_matmul_kern_param(param);
  115. return m_matmul_algo->usable(matmul_param) &&
  116. (param.filter_meta.format == param::ConvBias::Format::NCHW ||
  117. (param.filter_meta.format ==
  118. param::ConvBias::Format::NCHW_WINOGRAD &&
  119. param.output_block_size == 5 &&
  120. param.winograd_matmul_format ==
  121. param::MatrixMul::Format::DEFAULT)) &&
  122. !param.filter_meta.should_flip &&
  123. (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
  124. param.filter_meta.spatial[0] == 4) &&
  125. (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
  126. param.filter_meta.stride[0] == 1) &&
  127. (param.filter_meta.dilation[0] ==
  128. param.filter_meta.dilation[1] &&
  129. param.filter_meta.dilation[0] == 1) &&
  130. param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
  131. param.src_type.enumv() == DTypeEnum::Float32;
  132. }
  133. MIDOUT_END();
  134. return false;
  135. }
  136. MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP32WinogradF54,
  137. winograd::winograd_5x4_1x1_f,
  138. megdnn_arm_common_winograd_fp32,
  139. param::MatrixMul::Format::DEFAULT);
  140. /* ======================= AlgoFP32WinogradF45 ======================== */
  141. bool ConvBiasImpl::AlgoFP32WinogradF45::usable(
  142. const NCBKernSizeParam& param,
  143. AlgoSelectionStrategy /*algo_selection_strategy*/) const {
  144. MEGDNN_MARK_USED_VAR(param);
  145. MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 3, 0) {
  146. using Strategy = winograd::winograd_4x5_1x1_f;
  147. Strategy strategy(param.src_type, param.filter_type, param.dst_type);
  148. auto&& matmul_param = megdnn::winograd::ConvBias<Strategy>(
  149. strategy, m_tile_size, param)
  150. .get_matmul_kern_param(param);
  151. return m_matmul_algo->usable(matmul_param) &&
  152. (param.filter_meta.format == param::ConvBias::Format::NCHW ||
  153. (param.filter_meta.format ==
  154. param::ConvBias::Format::NCHW_WINOGRAD &&
  155. param.output_block_size == 4 &&
  156. param.winograd_matmul_format ==
  157. param::MatrixMul::Format::DEFAULT)) &&
  158. !param.filter_meta.should_flip &&
  159. (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
  160. param.filter_meta.spatial[0] == 5) &&
  161. (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
  162. param.filter_meta.stride[0] == 1) &&
  163. (param.filter_meta.dilation[0] ==
  164. param.filter_meta.dilation[1] &&
  165. param.filter_meta.dilation[0] == 1) &&
  166. param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
  167. param.src_type.enumv() == DTypeEnum::Float32;
  168. }
  169. MIDOUT_END();
  170. return false;
  171. }
  172. MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP32WinogradF45,
  173. winograd::winograd_4x5_1x1_f,
  174. megdnn_arm_common_winograd_fp32,
  175. param::MatrixMul::Format::DEFAULT);
  176. /* ======================= AlgoFP32WinogradF63_4x4 ======================== */
  177. bool ConvBiasImpl::AlgoFP32WinogradF63_4x4::usable(
  178. const NCBKernSizeParam& param,
  179. AlgoSelectionStrategy /*algo_selection_strategy*/) const {
  180. MEGDNN_MARK_USED_VAR(param);
  181. MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 4, 0) {
  182. if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0)
  183. return false;
  184. using Strategy = winograd::winograd_6x3_4x4_f;
  185. using PackMode = fallback::MatrixMulImpl::AlgoBase::PackMode;
  186. Strategy strategy(param.src_type, param.filter_type, param.dst_type);
  187. auto&& matmul_param =
  188. megdnn::winograd::ConvBias<Strategy,
  189. param::MatrixMul::Format::MK4>(
  190. strategy, m_tile_size, param)
  191. .get_matmul_kern_param(param);
  192. return m_matmul_algo->usable(matmul_param) &&
  193. m_matmul_algo->packmode() == PackMode::NO_PACK &&
  194. (param.filter_meta.format == param::ConvBias::Format::NCHW ||
  195. (param.filter_meta.format ==
  196. param::ConvBias::Format::NCHW_WINOGRAD &&
  197. param.output_block_size == 6 &&
  198. param.winograd_matmul_format ==
  199. param::MatrixMul::Format::MK4)) &&
  200. !param.filter_meta.should_flip &&
  201. (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
  202. param.filter_meta.spatial[0] == 3) &&
  203. (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
  204. param.filter_meta.stride[0] == 1) &&
  205. (param.filter_meta.dilation[0] ==
  206. param.filter_meta.dilation[1] &&
  207. param.filter_meta.dilation[0] == 1) &&
  208. param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
  209. param.src_type.enumv() == DTypeEnum::Float32 &&
  210. param.filter_meta.icpg % 4 == 0 &&
  211. param.filter_meta.ocpg % 4 == 0;
  212. }
  213. MIDOUT_END();
  214. return false;
  215. }
  216. MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP32WinogradF63_4x4,
  217. winograd::winograd_6x3_4x4_f,
  218. megdnn_arm_common_winograd_fp32,
  219. param::MatrixMul::Format::MK4);
  220. /* =================== AlgoFP32WinogradF23_4x4_NCHW44 =================== */
  221. bool ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44::usable(
  222. const NCBKernSizeParam& param,
  223. AlgoSelectionStrategy /*algo_selection_strategy*/) const {
  224. MEGDNN_MARK_USED_VAR(param);
  225. MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32,
  226. midout_iv("AlgoFP32WinogradF23_4x4_NCHW44"_hash)) {
  227. if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0)
  228. return false;
  229. using Strategy = winograd::winograd_F23_mk4_f_nchw44;
  230. Strategy strategy(param.src_type, param.filter_type, param.dst_type);
  231. auto&& matmul_param =
  232. megdnn::winograd::ConvBias<Strategy,
  233. param::MatrixMul::Format::MK4>(
  234. strategy, m_tile_size, param)
  235. .get_matmul_kern_param(param);
  236. return m_matmul_algo->usable(matmul_param) &&
  237. m_matmul_algo->packmode() ==
  238. fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK &&
  239. (param.filter_meta.format == param::ConvBias::Format::NCHW44 ||
  240. (param.filter_meta.format ==
  241. param::ConvBias::Format::NCHW44_WINOGRAD &&
  242. param.output_block_size == 2 &&
  243. param.winograd_matmul_format ==
  244. param::MatrixMul::Format::MK4)) &&
  245. !param.filter_meta.should_flip &&
  246. (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
  247. param.filter_meta.spatial[0] == 3) &&
  248. (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
  249. param.filter_meta.stride[0] == 1) &&
  250. (param.filter_meta.dilation[0] ==
  251. param.filter_meta.dilation[1] &&
  252. param.filter_meta.dilation[0] == 1) &&
  253. param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
  254. param.src_type.enumv() == DTypeEnum::Float32;
  255. }
  256. MIDOUT_END();
  257. return false;
  258. }
  259. MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP32WinogradF23_4x4_NCHW44,
  260. winograd::winograd_F23_mk4_f_nchw44,
  261. megdnn_arm_common_winograd_fp32,
  262. param::MatrixMul::Format::MK4);
  263. /* =================== AlgoFP32WinogradF63_4x4_NCHW44 ===================== */
  264. bool ConvBiasImpl::AlgoFP32WinogradF63_4x4_NCHW44::usable(
  265. const NCBKernSizeParam& param,
  266. AlgoSelectionStrategy /*algo_selection_strategy*/) const {
  267. MEGDNN_MARK_USED_VAR(param);
  268. MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32,
  269. midout_iv("AlgoFP32WinogradF63_4x4_NCHW44"_hash)) {
  270. if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0)
  271. return false;
  272. using Strategy = winograd::winograd_F63_mk4_f_nchw44;
  273. Strategy strategy(param.src_type, param.filter_type, param.dst_type);
  274. auto&& matmul_param =
  275. megdnn::winograd::ConvBias<Strategy,
  276. param::MatrixMul::Format::MK4>(
  277. strategy, m_tile_size, param)
  278. .get_matmul_kern_param(param);
  279. return m_matmul_algo->usable(matmul_param) &&
  280. m_matmul_algo->packmode() ==
  281. fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK &&
  282. (param.filter_meta.format == param::ConvBias::Format::NCHW44 ||
  283. (param.filter_meta.format ==
  284. param::ConvBias::Format::NCHW44_WINOGRAD &&
  285. param.output_block_size == 6 &&
  286. param.winograd_matmul_format ==
  287. param::MatrixMul::Format::MK4)) &&
  288. !param.filter_meta.should_flip &&
  289. (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
  290. param.filter_meta.spatial[0] == 3) &&
  291. (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
  292. param.filter_meta.stride[0] == 1) &&
  293. (param.filter_meta.dilation[0] ==
  294. param.filter_meta.dilation[1] &&
  295. param.filter_meta.dilation[0] == 1) &&
  296. param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
  297. param.src_type.enumv() == DTypeEnum::Float32 &&
  298. param.filter_meta.icpg % 4 == 0 &&
  299. param.filter_meta.ocpg % 4 == 0;
  300. }
  301. MIDOUT_END();
  302. return false;
  303. }
  304. MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP32WinogradF63_4x4_NCHW44,
  305. winograd::winograd_F63_mk4_f_nchw44,
  306. megdnn_arm_common_winograd_fp32,
  307. param::MatrixMul::Format::MK4);
  308. /* ===================== direct algo ===================== */
  309. MIDOUT_DECL(megdnn_arm_common_conv_bias_f32_kimpl);
  310. bool ConvBiasImpl::AlgoF32Direct::usable(
  311. const NCBKernSizeParam& param,
  312. AlgoSelectionStrategy algo_selection_strategy) const {
  313. MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 0, 0) {
  314. auto&& fm = param.filter_meta;
  315. auto FH = fm.spatial[0];
  316. auto SH = fm.stride[0], SW = fm.stride[1];
  317. // the condition ``param.isz[0]*param.isz[1] >= 4'' and
  318. // ``param.osz[0]*param.osz[1] >= 4'' comes from the fact that the
  319. // kernel may have access to up to 4 floats after the end of the memory
  320. // chunk.
  321. bool aviliable = fm.format == param::ConvBias::Format::NCHW &&
  322. param.src_type.enumv() == DTypeEnum::Float32 &&
  323. param.filter_type.enumv() == DTypeEnum::Float32 &&
  324. param.dst_type.enumv() == DTypeEnum::Float32 &&
  325. fm.spatial_ndim == 2 && fm.dilation[0] == 1 &&
  326. fm.dilation[1] == 1 &&
  327. param.isz[0] * param.isz[1] >= 4 &&
  328. param.osz[0] * param.osz[1] >= 4 && FH <= 7 &&
  329. SH == 1 && SW == 1;
  330. if (algo_selection_strategy == AlgoSelectionStrategy::HEURISTIC) {
  331. bool large_group = param.filter_meta.group >= param.nr_threads;
  332. aviliable &= (large_group == m_large_group);
  333. }
  334. return aviliable;
  335. }
  336. MIDOUT_END();
  337. return false;
  338. }
  339. size_t ConvBiasImpl::AlgoF32Direct::get_workspace(
  340. const NCBKernSizeParam& param) const {
  341. MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 0, 1) {
  342. auto wbundle = MultithreadDirectConvCommon<float, float>::get_bundle(
  343. param, m_large_group);
  344. return wbundle.total_size_in_bytes();
  345. }
  346. MIDOUT_END();
  347. return 0;
  348. }
  349. SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
  350. const NCBKernSizeParam& param) const {
  351. auto fm = param.filter_meta;
  352. size_t N = param.n;
  353. size_t IC = param.filter_meta.icpg;
  354. size_t OC = param.filter_meta.ocpg;
  355. size_t group = fm.group;
  356. WorkspaceBundle bundle =
  357. MultithreadDirectConvCommon<float, float>::get_bundle(
  358. param, m_large_group);
  359. SmallVector<NCBKern> ret_kerns;
  360. //! When group >= nr_threads, treat it as large_group, each thread process
  361. //! one group for better performance
  362. if (m_large_group) {
  363. //! Channel wise conv and big groups
  364. auto exec_one_group = [bundle](const NCBKernParam& kern_param,
  365. const NCBKernIndex& ncb_index) mutable {
  366. auto fm = kern_param.filter_meta;
  367. size_t IC = fm.icpg;
  368. size_t OC = fm.ocpg;
  369. bundle.set(kern_param.workspace_ptr);
  370. if (fm.should_flip) {
  371. for (size_t oc = 0; oc < OC; oc++) {
  372. MultithreadDirectConvCommon<float, float>::weight_flip_kern(
  373. bundle, kern_param, ncb_index,
  374. {ncb_index.thread_id, 0, oc});
  375. }
  376. }
  377. for (size_t ic = 0; ic < IC; ic++) {
  378. MultithreadDirectConvCommon<float, float>::copy_padding_kern(
  379. bundle, kern_param, ncb_index,
  380. {ncb_index.thread_id, 0, ic});
  381. }
  382. for (size_t oc = 0; oc < OC; oc++) {
  383. MultithreadDirectConvCommon<float, float>::do_conv_kern(
  384. bundle, kern_param, ncb_index,
  385. fp32::conv_bias::kern_direct,
  386. {ncb_index.thread_id, 0, oc});
  387. }
  388. };
  389. ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
  390. } else {
  391. if (fm.should_flip) {
  392. auto weight_flip = [bundle](const NCBKernParam& kern_param,
  393. const NCBKernIndex& ncb_index) mutable {
  394. bundle.set(kern_param.workspace_ptr);
  395. MultithreadDirectConvCommon<float, float>::weight_flip_kern(
  396. bundle, kern_param, ncb_index, ncb_index.ndrange_id);
  397. };
  398. ret_kerns.push_back({weight_flip, {group, 1_z, OC}});
  399. }
  400. auto copy_padding = [bundle](const NCBKernParam& kern_param,
  401. const NCBKernIndex& ncb_index) mutable {
  402. bundle.set(kern_param.workspace_ptr);
  403. MultithreadDirectConvCommon<float, float>::copy_padding_kern(
  404. bundle, kern_param, ncb_index, ncb_index.ndrange_id);
  405. };
  406. ret_kerns.push_back({copy_padding, {group, N, IC}});
  407. auto do_conv = [bundle](const NCBKernParam& kern_param,
  408. const NCBKernIndex& ncb_index) mutable {
  409. bundle.set(kern_param.workspace_ptr);
  410. MultithreadDirectConvCommon<float, float>::do_conv_kern(
  411. bundle, kern_param, ncb_index, fp32::conv_bias::kern_direct,
  412. ncb_index.ndrange_id);
  413. };
  414. ret_kerns.push_back({do_conv, {group, N, OC}});
  415. }
  416. return ret_kerns;
  417. }
  418. SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::dispatch_kerns(
  419. const NCBKernSizeParam& param) const {
  420. MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 0, 1) {
  421. return get_kimpls(param);
  422. }
  423. MIDOUT_END();
  424. return {};
  425. }
  426. /* ===================== stride-1 algo ===================== */
  427. bool ConvBiasImpl::AlgoF32DirectStride1::usable(
  428. const NCBKernSizeParam& param,
  429. AlgoSelectionStrategy algo_selection_strategy) const {
  430. MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 1, 1) {
  431. auto&& fm = param.filter_meta;
  432. auto FH = fm.spatial[0];
  433. bool aviliable =
  434. param.filter_meta.format == param::ConvBias::Format::NCHW &&
  435. param.src_type.enumv() == DTypeEnum::Float32 &&
  436. param.filter_type.enumv() == DTypeEnum::Float32 &&
  437. param.dst_type.enumv() == DTypeEnum::Float32 &&
  438. !fm.should_flip && fm.spatial_ndim == 2 &&
  439. fm.dilation[0] == 1 && fm.dilation[1] == 1 &&
  440. fm.stride[0] == 1 && fm.stride[1] == 1 && FH == fm.spatial[1] &&
  441. (FH == 2 || FH == 3 || FH == 5 || FH == 7);
  442. if (algo_selection_strategy ==
  443. ConvBiasImpl::AlgoSelectionStrategy::HEURISTIC) {
  444. bool large_group = param.filter_meta.group >= param.nr_threads;
  445. aviliable &= (large_group == m_large_group);
  446. }
  447. return aviliable;
  448. }
  449. MIDOUT_END();
  450. return false;
  451. }
  452. size_t ConvBiasImpl::AlgoF32DirectStride1::get_workspace(
  453. const NCBKernSizeParam& param) const {
  454. MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 1, 1) {
  455. auto bundle =
  456. MultithreadDirectConvCommon<float, float>::get_bundle_stride(
  457. param, m_large_group);
  458. return bundle.total_size_in_bytes();
  459. }
  460. MIDOUT_END();
  461. return 0;
  462. }
  463. SmallVector<ConvBiasImpl::NCBKern>
  464. ConvBiasImpl::AlgoF32DirectStride1::get_kimpls(
  465. const NCBKernSizeParam& param) const {
  466. auto fm = param.filter_meta;
  467. auto FH = fm.spatial[0];
  468. size_t N = param.n;
  469. size_t IC = param.filter_meta.icpg;
  470. size_t OC = param.filter_meta.ocpg;
  471. size_t group = fm.group;
  472. using Func = std::function<void(const float*, const float*, float*, size_t,
  473. size_t, size_t, size_t, size_t)>;
  474. Func conv_kern_function = nullptr;
  475. #define SWITCH_KERN_STR1() \
  476. switch (FH) { \
  477. case 2: \
  478. conv_kern_function = fp32::conv_stride1::do_conv_2x2_stride1; \
  479. break; \
  480. case 3: \
  481. conv_kern_function = fp32::conv_stride1::do_conv_3x3_stride1; \
  482. break; \
  483. case 5: \
  484. conv_kern_function = fp32::conv_stride1::do_conv_5x5_stride1; \
  485. break; \
  486. case 7: \
  487. conv_kern_function = fp32::conv_stride1::do_conv_7x7_stride1; \
  488. break; \
  489. }
  490. SWITCH_KERN_STR1();
  491. WorkspaceBundle bundle =
  492. MultithreadDirectConvCommon<float, float>::get_bundle_stride(
  493. param, m_large_group);
  494. SmallVector<NCBKern> ret_kerns;
  495. //! When group >= nr_threads, treat it as large_group, each thread process
  496. //! one group for better performance
  497. if (m_large_group) {
  498. //! Channel wise conv and big groups
  499. auto exec_one_group = [bundle, conv_kern_function](
  500. const NCBKernParam& kern_param,
  501. const NCBKernIndex& ncb_index) mutable {
  502. auto fm = kern_param.filter_meta;
  503. size_t IC = fm.icpg;
  504. size_t OC = fm.ocpg;
  505. bundle.set(kern_param.workspace_ptr);
  506. for (size_t ic = 0; ic < IC; ic++) {
  507. MultithreadDirectConvCommon<float, float>::
  508. copy_padding_kern_stride(bundle, kern_param, ncb_index,
  509. {ncb_index.thread_id, 0, ic});
  510. }
  511. for (size_t oc = 0; oc < OC; oc++) {
  512. MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
  513. bundle, kern_param, ncb_index, conv_kern_function,
  514. {ncb_index.thread_id, 0, oc});
  515. }
  516. };
  517. ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
  518. } else {
  519. auto copy_padding = [bundle](const NCBKernParam& kern_param,
  520. const NCBKernIndex& ncb_index) mutable {
  521. bundle.set(kern_param.workspace_ptr);
  522. MultithreadDirectConvCommon<float, float>::copy_padding_kern_stride(
  523. bundle, kern_param, ncb_index, ncb_index.ndrange_id);
  524. };
  525. ret_kerns.push_back({copy_padding, {group, N, IC}});
  526. auto do_conv = [bundle, conv_kern_function](
  527. const NCBKernParam& kern_param,
  528. const NCBKernIndex& ncb_index) mutable {
  529. bundle.set(kern_param.workspace_ptr);
  530. MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
  531. bundle, kern_param, ncb_index, conv_kern_function,
  532. ncb_index.ndrange_id);
  533. };
  534. ret_kerns.push_back({do_conv, {group, N, OC}});
  535. }
  536. return ret_kerns;
  537. }
  538. SmallVector<ConvBiasImpl::NCBKern>
  539. ConvBiasImpl::AlgoF32DirectStride1::dispatch_kerns(
  540. const NCBKernSizeParam& param) const {
  541. MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 1, 2) {
  542. return get_kimpls(param);
  543. }
  544. MIDOUT_END();
  545. return {};
  546. }
  547. /* ===================== stride-2 algo ===================== */
  548. bool ConvBiasImpl::AlgoF32DirectStride2::usable(
  549. const NCBKernSizeParam& param,
  550. AlgoSelectionStrategy algo_selection_strategy) const {
  551. MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 2, 0) {
  552. auto&& fm = param.filter_meta;
  553. auto FH = fm.spatial[0];
  554. bool aviliable =
  555. param.filter_meta.format == param::ConvBias::Format::NCHW &&
  556. param.src_type.enumv() == DTypeEnum::Float32 &&
  557. param.filter_type.enumv() == DTypeEnum::Float32 &&
  558. param.dst_type.enumv() == DTypeEnum::Float32 &&
  559. !fm.should_flip && fm.spatial_ndim == 2 &&
  560. fm.dilation[0] == 1 && fm.dilation[1] == 1 &&
  561. fm.stride[0] == 2 && fm.stride[1] == 2 && FH == fm.spatial[1] &&
  562. (FH == 2 || FH == 3 || FH == 5 || FH == 7);
  563. if (algo_selection_strategy ==
  564. ConvBiasImpl::AlgoSelectionStrategy::HEURISTIC) {
  565. bool large_group = param.filter_meta.group >= param.nr_threads;
  566. aviliable &= (large_group == m_large_group);
  567. }
  568. return aviliable;
  569. }
  570. MIDOUT_END();
  571. return false;
  572. }
  573. size_t ConvBiasImpl::AlgoF32DirectStride2::get_workspace(
  574. const NCBKernSizeParam& param) const {
  575. MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 2, 1) {
  576. auto bundle =
  577. MultithreadDirectConvCommon<float, float>::get_bundle_stride(
  578. param, m_large_group);
  579. return bundle.total_size_in_bytes();
  580. }
  581. MIDOUT_END();
  582. return 0;
  583. }
  584. SmallVector<ConvBiasImpl::NCBKern>
  585. ConvBiasImpl::AlgoF32DirectStride2::get_kimpls(
  586. const NCBKernSizeParam& param) const {
  587. auto fm = param.filter_meta;
  588. auto FH = fm.spatial[0];
  589. size_t N = param.n;
  590. size_t IC = param.filter_meta.icpg;
  591. size_t OC = param.filter_meta.ocpg;
  592. size_t group = fm.group;
  593. using Func = std::function<void(const float*, const float*, float*, size_t,
  594. size_t, size_t, size_t, size_t)>;
  595. Func conv_kern_function = nullptr;
  596. #define SWITCH_KERN_STR2() \
  597. switch (FH) { \
  598. case 2: \
  599. conv_kern_function = fp32::conv_stride2::do_conv_2x2_stride2; \
  600. break; \
  601. case 3: \
  602. conv_kern_function = fp32::conv_stride2::do_conv_3x3_stride2; \
  603. break; \
  604. case 5: \
  605. conv_kern_function = fp32::conv_stride2::do_conv_5x5_stride2; \
  606. break; \
  607. case 7: \
  608. conv_kern_function = fp32::conv_stride2::do_conv_7x7_stride2; \
  609. break; \
  610. }
  611. SWITCH_KERN_STR2();
  612. WorkspaceBundle bundle =
  613. MultithreadDirectConvCommon<float, float>::get_bundle_stride(
  614. param, m_large_group);
  615. SmallVector<NCBKern> ret_kerns;
  616. //! When group >= nr_threads, treat it as large_group, each thread process
  617. //! one group for better performance
  618. if (m_large_group) {
  619. //! Channel wise conv and big groups
  620. auto exec_one_group = [bundle, conv_kern_function](
  621. const NCBKernParam& kern_param,
  622. const NCBKernIndex& ncb_index) mutable {
  623. auto fm = kern_param.filter_meta;
  624. size_t IC = fm.icpg;
  625. size_t OC = fm.ocpg;
  626. bundle.set(kern_param.workspace_ptr);
  627. for (size_t ic = 0; ic < IC; ic++) {
  628. MultithreadDirectConvCommon<float, float>::
  629. copy_padding_kern_stride(bundle, kern_param, ncb_index,
  630. {ncb_index.thread_id, 0, ic});
  631. }
  632. for (size_t oc = 0; oc < OC; oc++) {
  633. MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
  634. bundle, kern_param, ncb_index, conv_kern_function,
  635. {ncb_index.thread_id, 0, oc});
  636. }
  637. };
  638. ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
  639. } else {
  640. auto copy_padding = [bundle](const NCBKernParam& kern_param,
  641. const NCBKernIndex& ncb_index) mutable {
  642. bundle.set(kern_param.workspace_ptr);
  643. MultithreadDirectConvCommon<float, float>::copy_padding_kern_stride(
  644. bundle, kern_param, ncb_index, ncb_index.ndrange_id);
  645. };
  646. ret_kerns.push_back({copy_padding, {group, N, IC}});
  647. auto do_conv = [bundle, conv_kern_function](
  648. const NCBKernParam& kern_param,
  649. const NCBKernIndex& ncb_index) mutable {
  650. bundle.set(kern_param.workspace_ptr);
  651. MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
  652. bundle, kern_param, ncb_index, conv_kern_function,
  653. ncb_index.ndrange_id);
  654. };
  655. ret_kerns.push_back({do_conv, {group, N, OC}});
  656. }
  657. return ret_kerns;
  658. }
  659. SmallVector<ConvBiasImpl::NCBKern>
  660. ConvBiasImpl::AlgoF32DirectStride2::dispatch_kerns(
  661. const NCBKernSizeParam& param) const {
  662. MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 2, 2) {
  663. return get_kimpls(param);
  664. }
  665. MIDOUT_END();
  666. return {};
  667. }
  668. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台