
algos.cpp 27 kB

/**
 * \file dnn/src/fallback/convolution/algos.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "src/fallback/convolution/algos.h"
#include "src/common/opr_delegate.h"
#include "src/fallback/convolution/col2img_helper.h"
#include "src/fallback/convolution/run_conv.h"

#include "midout.h"

using namespace megdnn;
using namespace fallback;

MIDOUT_DECL(megdnn_fallback_conv)
MIDOUT_DECL(megdnn_fallback_deconv)

namespace {
// advance a typed pointer by a raw byte offset
template <typename T>
void incr_ptr(T*& dst, ptrdiff_t delta) {
    dst = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(dst) + delta);
}

using NCBKernSizeParam = ConvolutionBackwardDataImpl::NCBKernSizeParam;
using NCBKernParam = ConvolutionBackwardDataImpl::NCBKernParam;

Relayout* get_relayout_opr() {
    static CpuOprDelegationStorage<> storage;
    return storage.get<Relayout>();
}
MatrixMul* get_matmul_opr(const NCBKernSizeParam& param) {
    using ConvCM = param::Convolution::ComputeMode;
    using MmCM = param::MatrixMul::ComputeMode;
    static CpuOprDelegationStorage<2> storage;
    switch (param.compute_mode) {
        default:
            return storage.get<MatrixMul, 0>({});
        case ConvCM::FLOAT32: {
            MatrixMul::Param p;
            p.compute_mode = MmCM::FLOAT32;
            return storage.get<MatrixMul, 1>(p);
        }
    }
}
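// Note on the two-slot CpuOprDelegationStorage above: it caches one MatrixMul
// operator per compute mode. Slot 0 holds the default-parameter operator, and
// slot 1 holds the one with FLOAT32 compute mode (presumably for float16 data
// accumulated in float32), so repeated dispatches reuse the same operators.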
WorkspaceBundle get_bundle(const NCBKernSizeParam& param) {
    UNPACK_CONV_F32_NCB_KERN_SIZES(param);
    MEGDNN_MARK_USED_VAR(N);
    MEGDNN_MARK_USED_VAR(OH);
    MEGDNN_MARK_USED_VAR(OW);
    bool can_matrix_mul_direct =
            (FH == 1 && FW == 1 && SH == 1 && SW == 1 && PH == 0 && PW == 0);
    // part0: temp space to store unrolled matrix
    // part1: workspace for matrix mul opr
    // part2: workspace for relayout opr
    size_t part0, part1, part2;
    if (can_matrix_mul_direct) {
        part0 = 0;
    } else {
        part0 = (IC * FH * FW * IH * IW) * param.grad_type.size();
    }
    part2 = (OC * IC * FH * FW) * param.filter_type.size();
    {
        TensorLayout A_, B_, C_;
        A_ = TensorLayout({IC * FH * FW, OC}, param.filter_type);
        B_ = TensorLayout({OC, IH * IW}, param.diff_type);
        C_ = TensorLayout({IC * FH * FW, IH * IW}, param.grad_type);
        part1 = get_matmul_opr(param)->get_workspace_in_bytes(A_, B_, C_);
    }
    return {nullptr, {part0, part1, part2}};
}
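// The matmul-based backward-data kernel below computes, per sample,
//     C(IC*FH*FW, IH*IW) = A(IC*FH*FW, OC) * B(OC, IH*IW)
// where B is the diff tensor, then scatters C back into the gradient with
// col2img. As a rough sizing illustration (shapes assumed for this comment,
// not taken from any caller): IC = 16, OC = 32, FH = FW = 3, IH = IW = 28,
// float32 everywhere gives
//     part0 = 16 * 3 * 3 * 28 * 28 * 4 = 451584 bytes (unrolled matrix)
//     part2 = 32 * 16 * 3 * 3 * 4     = 18432 bytes  (transposed filter)
// and part1 is whatever the delegated MatrixMul reports for those layouts.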
template <typename ftype, typename dtype, typename gtype>
void kern_matmul(const NCBKernParam& param) {
    bool is_xcorr = !param.filter_meta.should_flip;
    UNPACK_CONV_F32_NCB_KERN_SIZES(param);
    auto bundle = get_bundle(param);
    bundle.set(param.workspace_ptr);
    bool is1X1 =
            (FH == 1 && FW == 1 && SH == 1 && SW == 1 && PH == 0 && PW == 0);
    typedef void (*Func1)(const gtype*, gtype*, int, int, int, int, int, int,
                          int);
    typedef void (*Func2)(const gtype*, gtype*, int, int, int, int, int, int,
                          int, int, int, int, int);
    Func1 f1 = nullptr;
    Func2 f2 = nullptr;
    if (is_xcorr) {
        f1 = col2img<true>;
        f2 = col2img_stride_padding<true>;
    } else {
        f1 = col2img<false>;
        f2 = col2img_stride_padding<false>;
    }
    ftype* filter = const_cast<ftype*>(param.filter<ftype>());
    TensorND A_src, A_dst;
    {
        A_src.layout = TensorLayout({IC * FH * FW, OC},
                                    {static_cast<std::ptrdiff_t>(1),
                                     static_cast<std::ptrdiff_t>(IC * FH * FW)},
                                    param.filter_type);
        A_src.raw_ptr = static_cast<void*>(filter);
        A_dst.layout = TensorLayout({IC * FH * FW, OC}, param.filter_type);
        A_dst.raw_ptr = static_cast<void*>(bundle.get(2));
        // TODO: should be removed once armv8 convolution supports transpose.
        get_relayout_opr()->exec(A_src, A_dst, inplace_cpu_handle().get());
    }
    for (size_t n = 0; n < N; ++n) {
        gtype *C_src, *C_dst;
        dtype* diff =
                const_cast<dtype*>(param.diff<dtype>() + n * param.inp_bs);
        gtype* grad = param.grad<gtype>() + n * param.out_bs;
        if (is1X1) {
            C_src = grad;
        } else {
            C_src = static_cast<gtype*>(bundle.get(0));
        }
        {
            TensorND B_, C_;
            B_.layout = TensorLayout({OC, IH * IW}, param.diff_type);
            B_.raw_ptr = static_cast<void*>(diff);
            C_.layout = TensorLayout({IC * FH * FW, IH * IW}, param.grad_type);
            C_.raw_ptr = C_src;
            Workspace workspace(static_cast<dt_byte*>(bundle.get(1)),
                                bundle.get_size(1));
            get_matmul_opr(param)->exec(A_dst, B_, C_, workspace);
        }
        if (!is1X1) {
            C_dst = grad;
            std::memset(C_dst, 0, param.grad_type.size() * IC * OH * OW);
            if (PH == 0 && PW == 0 && SH == 1 && SW == 1) {
                f1(C_src, C_dst, OH, OW, IC, IH, IW, FH, FW);
            } else {
                f2(C_src, C_dst, OH, OW, IC, IH, IW, FH, FW, SH, SW, PH, PW);
            }
        }
    }
}
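// Note: the plain col2img path (f1) handles only the unit-stride,
// zero-padding case; the stride/padding-aware variant (f2) is used whenever
// SH/SW != 1 or PH/PW != 0, since the scatter indices then need the full
// stride-and-padding arithmetic.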
void kern_direct(const NCBKernParam& param) {
    UNPACK_CONV_F32_NCB_KERN_SIZES(param);
    auto diff = param.diff<float>(), filter = param.filter<float>();
    auto grad = param.grad<float>();
    for (size_t n = 0; n < N; ++n) {
        convolution::run_conv_backward_data(
                diff + n * param.inp_bs, filter, grad + n * param.out_bs,
                param.workspace_ptr, IH, IW, IC, FH, FW, OH, OW, OC, PH, PW,
                SH, SW, !param.filter_meta.should_flip);
    }
}

}  // namespace
/* ===================== fallback algo ===================== */

bool ConvolutionImpl::AlgoFallback::usable(
        const NCBKernSizeParam& param,
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
    auto&& fm = param.filter_meta;
    return fm.format == param::Convolution::Format::NCHW &&
           param.src_type.enumv() == DTypeEnum::Float32 &&
           param.filter_type.enumv() == DTypeEnum::Float32 &&
           param.dst_type.enumv() == DTypeEnum::Float32 &&
           fm.spatial_ndim == 2 && fm.dilation[0] == 1 && fm.dilation[1] == 1;
}

size_t ConvolutionImpl::AlgoFallback::get_workspace(
        const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(megdnn_fallback_conv,
                 midout_iv("AlgoFallback::get_workspace"_hash)) {
        auto FH = param.filter_meta.spatial[0],
             FW = param.filter_meta.spatial[1];
        size_t nr_threads = param.nr_threads;
        if (param.filter_meta.should_flip) {
            // need transposed filter
            return WorkspaceBundle{nullptr, {FH * FW * sizeof(float)}}
                           .total_size_in_bytes() *
                   nr_threads;
        } else {
            return 0;
        }
    }
    MIDOUT_END();
    return 0;
}
SmallVector<ConvolutionImpl::NCBKern>
ConvolutionImpl::AlgoFallback::dispatch_kern(
        const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(megdnn_fallback_conv,
                 midout_iv("AlgoFallback::dispatch_kern"_hash)) {
        size_t group = param.filter_meta.group;
        size_t N = param.n;
        size_t nr_threads = param.nr_threads;
        size_t workspace_per_thread = get_workspace(param) / nr_threads;
        auto kern_fallback = [workspace_per_thread](
                                     const NCBKernParam& p,
                                     const NCBKernIndex& ncb_index) {
            UNPACK_CONV_F32_NCB_KERN_SIZES(p);
            size_t batch_id = ncb_index.ndrange_id[1];
            size_t group_id = ncb_index.ndrange_id[0];
            MEGDNN_MARK_USED_VAR(N);
            auto src = p.src<float>(batch_id, group_id),
                 filter = p.filter<float>(group_id);
            auto dst = p.dst<float>(batch_id, group_id);
            size_t thread_id = ncb_index.thread_id;
            void* workspace_ptr = reinterpret_cast<void*>(
                    reinterpret_cast<ptrdiff_t>(p.workspace_ptr) +
                    workspace_per_thread * thread_id);
            convolution::run_conv(src, filter, dst, workspace_ptr, IH, IW, IC,
                                  FH, FW, OH, OW, OC, PH, PW, SH, SW,
                                  !p.filter_meta.should_flip);
        };
        return {{kern_fallback, {group, N, 1_z}}};
    }
    MIDOUT_END();
}
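// Note: the {group, N, 1_z} NDRange above means the dispatcher invokes
// kern_fallback once per (group, batch) pair, with ndrange_id[0] selecting
// the group and ndrange_id[1] the batch; each worker thread addresses its
// own workspace slice through thread_id.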
/* ===================== naive algo ===================== */

bool ConvolutionImpl::AlgoNaive::usable(
        const NCBKernSizeParam& param,
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
    bool ret = false;
#define cb(dt) ret |= (param.src_type.enumv() == DTypeTrait<dt>::enumv);
    MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb);
#undef cb
#define cb(dt_src, dt_dst)                                            \
    ret |= (param.src_type.enumv() == DTypeTrait<dt_src>::enumv &&    \
            param.filter_type.enumv() == DTypeTrait<dt_src>::enumv && \
            param.dst_type.enumv() == DTypeTrait<dt_dst>::enumv)
    cb(dtype::Int8, dtype::Int16);
    cb(dtype::Int8, dtype::Int32);
    cb(dtype::Quantized8Asymm, dtype::QuantizedS32);
    cb(dtype::QuantizedS8, dtype::QuantizedS32);
#undef cb
    ret = ret &&
          (param.filter_meta.format == param::Convolution::Format::NCHW ||
           param.filter_meta.format == param::Convolution::Format::NHWC);
    return ret;
}
SmallVector<ConvolutionImpl::NCBKern> ConvolutionImpl::AlgoNaive::dispatch_kern(
        const NCBKernSizeParam& param) const {
    size_t N = param.n;
    size_t group = param.filter_meta.group;
#define cb(dt, cmode, compute_type)                                      \
    do {                                                                 \
        if (param.src_type.enumv() == DTypeTrait<dt>::enumv &&           \
            param.compute_mode == param::ConvBias::ComputeMode::cmode) { \
            using ctype = DTypeTrait<dt>::ctype;                         \
            using comp_type = DTypeTrait<compute_type>::ctype;           \
            MIDOUT_BEGIN(megdnn_fallback_conv, midout_iv(1)) {           \
                return {{kern_naive_forward<ctype, ctype, comp_type>,    \
                         {group, N, 1_z}}};                              \
            }                                                            \
            MIDOUT_END();                                                \
        }                                                                \
    } while (0)
    cb(dtype::Float32, DEFAULT, dtype::Float32);
#if !MEGDNN_DISABLE_FLOAT16
    cb(dtype::Float16, DEFAULT, dtype::Float16);
    cb(dtype::Float16, FLOAT32, dtype::Float32);
#endif
#undef cb
#define cb(dt_src, dt_dst)                                              \
    do {                                                                \
        if (param.src_type.enumv() == DTypeTrait<dt_src>::enumv &&      \
            param.filter_type.enumv() == DTypeTrait<dt_src>::enumv &&   \
            param.dst_type.enumv() == DTypeTrait<dt_dst>::enumv) {      \
            MIDOUT_BEGIN(megdnn_fallback_conv, midout_iv(2)) {          \
                return {{kern_naive_forward<DTypeTrait<dt_src>::ctype,  \
                                            DTypeTrait<dt_dst>::ctype,  \
                                            DTypeTrait<dt_dst>::ctype>, \
                         {group, N, 1_z}}};                             \
            }                                                           \
            MIDOUT_END();                                               \
        }                                                               \
    } while (0)
    cb(dtype::Int8, dtype::Int16);
    cb(dtype::Int8, dtype::Int32);
    cb(dtype::Quantized8Asymm, dtype::QuantizedS32);
    cb(dtype::QuantizedS8, dtype::QuantizedS32);
    megdnn_throw("unknown convolution data type");
#undef cb
}
/* ===================== default algo ===================== */

ConvolutionImpl::AlgoDefault::AlgoDefault(ConvBiasImpl::AlgoBase* algorithm)
        : m_algorithm(algorithm) {
    megdnn_assert_internal(algorithm);
    m_name = ssprintf("CONVOLUTION_DEFAULT_%s", m_algorithm->name());
}

ConvBiasImpl::NCBKernSizeParam
ConvolutionImpl::AlgoDefault::init_conv_bias_param(
        const NCBKernSizeParam& param) {
    DType bias_type = param.dst_type;
    if (bias_type.category() == DTypeCategory::QUANTIZED) {
        bias_type = dtype::QuantizedS32(
                mul_scale(param.src_type, param.filter_type));
    }
    return {param,
            bias_type,
            0,
            BiasMode::NO_BIAS,
            param::ConvBias::NonlineMode::IDENTITY};
}
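// Note: for quantized convolutions the (unused, NO_BIAS) bias dtype is
// synthesized as QuantizedS32 with scale = scale(src) * scale(filter) via
// mul_scale, which is the scale an int32 accumulator of src * filter
// products naturally carries.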
bool ConvolutionImpl::AlgoDefault::is_preferred(
        const NCBKernSizeParam& param) const {
    ::ConvBiasImpl::NCBKernSizeParam conv_bias_param =
            init_conv_bias_param(param);
    return m_algorithm->is_preferred(conv_bias_param);
}

bool ConvolutionImpl::AlgoDefault::usable(
        const NCBKernSizeParam& param,
        AlgoSelectionStrategy algo_selection_strategy) const {
    ::ConvBiasImpl::NCBKernSizeParam conv_bias_param =
            init_conv_bias_param(param);
    return m_algorithm->usable(conv_bias_param,
                               static_cast<ConvBiasImpl::AlgoSelectionStrategy>(
                                       algo_selection_strategy));
}

WorkspaceBundle ConvolutionImpl::AlgoDefault::get_bundle(
        const NCBKernSizeParam& param) const {
    ::ConvBiasImpl::NCBKernSizeParam conv_bias_param =
            init_conv_bias_param(param);
    return WorkspaceBundle(nullptr,
                           {m_algorithm->get_workspace(conv_bias_param)});
}

size_t ConvolutionImpl::AlgoDefault::get_workspace(
        const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(megdnn_fallback_conv,
                 midout_iv("AlgoDefault::get_workspace"_hash)) {
        return get_bundle(param).total_size_in_bytes();
    }
    MIDOUT_END();
    return 0;
}
size_t ConvolutionImpl::AlgoDefault::get_preprocess_workspace(
        const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(megdnn_fallback_conv,
                 midout_iv("AlgoDefault::get_preprocess_workspace"_hash)) {
        ::ConvBiasImpl::NCBKernSizeParam conv_bias_param =
                init_conv_bias_param(param);
        return m_algorithm->get_preprocess_workspace(conv_bias_param);
    }
    MIDOUT_END();
}

SmallVector<TensorLayout>
ConvolutionImpl::AlgoDefault::deduce_preprocessed_filter_layout(
        const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(
            megdnn_fallback_conv,
            midout_iv("AlgoDefault::deduce_preprocessed_filter_layout"_hash)) {
        ::ConvBiasImpl::NCBKernSizeParam conv_bias_param =
                init_conv_bias_param(param);
        return m_algorithm->deduce_preprocessed_filter_layout(conv_bias_param);
    }
    MIDOUT_END();
}
//! Return the preprocess kernel implementation
SmallVector<ConvolutionImpl::NCBKern>
ConvolutionImpl::AlgoDefault::get_preprocess_kimpl(
        ConvBiasImpl::AlgoBase* algo, const NCBKernSizeParam& param) {
    MIDOUT_BEGIN(megdnn_fallback_conv, midout_iv("get_preprocess_kimpl"_hash)) {
        // construct the conv_bias kern param
        ::ConvBiasImpl::NCBKernParam conv_bias_param;
        static_cast<::ConvBiasImpl::NCBKernSizeParam&>(conv_bias_param) =
                init_conv_bias_param(param);
        auto conv_bias_preprocess_kerns =
                algo->dispatch_preprocess_kerns(conv_bias_param);
        SmallVector<ConvolutionImpl::NCBKern> convolution_preprocess_kerns;
        //! Set the conv_bias param using the convolution param
        auto set_param_filter_workspace_ptr =
                [](const NCBKernParam& conv_param,
                   ::ConvBiasImpl::NCBKernParam& conv_bias_param) {
                    conv_bias_param.filter_ptr = conv_param.filter_ptr;
                    conv_bias_param.workspace_ptr = conv_param.workspace_ptr;
                    conv_bias_param.workspace_size = conv_param.workspace_size;
                };
        for (size_t i = 0; i < conv_bias_preprocess_kerns.size(); i++) {
            auto kernel = conv_bias_preprocess_kerns[i];
            //! Wrap the conv_bias kernel so it can run with convolution params
            auto run = [param = conv_bias_param, kernel,
                        &set_param_filter_workspace_ptr](
                               const NCBKernParam& p,
                               const NCBKernIndex& ncb_index) mutable {
                set_param_filter_workspace_ptr(p, param);
                kernel.kern(param, {ncb_index.thread_id, ncb_index.ndrange_id});
            };
            convolution_preprocess_kerns.push_back({run, kernel.global_size});
        }
        return convolution_preprocess_kerns;
    }
    MIDOUT_END();
}
//! Return the kernel implementation
SmallVector<ConvolutionImpl::NCBKern> ConvolutionImpl::AlgoDefault::get_kimpl(
        ConvBiasImpl::AlgoBase* algo, const NCBKernSizeParam& param) {
    MIDOUT_BEGIN(megdnn_fallback_conv, midout_iv(0)) {
        // construct the conv_bias kern param
        ::ConvBiasImpl::NCBKernParam conv_bias_param;
        static_cast<::ConvBiasImpl::NCBKernSizeParam&>(conv_bias_param) =
                init_conv_bias_param(param);
        auto&& conv_bias_kerns = algo->dispatch_kerns(conv_bias_param);
        SmallVector<ConvolutionImpl::NCBKern> convolution_kerns;
        //! Set the conv_bias param using the convolution param
        auto set_copy_param_compute_address =
                [](const NCBKernParam& conv_param,
                   ::ConvBiasImpl::NCBKernParam& conv_bias_param) {
                    conv_bias_param.src_ptr = conv_param.src_ptr;
                    conv_bias_param.filter_ptr = conv_param.filter_ptr;
                    conv_bias_param.dst_ptr = conv_param.dst_ptr;
                    conv_bias_param.workspace_ptr = conv_param.workspace_ptr;
                    conv_bias_param.workspace_size = conv_param.workspace_size;
                };
        for (size_t i = 0; i < conv_bias_kerns.size(); i++) {
            auto&& kernel = conv_bias_kerns[i];
            //! Wrap the conv_bias kernel so it can run with convolution params
            auto run = [param = conv_bias_param, kernel,
                        &set_copy_param_compute_address](
                               const NCBKernParam& p,
                               const NCBKernIndex& ncb_index) mutable {
                set_copy_param_compute_address(p, param);
                kernel.kern(param, {ncb_index.thread_id, ncb_index.ndrange_id});
            };
            convolution_kerns.push_back({run, kernel.global_size});
        }
        return convolution_kerns;
    }
    MIDOUT_END();
}
/////////////////////////// ConvolutionBackwardData /////////////////////

/* ===================== naive algo ===================== */

bool ConvolutionBackwardDataImpl::AlgoNaive::usable(
        ConvolutionBackwardDataImpl*, const NCBKernSizeParam& param) const {
    bool ret = false;
#define cb(dt) ret |= (param.diff_type.enumv() == DTypeTrait<dt>::enumv);
    MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb);
#undef cb
#define cb(dt_src, dt_dst)                                            \
    ret |= (param.diff_type.enumv() == DTypeTrait<dt_src>::enumv &&   \
            param.filter_type.enumv() == DTypeTrait<dt_src>::enumv && \
            param.grad_type.enumv() == DTypeTrait<dt_dst>::enumv)
    cb(dtype::Int8, dtype::Int32);
    cb(dtype::Quantized8Asymm, dtype::QuantizedS32);
    cb(dtype::QuantizedS8, dtype::QuantizedS32);
#undef cb
    return ret;
}

size_t ConvolutionBackwardDataImpl::AlgoNaive::get_workspace(
        ConvolutionBackwardDataImpl*, const NCBKernSizeParam&) const {
    return 0;
}

ConvolutionBackwardDataImpl::ncb_kern_t
ConvolutionBackwardDataImpl::AlgoNaive::dispatch_kern(
        ConvolutionBackwardDataImpl*, const NCBKernSizeParam& param) const {
#define cb(_dt)                                                    \
    do {                                                           \
        if (param.filter_type.enumv() == DTypeTrait<_dt>::enumv) { \
            MIDOUT_BEGIN(megdnn_fallback_deconv,                   \
                         midout_iv(DTypeTrait<_dt>::enumv)) {      \
                using ctype = DTypeTrait<_dt>::ctype;              \
                return kern_naive<ctype, ctype, ctype>;            \
            }                                                      \
            MIDOUT_END();                                          \
        }                                                          \
    } while (0);
    MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb);
#undef cb
#define cb(dt_src, dt_dst)                                            \
    do {                                                              \
        if (param.diff_type.enumv() == DTypeTrait<dt_src>::enumv &&   \
            param.filter_type.enumv() == DTypeTrait<dt_src>::enumv && \
            param.grad_type.enumv() == DTypeTrait<dt_dst>::enumv) {   \
            MIDOUT_BEGIN(megdnn_fallback_deconv,                      \
                         midout_iv(DTypeTrait<dt_src>::enumv)) {      \
                return kern_naive<DTypeTrait<dt_src>::ctype,          \
                                  DTypeTrait<dt_src>::ctype,          \
                                  DTypeTrait<dt_dst>::ctype>;         \
            }                                                         \
            MIDOUT_END();                                             \
        }                                                             \
    } while (0)
    cb(dtype::Int8, dtype::Int32);
    cb(dtype::Quantized8Asymm, dtype::QuantizedS32);
    cb(dtype::QuantizedS8, dtype::QuantizedS32);
    megdnn_throw("unsupported data type on ConvolutionBackwardData");
#undef cb
}
/* ===================== direct algo ===================== */

bool ConvolutionBackwardDataImpl::AlgoDirect::usable(
        ConvolutionBackwardDataImpl*, const NCBKernSizeParam& param) const {
    auto&& fm = param.filter_meta;
    return fm.format == param::Convolution::Format::NCHW &&
           param.diff_type.enumv() == DTypeEnum::Float32 &&
           param.filter_type.enumv() == DTypeEnum::Float32 &&
           param.grad_type.enumv() == DTypeEnum::Float32 &&
           fm.spatial_ndim == 2 && fm.group == 1 && fm.dilation[0] == 1 &&
           fm.dilation[1] == 1;
}

size_t ConvolutionBackwardDataImpl::AlgoDirect::get_workspace(
        ConvolutionBackwardDataImpl*, const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(megdnn_fallback_deconv,
                 midout_iv("AlgoDirect::get_workspace"_hash)) {
        auto FH = param.filter_meta.spatial[0],
             FW = param.filter_meta.spatial[1];
        if (param.filter_meta.should_flip) {
            // need transposed filter
            return FH * FW * sizeof(float);
        } else {
            return 0;
        }
    }
    MIDOUT_END();
    return 0;
}

ConvolutionBackwardDataImpl::ncb_kern_t
ConvolutionBackwardDataImpl::AlgoDirect::dispatch_kern(
        ConvolutionBackwardDataImpl*, const NCBKernSizeParam&) const {
    MIDOUT_BEGIN(megdnn_fallback_conv,
                 midout_iv("AlgoDirect::dispatch_kern"_hash)) {
        return kern_direct;
    }
    MIDOUT_END();
}
/* ===================== Matrix mul algo ===================== */

bool ConvolutionBackwardDataImpl::AlgoMatrixMul::usable(
        ConvolutionBackwardDataImpl*, const NCBKernSizeParam& param) const {
    auto&& fm = param.filter_meta;
    return fm.format == param::Convolution::Format::NCHW &&
           fm.spatial_ndim == 2 && fm.group == 1 && fm.dilation[0] == 1 &&
           fm.dilation[1] == 1;
}

size_t ConvolutionBackwardDataImpl::AlgoMatrixMul::get_workspace(
        ConvolutionBackwardDataImpl*, const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(megdnn_fallback_deconv,
                 midout_iv("AlgoMatrixMul::get_workspace"_hash)) {
        return get_bundle(param).total_size_in_bytes();
    }
    MIDOUT_END();
    return 0;
}

ConvolutionBackwardDataImpl::ncb_kern_t
ConvolutionBackwardDataImpl::AlgoMatrixMul::dispatch_kern(
        ConvolutionBackwardDataImpl*, const NCBKernSizeParam& param) const {
#define cb(dt, midout_tag)                                                \
    do {                                                                  \
        if (param.filter_type.enumv() == DTypeTrait<dt>::enumv) {         \
            MIDOUT_BEGIN(megdnn_fallback_deconv, midout_iv(midout_tag)) { \
                using ctype = DTypeTrait<dt>::ctype;                      \
                return kern_matmul<ctype, ctype, ctype>;                  \
            }                                                             \
            MIDOUT_END();                                                 \
        }                                                                 \
    } while (0);
    cb(dtype::Float32, "FLOAT"_hash);
    DNN_INC_FLOAT16(cb(dtype::Float16, "FLOAT16"_hash));
    DNN_INC_FLOAT16(cb(dtype::BFloat16, "BFLOAT16"_hash));
#undef cb
#define cb(dt_src, dt_dst, midout_tag)                                    \
    do {                                                                  \
        if (param.diff_type.enumv() == DTypeTrait<dt_src>::enumv &&       \
            param.filter_type.enumv() == DTypeTrait<dt_src>::enumv &&     \
            param.grad_type.enumv() == DTypeTrait<dt_dst>::enumv) {       \
            MIDOUT_BEGIN(megdnn_fallback_deconv, midout_iv(midout_tag)) { \
                return kern_matmul<DTypeTrait<dt_src>::ctype,             \
                                   DTypeTrait<dt_src>::ctype,             \
                                   DTypeTrait<dt_dst>::ctype>;            \
            }                                                             \
            MIDOUT_END();                                                 \
        }                                                                 \
    } while (0)
    cb(dtype::Int8, dtype::Int32, "INT8x8x32"_hash);
    cb(dtype::QuantizedS8, dtype::QuantizedS32, "QINT8x8x32"_hash);
    cb(dtype::Quantized8Asymm, dtype::QuantizedS32, "QUINT8x8x32"_hash);
    megdnn_throw("unsupported data type on matrix mul");
#undef cb
}

bool ConvolutionBackwardDataImpl::AlgoMatrixMul::is_preferred(
        const NCBKernSizeParam& param) const {
    return is_matrix_mul_preferred(param);
}

// vim: syntax=cpp.doxygen
