  1. /**
  2. * \file src/opr/test/dnn/convolution.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "megbrain/comp_node_env.h"
  12. #include "./legacy_checker.h"
  13. #include "megbrain/opr/dnn/convolution.h"
  14. #include "megbrain/test/autocheck.h"
  15. #include "megbrain/test/helper.h"
  16. #include "megbrain/test/megdnn_helper.h"
  17. #include "megbrain/serialization/serializer.h"
  18. #include "megbrain/opr/basic_arith.h"
  19. #include "megbrain/gopt/inference.h"
  20. #include "megbrain/opr/tensor_manip.h"
  21. #include "megdnn/dtype.h"
  22. #include "megdnn/oprs/base.h"
  23. #include <gmock/gmock.h>
  24. #include <cmath>
  25. #include <memory>
  26. #include <random>
  27. using namespace mgb;
  28. namespace {
  29. using Param = opr::Convolution::Param;
  30. using Param3D = opr::Convolution3D::Param;
  31. using Mode = Param::Mode;
  32. Mode modes_to_check[] = {Mode::CONVOLUTION, Mode::CROSS_CORRELATION};
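// Brute-force reference for ConvolutionBackwardData (deconvolution): each input
// element is scattered into the output through every filter tap, i.e. the
// transpose of the forward convolution; the output spatial size is
// (inp - 1) * stride + (filter - 1) * dilate + 1 - 2 * pad.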
  33. void conv_bwd_data_brute(const std::vector<std::shared_ptr<HostTensorND>>& inps,
  34. std::shared_ptr<HostTensorND>& dest,
  35. const opr::ConvolutionBackwardData::Param& param) {
  36. mgb_assert(param.format == Param::Format::NCHW);
  37. auto &&data = *inps[0], &&filter = *inps[1];
  38. size_t N = data.shape(0), IH = data.shape(2), IW = data.shape(3);
  39. size_t GROUP, ICPG, OCPG, FH, FW;
  40. if (param.sparse == Param::Sparse::DENSE) {
  41. GROUP = 1, ICPG = filter.shape(0), OCPG = filter.shape(1),
  42. FH = filter.shape(2), FW = filter.shape(3);
  43. } else {
  44. mgb_assert(param.sparse == Param::Sparse::GROUP);
  45. GROUP = filter.shape(0), ICPG = filter.shape(1), OCPG = filter.shape(2),
  46. FH = filter.shape(3), FW = filter.shape(4);
  47. }
  48. auto get_shp = [](size_t inp, size_t filter, size_t stride, size_t pad,
  49. size_t dilate) {
  50. return (inp - 1) * stride + (filter - 1) * dilate + 1 - pad * 2;
  51. };
  52. size_t OH = get_shp(IH, FH, param.stride_h, param.pad_h, param.dilate_h),
  53. OW = get_shp(IW, FW, param.stride_w, param.pad_w, param.dilate_w);
  54. dest = std::make_shared<HostTensorND>(CompNode::load("xpu0"),
  55. TensorShape{N, OCPG * GROUP, OH, OW});
  56. auto&& out = *dest;
  57. auto fptr = filter.ptr<float>(), dptr = data.ptr<float>(),
  58. optr = out.ptr<float>();
  59. memset(optr, 0, sizeof(float) * out.shape().total_nr_elems());
  60. auto ol = out.layout(), fl = filter.layout();
  61. #define FOR2(a, A, b, B) \
  62. for (size_t a = 0; a < A; ++a) \
  63. for (size_t b = 0; b < B; ++b)
  64. #define FOR3(a, A, b, B, c, C) \
  65. FOR2(a, A, b, B) \
  66. for (size_t c = 0; c < C; ++c)
  67. FOR3(n, N, group, GROUP, icg, ICPG)
  68. FOR2(ih, IH, iw, IW) {
  69. float scale = *(dptr++);
  70. FOR3(ocg, OCPG, fh, FH, fw, FW) {
  71. auto oc_tot = group * OCPG + ocg;
  72. int oh = int(ih * param.stride_h + fh * param.dilate_h) -
  73. int(param.pad_h),
  74. ow = int(iw * param.stride_w + fw * param.dilate_w) -
  75. int(param.pad_w);
  76. if (oh >= 0 && ow >= 0 && oh < static_cast<int>(OH) &&
  77. ow < static_cast<int>(OW)) {
  78. auto out_off = n * ol.stride[0] + oc_tot * ol.stride[1] +
  79. oh * ol.stride[2] + ow;
  80. size_t flt_off = 0;
  81. if (param.sparse == Param::Convolution::Sparse::DENSE) {
  82. flt_off = icg * fl.stride[0] +
  83. ocg * fl.stride[1] + fh * fl.stride[2] + fw;
  84. } else {
  85. flt_off = group * fl.stride[0] + icg * fl.stride[1] +
  86. ocg * fl.stride[2] + fh * fl.stride[3] + fw;
  87. }
  88. optr[out_off] += scale * fptr[flt_off];
  89. }
  90. }
  91. }
  92. #undef FOR3
  93. #undef FOR2
  94. }
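// Brute-force reference for ConvolutionBackwardFilter: accumulates src * diff
// products per filter tap; valid() relies on size_t wrap-around so that
// positions falling into the padding region contribute zero.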
  95. void conv_bwd_flt_brute(const std::vector<std::shared_ptr<HostTensorND>>& inps,
  96. std::shared_ptr<HostTensorND>& out,
  97. const opr::ConvolutionBackwardFilter::Param& param) {
  98. auto &&src = *inps[0], &&diff = *inps[1], &&filter = *inps[2];
  99. size_t N = src.shape(0), IH = src.shape(2), IW = src.shape(3),
  100. OC = filter.shape(0), IC = filter.shape(1), FH = filter.shape(2),
  101. FW = filter.shape(3), OH = diff.shape(2), OW = diff.shape(3);
  102. out = std::make_shared<HostTensorND>(CompNode::load("xpu0"),
  103. TensorShape{OC, IC, FH, FW});
  104. auto&& grad = *out;
  105. auto sptr = src.ptr<float>(), dptr = diff.ptr<float>(),
  106. gptr = grad.ptr<float>();
  107. memset(gptr, 0, sizeof(float) * grad.shape().total_nr_elems());
  108. auto valid = [&](size_t ih, size_t iw) { return ih < IH && iw < IW; };
  109. for (size_t n = 0; n < N; ++n)
  110. for (size_t oc = 0; oc < OC; ++oc)
  111. for (size_t ic = 0; ic < IC; ++ic) {
  112. for (size_t oh = 0; oh < OH; ++oh)
  113. for (size_t ow = 0; ow < OW; ++ow) {
  114. for (size_t fh = 0; fh < FH; ++fh)
  115. for (size_t fw = 0; fw < FW; ++fw) {
  116. size_t ih = oh * param.stride_h + fh -
  117. param.pad_h,
  118. iw = ow * param.stride_w + fw -
  119. param.pad_w;
  120. auto src_data =
  121. valid(ih, iw)
  122. ? sptr[(n * IC + ic) * IH * IW +
  123. ih * IW + iw]
  124. : 0;
  125. gptr[(oc * IC + ic) * FH * FW + fh * FW + fw] +=
  126. dptr[(n * OC + oc) * OH * OW + oh * OW +
  127. ow] *
  128. src_data;
  129. }
  130. }
  131. }
  132. }
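// Brute-force reference for LocalShare: the output plane is divided into
// spatial_groups_h x spatial_groups_w regions and each region uses its own
// filter slice (cross-correlation only, dilation must be 1).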
  133. void local_share_brute(const std::vector<std::shared_ptr<HostTensorND>>& inps,
  134. std::shared_ptr<HostTensorND>& out,
  135. const opr::LocalShare::Param& param) {
  136. auto in = inps[0], filter = inps[1];
  137. mgb_assert(in->shape().ndim == 4);
  138. mgb_assert(filter->shape().ndim == 6);
  139. int batch_size = in->shape()[0], ci = in->shape()[1], hi = in->shape()[2],
  140. wi = in->shape()[3];
  141. int fh = filter->shape()[3], fw = filter->shape()[4];
  142. int ph = param.pad_h, pw = param.pad_w;
  143. int sh = param.stride_h, sw = param.stride_w;
  144. int dh = param.dilate_h, dw = param.dilate_w;
  145. int sgh = filter->shape()[0], sgw = filter->shape()[1];
  146. mgb_assert(dh == 1 && dw == 1);
  147. mgb_assert(static_cast<uint32_t>(sgh) == param.spatial_groups_h &&
  148. static_cast<uint32_t>(sgw) == param.spatial_groups_w);
  149. int ho = (hi + 2 * ph - fh) / sh + 1;
  150. int wo = (wi + 2 * pw - fw) / sw + 1;
  151. mgb_assert(ho % sgh == 0 && wo % sgw == 0);
  152. int grp_ho = ho / sgh, grp_wo = wo / sgw;
  153. int co = filter->shape()[5];
  154. size_t u_batch = batch_size, u_co = co, u_ho = ho, u_wo = wo;
  155. out = std::make_shared<HostTensorND>(
  156. CompNode::load("xpu0"), TensorShape{u_batch, u_co, u_ho, u_wo});
  157. mgb_assert(param.mode == Param::Mode::CROSS_CORRELATION);
  158. for (int n = 0; n < batch_size; ++n) {
  159. for (int oc = 0; oc < co; ++oc) {
  160. for (int oh = 0; oh < ho; ++oh) {
  161. for (int ow = 0; ow < wo; ++ow) {
  162. size_t u_n = n, u_oc = oc, u_oh = oh, u_ow = ow;
  163. float& dval = out->ptr<float>({u_n, u_oc, u_oh, u_ow})[0];
  164. dval = 0;
  165. int grp_oh_idx = oh / grp_ho;
  166. int grp_ow_idx = ow / grp_wo;
  167. for (int ic = 0; ic < ci; ++ic) {
  168. for (int kh = 0; kh < fh; ++kh) {
  169. for (int kw = 0; kw < fw; ++kw) {
  170. int ih = oh * sh - ph + kh;
  171. int iw = ow * sw - pw + kw;
  172. float sval = 0.f;
  173. float fval = 0.f;
  174. if (ih >= 0 && ih < hi && iw >= 0 && iw < wi) {
  175. sval = in->ptr<float>(
  176. {static_cast<size_t>(n),
  177. static_cast<size_t>(ic),
  178. static_cast<size_t>(ih),
  179. static_cast<size_t>(iw)})[0];
  180. }
  181. fval = filter->ptr<float>(
  182. {static_cast<size_t>(grp_oh_idx),
  183. static_cast<size_t>(grp_ow_idx),
  184. static_cast<size_t>(ic),
  185. static_cast<size_t>(kh),
  186. static_cast<size_t>(kw),
  187. static_cast<size_t>(oc)})[0];
  188. dval += fval * sval;
  189. }
  190. }
  191. }
  192. }
  193. }
  194. }
  195. }
  196. }
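// Naive dense forward convolution used as the reference throughout this file.
// CONVOLUTION mode flips the kernel (fh - pfh - 1); CROSS_CORRELATION applies
// it directly. Output size per spatial dim: (i + 2*p - ((f - 1)*d + 1)) / s + 1.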
  197. void convolution_brute(const std::vector<std::shared_ptr<HostTensorND>> &in_tensor,
  198. std::shared_ptr<HostTensorND> &out_tensor,
  199. const opr::Convolution::Param &param)
  200. {
  201. mgb_assert(in_tensor.size() == 2);
  202. auto in = in_tensor[0], filter = in_tensor[1];
  203. mgb_assert(in->shape().ndim == 4);
  204. mgb_assert(filter->shape().ndim == 4);
  205. int batch_size = in->shape().shape[0];
  206. int ic = in->shape().shape[1];
  207. int ih = in->shape().shape[2];
  208. int iw = in->shape().shape[3];
  209. int fh = filter->shape().shape[2];
  210. int fw = filter->shape().shape[3];
  211. int ph = param.pad_h;
  212. int pw = param.pad_w;
  213. int sh = param.stride_h;
  214. int sw = param.stride_w;
  215. int dh = param.dilate_h;
  216. int dw = param.dilate_w;
  217. mgb_assert(ih + 2*ph >= (fh - 1) * dh + 1);
  218. mgb_assert(iw + 2*pw >= (fw - 1) * dw + 1);
  219. int oh = (ih + 2*ph - ((fh - 1) * dh + 1)) / sh + 1;
  220. int ow = (iw + 2*pw - ((fw - 1) * dw + 1)) / sw + 1;
  221. mgb_assert(static_cast<size_t>(ic) == filter->shape().shape[1]);
  222. int oc = filter->shape().shape[0];
  223. out_tensor = std::make_shared<HostTensorND>(CompNode::load("xpu0"),
  224. TensorShape{
  225. static_cast<size_t>(batch_size),
  226. static_cast<size_t>(oc),
  227. static_cast<size_t>(oh),
  228. static_cast<size_t>(ow)});
  229. int pn, poc, poh, pow, pih, piw, pic, pfh, pfw;
  230. for (pn = 0; pn < batch_size; ++pn)
  231. for (poc = 0; poc < oc; ++poc)
  232. for (poh = 0, pih = -ph; poh < oh; ++poh, pih += sh)
  233. for (pow = 0, piw = -pw; pow < ow; ++pow, piw += sw)
  234. {
  235. float &target = out_tensor->ptr<float>({
  236. static_cast<size_t>(pn),
  237. static_cast<size_t>(poc),
  238. static_cast<size_t>(poh),
  239. static_cast<size_t>(pow)})[0];
  240. target = 0;
  241. for (pic = 0; pic < ic; ++pic)
  242. for (pfh = 0; pfh < fh; ++pfh)
  243. for (pfw = 0; pfw < fw; ++pfw)
  244. {
  245. int prih, priw;
  246. float img_data, filter_data;
  247. if (param.mode == Param::Mode::CONVOLUTION) {
  248. prih = pih + (fh - pfh - 1) * dh;
  249. priw = piw + (fw - pfw - 1) * dw;
  250. } else {
  251. mgb_assert(param.mode == Param::Mode::CROSS_CORRELATION);
  252. prih = pih + pfh * dh;
  253. priw = piw + pfw * dw;
  254. }
  255. if (prih >= 0 && prih < ih &&
  256. priw >= 0 && priw < iw) {
  257. img_data = in_tensor[0]->ptr<float>({
  258. static_cast<size_t>(pn),
  259. static_cast<size_t>(pic),
  260. static_cast<size_t>(prih),
  261. static_cast<size_t>(priw)})[0];
  262. } else {
  263. img_data = 0;
  264. }
  265. filter_data = filter->ptr<float>({
  266. static_cast<size_t>(poc),
  267. static_cast<size_t>(pic),
  268. static_cast<size_t>(pfh),
  269. static_cast<size_t>(pfw)})[0];
  270. target += img_data * filter_data;
  271. }
  272. }
  273. }
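// Helpers that copy the convolution-related fields of a ConvBias (and, with
// CUDA, BatchConvBias) param into a plain Convolution param, so
// convolution_brute can serve as the reference for the ConvBias tests below.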
  274. opr::Convolution::Param convert_to_conv_param(
  275. const opr::ConvBiasForward::Param& param) {
  276. return opr::Convolution::Param{
  277. param.mode, param.pad_h, param.pad_w,
  278. param.stride_h, param.stride_w, param.dilate_h,
  279. param.dilate_w, param.sparse, param.format};
  280. };
  281. #if MGB_CUDA
  282. opr::Convolution::Param convert_to_conv_param(
  283. const opr::BatchConvBiasForward::Param& param) {
  284. return opr::Convolution::Param{
  285. param.mode, param.pad_h, param.pad_w,
  286. param.stride_h, param.stride_w, param.dilate_h,
  287. param.dilate_w, param.sparse, param.format};
  288. };
  289. #endif
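// Checks opr::Convolution against convolution_brute for both CONVOLUTION and
// CROSS_CORRELATION modes (ForwardChecker / BackwardChecker are the legacy
// checkers; newer tests below use AutoOprChecker).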
  290. TEST(TestOprDNN, ConvolutionForward) {
  291. uint32_t ih = 10, ic = 16, oc = 32, ph = 0, sh = 1, fh = 2;
  292. for (auto mode: modes_to_check) {
  293. uint32_t iw = ih + 1, fw = fh + 1, pw = ph + 1, sw = sh + 1;
  294. Param param{mode, ph, pw, sh, sw};
  295. size_t batch_size = 32;
  296. // !!! DEPRECATED. use AutoOprChecker instead.
  297. opr::test::ForwardChecker<opr::Convolution, 2> forward_checker({
  298. {batch_size, ic, ih, iw},
  299. {oc, ic, fh, fw}},
  300. convolution_brute, param);
  301. forward_checker.run();
  302. }
  303. }
  304. TEST(TestOprDNN, ConvolutionBackward) {
  305. uint32_t ih = 10, ic = 16, oc = 32, ph = 0, sh = 1, fh = 2;
  306. for (auto mode: modes_to_check) {
  307. uint32_t iw = 11, fw = 4, pw = 1, sw = 3;
  308. Param param{mode, ph, pw, sh, sw};
  309. size_t batch_size = 32;
  310. // !!! DEPRECATED. use AutoOprChecker instead.
  311. opr::test::BackwardChecker<opr::Convolution, 2> backward_checker({
  312. {batch_size, ic, ih, iw},
  313. {oc, ic, fh, fw}}, param, 1e-2, 1);
  314. backward_checker.run();
  315. }
  316. }
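// Compiles and runs a quantized NHWC ConvBias under each execution-policy
// strategy; the PROFILE-based strategies are only available when
// MGB_ENABLE_FASTRUN is enabled.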
  317. TEST(TestOprDNN, ConvBiasExePolicy) {
  318. using Param = opr::ConvBias::Param;
  319. Param param;
  320. using Policy = opr::ConvBias::ExecutionPolicy;
  321. using S = Policy::Strategy;
  322. auto cn = CompNode::load("cpux");
  323. #if MGB_ENABLE_FASTRUN
  324. for (auto strategy :
  325. SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
  326. S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTIMIZED}) {
  327. #else
  328. for (auto strategy :
  329. SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
  330. #endif
  331. auto graph = ComputingGraph::make();
  332. HostTensorGenerator<> gen;
  333. auto mkvar = [&](const char* name, const TensorShape& shp,
  334. const DType& dtype) {
  335. return opr::TypeCvt::make(
  336. opr::Host2DeviceCopy::make(*graph, gen(shp), cn).rename(name),
  337. dtype);
  338. };
  339. auto x = mkvar("x", {20, 50, 50, 16}, dtype::QuantizedS8(2.5f));
  340. auto w = mkvar("w", {24, 3, 3, 16}, dtype::QuantizedS8(2.5f));
  341. auto bias = mkvar("bias", {1, 1, 1, 24}, dtype::QuantizedS32(6.25f));
  342. param.nonlineMode = Param::NonlineMode::RELU;
  343. param.format = Param::Format::NHWC;
  344. Policy policy;
  345. policy.strategy = strategy;
  346. auto conv_bias = opr::ConvBias::make(
  347. x, w, bias, param, policy,
  348. OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  349. HostTensorND host_y;
  350. auto func = graph->compile({make_callback_copy(conv_bias, host_y)});
  351. func->execute();
  352. }
  353. }
  354. TEST(TestOprDNN, ConvBiasExePolicy_Quantized8Asym) {
  355. using Param = opr::ConvBias::Param;
  356. Param param;
  357. using Policy = opr::ConvBias::ExecutionPolicy;
  358. using S = Policy::Strategy;
  359. auto cn = CompNode::load("cpux");
  360. for (auto strategy :
  361. SmallVector<S>{S::PROFILE, S::PROFILE | S::REPRODUCIBLE}) {
  362. auto graph = ComputingGraph::make();
  363. HostTensorGenerator<> gen;
  364. auto mkvar = [&](const char* name, const TensorShape& shp,
  365. const DType& dtype) {
  366. return opr::TypeCvt::make(
  367. opr::Host2DeviceCopy::make(*graph, gen(shp), cn).rename(name),
  368. dtype);
  369. };
  370. auto x = mkvar("x", {20, 50, 50, 16}, dtype::Quantized8Asymm(2.5f, static_cast<uint8_t>(0)));
  371. auto w = mkvar("w", {24, 3, 3, 16}, dtype::Quantized8Asymm(2.5f, static_cast<uint8_t>(0)));
  372. auto bias = mkvar("bias", {1, 1, 1, 24}, dtype::QuantizedS32(6.25f));
  373. param.nonlineMode = Param::NonlineMode::RELU;
  374. param.format = Param::Format::NHWC;
  375. Policy policy;
  376. policy.strategy = strategy;
  377. auto conv_bias = opr::ConvBias::make(
  378. x, w, bias, param, policy,
  379. OperatorNodeConfig{dtype::Quantized8Asymm(2.5f, static_cast<uint8_t>(0))});
  380. HostTensorND host_y;
  381. auto func = graph->compile({make_callback_copy(conv_bias, host_y)});
  382. func->execute();
  383. }
  384. }
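// Strategy sweep for plain Convolution, verified with AutoOprChecker against
// convolution_brute. PersistentCacheHook counts algorithm-cache lookups: the
// pure HEURISTIC strategy must not query the cache, profiling strategies must.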
  385. TEST(TestOprDNN, ConvolutionExePolicy) {
  386. Param param{Mode::CONVOLUTION};
  387. using Policy = opr::Convolution::ExecutionPolicy;
  388. using S = Policy::Strategy;
  389. int nr_get = 0;
  390. auto on_get = [&nr_get](const std::string&, const void*, size_t,
  391. const void*, size_t) { ++nr_get; };
  392. PersistentCacheHook cache_hook{on_get};
  393. #if MGB_ENABLE_FASTRUN
  394. for (auto strategy :
  395. SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
  396. S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTIMIZED}) {
  397. #else
  398. for (auto strategy :
  399. SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
  400. #endif
  401. using Checker = AutoOprChecker<2, 1>;
  402. auto make_graph = [&](const Checker::SymInpArray& inputs)
  403. -> Checker::SymOutArray {
  404. Policy policy;
  405. policy.strategy = strategy;
  406. auto out =
  407. opr::Convolution::make(inputs[0], inputs[1], param, policy);
  408. return {out};
  409. };
  410. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  411. std::shared_ptr<HostTensorND> sh_out;
  412. convolution_brute({inp.begin(), inp.end()}, sh_out, param);
  413. dest[0] = *sh_out;
  414. };
  415. Checker::RunOptions opt;
  416. opt.numdiff_eps = 1;
  417. nr_get = 0;
  418. Checker(make_graph, fwd)
  419. .run({TensorShape{3, 2, 10, 6}, {4, 2, 3, 2}}, opt)
  420. .run({TensorShape{6, 3, 8, 13}, {2, 3, 2, 13}}, opt)
  421. .run({TensorShape{1, 1, 10, 10}, {2, 1, 3, 3}}, opt);
  422. if (strategy == S::HEURISTIC) {
  423. ASSERT_EQ(0, nr_get);
  424. } else {
  425. ASSERT_LT(0, nr_get);
  426. }
  427. }
  428. }
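// BFloat16 deconvolution (backward data) strategy sweep on GPU: inputs are
// generated in bf16, the reference runs conv_bwd_data_brute in float32, and the
// result is converted back to bf16; gradient checking is disabled.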
  429. TEST(TestOprDNN, ConvolutionBackwardDataBfloat16ExePolicy) {
  430. REQUIRE_GPU(1);
  431. Param param{Mode::CROSS_CORRELATION, 1, 1, 1, 1};
  432. param.compute_mode = Param::ComputeMode::FLOAT32;
  433. using Policy = opr::Convolution::ExecutionPolicy;
  434. using S = Policy::Strategy;
  435. auto gen_bfp16 = [](HostTensorND& dest) {
  436. RNGxorshf rng{next_rand_seed()};
  437. auto rand_real = [&rng]() {
  438. std::uniform_real_distribution<float> dist(-1, 1);
  439. return dist(rng);
  440. };
  441. auto ptr = dest.ptr<dt_bfloat16>();
  442. size_t elems = dest.shape().total_nr_elems();
  443. for (size_t i = 0; i < elems; i++) {
  444. ptr[i] = dt_bfloat16(rand_real());
  445. }
  446. };
  447. auto f32_to_bf16 = [](const std::shared_ptr<HostTensorND>& src)
  448. -> std::shared_ptr<HostTensorND> {
  449. auto ret = std::make_shared<HostTensorND>(
  450. src->comp_node(), src->shape(), dtype::BFloat16{});
  451. for (size_t i = 0; i < src->layout().total_nr_elems(); i++) {
  452. ret->ptr<dt_bfloat16>()[i] = src->ptr<dt_float32>()[i];
  453. }
  454. return ret;
  455. };
  456. auto bf16_to_f32 = [](const std::shared_ptr<HostTensorND>& src)
  457. -> std::shared_ptr<HostTensorND> {
  458. auto ret = std::make_shared<HostTensorND>(
  459. src->comp_node(), src->shape(), dtype::Float32{});
  460. for (size_t i = 0; i < src->layout().total_nr_elems(); i++) {
  461. ret->ptr<dt_float32>()[i] = src->ptr<dt_bfloat16>()[i];
  462. }
  463. return ret;
  464. };
  465. int nr_get = 0;
  466. auto on_get = [&nr_get](const std::string&, const void*, size_t,
  467. const void*, size_t) { ++nr_get; };
  468. PersistentCacheHook cache_hook{on_get};
  469. #if MGB_ENABLE_FASTRUN
  470. for (auto strategy :
  471. {S::PROFILE, S::HEURISTIC, S(S::PROFILE | S::REPRODUCIBLE),
  472. S(S::PROFILE | S::HEURISTIC)}) {
  473. #else
  474. for (auto strategy : {S::HEURISTIC, S(S::PROFILE | S::HEURISTIC)}) {
  475. #endif
  476. using Checker = AutoOprChecker<2, 1>;
  477. auto make_graph = [&](const Checker::SymInpArray& inputs)
  478. -> Checker::SymOutArray {
  479. Policy policy;
  480. policy.strategy = strategy;
  481. return {opr::ConvolutionBackwardData::make_deconv(
  482. inputs[0], inputs[1], param, policy)};
  483. };
  484. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  485. std::shared_ptr<HostTensorND> out;
  486. conv_bwd_data_brute(
  487. {bf16_to_f32(inp[0]), bf16_to_f32(inp[1])}, out,
  488. param);
  489. dest[0] = *f32_to_bf16(out);
  490. };
  491. Checker::RunOptions opt;
  492. opt.outputs_max_err = 1e-3;
  493. nr_get = 0;
  494. Checker(make_graph, fwd)
  495. .disable_grad_check()
  496. .set_input_dtype(0, dtype::BFloat16{})
  497. .set_input_dtype(1, dtype::BFloat16{})
  498. .set_input_generator(0, gen_bfp16)
  499. .set_input_generator(1, gen_bfp16)
  500. .run({TensorShape{3, 4, 10, 6}, {4, 2, 3, 3}}, opt)
  501. .run({TensorShape{2, 2, 4, 3}, {2, 2, 3, 3}}, opt)
  502. .run({TensorShape{1, 3, 10, 6}, {3, 2, 3, 3}}, opt);
  503. if (strategy == S::HEURISTIC) {
  504. ASSERT_EQ(0, nr_get);
  505. } else {
  506. ASSERT_LT(0, nr_get);
  507. }
  508. }
  509. }
  510. TEST(TestOprDNN, Deconvolution) {
  511. // dilated grouped deconv
  512. using Checker = AutoOprChecker<2, 1>;
  513. Param param{Mode::CROSS_CORRELATION, 0, 1, 1, 2};
  514. param.dilate_h = 2;
  515. param.sparse = Param::Sparse::GROUP;
  516. auto make_graph = [&](
  517. const Checker::SymInpArray &inputs) -> Checker::SymOutArray {
  518. return {opr::ConvolutionBackwardData::make_deconv(
  519. inputs[0], inputs[1], param)};
  520. };
  521. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  522. std::shared_ptr<HostTensorND> out;
  523. conv_bwd_data_brute({inp[0], inp[1]}, out, param);
  524. dest[0] = *out;
  525. };
  526. Checker::RunOptions opt;
  527. opt.numdiff_eps = 1;
  528. Checker(make_graph, fwd).
  529. run({TensorShape{2, 4, 6, 8}, {1, 4, 5, 3, 2}}, opt).
  530. run({TensorShape{3, 2, 1, 1}, {2, 1, 1, 4, 3}}, opt).
  531. run({TensorShape{4, 6, 7, 2}, {2, 3, 4, 8, 13}}, opt);
  532. }
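// Quantized int8 deconvolution in NCHW4 format; requires a CUDA device with
// compute capability >= 6.1.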
  533. TEST(TestOprDNN, DeconvolutionExePolicy_QuantizedS8) {
  534. REQUIRE_GPU(1);
  535. auto cn = CompNode::load("gpu0");
  536. cn.activate();
  537. REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);
  538. Param param;
  539. using Policy = opr::ConvolutionBackwardData::ExecutionPolicy;
  540. using S = Policy::Strategy;
  541. #if MGB_ENABLE_FASTRUN
  542. for (auto strategy :
  543. {S::PROFILE, S::HEURISTIC, S(S::PROFILE | S::REPRODUCIBLE),
  544. S(S::PROFILE | S::HEURISTIC)}) {
  545. #else
  546. for (auto strategy : {S::HEURISTIC, S(S::PROFILE | S::HEURISTIC)}) {
  547. #endif
  548. auto graph = ComputingGraph::make();
  549. HostTensorGenerator<> gen;
  550. auto mkvar = [&](const char* name, const TensorShape& shp,
  551. const DType& dtype) {
  552. return opr::TypeCvt::make(
  553. opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name),
  554. dtype);
  555. };
  556. auto x = mkvar("x", {16, 4, 50, 50, 4}, dtype::QuantizedS8(1.2f));
  557. auto w = mkvar("w", {16, 4, 4, 4, 4}, dtype::QuantizedS8(1.3f));
  558. param.format = Param::Format::NCHW4;
  559. param.pad_h = param.pad_w = 2;
  560. param.stride_h = param.stride_w = 2;
  561. Policy policy;
  562. policy.strategy = strategy;
  563. auto deconv = opr::ConvolutionBackwardData::make_deconv(
  564. x, w, param, policy,
  565. OperatorNodeConfig{dtype::QuantizedS8(1.2f)});
  566. HostTensorND host_y;
  567. auto func = graph->compile({make_callback_copy(deconv, host_y)});
  568. func->execute();
  569. }
  570. }
  571. TEST(TestOprDNN, ConvolutionBackwardFilter) {
  572. using Checker = AutoOprChecker<3, 1>;
  573. constexpr size_t PH = 0, PW = 1, SH = 1, SW = 2;
  574. auto make_graph = [&](
  575. const Checker::SymInpArray &inputs) -> Checker::SymOutArray {
  576. Param param{Mode::CROSS_CORRELATION, PH, PW, SH, SW};
  577. return {opr::ConvolutionBackwardFilter::make(
  578. inputs[0], inputs[1], inputs[2], param)};
  579. };
  580. auto fwd = [&](Checker::NumOutArray &dest, Checker::NumInpArray inp) {
  581. std::shared_ptr<HostTensorND> out;
  582. conv_bwd_flt_brute({inp[0], inp[1], inp[2]}, out,
  583. Param{Mode::CROSS_CORRELATION, PH, PW, SH, SW});
  584. dest[0] = *out;
  585. };
  586. #define get_shp(N, P, S, F) ((N + 2 * P - F) / S + 1)
  587. #define inp_tensor(N, IC, OC, IH, IW, FH, FW) \
  588. { TensorShape{N, IC, IH, IW}, \
  589. {N, OC, get_shp(IH, PH, SH, FH), get_shp(IW, PW, SW, FW)}, \
  590. {OC, IC, FH, FW} }
  591. Checker::RunOptions opt;
  592. opt.numdiff_eps = 1;
  593. Checker(make_graph, fwd).
  594. run(inp_tensor(2, 3, 4, 9, 8, 4, 3), opt).
  595. run(inp_tensor(1, 5, 3, 7, 9, 3, 4), opt).
  596. run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt);
  597. #undef inp_tensor
  598. #undef get_shp
  599. }
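// Dilated convolution: the reference output is computed with the naive megdnn
// Convolution operator instead of convolution_brute.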
  600. TEST(TestOprDNN, DilatedConvolution) {
  601. using Checker = AutoOprChecker<2, 1>;
  602. opr::ConvolutionForward::Param param;
  603. param.pad_h = 5;
  604. param.pad_w = 2;
  605. param.stride_w = 2;
  606. param.dilate_h = 2;
  607. auto make_graph = [&](const Checker::SymInpArray &inputs) ->
  608. Checker::SymOutArray {
  609. return {opr::Convolution::make(inputs[0], inputs[1], param)};
  610. };
  611. auto fwd = [&](Checker::NumOutArray &dest, Checker::NumInpArray inp) {
  612. auto opr = megdnn_naive_handle()->create_operator<
  613. megdnn::Convolution>();
  614. opr->param() = param;
  615. TensorLayout dest_layout;
  616. opr->deduce_layout(inp[0]->layout(), inp[1]->layout(), dest_layout);
  617. std::vector<dt_byte> workspace(opr->get_workspace_in_bytes(
  618. inp[0]->layout(), inp[1]->layout(), dest_layout, nullptr));
  619. dest[0].dtype(dtype::Float32()).
  620. comp_node(inp[0]->comp_node()).resize(dest_layout);
  621. opr->exec(inp[0]->as_megdnn(), inp[1]->as_megdnn(), dest[0].as_megdnn(),
  622. nullptr, {workspace.data(), workspace.size()});
  623. };
  624. Checker::RunOptions option;
  625. option.numdiff_eps = 0.1;
  626. Checker(make_graph, fwd).
  627. run({TensorShape{2, 3, 8, 7}, TensorShape{4, 3, 2, 2}}, option).
  628. run({TensorShape{2, 3, 8, 7}, TensorShape{4, 3, 3, 2}}, option).
  629. run({TensorShape{2, 3, 8, 9}, TensorShape{4, 3, 3, 2}}, option);
  630. }
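// Group convolution reference: slice the input and filter per group, run a
// dense convolution graph on each slice, and write the per-group results into
// the corresponding output channel section.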
  631. TEST(TestOprDNN, GroupConv) {
  632. using Checker = AutoOprChecker<2, 1>;
  633. opr::Convolution::Param param;
  634. param.pad_h = 1;
  635. param.pad_w = 2;
  636. param.stride_h = 2;
  637. auto make_graph = [&](
  638. const Checker::SymInpArray &inputs) -> Checker::SymOutArray {
  639. auto p1 = param;
  640. p1.sparse = opr::Convolution::Param::Sparse::GROUP;
  641. return {opr::Convolution::make(inputs[0], inputs[1], p1)};
  642. };
  643. auto cn = CompNode::load("xpux");
  644. auto inp0 = std::make_shared<HostTensorND>(cn, dtype::Float32()),
  645. inp1 = std::make_shared<HostTensorND>(cn, dtype::Float32());
  646. HostTensorND out_raw;
  647. auto graph_raw = ComputingGraph::make();
  648. auto func_raw = graph_raw->compile({
  649. make_callback_copy(
  650. opr::Convolution::make(
  651. opr::Host2DeviceCopy::make(*graph_raw, inp0),
  652. opr::Host2DeviceCopy::make(*graph_raw, inp1),
  653. param),
  654. out_raw)});
  655. auto fwd = [&](Checker::NumOutArray &dest, Checker::NumInpArray inp) {
  656. auto &&out = dest[0];
  657. auto sl = inp[0]->layout(),
  658. fl = inp[1]->layout().remove_axis(0);
  659. TensorLayout ol;
  660. auto group = inp[1]->layout()[0];
  661. sl.shape[1] /= group;
  662. for (size_t i = 0; i < group; ++ i) {
  663. inp0->copy_from(inp[0]->sub(SubTensorSpec::make_from_offset_elem(
  664. sl, i * sl[1] * sl[2] * sl[3])));
  665. inp1->copy_from(inp[1]->sub(SubTensorSpec::make_from_offset_elem(
  666. fl, i * fl.total_nr_elems())));
  667. func_raw->execute();
  668. if (!i) {
  669. auto oshp = out_raw.shape();
  670. oshp[1] *= group;
  671. out.resize(oshp);
  672. ol = out.layout();
  673. ol[1] /= group;
  674. }
  675. out.sub(SubTensorSpec::make_from_offset_elem(
  676. ol, i * ol[1] * ol[2] * ol[3])).copy_from_fixlayout(
  677. out_raw);
  678. }
  679. };
  680. Checker::RunOptions opt;
  681. opt.numdiff_eps = 1;
  682. opt.outputs_max_err = 5e-5;
  683. Checker checker{make_graph, fwd};
  684. auto run = [&](const TensorShape &ishp,
  685. size_t fh, size_t fw, size_t oc, size_t group) {
  686. size_t ic = ishp[1];
  687. TensorShape flt{group, oc/group, ic/group, fh, fw};
  688. checker.run({ishp, flt}, opt);
  689. };
  690. run({1, 2, 1, 1}, 1, 1, 2, 2);
  691. run({3, 9, 5, 4}, 1, 2, 6, 3);
  692. run({3, 6, 8, 9}, 3, 1, 4, 2);
  693. run({2, 5, 3, 6}, 2, 3, 5, 1);
  694. run({2, 6, 3, 6}, 2, 3, 6, 6);
  695. }
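// MaskConvolution reference: run the dense convolution, then zero every output
// position whose int8 mask entry is 0 (the mask has shape {OH, OW}).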
  696. TEST(TestOprDNN, MaskConvolution) {
  697. using Checker = AutoOprChecker<3, 1>;
  698. opr::Convolution::Param param;
  699. auto make_graph =
  700. [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  701. return {opr::MaskConvolution::make(inputs[0], inputs[1], inputs[2],
  702. param)};
  703. };
  704. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  705. std::shared_ptr<HostTensorND> sh_out;
  706. convolution_brute({inp[0], inp[1]}, sh_out, param);
  707. dest[0] = *sh_out;
  708. size_t N = dest[0].shape()[0];
  709. size_t OC = dest[0].shape()[1];
  710. size_t OH = dest[0].shape()[2];
  711. size_t OW = dest[0].shape()[3];
  712. auto mask_ptr = inp[2]->ptr<int8_t>();
  713. auto dest_ptr = dest[0].ptr<float>();
  714. for (size_t i = 0; i < N * OC; ++i) {
  715. for (size_t mask_idx = 0; mask_idx < OH * OW; ++mask_idx) {
  716. if (mask_ptr[mask_idx] == 0) {
  717. dest_ptr[i * OH * OW + mask_idx] = 0;
  718. }
  719. }
  720. }
  721. };
  722. auto gen_mask = [](HostTensorND& dest) {
  723. HostTensorGenerator<dtype::Int8, RandomDistribution::UNIFORM>
  724. mask_generator{0, 1};
  725. dest = *mask_generator(dest.shape(), dest.comp_node());
  726. };
  727. auto run_with_param = [&](size_t SH = 1, size_t SW = 1, size_t PH = 0,
  728. size_t PW = 0) {
  729. param.pad_h = PH;
  730. param.pad_w = PW;
  731. param.stride_h = SH;
  732. param.stride_w = SW;
  733. Checker checker{make_graph, fwd};
  734. Checker::RunOptions opt;
  735. checker.set_output_allow_grad(0, false);
  736. checker.set_input_dtype(2, dtype::Int8());
  737. checker.set_input_generator(2, gen_mask);
  738. auto run = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW,
  739. size_t FH, size_t FW) {
  740. size_t OH = (IH + 2 * PH - FH) / SH + 1;
  741. size_t OW = (IW + 2 * PW - FW) / SW + 1;
  742. checker.run(
  743. {TensorShape{N, IC, IH, IW}, {OC, IC, FH, FW}, {OH, OW}},
  744. opt);
  745. };
  746. run(1, 1, 1, 5, 5, 3, 3);
  747. run(2, 3, 4, 5, 5, 3, 3);
  748. run(3, 3, 4, 224, 223, 3, 3);
  749. run(3, 3, 4, 224, 223, 2, 2);
  750. };
  751. run_with_param();
  752. run_with_param(2, 2, 3, 3);
  753. run_with_param(3, 2, 1, 2);
  754. run_with_param(2, 3, 2, 2);
  755. }
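// MaskPropagate: the input mask is propagated and fed to MaskConvolution; the
// reference zeroes the masked source positions and runs a dense convolution,
// which is expected to match the operator's output.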
  756. TEST(TestOprDNN, MaskPropagate) {
  757. using Checker = AutoOprChecker<3, 1>;
  758. opr::MaskPropagate::Param mask_param;
  759. opr::Convolution::Param conv_param;
  760. auto make_graph =
  761. [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  762. auto inp_mask = inputs[2];
  763. auto out_mask = opr::MaskPropagate::make(inp_mask, mask_param);
  764. return {opr::MaskConvolution::make(inputs[0], inputs[1], out_mask,
  765. conv_param)};
  766. };
  767. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  768. auto& src = *inp[0];
  769. auto& mask = *inp[2];
  770. auto src_ptr = inp[0]->ptr<float>();
  771. auto mask_ptr = inp[2]->ptr<int>();
  772. mgb_assert(src.shape()[2] == mask.shape()[0] &&
  773. src.shape()[3] == mask.shape()[1]);
  774. for (size_t i = 0; i < src.shape()[0] * src.shape()[1]; ++i) {
  775. for (size_t mask_idx = 0;
  776. mask_idx < src.shape()[2] * src.shape()[3]; ++mask_idx) {
  777. if (mask_ptr[mask_idx] == 0) {
  778. src_ptr[i * src.layout().stride[1] + mask_idx] = 0;
  779. }
  780. }
  781. }
  782. std::shared_ptr<HostTensorND> sh_out;
  783. convolution_brute({inp[0], inp[1]}, sh_out, conv_param);
  784. dest[0] = *sh_out;
  785. };
  786. auto gen_mask = [](HostTensorND& dest) {
  787. HostTensorGenerator<dtype::Int32, RandomDistribution::UNIFORM>
  788. mask_generator{0, 1};
  789. dest = *mask_generator(dest.shape(), dest.comp_node());
  790. };
  791. auto run_with_param = [&](size_t FH, size_t FW, size_t SH = 1,
  792. size_t SW = 1, size_t PH = 0, size_t PW = 0,
  793. size_t DH = 1, size_t DW = 1) {
  794. conv_param.pad_h = PH;
  795. conv_param.pad_w = PW;
  796. conv_param.stride_h = SH;
  797. conv_param.stride_w = SW;
  798. conv_param.dilate_h = DH;
  799. conv_param.dilate_w = DW;
  800. mask_param.pad_h = PH;
  801. mask_param.pad_w = PW;
  802. mask_param.stride_h = SH;
  803. mask_param.stride_w = SW;
  804. mask_param.kernel_h = FH;
  805. mask_param.kernel_w = FW;
  806. mask_param.dilate_h = DH;
  807. mask_param.dilate_w = DW;
  808. Checker checker{make_graph, fwd};
  809. Checker::RunOptions opt;
  810. checker.set_output_allow_grad(0, false);
  811. checker.set_input_dtype(2, dtype::Int32());
  812. checker.set_input_generator(2, gen_mask);
  813. auto run = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW) {
  814. checker.run(
  815. {TensorShape{N, IC, IH, IW}, {OC, IC, FH, FW}, {IH, IW}},
  816. opt);
  817. };
  818. run(1, 1, 1, 5, 5);
  819. run(2, 3, 4, 5, 5);
  820. run(3, 3, 4, 224, 223);
  821. run(3, 3, 4, 224, 223);
  822. };
  823. run_with_param(3, 3, 1, 1, 0, 0, 2, 2);
  824. run_with_param(3, 3, 2, 2, 3, 3);
  825. run_with_param(4, 2, 3, 2, 1, 2);
  826. run_with_param(2, 4, 2, 3, 2, 2);
  827. run_with_param(4, 2, 3, 2, 1, 2, 2, 2);
  828. run_with_param(2, 4, 2, 3, 2, 2, 2, 1);
  829. }
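// Naive 3D convolution reference, structured like convolution_brute with an
// extra depth dimension.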
  830. void convolution3d_brute(const std::vector<std::shared_ptr<HostTensorND>> &in_tensor,
  831. std::shared_ptr<HostTensorND> &out_tensor,
  832. const opr::Convolution3D::Param &param)
  833. {
  834. mgb_assert(in_tensor.size() == 2);
  835. auto in = in_tensor[0], filter = in_tensor[1];
  836. mgb_assert(in->shape().ndim == 5);
  837. mgb_assert(filter->shape().ndim == 5);
  838. int batch_size = in->shape().shape[0];
  839. int ic = in->shape().shape[1];
  840. int id = in->shape().shape[2];
  841. int ih = in->shape().shape[3];
  842. int iw = in->shape().shape[4];
  843. int fd = filter->shape().shape[2];
  844. int fh = filter->shape().shape[3];
  845. int fw = filter->shape().shape[4];
  846. int pd = param.pad_d;
  847. int ph = param.pad_h;
  848. int pw = param.pad_w;
  849. int sd = param.stride_d;
  850. int sh = param.stride_h;
  851. int sw = param.stride_w;
  852. int dd = param.dilate_d;
  853. int dh = param.dilate_h;
  854. int dw = param.dilate_w;
  855. mgb_assert(id + 2*pd >= (fd - 1) * dd + 1);
  856. mgb_assert(ih + 2*ph >= (fh - 1) * dh + 1);
  857. mgb_assert(iw + 2*pw >= (fw - 1) * dw + 1);
  858. int od = (id + 2*pd - ((fd - 1) * dd + 1)) / sd + 1;
  859. int oh = (ih + 2*ph - ((fh - 1) * dh + 1)) / sh + 1;
  860. int ow = (iw + 2*pw - ((fw - 1) * dw + 1)) / sw + 1;
  861. mgb_assert(static_cast<size_t>(ic) == filter->shape().shape[1]);
  862. int oc = filter->shape().shape[0];
  863. out_tensor = std::make_shared<HostTensorND>(CompNode::load("xpu0"),
  864. TensorShape{
  865. static_cast<size_t>(batch_size),
  866. static_cast<size_t>(oc),
  867. static_cast<size_t>(od),
  868. static_cast<size_t>(oh),
  869. static_cast<size_t>(ow)});
  870. int pn, poc, pod, poh, pow,
  871. pic, pid, pih, piw,
  872. pfd, pfh, pfw;
  873. for (pn = 0; pn < batch_size; ++pn)
  874. for (poc = 0; poc < oc; ++poc)
  875. for (pod = 0, pid = -pd; pod < od; ++pod, pid += sd)
  876. for (poh = 0, pih = -ph; poh < oh; ++poh, pih += sh)
  877. for (pow = 0, piw = -pw; pow < ow; ++pow, piw += sw)
  878. {
  879. float &target = out_tensor->ptr<float>({
  880. static_cast<size_t>(pn),
  881. static_cast<size_t>(poc),
  882. static_cast<size_t>(pod),
  883. static_cast<size_t>(poh),
  884. static_cast<size_t>(pow)})[0];
  885. target = 0;
  886. for (pic = 0; pic < ic; ++pic)
  887. for (pfd = 0; pfd < fd; ++pfd)
  888. for (pfh = 0; pfh < fh; ++pfh)
  889. for (pfw = 0; pfw < fw; ++pfw)
  890. {
  891. int prid, prih, priw;
  892. float img_data, filter_data;
  893. if (param.mode == opr::Convolution3D::Param::Mode::CONVOLUTION) {
  894. prid = pid + (fd - pfd - 1) * dd;
  895. prih = pih + (fh - pfh - 1) * dh;
  896. priw = piw + (fw - pfw - 1) * dw;
  897. } else {
  898. mgb_assert(param.mode == opr::Convolution3D::Param::Mode::CROSS_CORRELATION);
  899. prid = pid + pfd * dd;
  900. prih = pih + pfh * dh;
  901. priw = piw + pfw * dw;
  902. }
  903. if (prid >= 0 && prid < id &&
  904. prih >= 0 && prih < ih &&
  905. priw >= 0 && priw < iw) {
  906. img_data = in_tensor[0]->ptr<float>({
  907. static_cast<size_t>(pn),
  908. static_cast<size_t>(pic),
  909. static_cast<size_t>(prid),
  910. static_cast<size_t>(prih),
  911. static_cast<size_t>(priw)})[0];
  912. } else {
  913. img_data = 0;
  914. }
  915. filter_data = filter->ptr<float>({
  916. static_cast<size_t>(poc),
  917. static_cast<size_t>(pic),
  918. static_cast<size_t>(pfd),
  919. static_cast<size_t>(pfh),
  920. static_cast<size_t>(pfw)})[0];
  921. target += img_data * filter_data;
  922. }
  923. }
  924. }
  925. TEST(TestOprDNN, Convolution3DForward) {
  926. for (uint32_t batch_size : {8})
  927. for (uint32_t id : {12})
  928. for (uint32_t fd : {1, 3})
  929. for (uint32_t ic : {4})
  930. for (uint32_t oc : {ic})
  931. for (uint32_t pd : {0, 2})
  932. for (uint32_t sd : {1, 3})
  933. for (uint32_t dd : {1, 3})
  934. for (bool xcorr : {0, 1}) {
  935. uint32_t ih = id + 1, fh = fd, ph = pd + 1, sh = sd + 1;
  936. uint32_t iw = ih + 1, fw = fh, pw = ph + 1, sw = sh + 1;
  937. Param3D param{xcorr ? Param3D::Mode::CROSS_CORRELATION :
  938. Param3D::Mode::CONVOLUTION , pd, ph, pw,
  939. sd, sh, sw, dd, dd, dd};
  940. // !!! DEPRECATED. use AutoOprChecker instead.
  941. opr::test::ForwardChecker<opr::Convolution3D, 2> forward_checker({
  942. {batch_size, ic, id, ih, iw},
  943. {oc, ic, fd, fh, fw}},
  944. convolution3d_brute, param);
  945. forward_checker.run();
  946. }
  947. }
  948. TEST(TestOprDNN, Convolution3DBackward) {
  949. for (uint32_t batch_size : {8})
  950. for (uint32_t id : {12})
  951. for (uint32_t fd : {1, 3})
  952. for (uint32_t ic : {4})
  953. for (uint32_t oc : {ic})
  954. for (uint32_t pd : {0, 2})
  955. for (uint32_t sd : {1, 3})
  956. for (uint32_t dd : {1, 3})
  957. for (bool xcorr : {0, 1}) {
  958. uint32_t ih = id + 1, fh = fd, ph = pd + 1, sh = sd + 1;
  959. uint32_t iw = ih + 1, fw = fh, pw = ph + 1, sw = sh + 1;
  960. Param3D param{xcorr ? Param3D::Mode::CROSS_CORRELATION :
  961. Param3D::Mode::CONVOLUTION,
  962. pd, ph, pw, sd, sh, sw, dd, dd, dd};
  963. // !!! DEPRECATED. use AutoOprChecker instead.
  964. opr::test::BackwardChecker<opr::Convolution3D, 2> backward_checker(
  965. {{batch_size, ic, id, ih, iw},
  966. {oc, ic, fd, fh, fw}}, param, 1e-2, 1);
  967. backward_checker.run();
  968. }
  969. }
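// 3D group convolution, checked with the same per-group slicing scheme as the
// 2D GroupConv test above.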
  970. TEST(TestOprDNN, GroupConv3D) {
  971. using Checker = AutoOprChecker<2, 1>;
  972. opr::Convolution3D::Param param;
  973. param.pad_d = 0;
  974. param.pad_h = 1;
  975. param.pad_w = 0;
  976. param.stride_d = 1;
  977. param.stride_h = 2;
  978. auto make_graph = [&](
  979. const Checker::SymInpArray &inputs) -> Checker::SymOutArray {
  980. auto p1 = param;
  981. p1.sparse = opr::Convolution3D::Param::Sparse::GROUP;
  982. return {opr::Convolution3D::make(inputs[0], inputs[1], p1)};
  983. };
  984. auto cn = CompNode::load("xpux");
  985. auto inp0 = std::make_shared<HostTensorND>(cn, dtype::Float32()),
  986. inp1 = std::make_shared<HostTensorND>(cn, dtype::Float32());
  987. HostTensorND out_raw;
  988. auto graph_raw = ComputingGraph::make();
  989. auto func_raw = graph_raw->compile({
  990. make_callback_copy(
  991. opr::Convolution3D::make(
  992. opr::Host2DeviceCopy::make(*graph_raw, inp0),
  993. opr::Host2DeviceCopy::make(*graph_raw, inp1),
  994. param),
  995. out_raw)});
  996. auto fwd = [&](Checker::NumOutArray &dest, Checker::NumInpArray inp) {
  997. auto &&out = dest[0];
  998. auto sl = inp[0]->layout(),
  999. fl = inp[1]->layout().remove_axis(0);
  1000. TensorLayout ol;
  1001. auto group = inp[1]->layout()[0];
  1002. sl.shape[1] /= group;
  1003. for (size_t i = 0; i < group; ++ i) {
  1004. inp0->copy_from(inp[0]->sub(SubTensorSpec::make_from_offset_elem(
  1005. sl, i * sl[1] * sl[2] * sl[3] * sl[4])));
  1006. inp1->copy_from(inp[1]->sub(SubTensorSpec::make_from_offset_elem(
  1007. fl, i * fl.total_nr_elems())));
  1008. func_raw->execute();
  1009. if (!i) {
  1010. auto oshp = out_raw.shape();
  1011. oshp[1] *= group;
  1012. out.resize(oshp);
  1013. ol = out.layout();
  1014. ol[1] /= group;
  1015. }
  1016. out.sub(SubTensorSpec::make_from_offset_elem(
  1017. ol, i * ol[1] * ol[2] * ol[3] * ol[4])).
  1018. copy_from_fixlayout(out_raw);
  1019. }
  1020. };
  1021. Checker::RunOptions opt;
  1022. opt.numdiff_eps = 1;
  1023. opt.outputs_max_err = 5e-5;
  1024. Checker checker{make_graph, fwd};
  1025. auto run = [&](const TensorShape &ishp,
  1026. size_t fd, size_t fh, size_t fw, size_t oc, size_t group) {
  1027. size_t ic = ishp[1];
  1028. TensorShape flt{group, oc/group, ic/group, fd, fh, fw};
  1029. checker.
  1030. run({ishp, flt}, opt);
  1031. };
  1032. run({1, 2, 1, 1, 1}, 1, 1, 1, 2, 2);
  1033. run({3, 9, 5, 4, 3}, 1, 2, 3, 6, 3);
  1034. run({2, 1, 3, 6, 9}, 2, 3, 3, 5, 1);
  1035. run({2, 1, 3, 6, 9}, 2, 3, 3, 5, 1);
  1036. }
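// Grouped 3D deconvolution; the scatter-accumulate reference is written inline
// in the fwd lambda, mirroring conv_bwd_data_brute with an added depth axis.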
  1037. TEST(TestOprDNN, Deconvolution3D) {
  1038. using Checker = AutoOprChecker<2, 1>;
  1039. Param3D param{Param3D::Mode::CROSS_CORRELATION, 0, 1, 1, 1, 2, 2};
  1040. param.sparse = Param3D::Sparse::GROUP;
  1041. auto make_graph = [&](
  1042. const Checker::SymInpArray &inputs) -> Checker::SymOutArray {
  1043. return {opr::Convolution3DBackwardData::make_deconv(
  1044. inputs[0], inputs[1], param)};
  1045. };
  1046. auto fwd = [&](Checker::NumOutArray &dest, Checker::NumInpArray inp) {
  1047. auto &&data = *inp[0], &&filter = *inp[1];
  1048. size_t N = data.shape(0),
  1049. ID = data.shape(2), IH = data.shape(3), IW = data.shape(4),
  1050. GROUP = filter.shape(0),
  1051. ICPG = filter.shape(1), OCPG = filter.shape(2),
  1052. FD = filter.shape(3), FH = filter.shape(4), FW = filter.shape(5);
  1053. auto &&out = dest[0];
  1054. auto get_shp = [](
  1055. size_t inp, size_t filter, size_t stride, size_t pad,
  1056. size_t dilate) {
  1057. return (inp - 1) * stride + (filter - 1) * dilate + 1 - pad * 2;
  1058. };
  1059. size_t OD = get_shp(ID, FD,
  1060. param.stride_d, param.pad_d, param.dilate_d),
  1061. OH = get_shp(IH, FH,
  1062. param.stride_h, param.pad_h, param.dilate_h),
  1063. OW = get_shp(IW, FW,
  1064. param.stride_w, param.pad_w, param.dilate_w);
  1065. out.resize({N, OCPG * GROUP, OD, OH, OW});
  1066. auto fptr = filter.ptr<float>(),
  1067. dptr = data.ptr<float>(),
  1068. optr = out.ptr<float>();
  1069. memset(optr, 0, sizeof(float) * out.shape().total_nr_elems());
  1070. auto ol = out.layout(), fl = filter.layout();
  1071. #define FOR2(a, A, b, B) \
  1072. for (size_t a = 0; a < A; ++ a) \
  1073. for (size_t b = 0; b < B; ++ b)
  1074. #define FOR3(a, A, b, B, c, C) \
  1075. FOR2(a, A, b, B) \
  1076. for (size_t c = 0; c < C; ++ c)
  1077. #define FOR4(a, A, b, B, c, C, d, D) \
  1078. FOR3(a, A, b, B, c, C) \
  1079. for (size_t d = 0; d < D; ++ d)
  1080. FOR3(n, N, group, GROUP, icg, ICPG)
  1081. FOR3(id, ID, ih, IH, iw, IW) {
  1082. float scale = *(dptr ++);
  1083. FOR4(ocg, OCPG, fd, FD, fh, FH, fw, FW) {
  1084. auto oc_tot = group * OCPG + ocg;
  1085. int od = int(id * param.stride_d +
  1086. fd * param.dilate_d) - int(param.pad_d),
  1087. oh = int(ih * param.stride_h +
  1088. fh * param.dilate_h) - int(param.pad_h),
  1089. ow = int(iw * param.stride_w +
  1090. fw * param.dilate_w) - int(param.pad_w);
  1091. if (od >= 0 && oh >= 0 && ow >= 0 &&
  1092. od < static_cast<int>(OD) &&
  1093. oh < static_cast<int>(OH) &&
  1094. ow < static_cast<int>(OW)) {
  1095. auto out_off = n * ol.stride[0] + oc_tot * ol.stride[1] +
  1096. od * ol.stride[2] + oh * ol.stride[3] + ow,
  1097. flt_off = group * fl.stride[0] + icg * fl.stride[1] +
  1098. ocg * fl.stride[2] + fd * fl.stride[3] +
  1099. fh * fl.stride[4] + fw;
  1100. optr[out_off] += scale * fptr[flt_off];
  1101. }
  1102. }
  1103. }
  1104. #undef FOR4
  1105. #undef FOR3
  1106. #undef FOR2
  1107. };
  1108. Checker::RunOptions opt;
  1109. opt.numdiff_eps = 1;
  1110. Checker(make_graph, fwd).
  1111. run({TensorShape{2, 4, 3, 3, 2}, {1, 4, 5, 3, 2, 2}}, opt).
  1112. run({TensorShape{3, 2, 1, 1, 1}, {2, 1, 1, 4, 3, 3}}, opt).
  1113. run({TensorShape{4, 6, 2, 2, 2}, {2, 3, 4, 6, 5, 4}}, opt);
  1114. }
  1115. TEST(TestOprDNN, Convolution3DExePolicy) {
  1116. Param3D param{Param3D::Mode::CONVOLUTION};
  1117. using Policy = opr::Convolution3D::ExecutionPolicy;
  1118. using S = Policy::Strategy;
  1119. #if MGB_ENABLE_FASTRUN
  1120. for (auto strategy :
  1121. SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
  1122. S::PROFILE | S::HEURISTIC}) {
  1123. #else
  1124. for (auto strategy :
  1125. SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
  1126. #endif
  1127. using Checker = AutoOprChecker<2, 1>;
  1128. auto make_graph = [&](const Checker::SymInpArray &inputs) ->
  1129. Checker::SymOutArray {
  1130. Policy policy;
  1131. policy.strategy = strategy;
  1132. auto out = opr::Convolution3D::make(
  1133. inputs[0], inputs[1], param, policy);
  1134. return {out};
  1135. };
  1136. auto fwd = [&](Checker::NumOutArray &dest, Checker::NumInpArray inp) {
  1137. std::shared_ptr<HostTensorND> sh_out;
  1138. convolution3d_brute({inp.begin(), inp.end()}, sh_out, param);
  1139. dest[0] = *sh_out;
  1140. };
  1141. Checker::RunOptions opt;
  1142. opt.numdiff_eps = 1;
  1143. Checker(make_graph, fwd).
  1144. run({TensorShape{3, 2, 3, 4, 1}, {4, 2, 2, 2, 1}}, opt).
  1145. run({TensorShape{3, 3, 2, 6, 2}, {2, 3, 1, 4, 1}}, opt).
  1146. run({TensorShape{1, 1, 4, 4, 4}, {2, 1, 3, 3, 3}}, opt);
  1147. }
  1148. }
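// ConvBias forward with 2 inputs (no bias) and 3 inputs (bias of shape
// {1, OC, 1, 1} broadcast over the output); the reference adds the per-channel
// bias to convolution_brute's result. Gradient checks are disabled.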
  1149. TEST(TestOprDNN, ConvBiasForward) {
  1150. using Checker2 = AutoOprChecker<2, 1>;
  1151. using Checker3 = AutoOprChecker<3, 1>;
  1152. opr::ConvBiasForward::Param param;
  1153. auto make_graph2 =
  1154. [&](const Checker2::SymInpArray& inputs) -> Checker2::SymOutArray {
  1155. return {opr::ConvBiasForward::make(inputs[0], inputs[1], param)};
  1156. };
  1157. auto make_graph3 =
  1158. [&](const Checker3::SymInpArray& inputs) -> Checker3::SymOutArray {
  1159. return {opr::ConvBiasForward::make(inputs[0], inputs[1], inputs[2],
  1160. param)};
  1161. };
  1162. auto fwd2 = [&](Checker2::NumOutArray& dest, Checker2::NumInpArray inp) {
  1163. std::shared_ptr<HostTensorND> sh_out;
  1164. convolution_brute({inp[0], inp[1]}, sh_out,
  1165. convert_to_conv_param(param));
  1166. dest[0] = *sh_out;
  1167. };
  1168. auto fwd3 = [&](Checker3::NumOutArray& dest, Checker3::NumInpArray inp) {
  1169. std::shared_ptr<HostTensorND> sh_out;
  1170. convolution_brute({inp[0], inp[1]}, sh_out,
  1171. convert_to_conv_param(param));
  1172. dest[0] = *sh_out;
  1173. size_t N = dest[0].shape()[0];
  1174. size_t OC = dest[0].shape()[1];
  1175. size_t OH = dest[0].shape()[2];
  1176. size_t OW = dest[0].shape()[3];
  1177. auto dest_ptr = dest[0].ptr<float>();
  1178. for (size_t i = 0; i < N; i++) {
  1179. auto bias_ptr = inp[2]->ptr<float>();
  1180. for (size_t c = 0; c < OC; c++) {
  1181. for (size_t hw = 0; hw < OH * OW; hw++) {
  1182. *(dest_ptr++) += *(bias_ptr);
  1183. }
  1184. bias_ptr++;
  1185. }
  1186. }
  1187. };
  1188. auto run_with_param = [&](size_t SH = 1, size_t SW = 1, size_t PH = 0,
  1189. size_t PW = 0) {
  1190. param.pad_h = PH;
  1191. param.pad_w = PW;
  1192. param.stride_h = SH;
  1193. param.stride_w = SW;
  1194. Checker2 checker2{make_graph2, fwd2};
  1195. Checker2::RunOptions opt2;
  1196. checker2.set_output_allow_grad(0, false);
  1197. Checker3 checker3{make_graph3, fwd3};
  1198. Checker3::RunOptions opt3;
  1199. checker3.set_output_allow_grad(0, false);
  1200. auto run = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW,
  1201. size_t FH, size_t FW) {
  1202. auto opr = megdnn_naive_handle()
  1203. ->create_operator<megdnn::ConvolutionForward>();
  1204. opr->param() = convert_to_conv_param(param);
  1205. TensorLayout dest_layout;
  1206. opr->deduce_layout({{N, IC, IH, IW}, dtype::Float32()},
  1207. {{OC, IC, FH, FW}, dtype::Float32()},
  1208. dest_layout);
  1209. checker2.run({TensorShape{N, IC, IH, IW}, {OC, IC, FH, FW}}, opt2);
  1210. checker3.run({TensorShape{N, IC, IH, IW},
  1211. {OC, IC, FH, FW},
  1212. {1, OC, 1, 1}},
  1213. opt3);
  1214. };
  1215. run(1, 1, 1, 5, 5, 1, 1);
  1216. run(1, 1, 1, 5, 5, 3, 3);
  1217. run(2, 3, 4, 5, 5, 3, 3);
  1218. run(3, 3, 4, 224, 223, 3, 3);
  1219. run(3, 3, 4, 224, 223, 2, 2);
  1220. };
  1221. run_with_param();
  1222. run_with_param(2, 2, 3, 3);
  1223. run_with_param(3, 2, 1, 2);
  1224. run_with_param(2, 3, 2, 2);
  1225. }
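// GPU-only variant with an extra z (residual) input: the reference adds
// both the per-channel bias and the z tensor to the brute-force result.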
  1226. TEST(TestOprDNN, ConvBiasForwardWithZ) {
  1227. REQUIRE_GPU(1);
  1228. using Checker4 = AutoOprChecker<4, 1>;
  1229. opr::ConvBiasForward::Param param;
  1230. auto make_graph4 =
  1231. [&](const Checker4::SymInpArray& inputs) -> Checker4::SymOutArray {
  1232. return {opr::ConvBiasForward::make(inputs[0], inputs[1], inputs[2],
  1233. inputs[3], param)};
  1234. };
  1235. auto fwd4 = [&](Checker4::NumOutArray& dest, Checker4::NumInpArray inp) {
  1236. std::shared_ptr<HostTensorND> sh_out;
  1237. convolution_brute({inp[0], inp[1]}, sh_out,
  1238. convert_to_conv_param(param));
  1239. dest[0] = *sh_out;
  1240. size_t N = dest[0].shape()[0];
  1241. size_t OC = dest[0].shape()[1];
  1242. size_t OH = dest[0].shape()[2];
  1243. size_t OW = dest[0].shape()[3];
  1244. auto dest_ptr = dest[0].ptr<float>();
  1245. float* z_ptr = inp[3]->ptr<float>();
  1246. for (size_t i = 0; i < N; i++) {
  1247. auto bias_ptr = inp[2]->ptr<float>();
  1248. for (size_t c = 0; c < OC; c++) {
  1249. for (size_t hw = 0; hw < OH * OW; hw++) {
  1250. *(dest_ptr++) += *(bias_ptr) + *(z_ptr++);
  1251. }
  1252. bias_ptr++;
  1253. }
  1254. }
  1255. };
  1256. auto run_with_param = [&](size_t SH = 1, size_t SW = 1, size_t PH = 0,
  1257. size_t PW = 0) {
  1258. param.pad_h = PH;
  1259. param.pad_w = PW;
  1260. param.stride_h = SH;
  1261. param.stride_w = SW;
  1262. Checker4 checker4{make_graph4, fwd4};
  1263. Checker4::RunOptions opt4;
  1264. checker4.set_output_allow_grad(0, false);
  1265. auto run = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW,
  1266. size_t FH, size_t FW) {
  1267. auto opr = megdnn_naive_handle()
  1268. ->create_operator<megdnn::ConvolutionForward>();
  1269. opr->param() = convert_to_conv_param(param);
  1270. TensorLayout dest_layout;
  1271. opr->deduce_layout({{N, IC, IH, IW}, dtype::Float32()},
  1272. {{OC, IC, FH, FW}, dtype::Float32()},
  1273. dest_layout);
  1274. checker4.run({TensorShape{N, IC, IH, IW},
  1275. {OC, IC, FH, FW},
  1276. {1, OC, 1, 1},
  1277. {N, OC, dest_layout[2], dest_layout[3]}},
  1278. opt4);
  1279. };
  1280. run(1, 1, 1, 5, 5, 3, 3);
  1281. run(2, 3, 4, 5, 5, 3, 3);
  1282. run(3, 3, 4, 224, 223, 3, 3);
  1283. run(3, 3, 4, 224, 223, 2, 2);
  1284. };
  1285. run_with_param();
  1286. run_with_param(2, 2, 3, 3);
  1287. run_with_param(3, 2, 1, 2);
  1288. run_with_param(2, 3, 2, 2);
  1289. }
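// Quantized int8 conv-bias in NCHW4 layout: the checked graph is the
// unfused quantized expression, while the reference path runs the same
// graph through optimize_for_inference with conv-bias-nonlinearity
// fusion enabled, so both paths must agree.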
  1290. TEST(TestOprDNN, ConvBiasINT8x8xX_NCHW4) {
  1291. using Checker = AutoOprChecker<3, 1>;
  1292. using Param = opr::ConvBias::Param;
  1293. opr::ConvBiasForward::Param param;
  1294. auto make_quantized = [&](SymbolVar x, const DType& dtype) {
  1295. return opr::TypeCvt::make(x, dtype);
  1296. };
  1297. auto make_graph =
  1298. [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  1299. auto conv_param = convert_to_conv_param(param);
  1300. auto y = opr::Convolution::make(
  1301. make_quantized(inputs[0], dtype::QuantizedS8(0.3f)),
  1302. make_quantized(inputs[1], dtype::QuantizedS8(0.1f)), conv_param);
  1303. y = y + make_quantized(inputs[2], dtype::QuantizedS32(0.03f));
  1304. if (param.nonlineMode == Param::NonlineMode::RELU)
  1305. y = opr::Elemwise::make(
  1306. {y}, {opr::Elemwise::Mode::RELU});
  1307. y = opr::TypeCvt::make(y, dtype::QuantizedS8(0.5f));
  1308. return {opr::TypeCvt::make(y, dtype::Float32())};
  1309. };
  1310. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1311. auto graph = ComputingGraph::make();
  1312. Checker::SymInpArray inputs;
  1313. for (size_t i = 0; i < inp.size(); ++i) {
  1314. inputs[i] = opr::Host2DeviceCopy::make(
  1315. *graph, inp[i]);
  1316. }
  1317. auto options = gopt::OptimizeForInferenceOptions{};
  1318. options.enable_fuse_conv_bias_nonlinearity();
  1319. auto y = gopt::optimize_for_inference({make_graph(inputs)[0]},
  1320. options)[0];
  1321. auto func = graph->compile({make_callback_copy(y, dest[0])});
  1322. func->execute();
  1323. func->wait();
  1324. };
  1325. auto run_with_param = [&](size_t SH = 1, size_t SW = 1, size_t PH = 0,
  1326. size_t PW = 0, size_t group = 1) {
  1327. param.pad_h = PH;
  1328. param.pad_w = PW;
  1329. param.stride_h = SH;
  1330. param.stride_w = SW;
  1331. param.format = Param::Format::NCHW4;
  1332. if (group != 1)
  1333. param.sparse = Param::Sparse::GROUP;
  1334. Checker checker{make_graph, fwd, CompNode::load("cpu0")};
  1335. Checker::RunOptions opt;
  1336. checker.set_output_allow_grad(0, false);
  1337. auto run = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW,
  1338. size_t FH, size_t FW) {
  1339. mgb_assert(IC % 4 == 0 && OC % 4 == 0);
  1340. checker.run({TensorShape{N, group * IC / 4, IH, IW, 4},
  1341. {group, OC, IC / 4, FH, FW, 4},
  1342. {1, group * OC / 4, 1, 1, 4}},
  1343. opt);
  1344. };
  1345. run(1, 8, 8, 56, 56, 3, 3);
  1346. run(1, 8, 8, 56, 56, 3, 3);
  1347. run(1, 8, 8, 56, 56, 3, 3);
  1348. };
  1349. run_with_param(1, 1, 1, 1, 8);
  1350. run_with_param();
  1351. run_with_param(2, 2, 3, 3);
  1352. run_with_param(3, 2, 1, 2);
  1353. run_with_param(2, 3, 2, 2);
  1354. }
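// Output dtype deduction: quantized inputs give a QuantizedS32 output
// whose scale is the product of the input scales; Int8 inputs give Int32.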
  1355. TEST(TestOprDNN, ConvolutionDTypeInference) {
  1356. Param param;
  1357. param.mode = Mode::CONVOLUTION;
  1358. auto cn = CompNode::load("cpu0");
  1359. auto graph = ComputingGraph::make();
  1360. HostTensorND inp_host{
  1361. cn, {1, 3, 7, 7}, dtype::Quantized8Asymm(0.233f, (uint8_t)123)};
  1362. HostTensorND filt_host{
  1363. cn, {8, 3, 1, 1}, dtype::Quantized8Asymm(0.874f, (uint8_t)234)};
  1364. auto inp = opr::ImmutableTensor::make(*graph, inp_host);
  1365. auto filt = opr::ImmutableTensor::make(*graph, filt_host);
  1366. auto opr = opr::Convolution::make(inp, filt, param);
  1367. ASSERT_EQ(opr.dtype().enumv(), DTypeEnum::QuantizedS32);
  1368. // This has to be EQ instead of NEAR
  1369. EXPECT_EQ(opr.dtype().param<dtype::QuantizedS32>().scale, 0.233f * 0.874f);
  1370. inp_host = {cn, {1, 3, 7, 7}, dtype::QuantizedS8(0.1234f)};
  1371. filt_host = {cn, {8, 3, 1, 1}, dtype::QuantizedS8(0.2345f)};
  1372. inp = opr::ImmutableTensor::make(*graph, inp_host);
  1373. filt = opr::ImmutableTensor::make(*graph, filt_host);
  1374. opr = opr::Convolution::make(inp, filt, param);
  1375. ASSERT_EQ(opr.dtype().enumv(), DTypeEnum::QuantizedS32);
  1376. EXPECT_EQ(opr.dtype().param<dtype::QuantizedS32>().scale,
  1377. 0.1234f * 0.2345f);
  1378. inp_host = {cn, {1, 3, 7, 7}, dtype::Int8()};
  1379. filt_host = {cn, {8, 3, 1, 1}, dtype::Int8()};
  1380. inp = opr::ImmutableTensor::make(*graph, inp_host);
  1381. filt = opr::ImmutableTensor::make(*graph, filt_host);
  1382. opr = opr::Convolution::make(inp, filt, param);
  1383. ASSERT_EQ(opr.dtype().enumv(), DTypeEnum::Int32);
  1384. }
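// With an explicit output dtype in the OperatorNodeConfig, ConvBias
// should report QuantizedS8 with the requested scale.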
  1385. TEST(TestOprDNN, ConvBiasINT8x8xXDTypeInference) {
  1386. float inp_scale = 1.926f;
  1387. float filt_scale = 0.817f;
  1388. float bias_scale = inp_scale * filt_scale;
  1389. opr::ConvBias::Param param;
  1390. param.mode = Mode::CONVOLUTION;
  1391. auto cn = CompNode::load("cpu0");
  1392. auto graph = ComputingGraph::make();
  1393. HostTensorND inp_host{cn, {1, 3, 7, 7}, dtype::QuantizedS8(inp_scale)};
  1394. HostTensorND filt_host{cn, {8, 3, 1, 1}, dtype::QuantizedS8(filt_scale)};
  1395. DType output_dtype = dtype::QuantizedS8(bias_scale);
  1396. HostTensorND bias_host{cn, {1, 3, 7, 7}, dtype::QuantizedS32(bias_scale)};
  1397. auto inp = opr::ImmutableTensor::make(*graph, inp_host);
  1398. auto filt = opr::ImmutableTensor::make(*graph, filt_host);
1399. auto bias = opr::ImmutableTensor::make(*graph, bias_host);
  1400. auto opr = opr::ConvBiasForward::make(inp, filt, bias, param,
  1401. {}, OperatorNodeConfig{output_dtype});
  1402. ASSERT_EQ(opr.dtype().enumv(), DTypeEnum::QuantizedS8);
  1403. EXPECT_EQ(opr.dtype().param<dtype::QuantizedS8>().scale, bias_scale);
  1404. }
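// Dump a quantized ConvBiasForward graph to a file and reload it,
// checking that the output dtype survives serialization.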
  1405. TEST(TestOprDNN, ConvBiasINT8x8xXSerialization) {
  1406. using namespace serialization;
  1407. float inp_scale = 1.926f;
  1408. float filt_scale = 0.817f;
  1409. float bias_scale = inp_scale * filt_scale;
  1410. DType output_dtype = dtype::QuantizedS8(bias_scale);
  1411. auto fname = output_file("ConvBiasINT8x8xXTest");
  1412. auto dump = [&]() {
  1413. opr::ConvBias::Param param;
  1414. param.mode = Mode::CONVOLUTION;
  1415. auto cn = CompNode::load("cpu0");
  1416. auto graph = ComputingGraph::make();
  1417. HostTensorND inp_host{cn, {1, 3, 7, 7}, dtype::QuantizedS8(inp_scale)};
  1418. HostTensorND filt_host{
  1419. cn, {8, 3, 1, 1}, dtype::QuantizedS8(filt_scale)};
  1420. HostTensorND bias_host{
  1421. cn, {1, 3, 7, 7}, dtype::QuantizedS32(bias_scale)};
  1422. auto inp = opr::ImmutableTensor::make(*graph, inp_host);
  1423. auto filt = opr::ImmutableTensor::make(*graph, filt_host);
1424. auto bias = opr::ImmutableTensor::make(*graph, bias_host);
  1425. auto opr = opr::ConvBiasForward::make(inp, filt, bias, param,
  1426. {},
  1427. OperatorNodeConfig{output_dtype});
  1428. auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
  1429. auto rst = dumper->dump({opr});
  1430. ASSERT_EQ(rst.outputs.size(), 1u);
  1431. };
  1432. auto load = [&]() {
  1433. auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
  1434. auto rst = loader->load();
  1435. ASSERT_EQ(rst.output_var_list.size(), 1u);
  1436. EXPECT_EQ(rst.output_var_list[0].dtype(), output_dtype);
  1437. };
  1438. dump();
  1439. load();
  1440. }
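// LocalShare forward against local_share_brute for several filter sizes,
// strides and spatial group counts; gradient checking disabled.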
  1441. TEST(TestOprDNN, LocalShareForward) {
  1442. REQUIRE_GPU(1);
  1443. using Checker = AutoOprChecker<2, 1>;
  1444. using Param = opr::LocalShare::Param;
  1445. Param param;
  1446. param.mode = Param::Mode::CROSS_CORRELATION;
  1447. param.sparse = Param::Sparse::DENSE;
  1448. auto make_graph =
  1449. [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  1450. return {opr::LocalShare::make(inputs[0], inputs[1], param)};
  1451. };
  1452. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1453. mgb_assert(inp.size() == 2);
  1454. mgb_assert(dest.size() == 1);
  1455. std::shared_ptr<HostTensorND> out;
  1456. local_share_brute({inp[0], inp[1]}, out, param);
  1457. dest[0] = *out;
  1458. };
  1459. auto run_with_param = [&](size_t fh = 3, size_t fw = 3, size_t sh = 1,
  1460. size_t sw = 1, size_t sgh = 3, size_t sgw = 3) {
  1461. size_t ph = fh / 2, pw = fw / 2;
  1462. param.pad_h = ph, param.pad_w = pw;
  1463. param.stride_h = sh, param.stride_w = sw, param.spatial_groups_h = sgh,
  1464. param.spatial_groups_w = sgw;
  1465. Checker checker{make_graph, fwd};
  1466. Checker::RunOptions opt;
  1467. checker.set_output_allow_grad(0, false);
  1468. checker.set_input_dtype(0, dtype::Float32());
  1469. checker.set_input_dtype(1, dtype::Float32());
  1470. auto run = [&](size_t n, size_t ci, size_t co, size_t hi, size_t wi) {
  1471. size_t ho = (hi + 2 * ph - fh) / sh + 1;
  1472. size_t wo = (wi + 2 * pw - fw) / sw + 1;
  1473. if (ho % sgh != 0 || wo % sgw != 0)
  1474. return;
  1475. checker.run({TensorShape{n, ci, hi, wi},
  1476. TensorShape{sgh, sgw, ci, fh, fw, co}},
  1477. opt);
  1478. };
  1479. run(32, 2, 7, 24, 24);
  1480. run(16, 2, 7, 24, 24);
  1481. run(32, 2, 8, 12, 12);
  1482. run(16, 2, 9, 6, 6);
  1483. };
  1484. run_with_param(1, 1, 1, 1, 3, 3);
  1485. run_with_param(3, 3, 1, 1, 2, 2);
  1486. run_with_param(5, 5, 1, 1, 2, 2);
  1487. run_with_param(7, 7, 1, 1, 2, 2);
  1488. run_with_param(1, 1, 2, 2, 3, 3);
  1489. run_with_param(3, 3, 2, 2, 2, 2);
  1490. run_with_param(5, 5, 1, 1, 2, 2);
  1491. run_with_param(7, 7, 1, 1, 2, 2);
  1492. }
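// Same as LocalShareForward but with gradient checking enabled
// (numerical-diff tolerance relaxed to 1e-1).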
  1493. TEST(TestOprDNN, LocalShareForwardGrad) {
  1494. REQUIRE_GPU(1);
  1495. using Checker = AutoOprChecker<2, 1>;
  1496. using Param = opr::LocalShare::Param;
  1497. Param param;
  1498. param.mode = Param::Mode::CROSS_CORRELATION;
  1499. param.sparse = Param::Sparse::DENSE;
  1500. auto make_graph =
  1501. [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  1502. return {opr::LocalShare::make(inputs[0], inputs[1], param)};
  1503. };
  1504. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1505. mgb_assert(inp.size() == 2);
  1506. mgb_assert(dest.size() == 1);
  1507. std::shared_ptr<HostTensorND> out;
  1508. local_share_brute({inp[0], inp[1]}, out, param);
  1509. dest[0] = *out;
  1510. };
  1511. auto run_with_param = [&](size_t fh = 3, size_t fw = 3, size_t sh = 1,
  1512. size_t sw = 1, size_t sgh = 3, size_t sgw = 3) {
  1513. size_t ph = fh / 2, pw = fw / 2;
  1514. param.pad_h = ph, param.pad_w = pw;
  1515. param.stride_h = sh, param.stride_w = sw, param.spatial_groups_h = sgh,
  1516. param.spatial_groups_w = sgw;
  1517. Checker checker{make_graph, fwd};
  1518. Checker::RunOptions opt;
  1519. checker.set_output_allow_grad(0, true);
  1520. opt.numdiff_max_err = 1e-1;
  1521. checker.set_input_dtype(0, dtype::Float32());
  1522. checker.set_input_dtype(1, dtype::Float32());
  1523. auto run = [&](size_t n, size_t ci, size_t co, size_t hi, size_t wi) {
  1524. size_t ho = (hi + 2 * ph - fh) / sh + 1;
  1525. size_t wo = (wi + 2 * pw - fw) / sw + 1;
  1526. if (ho % sgh != 0 || wo % sgw != 0)
  1527. return;
  1528. checker.run({TensorShape{n, ci, hi, wi},
  1529. TensorShape{sgh, sgw, ci, fh, fw, co}},
  1530. opt);
  1531. };
  1532. run(4, 2, 8, 24, 24);
  1533. run(8, 2, 4, 6, 6);
  1534. run(16, 4, 8, 12, 12);
  1535. run(4, 4, 8, 12, 12);
  1536. };
  1537. run_with_param(1, 1, 1, 1, 3, 3);
  1538. run_with_param(1, 1, 2, 2, 3, 3);
  1539. run_with_param(3, 3, 2, 2, 2, 2);
  1540. }
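// Exercise LocalShare under each execution-policy strategy; a
// PersistentCacheHook counts algorithm-cache queries, which should be
// zero for the pure HEURISTIC strategy and non-zero otherwise.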
  1541. TEST(TestOprDNN, LocalShareForwardExecPolicy) {
  1542. REQUIRE_GPU(1);
  1543. using Checker = AutoOprChecker<2, 1>;
  1544. using Policy = opr::LocalShare::ExecutionPolicy;
  1545. using S = Policy::Strategy;
  1546. using Param = opr::LocalShare::Param;
  1547. Param param;
  1548. param.mode = Param::Mode::CROSS_CORRELATION;
  1549. param.sparse = Param::Sparse::DENSE;
  1550. int nr_get = 0;
  1551. auto on_get = [&nr_get](const std::string&, const void*, size_t,
  1552. const void*, size_t) { ++nr_get; };
  1553. PersistentCacheHook cache_hook{on_get};
  1554. #if MGB_ENABLE_FASTRUN
  1555. for (auto strategy :
  1556. SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
  1557. S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTIMIZED}) {
  1558. #else
  1559. for (auto strategy :
1560. SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
  1561. #endif
  1562. auto make_graph = [&](const Checker::SymInpArray& inputs)
  1563. -> Checker::SymOutArray {
  1564. Policy policy;
  1565. policy.strategy = strategy;
  1566. return {opr::LocalShare::make(inputs[0], inputs[1], param, policy)};
  1567. };
  1568. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1569. mgb_assert(inp.size() == 2);
  1570. mgb_assert(dest.size() == 1);
  1571. std::shared_ptr<HostTensorND> out;
  1572. local_share_brute({inp[0], inp[1]}, out, param);
  1573. dest[0] = *out;
  1574. };
  1575. auto run_with_param = [&](size_t fh = 3, size_t fw = 3, size_t sh = 1,
  1576. size_t sw = 1, size_t sgh = 3,
  1577. size_t sgw = 3) {
  1578. size_t ph = fh / 2, pw = fw / 2;
  1579. param.pad_h = ph, param.pad_w = pw;
  1580. param.stride_h = sh, param.stride_w = sw,
  1581. param.spatial_groups_h = sgh, param.spatial_groups_w = sgw;
  1582. Checker checker{make_graph, fwd};
  1583. Checker::RunOptions opt;
  1584. checker.set_output_allow_grad(0, false);
  1585. checker.set_input_dtype(0, dtype::Float32());
  1586. checker.set_input_dtype(1, dtype::Float32());
  1587. nr_get = 0;
  1588. opt.outputs_max_err = 1e-3;
  1589. auto run = [&](size_t n, size_t ci, size_t co, size_t hi,
  1590. size_t wi) {
  1591. size_t ho = (hi + 2 * ph - fh) / sh + 1;
  1592. size_t wo = (wi + 2 * pw - fw) / sw + 1;
  1593. if (ho % sgh != 0 || wo % sgw != 0)
  1594. return;
  1595. checker.run({TensorShape{n, ci, hi, wi},
  1596. TensorShape{sgh, sgw, ci, fh, fw, co}},
  1597. opt);
  1598. };
  1599. run(32, 4, 8, 24, 24);
  1600. run(32, 4, 8, 12, 12);
  1601. run(16, 4, 8, 12, 12);
  1602. run(32, 4, 8, 6, 6);
  1603. if (strategy == S::HEURISTIC) {
  1604. ASSERT_EQ(0, nr_get);
  1605. } else {
  1606. ASSERT_LT(0, nr_get);
  1607. }
  1608. };
  1609. run_with_param(1, 1, 1, 1, 3, 3);
  1610. run_with_param(3, 3, 1, 1, 2, 2);
  1611. run_with_param(5, 5, 1, 1, 2, 2);
  1612. run_with_param(7, 7, 1, 1, 2, 2);
  1613. run_with_param(1, 1, 2, 2, 3, 3);
  1614. run_with_param(3, 3, 2, 2, 2, 2);
  1615. run_with_param(5, 5, 1, 1, 2, 2);
  1616. run_with_param(7, 7, 1, 1, 2, 2);
  1617. }
  1618. }
  1619. TEST(TestOprDNN, LocalShareSerialization) {
  1620. using namespace serialization;
  1621. auto fname = output_file("LocalShareForwardTest");
  1622. auto dump = [&]() {
  1623. opr::LocalShare::Param param;
  1624. param.mode = Mode::CROSS_CORRELATION;
  1625. param.stride_h = param.stride_w = 1;
  1626. param.pad_h = param.pad_w = 0;
  1627. param.spatial_groups_h = param.spatial_groups_w = 3;
  1628. auto cn = CompNode::load("cpu0");
  1629. auto graph = ComputingGraph::make();
  1630. HostTensorND inp_host{cn, {32, 4, 24, 24}, dtype::Float32()};
  1631. HostTensorND filt_host{
  1632. cn, {3, 3, 4, 1, 1, 8}, dtype::Float32()};
  1633. auto inp = opr::ImmutableTensor::make(*graph, inp_host);
  1634. auto filt = opr::ImmutableTensor::make(*graph, filt_host);
  1635. auto opr = opr::LocalShareForward::make(inp, filt, param,
  1636. {});
  1637. auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
  1638. auto rst = dumper->dump({opr});
  1639. ASSERT_EQ(rst.outputs.size(), 1u);
  1640. };
  1641. auto load = [&]() {
  1642. auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
  1643. auto rst = loader->load();
  1644. ASSERT_EQ(rst.output_var_list.size(), 1u);
  1645. };
  1646. dump();
  1647. load();
  1648. }
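// DeformableConvForward checked against the naive megdnn operator; the
// offset input is generated away from integer positions so that the
// numerical gradient of the bilinear sampling stays well defined.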
  1649. TEST(TestOprDNN, DeformableConvForward) {
  1650. REQUIRE_GPU(1);
  1651. using Checker = AutoOprChecker<4, 1>;
  1652. using Policy = opr::DeformableConvForward::ExecutionPolicy;
  1653. using S = Policy::Strategy;
  1654. using Param = opr::DeformableConvForward::Param;
  1655. Param param;
  1656. #if MGB_ENABLE_FASTRUN
  1657. for (auto strategy :
  1658. SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
  1659. S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTIMIZED}) {
  1660. #else
  1661. for (auto strategy :
1662. SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
  1663. #endif
  1664. auto make_graph = [&](const Checker::SymInpArray& inputs)
  1665. -> Checker::SymOutArray {
  1666. Policy policy;
  1667. policy.strategy = strategy;
  1668. return {opr::DeformableConvForward::make(
  1669. inputs[0], inputs[1], inputs[2], inputs[3], param, policy)};
  1670. };
  1671. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1672. auto opr =
  1673. megdnn_naive_handle()
  1674. ->create_operator<megdnn::DeformableConvForward>();
  1675. opr->param() = param;
  1676. TensorLayout dest_layout;
  1677. opr->deduce_layout(inp[0]->layout(), inp[1]->layout(),
  1678. inp[2]->layout(), inp[3]->layout(), dest_layout);
  1679. std::vector<dt_byte> workspace(opr->get_workspace_in_bytes(
  1680. inp[0]->layout(), inp[1]->layout(), inp[2]->layout(),
  1681. inp[3]->layout(), dest_layout));
  1682. dest[0].dtype(dtype::Float32())
  1683. .comp_node(inp[0]->comp_node())
  1684. .resize(dest_layout);
  1685. opr->exec(inp[0]->as_megdnn(), inp[1]->as_megdnn(),
  1686. inp[2]->as_megdnn(), inp[3]->as_megdnn(),
  1687. dest[0].as_megdnn(),
  1688. {workspace.data(), workspace.size()});
  1689. };
  1690. auto run_with_param = [&](size_t fh, size_t fw, size_t sh, size_t sw,
  1691. size_t dh, size_t dw, size_t group,
  1692. size_t deformable_group) {
  1693. Checker checker{make_graph, fwd};
  1694. size_t ph = fh / 2, pw = fw / 2;
  1695. param.pad_h = ph, param.pad_w = pw;
  1696. param.stride_h = sh, param.stride_w = sw;
  1697. param.dilate_h = dh, param.dilate_w = dw;
  1698. param.format = Param::Format::NCHW;
  1699. param.mode = Param::Mode::CROSS_CORRELATION;
  1700. param.sparse = Param::Sparse::DENSE;
  1701. if (group > 1)
  1702. param.sparse = Param::Sparse::GROUP;
  1703. Checker::RunOptions opt;
  1704. float DELTA = 1e-3;
  1705. opt.numdiff_eps = DELTA;
  1706. opt.numdiff_max_err = 1e-1;
  1707. auto gen_off = [DELTA](HostTensorND& off, float l = -2.f, float h = 2.f) {
  1708. RNGxorshf rng{next_rand_seed()};
  1709. auto elems = off.shape().total_nr_elems();
  1710. auto ptr = off.ptr<float>();
  1711. auto rand_real = [](RNGxorshf& rng, float lo, float hi) {
  1712. std::uniform_real_distribution<float> dist(lo, hi);
  1713. return dist(rng);
  1714. };
  1715. for (size_t i = 0; i < elems; ++i) {
  1716. do {
  1717. float val = rand_real(rng, l, h);
  1718. if (abs(floor(val + 2 * DELTA) - floor(val)) <= 1e-6f &&
  1719. abs(floor(val - 2 * DELTA) - floor(val)) <= 1e-6f) {
  1720. ptr[i] = val;
  1721. break;
  1722. }
  1723. } while (true);
  1724. }
  1725. };
  1726. //! generate offset to avoid value near integer
  1727. /// because bilinear function is not derivable over there
  1728. checker.set_input_generator(2, gen_off);
  1729. checker.set_input_dtype(0, dtype::Float32());
  1730. checker.set_input_dtype(1, dtype::Float32());
  1731. checker.set_input_dtype(2, dtype::Float32());
  1732. checker.set_input_dtype(3, dtype::Float32());
  1733. auto run = [&](size_t n, size_t ih, size_t iw, size_t icpg,
  1734. size_t ocpg) {
  1735. size_t oh = (ih + 2 * ph - fh) / sh + 1;
  1736. size_t ow = (iw + 2 * pw - fw) / sw + 1;
  1737. checker.run({TensorShape{n, group * icpg, ih, iw},
  1738. (param.sparse == Param::Sparse::GROUP)
  1739. ? TensorShape{group, ocpg, icpg, fh, fw}
  1740. : TensorShape{group * ocpg, group * icpg,
  1741. fh, fw},
  1742. {n, 2 * deformable_group * fh * fw, oh, ow},
  1743. {n, deformable_group * fh * fw, oh, ow}},
  1744. opt);
  1745. };
  1746. run(1, 3, 3, 2, 1);
  1747. run(2, 3, 3, 2, 2);
  1748. run(1, 5, 5, 2, 1);
  1749. };
  1750. // run_with_param(1, 1, 1, 1, 1, 1, 1, 1);
  1751. run_with_param(3, 3, 1, 1, 1, 1, 2, 2);
  1752. // run_with_param(5, 5, 1, 1, 1, 1, 2, 2);
  1753. }
  1754. }
  1755. TEST(TestOprDNN, DeformableConvSerialization) {
  1756. using namespace serialization;
  1757. auto fname = output_file("DeformableConvTest");
  1758. auto dump = [&]() {
  1759. using Param = opr::DeformableConvForward::Param;
  1760. Param param;
  1761. size_t n = 16, ocpg = 2, icpg = 4;
  1762. size_t ih = 24, iw = 24, fh = 3, fw = 3, ph = 2, pw = 2, sh = 1, sw = 1, dh = 1, dw = 1;
1763. size_t group = 1, deformable_group = 1;
  1764. size_t oh = (ih + 2 * ph - fh) / sh + 1;
  1765. size_t ow = (iw + 2 * pw - fw) / sw + 1;
  1766. param.pad_h = ph, param.pad_w = pw;
  1767. param.stride_h = sh, param.stride_w = sw;
  1768. param.dilate_h = dh, param.dilate_w = dw;
  1769. param.format = Param::Format::NCHW;
  1770. param.mode = Param::Mode::CROSS_CORRELATION;
  1771. param.sparse = Param::Sparse::DENSE;
  1772. auto cn = CompNode::load("cpu0");
  1773. auto graph = ComputingGraph::make();
  1774. HostTensorND inp_host{cn, {n, group * icpg, ih, iw}, dtype::Float32()};
  1775. HostTensorND filt_host{
  1776. cn, {group * ocpg, group * icpg, fh, fw}, dtype::Float32()};
  1777. HostTensorND offset_host{
  1778. cn, {n, 2 * deformable_group * fh * fw, oh, ow}, dtype::Float32()};
  1779. HostTensorND mask_host{
  1780. cn, {n, deformable_group * fh * fw, oh, ow}, dtype::Float32()};
  1781. auto inp = opr::ImmutableTensor::make(*graph, inp_host);
  1782. auto filt = opr::ImmutableTensor::make(*graph, filt_host);
  1783. auto offset = opr::ImmutableTensor::make(*graph, offset_host);
  1784. auto mask = opr::ImmutableTensor::make(*graph, mask_host);
  1785. auto opr = opr::DeformableConvForward::make(inp, filt, offset, mask,
  1786. param, {}, {});
  1787. auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
  1788. auto rst = dumper->dump({opr});
  1789. ASSERT_EQ(rst.outputs.size(), 1u);
  1790. };
  1791. auto load = [&]() {
  1792. auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
  1793. auto rst = loader->load();
  1794. ASSERT_EQ(rst.output_var_list.size(), 1u);
  1795. };
  1796. dump();
  1797. load();
  1798. }
  1799. #if MGB_CUDA
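// Quantized NCHW4 BatchConvBias: the reference path reshapes the batch
// into groups, runs an ordinary grouped convolution, reshapes back and
// adds the bias, with matching quantization type conversions.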
  1800. TEST(TestOprDNN, BatchConvBiasForward) {
  1801. REQUIRE_GPU(1);
  1802. auto cn = CompNode::load("gpu0");
  1803. cn.activate();
  1804. REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);
  1805. using Checker = AutoOprChecker<3, 1>;
  1806. using Policy = opr::BatchConvBiasForward::ExecutionPolicy;
  1807. using S = Policy::Strategy;
  1808. using Param = opr::BatchConvBiasForward::Param;
  1809. Param param;
  1810. param.format = Param::Format::NCHW4;
  1811. param.mode = Param::Mode::CROSS_CORRELATION;
  1812. param.sparse = Param::Sparse::DENSE;
  1813. #if MGB_ENABLE_FASTRUN
  1814. for (auto strategy :
  1815. SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
  1816. S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTIMIZED}) {
  1817. #else
  1818. for (auto strategy :
1819. SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
  1820. #endif
  1821. auto make_quantized = [&](SymbolVar x, const DType& dtype) {
  1822. return opr::TypeCvt::make(x, dtype);
  1823. };
  1824. auto make_graph = [&](const Checker::SymInpArray& inputs)
  1825. -> Checker::SymOutArray {
  1826. Policy policy;
  1827. policy.strategy = strategy;
  1828. auto conv_bias = opr::BatchConvBiasForward::make(
  1829. make_quantized(inputs[0], dtype::QuantizedS8{1.1f}),
  1830. make_quantized(inputs[1], dtype::QuantizedS8{1.2f}),
  1831. make_quantized(inputs[2], dtype::QuantizedS32{1.1f * 1.2f}),
  1832. param, policy,
  1833. OperatorNodeConfig{dtype::QuantizedS8{1.3f}});
  1834. return {opr::TypeCvt::make(conv_bias, dtype::Float32())};
  1835. };
  1836. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1837. mgb_assert(inp.size() == 3);
  1838. mgb_assert(dest.size() == 1);
  1839. auto graph = ComputingGraph::make();
  1840. Checker::SymInpArray inputs;
  1841. for (size_t i = 0; i < inp.size(); ++i) {
  1842. inputs[i] = opr::Host2DeviceCopy::make(*graph, inp[i]);
  1843. }
  1844. auto src = make_quantized(inputs[0], dtype::QuantizedS8{1.1f}),
  1845. filter = make_quantized(inputs[1], dtype::QuantizedS8{1.2f}),
  1846. bias = make_quantized(inputs[2],
  1847. dtype::QuantizedS32{1.1f * 1.2f});
  1848. {
  1849. auto xshp = opr::GetVarShape::make(src);
  1850. auto cv = [&src](int v) { return src.make_scalar(v); };
  1851. auto sub = [&xshp, &cv](int idx) {
  1852. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  1853. };
  1854. auto tshp = opr::Concat::make(
  1855. {cv(1), sub(0) * sub(1), sub(2), sub(3), sub(4)}, 0);
  1856. src = opr::Reshape::make(src, tshp);
  1857. }
  1858. auto conv_param = convert_to_conv_param(param);
  1859. conv_param.sparse = opr::BatchConvBias::Param::Sparse::GROUP;
  1860. auto y = opr::Convolution::make(src, filter, conv_param);
  1861. {
  1862. auto fshp = opr::GetVarShape::make(filter);
  1863. auto batch =
  1864. opr::IndexAt::make(fshp, {{0, filter.make_scalar(0)}});
  1865. auto xshp = opr::GetVarShape::make(y);
  1866. auto cv = [&y](int v) { return y.make_scalar(v); };
  1867. auto sub = [&xshp, &cv](int idx) {
  1868. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  1869. };
  1870. auto tshp = opr::Concat::make(
  1871. {batch, sub(1) / batch, sub(2), sub(3), sub(4)}, 0);
  1872. y = opr::Reshape::make(y, tshp);
  1873. }
  1874. y = y + bias;
  1875. y = opr::TypeCvt::make(y, dtype::QuantizedS8{1.3f});
  1876. y = opr::TypeCvt::make(y, dtype::Float32());
  1877. auto func = graph->compile({make_callback_copy(y, dest[0])});
  1878. func->execute();
  1879. func->wait();
  1880. };
  1881. auto run_with_param = [&](size_t sh = 1, size_t sw = 1) {
  1882. size_t fh = 1;
  1883. size_t fw = 1;
  1884. size_t ph = fh / 2, pw = fw / 2;
  1885. param.pad_h = ph, param.pad_w = pw;
  1886. param.stride_h = sh, param.stride_w = sw;
  1887. Checker checker{make_graph, fwd, cn};
  1888. Checker::RunOptions opt;
  1889. checker.set_output_allow_grad(0, false);
  1890. checker.set_input_dtype(0, dtype::Float32());
  1891. checker.set_input_dtype(1, dtype::Float32());
  1892. checker.set_input_dtype(2, dtype::Float32());
  1893. auto run = [&](size_t n, size_t ci, size_t co, size_t hi,
  1894. size_t wi) {
  1895. checker.run({TensorShape{n, ci / 4, hi, wi, 4},
  1896. TensorShape{n, co, ci / 4, fh, fw, 4},
  1897. TensorShape{1, co / 4, 1, 1, 4}},
  1898. opt);
  1899. };
  1900. run(32, 16, 32, 24, 24);
  1901. run(16, 16, 32, 24, 24);
  1902. run(32, 16, 64, 12, 12);
  1903. run(16, 16, 64, 6, 6);
  1904. };
  1905. run_with_param(1, 1);
  1906. run_with_param(2, 2);
  1907. }
  1908. }
  1909. #endif
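// Dump/load round trip for a quantized BatchConvBiasForward graph.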
  1910. TEST(TestOprDNN, BatchConvBiasSerialization) {
  1911. using namespace serialization;
  1912. auto fname = output_file("BatchConvBiasForwardTest");
  1913. auto dump = [&]() {
  1914. opr::BatchConvBias::Param param;
  1915. param.mode = Mode::CROSS_CORRELATION;
  1916. param.format = opr::BatchConvBias::Param::Format::NCHW4;
  1917. param.stride_h = param.stride_w = 1;
  1918. param.pad_h = param.pad_w = 0;
  1919. auto cn = CompNode::load("cpu0");
  1920. auto graph = ComputingGraph::make();
  1921. HostTensorND inp_host{cn, {32, 1, 24, 24, 4}, dtype::QuantizedS8{1.1f}};
  1922. HostTensorND filt_host{cn, {32, 8, 1, 1, 1, 4}, dtype::QuantizedS8{1.2f}};
  1923. auto inp = opr::ImmutableTensor::make(*graph, inp_host);
  1924. auto filt = opr::ImmutableTensor::make(*graph, filt_host);
  1925. auto opr = opr::BatchConvBiasForward::make(
  1926. inp, filt, param, {},
  1927. OperatorNodeConfig{dtype::QuantizedS8{1.3f}});
  1928. auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
  1929. auto rst = dumper->dump({opr});
  1930. ASSERT_EQ(rst.outputs.size(), 1u);
  1931. };
  1932. auto load = [&]() {
  1933. auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
  1934. auto rst = loader->load();
  1935. ASSERT_EQ(rst.output_var_list.size(), 1u);
  1936. };
  1937. dump();
  1938. load();
  1939. }
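// The HEURISTIC strategy must pick the same ConvolutionBackwardFilter
// algorithm across two independent checker runs; with REPRODUCIBLE
// requested, the chosen algorithm must also carry that attribute.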
  1940. TEST(TestOprDNN, HeuristicReproducible) {
  1941. using Policy = opr::ConvolutionBackwardFilter::ExecutionPolicy;
  1942. using S = Policy::Strategy;
  1943. using Checker = AutoOprChecker<3, 1>;
  1944. constexpr size_t PH = 1, PW = 1, SH = 1, SW = 1;
  1945. for (auto strategy :
  1946. SmallVector<S>{S::HEURISTIC, S::HEURISTIC | S::REPRODUCIBLE}) {
  1947. VarNode* bwd_flt;
  1948. auto make_graph = [&](const Checker::SymInpArray& inputs)
  1949. -> Checker::SymOutArray {
  1950. Param param{Mode::CROSS_CORRELATION, PH, PW, SH, SW};
  1951. Policy policy;
  1952. policy.strategy = strategy;
  1953. auto out = opr::ConvolutionBackwardFilter::make(
  1954. inputs[0], inputs[1], inputs[2], param, policy);
  1955. bwd_flt = out.node();
  1956. return {out};
  1957. };
  1958. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1959. std::shared_ptr<HostTensorND> out;
  1960. conv_bwd_flt_brute({inp[0], inp[1], inp[2]}, out,
  1961. Param{Mode::CROSS_CORRELATION, PH, PW, SH, SW});
  1962. dest[0] = *out;
  1963. };
  1964. #define get_shp(N, P, S, F) ((N + 2 * P - F) / S + 1)
  1965. #define inp_tensor(N, IC, OC, IH, IW, FH, FW) \
  1966. { \
  1967. TensorShape{N, IC, IH, IW}, \
  1968. {N, OC, get_shp(IH, PH, SH, FH), get_shp(IW, PW, SW, FW)}, { \
  1969. OC, IC, FH, FW \
  1970. } \
  1971. }
  1972. Checker::RunOptions opt;
  1973. opt.numdiff_eps = 1;
  1974. opt.outputs_max_err = 1e-3;
  1975. std::string algo_name0, algo_name1;
  1976. {
  1977. Checker checker(make_graph, fwd);
  1978. checker.run(inp_tensor(2, 3, 4, 9, 8, 3, 3), opt)
  1979. .run(inp_tensor(1, 5, 3, 7, 9, 3, 3), opt)
  1980. .run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt);
  1981. auto&& megdnn_opr = static_cast<megdnn::ConvolutionBackwardFilter*>(
  1982. static_cast<opr::ConvolutionBackwardFilter*>(
  1983. bwd_flt->owner_opr())
  1984. ->megdnn_opr());
  1985. auto&& algo = megdnn_opr->execution_policy().algo;
  1986. megdnn::Algorithm* palgo =
  1987. megdnn_opr->get_algorithm_from_desc(algo);
  1988. mgb_assert(palgo, "Unknown algo description");
  1989. if (strategy == S(S::HEURISTIC | S::REPRODUCIBLE)) {
  1990. EXPECT_TRUE(palgo->contain_attribute(
  1991. megdnn::AlgoAttribute::REPRODUCIBLE));
  1992. }
  1993. algo_name0 = palgo->name();
  1994. }
  1995. {
  1996. Checker checker(make_graph, fwd);
  1997. checker.run(inp_tensor(2, 3, 4, 9, 8, 3, 3), opt)
  1998. .run(inp_tensor(1, 5, 3, 7, 9, 3, 3), opt)
  1999. .run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt);
  2000. auto&& megdnn_opr = static_cast<megdnn::ConvolutionBackwardFilter*>(
  2001. static_cast<opr::ConvolutionBackwardFilter*>(
  2002. bwd_flt->owner_opr())
  2003. ->megdnn_opr());
  2004. auto&& algo = megdnn_opr->execution_policy().algo;
  2005. megdnn::Algorithm* palgo =
  2006. megdnn_opr->get_algorithm_from_desc(algo);
  2007. mgb_assert(palgo, "Unknown algo description");
  2008. algo_name1 = palgo->name();
  2009. }
  2010. EXPECT_TRUE(algo_name0 == algo_name1);
  2011. }
  2012. #undef inp_tensor
  2013. #undef get_shp
  2014. }
  2015. #if MGB_CUDA
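// Run two identical quantized ConvBias networks on two streams of the
// same GPU from two threads to exercise concurrent execution.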
  2016. TEST(TestOprDNN, ConvolutionMultiCompNode) {
  2017. REQUIRE_GPU(1);
  2018. auto cn0 = CompNode::load("gpu0:0"), cn1 = CompNode::load("gpu0:1");
  2019. cn0.activate();
  2020. auto&& prop = CompNodeEnv::from_comp_node(cn0).cuda_env().device_prop;
  2021. auto sm_ver = prop.major * 10 + prop.minor;
  2022. if (sm_ver < 61) {
2023. printf("This testcase is ignored due to insufficient cuda cap(got: %d, "
  2024. "expected: %d)\n",
  2025. sm_ver, 61);
  2026. return;
  2027. }
  2028. HostTensorGenerator<dtype::Int8> gen;
  2029. auto mkvar = [&gen](const char* name, const TensorShape& shp,
  2030. const DType& dtype,
  2031. std::shared_ptr<ComputingGraph> graph,
  2032. const CompNode& cn) {
  2033. return opr::TypeCvt::make(
  2034. opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
  2035. dtype);
  2036. };
  2037. auto mkcvar = [&gen](const char* name, const TensorShape& shp,
  2038. const DType& dtype,
  2039. std::shared_ptr<ComputingGraph> graph,
  2040. const CompNode& cn) {
  2041. return opr::TypeCvt::make(
  2042. opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  2043. .rename(name),
  2044. dtype);
  2045. };
  2046. auto graph0 = ComputingGraph::make();
  2047. graph0->options().graph_opt_level = 0;
  2048. auto graph1 = ComputingGraph::make();
  2049. graph1->options().graph_opt_level = 0;
  2050. auto make_func = [&gen, &mkvar, &mkcvar](
  2051. std::shared_ptr<ComputingGraph> graph,
  2052. const CompNode& cn) {
  2053. using Policy = opr::ConvBias::ExecutionPolicy;
  2054. using S = Policy::Strategy;
  2055. auto x = mkvar("x", {64, 32, 28, 28, 4}, dtype::QuantizedS8(2.5f),
  2056. graph, cn),
  2057. w1 = mkcvar("w1", {256, 32, 5, 5, 4}, dtype::QuantizedS8(2.5f),
  2058. graph, cn),
  2059. b1 = mkcvar("b1", {1, 64, 1, 1, 4}, dtype::QuantizedS32(6.25f),
  2060. graph, cn),
  2061. w2 = mkcvar("w2", {256, 64, 3, 3, 4}, dtype::QuantizedS8(2.5f),
  2062. graph, cn),
  2063. b2 = mkcvar("b2", {1, 64, 1, 1, 4}, dtype::QuantizedS32(6.25f),
  2064. graph, cn);
  2065. opr::ConvBias::Param param;
  2066. param.format = opr::ConvBias::Param::Format::NCHW4;
  2067. param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
  2068. param.stride_h = param.stride_w = 2;
  2069. param.pad_h = param.pad_w = 2;
  2070. Policy policy;
  2071. policy.strategy = S::PROFILE;
  2072. auto y = opr::ConvBias::make(
  2073. x, w1, b1, param, policy,
  2074. OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  2075. param.stride_h = param.stride_w = 1;
  2076. param.pad_h = param.pad_w = 1;
  2077. y = opr::ConvBias::make(y, w2, b2, param, policy,
  2078. OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  2079. return y;
  2080. };
  2081. auto y0 = make_func(graph0, cn0);
  2082. auto y1 = make_func(graph1, cn1);
  2083. HostTensorND host_y0, host_y1;
  2084. auto func0 = graph0->compile({make_callback_copy(y0, host_y0)});
  2085. auto func1 = graph1->compile({make_callback_copy(y1, host_y1)});
  2086. auto worker = [&func0, &func1](int wid) {
  2087. static const int iter_num = 1000;
  2088. if (wid == 0) {
  2089. for (int i = 0; i < iter_num; ++i)
  2090. func0->execute();
  2091. } else {
  2092. for (int i = 0; i < iter_num; ++i)
  2093. func1->execute();
  2094. }
  2095. };
  2096. std::thread worker0(worker, 0);
  2097. std::thread worker1(worker, 1);
  2098. worker0.join();
  2099. worker1.join();
  2100. }
  2101. #endif
  2102. } // anonymous namespace
  2103. #ifndef _WIN32
  2104. namespace mgb {
  2105. namespace opr {
  2106. namespace testing {
  2107. class ConvolutionTestingPeer {
  2108. opr::ConvolutionForward& m_conv_opr;
  2109. public:
  2110. explicit ConvolutionTestingPeer(cg::OperatorNodeBase* opr)
  2111. : m_conv_opr(opr->cast_final_safe<opr::ConvolutionForward>()) {}
  2112. void set_megdnn_opr(
  2113. std::unique_ptr<megdnn::ConvolutionForward> megdnn_opr) {
  2114. m_conv_opr.set_megdnn_opr(std::move(megdnn_opr));
  2115. }
  2116. };
  2117. } // namespace testing
  2118. } // namespace opr
  2119. } // namespace mgb
  2120. namespace {
  2121. using megdnn::TensorND;
  2122. using megdnn::Workspace;
  2123. using opr::testing::ConvolutionTestingPeer;
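// gmock-based mock of megdnn::ConvolutionForward, injected through
// ConvolutionTestingPeer so the weight-preprocess tests can observe
// which operator methods get called and with what arguments.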
  2124. class MockConvolutionForward : public megdnn::ConvolutionForward {
  2125. const char* m_algorithm_set_name;
  2126. public:
  2127. MockConvolutionForward(megdnn::ConvolutionForward* orig,
  2128. const char* algo_set_name)
  2129. : megdnn::ConvolutionForward(orig->handle()),
  2130. m_algorithm_set_name(algo_set_name) {}
  2131. MOCK_METHOD5(exec, void(_megdnn_tensor_in src, _megdnn_tensor_in filter,
  2132. _megdnn_tensor_out dst,
  2133. const PreprocessedFilter* preprocessed_filter,
  2134. _megdnn_workspace workspace));
  2135. MOCK_METHOD5(exec_preprocess,
  2136. void(const TensorLayout& src_layout, _megdnn_tensor_in filter,
  2137. const TensorLayout& dst_layout,
  2138. PreprocessedFilter* preprocessed_filter,
  2139. _megdnn_workspace workspace));
  2140. MOCK_METHOD4(get_workspace_in_bytes,
  2141. size_t(const TensorLayout& src, const TensorLayout& filter,
  2142. const TensorLayout& dst,
  2143. const PreprocessedFilter* preprocessed_filter));
  2144. MOCK_METHOD3(deduce_preprocessed_filter_layout,
  2145. SmallVector<TensorLayout>(const TensorLayout& src,
  2146. const TensorLayout& filter,
  2147. const TensorLayout& dst));
  2148. MOCK_METHOD3(get_preprocess_workspace_in_bytes,
  2149. size_t(const TensorLayout& src, const TensorLayout& filter,
  2150. const TensorLayout& dst));
  2151. MOCK_METHOD3(get_all_algorithms_info,
  2152. std::vector<AlgorithmInfo>(const TensorLayout& p0,
  2153. const TensorLayout& p1,
  2154. const TensorLayout& p2));
  2155. MOCK_METHOD5(get_algorithm_info_heuristic,
  2156. AlgorithmInfo(const TensorLayout& p0, const TensorLayout& p1,
  2157. const TensorLayout& p2,
  2158. size_t workspace_limit_in_bytes,
  2159. bool reproducible));
  2160. MOCK_METHOD3(get_all_algorithms,
  2161. std::vector<Algorithm*>(const TensorLayout& p0,
  2162. const TensorLayout& p1,
  2163. const TensorLayout& p2));
  2164. MOCK_METHOD5(get_algorithm_heuristic,
  2165. Algorithm*(const TensorLayout& p0, const TensorLayout& p1,
  2166. const TensorLayout& p2,
  2167. size_t workspace_limit_in_bytes,
  2168. bool reproducible));
  2169. MOCK_METHOD1(get_algorithm_from_desc,
  2170. Algorithm*(const AlgorithmDesc&));
  2171. protected:
  2172. const char* get_algorithm_set_name() const override {
  2173. return m_algorithm_set_name;
  2174. }
  2175. };
  2176. class MockAlgorithm : public megdnn::detail::Algorithm {
  2177. const char* m_name;
  2178. public:
  2179. MockAlgorithm(const char* name = "NotImportant") : m_name(name) {}
  2180. Attribute attribute() const override {
  2181. return Attribute::REPRODUCIBLE;
  2182. }
  2183. const char* name() const override { return m_name; }
  2184. uint32_t type() const override {
  2185. return megdnn::detail::Algorithm::INVALID_ALGO_TYPE;
  2186. }
  2187. virtual ~MockAlgorithm() = default;
  2188. };
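// Fixture that builds a small ConvolutionForward graph, swaps the real
// megdnn operator for the mock above, and toggles the
// graph_opt.weight_preprocess option via is_weight_preprocess().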
  2189. class TestWeightPreprocess : public ::testing::Test {
  2190. protected:
  2191. CompNode comp_node;
  2192. std::shared_ptr<ComputingGraph> graph;
  2193. std::shared_ptr<HostTensorND> x_host;
  2194. MockConvolutionForward* mock_conv_ptr;
  2195. SymbolVar y;
  2196. HostTensorND y_host;
  2197. std::unique_ptr<cg::AsyncExecutable> func;
  2198. MockConvolutionForward& mock_conv() { return *mock_conv_ptr; }
  2199. void SetUp() override {
  2200. constexpr uint32_t ih = 10, ic = 16, oc = 32, ph = 0, sh = 1, fh = 2,
  2201. iw = ih;
  2202. comp_node = CompNode::load("cpux");
  2203. graph = ComputingGraph::make();
  2204. graph->options().graph_opt.weight_preprocess = is_weight_preprocess();
  2205. TensorShape x_shape{1, ic, ih, iw}, w_shape{oc, ic, fh, fh};
  2206. x_host = std::make_shared<HostTensorND>(comp_node, x_shape);
  2207. auto x = opr::Host2DeviceCopy::make(*graph, x_host);
  2208. auto w = opr::ImmutableTensor::make(*graph, {comp_node, w_shape});
  2209. Param param;
  2210. param.pad_h = param.pad_w = ph;
  2211. param.stride_h = param.stride_w = sh;
  2212. param.format = Param::Format::NCHW;
  2213. y = opr::ConvolutionForward::make(x, w, param);
  2214. auto& opr =
  2215. y.node()->owner_opr()->cast_final<opr::ConvolutionForward>();
  2216. auto mock = std::make_unique<MockConvolutionForward>(
  2217. opr.megdnn_opr(), ::testing::UnitTest::GetInstance()
  2218. ->current_test_info()
  2219. ->name());
  2220. mock_conv_ptr = mock.get();
  2221. ConvolutionTestingPeer{&opr}.set_megdnn_opr(std::move(mock));
  2222. func = graph->compile({make_callback_copy(y, y_host)});
  2223. }
  2224. void run() { func->execute().wait(); }
  2225. virtual bool is_weight_preprocess() { return true; }
  2226. void TearDown() override {
  2227. func.reset();
  2228. // Triggers mock check
  2229. graph.reset();
  2230. x_host.reset();
  2231. }
  2232. };
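// An empty preprocessed-filter layout list means no preprocessing:
// exec_preprocess must never be called and exec() receives a null
// preprocessed filter.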
  2233. TEST_F(TestWeightPreprocess, NoPreprocessNeeded) {
  2234. using ::testing::_;
  2235. using ::testing::Return;
  2236. auto& mock = mock_conv();
  2237. MockAlgorithm algo;
  2238. EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _))
  2239. .WillRepeatedly(Return(&algo));
  2240. EXPECT_CALL(mock, get_algorithm_from_desc(_))
  2241. .WillRepeatedly(Return(&algo));
  2242. EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _))
  2243. .WillRepeatedly(Return(0));
  2244. EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
  2245. .WillRepeatedly(Return(0));
  2246. {
  2247. ::testing::InSequence seq;
  2248. // Return empty preprocess filters, indicating no need to preprocess
  2249. EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _))
  2250. .WillRepeatedly(Return(SmallVector<TensorLayout>{}));
  2251. EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _)).Times(0);
  2252. EXPECT_CALL(mock, exec(_, _, _, nullptr, _));
  2253. run();
  2254. }
  2255. }
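// exec_preprocess must run exactly once; subsequent executions reuse the
// preprocessed filter tensors, whose contents are checked in exec().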
  2256. TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) {
  2257. using ::testing::_;
  2258. using ::testing::Return;
  2259. using ::testing::Field;
  2260. using ::testing::Invoke;
  2261. using ::testing::Expectation;
  2262. using PF = MockConvolutionForward::PreprocessedFilter;
  2263. auto& mock = mock_conv();
  2264. MockAlgorithm algo;
  2265. SmallVector<TensorLayout> filter_layout{{{1, 2, 3, 4}, dtype::Float32()},
  2266. {{5, 6, 7, 8}, dtype::Float32()}};
  2267. EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _))
  2268. .WillRepeatedly(Return(filter_layout));
  2269. EXPECT_CALL(mock, get_algorithm_from_desc(_))
  2270. .WillRepeatedly(Return(&algo));
  2271. Expectation algo_call =
  2272. EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _))
  2273. .WillOnce(Return(&algo));
  2274. Expectation ws_call = EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _))
  2275. .After(algo_call)
  2276. .WillOnce(Return(0));
  2277. Expectation pre_ws_call =
  2278. EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
  2279. .After(algo_call)
  2280. .WillOnce(Return(233));
  2281. {
  2282. ::testing::InSequence seq;
  2283. // exec_preprocess should be called only once, with workspace allocated
  2284. int salt = 0;
  2285. EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _))
  2286. .After(ws_call, pre_ws_call)
  2287. .WillOnce(Invoke([&](const TensorLayout&, _megdnn_tensor_in,
  2288. const TensorLayout&, PF* pf,
  2289. _megdnn_workspace workspace) {
  2290. ASSERT_EQ(workspace.size, 233);
  2291. ASSERT_NE(pf, nullptr);
  2292. pf->algorithm_id = &salt;
  2293. ASSERT_EQ(pf->tensors.size(), 2);
  2294. ASSERT_TRUE(pf->tensors[0].layout.eq_shape({1, 2, 3, 4}));
  2295. ASSERT_TRUE(pf->tensors[1].layout.eq_shape({5, 6, 7, 8}));
  2296. ASSERT_NE(pf->tensors[0].raw_ptr, nullptr);
  2297. ASSERT_NE(pf->tensors[1].raw_ptr, nullptr);
  2298. pf->tensors[0].ptr<float>()[0] = 114.514f;
  2299. pf->tensors[1].ptr<float>()[0] = 1926.0817f;
  2300. }));
  2301. // Run the graph multiple times.
  2302. for (int i = 0; i < 3; i++) {
  2303. if (i > 0) {
  2304. EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _)).Times(0);
  2305. }
  2306. EXPECT_CALL(mock, exec(_, _, _, _, _))
  2307. .WillOnce(Invoke([&](_megdnn_tensor_in, _megdnn_tensor_in,
  2308. _megdnn_tensor_out, const PF* pf,
  2309. _megdnn_workspace) {
  2310. ASSERT_NE(pf, nullptr);
  2311. ASSERT_EQ(pf->algorithm_id, &salt);
  2312. ASSERT_EQ(pf->tensors[0].ptr<float>()[0], 114.514f);
  2313. ASSERT_EQ(pf->tensors[1].ptr<float>()[0], 1926.0817f);
  2314. }));
  2315. run();
  2316. }
  2317. }
  2318. }
  2319. class TestNoWeightPreprocess : public TestWeightPreprocess {
  2320. bool is_weight_preprocess() override { return false; }
  2321. };
  2322. TEST_F(TestNoWeightPreprocess, NoPreprocess) {
  2323. using ::testing::_;
  2324. using ::testing::Return;
  2325. auto& mock = mock_conv();
  2326. MockAlgorithm algo;
  2327. EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _))
  2328. .WillRepeatedly(Return(&algo));
  2329. EXPECT_CALL(mock, get_algorithm_from_desc(_))
  2330. .WillRepeatedly(Return(&algo));
  2331. EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _))
  2332. .WillRepeatedly(Return(0));
  2333. EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
  2334. .WillRepeatedly(Return(0));
  2335. {
  2336. ::testing::InSequence seq;
  2337. // Return empty preprocess filters, indicating no need to preprocess
  2338. EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _)).Times(0);
  2339. EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _)).Times(0);
  2340. EXPECT_CALL(mock, exec(_, _, _, nullptr, _));
  2341. run();
  2342. }
  2343. }
  2344. } // anonymous namespace
  2345. #endif
  2346. // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
