/**
 * \file src/opr/test/dnn/convolution.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#include "megbrain/comp_node_env.h"
#include "./legacy_checker.h"
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/test/autocheck.h"
#include "megbrain/test/helper.h"
#include "megbrain/test/megdnn_helper.h"
#include "megbrain/serialization/serializer.h"
#include "megbrain/opr/basic_arith.h"
#include "megbrain/gopt/inference.h"
#include "megbrain/opr/tensor_manip.h"
#include "megdnn/dtype.h"
#include "megdnn/oprs/base.h"
#include <gmock/gmock.h>
#include <cmath>
#include <memory>
#include <random>

using namespace mgb;

namespace {

using Param = opr::Convolution::Param;
using Param3D = opr::Convolution3D::Param;
using Mode = Param::Mode;

Mode modes_to_check[] = {Mode::CONVOLUTION, Mode::CROSS_CORRELATION};

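// Brute-force reference for ConvolutionBackwardData (deconvolution) in NCHW
// layout, covering both DENSE and GROUP sparse modes. The output spatial size
// follows the transposed-convolution formula implemented by get_shp below:
//   O = (I - 1) * stride + (F - 1) * dilate + 1 - 2 * pad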
void conv_bwd_data_brute(const std::vector<std::shared_ptr<HostTensorND>>& inps,
                         std::shared_ptr<HostTensorND>& dest,
                         const opr::ConvolutionBackwardData::Param& param) {
    mgb_assert(param.format == Param::Format::NCHW);
    auto &&data = *inps[0], &&filter = *inps[1];
    size_t N = data.shape(0), IH = data.shape(2), IW = data.shape(3);
    size_t GROUP, ICPG, OCPG, FH, FW;
    if (param.sparse == Param::Sparse::DENSE) {
        GROUP = 1, ICPG = filter.shape(0), OCPG = filter.shape(1),
        FH = filter.shape(2), FW = filter.shape(3);
    } else {
        mgb_assert(param.sparse == Param::Sparse::GROUP);
        GROUP = filter.shape(0), ICPG = filter.shape(1), OCPG = filter.shape(2),
        FH = filter.shape(3), FW = filter.shape(4);
    }
    auto get_shp = [](size_t inp, size_t filter, size_t stride, size_t pad,
                      size_t dilate) {
        return (inp - 1) * stride + (filter - 1) * dilate + 1 - pad * 2;
    };
    size_t OH = get_shp(IH, FH, param.stride_h, param.pad_h, param.dilate_h),
           OW = get_shp(IW, FW, param.stride_w, param.pad_w, param.dilate_w);
    dest = std::make_shared<HostTensorND>(CompNode::load("xpu0"),
                                          TensorShape{N, OCPG * GROUP, OH, OW});
    auto&& out = *dest;
    auto fptr = filter.ptr<float>(), dptr = data.ptr<float>(),
         optr = out.ptr<float>();
    memset(optr, 0, sizeof(float) * out.shape().total_nr_elems());
    auto ol = out.layout(), fl = filter.layout();
#define FOR2(a, A, b, B)               \
    for (size_t a = 0; a < A; ++a)     \
        for (size_t b = 0; b < B; ++b)
#define FOR3(a, A, b, B, c, C) \
    FOR2(a, A, b, B)           \
    for (size_t c = 0; c < C; ++c)
    FOR3(n, N, group, GROUP, icg, ICPG)
    FOR2(ih, IH, iw, IW) {
        float scale = *(dptr++);
        FOR3(ocg, OCPG, fh, FH, fw, FW) {
            auto oc_tot = group * OCPG + ocg;
            int oh = int(ih * param.stride_h + fh * param.dilate_h) -
                     int(param.pad_h),
                ow = int(iw * param.stride_w + fw * param.dilate_w) -
                     int(param.pad_w);
            if (oh >= 0 && ow >= 0 && oh < static_cast<int>(OH) &&
                ow < static_cast<int>(OW)) {
                auto out_off = n * ol.stride[0] + oc_tot * ol.stride[1] +
                               oh * ol.stride[2] + ow;
                size_t flt_off = 0;
                if (param.sparse == Param::Convolution::Sparse::DENSE) {
                    flt_off = icg * fl.stride[0] +
                              ocg * fl.stride[1] + fh * fl.stride[2] + fw;
                } else {
                    flt_off = group * fl.stride[0] + icg * fl.stride[1] +
                              ocg * fl.stride[2] + fh * fl.stride[3] + fw;
                }
                optr[out_off] += scale * fptr[flt_off];
            }
        }
    }
#undef FOR3
#undef FOR2
}

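// Brute-force reference for ConvolutionBackwardFilter: accumulates
// grad[oc][ic][fh][fw] += diff[n][oc][oh][ow] * src[n][ic][ih][iw] over all
// positions, treating out-of-range source pixels as zero (dense NCHW only).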
void conv_bwd_flt_brute(const std::vector<std::shared_ptr<HostTensorND>>& inps,
                        std::shared_ptr<HostTensorND>& out,
                        const opr::ConvolutionBackwardFilter::Param& param) {
    auto &&src = *inps[0], &&diff = *inps[1], &&filter = *inps[2];
    size_t N = src.shape(0), IH = src.shape(2), IW = src.shape(3),
           OC = filter.shape(0), IC = filter.shape(1), FH = filter.shape(2),
           FW = filter.shape(3), OH = diff.shape(2), OW = diff.shape(3);
    out = std::make_shared<HostTensorND>(CompNode::load("xpu0"),
                                         TensorShape{OC, IC, FH, FW});
    auto&& grad = *out;
    auto sptr = src.ptr<float>(), dptr = diff.ptr<float>(),
         gptr = grad.ptr<float>();
    memset(gptr, 0, sizeof(float) * grad.shape().total_nr_elems());
    auto valid = [&](size_t ih, size_t iw) { return ih < IH && iw < IW; };
    for (size_t n = 0; n < N; ++n)
        for (size_t oc = 0; oc < OC; ++oc)
            for (size_t ic = 0; ic < IC; ++ic) {
                for (size_t oh = 0; oh < OH; ++oh)
                    for (size_t ow = 0; ow < OW; ++ow) {
                        for (size_t fh = 0; fh < FH; ++fh)
                            for (size_t fw = 0; fw < FW; ++fw) {
                                size_t ih = oh * param.stride_h + fh -
                                            param.pad_h,
                                       iw = ow * param.stride_w + fw -
                                            param.pad_w;
                                auto src_data =
                                        valid(ih, iw)
                                                ? sptr[(n * IC + ic) * IH * IW +
                                                       ih * IW + iw]
                                                : 0;
                                gptr[(oc * IC + ic) * FH * FW + fh * FW + fw] +=
                                        dptr[(n * OC + oc) * OH * OW + oh * OW +
                                             ow] *
                                        src_data;
                            }
                    }
            }
}

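// Brute-force reference for LocalShare forward: a cross-correlation in which
// each (spatial_groups_h x spatial_groups_w) block of output positions uses
// its own filter slice, selected by (grp_oh_idx, grp_ow_idx).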
void local_share_brute(const std::vector<std::shared_ptr<HostTensorND>>& inps,
                       std::shared_ptr<HostTensorND>& out,
                       const opr::LocalShare::Param& param) {
    auto in = inps[0], filter = inps[1];
    mgb_assert(in->shape().ndim == 4);
    mgb_assert(filter->shape().ndim == 6);
    int batch_size = in->shape()[0], ci = in->shape()[1], hi = in->shape()[2],
        wi = in->shape()[3];
    int fh = filter->shape()[3], fw = filter->shape()[4];
    int ph = param.pad_h, pw = param.pad_w;
    int sh = param.stride_h, sw = param.stride_w;
    int dh = param.dilate_h, dw = param.dilate_w;
    int sgh = filter->shape()[0], sgw = filter->shape()[1];
    mgb_assert(dh == 1 && dw == 1);
    mgb_assert(static_cast<uint32_t>(sgh) == param.spatial_groups_h &&
               static_cast<uint32_t>(sgw) == param.spatial_groups_w);
    int ho = (hi + 2 * ph - fh) / sh + 1;
    int wo = (wi + 2 * pw - fw) / sw + 1;
    mgb_assert(ho % sgh == 0 && wo % sgw == 0);
    int grp_ho = ho / sgh, grp_wo = wo / sgw;
    int co = filter->shape()[5];
    size_t u_batch = batch_size, u_co = co, u_ho = ho, u_wo = wo;
    out = std::make_shared<HostTensorND>(
            CompNode::load("xpu0"), TensorShape{u_batch, u_co, u_ho, u_wo});
    mgb_assert(param.mode == Param::Mode::CROSS_CORRELATION);
    for (int n = 0; n < batch_size; ++n) {
        for (int oc = 0; oc < co; ++oc) {
            for (int oh = 0; oh < ho; ++oh) {
                for (int ow = 0; ow < wo; ++ow) {
                    size_t u_n = n, u_oc = oc, u_oh = oh, u_ow = ow;
                    float& dval = out->ptr<float>({u_n, u_oc, u_oh, u_ow})[0];
                    dval = 0;
                    int grp_oh_idx = oh / grp_ho;
                    int grp_ow_idx = ow / grp_wo;
                    for (int ic = 0; ic < ci; ++ic) {
                        for (int kh = 0; kh < fh; ++kh) {
                            for (int kw = 0; kw < fw; ++kw) {
                                int ih = oh * sh - ph + kh;
                                int iw = ow * sw - pw + kw;
                                float sval = 0.f;
                                float fval = 0.f;
                                if (ih >= 0 && ih < hi && iw >= 0 && iw < wi) {
                                    sval = in->ptr<float>(
                                            {static_cast<size_t>(n),
                                             static_cast<size_t>(ic),
                                             static_cast<size_t>(ih),
                                             static_cast<size_t>(iw)})[0];
                                }
                                fval = filter->ptr<float>(
                                        {static_cast<size_t>(grp_oh_idx),
                                         static_cast<size_t>(grp_ow_idx),
                                         static_cast<size_t>(ic),
                                         static_cast<size_t>(kh),
                                         static_cast<size_t>(kw),
                                         static_cast<size_t>(oc)})[0];
                                dval += fval * sval;
                            }
                        }
                    }
                }
            }
        }
    }
}

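// Naive direct 2D convolution used as the ground truth for the checkers
// below; supports CONVOLUTION (flipped kernel) and CROSS_CORRELATION modes,
// plus padding, stride and dilation.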
void convolution_brute(const std::vector<std::shared_ptr<HostTensorND>> &in_tensor,
                       std::shared_ptr<HostTensorND> &out_tensor,
                       const opr::Convolution::Param &param) {
    mgb_assert(in_tensor.size() == 2);
    auto in = in_tensor[0], filter = in_tensor[1];
    mgb_assert(in->shape().ndim == 4);
    mgb_assert(filter->shape().ndim == 4);
    int batch_size = in->shape().shape[0];
    int ic = in->shape().shape[1];
    int ih = in->shape().shape[2];
    int iw = in->shape().shape[3];
    int fh = filter->shape().shape[2];
    int fw = filter->shape().shape[3];
    int ph = param.pad_h;
    int pw = param.pad_w;
    int sh = param.stride_h;
    int sw = param.stride_w;
    int dh = param.dilate_h;
    int dw = param.dilate_w;
    mgb_assert(ih + 2 * ph >= (fh - 1) * dh + 1);
    mgb_assert(iw + 2 * pw >= (fw - 1) * dw + 1);
    int oh = (ih + 2 * ph - ((fh - 1) * dh + 1)) / sh + 1;
    int ow = (iw + 2 * pw - ((fw - 1) * dw + 1)) / sw + 1;
    mgb_assert(static_cast<size_t>(ic) == filter->shape().shape[1]);
    int oc = filter->shape().shape[0];
    out_tensor = std::make_shared<HostTensorND>(CompNode::load("xpu0"),
            TensorShape{
                    static_cast<size_t>(batch_size),
                    static_cast<size_t>(oc),
                    static_cast<size_t>(oh),
                    static_cast<size_t>(ow)});
    int pn, poc, poh, pow, pih, piw, pic, pfh, pfw;
    for (pn = 0; pn < batch_size; ++pn)
    for (poc = 0; poc < oc; ++poc)
    for (poh = 0, pih = -ph; poh < oh; ++poh, pih += sh)
    for (pow = 0, piw = -pw; pow < ow; ++pow, piw += sw) {
        float &target = out_tensor->ptr<float>({
                static_cast<size_t>(pn),
                static_cast<size_t>(poc),
                static_cast<size_t>(poh),
                static_cast<size_t>(pow)})[0];
        target = 0;
        for (pic = 0; pic < ic; ++pic)
        for (pfh = 0; pfh < fh; ++pfh)
        for (pfw = 0; pfw < fw; ++pfw) {
            int prih, priw;
            float img_data, filter_data;
            if (param.mode == Param::Mode::CONVOLUTION) {
                prih = pih + (fh - pfh - 1) * dh;
                priw = piw + (fw - pfw - 1) * dw;
            } else {
                mgb_assert(param.mode == Param::Mode::CROSS_CORRELATION);
                prih = pih + pfh * dh;
                priw = piw + pfw * dw;
            }
            if (prih >= 0 && prih < ih &&
                priw >= 0 && priw < iw) {
                img_data = in_tensor[0]->ptr<float>({
                        static_cast<size_t>(pn),
                        static_cast<size_t>(pic),
                        static_cast<size_t>(prih),
                        static_cast<size_t>(priw)})[0];
            } else {
                img_data = 0;
            }
            filter_data = filter->ptr<float>({
                    static_cast<size_t>(poc),
                    static_cast<size_t>(pic),
                    static_cast<size_t>(pfh),
                    static_cast<size_t>(pfw)})[0];
            target += img_data * filter_data;
        }
    }
}

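// Copy the convolution-related fields of a ConvBias / BatchConvBias param so
// the plain convolution reference above can be reused for those oprs.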
opr::Convolution::Param convert_to_conv_param(
        const opr::ConvBiasForward::Param& param) {
    return opr::Convolution::Param{
            param.mode,     param.pad_h,    param.pad_w,
            param.stride_h, param.stride_w, param.dilate_h,
            param.dilate_w, param.sparse,   param.format};
};

#if MGB_CUDA
opr::Convolution::Param convert_to_conv_param(
        const opr::BatchConvBiasForward::Param& param) {
    return opr::Convolution::Param{
            param.mode,     param.pad_h,    param.pad_w,
            param.stride_h, param.stride_w, param.dilate_h,
            param.dilate_w, param.sparse,   param.format};
};
#endif

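// Checks the Convolution opr against convolution_brute for every mode listed
// in modes_to_check.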
TEST(TestOprDNN, ConvolutionForward) {
    uint32_t ih = 10, ic = 16, oc = 32, ph = 0, sh = 1, fh = 2;
    for (auto mode : modes_to_check) {
        uint32_t iw = ih + 1, fw = fh + 1, pw = ph + 1, sw = sh + 1;
        Param param{mode, ph, pw, sh, sw};
        size_t batch_size = 32;
        // !!! DEPRECATED. use AutoOprChecker instead.
        opr::test::ForwardChecker<opr::Convolution, 2> forward_checker(
                {{batch_size, ic, ih, iw},
                 {oc, ic, fh, fw}},
                convolution_brute, param);
        forward_checker.run();
    }
}

TEST(TestOprDNN, ConvolutionBackward) {
    uint32_t ih = 10, ic = 16, oc = 32, ph = 0, sh = 1, fh = 2;
    for (auto mode : modes_to_check) {
        uint32_t iw = 11, fw = 4, pw = 1, sw = 3;
        Param param{mode, ph, pw, sh, sw};
        size_t batch_size = 32;
        // !!! DEPRECATED. use AutoOprChecker instead.
        opr::test::BackwardChecker<opr::Convolution, 2> backward_checker(
                {{batch_size, ic, ih, iw},
                 {oc, ic, fh, fw}}, param, 1e-2, 1);
        backward_checker.run();
    }
}

TEST(TestOprDNN, ConvBiasExePolicy) {
    using Param = opr::ConvBias::Param;
    Param param;
    using Policy = opr::ConvBias::ExecutionPolicy;
    using S = Policy::Strategy;
    auto cn = CompNode::load("cpux");
    auto orig_impl = PersistentCache::set_impl(
            std::make_shared<InMemoryPersistentCache>());
    auto run = [&](S strategy) {
        auto graph = ComputingGraph::make();
        HostTensorGenerator<> gen;
        auto mkvar = [&](const char* name, const TensorShape& shp,
                         const DType& dtype) {
            return opr::TypeCvt::make(
                    opr::Host2DeviceCopy::make(*graph, gen(shp), cn)
                            .rename(name),
                    dtype);
        };
        auto x = mkvar("x", {20, 50, 50, 16}, dtype::QuantizedS8(2.5f));
        auto w = mkvar("w", {24, 3, 3, 16}, dtype::QuantizedS8(2.5f));
        auto bias = mkvar("bias", {1, 1, 1, 24}, dtype::QuantizedS32(6.25f));
        param.nonlineMode = Param::NonlineMode::RELU;
        param.format = Param::Format::NHWC;
        Policy policy;
        policy.strategy = strategy;
        auto conv_bias = opr::ConvBias::make(
                x, w, bias, param, policy,
                OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        HostTensorND host_y;
        auto func = graph->compile({make_callback_copy(conv_bias, host_y)});
        func->execute();
        //! set a new cache
        PersistentCache::set_impl(std::make_shared<InMemoryPersistentCache>());
    };
#if MGB_ENABLE_FASTRUN
    for (auto strategy :
         SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
                        S::PROFILE | S::HEURISTIC}) {
#else
    for (auto strategy :
         SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
#endif
        run(strategy);
    }
    ASSERT_THROW(run(S::OPTIMIZED | S::PROFILE), MegBrainError);
    PersistentCache::set_impl(orig_impl);
}

TEST(TestOprDNN, ConvBiasExePolicy_Quantized8Asym) {
    using Param = opr::ConvBias::Param;
    Param param;
    using Policy = opr::ConvBias::ExecutionPolicy;
    using S = Policy::Strategy;
    auto cn = CompNode::load("cpux");
    for (auto strategy :
         SmallVector<S>{S::PROFILE, S::PROFILE | S::REPRODUCIBLE}) {
        auto graph = ComputingGraph::make();
        HostTensorGenerator<> gen;
        auto mkvar = [&](const char* name, const TensorShape& shp,
                         const DType& dtype) {
            return opr::TypeCvt::make(
                    opr::Host2DeviceCopy::make(*graph, gen(shp), cn)
                            .rename(name),
                    dtype);
        };
        auto x = mkvar("x", {20, 50, 50, 16},
                       dtype::Quantized8Asymm(2.5f, static_cast<uint8_t>(0)));
        auto w = mkvar("w", {24, 3, 3, 16},
                       dtype::Quantized8Asymm(2.5f, static_cast<uint8_t>(0)));
        auto bias = mkvar("bias", {1, 1, 1, 24}, dtype::QuantizedS32(6.25f));
        param.nonlineMode = Param::NonlineMode::RELU;
        param.format = Param::Format::NHWC;
        Policy policy;
        policy.strategy = strategy;
        auto conv_bias = opr::ConvBias::make(
                x, w, bias, param, policy,
                OperatorNodeConfig{dtype::Quantized8Asymm(2.5f, static_cast<uint8_t>(0))});
        HostTensorND host_y;
        auto func = graph->compile({make_callback_copy(conv_bias, host_y)});
        func->execute();
    }
}

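// Runs the convolution with each available execution strategy; the
// PersistentCacheHook counts algorithm-cache lookups (nr_get) so the test can
// assert that HEURISTIC never queries the cache while the PROFILE-based
// strategies do.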
TEST(TestOprDNN, ConvolutionExePolicy) {
    Param param{Mode::CONVOLUTION};
    using Policy = opr::Convolution::ExecutionPolicy;
    using S = Policy::Strategy;
    int nr_get = 0;
    auto on_get = [&nr_get](const std::string&, const void*, size_t,
                            const void*, size_t) { ++nr_get; };
    PersistentCacheHook cache_hook{on_get};
#if MGB_ENABLE_FASTRUN
    for (auto strategy :
         SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
                        S::PROFILE | S::HEURISTIC}) {
#else
    for (auto strategy :
         SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
#endif
        using Checker = AutoOprChecker<2, 1>;
        auto make_graph = [&](const Checker::SymInpArray& inputs)
                -> Checker::SymOutArray {
            Policy policy;
            policy.strategy = strategy;
            auto out =
                    opr::Convolution::make(inputs[0], inputs[1], param, policy);
            return {out};
        };
        auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
            std::shared_ptr<HostTensorND> sh_out;
            convolution_brute({inp.begin(), inp.end()}, sh_out, param);
            dest[0] = *sh_out;
        };
        Checker::RunOptions opt;
        opt.numdiff_eps = 1;
        nr_get = 0;
        Checker(make_graph, fwd)
                .run({TensorShape{3, 2, 10, 6}, {4, 2, 3, 2}}, opt)
                .run({TensorShape{6, 3, 8, 13}, {2, 3, 2, 13}}, opt)
                .run({TensorShape{1, 1, 10, 10}, {2, 1, 3, 3}}, opt);
        if (strategy == S::HEURISTIC) {
            ASSERT_EQ(0, nr_get);
        } else {
            ASSERT_LT(0, nr_get);
        }
    }
}

TEST(TestOprDNN, ConvolutionBackwardDataBfloat16ExePolicy) {
    REQUIRE_GPU(1);
    Param param{Mode::CROSS_CORRELATION, 1, 1, 1, 1};
    param.compute_mode = Param::ComputeMode::FLOAT32;
    using Policy = opr::Convolution::ExecutionPolicy;
    using S = Policy::Strategy;
    auto gen_bfp16 = [](HostTensorND& dest) {
        RNGxorshf rng{next_rand_seed()};
        auto rand_real = [&rng]() {
            std::uniform_real_distribution<float> dist(-1, 1);
            return dist(rng);
        };
        auto ptr = dest.ptr<dt_bfloat16>();
        size_t elems = dest.shape().total_nr_elems();
        for (size_t i = 0; i < elems; i++) {
            ptr[i] = dt_bfloat16(rand_real());
        }
    };
    auto f32_to_bf16 = [](const std::shared_ptr<HostTensorND>& src)
            -> std::shared_ptr<HostTensorND> {
        auto ret = std::make_shared<HostTensorND>(
                src->comp_node(), src->shape(), dtype::BFloat16{});
        for (size_t i = 0; i < src->layout().total_nr_elems(); i++) {
            ret->ptr<dt_bfloat16>()[i] = src->ptr<dt_float32>()[i];
        }
        return ret;
    };
    auto bf16_to_f32 = [](const std::shared_ptr<HostTensorND>& src)
            -> std::shared_ptr<HostTensorND> {
        auto ret = std::make_shared<HostTensorND>(
                src->comp_node(), src->shape(), dtype::Float32{});
        for (size_t i = 0; i < src->layout().total_nr_elems(); i++) {
            ret->ptr<dt_float32>()[i] = src->ptr<dt_bfloat16>()[i];
        }
        return ret;
    };
    int nr_get = 0;
    auto on_get = [&nr_get](const std::string&, const void*, size_t,
                            const void*, size_t) { ++nr_get; };
    PersistentCacheHook cache_hook{on_get};
#if MGB_ENABLE_FASTRUN
    for (auto strategy :
         {S::PROFILE, S::HEURISTIC, S(S::PROFILE | S::REPRODUCIBLE),
          S(S::PROFILE | S::HEURISTIC)}) {
#else
    for (auto strategy : {S::HEURISTIC, S(S::PROFILE | S::HEURISTIC)}) {
#endif
        using Checker = AutoOprChecker<2, 1>;
        auto make_graph = [&](const Checker::SymInpArray& inputs)
                -> Checker::SymOutArray {
            Policy policy;
            policy.strategy = strategy;
            return {opr::ConvolutionBackwardData::make_deconv(
                    inputs[0], inputs[1], param, policy)};
        };
        auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
            std::shared_ptr<HostTensorND> out;
            conv_bwd_data_brute(
                    {bf16_to_f32(inp[0]), bf16_to_f32(inp[1])}, out,
                    param);
            dest[0] = *f32_to_bf16(out);
        };
        Checker::RunOptions opt;
        opt.outputs_max_err = 1e-3;
        nr_get = 0;
        Checker(make_graph, fwd)
                .disable_grad_check()
                .set_input_dtype(0, dtype::BFloat16{})
                .set_input_dtype(1, dtype::BFloat16{})
                .set_input_generator(0, gen_bfp16)
                .set_input_generator(1, gen_bfp16)
                .run({TensorShape{3, 4, 10, 6}, {4, 2, 3, 3}}, opt)
                .run({TensorShape{2, 2, 4, 3}, {2, 2, 3, 3}}, opt)
                .run({TensorShape{1, 3, 10, 6}, {3, 2, 3, 3}}, opt);
        if (strategy == S::HEURISTIC) {
            ASSERT_EQ(0, nr_get);
        } else {
            ASSERT_LT(0, nr_get);
        }
    }
}

#if MGB_ENABLE_FASTRUN
TEST(TestOprDNN, ConvolutionBackwardDataFloat16ExePolicy) {
    REQUIRE_GPU(1);
    Param param{Mode::CROSS_CORRELATION, 1, 1, 1, 1};
    param.compute_mode = Param::ComputeMode::FLOAT32;
    using Policy = opr::Convolution::ExecutionPolicy;
    using S = Policy::Strategy;
    auto gen_fp16 = [](HostTensorND& dest) {
        RNGxorshf rng{next_rand_seed()};
        auto rand_real = [&rng]() {
            std::uniform_real_distribution<float> dist(-1, 1);
            return dist(rng);
        };
        auto ptr = dest.ptr<dt_float16>();
        size_t elems = dest.shape().total_nr_elems();
        for (size_t i = 0; i < elems; i++) {
            ptr[i] = dt_float16(rand_real());
        }
    };
    auto f32_to_f16 = [](const std::shared_ptr<HostTensorND>& src)
            -> std::shared_ptr<HostTensorND> {
        auto ret = std::make_shared<HostTensorND>(
                src->comp_node(), src->shape(), dtype::Float16{});
        for (size_t i = 0; i < src->layout().total_nr_elems(); i++) {
            ret->ptr<dt_float16>()[i] = src->ptr<dt_float32>()[i];
        }
        return ret;
    };
    auto f16_to_f32 = [](const std::shared_ptr<HostTensorND>& src)
            -> std::shared_ptr<HostTensorND> {
        auto ret = std::make_shared<HostTensorND>(
                src->comp_node(), src->shape(), dtype::Float32{});
        for (size_t i = 0; i < src->layout().total_nr_elems(); i++) {
            ret->ptr<dt_float32>()[i] = src->ptr<dt_float16>()[i];
        }
        return ret;
    };
    int nr_get = 0;
    auto on_get = [&nr_get](const std::string&, const void*, size_t,
                            const void*, size_t) { ++nr_get; };
    PersistentCacheHook cache_hook{on_get};
    auto strategy = S(S::PROFILE | S::REPRODUCIBLE);
    using Checker = AutoOprChecker<2, 1>;
    auto make_graph =
            [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
        Policy policy;
        policy.strategy = strategy;
        return {opr::ConvolutionBackwardData::make_deconv(inputs[0], inputs[1],
                                                          param, policy)};
    };
    auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        std::shared_ptr<HostTensorND> out;
        conv_bwd_data_brute({f16_to_f32(inp[0]), f16_to_f32(inp[1])}, out,
                            param);
        dest[0] = *f32_to_f16(out);
    };
    Checker::RunOptions opt;
    opt.outputs_max_err = 1e-2;
    nr_get = 0;
    Checker(make_graph, fwd)
            .disable_grad_check()
            .set_input_dtype(0, dtype::Float16{})
            .set_input_dtype(1, dtype::Float16{})
            .set_input_generator(0, gen_fp16)
            .set_input_generator(1, gen_fp16)
            .run({TensorShape{3, 4, 10, 6}, {4, 2, 3, 3}}, opt)
            .run({TensorShape{2, 2, 4, 3}, {2, 2, 3, 3}}, opt)
            .run({TensorShape{1, 3, 10, 6}, {3, 2, 3, 3}}, opt);
    if (strategy == S::HEURISTIC) {
        ASSERT_EQ(0, nr_get);
    } else {
        ASSERT_LT(0, nr_get);
    }
}
#endif

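// Dilated, grouped deconvolution checked against conv_bwd_data_brute; the
// 5-D filter shapes below are {group, icpg, ocpg, fh, fw}.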
TEST(TestOprDNN, Deconvolution) {
    // dilated grouped deconv
    using Checker = AutoOprChecker<2, 1>;
    Param param{Mode::CROSS_CORRELATION, 0, 1, 1, 2};
    param.dilate_h = 2;
    param.sparse = Param::Sparse::GROUP;
    auto make_graph = [&](
            const Checker::SymInpArray &inputs) -> Checker::SymOutArray {
        return {opr::ConvolutionBackwardData::make_deconv(
                inputs[0], inputs[1], param)};
    };
    auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        std::shared_ptr<HostTensorND> out;
        conv_bwd_data_brute({inp[0], inp[1]}, out, param);
        dest[0] = *out;
    };
    Checker::RunOptions opt;
    opt.numdiff_eps = 1;
    Checker(make_graph, fwd).
            run({TensorShape{2, 4, 6, 8}, {1, 4, 5, 3, 2}}, opt).
            run({TensorShape{3, 2, 1, 1}, {2, 1, 1, 4, 3}}, opt).
            run({TensorShape{4, 6, 7, 2}, {2, 3, 4, 8, 13}}, opt);
}

TEST(TestOprDNN, DeconvolutionExePolicy_QuantizedS8) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);
    Param param;
    using Policy = opr::ConvolutionBackwardData::ExecutionPolicy;
    using S = Policy::Strategy;
#if MGB_ENABLE_FASTRUN
    for (auto strategy :
         {S::PROFILE, S::HEURISTIC, S(S::PROFILE | S::REPRODUCIBLE),
          S(S::PROFILE | S::HEURISTIC)}) {
#else
    for (auto strategy : {S::HEURISTIC, S(S::PROFILE | S::HEURISTIC)}) {
#endif
        auto graph = ComputingGraph::make();
        HostTensorGenerator<> gen;
        auto mkvar = [&](const char* name, const TensorShape& shp,
                         const DType& dtype) {
            return opr::TypeCvt::make(
                    opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name),
                    dtype);
        };
        auto x = mkvar("x", {16, 4, 50, 50, 4}, dtype::QuantizedS8(1.2f));
        auto w = mkvar("w", {16, 4, 4, 4, 4}, dtype::QuantizedS8(1.3f));
        param.format = Param::Format::NCHW4;
        param.pad_h = param.pad_w = 2;
        param.stride_h = param.stride_w = 2;
        Policy policy;
        policy.strategy = strategy;
        auto deconv = opr::ConvolutionBackwardData::make_deconv(
                x, w, param, policy,
                OperatorNodeConfig{dtype::QuantizedS8(1.2f)});
        HostTensorND host_y;
        auto func = graph->compile({make_callback_copy(deconv, host_y)});
        func->execute();
    }
}

TEST(TestOprDNN, ConvolutionBackwardFilter) {
    using Checker = AutoOprChecker<3, 1>;
    constexpr size_t PH = 0, PW = 1, SH = 1, SW = 2;
    auto make_graph = [&](
            const Checker::SymInpArray &inputs) -> Checker::SymOutArray {
        Param param{Mode::CROSS_CORRELATION, PH, PW, SH, SW};
        return {opr::ConvolutionBackwardFilter::make(
                inputs[0], inputs[1], inputs[2], param)};
    };
    auto fwd = [&](Checker::NumOutArray &dest, Checker::NumInpArray inp) {
        std::shared_ptr<HostTensorND> out;
        conv_bwd_flt_brute({inp[0], inp[1], inp[2]}, out,
                           Param{Mode::CROSS_CORRELATION, PH, PW, SH, SW});
        dest[0] = *out;
    };
#define get_shp(N, P, S, F) ((N + 2 * P - F) / S + 1)
#define inp_tensor(N, IC, OC, IH, IW, FH, FW)                        \
    { TensorShape{N, IC, IH, IW},                                    \
      {N, OC, get_shp(IH, PH, SH, FH), get_shp(IW, PW, SW, FW)},     \
      {OC, IC, FH, FW} }
    Checker::RunOptions opt;
    opt.numdiff_eps = 1;
    Checker(make_graph, fwd).
            run(inp_tensor(2, 3, 4, 9, 8, 4, 3), opt).
            run(inp_tensor(1, 5, 3, 7, 9, 3, 4), opt).
            run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt);
#undef inp_tensor
#undef get_shp
}

TEST(TestOprDNN, DilatedConvolution) {
    using Checker = AutoOprChecker<2, 1>;
    opr::ConvolutionForward::Param param;
    param.pad_h = 5;
    param.pad_w = 2;
    param.stride_w = 2;
    param.dilate_h = 2;
    auto make_graph = [&](const Checker::SymInpArray &inputs) ->
            Checker::SymOutArray {
        return {opr::Convolution::make(inputs[0], inputs[1], param)};
    };
    auto fwd = [&](Checker::NumOutArray &dest, Checker::NumInpArray inp) {
        auto opr = megdnn_naive_handle()->create_operator<
                megdnn::Convolution>();
        opr->param() = param;
        TensorLayout dest_layout;
        opr->deduce_layout(inp[0]->layout(), inp[1]->layout(), dest_layout);
        std::vector<dt_byte> workspace(opr->get_workspace_in_bytes(
                inp[0]->layout(), inp[1]->layout(), dest_layout, nullptr));
        dest[0].dtype(dtype::Float32()).
                comp_node(inp[0]->comp_node()).resize(dest_layout);
        opr->exec(inp[0]->as_megdnn(), inp[1]->as_megdnn(), dest[0].as_megdnn(),
                  nullptr, {workspace.data(), workspace.size()});
    };
    Checker::RunOptions option;
    option.numdiff_eps = 0.1;
    Checker(make_graph, fwd).
            run({TensorShape{2, 3, 8, 7}, TensorShape{4, 3, 2, 2}}, option).
            run({TensorShape{2, 3, 8, 7}, TensorShape{4, 3, 3, 2}}, option).
            run({TensorShape{2, 3, 8, 9}, TensorShape{4, 3, 3, 2}}, option);
}

TEST(TestOprDNN, GroupConv) {
    using Checker = AutoOprChecker<2, 1>;
    opr::Convolution::Param param;
    param.pad_h = 1;
    param.pad_w = 2;
    param.stride_h = 2;
    auto make_graph = [&](
            const Checker::SymInpArray &inputs) -> Checker::SymOutArray {
        auto p1 = param;
        p1.sparse = opr::Convolution::Param::Sparse::GROUP;
        return {opr::Convolution::make(inputs[0], inputs[1], p1)};
    };
    auto cn = CompNode::load("xpux");
    auto inp0 = std::make_shared<HostTensorND>(cn, dtype::Float32()),
         inp1 = std::make_shared<HostTensorND>(cn, dtype::Float32());
    HostTensorND out_raw;
    auto graph_raw = ComputingGraph::make();
    auto func_raw = graph_raw->compile({
            make_callback_copy(
                    opr::Convolution::make(
                            opr::Host2DeviceCopy::make(*graph_raw, inp0),
                            opr::Host2DeviceCopy::make(*graph_raw, inp1),
                            param),
                    out_raw)});
    auto fwd = [&](Checker::NumOutArray &dest, Checker::NumInpArray inp) {
        auto &&out = dest[0];
        auto sl = inp[0]->layout(),
             fl = inp[1]->layout().remove_axis(0);
        TensorLayout ol;
        auto group = inp[1]->layout()[0];
        sl.shape[1] /= group;
        for (size_t i = 0; i < group; ++ i) {
            inp0->copy_from(inp[0]->sub(SubTensorSpec::make_from_offset_elem(
                    sl, i * sl[1] * sl[2] * sl[3])));
            inp1->copy_from(inp[1]->sub(SubTensorSpec::make_from_offset_elem(
                    fl, i * fl.total_nr_elems())));
            func_raw->execute();
            if (!i) {
                auto oshp = out_raw.shape();
                oshp[1] *= group;
                out.resize(oshp);
                ol = out.layout();
                ol[1] /= group;
            }
            out.sub(SubTensorSpec::make_from_offset_elem(
                    ol, i * ol[1] * ol[2] * ol[3])).copy_from_fixlayout(
                    out_raw);
        }
    };
    Checker::RunOptions opt;
    opt.numdiff_eps = 1;
    opt.outputs_max_err = 5e-5;
    Checker checker{make_graph, fwd};
    auto run = [&](const TensorShape &ishp,
                   size_t fh, size_t fw, size_t oc, size_t group) {
        size_t ic = ishp[1];
        TensorShape flt{group, oc/group, ic/group, fh, fw};
        checker.run({ishp, flt}, opt);
    };
    run({1, 2, 1, 1}, 1, 1, 2, 2);
    run({3, 9, 5, 4}, 1, 2, 6, 3);
    run({3, 6, 8, 9}, 3, 1, 4, 2);
    run({2, 5, 3, 6}, 2, 3, 5, 1);
    run({2, 6, 3, 6}, 2, 3, 6, 6);
}

TEST(TestOprDNN, MaskConvolution) {
    using Checker = AutoOprChecker<3, 1>;
    opr::Convolution::Param param;
    auto make_graph =
            [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
        return {opr::MaskConvolution::make(inputs[0], inputs[1], inputs[2],
                                           param)};
    };
    auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        std::shared_ptr<HostTensorND> sh_out;
        convolution_brute({inp[0], inp[1]}, sh_out, param);
        dest[0] = *sh_out;
        size_t N = dest[0].shape()[0];
        size_t OC = dest[0].shape()[1];
        size_t OH = dest[0].shape()[2];
        size_t OW = dest[0].shape()[3];
        auto mask_ptr = inp[2]->ptr<int8_t>();
        auto dest_ptr = dest[0].ptr<float>();
        for (size_t i = 0; i < N * OC; ++i) {
            for (size_t mask_idx = 0; mask_idx < OH * OW; ++mask_idx) {
                if (mask_ptr[mask_idx] == 0) {
                    dest_ptr[i * OH * OW + mask_idx] = 0;
                }
            }
        }
    };
    auto gen_mask = [](HostTensorND& dest) {
        HostTensorGenerator<dtype::Int8, RandomDistribution::UNIFORM>
                mask_generator{0, 1};
        dest = *mask_generator(dest.shape(), dest.comp_node());
    };
    auto run_with_param = [&](size_t SH = 1, size_t SW = 1, size_t PH = 0,
                              size_t PW = 0) {
        param.pad_h = PH;
        param.pad_w = PW;
        param.stride_h = SH;
        param.stride_w = SW;
        Checker checker{make_graph, fwd};
        Checker::RunOptions opt;
        checker.set_output_allow_grad(0, false);
        checker.set_input_dtype(2, dtype::Int8());
        checker.set_input_generator(2, gen_mask);
        auto run = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW,
                       size_t FH, size_t FW) {
            size_t OH = (IH + 2 * PH - FH) / SH + 1;
            size_t OW = (IW + 2 * PW - FW) / SW + 1;
            checker.run(
                    {TensorShape{N, IC, IH, IW}, {OC, IC, FH, FW}, {OH, OW}},
                    opt);
        };
        run(1, 1, 1, 5, 5, 3, 3);
        run(2, 3, 4, 5, 5, 3, 3);
        run(3, 3, 4, 224, 223, 3, 3);
        run(3, 3, 4, 224, 223, 2, 2);
    };
    run_with_param();
    run_with_param(2, 2, 3, 3);
    run_with_param(3, 2, 1, 2);
    run_with_param(2, 3, 2, 2);
}

TEST(TestOprDNN, MaskPropagate) {
    using Checker = AutoOprChecker<3, 1>;
    opr::MaskPropagate::Param mask_param;
    opr::Convolution::Param conv_param;
    auto make_graph =
            [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
        auto inp_mask = inputs[2];
        auto out_mask = opr::MaskPropagate::make(inp_mask, mask_param);
        return {opr::MaskConvolution::make(inputs[0], inputs[1], out_mask,
                                           conv_param)};
    };
    auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        auto& src = *inp[0];
        auto& mask = *inp[2];
        auto src_ptr = inp[0]->ptr<float>();
        auto mask_ptr = inp[2]->ptr<int>();
        mgb_assert(src.shape()[2] == mask.shape()[0] &&
                   src.shape()[3] == mask.shape()[1]);
        for (size_t i = 0; i < src.shape()[0] * src.shape()[1]; ++i) {
            for (size_t mask_idx = 0;
                 mask_idx < src.shape()[2] * src.shape()[3]; ++mask_idx) {
                if (mask_ptr[mask_idx] == 0) {
                    src_ptr[i * src.layout().stride[1] + mask_idx] = 0;
                }
            }
        }
        std::shared_ptr<HostTensorND> sh_out;
        convolution_brute({inp[0], inp[1]}, sh_out, conv_param);
        dest[0] = *sh_out;
    };
    auto gen_mask = [](HostTensorND& dest) {
        HostTensorGenerator<dtype::Int32, RandomDistribution::UNIFORM>
                mask_generator{0, 1};
        dest = *mask_generator(dest.shape(), dest.comp_node());
    };
    auto run_with_param = [&](size_t FH, size_t FW, size_t SH = 1,
                              size_t SW = 1, size_t PH = 0, size_t PW = 0,
                              size_t DH = 1, size_t DW = 1) {
        conv_param.pad_h = PH;
        conv_param.pad_w = PW;
        conv_param.stride_h = SH;
        conv_param.stride_w = SW;
        conv_param.dilate_h = DH;
        conv_param.dilate_w = DW;
        mask_param.pad_h = PH;
        mask_param.pad_w = PW;
        mask_param.stride_h = SH;
        mask_param.stride_w = SW;
        mask_param.kernel_h = FH;
        mask_param.kernel_w = FW;
        mask_param.dilate_h = DH;
        mask_param.dilate_w = DW;
        Checker checker{make_graph, fwd};
        Checker::RunOptions opt;
        checker.set_output_allow_grad(0, false);
        checker.set_input_dtype(2, dtype::Int32());
        checker.set_input_generator(2, gen_mask);
        auto run = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW) {
            checker.run(
                    {TensorShape{N, IC, IH, IW}, {OC, IC, FH, FW}, {IH, IW}},
                    opt);
        };
        run(1, 1, 1, 5, 5);
        run(2, 3, 4, 5, 5);
        run(3, 3, 4, 224, 223);
        run(3, 3, 4, 224, 223);
    };
    run_with_param(3, 3, 1, 1, 0, 0, 2, 2);
    run_with_param(3, 3, 2, 2, 3, 3);
    run_with_param(4, 2, 3, 2, 1, 2);
    run_with_param(2, 4, 2, 3, 2, 2);
    run_with_param(4, 2, 3, 2, 1, 2, 2, 2);
    run_with_param(2, 4, 2, 3, 2, 2, 2, 1);
}

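// Naive direct 3D convolution (NCDHW) used as the ground truth for the
// Convolution3D checkers below; mirrors convolution_brute with an extra
// depth dimension.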
void convolution3d_brute(const std::vector<std::shared_ptr<HostTensorND>> &in_tensor,
                         std::shared_ptr<HostTensorND> &out_tensor,
                         const opr::Convolution3D::Param &param) {
    mgb_assert(in_tensor.size() == 2);
    auto in = in_tensor[0], filter = in_tensor[1];
    mgb_assert(in->shape().ndim == 5);
    mgb_assert(filter->shape().ndim == 5);
    int batch_size = in->shape().shape[0];
    int ic = in->shape().shape[1];
    int id = in->shape().shape[2];
    int ih = in->shape().shape[3];
    int iw = in->shape().shape[4];
    int fd = filter->shape().shape[2];
    int fh = filter->shape().shape[3];
    int fw = filter->shape().shape[4];
    int pd = param.pad_d;
    int ph = param.pad_h;
    int pw = param.pad_w;
    int sd = param.stride_d;
    int sh = param.stride_h;
    int sw = param.stride_w;
    int dd = param.dilate_d;
    int dh = param.dilate_h;
    int dw = param.dilate_w;
    mgb_assert(id + 2 * pd >= (fd - 1) * dd + 1);
    mgb_assert(ih + 2 * ph >= (fh - 1) * dh + 1);
    mgb_assert(iw + 2 * pw >= (fw - 1) * dw + 1);
    int od = (id + 2 * pd - ((fd - 1) * dd + 1)) / sd + 1;
    int oh = (ih + 2 * ph - ((fh - 1) * dh + 1)) / sh + 1;
    int ow = (iw + 2 * pw - ((fw - 1) * dw + 1)) / sw + 1;
    mgb_assert(static_cast<size_t>(ic) == filter->shape().shape[1]);
    int oc = filter->shape().shape[0];
    out_tensor = std::make_shared<HostTensorND>(CompNode::load("xpu0"),
            TensorShape{
                    static_cast<size_t>(batch_size),
                    static_cast<size_t>(oc),
                    static_cast<size_t>(od),
                    static_cast<size_t>(oh),
                    static_cast<size_t>(ow)});
    int pn, poc, pod, poh, pow,
        pic, pid, pih, piw,
        pfd, pfh, pfw;
    for (pn = 0; pn < batch_size; ++pn)
    for (poc = 0; poc < oc; ++poc)
    for (pod = 0, pid = -pd; pod < od; ++pod, pid += sd)
    for (poh = 0, pih = -ph; poh < oh; ++poh, pih += sh)
    for (pow = 0, piw = -pw; pow < ow; ++pow, piw += sw) {
        float &target = out_tensor->ptr<float>({
                static_cast<size_t>(pn),
                static_cast<size_t>(poc),
                static_cast<size_t>(pod),
                static_cast<size_t>(poh),
                static_cast<size_t>(pow)})[0];
        target = 0;
        for (pic = 0; pic < ic; ++pic)
        for (pfd = 0; pfd < fd; ++pfd)
        for (pfh = 0; pfh < fh; ++pfh)
        for (pfw = 0; pfw < fw; ++pfw) {
            int prid, prih, priw;
            float img_data, filter_data;
            if (param.mode == opr::Convolution3D::Param::Mode::CONVOLUTION) {
                prid = pid + (fd - pfd - 1) * dd;
                prih = pih + (fh - pfh - 1) * dh;
                priw = piw + (fw - pfw - 1) * dw;
            } else {
                mgb_assert(param.mode == opr::Convolution3D::Param::Mode::CROSS_CORRELATION);
                prid = pid + pfd * dd;
                prih = pih + pfh * dh;
                priw = piw + pfw * dw;
            }
            if (prid >= 0 && prid < id &&
                prih >= 0 && prih < ih &&
                priw >= 0 && priw < iw) {
                img_data = in_tensor[0]->ptr<float>({
                        static_cast<size_t>(pn),
                        static_cast<size_t>(pic),
                        static_cast<size_t>(prid),
                        static_cast<size_t>(prih),
                        static_cast<size_t>(priw)})[0];
            } else {
                img_data = 0;
            }
            filter_data = filter->ptr<float>({
                    static_cast<size_t>(poc),
                    static_cast<size_t>(pic),
                    static_cast<size_t>(pfd),
                    static_cast<size_t>(pfh),
                    static_cast<size_t>(pfw)})[0];
            target += img_data * filter_data;
        }
    }
}

TEST(TestOprDNN, Convolution3DForward) {
    for (uint32_t batch_size : {8})
    for (uint32_t id : {12})
    for (uint32_t fd : {1, 3})
    for (uint32_t ic : {4})
    for (uint32_t oc : {ic})
    for (uint32_t pd : {0, 2})
    for (uint32_t sd : {1, 3})
    for (uint32_t dd : {1, 3})
    for (bool xcorr : {0, 1}) {
        uint32_t ih = id + 1, fh = fd, ph = pd + 1, sh = sd + 1;
        uint32_t iw = ih + 1, fw = fh, pw = ph + 1, sw = sh + 1;
        Param3D param{xcorr ? Param3D::Mode::CROSS_CORRELATION :
                              Param3D::Mode::CONVOLUTION, pd, ph, pw,
                      sd, sh, sw, dd, dd, dd};
        // !!! DEPRECATED. use AutoOprChecker instead.
        opr::test::ForwardChecker<opr::Convolution3D, 2> forward_checker(
                {{batch_size, ic, id, ih, iw},
                 {oc, ic, fd, fh, fw}},
                convolution3d_brute, param);
        forward_checker.run();
    }
}

TEST(TestOprDNN, Convolution3DBackward) {
    for (uint32_t batch_size : {8})
    for (uint32_t id : {12})
    for (uint32_t fd : {1, 3})
    for (uint32_t ic : {4})
    for (uint32_t oc : {ic})
    for (uint32_t pd : {0, 2})
    for (uint32_t sd : {1, 3})
    for (uint32_t dd : {1, 3})
    for (bool xcorr : {0, 1}) {
        uint32_t ih = id + 1, fh = fd, ph = pd + 1, sh = sd + 1;
        uint32_t iw = ih + 1, fw = fh, pw = ph + 1, sw = sh + 1;
        Param3D param{xcorr ? Param3D::Mode::CROSS_CORRELATION :
                              Param3D::Mode::CONVOLUTION,
                      pd, ph, pw, sd, sh, sw, dd, dd, dd};
        // !!! DEPRECATED. use AutoOprChecker instead.
        opr::test::BackwardChecker<opr::Convolution3D, 2> backward_checker(
                {{batch_size, ic, id, ih, iw},
                 {oc, ic, fd, fh, fw}}, param, 1e-2, 1);
        backward_checker.run();
    }
}

TEST(TestOprDNN, GroupConv3D) {
    using Checker = AutoOprChecker<2, 1>;
    opr::Convolution3D::Param param;
    param.pad_d = 0;
    param.pad_h = 1;
    param.pad_w = 0;
    param.stride_d = 1;
    param.stride_h = 2;
    auto make_graph = [&](
            const Checker::SymInpArray &inputs) -> Checker::SymOutArray {
        auto p1 = param;
        p1.sparse = opr::Convolution3D::Param::Sparse::GROUP;
        return {opr::Convolution3D::make(inputs[0], inputs[1], p1)};
    };
    auto cn = CompNode::load("xpux");
    auto inp0 = std::make_shared<HostTensorND>(cn, dtype::Float32()),
         inp1 = std::make_shared<HostTensorND>(cn, dtype::Float32());
    HostTensorND out_raw;
    auto graph_raw = ComputingGraph::make();
    auto func_raw = graph_raw->compile({
            make_callback_copy(
                    opr::Convolution3D::make(
                            opr::Host2DeviceCopy::make(*graph_raw, inp0),
                            opr::Host2DeviceCopy::make(*graph_raw, inp1),
                            param),
                    out_raw)});
    auto fwd = [&](Checker::NumOutArray &dest, Checker::NumInpArray inp) {
        auto &&out = dest[0];
        auto sl = inp[0]->layout(),
             fl = inp[1]->layout().remove_axis(0);
        TensorLayout ol;
        auto group = inp[1]->layout()[0];
        sl.shape[1] /= group;
        for (size_t i = 0; i < group; ++ i) {
            inp0->copy_from(inp[0]->sub(SubTensorSpec::make_from_offset_elem(
                    sl, i * sl[1] * sl[2] * sl[3] * sl[4])));
            inp1->copy_from(inp[1]->sub(SubTensorSpec::make_from_offset_elem(
                    fl, i * fl.total_nr_elems())));
            func_raw->execute();
            if (!i) {
                auto oshp = out_raw.shape();
                oshp[1] *= group;
                out.resize(oshp);
                ol = out.layout();
                ol[1] /= group;
            }
            out.sub(SubTensorSpec::make_from_offset_elem(
                    ol, i * ol[1] * ol[2] * ol[3] * ol[4])).
                    copy_from_fixlayout(out_raw);
        }
    };
    Checker::RunOptions opt;
    opt.numdiff_eps = 1;
    opt.outputs_max_err = 5e-5;
    Checker checker{make_graph, fwd};
    auto run = [&](const TensorShape &ishp,
                   size_t fd, size_t fh, size_t fw, size_t oc, size_t group) {
        size_t ic = ishp[1];
        TensorShape flt{group, oc/group, ic/group, fd, fh, fw};
        checker.run({ishp, flt}, opt);
    };
    run({1, 2, 1, 1, 1}, 1, 1, 1, 2, 2);
    run({3, 9, 5, 4, 3}, 1, 2, 3, 6, 3);
    run({2, 1, 3, 6, 9}, 2, 3, 3, 5, 1);
    run({2, 1, 3, 6, 9}, 2, 3, 3, 5, 1);
}

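// Grouped 3D deconvolution; the reference result is computed inline by
// scattering each input element into the output, analogous to
// conv_bwd_data_brute.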
TEST(TestOprDNN, Deconvolution3D) {
    using Checker = AutoOprChecker<2, 1>;
    Param3D param{Param3D::Mode::CROSS_CORRELATION, 0, 1, 1, 1, 2, 2};
    param.sparse = Param3D::Sparse::GROUP;
    auto make_graph = [&](
            const Checker::SymInpArray &inputs) -> Checker::SymOutArray {
        return {opr::Convolution3DBackwardData::make_deconv(
                inputs[0], inputs[1], param)};
    };
    auto fwd = [&](Checker::NumOutArray &dest, Checker::NumInpArray inp) {
        auto &&data = *inp[0], &&filter = *inp[1];
        size_t N = data.shape(0),
               ID = data.shape(2), IH = data.shape(3), IW = data.shape(4),
               GROUP = filter.shape(0),
               ICPG = filter.shape(1), OCPG = filter.shape(2),
               FD = filter.shape(3), FH = filter.shape(4), FW = filter.shape(5);
        auto &&out = dest[0];
        auto get_shp = [](
                size_t inp, size_t filter, size_t stride, size_t pad,
                size_t dilate) {
            return (inp - 1) * stride + (filter - 1) * dilate + 1 - pad * 2;
        };
        size_t OD = get_shp(ID, FD,
                            param.stride_d, param.pad_d, param.dilate_d),
               OH = get_shp(IH, FH,
                            param.stride_h, param.pad_h, param.dilate_h),
               OW = get_shp(IW, FW,
                            param.stride_w, param.pad_w, param.dilate_w);
        out.resize({N, OCPG * GROUP, OD, OH, OW});
        auto fptr = filter.ptr<float>(),
             dptr = data.ptr<float>(),
             optr = out.ptr<float>();
        memset(optr, 0, sizeof(float) * out.shape().total_nr_elems());
        auto ol = out.layout(), fl = filter.layout();
#define FOR2(a, A, b, B)                \
    for (size_t a = 0; a < A; ++ a)     \
        for (size_t b = 0; b < B; ++ b)
#define FOR3(a, A, b, B, c, C) \
    FOR2(a, A, b, B)           \
    for (size_t c = 0; c < C; ++ c)
#define FOR4(a, A, b, B, c, C, d, D) \
    FOR3(a, A, b, B, c, C)           \
    for (size_t d = 0; d < D; ++ d)
        FOR3(n, N, group, GROUP, icg, ICPG)
        FOR3(id, ID, ih, IH, iw, IW) {
            float scale = *(dptr ++);
            FOR4(ocg, OCPG, fd, FD, fh, FH, fw, FW) {
                auto oc_tot = group * OCPG + ocg;
                int od = int(id * param.stride_d +
                             fd * param.dilate_d) - int(param.pad_d),
                    oh = int(ih * param.stride_h +
                             fh * param.dilate_h) - int(param.pad_h),
                    ow = int(iw * param.stride_w +
                             fw * param.dilate_w) - int(param.pad_w);
                if (od >= 0 && oh >= 0 && ow >= 0 &&
                    od < static_cast<int>(OD) &&
                    oh < static_cast<int>(OH) &&
                    ow < static_cast<int>(OW)) {
                    auto out_off = n * ol.stride[0] + oc_tot * ol.stride[1] +
                                   od * ol.stride[2] + oh * ol.stride[3] + ow,
                         flt_off = group * fl.stride[0] + icg * fl.stride[1] +
                                   ocg * fl.stride[2] + fd * fl.stride[3] +
                                   fh * fl.stride[4] + fw;
                    optr[out_off] += scale * fptr[flt_off];
                }
            }
        }
#undef FOR4
#undef FOR3
#undef FOR2
    };
    Checker::RunOptions opt;
    opt.numdiff_eps = 1;
    Checker(make_graph, fwd).
            run({TensorShape{2, 4, 3, 3, 2}, {1, 4, 5, 3, 2, 2}}, opt).
            run({TensorShape{3, 2, 1, 1, 1}, {2, 1, 1, 4, 3, 3}}, opt).
            run({TensorShape{4, 6, 2, 2, 2}, {2, 3, 4, 6, 5, 4}}, opt);
}

  1203. TEST(TestOprDNN, Convolution3DExePolicy) {
  1204. Param3D param{Param3D::Mode::CONVOLUTION};
  1205. using Policy = opr::Convolution3D::ExecutionPolicy;
  1206. using S = Policy::Strategy;
  1207. #if MGB_ENABLE_FASTRUN
  1208. for (auto strategy :
  1209. SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
  1210. S::PROFILE | S::HEURISTIC}) {
  1211. #else
  1212. for (auto strategy :
1213. SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
  1214. #endif
  1215. using Checker = AutoOprChecker<2, 1>;
  1216. auto make_graph = [&](const Checker::SymInpArray &inputs) ->
  1217. Checker::SymOutArray {
  1218. Policy policy;
  1219. policy.strategy = strategy;
  1220. auto out = opr::Convolution3D::make(
  1221. inputs[0], inputs[1], param, policy);
  1222. return {out};
  1223. };
  1224. auto fwd = [&](Checker::NumOutArray &dest, Checker::NumInpArray inp) {
  1225. std::shared_ptr<HostTensorND> sh_out;
  1226. convolution3d_brute({inp.begin(), inp.end()}, sh_out, param);
  1227. dest[0] = *sh_out;
  1228. };
  1229. Checker::RunOptions opt;
  1230. opt.numdiff_eps = 1;
  1231. Checker(make_graph, fwd).
  1232. run({TensorShape{3, 2, 3, 4, 1}, {4, 2, 2, 2, 1}}, opt).
  1233. run({TensorShape{3, 3, 2, 6, 2}, {2, 3, 1, 4, 1}}, opt).
  1234. run({TensorShape{1, 1, 4, 4, 4}, {2, 1, 3, 3, 3}}, opt);
  1235. }
  1236. }
  1237. TEST(TestOprDNN, ConvBiasForward) {
  1238. using Checker2 = AutoOprChecker<2, 1>;
  1239. using Checker3 = AutoOprChecker<3, 1>;
  1240. opr::ConvBiasForward::Param param;
  1241. auto make_graph2 =
  1242. [&](const Checker2::SymInpArray& inputs) -> Checker2::SymOutArray {
  1243. return {opr::ConvBiasForward::make(inputs[0], inputs[1], param)};
  1244. };
  1245. auto make_graph3 =
  1246. [&](const Checker3::SymInpArray& inputs) -> Checker3::SymOutArray {
  1247. return {opr::ConvBiasForward::make(inputs[0], inputs[1], inputs[2],
  1248. param)};
  1249. };
  1250. auto fwd2 = [&](Checker2::NumOutArray& dest, Checker2::NumInpArray inp) {
  1251. std::shared_ptr<HostTensorND> sh_out;
  1252. convolution_brute({inp[0], inp[1]}, sh_out,
  1253. convert_to_conv_param(param));
  1254. dest[0] = *sh_out;
  1255. };
  1256. auto fwd3 = [&](Checker3::NumOutArray& dest, Checker3::NumInpArray inp) {
  1257. std::shared_ptr<HostTensorND> sh_out;
  1258. convolution_brute({inp[0], inp[1]}, sh_out,
  1259. convert_to_conv_param(param));
  1260. dest[0] = *sh_out;
  1261. size_t N = dest[0].shape()[0];
  1262. size_t OC = dest[0].shape()[1];
  1263. size_t OH = dest[0].shape()[2];
  1264. size_t OW = dest[0].shape()[3];
  1265. auto dest_ptr = dest[0].ptr<float>();
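// Add the per-channel bias, broadcast over the batch and spatial dimensions.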
  1266. for (size_t i = 0; i < N; i++) {
  1267. auto bias_ptr = inp[2]->ptr<float>();
  1268. for (size_t c = 0; c < OC; c++) {
  1269. for (size_t hw = 0; hw < OH * OW; hw++) {
  1270. *(dest_ptr++) += *(bias_ptr);
  1271. }
  1272. bias_ptr++;
  1273. }
  1274. }
  1275. };
  1276. auto run_with_param = [&](size_t SH = 1, size_t SW = 1, size_t PH = 0,
  1277. size_t PW = 0) {
  1278. param.pad_h = PH;
  1279. param.pad_w = PW;
  1280. param.stride_h = SH;
  1281. param.stride_w = SW;
  1282. Checker2 checker2{make_graph2, fwd2};
  1283. Checker2::RunOptions opt2;
  1284. checker2.set_output_allow_grad(0, false);
  1285. Checker3 checker3{make_graph3, fwd3};
  1286. Checker3::RunOptions opt3;
  1287. checker3.set_output_allow_grad(0, false);
  1288. auto run = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW,
  1289. size_t FH, size_t FW) {
  1290. auto opr = megdnn_naive_handle()
  1291. ->create_operator<megdnn::ConvolutionForward>();
  1292. opr->param() = convert_to_conv_param(param);
  1293. TensorLayout dest_layout;
  1294. opr->deduce_layout({{N, IC, IH, IW}, dtype::Float32()},
  1295. {{OC, IC, FH, FW}, dtype::Float32()},
  1296. dest_layout);
  1297. checker2.run({TensorShape{N, IC, IH, IW}, {OC, IC, FH, FW}}, opt2);
  1298. checker3.run({TensorShape{N, IC, IH, IW},
  1299. {OC, IC, FH, FW},
  1300. {1, OC, 1, 1}},
  1301. opt3);
  1302. };
  1303. run(1, 1, 1, 5, 5, 1, 1);
  1304. run(1, 1, 1, 5, 5, 3, 3);
  1305. run(2, 3, 4, 5, 5, 3, 3);
  1306. run(3, 3, 4, 224, 223, 3, 3);
  1307. run(3, 3, 4, 224, 223, 2, 2);
  1308. };
  1309. run_with_param();
  1310. run_with_param(2, 2, 3, 3);
  1311. run_with_param(3, 2, 1, 2);
  1312. run_with_param(2, 3, 2, 2);
  1313. }
  1314. TEST(TestOprDNN, ConvBiasForwardWithZ) {
  1315. REQUIRE_GPU(1);
  1316. using Checker4 = AutoOprChecker<4, 1>;
  1317. opr::ConvBiasForward::Param param;
  1318. auto make_graph4 =
  1319. [&](const Checker4::SymInpArray& inputs) -> Checker4::SymOutArray {
  1320. return {opr::ConvBiasForward::make(inputs[0], inputs[1], inputs[2],
  1321. inputs[3], param)};
  1322. };
  1323. auto fwd4 = [&](Checker4::NumOutArray& dest, Checker4::NumInpArray inp) {
  1324. std::shared_ptr<HostTensorND> sh_out;
  1325. convolution_brute({inp[0], inp[1]}, sh_out,
  1326. convert_to_conv_param(param));
  1327. dest[0] = *sh_out;
  1328. size_t N = dest[0].shape()[0];
  1329. size_t OC = dest[0].shape()[1];
  1330. size_t OH = dest[0].shape()[2];
  1331. size_t OW = dest[0].shape()[3];
  1332. auto dest_ptr = dest[0].ptr<float>();
  1333. float* z_ptr = inp[3]->ptr<float>();
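// Add the per-channel bias and the elementwise z term on top of the convolution result.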
  1334. for (size_t i = 0; i < N; i++) {
  1335. auto bias_ptr = inp[2]->ptr<float>();
  1336. for (size_t c = 0; c < OC; c++) {
  1337. for (size_t hw = 0; hw < OH * OW; hw++) {
  1338. *(dest_ptr++) += *(bias_ptr) + *(z_ptr++);
  1339. }
  1340. bias_ptr++;
  1341. }
  1342. }
  1343. };
  1344. auto run_with_param = [&](size_t SH = 1, size_t SW = 1, size_t PH = 0,
  1345. size_t PW = 0) {
  1346. param.pad_h = PH;
  1347. param.pad_w = PW;
  1348. param.stride_h = SH;
  1349. param.stride_w = SW;
  1350. Checker4 checker4{make_graph4, fwd4};
  1351. Checker4::RunOptions opt4;
  1352. checker4.set_output_allow_grad(0, false);
  1353. auto run = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW,
  1354. size_t FH, size_t FW) {
  1355. auto opr = megdnn_naive_handle()
  1356. ->create_operator<megdnn::ConvolutionForward>();
  1357. opr->param() = convert_to_conv_param(param);
  1358. TensorLayout dest_layout;
  1359. opr->deduce_layout({{N, IC, IH, IW}, dtype::Float32()},
  1360. {{OC, IC, FH, FW}, dtype::Float32()},
  1361. dest_layout);
  1362. checker4.run({TensorShape{N, IC, IH, IW},
  1363. {OC, IC, FH, FW},
  1364. {1, OC, 1, 1},
  1365. {N, OC, dest_layout[2], dest_layout[3]}},
  1366. opt4);
  1367. };
  1368. run(1, 1, 1, 5, 5, 3, 3);
  1369. run(2, 3, 4, 5, 5, 3, 3);
  1370. run(3, 3, 4, 224, 223, 3, 3);
  1371. run(3, 3, 4, 224, 223, 2, 2);
  1372. };
  1373. run_with_param();
  1374. run_with_param(2, 2, 3, 3);
  1375. run_with_param(3, 2, 1, 2);
  1376. run_with_param(2, 3, 2, 2);
  1377. }
  1378. TEST(TestOprDNN, ConvBiasINT8x8xX_NCHW4) {
  1379. using Checker = AutoOprChecker<3, 1>;
  1380. using Param = opr::ConvBias::Param;
  1381. opr::ConvBiasForward::Param param;
  1382. auto make_quantized = [&](SymbolVar x, const DType& dtype) {
  1383. return opr::TypeCvt::make(x, dtype);
  1384. };
  1385. auto make_graph =
  1386. [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  1387. auto conv_param = convert_to_conv_param(param);
  1388. auto y = opr::Convolution::make(
  1389. make_quantized(inputs[0], dtype::QuantizedS8(0.3f)),
  1390. make_quantized(inputs[1], dtype::QuantizedS8(0.1f)), conv_param);
  1391. y = y + make_quantized(inputs[2], dtype::QuantizedS32(0.03f));
  1392. if (param.nonlineMode == Param::NonlineMode::RELU)
  1393. y = opr::Elemwise::make(
  1394. {y}, {opr::Elemwise::Mode::RELU});
  1395. y = opr::TypeCvt::make(y, dtype::QuantizedS8(0.5f));
  1396. return {opr::TypeCvt::make(y, dtype::Float32())};
  1397. };
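// Reference path: rebuild the same quantized graph and run it with
// conv-bias-nonlinearity fusion enabled for inference.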
  1398. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1399. auto graph = ComputingGraph::make();
  1400. Checker::SymInpArray inputs;
  1401. for (size_t i = 0; i < inp.size(); ++i) {
  1402. inputs[i] = opr::Host2DeviceCopy::make(
  1403. *graph, inp[i]);
  1404. }
  1405. auto options = gopt::OptimizeForInferenceOptions{};
  1406. options.enable_fuse_conv_bias_nonlinearity();
  1407. auto y = gopt::optimize_for_inference({make_graph(inputs)[0]},
  1408. options)[0];
  1409. auto func = graph->compile({make_callback_copy(y, dest[0])});
  1410. func->execute();
  1411. func->wait();
  1412. };
  1413. auto run_with_param = [&](size_t SH = 1, size_t SW = 1, size_t PH = 0,
  1414. size_t PW = 0, size_t group = 1) {
  1415. param.pad_h = PH;
  1416. param.pad_w = PW;
  1417. param.stride_h = SH;
  1418. param.stride_w = SW;
  1419. param.format = Param::Format::NCHW4;
  1420. if (group != 1)
  1421. param.sparse = Param::Sparse::GROUP;
  1422. Checker checker{make_graph, fwd, CompNode::load("cpu0")};
  1423. Checker::RunOptions opt;
  1424. checker.set_output_allow_grad(0, false);
  1425. auto run = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW,
  1426. size_t FH, size_t FW) {
  1427. mgb_assert(IC % 4 == 0 && OC % 4 == 0);
  1428. checker.run({TensorShape{N, group * IC / 4, IH, IW, 4},
  1429. {group, OC, IC / 4, FH, FW, 4},
  1430. {1, group * OC / 4, 1, 1, 4}},
  1431. opt);
  1432. };
  1433. run(1, 8, 8, 56, 56, 3, 3);
  1434. run(1, 8, 8, 56, 56, 3, 3);
  1435. run(1, 8, 8, 56, 56, 3, 3);
  1436. };
  1437. run_with_param(1, 1, 1, 1, 8);
  1438. run_with_param();
  1439. run_with_param(2, 2, 3, 3);
  1440. run_with_param(3, 2, 1, 2);
  1441. run_with_param(2, 3, 2, 2);
  1442. }
  1443. TEST(TestOprDNN, ConvolutionDTypeInference) {
  1444. Param param;
  1445. param.mode = Mode::CONVOLUTION;
  1446. auto cn = CompNode::load("cpu0");
  1447. auto graph = ComputingGraph::make();
  1448. HostTensorND inp_host{
  1449. cn, {1, 3, 7, 7}, dtype::Quantized8Asymm(0.233f, (uint8_t)123)};
  1450. HostTensorND filt_host{
  1451. cn, {8, 3, 1, 1}, dtype::Quantized8Asymm(0.874f, (uint8_t)234)};
  1452. auto inp = opr::ImmutableTensor::make(*graph, inp_host);
  1453. auto filt = opr::ImmutableTensor::make(*graph, filt_host);
  1454. auto opr = opr::Convolution::make(inp, filt, param);
  1455. ASSERT_EQ(opr.dtype().enumv(), DTypeEnum::QuantizedS32);
  1456. // This has to be EQ instead of NEAR
  1457. EXPECT_EQ(opr.dtype().param<dtype::QuantizedS32>().scale, 0.233f * 0.874f);
  1458. inp_host = {cn, {1, 3, 7, 7}, dtype::QuantizedS8(0.1234f)};
  1459. filt_host = {cn, {8, 3, 1, 1}, dtype::QuantizedS8(0.2345f)};
  1460. inp = opr::ImmutableTensor::make(*graph, inp_host);
  1461. filt = opr::ImmutableTensor::make(*graph, filt_host);
  1462. opr = opr::Convolution::make(inp, filt, param);
  1463. ASSERT_EQ(opr.dtype().enumv(), DTypeEnum::QuantizedS32);
  1464. EXPECT_EQ(opr.dtype().param<dtype::QuantizedS32>().scale,
  1465. 0.1234f * 0.2345f);
  1466. inp_host = {cn, {1, 3, 7, 7}, dtype::Int8()};
  1467. filt_host = {cn, {8, 3, 1, 1}, dtype::Int8()};
  1468. inp = opr::ImmutableTensor::make(*graph, inp_host);
  1469. filt = opr::ImmutableTensor::make(*graph, filt_host);
  1470. opr = opr::Convolution::make(inp, filt, param);
  1471. ASSERT_EQ(opr.dtype().enumv(), DTypeEnum::Int32);
  1472. }
  1473. TEST(TestOprDNN, ConvBiasINT8x8xXDTypeInference) {
  1474. float inp_scale = 1.926f;
  1475. float filt_scale = 0.817f;
  1476. float bias_scale = inp_scale * filt_scale;
  1477. opr::ConvBias::Param param;
  1478. param.mode = Mode::CONVOLUTION;
  1479. auto cn = CompNode::load("cpu0");
  1480. auto graph = ComputingGraph::make();
  1481. HostTensorND inp_host{cn, {1, 3, 7, 7}, dtype::QuantizedS8(inp_scale)};
  1482. HostTensorND filt_host{cn, {8, 3, 1, 1}, dtype::QuantizedS8(filt_scale)};
  1483. DType output_dtype = dtype::QuantizedS8(bias_scale);
  1484. HostTensorND bias_host{cn, {1, 3, 7, 7}, dtype::QuantizedS32(bias_scale)};
  1485. auto inp = opr::ImmutableTensor::make(*graph, inp_host);
  1486. auto filt = opr::ImmutableTensor::make(*graph, filt_host);
1487. auto bias = opr::ImmutableTensor::make(*graph, bias_host);
  1488. auto opr = opr::ConvBiasForward::make(inp, filt, bias, param,
  1489. {}, OperatorNodeConfig{output_dtype});
  1490. ASSERT_EQ(opr.dtype().enumv(), DTypeEnum::QuantizedS8);
  1491. EXPECT_EQ(opr.dtype().param<dtype::QuantizedS8>().scale, bias_scale);
  1492. }
  1493. TEST(TestOprDNN, ConvBiasINT8x8xXSerialization) {
  1494. using namespace serialization;
  1495. float inp_scale = 1.926f;
  1496. float filt_scale = 0.817f;
  1497. float bias_scale = inp_scale * filt_scale;
  1498. DType output_dtype = dtype::QuantizedS8(bias_scale);
  1499. auto fname = output_file("ConvBiasINT8x8xXTest");
  1500. auto dump = [&]() {
  1501. opr::ConvBias::Param param;
  1502. param.mode = Mode::CONVOLUTION;
  1503. auto cn = CompNode::load("cpu0");
  1504. auto graph = ComputingGraph::make();
  1505. HostTensorND inp_host{cn, {1, 3, 7, 7}, dtype::QuantizedS8(inp_scale)};
  1506. HostTensorND filt_host{
  1507. cn, {8, 3, 1, 1}, dtype::QuantizedS8(filt_scale)};
  1508. HostTensorND bias_host{
  1509. cn, {1, 3, 7, 7}, dtype::QuantizedS32(bias_scale)};
  1510. auto inp = opr::ImmutableTensor::make(*graph, inp_host);
  1511. auto filt = opr::ImmutableTensor::make(*graph, filt_host);
1512. auto bias = opr::ImmutableTensor::make(*graph, bias_host);
  1513. auto opr = opr::ConvBiasForward::make(inp, filt, bias, param,
  1514. {},
  1515. OperatorNodeConfig{output_dtype});
  1516. auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
  1517. auto rst = dumper->dump({opr});
  1518. ASSERT_EQ(rst.outputs.size(), 1u);
  1519. };
  1520. auto load = [&]() {
  1521. auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
  1522. auto rst = loader->load();
  1523. ASSERT_EQ(rst.output_var_list.size(), 1u);
  1524. EXPECT_EQ(rst.output_var_list[0].dtype(), output_dtype);
  1525. };
  1526. dump();
  1527. load();
  1528. }
  1529. TEST(TestOprDNN, LocalShareForward) {
  1530. REQUIRE_GPU(1);
  1531. using Checker = AutoOprChecker<2, 1>;
  1532. using Param = opr::LocalShare::Param;
  1533. Param param;
  1534. param.mode = Param::Mode::CROSS_CORRELATION;
  1535. param.sparse = Param::Sparse::DENSE;
  1536. auto make_graph =
  1537. [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  1538. return {opr::LocalShare::make(inputs[0], inputs[1], param)};
  1539. };
  1540. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1541. mgb_assert(inp.size() == 2);
  1542. mgb_assert(dest.size() == 1);
  1543. std::shared_ptr<HostTensorND> out;
  1544. local_share_brute({inp[0], inp[1]}, out, param);
  1545. dest[0] = *out;
  1546. };
  1547. auto run_with_param = [&](size_t fh = 3, size_t fw = 3, size_t sh = 1,
  1548. size_t sw = 1, size_t sgh = 3, size_t sgw = 3) {
  1549. size_t ph = fh / 2, pw = fw / 2;
  1550. param.pad_h = ph, param.pad_w = pw;
  1551. param.stride_h = sh, param.stride_w = sw, param.spatial_groups_h = sgh,
  1552. param.spatial_groups_w = sgw;
  1553. Checker checker{make_graph, fwd};
  1554. Checker::RunOptions opt;
  1555. checker.set_output_allow_grad(0, false);
  1556. checker.set_input_dtype(0, dtype::Float32());
  1557. checker.set_input_dtype(1, dtype::Float32());
  1558. auto run = [&](size_t n, size_t ci, size_t co, size_t hi, size_t wi) {
  1559. size_t ho = (hi + 2 * ph - fh) / sh + 1;
  1560. size_t wo = (wi + 2 * pw - fw) / sw + 1;
  1561. if (ho % sgh != 0 || wo % sgw != 0)
  1562. return;
  1563. checker.run({TensorShape{n, ci, hi, wi},
  1564. TensorShape{sgh, sgw, ci, fh, fw, co}},
  1565. opt);
  1566. };
  1567. run(32, 2, 7, 24, 24);
  1568. run(16, 2, 7, 24, 24);
  1569. run(32, 2, 8, 12, 12);
  1570. run(16, 2, 9, 6, 6);
  1571. };
  1572. run_with_param(1, 1, 1, 1, 3, 3);
  1573. run_with_param(3, 3, 1, 1, 2, 2);
  1574. run_with_param(5, 5, 1, 1, 2, 2);
  1575. run_with_param(7, 7, 1, 1, 2, 2);
  1576. run_with_param(1, 1, 2, 2, 3, 3);
  1577. run_with_param(3, 3, 2, 2, 2, 2);
  1578. run_with_param(5, 5, 1, 1, 2, 2);
  1579. run_with_param(7, 7, 1, 1, 2, 2);
  1580. }
  1581. TEST(TestOprDNN, LocalShareForwardGrad) {
  1582. REQUIRE_GPU(1);
  1583. using Checker = AutoOprChecker<2, 1>;
  1584. using Param = opr::LocalShare::Param;
  1585. Param param;
  1586. param.mode = Param::Mode::CROSS_CORRELATION;
  1587. param.sparse = Param::Sparse::DENSE;
  1588. auto make_graph =
  1589. [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  1590. return {opr::LocalShare::make(inputs[0], inputs[1], param)};
  1591. };
  1592. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1593. mgb_assert(inp.size() == 2);
  1594. mgb_assert(dest.size() == 1);
  1595. std::shared_ptr<HostTensorND> out;
  1596. local_share_brute({inp[0], inp[1]}, out, param);
  1597. dest[0] = *out;
  1598. };
  1599. auto run_with_param = [&](size_t fh = 3, size_t fw = 3, size_t sh = 1,
  1600. size_t sw = 1, size_t sgh = 3, size_t sgw = 3) {
  1601. size_t ph = fh / 2, pw = fw / 2;
  1602. param.pad_h = ph, param.pad_w = pw;
  1603. param.stride_h = sh, param.stride_w = sw, param.spatial_groups_h = sgh,
  1604. param.spatial_groups_w = sgw;
  1605. Checker checker{make_graph, fwd};
  1606. Checker::RunOptions opt;
  1607. checker.set_output_allow_grad(0, true);
  1608. opt.numdiff_max_err = 1e-1;
  1609. checker.set_input_dtype(0, dtype::Float32());
  1610. checker.set_input_dtype(1, dtype::Float32());
  1611. auto run = [&](size_t n, size_t ci, size_t co, size_t hi, size_t wi) {
  1612. size_t ho = (hi + 2 * ph - fh) / sh + 1;
  1613. size_t wo = (wi + 2 * pw - fw) / sw + 1;
  1614. if (ho % sgh != 0 || wo % sgw != 0)
  1615. return;
  1616. checker.run({TensorShape{n, ci, hi, wi},
  1617. TensorShape{sgh, sgw, ci, fh, fw, co}},
  1618. opt);
  1619. };
  1620. run(4, 2, 8, 24, 24);
  1621. run(8, 2, 4, 6, 6);
  1622. run(16, 4, 8, 12, 12);
  1623. run(4, 4, 8, 12, 12);
  1624. };
  1625. run_with_param(1, 1, 1, 1, 3, 3);
  1626. run_with_param(1, 1, 2, 2, 3, 3);
  1627. run_with_param(3, 3, 2, 2, 2, 2);
  1628. }
  1629. TEST(TestOprDNN, LocalShareForwardExecPolicy) {
  1630. REQUIRE_GPU(1);
  1631. using Checker = AutoOprChecker<2, 1>;
  1632. using Policy = opr::LocalShare::ExecutionPolicy;
  1633. using S = Policy::Strategy;
  1634. using Param = opr::LocalShare::Param;
  1635. Param param;
  1636. param.mode = Param::Mode::CROSS_CORRELATION;
  1637. param.sparse = Param::Sparse::DENSE;
  1638. int nr_get = 0;
  1639. auto on_get = [&nr_get](const std::string&, const void*, size_t,
  1640. const void*, size_t) { ++nr_get; };
  1641. PersistentCacheHook cache_hook{on_get};
  1642. #if MGB_ENABLE_FASTRUN
  1643. for (auto strategy :
  1644. SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
  1645. S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTIMIZED}) {
  1646. #else
  1647. for (auto strategy :
1648. SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
  1649. #endif
  1650. auto make_graph = [&](const Checker::SymInpArray& inputs)
  1651. -> Checker::SymOutArray {
  1652. Policy policy;
  1653. policy.strategy = strategy;
  1654. return {opr::LocalShare::make(inputs[0], inputs[1], param, policy)};
  1655. };
  1656. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1657. mgb_assert(inp.size() == 2);
  1658. mgb_assert(dest.size() == 1);
  1659. std::shared_ptr<HostTensorND> out;
  1660. local_share_brute({inp[0], inp[1]}, out, param);
  1661. dest[0] = *out;
  1662. };
  1663. auto run_with_param = [&](size_t fh = 3, size_t fw = 3, size_t sh = 1,
  1664. size_t sw = 1, size_t sgh = 3,
  1665. size_t sgw = 3) {
  1666. size_t ph = fh / 2, pw = fw / 2;
  1667. param.pad_h = ph, param.pad_w = pw;
  1668. param.stride_h = sh, param.stride_w = sw,
  1669. param.spatial_groups_h = sgh, param.spatial_groups_w = sgw;
  1670. Checker checker{make_graph, fwd};
  1671. Checker::RunOptions opt;
  1672. checker.set_output_allow_grad(0, false);
  1673. checker.set_input_dtype(0, dtype::Float32());
  1674. checker.set_input_dtype(1, dtype::Float32());
  1675. nr_get = 0;
  1676. opt.outputs_max_err = 1e-3;
  1677. auto run = [&](size_t n, size_t ci, size_t co, size_t hi,
  1678. size_t wi) {
  1679. size_t ho = (hi + 2 * ph - fh) / sh + 1;
  1680. size_t wo = (wi + 2 * pw - fw) / sw + 1;
  1681. if (ho % sgh != 0 || wo % sgw != 0)
  1682. return;
  1683. checker.run({TensorShape{n, ci, hi, wi},
  1684. TensorShape{sgh, sgw, ci, fh, fw, co}},
  1685. opt);
  1686. };
  1687. run(32, 4, 8, 24, 24);
  1688. run(32, 4, 8, 12, 12);
  1689. run(16, 4, 8, 12, 12);
  1690. run(32, 4, 8, 6, 6);
  1691. if (strategy == S::HEURISTIC) {
  1692. ASSERT_EQ(0, nr_get);
  1693. } else {
  1694. ASSERT_LT(0, nr_get);
  1695. }
  1696. };
  1697. run_with_param(1, 1, 1, 1, 3, 3);
  1698. run_with_param(3, 3, 1, 1, 2, 2);
  1699. run_with_param(5, 5, 1, 1, 2, 2);
  1700. run_with_param(7, 7, 1, 1, 2, 2);
  1701. run_with_param(1, 1, 2, 2, 3, 3);
  1702. run_with_param(3, 3, 2, 2, 2, 2);
  1703. run_with_param(5, 5, 1, 1, 2, 2);
  1704. run_with_param(7, 7, 1, 1, 2, 2);
  1705. }
  1706. }
  1707. TEST(TestOprDNN, LocalShareSerialization) {
  1708. using namespace serialization;
  1709. auto fname = output_file("LocalShareForwardTest");
  1710. auto dump = [&]() {
  1711. opr::LocalShare::Param param;
  1712. param.mode = Mode::CROSS_CORRELATION;
  1713. param.stride_h = param.stride_w = 1;
  1714. param.pad_h = param.pad_w = 0;
  1715. param.spatial_groups_h = param.spatial_groups_w = 3;
  1716. auto cn = CompNode::load("cpu0");
  1717. auto graph = ComputingGraph::make();
  1718. HostTensorND inp_host{cn, {32, 4, 24, 24}, dtype::Float32()};
  1719. HostTensorND filt_host{
  1720. cn, {3, 3, 4, 1, 1, 8}, dtype::Float32()};
  1721. auto inp = opr::ImmutableTensor::make(*graph, inp_host);
  1722. auto filt = opr::ImmutableTensor::make(*graph, filt_host);
  1723. auto opr = opr::LocalShareForward::make(inp, filt, param,
  1724. {});
  1725. auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
  1726. auto rst = dumper->dump({opr});
  1727. ASSERT_EQ(rst.outputs.size(), 1u);
  1728. };
  1729. auto load = [&]() {
  1730. auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
  1731. auto rst = loader->load();
  1732. ASSERT_EQ(rst.output_var_list.size(), 1u);
  1733. };
  1734. dump();
  1735. load();
  1736. }
  1737. TEST(TestOprDNN, DeformableConvForward) {
  1738. REQUIRE_GPU(1);
  1739. using Checker = AutoOprChecker<4, 1>;
  1740. using Policy = opr::DeformableConvForward::ExecutionPolicy;
  1741. using S = Policy::Strategy;
  1742. using Param = opr::DeformableConvForward::Param;
  1743. Param param;
  1744. #if MGB_ENABLE_FASTRUN
  1745. for (auto strategy :
  1746. SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
  1747. S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTIMIZED}) {
  1748. #else
  1749. for (auto strategy :
1750. SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
  1751. #endif
  1752. auto make_graph = [&](const Checker::SymInpArray& inputs)
  1753. -> Checker::SymOutArray {
  1754. Policy policy;
  1755. policy.strategy = strategy;
  1756. return {opr::DeformableConvForward::make(
  1757. inputs[0], inputs[1], inputs[2], inputs[3], param, policy)};
  1758. };
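// Reference path: run the naive megdnn DeformableConvForward operator
// directly on the input host tensors.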
  1759. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1760. auto opr =
  1761. megdnn_naive_handle()
  1762. ->create_operator<megdnn::DeformableConvForward>();
  1763. opr->param() = param;
  1764. TensorLayout dest_layout;
  1765. opr->deduce_layout(inp[0]->layout(), inp[1]->layout(),
  1766. inp[2]->layout(), inp[3]->layout(), dest_layout);
  1767. std::vector<dt_byte> workspace(opr->get_workspace_in_bytes(
  1768. inp[0]->layout(), inp[1]->layout(), inp[2]->layout(),
  1769. inp[3]->layout(), dest_layout));
  1770. dest[0].dtype(dtype::Float32())
  1771. .comp_node(inp[0]->comp_node())
  1772. .resize(dest_layout);
  1773. opr->exec(inp[0]->as_megdnn(), inp[1]->as_megdnn(),
  1774. inp[2]->as_megdnn(), inp[3]->as_megdnn(),
  1775. dest[0].as_megdnn(),
  1776. {workspace.data(), workspace.size()});
  1777. };
  1778. auto run_with_param = [&](size_t fh, size_t fw, size_t sh, size_t sw,
  1779. size_t dh, size_t dw, size_t group,
  1780. size_t deformable_group) {
  1781. Checker checker{make_graph, fwd};
  1782. size_t ph = fh / 2, pw = fw / 2;
  1783. param.pad_h = ph, param.pad_w = pw;
  1784. param.stride_h = sh, param.stride_w = sw;
  1785. param.dilate_h = dh, param.dilate_w = dw;
  1786. param.format = Param::Format::NCHW;
  1787. param.mode = Param::Mode::CROSS_CORRELATION;
  1788. param.sparse = Param::Sparse::DENSE;
  1789. if (group > 1)
  1790. param.sparse = Param::Sparse::GROUP;
  1791. Checker::RunOptions opt;
  1792. float DELTA = 1e-3;
  1793. opt.numdiff_eps = DELTA;
  1794. opt.numdiff_max_err = 1e-1;
  1795. auto gen_off = [DELTA](HostTensorND& off, float l = -2.f, float h = 2.f) {
  1796. RNGxorshf rng{next_rand_seed()};
  1797. auto elems = off.shape().total_nr_elems();
  1798. auto ptr = off.ptr<float>();
  1799. auto rand_real = [](RNGxorshf& rng, float lo, float hi) {
  1800. std::uniform_real_distribution<float> dist(lo, hi);
  1801. return dist(rng);
  1802. };
  1803. for (size_t i = 0; i < elems; ++i) {
  1804. do {
  1805. float val = rand_real(rng, l, h);
  1806. if (abs(floor(val + 2 * DELTA) - floor(val)) <= 1e-6f &&
  1807. abs(floor(val - 2 * DELTA) - floor(val)) <= 1e-6f) {
  1808. ptr[i] = val;
  1809. break;
  1810. }
  1811. } while (true);
  1812. }
  1813. };
  1814. //! generate offset to avoid value near integer
  1815. /// because bilinear function is not derivable over there
  1816. checker.set_input_generator(2, gen_off);
  1817. checker.set_input_dtype(0, dtype::Float32());
  1818. checker.set_input_dtype(1, dtype::Float32());
  1819. checker.set_input_dtype(2, dtype::Float32());
  1820. checker.set_input_dtype(3, dtype::Float32());
  1821. auto run = [&](size_t n, size_t ih, size_t iw, size_t icpg,
  1822. size_t ocpg) {
  1823. size_t oh = (ih + 2 * ph - fh) / sh + 1;
  1824. size_t ow = (iw + 2 * pw - fw) / sw + 1;
  1825. checker.run({TensorShape{n, group * icpg, ih, iw},
  1826. (param.sparse == Param::Sparse::GROUP)
  1827. ? TensorShape{group, ocpg, icpg, fh, fw}
  1828. : TensorShape{group * ocpg, group * icpg,
  1829. fh, fw},
  1830. {n, 2 * deformable_group * fh * fw, oh, ow},
  1831. {n, deformable_group * fh * fw, oh, ow}},
  1832. opt);
  1833. };
  1834. run(1, 3, 3, 2, 1);
  1835. run(2, 3, 3, 2, 2);
  1836. run(1, 5, 5, 2, 1);
  1837. };
  1838. // run_with_param(1, 1, 1, 1, 1, 1, 1, 1);
  1839. run_with_param(3, 3, 1, 1, 1, 1, 2, 2);
  1840. // run_with_param(5, 5, 1, 1, 1, 1, 2, 2);
  1841. }
  1842. }
  1843. TEST(TestOprDNN, DeformableConvSerialization) {
  1844. using namespace serialization;
  1845. auto fname = output_file("DeformableConvTest");
  1846. auto dump = [&]() {
  1847. using Param = opr::DeformableConvForward::Param;
  1848. Param param;
  1849. size_t n = 16, ocpg = 2, icpg = 4;
  1850. size_t ih = 24, iw = 24, fh = 3, fw = 3, ph = 2, pw = 2, sh = 1, sw = 1, dh = 1, dw = 1;
1851. size_t group = 1, deformable_group = 1;
  1852. size_t oh = (ih + 2 * ph - fh) / sh + 1;
  1853. size_t ow = (iw + 2 * pw - fw) / sw + 1;
  1854. param.pad_h = ph, param.pad_w = pw;
  1855. param.stride_h = sh, param.stride_w = sw;
  1856. param.dilate_h = dh, param.dilate_w = dw;
  1857. param.format = Param::Format::NCHW;
  1858. param.mode = Param::Mode::CROSS_CORRELATION;
  1859. param.sparse = Param::Sparse::DENSE;
  1860. auto cn = CompNode::load("cpu0");
  1861. auto graph = ComputingGraph::make();
  1862. HostTensorND inp_host{cn, {n, group * icpg, ih, iw}, dtype::Float32()};
  1863. HostTensorND filt_host{
  1864. cn, {group * ocpg, group * icpg, fh, fw}, dtype::Float32()};
  1865. HostTensorND offset_host{
  1866. cn, {n, 2 * deformable_group * fh * fw, oh, ow}, dtype::Float32()};
  1867. HostTensorND mask_host{
  1868. cn, {n, deformable_group * fh * fw, oh, ow}, dtype::Float32()};
  1869. auto inp = opr::ImmutableTensor::make(*graph, inp_host);
  1870. auto filt = opr::ImmutableTensor::make(*graph, filt_host);
  1871. auto offset = opr::ImmutableTensor::make(*graph, offset_host);
  1872. auto mask = opr::ImmutableTensor::make(*graph, mask_host);
  1873. auto opr = opr::DeformableConvForward::make(inp, filt, offset, mask,
  1874. param, {}, {});
  1875. auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
  1876. auto rst = dumper->dump({opr});
  1877. ASSERT_EQ(rst.outputs.size(), 1u);
  1878. };
  1879. auto load = [&]() {
  1880. auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
  1881. auto rst = loader->load();
  1882. ASSERT_EQ(rst.output_var_list.size(), 1u);
  1883. };
  1884. dump();
  1885. load();
  1886. }
  1887. #if MGB_CUDA
  1888. TEST(TestOprDNN, BatchConvBiasForward) {
  1889. REQUIRE_GPU(1);
  1890. auto cn = CompNode::load("gpu0");
  1891. cn.activate();
  1892. REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);
  1893. using Checker = AutoOprChecker<3, 1>;
  1894. using Policy = opr::BatchConvBiasForward::ExecutionPolicy;
  1895. using S = Policy::Strategy;
  1896. using Param = opr::BatchConvBiasForward::Param;
  1897. Param param;
  1898. param.format = Param::Format::NCHW4;
  1899. param.mode = Param::Mode::CROSS_CORRELATION;
  1900. param.sparse = Param::Sparse::DENSE;
  1901. #if MGB_ENABLE_FASTRUN
  1902. for (auto strategy :
  1903. SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
  1904. S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTIMIZED}) {
  1905. #else
  1906. for (auto strategy :
1907. SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
  1908. #endif
  1909. auto make_quantized = [&](SymbolVar x, const DType& dtype) {
  1910. return opr::TypeCvt::make(x, dtype);
  1911. };
  1912. auto make_graph = [&](const Checker::SymInpArray& inputs)
  1913. -> Checker::SymOutArray {
  1914. Policy policy;
  1915. policy.strategy = strategy;
  1916. auto conv_bias = opr::BatchConvBiasForward::make(
  1917. make_quantized(inputs[0], dtype::QuantizedS8{1.1f}),
  1918. make_quantized(inputs[1], dtype::QuantizedS8{1.2f}),
  1919. make_quantized(inputs[2], dtype::QuantizedS32{1.1f * 1.2f}),
  1920. param, policy,
  1921. OperatorNodeConfig{dtype::QuantizedS8{1.3f}});
  1922. return {opr::TypeCvt::make(conv_bias, dtype::Float32())};
  1923. };
  1924. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1925. mgb_assert(inp.size() == 3);
  1926. mgb_assert(dest.size() == 1);
  1927. auto graph = ComputingGraph::make();
  1928. Checker::SymInpArray inputs;
  1929. for (size_t i = 0; i < inp.size(); ++i) {
  1930. inputs[i] = opr::Host2DeviceCopy::make(*graph, inp[i]);
  1931. }
  1932. auto src = make_quantized(inputs[0], dtype::QuantizedS8{1.1f}),
  1933. filter = make_quantized(inputs[1], dtype::QuantizedS8{1.2f}),
  1934. bias = make_quantized(inputs[2],
  1935. dtype::QuantizedS32{1.1f * 1.2f});
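// Emulate batch convolution: fold the batch dimension into the channel
// dimension, run an ordinary group convolution with group == batch, then
// reshape the result back to (N, OC, OH, OW, 4).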
  1936. {
  1937. auto xshp = opr::GetVarShape::make(src);
  1938. auto cv = [&src](int v) { return src.make_scalar(v); };
  1939. auto sub = [&xshp, &cv](int idx) {
  1940. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  1941. };
  1942. auto tshp = opr::Concat::make(
  1943. {cv(1), sub(0) * sub(1), sub(2), sub(3), sub(4)}, 0);
  1944. src = opr::Reshape::make(src, tshp);
  1945. }
  1946. auto conv_param = convert_to_conv_param(param);
  1947. conv_param.sparse = opr::BatchConvBias::Param::Sparse::GROUP;
  1948. auto y = opr::Convolution::make(src, filter, conv_param);
  1949. {
  1950. auto fshp = opr::GetVarShape::make(filter);
  1951. auto batch =
  1952. opr::IndexAt::make(fshp, {{0, filter.make_scalar(0)}});
  1953. auto xshp = opr::GetVarShape::make(y);
  1954. auto cv = [&y](int v) { return y.make_scalar(v); };
  1955. auto sub = [&xshp, &cv](int idx) {
  1956. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  1957. };
  1958. auto tshp = opr::Concat::make(
  1959. {batch, sub(1) / batch, sub(2), sub(3), sub(4)}, 0);
  1960. y = opr::Reshape::make(y, tshp);
  1961. }
  1962. y = y + bias;
  1963. y = opr::TypeCvt::make(y, dtype::QuantizedS8{1.3f});
  1964. y = opr::TypeCvt::make(y, dtype::Float32());
  1965. auto func = graph->compile({make_callback_copy(y, dest[0])});
  1966. func->execute();
  1967. func->wait();
  1968. };
  1969. auto run_with_param = [&](size_t sh = 1, size_t sw = 1) {
  1970. size_t fh = 1;
  1971. size_t fw = 1;
  1972. size_t ph = fh / 2, pw = fw / 2;
  1973. param.pad_h = ph, param.pad_w = pw;
  1974. param.stride_h = sh, param.stride_w = sw;
  1975. Checker checker{make_graph, fwd, cn};
  1976. Checker::RunOptions opt;
  1977. checker.set_output_allow_grad(0, false);
  1978. checker.set_input_dtype(0, dtype::Float32());
  1979. checker.set_input_dtype(1, dtype::Float32());
  1980. checker.set_input_dtype(2, dtype::Float32());
  1981. auto run = [&](size_t n, size_t ci, size_t co, size_t hi,
  1982. size_t wi) {
  1983. checker.run({TensorShape{n, ci / 4, hi, wi, 4},
  1984. TensorShape{n, co, ci / 4, fh, fw, 4},
  1985. TensorShape{1, co / 4, 1, 1, 4}},
  1986. opt);
  1987. };
  1988. run(32, 16, 32, 24, 24);
  1989. run(16, 16, 32, 24, 24);
  1990. run(32, 16, 64, 12, 12);
  1991. run(16, 16, 64, 6, 6);
  1992. };
  1993. run_with_param(1, 1);
  1994. run_with_param(2, 2);
  1995. }
  1996. }
  1997. #endif
  1998. TEST(TestOprDNN, BatchConvBiasSerialization) {
  1999. using namespace serialization;
  2000. auto fname = output_file("BatchConvBiasForwardTest");
  2001. auto dump = [&]() {
  2002. opr::BatchConvBias::Param param;
  2003. param.mode = Mode::CROSS_CORRELATION;
  2004. param.format = opr::BatchConvBias::Param::Format::NCHW4;
  2005. param.stride_h = param.stride_w = 1;
  2006. param.pad_h = param.pad_w = 0;
  2007. auto cn = CompNode::load("cpu0");
  2008. auto graph = ComputingGraph::make();
  2009. HostTensorND inp_host{cn, {32, 1, 24, 24, 4}, dtype::QuantizedS8{1.1f}};
  2010. HostTensorND filt_host{cn, {32, 8, 1, 1, 1, 4}, dtype::QuantizedS8{1.2f}};
  2011. auto inp = opr::ImmutableTensor::make(*graph, inp_host);
  2012. auto filt = opr::ImmutableTensor::make(*graph, filt_host);
  2013. auto opr = opr::BatchConvBiasForward::make(
  2014. inp, filt, param, {},
  2015. OperatorNodeConfig{dtype::QuantizedS8{1.3f}});
  2016. auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
  2017. auto rst = dumper->dump({opr});
  2018. ASSERT_EQ(rst.outputs.size(), 1u);
  2019. };
  2020. auto load = [&]() {
  2021. auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
  2022. auto rst = loader->load();
  2023. ASSERT_EQ(rst.output_var_list.size(), 1u);
  2024. };
  2025. dump();
  2026. load();
  2027. }
  2028. TEST(TestOprDNN, HeuristicReproducible) {
  2029. using Policy = opr::ConvolutionBackwardFilter::ExecutionPolicy;
  2030. using S = Policy::Strategy;
  2031. using Checker = AutoOprChecker<3, 1>;
  2032. constexpr size_t PH = 1, PW = 1, SH = 1, SW = 1;
  2033. for (auto strategy :
  2034. SmallVector<S>{S::HEURISTIC, S::HEURISTIC | S::REPRODUCIBLE}) {
  2035. VarNode* bwd_flt;
  2036. auto make_graph = [&](const Checker::SymInpArray& inputs)
  2037. -> Checker::SymOutArray {
  2038. Param param{Mode::CROSS_CORRELATION, PH, PW, SH, SW};
  2039. Policy policy;
  2040. policy.strategy = strategy;
  2041. auto out = opr::ConvolutionBackwardFilter::make(
  2042. inputs[0], inputs[1], inputs[2], param, policy);
  2043. bwd_flt = out.node();
  2044. return {out};
  2045. };
  2046. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  2047. std::shared_ptr<HostTensorND> out;
  2048. conv_bwd_flt_brute({inp[0], inp[1], inp[2]}, out,
  2049. Param{Mode::CROSS_CORRELATION, PH, PW, SH, SW});
  2050. dest[0] = *out;
  2051. };
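// Helper macros: convolution output spatial size, and the (src, out-grad,
// filter-grad) shape triple fed to ConvolutionBackwardFilter.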
  2052. #define get_shp(N, P, S, F) ((N + 2 * P - F) / S + 1)
  2053. #define inp_tensor(N, IC, OC, IH, IW, FH, FW) \
  2054. { \
  2055. TensorShape{N, IC, IH, IW}, \
  2056. {N, OC, get_shp(IH, PH, SH, FH), get_shp(IW, PW, SW, FW)}, { \
  2057. OC, IC, FH, FW \
  2058. } \
  2059. }
  2060. Checker::RunOptions opt;
  2061. opt.numdiff_eps = 1;
  2062. opt.outputs_max_err = 1e-3;
  2063. std::string algo_name0, algo_name1;
  2064. {
  2065. Checker checker(make_graph, fwd);
  2066. checker.run(inp_tensor(2, 3, 4, 9, 8, 3, 3), opt)
  2067. .run(inp_tensor(1, 5, 3, 7, 9, 3, 3), opt)
  2068. .run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt);
  2069. auto&& megdnn_opr = static_cast<megdnn::ConvolutionBackwardFilter*>(
  2070. static_cast<opr::ConvolutionBackwardFilter*>(
  2071. bwd_flt->owner_opr())
  2072. ->megdnn_opr());
  2073. auto&& algo = megdnn_opr->execution_policy().algo;
  2074. megdnn::Algorithm* palgo =
  2075. megdnn_opr->get_algorithm_from_desc(algo);
  2076. mgb_assert(palgo, "Unknown algo description");
  2077. if (strategy == S(S::HEURISTIC | S::REPRODUCIBLE)) {
  2078. EXPECT_TRUE(palgo->contain_attribute_all(
  2079. megdnn::AlgoAttribute::REPRODUCIBLE));
  2080. }
  2081. algo_name0 = palgo->name();
  2082. }
  2083. {
  2084. Checker checker(make_graph, fwd);
  2085. checker.run(inp_tensor(2, 3, 4, 9, 8, 3, 3), opt)
  2086. .run(inp_tensor(1, 5, 3, 7, 9, 3, 3), opt)
  2087. .run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt);
  2088. auto&& megdnn_opr = static_cast<megdnn::ConvolutionBackwardFilter*>(
  2089. static_cast<opr::ConvolutionBackwardFilter*>(
  2090. bwd_flt->owner_opr())
  2091. ->megdnn_opr());
  2092. auto&& algo = megdnn_opr->execution_policy().algo;
  2093. megdnn::Algorithm* palgo =
  2094. megdnn_opr->get_algorithm_from_desc(algo);
  2095. mgb_assert(palgo, "Unknown algo description");
  2096. algo_name1 = palgo->name();
  2097. }
  2098. EXPECT_TRUE(algo_name0 == algo_name1);
  2099. }
  2100. #undef inp_tensor
  2101. #undef get_shp
  2102. }
  2103. #if MGB_CUDA
  2104. TEST(TestOprDNN, ConvolutionMultiCompNode) {
  2105. REQUIRE_GPU(1);
  2106. auto cn0 = CompNode::load("gpu0:0"), cn1 = CompNode::load("gpu0:1");
  2107. cn0.activate();
  2108. auto&& prop = CompNodeEnv::from_comp_node(cn0).cuda_env().device_prop;
  2109. auto sm_ver = prop.major * 10 + prop.minor;
  2110. if (sm_ver < 61) {
  2111. printf("This testcast ignored due to insufficient cuda cap(got: %d, "
  2112. "expected: %d)\n",
  2113. sm_ver, 61);
  2114. return;
  2115. }
  2116. HostTensorGenerator<dtype::Int8> gen;
  2117. auto mkvar = [&gen](const char* name, const TensorShape& shp,
  2118. const DType& dtype,
  2119. std::shared_ptr<ComputingGraph> graph,
  2120. const CompNode& cn) {
  2121. return opr::TypeCvt::make(
  2122. opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
  2123. dtype);
  2124. };
  2125. auto mkcvar = [&gen](const char* name, const TensorShape& shp,
  2126. const DType& dtype,
  2127. std::shared_ptr<ComputingGraph> graph,
  2128. const CompNode& cn) {
  2129. return opr::TypeCvt::make(
  2130. opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  2131. .rename(name),
  2132. dtype);
  2133. };
  2134. auto graph0 = ComputingGraph::make();
  2135. graph0->options().graph_opt_level = 0;
  2136. auto graph1 = ComputingGraph::make();
  2137. graph1->options().graph_opt_level = 0;
  2138. auto make_func = [&gen, &mkvar, &mkcvar](
  2139. std::shared_ptr<ComputingGraph> graph,
  2140. const CompNode& cn) {
  2141. using Policy = opr::ConvBias::ExecutionPolicy;
  2142. using S = Policy::Strategy;
  2143. auto x = mkvar("x", {64, 32, 28, 28, 4}, dtype::QuantizedS8(2.5f),
  2144. graph, cn),
  2145. w1 = mkcvar("w1", {256, 32, 5, 5, 4}, dtype::QuantizedS8(2.5f),
  2146. graph, cn),
  2147. b1 = mkcvar("b1", {1, 64, 1, 1, 4}, dtype::QuantizedS32(6.25f),
  2148. graph, cn),
  2149. w2 = mkcvar("w2", {256, 64, 3, 3, 4}, dtype::QuantizedS8(2.5f),
  2150. graph, cn),
  2151. b2 = mkcvar("b2", {1, 64, 1, 1, 4}, dtype::QuantizedS32(6.25f),
  2152. graph, cn);
  2153. opr::ConvBias::Param param;
  2154. param.format = opr::ConvBias::Param::Format::NCHW4;
  2155. param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
  2156. param.stride_h = param.stride_w = 2;
  2157. param.pad_h = param.pad_w = 2;
  2158. Policy policy;
  2159. policy.strategy = S::PROFILE;
  2160. auto y = opr::ConvBias::make(
  2161. x, w1, b1, param, policy,
  2162. OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  2163. param.stride_h = param.stride_w = 1;
  2164. param.pad_h = param.pad_w = 1;
  2165. y = opr::ConvBias::make(y, w2, b2, param, policy,
  2166. OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  2167. return y;
  2168. };
  2169. auto y0 = make_func(graph0, cn0);
  2170. auto y1 = make_func(graph1, cn1);
  2171. HostTensorND host_y0, host_y1;
  2172. auto func0 = graph0->compile({make_callback_copy(y0, host_y0)});
  2173. auto func1 = graph1->compile({make_callback_copy(y1, host_y1)});
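// Execute the two compiled graphs concurrently from two threads, each bound
// to its own comp node on the same GPU.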
  2174. auto worker = [&func0, &func1](int wid) {
  2175. static const int iter_num = 1000;
  2176. if (wid == 0) {
  2177. for (int i = 0; i < iter_num; ++i)
  2178. func0->execute();
  2179. } else {
  2180. for (int i = 0; i < iter_num; ++i)
  2181. func1->execute();
  2182. }
  2183. };
  2184. std::thread worker0(worker, 0);
  2185. std::thread worker1(worker, 1);
  2186. worker0.join();
  2187. worker1.join();
  2188. }
  2189. #endif
  2190. } // anonymous namespace
  2191. #ifndef _WIN32
  2192. namespace mgb {
  2193. namespace opr {
  2194. namespace testing {
  2195. class ConvolutionTestingPeer {
  2196. opr::ConvolutionForward& m_conv_opr;
  2197. public:
  2198. explicit ConvolutionTestingPeer(cg::OperatorNodeBase* opr)
  2199. : m_conv_opr(opr->cast_final_safe<opr::ConvolutionForward>()) {}
  2200. void set_megdnn_opr(
  2201. std::unique_ptr<megdnn::ConvolutionForward> megdnn_opr) {
  2202. m_conv_opr.set_megdnn_opr(std::move(megdnn_opr));
  2203. }
  2204. };
  2205. } // namespace testing
  2206. } // namespace opr
  2207. } // namespace mgb
  2208. namespace {
  2209. using megdnn::TensorND;
  2210. using megdnn::Workspace;
  2211. using opr::testing::ConvolutionTestingPeer;
  2212. class MockConvolutionForward : public megdnn::ConvolutionForward {
  2213. const char* m_algorithm_set_name;
  2214. public:
  2215. MockConvolutionForward(megdnn::ConvolutionForward* orig,
  2216. const char* algo_set_name)
  2217. : megdnn::ConvolutionForward(orig->handle()),
  2218. m_algorithm_set_name(algo_set_name) {}
  2219. MOCK_METHOD5(exec, void(_megdnn_tensor_in src, _megdnn_tensor_in filter,
  2220. _megdnn_tensor_out dst,
  2221. const PreprocessedFilter* preprocessed_filter,
  2222. _megdnn_workspace workspace));
  2223. MOCK_METHOD5(exec_preprocess,
  2224. void(const TensorLayout& src_layout, _megdnn_tensor_in filter,
  2225. const TensorLayout& dst_layout,
  2226. PreprocessedFilter* preprocessed_filter,
  2227. _megdnn_workspace workspace));
  2228. MOCK_METHOD4(get_workspace_in_bytes,
  2229. size_t(const TensorLayout& src, const TensorLayout& filter,
  2230. const TensorLayout& dst,
  2231. const PreprocessedFilter* preprocessed_filter));
  2232. MOCK_METHOD3(deduce_preprocessed_filter_layout,
  2233. SmallVector<TensorLayout>(const TensorLayout& src,
  2234. const TensorLayout& filter,
  2235. const TensorLayout& dst));
  2236. MOCK_METHOD3(get_preprocess_workspace_in_bytes,
  2237. size_t(const TensorLayout& src, const TensorLayout& filter,
  2238. const TensorLayout& dst));
  2239. MOCK_METHOD3(get_all_algorithms_info,
  2240. std::vector<AlgorithmInfo>(const TensorLayout& p0,
  2241. const TensorLayout& p1,
  2242. const TensorLayout& p2));
  2243. MOCK_METHOD6(get_algorithm_info_heuristic,
  2244. AlgorithmInfo(const TensorLayout& p0, const TensorLayout& p1,
  2245. const TensorLayout& p2,
  2246. size_t workspace_limit_in_bytes,
  2247. const AlgoAttribute& positive_attr,
  2248. const AlgoAttribute& negative_attr));
  2249. MOCK_METHOD3(get_all_algorithms,
  2250. std::vector<Algorithm*>(const TensorLayout& p0,
  2251. const TensorLayout& p1,
  2252. const TensorLayout& p2));
  2253. MOCK_METHOD6(get_algorithm_heuristic,
  2254. Algorithm*(const TensorLayout& p0, const TensorLayout& p1,
  2255. const TensorLayout& p2,
  2256. size_t workspace_limit_in_bytes,
  2257. const AlgoAttribute& positive_attr,
  2258. const AlgoAttribute& negative_attr));
  2259. MOCK_METHOD1(get_algorithm_from_desc,
  2260. Algorithm*(const AlgorithmDesc&));
  2261. protected:
  2262. const char* get_algorithm_set_name() const override {
  2263. return m_algorithm_set_name;
  2264. }
  2265. };
  2266. class MockAlgorithm : public megdnn::detail::Algorithm {
  2267. const char* m_name;
  2268. public:
  2269. MockAlgorithm(const char* name = "NotImportant") : m_name(name) {}
  2270. Attribute attribute() const override {
  2271. return Attribute::REPRODUCIBLE;
  2272. }
  2273. const char* name() const override { return m_name; }
  2274. uint32_t type() const override {
  2275. return megdnn::detail::Algorithm::INVALID_ALGO_TYPE;
  2276. }
  2277. virtual ~MockAlgorithm() = default;
  2278. };
  2279. class TestWeightPreprocess : public ::testing::Test {
  2280. protected:
  2281. CompNode comp_node;
  2282. std::shared_ptr<ComputingGraph> graph;
  2283. std::shared_ptr<HostTensorND> x_host;
  2284. MockConvolutionForward* mock_conv_ptr;
  2285. SymbolVar y;
  2286. HostTensorND y_host;
  2287. std::unique_ptr<cg::AsyncExecutable> func;
  2288. MockConvolutionForward& mock_conv() { return *mock_conv_ptr; }
  2289. void SetUp() override {
  2290. constexpr uint32_t ih = 10, ic = 16, oc = 32, ph = 0, sh = 1, fh = 2,
  2291. iw = ih;
  2292. comp_node = CompNode::load("cpux");
  2293. graph = ComputingGraph::make();
  2294. graph->options().graph_opt.weight_preprocess = is_weight_preprocess();
  2295. TensorShape x_shape{1, ic, ih, iw}, w_shape{oc, ic, fh, fh};
  2296. x_host = std::make_shared<HostTensorND>(comp_node, x_shape);
  2297. auto x = opr::Host2DeviceCopy::make(*graph, x_host);
  2298. auto w = opr::ImmutableTensor::make(*graph, {comp_node, w_shape});
  2299. Param param;
  2300. param.pad_h = param.pad_w = ph;
  2301. param.stride_h = param.stride_w = sh;
  2302. param.format = Param::Format::NCHW;
  2303. y = opr::ConvolutionForward::make(x, w, param);
  2304. auto& opr =
  2305. y.node()->owner_opr()->cast_final<opr::ConvolutionForward>();
  2306. auto mock = std::make_unique<MockConvolutionForward>(
  2307. opr.megdnn_opr(), ::testing::UnitTest::GetInstance()
  2308. ->current_test_info()
  2309. ->name());
  2310. mock_conv_ptr = mock.get();
  2311. ConvolutionTestingPeer{&opr}.set_megdnn_opr(std::move(mock));
  2312. func = graph->compile({make_callback_copy(y, y_host)});
  2313. }
  2314. void run() { func->execute().wait(); }
  2315. virtual bool is_weight_preprocess() { return true; }
  2316. void TearDown() override {
  2317. func.reset();
  2318. // Triggers mock check
  2319. graph.reset();
  2320. x_host.reset();
  2321. }
  2322. };
  2323. TEST_F(TestWeightPreprocess, NoPreprocessNeeded) {
  2324. using ::testing::_;
  2325. using ::testing::Return;
  2326. auto& mock = mock_conv();
  2327. MockAlgorithm algo;
  2328. EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _, _))
  2329. .WillRepeatedly(Return(&algo));
  2330. EXPECT_CALL(mock, get_algorithm_from_desc(_))
  2331. .WillRepeatedly(Return(&algo));
  2332. EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _))
  2333. .WillRepeatedly(Return(0));
  2334. EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
  2335. .WillRepeatedly(Return(0));
  2336. {
  2337. ::testing::InSequence seq;
  2338. // Return empty preprocess filters, indicating no need to preprocess
  2339. EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _))
  2340. .WillRepeatedly(Return(SmallVector<TensorLayout>{}));
  2341. EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _)).Times(0);
  2342. EXPECT_CALL(mock, exec(_, _, _, nullptr, _));
  2343. run();
  2344. }
  2345. }
  2346. TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) {
  2347. using ::testing::_;
  2348. using ::testing::Return;
  2349. using ::testing::Field;
  2350. using ::testing::Invoke;
  2351. using ::testing::Expectation;
  2352. using PF = MockConvolutionForward::PreprocessedFilter;
  2353. auto& mock = mock_conv();
  2354. MockAlgorithm algo;
  2355. SmallVector<TensorLayout> filter_layout{{{1, 2, 3, 4}, dtype::Float32()},
  2356. {{5, 6, 7, 8}, dtype::Float32()}};
  2357. EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _))
  2358. .WillRepeatedly(Return(filter_layout));
  2359. EXPECT_CALL(mock, get_algorithm_from_desc(_))
  2360. .WillRepeatedly(Return(&algo));
  2361. Expectation algo_call =
  2362. EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _, _))
  2363. .WillOnce(Return(&algo));
  2364. Expectation ws_call = EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _))
  2365. .After(algo_call)
  2366. .WillOnce(Return(0));
  2367. Expectation pre_ws_call =
  2368. EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
  2369. .After(algo_call)
  2370. .WillOnce(Return(233));
  2371. {
  2372. ::testing::InSequence seq;
  2373. // exec_preprocess should be called only once, with workspace allocated
  2374. int salt = 0;
  2375. EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _))
  2376. .After(ws_call, pre_ws_call)
  2377. .WillOnce(Invoke([&](const TensorLayout&, _megdnn_tensor_in,
  2378. const TensorLayout&, PF* pf,
  2379. _megdnn_workspace workspace) {
  2380. ASSERT_EQ(workspace.size, 233);
  2381. ASSERT_NE(pf, nullptr);
  2382. pf->algorithm_id = &salt;
  2383. ASSERT_EQ(pf->tensors.size(), 2);
  2384. ASSERT_TRUE(pf->tensors[0].layout.eq_shape({1, 2, 3, 4}));
  2385. ASSERT_TRUE(pf->tensors[1].layout.eq_shape({5, 6, 7, 8}));
  2386. ASSERT_NE(pf->tensors[0].raw_ptr, nullptr);
  2387. ASSERT_NE(pf->tensors[1].raw_ptr, nullptr);
  2388. pf->tensors[0].ptr<float>()[0] = 114.514f;
  2389. pf->tensors[1].ptr<float>()[0] = 1926.0817f;
  2390. }));
  2391. // Run the graph multiple times.
  2392. for (int i = 0; i < 3; i++) {
  2393. if (i > 0) {
  2394. EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _)).Times(0);
  2395. }
  2396. EXPECT_CALL(mock, exec(_, _, _, _, _))
  2397. .WillOnce(Invoke([&](_megdnn_tensor_in, _megdnn_tensor_in,
  2398. _megdnn_tensor_out, const PF* pf,
  2399. _megdnn_workspace) {
  2400. ASSERT_NE(pf, nullptr);
  2401. ASSERT_EQ(pf->algorithm_id, &salt);
  2402. ASSERT_EQ(pf->tensors[0].ptr<float>()[0], 114.514f);
  2403. ASSERT_EQ(pf->tensors[1].ptr<float>()[0], 1926.0817f);
  2404. }));
  2405. run();
  2406. }
  2407. }
  2408. }
  2409. class TestNoWeightPreprocess : public TestWeightPreprocess {
  2410. bool is_weight_preprocess() override { return false; }
  2411. };
  2412. TEST_F(TestNoWeightPreprocess, NoPreprocess) {
  2413. using ::testing::_;
  2414. using ::testing::Return;
  2415. auto& mock = mock_conv();
  2416. MockAlgorithm algo;
  2417. EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _, _))
  2418. .WillRepeatedly(Return(&algo));
  2419. EXPECT_CALL(mock, get_algorithm_from_desc(_))
  2420. .WillRepeatedly(Return(&algo));
  2421. EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _))
  2422. .WillRepeatedly(Return(0));
  2423. EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
  2424. .WillRepeatedly(Return(0));
  2425. {
  2426. ::testing::InSequence seq;
  2427. // Return empty preprocess filters, indicating no need to preprocess
  2428. EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _)).Times(0);
  2429. EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _)).Times(0);
  2430. EXPECT_CALL(mock, exec(_, _, _, nullptr, _));
  2431. run();
  2432. }
  2433. }
  2434. } // anonymous namespace
  2435. #endif
  2436. namespace {
  2437. TEST(TestOprDNN, ConvBiasInt4Serialize) {
  2438. using namespace serialization;
  2439. float inp_scale = 1.20210327f;
  2440. float filt_scale = 1.20210406f;
  2441. float bias_scale = inp_scale * filt_scale;
  2442. DType output_dtype = dtype::QuantizedS4{inp_scale};
  2443. HostTensorGenerator<dtype::Int8> gen;
  2444. std::shared_ptr<HostTensorND> xv;
  2445. auto mkvar = [](const char* name, const DType& dtype,
  2446. std::shared_ptr<ComputingGraph> graph,
  2447. std::shared_ptr<HostTensorND> val) {
  2448. return opr::TypeCvt::make(
  2449. opr::Host2DeviceCopy::make(*graph, val).rename(name), dtype);
  2450. };
  2451. auto mkcvar =
  2452. [&gen](const char* name, const TensorShape& shp, const DType& dtype,
  2453. std::shared_ptr<ComputingGraph> graph, const CompNode& cn) {
  2454. return opr::TypeCvt::make(
  2455. opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  2456. .rename(name),
  2457. dtype);
  2458. };
  2459. auto fname = output_file("ConvBiasInt4Serialize");
  2460. HostTensorND y1, y2;
  2461. auto dump = [&]() {
  2462. opr::ConvBias::Param param;
  2463. param.mode = Mode::CONVOLUTION;
  2464. auto cn = CompNode::load("cpu0");
  2465. auto graph = ComputingGraph::make();
  2466. xv = gen({1, 64, 56, 56}, cn);
  2467. auto x = mkvar("x", dtype::QuantizedS4{inp_scale}, graph, xv);
  2468. auto w = mkcvar("w", {256, 64, 1, 1}, dtype::QuantizedS4{filt_scale}, graph, cn);
  2469. auto b = mkcvar("b", {1, 256, 1, 1}, dtype::QuantizedS32{bias_scale}, graph, cn);
  2470. auto y = opr::ConvBiasForward::make(x, w, b, param, {},
  2471. OperatorNodeConfig{output_dtype});
  2472. auto w1 = mkcvar("w1", {64, 256, 1, 1}, dtype::QuantizedS4{filt_scale},
  2473. graph, cn);
  2474. auto b1 = mkcvar("b1", {1, 64, 1, 1}, dtype::QuantizedS32{bias_scale},
  2475. graph, cn);
  2476. y = opr::ConvBiasForward::make(y, w1, b1, param, {},
  2477. OperatorNodeConfig{output_dtype});
  2478. y = opr::TypeCvt::make(y, dtype::Float32());
  2479. auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
  2480. auto func = graph->compile({make_callback_copy(y, y1)});
  2481. func->execute();
  2482. func->wait();
  2483. auto rst = dumper->dump({y});
  2484. ASSERT_EQ(rst.outputs.size(), 1u);
  2485. };
  2486. auto load = [&]() {
  2487. auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
  2488. auto rst = loader->load();
  2489. for (const auto& t : rst.tensor_map) {
  2490. t.second->copy_from(*xv).sync();
  2491. }
  2492. auto func = rst.graph->compile(
  2493. {make_callback_copy(rst.output_var_list[0], y2)});
  2494. func->execute();
  2495. func->wait();
  2496. ASSERT_EQ(rst.output_var_list.size(), 1u);
  2497. EXPECT_EQ(rst.output_var_list[0].dtype(), dtype::Float32());
  2498. };
  2499. dump();
  2500. load();
  2501. MGB_ASSERT_TENSOR_NEAR(y1, y2, 1e-3);
  2502. }
  2503. TEST(TestOprDNN, ConvBiasInt4SerializeWithParamFuse) {
  2504. using namespace serialization;
  2505. float inp_scale = 1.20210327f;
  2506. float filt_scale = 1.20210406f;
  2507. float bias_scale = inp_scale * filt_scale;
  2508. DType output_dtype = dtype::QuantizedS4{inp_scale};
  2509. HostTensorGenerator<dtype::Int8> gen;
  2510. std::shared_ptr<HostTensorND> xv;
  2511. auto mkvar = [](const char* name, const DType& dtype,
  2512. std::shared_ptr<ComputingGraph> graph,
  2513. std::shared_ptr<HostTensorND> val) {
  2514. return opr::TypeCvt::make(
  2515. opr::Host2DeviceCopy::make(*graph, val).rename(name), dtype);
  2516. };
  2517. auto mkcvar =
  2518. [&gen](const char* name, const TensorShape& shp, const DType& dtype,
  2519. std::shared_ptr<ComputingGraph> graph, const CompNode& cn) {
  2520. return opr::TypeCvt::make(
  2521. opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  2522. .rename(name),
  2523. dtype);
  2524. };
  2525. auto fname = output_file("ConvBiasInt4SerializeWithParamFuse");
  2526. HostTensorND y1, y2;
  2527. auto dump = [&]() {
  2528. opr::ConvBias::Param param;
  2529. param.mode = Mode::CONVOLUTION;
  2530. auto cn = CompNode::load("cpu0");
  2531. auto graph = ComputingGraph::make();
  2532. xv = gen({1, 64, 56, 56}, cn);
  2533. auto x = mkvar("x", dtype::QuantizedS4{inp_scale}, graph, xv);
  2534. auto w = mkcvar("w", {256, 64, 1, 1}, dtype::QuantizedS4{filt_scale}, graph, cn);
  2535. auto b = mkcvar("b", {1, 256, 1, 1}, dtype::QuantizedS32{bias_scale}, graph, cn);
  2536. auto y = opr::ConvBiasForward::make(x, w, b, param, {},
  2537. OperatorNodeConfig{output_dtype});
  2538. auto w1 = mkcvar("w1", {64, 256, 1, 1}, dtype::QuantizedS4{filt_scale},
  2539. graph, cn);
  2540. auto b1 = mkcvar("b1", {1, 64, 1, 1}, dtype::QuantizedS32{bias_scale},
  2541. graph, cn);
  2542. y = opr::ConvBiasForward::make(y, w1, b1, param, {},
  2543. OperatorNodeConfig{output_dtype});
  2544. y = opr::TypeCvt::make(y, dtype::Float32());
  2545. SymbolVar y_param_fused;
  2546. unpack_vector(gopt::GraphOptimizer{}
  2547. .add_pass<gopt::ParamFusePass>()
  2548. .apply({{y}})
  2549. .endpoint_vars(),
  2550. y_param_fused);
  2551. auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
  2552. auto func = graph->compile({make_callback_copy(y_param_fused, y1)});
  2553. func->execute();
  2554. func->wait();
  2555. auto rst = dumper->dump({y_param_fused});
  2556. ASSERT_EQ(rst.outputs.size(), 1u);
  2557. };
  2558. auto load = [&]() {
  2559. auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
  2560. auto rst = loader->load();
  2561. for (const auto& t : rst.tensor_map) {
  2562. t.second->copy_from(*xv).sync();
  2563. }
  2564. auto func = rst.graph->compile(
  2565. {make_callback_copy(rst.output_var_list[0], y2)});
  2566. func->execute();
  2567. func->wait();
  2568. ASSERT_EQ(rst.output_var_list.size(), 1u);
  2569. EXPECT_EQ(rst.output_var_list[0].dtype(), dtype::Float32());
  2570. };
  2571. dump();
  2572. load();
  2573. MGB_ASSERT_TENSOR_NEAR(y1, y2, 1e-3);
  2574. }
  2575. } // namespace
  2576. // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
