
convolution.cpp 117 kB

  1. #include "./legacy_checker.h"
  2. #include "megbrain/comp_node_env.h"
  3. #include "megbrain/gopt/inference.h"
  4. #include "megbrain/opr/basic_arith.h"
  5. #include "megbrain/opr/dnn/convolution.h"
  6. #include "megbrain/opr/tensor_manip.h"
  7. #include "megbrain/serialization/serializer.h"
  8. #include "megbrain/test/autocheck.h"
  9. #include "megbrain/test/helper.h"
  10. #include "megbrain/test/megdnn_helper.h"
  11. #include "megbrain/utils/infile_persistent_cache.h"
  12. #include "megdnn/algorithm_cache.h"
  13. #include "megdnn/dtype.h"
  14. #include "megdnn/oprs/base.h"
  15. #include <gmock/gmock.h>
  16. #include <cmath>
  17. #include <memory>
  18. #include <random>
  19. using namespace mgb;
  20. namespace {
  21. using Param = opr::Convolution::Param;
  22. using Param3D = opr::Convolution3D::Param;
  23. using Mode = Param::Mode;
  24. Mode modes_to_check[] = {Mode::CONVOLUTION, Mode::CROSS_CORRELATION};
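  //! Brute-force reference for ConvolutionBackwardData (deconvolution) in NCHW:
  //! every input element is scattered into the output, scaled by the matching
  //! filter taps. The output spatial size is
  //!     out = (in - 1) * stride + (filter - 1) * dilate + 1 - 2 * pad
  //! (e.g. in = 10, stride = 1, filter = 2, dilate = 1, pad = 0 gives out = 11).
  //! Both DENSE and GROUP sparse modes are handled.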
  25. void conv_bwd_data_brute(
  26. const std::vector<std::shared_ptr<HostTensorND>>& inps,
  27. std::shared_ptr<HostTensorND>& dest,
  28. const opr::ConvolutionBackwardData::Param& param) {
  29. mgb_assert(param.format == Param::Format::NCHW);
  30. auto &&data = *inps[0], &&filter = *inps[1];
  31. size_t N = data.shape(0), IH = data.shape(2), IW = data.shape(3);
  32. size_t GROUP, ICPG, OCPG, FH, FW;
  33. if (param.sparse == Param::Sparse::DENSE) {
  34. GROUP = 1, ICPG = filter.shape(0), OCPG = filter.shape(1), FH = filter.shape(2),
  35. FW = filter.shape(3);
  36. } else {
  37. mgb_assert(param.sparse == Param::Sparse::GROUP);
  38. GROUP = filter.shape(0), ICPG = filter.shape(1), OCPG = filter.shape(2),
  39. FH = filter.shape(3), FW = filter.shape(4);
  40. }
  41. auto get_shp = [](size_t inp, size_t filter, size_t stride, size_t pad,
  42. size_t dilate) {
  43. return (inp - 1) * stride + (filter - 1) * dilate + 1 - pad * 2;
  44. };
  45. size_t OH = get_shp(IH, FH, param.stride_h, param.pad_h, param.dilate_h),
  46. OW = get_shp(IW, FW, param.stride_w, param.pad_w, param.dilate_w);
  47. dest = std::make_shared<HostTensorND>(
  48. CompNode::load("xpu0"), TensorShape{N, OCPG * GROUP, OH, OW});
  49. auto&& out = *dest;
  50. auto fptr = filter.ptr<float>(), dptr = data.ptr<float>(), optr = out.ptr<float>();
  51. memset(optr, 0, sizeof(float) * out.shape().total_nr_elems());
  52. auto ol = out.layout(), fl = filter.layout();
  53. #define FOR2(a, A, b, B) \
  54. for (size_t a = 0; a < A; ++a) \
  55. for (size_t b = 0; b < B; ++b)
  56. #define FOR3(a, A, b, B, c, C) \
  57. FOR2(a, A, b, B) \
  58. for (size_t c = 0; c < C; ++c)
  59. FOR3(n, N, group, GROUP, icg, ICPG)
  60. FOR2(ih, IH, iw, IW) {
  61. float scale = *(dptr++);
  62. FOR3(ocg, OCPG, fh, FH, fw, FW) {
  63. auto oc_tot = group * OCPG + ocg;
  64. int oh = int(ih * param.stride_h + fh * param.dilate_h) - int(param.pad_h),
  65. ow = int(iw * param.stride_w + fw * param.dilate_w) - int(param.pad_w);
  66. if (oh >= 0 && ow >= 0 && oh < static_cast<int>(OH) &&
  67. ow < static_cast<int>(OW)) {
  68. auto out_off = n * ol.stride[0] + oc_tot * ol.stride[1] +
  69. oh * ol.stride[2] + ow;
  70. size_t flt_off = 0;
  71. if (param.sparse == Param::Convolution::Sparse::DENSE) {
  72. flt_off = icg * fl.stride[0] + ocg * fl.stride[1] +
  73. fh * fl.stride[2] + fw;
  74. } else {
  75. flt_off = group * fl.stride[0] + icg * fl.stride[1] +
  76. ocg * fl.stride[2] + fh * fl.stride[3] + fw;
  77. }
  78. optr[out_off] += scale * fptr[flt_off];
  79. }
  80. }
  81. }
  82. #undef FOR3
  83. #undef FOR2
  84. }
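  //! Brute-force reference for ConvolutionBackwardFilter (dense NCHW,
  //! cross-correlation): grad[oc, ic, fh, fw] accumulates
  //! diff[n, oc, oh, ow] * src[n, ic, oh * stride_h + fh - pad_h, ow * stride_w + fw - pad_w],
  //! with out-of-bounds source positions treated as zero.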
  85. void conv_bwd_flt_brute(
  86. const std::vector<std::shared_ptr<HostTensorND>>& inps,
  87. std::shared_ptr<HostTensorND>& out,
  88. const opr::ConvolutionBackwardFilter::Param& param) {
  89. auto &&src = *inps[0], &&diff = *inps[1], &&filter = *inps[2];
  90. size_t N = src.shape(0), IH = src.shape(2), IW = src.shape(3), OC = filter.shape(0),
  91. IC = filter.shape(1), FH = filter.shape(2), FW = filter.shape(3),
  92. OH = diff.shape(2), OW = diff.shape(3);
  93. out = std::make_shared<HostTensorND>(
  94. CompNode::load("xpu0"), TensorShape{OC, IC, FH, FW});
  95. auto&& grad = *out;
  96. auto sptr = src.ptr<float>(), dptr = diff.ptr<float>(), gptr = grad.ptr<float>();
  97. memset(gptr, 0, sizeof(float) * grad.shape().total_nr_elems());
  98. auto valid = [&](size_t ih, size_t iw) { return ih < IH && iw < IW; };
  99. for (size_t n = 0; n < N; ++n)
  100. for (size_t oc = 0; oc < OC; ++oc)
  101. for (size_t ic = 0; ic < IC; ++ic) {
  102. for (size_t oh = 0; oh < OH; ++oh)
  103. for (size_t ow = 0; ow < OW; ++ow) {
  104. for (size_t fh = 0; fh < FH; ++fh)
  105. for (size_t fw = 0; fw < FW; ++fw) {
  106. size_t ih = oh * param.stride_h + fh - param.pad_h,
  107. iw = ow * param.stride_w + fw - param.pad_w;
  108. auto src_data = valid(ih, iw)
  109. ? sptr[(n * IC + ic) * IH * IW +
  110. ih * IW + iw]
  111. : 0;
  112. gptr[(oc * IC + ic) * FH * FW + fh * FW + fw] +=
  113. dptr[(n * OC + oc) * OH * OW + oh * OW + ow] *
  114. src_data;
  115. }
  116. }
  117. }
  118. }
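  //! Brute-force reference for LocalShare: the output plane is split into
  //! spatial_groups_h x spatial_groups_w regions, and each region uses its own
  //! filter slice of the {sgh, sgw, ic, fh, fw, oc} weight tensor.
  //! Only CROSS_CORRELATION without dilation is handled here.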
  119. void local_share_brute(
  120. const std::vector<std::shared_ptr<HostTensorND>>& inps,
  121. std::shared_ptr<HostTensorND>& out, const opr::LocalShare::Param& param) {
  122. auto in = inps[0], filter = inps[1];
  123. mgb_assert(in->shape().ndim == 4);
  124. mgb_assert(filter->shape().ndim == 6);
  125. int batch_size = in->shape()[0], ci = in->shape()[1], hi = in->shape()[2],
  126. wi = in->shape()[3];
  127. int fh = filter->shape()[3], fw = filter->shape()[4];
  128. int ph = param.pad_h, pw = param.pad_w;
  129. int sh = param.stride_h, sw = param.stride_w;
  130. int dh = param.dilate_h, dw = param.dilate_w;
  131. int sgh = filter->shape()[0], sgw = filter->shape()[1];
  132. mgb_assert(dh == 1 && dw == 1);
  133. mgb_assert(
  134. static_cast<uint32_t>(sgh) == param.spatial_groups_h &&
  135. static_cast<uint32_t>(sgw) == param.spatial_groups_w);
  136. int ho = (hi + 2 * ph - fh) / sh + 1;
  137. int wo = (wi + 2 * pw - fw) / sw + 1;
  138. mgb_assert(ho % sgh == 0 && wo % sgw == 0);
  139. int grp_ho = ho / sgh, grp_wo = wo / sgw;
  140. int co = filter->shape()[5];
  141. size_t u_batch = batch_size, u_co = co, u_ho = ho, u_wo = wo;
  142. out = std::make_shared<HostTensorND>(
  143. CompNode::load("xpu0"), TensorShape{u_batch, u_co, u_ho, u_wo});
  144. mgb_assert(param.mode == Param::Mode::CROSS_CORRELATION);
  145. for (int n = 0; n < batch_size; ++n) {
  146. for (int oc = 0; oc < co; ++oc) {
  147. for (int oh = 0; oh < ho; ++oh) {
  148. for (int ow = 0; ow < wo; ++ow) {
  149. size_t u_n = n, u_oc = oc, u_oh = oh, u_ow = ow;
  150. float& dval = out->ptr<float>({u_n, u_oc, u_oh, u_ow})[0];
  151. dval = 0;
  152. int grp_oh_idx = oh / grp_ho;
  153. int grp_ow_idx = ow / grp_wo;
  154. for (int ic = 0; ic < ci; ++ic) {
  155. for (int kh = 0; kh < fh; ++kh) {
  156. for (int kw = 0; kw < fw; ++kw) {
  157. int ih = oh * sh - ph + kh;
  158. int iw = ow * sw - pw + kw;
  159. float sval = 0.f;
  160. float fval = 0.f;
  161. if (ih >= 0 && ih < hi && iw >= 0 && iw < wi) {
  162. sval = in->ptr<float>(
  163. {static_cast<size_t>(n),
  164. static_cast<size_t>(ic),
  165. static_cast<size_t>(ih),
  166. static_cast<size_t>(iw)})[0];
  167. }
  168. fval = filter->ptr<float>(
  169. {static_cast<size_t>(grp_oh_idx),
  170. static_cast<size_t>(grp_ow_idx),
  171. static_cast<size_t>(ic),
  172. static_cast<size_t>(kh),
  173. static_cast<size_t>(kw),
  174. static_cast<size_t>(oc)})[0];
  175. dval += fval * sval;
  176. }
  177. }
  178. }
  179. }
  180. }
  181. }
  182. }
  183. }
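  //! Brute-force reference for dense NCHW ConvolutionForward. CONVOLUTION mode
  //! flips the kernel (source index = base + (f - k - 1) * dilate), while
  //! CROSS_CORRELATION uses it as-is; positions outside the padded input read 0.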
  184. void convolution_brute(
  185. const std::vector<std::shared_ptr<HostTensorND>>& in_tensor,
  186. std::shared_ptr<HostTensorND>& out_tensor,
  187. const opr::Convolution::Param& param) {
  188. mgb_assert(in_tensor.size() == 2);
  189. auto in = in_tensor[0], filter = in_tensor[1];
  190. mgb_assert(in->shape().ndim == 4);
  191. mgb_assert(filter->shape().ndim == 4);
  192. int batch_size = in->shape().shape[0];
  193. int ic = in->shape().shape[1];
  194. int ih = in->shape().shape[2];
  195. int iw = in->shape().shape[3];
  196. int fh = filter->shape().shape[2];
  197. int fw = filter->shape().shape[3];
  198. int ph = param.pad_h;
  199. int pw = param.pad_w;
  200. int sh = param.stride_h;
  201. int sw = param.stride_w;
  202. int dh = param.dilate_h;
  203. int dw = param.dilate_w;
  204. mgb_assert(ih + 2 * ph >= (fh - 1) * dh + 1);
  205. mgb_assert(iw + 2 * pw >= (fw - 1) * dw + 1);
  206. int oh = (ih + 2 * ph - ((fh - 1) * dh + 1)) / sh + 1;
  207. int ow = (iw + 2 * pw - ((fw - 1) * dw + 1)) / sw + 1;
  208. mgb_assert(static_cast<size_t>(ic) == filter->shape().shape[1]);
  209. int oc = filter->shape().shape[0];
  210. out_tensor = std::make_shared<HostTensorND>(
  211. CompNode::load("xpu0"),
  212. TensorShape{
  213. static_cast<size_t>(batch_size), static_cast<size_t>(oc),
  214. static_cast<size_t>(oh), static_cast<size_t>(ow)});
  215. int pn, poc, poh, pow, pih, piw, pic, pfh, pfw;
  216. for (pn = 0; pn < batch_size; ++pn)
  217. for (poc = 0; poc < oc; ++poc)
  218. for (poh = 0, pih = -ph; poh < oh; ++poh, pih += sh)
  219. for (pow = 0, piw = -pw; pow < ow; ++pow, piw += sw) {
  220. float& target = out_tensor->ptr<float>(
  221. {static_cast<size_t>(pn), static_cast<size_t>(poc),
  222. static_cast<size_t>(poh), static_cast<size_t>(pow)})[0];
  223. target = 0;
  224. for (pic = 0; pic < ic; ++pic)
  225. for (pfh = 0; pfh < fh; ++pfh)
  226. for (pfw = 0; pfw < fw; ++pfw) {
  227. int prih, priw;
  228. float img_data, filter_data;
  229. if (param.mode == Param::Mode::CONVOLUTION) {
  230. prih = pih + (fh - pfh - 1) * dh;
  231. priw = piw + (fw - pfw - 1) * dw;
  232. } else {
  233. mgb_assert(
  234. param.mode ==
  235. Param::Mode::CROSS_CORRELATION);
  236. prih = pih + pfh * dh;
  237. priw = piw + pfw * dw;
  238. }
  239. if (prih >= 0 && prih < ih && priw >= 0 && priw < iw) {
  240. img_data = in_tensor[0]->ptr<float>(
  241. {static_cast<size_t>(pn),
  242. static_cast<size_t>(pic),
  243. static_cast<size_t>(prih),
  244. static_cast<size_t>(priw)})[0];
  245. } else {
  246. img_data = 0;
  247. }
  248. filter_data = filter->ptr<float>(
  249. {static_cast<size_t>(poc),
  250. static_cast<size_t>(pic),
  251. static_cast<size_t>(pfh),
  252. static_cast<size_t>(pfw)})[0];
  253. target += img_data * filter_data;
  254. }
  255. }
  256. }
  257. opr::Convolution::Param convert_to_conv_param(
  258. const opr::ConvBiasForward::Param& param) {
  259. return opr::Convolution::Param{param.mode, param.pad_h, param.pad_w,
  260. param.stride_h, param.stride_w, param.dilate_h,
  261. param.dilate_w, param.sparse, param.format};
  262. };
  263. #if MGB_CUDA
  264. opr::Convolution::Param convert_to_conv_param(
  265. const opr::BatchConvBiasForward::Param& param) {
  266. return opr::Convolution::Param{param.mode, param.pad_h, param.pad_w,
  267. param.stride_h, param.stride_w, param.dilate_h,
  268. param.dilate_w, param.sparse, param.format};
  269. };
  270. #endif
  271. TEST(TestOprDNN, ConvolutionForward) {
  272. uint32_t ih = 10, ic = 16, oc = 32, ph = 0, sh = 1, fh = 2;
  273. for (auto mode : modes_to_check) {
  274. uint32_t iw = ih + 1, fw = fh + 1, pw = ph + 1, sw = sh + 1;
  275. Param param{mode, ph, pw, sh, sw};
  276. size_t batch_size = 32;
  277. // !!! DEPRECATED. use AutoOprChecker instead.
  278. opr::test::ForwardChecker<opr::Convolution, 2> forward_checker(
  279. {{batch_size, ic, ih, iw}, {oc, ic, fh, fw}}, convolution_brute, param);
  280. forward_checker.run();
  281. }
  282. }
  283. TEST(TestOprDNN, ConvolutionBackward) {
  284. uint32_t ih = 10, ic = 16, oc = 32, ph = 0, sh = 1, fh = 2;
  285. for (auto mode : modes_to_check) {
  286. uint32_t iw = 11, fw = 4, pw = 1, sw = 3;
  287. Param param{mode, ph, pw, sh, sw};
  288. size_t batch_size = 32;
  289. // !!! DEPRECATED. use AutoOprChecker instead.
  290. opr::test::BackwardChecker<opr::Convolution, 2> backward_checker(
  291. {{batch_size, ic, ih, iw}, {oc, ic, fh, fw}}, param, 1e-2, 1);
  292. backward_checker.run();
  293. }
  294. }
  295. TEST(TestOprDNN, ConvBiasExePolicy) {
  296. using Param = opr::ConvBias::Param;
  297. Param param;
  298. using Policy = opr::ConvBias::ExecutionPolicy;
  299. using S = Policy::Strategy;
  300. auto cn = CompNode::load("cpux");
  301. auto orig_impl =
  302. PersistentCache::set_impl(std::make_shared<InMemoryPersistentCache>());
  303. auto run = [&](S strategy) {
  304. auto graph = ComputingGraph::make();
  305. HostTensorGenerator<> gen;
  306. auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
  307. return opr::TypeCvt::make(
  308. opr::Host2DeviceCopy::make(*graph, gen(shp), cn).rename(name),
  309. dtype);
  310. };
  311. auto x = mkvar("x", {20, 50, 50, 16}, dtype::QuantizedS8(2.5f));
  312. auto w = mkvar("w", {24, 3, 3, 16}, dtype::QuantizedS8(2.5f));
  313. auto bias = mkvar("bias", {1, 1, 1, 24}, dtype::QuantizedS32(6.25f));
  314. param.nonlineMode = Param::NonlineMode::RELU;
  315. param.format = Param::Format::NHWC;
  316. Policy policy;
  317. policy.strategy = strategy;
  318. auto conv_bias = opr::ConvBias::make(
  319. x, w, bias, param, policy,
  320. OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  321. HostTensorND host_y;
  322. auto func = graph->compile({make_callback_copy(conv_bias, host_y)});
  323. func->execute();
  324. //! force clear the PersistentCache and verify via get_cache that it is empty
  325. PersistentCache::inst().clear_cache();
  326. size_t old_size = PersistentCache::inst().get_cache().size();
  327. ASSERT_EQ(old_size, 0);
  328. //! set a new cache
  329. PersistentCache::set_impl(std::make_shared<InMemoryPersistentCache>());
  330. };
  331. #if MGB_ENABLE_FASTRUN
  332. for (auto strategy : SmallVector<S>{
  333. S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
  334. S::PROFILE | S::HEURISTIC}) {
  335. #else
  336. for (auto strategy : SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
  337. #endif
  338. run(strategy);
  339. }
  340. megdnn::AlgorithmCache::instance().clear();
  341. ASSERT_THROW(run(S::OPTIMIZED | S::PROFILE), MegBrainError);
  342. PersistentCache::set_impl(orig_impl);
  343. }
  344. TEST(TestOprDNN, PersistentCacheAppend) {
  345. PersistentCache::inst().clear_cache();
  346. auto orig_impl =
  347. PersistentCache::set_impl(std::make_shared<InMemoryPersistentCache>());
  348. auto orig_impl_size = orig_impl->get_cache().size();
  349. auto category_a = "test_category_a";
  350. std::vector<int8_t> blob_key{1, 2, 3, 4, 5, 6, 7, 8};
  351. std::vector<int8_t> blob_value{-1, -2, -3, -4, -5, -6, -7, -8};
  352. PersistentCache::Blob key = {.ptr = blob_key.data(), .size = blob_key.size()};
  353. PersistentCache::Blob value = {.ptr = blob_value.data(), .size = blob_value.size()};
  354. //! trigger call InMemoryPersistentCache put
  355. PersistentCache::inst().put(category_a, key, value);
  356. auto now_size = PersistentCache::inst().get_cache().size();
  357. //! assert the new key was not already in the InMemoryPersistentCache impl: the put adds exactly one entry
  358. ASSERT_EQ(orig_impl_size + 1, now_size);
  359. //! trigger append call InFilePersistentCache init
  360. PersistentCache::set_impl(std::make_shared<InFilePersistentCache>());
  361. auto size_after_restore = PersistentCache::inst().get_cache().size();
  362. //! assert the key is not in the InFilePersistentCache impl,
  363. //! since the in-memory instance does not sync its cache to file
  364. ASSERT_EQ(size_after_restore, orig_impl_size);
  365. auto t_file_imp = std::make_shared<InFilePersistentCache>();
  366. auto category_b = "test_category_b";
  367. //! trigger call InFilePersistentCache put
  368. t_file_imp->put(category_b, key, value);
  369. //! set new file imp
  370. PersistentCache::set_impl(t_file_imp);
  371. //! trigger InFilePersistentCache append init
  372. auto old_cache =
  373. PersistentCache::set_impl(std::make_shared<InFilePersistentCache>());
  374. //! assert set_impl returns exactly the old cache
  375. ASSERT_EQ(old_cache->m_cache.size(), now_size);
  376. //! test key get
  377. auto get_value = PersistentCache::inst().get(category_b, key);
  378. ASSERT_TRUE(
  379. !memcmp(get_value.val().ptr, blob_value.data(),
  380. blob_value.size() * sizeof(int8_t)));
  381. size_after_restore = PersistentCache::inst().get_cache().size();
  382. //! assert key still in orig_impl imp
  383. ASSERT_EQ(size_after_restore, now_size);
  384. //! restore the old impl (memory- or file-based); this may trigger the in-memory append init
  385. PersistentCache::set_impl(orig_impl);
  386. size_after_restore = PersistentCache::inst().get_cache().size();
  387. //! assert the key is not in orig_impl, because get_cache clears m_cache
  388. ASSERT_EQ(size_after_restore + 1, now_size);
  389. }
  390. TEST(TestOprDNN, ConvBiasExePolicy_Quantized8Asym) {
  391. using Param = opr::ConvBias::Param;
  392. Param param;
  393. using Policy = opr::ConvBias::ExecutionPolicy;
  394. using S = Policy::Strategy;
  395. auto cn = CompNode::load("cpux");
  396. for (auto strategy : SmallVector<S>{S::PROFILE, S::PROFILE | S::REPRODUCIBLE}) {
  397. auto graph = ComputingGraph::make();
  398. HostTensorGenerator<> gen;
  399. auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
  400. return opr::TypeCvt::make(
  401. opr::Host2DeviceCopy::make(*graph, gen(shp), cn).rename(name),
  402. dtype);
  403. };
  404. auto x =
  405. mkvar("x", {20, 50, 50, 16},
  406. dtype::Quantized8Asymm(2.5f, static_cast<uint8_t>(0)));
  407. auto w =
  408. mkvar("w", {24, 3, 3, 16},
  409. dtype::Quantized8Asymm(2.5f, static_cast<uint8_t>(0)));
  410. auto bias = mkvar("bias", {1, 1, 1, 24}, dtype::QuantizedS32(6.25f));
  411. param.nonlineMode = Param::NonlineMode::RELU;
  412. param.format = Param::Format::NHWC;
  413. Policy policy;
  414. policy.strategy = strategy;
  415. auto conv_bias = opr::ConvBias::make(
  416. x, w, bias, param, policy,
  417. OperatorNodeConfig{
  418. dtype::Quantized8Asymm(2.5f, static_cast<uint8_t>(0))});
  419. HostTensorND host_y;
  420. auto func = graph->compile({make_callback_copy(conv_bias, host_y)});
  421. func->execute();
  422. }
  423. }
  424. TEST(TestOprDNN, ConvolutionExePolicy) {
  425. Param param{Mode::CONVOLUTION};
  426. using Policy = opr::Convolution::ExecutionPolicy;
  427. using S = Policy::Strategy;
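  // Count PersistentCache lookups via a hook: HEURISTIC should never query the
  // algorithm cache, while PROFILE-based strategies must (asserted after each run).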
  428. int nr_get = 0;
  429. auto on_get = [&nr_get](
  430. const std::string&, const void*, size_t, const void*,
  431. size_t) { ++nr_get; };
  432. PersistentCacheHook cache_hook{on_get};
  433. #if MGB_ENABLE_FASTRUN
  434. for (auto strategy : SmallVector<S>{
  435. S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
  436. S::PROFILE | S::HEURISTIC}) {
  437. #else
  438. for (auto strategy : SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
  439. #endif
  440. megdnn::AlgorithmCache::instance().clear();
  441. using Checker = AutoOprChecker<2, 1>;
  442. auto make_graph =
  443. [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  444. Policy policy;
  445. policy.strategy = strategy;
  446. auto out = opr::Convolution::make(inputs[0], inputs[1], param, policy);
  447. return {out};
  448. };
  449. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  450. std::shared_ptr<HostTensorND> sh_out;
  451. convolution_brute({inp.begin(), inp.end()}, sh_out, param);
  452. dest[0] = *sh_out;
  453. };
  454. Checker::RunOptions opt;
  455. opt.numdiff_eps = 1;
  456. nr_get = 0;
  457. Checker(make_graph, fwd)
  458. .run({TensorShape{3, 2, 10, 6}, {4, 2, 3, 2}}, opt)
  459. .run({TensorShape{6, 3, 8, 13}, {2, 3, 2, 13}}, opt)
  460. .run({TensorShape{1, 1, 10, 10}, {2, 1, 3, 3}}, opt);
  461. if (strategy == S::HEURISTIC) {
  462. ASSERT_EQ(0, nr_get);
  463. } else {
  464. ASSERT_LT(0, nr_get);
  465. }
  466. megdnn::AlgorithmCache::instance().clear();
  467. }
  468. }
  469. TEST(TestOprDNN, ConvolutionBackwardDataBfloat16ExePolicy) {
  470. REQUIRE_GPU(1);
  471. Param param{Mode::CROSS_CORRELATION, 1, 1, 1, 1};
  472. param.compute_mode = Param::ComputeMode::FLOAT32;
  473. using Policy = opr::Convolution::ExecutionPolicy;
  474. using S = Policy::Strategy;
  475. auto gen_bfp16 = [](HostTensorND& dest) {
  476. RNGxorshf rng{next_rand_seed()};
  477. auto rand_real = [&rng]() {
  478. std::uniform_real_distribution<float> dist(-1, 1);
  479. return dist(rng);
  480. };
  481. auto ptr = dest.ptr<dt_bfloat16>();
  482. size_t elems = dest.shape().total_nr_elems();
  483. for (size_t i = 0; i < elems; i++) {
  484. ptr[i] = dt_bfloat16(rand_real());
  485. }
  486. };
  487. auto f32_to_bf16 = [](const std::shared_ptr<HostTensorND>& src)
  488. -> std::shared_ptr<HostTensorND> {
  489. auto ret = std::make_shared<HostTensorND>(
  490. src->comp_node(), src->shape(), dtype::BFloat16{});
  491. for (size_t i = 0; i < src->layout().total_nr_elems(); i++) {
  492. ret->ptr<dt_bfloat16>()[i] = src->ptr<dt_float32>()[i];
  493. }
  494. return ret;
  495. };
  496. auto bf16_to_f32 = [](const std::shared_ptr<HostTensorND>& src)
  497. -> std::shared_ptr<HostTensorND> {
  498. auto ret = std::make_shared<HostTensorND>(
  499. src->comp_node(), src->shape(), dtype::Float32{});
  500. for (size_t i = 0; i < src->layout().total_nr_elems(); i++) {
  501. ret->ptr<dt_float32>()[i] = src->ptr<dt_bfloat16>()[i];
  502. }
  503. return ret;
  504. };
  505. int nr_get = 0;
  506. auto on_get = [&nr_get](
  507. const std::string&, const void*, size_t, const void*,
  508. size_t) { ++nr_get; };
  509. PersistentCacheHook cache_hook{on_get};
  510. #if MGB_ENABLE_FASTRUN
  511. for (auto strategy :
  512. {S::PROFILE, S::HEURISTIC, S(S::PROFILE | S::REPRODUCIBLE),
  513. S(S::PROFILE | S::HEURISTIC)}) {
  514. #else
  515. for (auto strategy : {S::HEURISTIC, S(S::PROFILE | S::HEURISTIC)}) {
  516. #endif
  517. megdnn::AlgorithmCache::instance().clear();
  518. using Checker = AutoOprChecker<2, 1>;
  519. auto make_graph =
  520. [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  521. Policy policy;
  522. policy.strategy = strategy;
  523. return {opr::ConvolutionBackwardData::make_deconv(
  524. inputs[0], inputs[1], param, policy)};
  525. };
  526. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  527. std::shared_ptr<HostTensorND> out;
  528. conv_bwd_data_brute({bf16_to_f32(inp[0]), bf16_to_f32(inp[1])}, out, param);
  529. dest[0] = *f32_to_bf16(out);
  530. };
  531. Checker::RunOptions opt;
  532. opt.outputs_max_err = 1e-3;
  533. nr_get = 0;
  534. Checker(make_graph, fwd)
  535. .disable_grad_check()
  536. .set_input_dtype(0, dtype::BFloat16{})
  537. .set_input_dtype(1, dtype::BFloat16{})
  538. .set_input_generator(0, gen_bfp16)
  539. .set_input_generator(1, gen_bfp16)
  540. .run({TensorShape{3, 4, 10, 6}, {4, 2, 3, 3}}, opt)
  541. .run({TensorShape{2, 2, 4, 3}, {2, 2, 3, 3}}, opt)
  542. .run({TensorShape{1, 3, 10, 6}, {3, 2, 3, 3}}, opt);
  543. if (strategy == S::HEURISTIC) {
  544. ASSERT_EQ(0, nr_get);
  545. } else {
  546. ASSERT_LT(0, nr_get);
  547. }
  548. }
  549. }
  550. #if MGB_ENABLE_FASTRUN
  551. TEST(TestOprDNN, ConvolutionBackwardDataFloat16ExePolicy) {
  552. REQUIRE_GPU(1);
  553. Param param{Mode::CROSS_CORRELATION, 1, 1, 1, 1};
  554. param.compute_mode = Param::ComputeMode::FLOAT32;
  555. using Policy = opr::Convolution::ExecutionPolicy;
  556. using S = Policy::Strategy;
  557. auto gen_fp16 = [](HostTensorND& dest) {
  558. RNGxorshf rng{next_rand_seed()};
  559. auto rand_real = [&rng]() {
  560. std::uniform_real_distribution<float> dist(-1, 1);
  561. return dist(rng);
  562. };
  563. auto ptr = dest.ptr<dt_float16>();
  564. size_t elems = dest.shape().total_nr_elems();
  565. for (size_t i = 0; i < elems; i++) {
  566. ptr[i] = dt_float16(rand_real());
  567. }
  568. };
  569. auto f32_to_f16 = [](const std::shared_ptr<HostTensorND>& src)
  570. -> std::shared_ptr<HostTensorND> {
  571. auto ret = std::make_shared<HostTensorND>(
  572. src->comp_node(), src->shape(), dtype::Float16{});
  573. for (size_t i = 0; i < src->layout().total_nr_elems(); i++) {
  574. ret->ptr<dt_float16>()[i] = src->ptr<dt_float32>()[i];
  575. }
  576. return ret;
  577. };
  578. auto f16_to_f32 = [](const std::shared_ptr<HostTensorND>& src)
  579. -> std::shared_ptr<HostTensorND> {
  580. auto ret = std::make_shared<HostTensorND>(
  581. src->comp_node(), src->shape(), dtype::Float32{});
  582. for (size_t i = 0; i < src->layout().total_nr_elems(); i++) {
  583. ret->ptr<dt_float32>()[i] = src->ptr<dt_float16>()[i];
  584. }
  585. return ret;
  586. };
  587. int nr_get = 0;
  588. auto on_get = [&nr_get](
  589. const std::string&, const void*, size_t, const void*,
  590. size_t) { ++nr_get; };
  591. PersistentCacheHook cache_hook{on_get};
  592. auto strategy = S(S::PROFILE | S::REPRODUCIBLE);
  593. using Checker = AutoOprChecker<2, 1>;
  594. auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  595. Policy policy;
  596. policy.strategy = strategy;
  597. return {opr::ConvolutionBackwardData::make_deconv(
  598. inputs[0], inputs[1], param, policy)};
  599. };
  600. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  601. std::shared_ptr<HostTensorND> out;
  602. conv_bwd_data_brute({f16_to_f32(inp[0]), f16_to_f32(inp[1])}, out, param);
  603. dest[0] = *f32_to_f16(out);
  604. };
  605. Checker::RunOptions opt;
  606. opt.outputs_max_err = 1e-2;
  607. nr_get = 0;
  608. Checker(make_graph, fwd)
  609. .disable_grad_check()
  610. .set_input_dtype(0, dtype::Float16{})
  611. .set_input_dtype(1, dtype::Float16{})
  612. .set_input_generator(0, gen_fp16)
  613. .set_input_generator(1, gen_fp16)
  614. .run({TensorShape{3, 4, 10, 6}, {4, 2, 3, 3}}, opt)
  615. .run({TensorShape{2, 2, 4, 3}, {2, 2, 3, 3}}, opt)
  616. .run({TensorShape{1, 3, 10, 6}, {3, 2, 3, 3}}, opt);
  617. if (strategy == S::HEURISTIC) {
  618. ASSERT_EQ(0, nr_get);
  619. } else {
  620. ASSERT_LT(0, nr_get);
  621. }
  622. }
  623. #endif
  624. TEST(TestOprDNN, Deconvolution) {
  625. // dilated grouped deconv
  626. using Checker = AutoOprChecker<2, 1>;
  627. Param param{Mode::CROSS_CORRELATION, 0, 1, 1, 2};
  628. param.dilate_h = 2;
  629. param.sparse = Param::Sparse::GROUP;
  630. auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  631. return {opr::ConvolutionBackwardData::make_deconv(inputs[0], inputs[1], param)};
  632. };
  633. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  634. std::shared_ptr<HostTensorND> out;
  635. conv_bwd_data_brute({inp[0], inp[1]}, out, param);
  636. dest[0] = *out;
  637. };
  638. Checker::RunOptions opt;
  639. opt.numdiff_eps = 1;
  640. Checker(make_graph, fwd)
  641. .run({TensorShape{2, 4, 6, 8}, {1, 4, 5, 3, 2}}, opt)
  642. .run({TensorShape{3, 2, 1, 1}, {2, 1, 1, 4, 3}}, opt)
  643. .run({TensorShape{4, 6, 7, 2}, {2, 3, 4, 8, 13}}, opt);
  644. }
  645. TEST(TestOprDNN, DeconvolutionExePolicy_QuantizedS8) {
  646. REQUIRE_GPU(1);
  647. auto cn = CompNode::load("gpu0");
  648. cn.activate();
  649. REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);
  650. Param param;
  651. using Policy = opr::ConvolutionBackwardData::ExecutionPolicy;
  652. using S = Policy::Strategy;
  653. #if MGB_ENABLE_FASTRUN
  654. for (auto strategy :
  655. {S::PROFILE, S::HEURISTIC, S(S::PROFILE | S::REPRODUCIBLE),
  656. S(S::PROFILE | S::HEURISTIC)}) {
  657. #else
  658. for (auto strategy : {S::HEURISTIC, S(S::PROFILE | S::HEURISTIC)}) {
  659. #endif
  660. auto graph = ComputingGraph::make();
  661. HostTensorGenerator<> gen;
  662. auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
  663. return opr::TypeCvt::make(
  664. opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name), dtype);
  665. };
  666. auto x = mkvar("x", {16, 4, 50, 50, 4}, dtype::QuantizedS8(1.2f));
  667. auto w = mkvar("w", {16, 4, 4, 4, 4}, dtype::QuantizedS8(1.3f));
  668. param.format = Param::Format::NCHW4;
  669. param.pad_h = param.pad_w = 2;
  670. param.stride_h = param.stride_w = 2;
  671. Policy policy;
  672. policy.strategy = strategy;
  673. auto deconv = opr::ConvolutionBackwardData::make_deconv(
  674. x, w, param, policy, OperatorNodeConfig{dtype::QuantizedS8(1.2f)});
  675. HostTensorND host_y;
  676. auto func = graph->compile({make_callback_copy(deconv, host_y)});
  677. func->execute();
  678. }
  679. }
  680. TEST(TestOprDNN, ConvolutionBackwardFilter) {
  681. using Checker = AutoOprChecker<3, 1>;
  682. constexpr size_t PH = 0, PW = 1, SH = 1, SW = 2;
  683. auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  684. Param param{Mode::CROSS_CORRELATION, PH, PW, SH, SW};
  685. return {opr::ConvolutionBackwardFilter::make(
  686. inputs[0], inputs[1], inputs[2], param)};
  687. };
  688. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  689. std::shared_ptr<HostTensorND> out;
  690. conv_bwd_flt_brute(
  691. {inp[0], inp[1], inp[2]}, out,
  692. Param{Mode::CROSS_CORRELATION, PH, PW, SH, SW});
  693. dest[0] = *out;
  694. };
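  // get_shp computes the forward output size (N + 2 * P - F) / S + 1, so the diff
  // shapes below stay consistent with the src/filter shapes
  // (e.g. IH = 9, PH = 0, FH = 4, SH = 1 gives OH = 6).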
  695. #define get_shp(N, P, S, F) ((N + 2 * P - F) / S + 1)
  696. #define inp_tensor(N, IC, OC, IH, IW, FH, FW) \
  697. { \
  698. TensorShape{N, IC, IH, IW}, \
  699. {N, OC, get_shp(IH, PH, SH, FH), get_shp(IW, PW, SW, FW)}, { \
  700. OC, IC, FH, FW \
  701. } \
  702. }
  703. Checker::RunOptions opt;
  704. opt.numdiff_eps = 1;
  705. Checker(make_graph, fwd)
  706. .run(inp_tensor(2, 3, 4, 9, 8, 4, 3), opt)
  707. .run(inp_tensor(1, 5, 3, 7, 9, 3, 4), opt)
  708. .run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt);
  709. #undef inp_tensor
  710. #undef get_shp
  711. }
  712. TEST(TestOprDNN, DilatedConvolution) {
  713. using Checker = AutoOprChecker<2, 1>;
  714. opr::ConvolutionForward::Param param;
  715. param.pad_h = 5;
  716. param.pad_w = 2;
  717. param.stride_w = 2;
  718. param.dilate_h = 2;
  719. auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  720. return {opr::Convolution::make(inputs[0], inputs[1], param)};
  721. };
  722. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  723. auto opr = megdnn_naive_handle()->create_operator<megdnn::Convolution>();
  724. opr->param() = param;
  725. TensorLayout dest_layout;
  726. opr->deduce_layout(inp[0]->layout(), inp[1]->layout(), dest_layout);
  727. std::vector<dt_byte> workspace(opr->get_workspace_in_bytes(
  728. inp[0]->layout(), inp[1]->layout(), dest_layout, nullptr));
  729. dest[0].dtype(dtype::Float32())
  730. .comp_node(inp[0]->comp_node())
  731. .resize(dest_layout);
  732. opr->exec(
  733. inp[0]->as_megdnn(), inp[1]->as_megdnn(), dest[0].as_megdnn(), nullptr,
  734. {workspace.data(), workspace.size()});
  735. };
  736. Checker::RunOptions option;
  737. option.numdiff_eps = 0.1;
  738. Checker(make_graph, fwd)
  739. .run({TensorShape{2, 3, 8, 7}, TensorShape{4, 3, 2, 2}}, option)
  740. .run({TensorShape{2, 3, 8, 7}, TensorShape{4, 3, 3, 2}}, option)
  741. .run({TensorShape{2, 3, 8, 9}, TensorShape{4, 3, 3, 2}}, option);
  742. }
  743. TEST(TestOprDNN, GroupConv) {
  744. using Checker = AutoOprChecker<2, 1>;
  745. opr::Convolution::Param param;
  746. param.pad_h = 1;
  747. param.pad_w = 2;
  748. param.stride_h = 2;
  749. auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  750. auto p1 = param;
  751. p1.sparse = opr::Convolution::Param::Sparse::GROUP;
  752. return {opr::Convolution::make(inputs[0], inputs[1], p1)};
  753. };
  754. auto cn = CompNode::load("xpux");
  755. auto inp0 = std::make_shared<HostTensorND>(cn, dtype::Float32()),
  756. inp1 = std::make_shared<HostTensorND>(cn, dtype::Float32());
  757. HostTensorND out_raw;
  758. auto graph_raw = ComputingGraph::make();
  759. auto func_raw = graph_raw->compile({make_callback_copy(
  760. opr::Convolution::make(
  761. opr::Host2DeviceCopy::make(*graph_raw, inp0),
  762. opr::Host2DeviceCopy::make(*graph_raw, inp1), param),
  763. out_raw)});
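  // Reference for grouped convolution: run the dense single-group graph
  // (func_raw) once per group on channel slices of the input/filter and copy
  // each partial result into the matching channel slice of the output.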
  764. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  765. auto&& out = dest[0];
  766. auto sl = inp[0]->layout(), fl = inp[1]->layout().remove_axis(0);
  767. TensorLayout ol;
  768. auto group = inp[1]->layout()[0];
  769. sl.shape[1] /= group;
  770. for (size_t i = 0; i < group; ++i) {
  771. inp0->copy_from(inp[0]->sub(SubTensorSpec::make_from_offset_elem(
  772. sl, i * sl[1] * sl[2] * sl[3])));
  773. inp1->copy_from(inp[1]->sub(
  774. SubTensorSpec::make_from_offset_elem(fl, i * fl.total_nr_elems())));
  775. func_raw->execute();
  776. if (!i) {
  777. auto oshp = out_raw.shape();
  778. oshp[1] *= group;
  779. out.resize(oshp);
  780. ol = out.layout();
  781. ol[1] /= group;
  782. }
  783. out.sub(SubTensorSpec::make_from_offset_elem(ol, i * ol[1] * ol[2] * ol[3]))
  784. .copy_from_fixlayout(out_raw);
  785. }
  786. };
  787. Checker::RunOptions opt;
  788. opt.numdiff_eps = 1;
  789. opt.outputs_max_err = 5e-5;
  790. Checker checker{make_graph, fwd};
  791. auto run = [&](const TensorShape& ishp, size_t fh, size_t fw, size_t oc,
  792. size_t group) {
  793. size_t ic = ishp[1];
  794. TensorShape flt{group, oc / group, ic / group, fh, fw};
  795. checker.run({ishp, flt}, opt);
  796. };
  797. run({1, 2, 1, 1}, 1, 1, 2, 2);
  798. run({3, 9, 5, 4}, 1, 2, 6, 3);
  799. run({3, 6, 8, 9}, 3, 1, 4, 2);
  800. run({2, 5, 3, 6}, 2, 3, 5, 1);
  801. run({2, 6, 3, 6}, 2, 3, 6, 6);
  802. }
  803. TEST(TestOprDNN, MaskConvolution) {
  804. using Checker = AutoOprChecker<3, 1>;
  805. opr::Convolution::Param param;
  806. auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  807. return {opr::MaskConvolution::make(inputs[0], inputs[1], inputs[2], param)};
  808. };
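  // Reference: ordinary convolution followed by zeroing every output position
  // whose int8 mask entry is 0; the {OH, OW} mask is reused across batch and
  // output channels.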
  809. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  810. std::shared_ptr<HostTensorND> sh_out;
  811. convolution_brute({inp[0], inp[1]}, sh_out, param);
  812. dest[0] = *sh_out;
  813. size_t N = dest[0].shape()[0];
  814. size_t OC = dest[0].shape()[1];
  815. size_t OH = dest[0].shape()[2];
  816. size_t OW = dest[0].shape()[3];
  817. auto mask_ptr = inp[2]->ptr<int8_t>();
  818. auto dest_ptr = dest[0].ptr<float>();
  819. for (size_t i = 0; i < N * OC; ++i) {
  820. for (size_t mask_idx = 0; mask_idx < OH * OW; ++mask_idx) {
  821. if (mask_ptr[mask_idx] == 0) {
  822. dest_ptr[i * OH * OW + mask_idx] = 0;
  823. }
  824. }
  825. }
  826. };
  827. auto gen_mask = [](HostTensorND& dest) {
  828. HostTensorGenerator<dtype::Int8, RandomDistribution::UNIFORM> mask_generator{
  829. 0, 1};
  830. dest = *mask_generator(dest.shape(), dest.comp_node());
  831. };
  832. auto run_with_param = [&](size_t SH = 1, size_t SW = 1, size_t PH = 0,
  833. size_t PW = 0) {
  834. param.pad_h = PH;
  835. param.pad_w = PW;
  836. param.stride_h = SH;
  837. param.stride_w = SW;
  838. Checker checker{make_graph, fwd};
  839. Checker::RunOptions opt;
  840. checker.set_output_allow_grad(0, false);
  841. checker.set_input_dtype(2, dtype::Int8());
  842. checker.set_input_generator(2, gen_mask);
  843. auto run = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW, size_t FH,
  844. size_t FW) {
  845. size_t OH = (IH + 2 * PH - FH) / SH + 1;
  846. size_t OW = (IW + 2 * PW - FW) / SW + 1;
  847. checker.run({TensorShape{N, IC, IH, IW}, {OC, IC, FH, FW}, {OH, OW}}, opt);
  848. };
  849. run(1, 1, 1, 5, 5, 3, 3);
  850. run(2, 3, 4, 5, 5, 3, 3);
  851. run(3, 3, 4, 224, 223, 3, 3);
  852. run(3, 3, 4, 224, 223, 2, 2);
  853. };
  854. run_with_param();
  855. run_with_param(2, 2, 3, 3);
  856. run_with_param(3, 2, 1, 2);
  857. run_with_param(2, 3, 2, 2);
  858. }
  859. TEST(TestOprDNN, MaskPropagate) {
  860. using Checker = AutoOprChecker<3, 1>;
  861. opr::MaskPropagate::Param mask_param;
  862. opr::Convolution::Param conv_param;
  863. auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  864. auto inp_mask = inputs[2];
  865. auto out_mask = opr::MaskPropagate::make(inp_mask, mask_param);
  866. return {opr::MaskConvolution::make(inputs[0], inputs[1], out_mask, conv_param)};
  867. };
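  // Reference: zero the input positions where the int32 mask (same spatial size
  // as the input, values in {0, 1}) is 0, then run a plain dense convolution.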
  868. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  869. auto& src = *inp[0];
  870. auto& mask = *inp[2];
  871. auto src_ptr = inp[0]->ptr<float>();
  872. auto mask_ptr = inp[2]->ptr<int>();
  873. mgb_assert(
  874. src.shape()[2] == mask.shape()[0] && src.shape()[3] == mask.shape()[1]);
  875. for (size_t i = 0; i < src.shape()[0] * src.shape()[1]; ++i) {
  876. for (size_t mask_idx = 0; mask_idx < src.shape()[2] * src.shape()[3];
  877. ++mask_idx) {
  878. if (mask_ptr[mask_idx] == 0) {
  879. src_ptr[i * src.layout().stride[1] + mask_idx] = 0;
  880. }
  881. }
  882. }
  883. std::shared_ptr<HostTensorND> sh_out;
  884. convolution_brute({inp[0], inp[1]}, sh_out, conv_param);
  885. dest[0] = *sh_out;
  886. };
  887. auto gen_mask = [](HostTensorND& dest) {
  888. HostTensorGenerator<dtype::Int32, RandomDistribution::UNIFORM> mask_generator{
  889. 0, 1};
  890. dest = *mask_generator(dest.shape(), dest.comp_node());
  891. };
  892. auto run_with_param = [&](size_t FH, size_t FW, size_t SH = 1, size_t SW = 1,
  893. size_t PH = 0, size_t PW = 0, size_t DH = 1,
  894. size_t DW = 1) {
  895. conv_param.pad_h = PH;
  896. conv_param.pad_w = PW;
  897. conv_param.stride_h = SH;
  898. conv_param.stride_w = SW;
  899. conv_param.dilate_h = DH;
  900. conv_param.dilate_w = DW;
  901. mask_param.pad_h = PH;
  902. mask_param.pad_w = PW;
  903. mask_param.stride_h = SH;
  904. mask_param.stride_w = SW;
  905. mask_param.kernel_h = FH;
  906. mask_param.kernel_w = FW;
  907. mask_param.dilate_h = DH;
  908. mask_param.dilate_w = DW;
  909. Checker checker{make_graph, fwd};
  910. Checker::RunOptions opt;
  911. checker.set_output_allow_grad(0, false);
  912. checker.set_input_dtype(2, dtype::Int32());
  913. checker.set_input_generator(2, gen_mask);
  914. auto run = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW) {
  915. checker.run({TensorShape{N, IC, IH, IW}, {OC, IC, FH, FW}, {IH, IW}}, opt);
  916. };
  917. run(1, 1, 1, 5, 5);
  918. run(2, 3, 4, 5, 5);
  919. run(3, 3, 4, 224, 223);
  920. run(3, 3, 4, 224, 223);
  921. };
  922. run_with_param(3, 3, 1, 1, 0, 0, 2, 2);
  923. run_with_param(3, 3, 2, 2, 3, 3);
  924. run_with_param(4, 2, 3, 2, 1, 2);
  925. run_with_param(2, 4, 2, 3, 2, 2);
  926. run_with_param(4, 2, 3, 2, 1, 2, 2, 2);
  927. run_with_param(2, 4, 2, 3, 2, 2, 2, 1);
  928. }
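  //! Brute-force reference for dense NCDHW Convolution3DForward; the depth
  //! dimension is handled exactly like height/width, and CONVOLUTION mode flips
  //! the kernel in all three spatial dimensions.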
  929. void convolution3d_brute(
  930. const std::vector<std::shared_ptr<HostTensorND>>& in_tensor,
  931. std::shared_ptr<HostTensorND>& out_tensor,
  932. const opr::Convolution3D::Param& param) {
  933. mgb_assert(in_tensor.size() == 2);
  934. auto in = in_tensor[0], filter = in_tensor[1];
  935. mgb_assert(in->shape().ndim == 5);
  936. mgb_assert(filter->shape().ndim == 5);
  937. int batch_size = in->shape().shape[0];
  938. int ic = in->shape().shape[1];
  939. int id = in->shape().shape[2];
  940. int ih = in->shape().shape[3];
  941. int iw = in->shape().shape[4];
  942. int fd = filter->shape().shape[2];
  943. int fh = filter->shape().shape[3];
  944. int fw = filter->shape().shape[4];
  945. int pd = param.pad_d;
  946. int ph = param.pad_h;
  947. int pw = param.pad_w;
  948. int sd = param.stride_d;
  949. int sh = param.stride_h;
  950. int sw = param.stride_w;
  951. int dd = param.dilate_d;
  952. int dh = param.dilate_h;
  953. int dw = param.dilate_w;
  954. mgb_assert(id + 2 * pd >= (fd - 1) * dd + 1);
  955. mgb_assert(ih + 2 * ph >= (fh - 1) * dh + 1);
  956. mgb_assert(iw + 2 * pw >= (fw - 1) * dw + 1);
  957. int od = (id + 2 * pd - ((fd - 1) * dd + 1)) / sd + 1;
  958. int oh = (ih + 2 * ph - ((fh - 1) * dh + 1)) / sh + 1;
  959. int ow = (iw + 2 * pw - ((fw - 1) * dw + 1)) / sw + 1;
  960. mgb_assert(static_cast<size_t>(ic) == filter->shape().shape[1]);
  961. int oc = filter->shape().shape[0];
  962. out_tensor = std::make_shared<HostTensorND>(
  963. CompNode::load("xpu0"),
  964. TensorShape{
  965. static_cast<size_t>(batch_size), static_cast<size_t>(oc),
  966. static_cast<size_t>(od), static_cast<size_t>(oh),
  967. static_cast<size_t>(ow)});
  968. int pn, poc, pod, poh, pow, pic, pid, pih, piw, pfd, pfh, pfw;
  969. for (pn = 0; pn < batch_size; ++pn)
  970. for (poc = 0; poc < oc; ++poc)
  971. for (pod = 0, pid = -pd; pod < od; ++pod, pid += sd)
  972. for (poh = 0, pih = -ph; poh < oh; ++poh, pih += sh)
  973. for (pow = 0, piw = -pw; pow < ow; ++pow, piw += sw) {
  974. float& target = out_tensor->ptr<float>(
  975. {static_cast<size_t>(pn), static_cast<size_t>(poc),
  976. static_cast<size_t>(pod), static_cast<size_t>(poh),
  977. static_cast<size_t>(pow)})[0];
  978. target = 0;
  979. for (pic = 0; pic < ic; ++pic)
  980. for (pfd = 0; pfd < fd; ++pfd)
  981. for (pfh = 0; pfh < fh; ++pfh)
  982. for (pfw = 0; pfw < fw; ++pfw) {
  983. int prid, prih, priw;
  984. float img_data, filter_data;
  985. if (param.mode == opr::Convolution3D::Param::
  986. Mode::CONVOLUTION) {
  987. prid = pid + (fd - pfd - 1) * dd;
  988. prih = pih + (fh - pfh - 1) * dh;
  989. priw = piw + (fw - pfw - 1) * dw;
  990. } else {
  991. mgb_assert(
  992. param.mode ==
  993. opr::Convolution3D::Param::Mode::
  994. CROSS_CORRELATION);
  995. prid = pid + pfd * dd;
  996. prih = pih + pfh * dh;
  997. priw = piw + pfw * dw;
  998. }
  999. if (prid >= 0 && prid < id && prih >= 0 &&
  1000. prih < ih && priw >= 0 && priw < iw) {
  1001. img_data = in_tensor[0]->ptr<float>(
  1002. {static_cast<size_t>(pn),
  1003. static_cast<size_t>(pic),
  1004. static_cast<size_t>(prid),
  1005. static_cast<size_t>(prih),
  1006. static_cast<size_t>(priw)})[0];
  1007. } else {
  1008. img_data = 0;
  1009. }
  1010. filter_data = filter->ptr<float>(
  1011. {static_cast<size_t>(poc),
  1012. static_cast<size_t>(pic),
  1013. static_cast<size_t>(pfd),
  1014. static_cast<size_t>(pfh),
  1015. static_cast<size_t>(pfw)})[0];
  1016. target += img_data * filter_data;
  1017. }
  1018. }
  1019. }
  1020. TEST(TestOprDNN, Convolution3DForward) {
  1021. for (uint32_t batch_size : {8})
  1022. for (uint32_t id : {12})
  1023. for (uint32_t fd : {1, 3})
  1024. for (uint32_t ic : {4})
  1025. for (uint32_t oc : {ic})
  1026. for (uint32_t pd : {0, 2})
  1027. for (uint32_t sd : {1, 3})
  1028. for (uint32_t dd : {1, 3})
  1029. for (bool xcorr : {0, 1}) {
  1030. uint32_t ih = id + 1, fh = fd, ph = pd + 1,
  1031. sh = sd + 1;
  1032. uint32_t iw = ih + 1, fw = fh, pw = ph + 1,
  1033. sw = sh + 1;
  1034. Param3D param{
  1035. xcorr ? Param3D::Mode::CROSS_CORRELATION
  1036. : Param3D::Mode::CONVOLUTION,
  1037. pd,
  1038. ph,
  1039. pw,
  1040. sd,
  1041. sh,
  1042. sw,
  1043. dd,
  1044. dd,
  1045. dd};
  1046. // !!! DEPRECATED. use AutoOprChecker instead.
  1047. opr::test::ForwardChecker<opr::Convolution3D, 2>
  1048. forward_checker(
  1049. {{batch_size, ic, id, ih, iw},
  1050. {oc, ic, fd, fh, fw}},
  1051. convolution3d_brute, param);
  1052. forward_checker.run();
  1053. }
  1054. }
  1055. TEST(TestOprDNN, Convolution3DBackward) {
  1056. for (uint32_t batch_size : {8})
  1057. for (uint32_t id : {12})
  1058. for (uint32_t fd : {1, 3})
  1059. for (uint32_t ic : {4})
  1060. for (uint32_t oc : {ic})
  1061. for (uint32_t pd : {0, 2})
  1062. for (uint32_t sd : {1, 3})
  1063. for (uint32_t dd : {1, 3})
  1064. for (bool xcorr : {0, 1}) {
  1065. uint32_t ih = id + 1, fh = fd, ph = pd + 1,
  1066. sh = sd + 1;
  1067. uint32_t iw = ih + 1, fw = fh, pw = ph + 1,
  1068. sw = sh + 1;
  1069. Param3D param{
  1070. xcorr ? Param3D::Mode::CROSS_CORRELATION
  1071. : Param3D::Mode::CONVOLUTION,
  1072. pd,
  1073. ph,
  1074. pw,
  1075. sd,
  1076. sh,
  1077. sw,
  1078. dd,
  1079. dd,
  1080. dd};
  1081. // !!! DEPRECATED. use AutoOprChecker instead.
  1082. opr::test::BackwardChecker<
  1083. opr::Convolution3D, 2>
  1084. backward_checker(
  1085. {{batch_size, ic, id, ih, iw},
  1086. {oc, ic, fd, fh, fw}},
  1087. param, 1e-2, 1);
  1088. backward_checker.run();
  1089. }
  1090. }
  1091. TEST(TestOprDNN, GroupConv3D) {
  1092. using Checker = AutoOprChecker<2, 1>;
  1093. opr::Convolution3D::Param param;
  1094. param.pad_d = 0;
  1095. param.pad_h = 1;
  1096. param.pad_w = 0;
  1097. param.stride_d = 1;
  1098. param.stride_h = 2;
  1099. auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  1100. auto p1 = param;
  1101. p1.sparse = opr::Convolution3D::Param::Sparse::GROUP;
  1102. return {opr::Convolution3D::make(inputs[0], inputs[1], p1)};
  1103. };
  1104. auto cn = CompNode::load("xpux");
  1105. auto inp0 = std::make_shared<HostTensorND>(cn, dtype::Float32()),
  1106. inp1 = std::make_shared<HostTensorND>(cn, dtype::Float32());
  1107. HostTensorND out_raw;
  1108. auto graph_raw = ComputingGraph::make();
  1109. auto func_raw = graph_raw->compile({make_callback_copy(
  1110. opr::Convolution3D::make(
  1111. opr::Host2DeviceCopy::make(*graph_raw, inp0),
  1112. opr::Host2DeviceCopy::make(*graph_raw, inp1), param),
  1113. out_raw)});
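  // Same per-group strategy as the 2D GroupConv test above, applied to 5-D
  // NCDHW tensors: each group is convolved with the dense graph and copied into
  // its channel slice of the output.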
  1114. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1115. auto&& out = dest[0];
  1116. auto sl = inp[0]->layout(), fl = inp[1]->layout().remove_axis(0);
  1117. TensorLayout ol;
  1118. auto group = inp[1]->layout()[0];
  1119. sl.shape[1] /= group;
  1120. for (size_t i = 0; i < group; ++i) {
  1121. inp0->copy_from(inp[0]->sub(SubTensorSpec::make_from_offset_elem(
  1122. sl, i * sl[1] * sl[2] * sl[3] * sl[4])));
  1123. inp1->copy_from(inp[1]->sub(
  1124. SubTensorSpec::make_from_offset_elem(fl, i * fl.total_nr_elems())));
  1125. func_raw->execute();
  1126. if (!i) {
  1127. auto oshp = out_raw.shape();
  1128. oshp[1] *= group;
  1129. out.resize(oshp);
  1130. ol = out.layout();
  1131. ol[1] /= group;
  1132. }
  1133. out.sub(SubTensorSpec::make_from_offset_elem(
  1134. ol, i * ol[1] * ol[2] * ol[3] * ol[4]))
  1135. .copy_from_fixlayout(out_raw);
  1136. }
  1137. };
  1138. Checker::RunOptions opt;
  1139. opt.numdiff_eps = 1;
  1140. opt.outputs_max_err = 5e-5;
  1141. Checker checker{make_graph, fwd};
  1142. auto run = [&](const TensorShape& ishp, size_t fd, size_t fh, size_t fw, size_t oc,
  1143. size_t group) {
  1144. size_t ic = ishp[1];
  1145. TensorShape flt{group, oc / group, ic / group, fd, fh, fw};
  1146. checker.run({ishp, flt}, opt);
  1147. };
  1148. run({1, 2, 1, 1, 1}, 1, 1, 1, 2, 2);
  1149. run({3, 9, 5, 4, 3}, 1, 2, 3, 6, 3);
  1150. run({2, 1, 3, 6, 9}, 2, 3, 3, 5, 1);
  1151. run({2, 1, 3, 6, 9}, 2, 3, 3, 5, 1);
  1152. }
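// How the GroupConv3D reference works: a dense Convolution3D graph (func_raw)
// is compiled once, and grouped convolution is emulated by running it once per
// group on channel slices. A sketch of the slicing arithmetic, using the names
// from the fwd lambda above:
//   sl.shape[1] /= group;                              // per-group input channels
//   offset_in  = i * sl[1] * sl[2] * sl[3] * sl[4];    // i-th input slice
//   offset_out = i * ol[1] * ol[2] * ol[3] * ol[4];    // i-th output slice
// Each group's result is copied into the matching output channel range.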
  1153. TEST(TestOprDNN, Deconvolution3D) {
  1154. using Checker = AutoOprChecker<2, 1>;
  1155. Param3D param{Param3D::Mode::CROSS_CORRELATION, 0, 1, 1, 1, 2, 2};
  1156. param.sparse = Param3D::Sparse::GROUP;
  1157. auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  1158. return {opr::Convolution3DBackwardData::make_deconv(
  1159. inputs[0], inputs[1], param)};
  1160. };
  1161. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1162. auto &&data = *inp[0], &&filter = *inp[1];
  1163. size_t N = data.shape(0), ID = data.shape(2), IH = data.shape(3),
  1164. IW = data.shape(4), GROUP = filter.shape(0), ICPG = filter.shape(1),
  1165. OCPG = filter.shape(2), FD = filter.shape(3), FH = filter.shape(4),
  1166. FW = filter.shape(5);
  1167. auto&& out = dest[0];
  1168. auto get_shp = [](size_t inp, size_t filter, size_t stride, size_t pad,
  1169. size_t dilate) {
  1170. return (inp - 1) * stride + (filter - 1) * dilate + 1 - pad * 2;
  1171. };
  1172. size_t OD = get_shp(ID, FD, param.stride_d, param.pad_d, param.dilate_d),
  1173. OH = get_shp(IH, FH, param.stride_h, param.pad_h, param.dilate_h),
  1174. OW = get_shp(IW, FW, param.stride_w, param.pad_w, param.dilate_w);
  1175. out.resize({N, OCPG * GROUP, OD, OH, OW});
  1176. auto fptr = filter.ptr<float>(), dptr = data.ptr<float>(),
  1177. optr = out.ptr<float>();
  1178. memset(optr, 0, sizeof(float) * out.shape().total_nr_elems());
  1179. auto ol = out.layout(), fl = filter.layout();
  1180. #define FOR2(a, A, b, B) \
  1181. for (size_t a = 0; a < A; ++a) \
  1182. for (size_t b = 0; b < B; ++b)
  1183. #define FOR3(a, A, b, B, c, C) \
  1184. FOR2(a, A, b, B) \
  1185. for (size_t c = 0; c < C; ++c)
  1186. #define FOR4(a, A, b, B, c, C, d, D) \
  1187. FOR3(a, A, b, B, c, C) \
  1188. for (size_t d = 0; d < D; ++d)
  1189. FOR3(n, N, group, GROUP, icg, ICPG)
  1190. FOR3(id, ID, ih, IH, iw, IW) {
  1191. float scale = *(dptr++);
  1192. FOR4(ocg, OCPG, fd, FD, fh, FH, fw, FW) {
  1193. auto oc_tot = group * OCPG + ocg;
  1194. int od = int(id * param.stride_d + fd * param.dilate_d) -
  1195. int(param.pad_d),
  1196. oh = int(ih * param.stride_h + fh * param.dilate_h) -
  1197. int(param.pad_h),
  1198. ow = int(iw * param.stride_w + fw * param.dilate_w) -
  1199. int(param.pad_w);
  1200. if (od >= 0 && oh >= 0 && ow >= 0 && od < static_cast<int>(OD) &&
  1201. oh < static_cast<int>(OH) && ow < static_cast<int>(OW)) {
  1202. auto out_off = n * ol.stride[0] + oc_tot * ol.stride[1] +
  1203. od * ol.stride[2] + oh * ol.stride[3] + ow,
  1204. flt_off = group * fl.stride[0] + icg * fl.stride[1] +
  1205. ocg * fl.stride[2] + fd * fl.stride[3] +
  1206. fh * fl.stride[4] + fw;
  1207. optr[out_off] += scale * fptr[flt_off];
  1208. }
  1209. }
  1210. }
  1211. #undef FOR4
  1212. #undef FOR3
  1213. #undef FOR2
  1214. };
  1215. Checker::RunOptions opt;
  1216. opt.numdiff_eps = 1;
  1217. Checker(make_graph, fwd)
  1218. .run({TensorShape{2, 4, 3, 3, 2}, {1, 4, 5, 3, 2, 2}}, opt)
  1219. .run({TensorShape{3, 2, 1, 1, 1}, {2, 1, 1, 4, 3, 3}}, opt)
  1220. .run({TensorShape{4, 6, 2, 2, 2}, {2, 3, 4, 6, 5, 4}}, opt);
  1221. }
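// The brute-force deconvolution above scatters every input element into the
// output; get_shp encodes the transposed-convolution output size, e.g. for the
// depth axis:
//   OD = (ID - 1) * stride_d + (FD - 1) * dilate_d + 1 - 2 * pad_d;
// The GROUP filter layout is {group, icpg, ocpg, fd, fh, fw}, so the first run
// ({2, 4, 3, 3, 2} data with a {1, 4, 5, 3, 2, 2} filter) yields 5 output
// channels.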
  1222. TEST(TestOprDNN, Convolution3DExePolicy) {
  1223. Param3D param{Param3D::Mode::CONVOLUTION};
  1224. using Policy = opr::Convolution3D::ExecutionPolicy;
  1225. using S = Policy::Strategy;
  1226. #if MGB_ENABLE_FASTRUN
  1227. for (auto strategy : SmallVector<S>{
  1228. S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
  1229. S::PROFILE | S::HEURISTIC}) {
  1230. #else
1231. for (auto strategy : SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
  1232. #endif
  1233. using Checker = AutoOprChecker<2, 1>;
  1234. auto make_graph =
  1235. [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  1236. Policy policy;
  1237. policy.strategy = strategy;
  1238. auto out = opr::Convolution3D::make(inputs[0], inputs[1], param, policy);
  1239. return {out};
  1240. };
  1241. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1242. std::shared_ptr<HostTensorND> sh_out;
  1243. convolution3d_brute({inp.begin(), inp.end()}, sh_out, param);
  1244. dest[0] = *sh_out;
  1245. };
  1246. Checker::RunOptions opt;
  1247. opt.numdiff_eps = 1;
  1248. Checker(make_graph, fwd)
  1249. .run({TensorShape{3, 2, 3, 4, 1}, {4, 2, 2, 2, 1}}, opt)
  1250. .run({TensorShape{3, 3, 2, 6, 2}, {2, 3, 1, 4, 1}}, opt)
  1251. .run({TensorShape{1, 1, 4, 4, 4}, {2, 1, 3, 3, 3}}, opt);
  1252. }
  1253. }
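// Execution-policy note: with MGB_ENABLE_FASTRUN the PROFILE-based strategies
// (optionally combined with REPRODUCIBLE or HEURISTIC via operator|) profile
// the available algorithms, while the fallback branch only exercises the
// heuristic variants. The checker is identical for every strategy, so this
// test only asserts that each policy still reproduces the brute-force result.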
  1254. TEST(TestOprDNN, ConvBiasForward) {
  1255. using Checker2 = AutoOprChecker<2, 1>;
  1256. using Checker3 = AutoOprChecker<3, 1>;
  1257. opr::ConvBiasForward::Param param;
  1258. auto make_graph2 =
  1259. [&](const Checker2::SymInpArray& inputs) -> Checker2::SymOutArray {
  1260. return {opr::ConvBiasForward::make(inputs[0], inputs[1], param)};
  1261. };
  1262. auto make_graph3 =
  1263. [&](const Checker3::SymInpArray& inputs) -> Checker3::SymOutArray {
  1264. return {opr::ConvBiasForward::make(inputs[0], inputs[1], inputs[2], param)};
  1265. };
  1266. auto fwd2 = [&](Checker2::NumOutArray& dest, Checker2::NumInpArray inp) {
  1267. std::shared_ptr<HostTensorND> sh_out;
  1268. convolution_brute({inp[0], inp[1]}, sh_out, convert_to_conv_param(param));
  1269. dest[0] = *sh_out;
  1270. };
  1271. auto fwd3 = [&](Checker3::NumOutArray& dest, Checker3::NumInpArray inp) {
  1272. std::shared_ptr<HostTensorND> sh_out;
  1273. convolution_brute({inp[0], inp[1]}, sh_out, convert_to_conv_param(param));
  1274. dest[0] = *sh_out;
  1275. size_t N = dest[0].shape()[0];
  1276. size_t OC = dest[0].shape()[1];
  1277. size_t OH = dest[0].shape()[2];
  1278. size_t OW = dest[0].shape()[3];
  1279. auto dest_ptr = dest[0].ptr<float>();
  1280. for (size_t i = 0; i < N; i++) {
  1281. auto bias_ptr = inp[2]->ptr<float>();
  1282. for (size_t c = 0; c < OC; c++) {
  1283. for (size_t hw = 0; hw < OH * OW; hw++) {
  1284. *(dest_ptr++) += *(bias_ptr);
  1285. }
  1286. bias_ptr++;
  1287. }
  1288. }
  1289. };
  1290. auto run_with_param = [&](size_t SH = 1, size_t SW = 1, size_t PH = 0,
  1291. size_t PW = 0) {
  1292. param.pad_h = PH;
  1293. param.pad_w = PW;
  1294. param.stride_h = SH;
  1295. param.stride_w = SW;
  1296. Checker2 checker2{make_graph2, fwd2};
  1297. Checker2::RunOptions opt2;
  1298. checker2.set_output_allow_grad(0, false);
  1299. Checker3 checker3{make_graph3, fwd3};
  1300. Checker3::RunOptions opt3;
  1301. checker3.set_output_allow_grad(0, false);
  1302. auto run = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW, size_t FH,
  1303. size_t FW) {
  1304. auto opr = megdnn_naive_handle()
  1305. ->create_operator<megdnn::ConvolutionForward>();
  1306. opr->param() = convert_to_conv_param(param);
  1307. TensorLayout dest_layout;
  1308. opr->deduce_layout(
  1309. {{N, IC, IH, IW}, dtype::Float32()},
  1310. {{OC, IC, FH, FW}, dtype::Float32()}, dest_layout);
  1311. checker2.run({TensorShape{N, IC, IH, IW}, {OC, IC, FH, FW}}, opt2);
  1312. checker3.run(
  1313. {TensorShape{N, IC, IH, IW}, {OC, IC, FH, FW}, {1, OC, 1, 1}},
  1314. opt3);
  1315. };
  1316. run(1, 1, 1, 5, 5, 1, 1);
  1317. run(1, 1, 1, 5, 5, 3, 3);
  1318. run(2, 3, 4, 5, 5, 3, 3);
  1319. run(3, 3, 4, 224, 223, 3, 3);
  1320. run(3, 3, 4, 224, 223, 2, 2);
  1321. };
  1322. run_with_param();
  1323. run_with_param(2, 2, 3, 3);
  1324. run_with_param(3, 2, 1, 2);
  1325. run_with_param(2, 3, 2, 2);
  1326. }
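// Bias handling in fwd3 above: the bias tensor has shape {1, OC, 1, 1} and is
// broadcast over the batch and spatial dimensions, i.e. for every sample the
// same per-channel value is added to all OH * OW positions; conceptually
//   dest[n][c][h][w] += bias[0][c][0][0];
// which is exactly what the pointer-walking loop implements.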
  1327. TEST(TestOprDNN, ConvBiasForwardWithZ) {
  1328. REQUIRE_GPU(1);
  1329. using Checker4 = AutoOprChecker<4, 1>;
  1330. opr::ConvBiasForward::Param param;
  1331. auto make_graph4 =
  1332. [&](const Checker4::SymInpArray& inputs) -> Checker4::SymOutArray {
  1333. return {opr::ConvBiasForward::make(
  1334. inputs[0], inputs[1], inputs[2], inputs[3], param)};
  1335. };
  1336. auto fwd4 = [&](Checker4::NumOutArray& dest, Checker4::NumInpArray inp) {
  1337. std::shared_ptr<HostTensorND> sh_out;
  1338. convolution_brute({inp[0], inp[1]}, sh_out, convert_to_conv_param(param));
  1339. dest[0] = *sh_out;
  1340. size_t N = dest[0].shape()[0];
  1341. size_t OC = dest[0].shape()[1];
  1342. size_t OH = dest[0].shape()[2];
  1343. size_t OW = dest[0].shape()[3];
  1344. auto dest_ptr = dest[0].ptr<float>();
  1345. float* z_ptr = inp[3]->ptr<float>();
  1346. for (size_t i = 0; i < N; i++) {
  1347. auto bias_ptr = inp[2]->ptr<float>();
  1348. for (size_t c = 0; c < OC; c++) {
  1349. for (size_t hw = 0; hw < OH * OW; hw++) {
  1350. *(dest_ptr++) += *(bias_ptr) + *(z_ptr++);
  1351. }
  1352. bias_ptr++;
  1353. }
  1354. }
  1355. };
  1356. auto run_with_param = [&](size_t SH = 1, size_t SW = 1, size_t PH = 0,
  1357. size_t PW = 0) {
  1358. param.pad_h = PH;
  1359. param.pad_w = PW;
  1360. param.stride_h = SH;
  1361. param.stride_w = SW;
  1362. Checker4 checker4{make_graph4, fwd4};
  1363. Checker4::RunOptions opt4;
  1364. checker4.set_output_allow_grad(0, false);
  1365. auto run = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW, size_t FH,
  1366. size_t FW) {
  1367. auto opr = megdnn_naive_handle()
  1368. ->create_operator<megdnn::ConvolutionForward>();
  1369. opr->param() = convert_to_conv_param(param);
  1370. TensorLayout dest_layout;
  1371. opr->deduce_layout(
  1372. {{N, IC, IH, IW}, dtype::Float32()},
  1373. {{OC, IC, FH, FW}, dtype::Float32()}, dest_layout);
  1374. checker4.run(
  1375. {TensorShape{N, IC, IH, IW},
  1376. {OC, IC, FH, FW},
  1377. {1, OC, 1, 1},
  1378. {N, OC, dest_layout[2], dest_layout[3]}},
  1379. opt4);
  1380. };
  1381. run(1, 1, 1, 5, 5, 3, 3);
  1382. run(2, 3, 4, 5, 5, 3, 3);
  1383. run(3, 3, 4, 224, 223, 3, 3);
  1384. run(3, 3, 4, 224, 223, 2, 2);
  1385. };
  1386. run_with_param();
  1387. run_with_param(2, 2, 3, 3);
  1388. run_with_param(3, 2, 1, 2);
  1389. run_with_param(2, 3, 2, 2);
  1390. }
  1391. TEST(TestOprDNN, ConvBiasINT8x8xX_NCHW4) {
  1392. using Checker = AutoOprChecker<3, 1>;
  1393. using Param = opr::ConvBias::Param;
  1394. opr::ConvBiasForward::Param param;
  1395. auto make_quantized = [&](SymbolVar x, const DType& dtype) {
  1396. return opr::TypeCvt::make(x, dtype);
  1397. };
  1398. auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  1399. auto conv_param = convert_to_conv_param(param);
  1400. auto y = opr::Convolution::make(
  1401. make_quantized(inputs[0], dtype::QuantizedS8(0.3f)),
  1402. make_quantized(inputs[1], dtype::QuantizedS8(0.1f)), conv_param);
  1403. y = y + make_quantized(inputs[2], dtype::QuantizedS32(0.03f));
  1404. if (param.nonlineMode == Param::NonlineMode::RELU)
  1405. y = opr::Elemwise::make({y}, {opr::Elemwise::Mode::RELU});
  1406. y = opr::TypeCvt::make(y, dtype::QuantizedS8(0.5f));
  1407. return {opr::TypeCvt::make(y, dtype::Float32())};
  1408. };
  1409. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1410. auto graph = ComputingGraph::make();
  1411. Checker::SymInpArray inputs;
  1412. for (size_t i = 0; i < inp.size(); ++i) {
  1413. inputs[i] = opr::Host2DeviceCopy::make(*graph, inp[i]);
  1414. }
  1415. auto options = gopt::OptimizeForInferenceOptions{};
  1416. options.enable_fuse_conv_bias_nonlinearity();
  1417. auto y = gopt::optimize_for_inference({make_graph(inputs)[0]}, options)[0];
  1418. auto func = graph->compile({make_callback_copy(y, dest[0])});
  1419. func->execute();
  1420. func->wait();
  1421. };
  1422. auto run_with_param = [&](size_t SH = 1, size_t SW = 1, size_t PH = 0,
  1423. size_t PW = 0, size_t group = 1) {
  1424. param.pad_h = PH;
  1425. param.pad_w = PW;
  1426. param.stride_h = SH;
  1427. param.stride_w = SW;
  1428. param.format = Param::Format::NCHW4;
  1429. if (group != 1)
  1430. param.sparse = Param::Sparse::GROUP;
  1431. Checker checker{make_graph, fwd, CompNode::load("cpu0")};
  1432. Checker::RunOptions opt;
  1433. checker.set_output_allow_grad(0, false);
  1434. auto run = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW, size_t FH,
  1435. size_t FW) {
  1436. mgb_assert(IC % 4 == 0 && OC % 4 == 0);
  1437. checker.run(
  1438. {TensorShape{N, group * IC / 4, IH, IW, 4},
  1439. {group, OC, IC / 4, FH, FW, 4},
  1440. {1, group * OC / 4, 1, 1, 4}},
  1441. opt);
  1442. };
  1443. run(1, 8, 8, 56, 56, 3, 3);
  1444. run(1, 8, 8, 56, 56, 3, 3);
  1445. run(1, 8, 8, 56, 56, 3, 3);
  1446. };
  1447. run_with_param(1, 1, 1, 1, 8);
  1448. run_with_param();
  1449. run_with_param(2, 2, 3, 3);
  1450. run_with_param(3, 2, 1, 2);
  1451. run_with_param(2, 3, 2, 2);
  1452. }
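// NCHW4 shape convention used above (a reminder, not a new constraint):
// activations are packed as {N, C/4, H, W, 4} and dense filters as
// {OC, IC/4, FH, FW, 4}; with GROUP sparsity a leading group dimension is
// prepended. That is why the checker feeds
//   {N, group * IC/4, IH, IW, 4}, {group, OC, IC/4, FH, FW, 4},
//   {1, group * OC/4, 1, 1, 4}
// and asserts IC % 4 == 0 && OC % 4 == 0 before running.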
  1453. TEST(TestOprDNN, ConvolutionDTypeInference) {
  1454. Param param;
  1455. param.mode = Mode::CONVOLUTION;
  1456. auto cn = CompNode::load("cpu0");
  1457. auto graph = ComputingGraph::make();
  1458. HostTensorND inp_host{
  1459. cn, {1, 3, 7, 7}, dtype::Quantized8Asymm(0.233f, (uint8_t)123)};
  1460. HostTensorND filt_host{
  1461. cn, {8, 3, 1, 1}, dtype::Quantized8Asymm(0.874f, (uint8_t)234)};
  1462. auto inp = opr::ImmutableTensor::make(*graph, inp_host);
  1463. auto filt = opr::ImmutableTensor::make(*graph, filt_host);
  1464. auto opr = opr::Convolution::make(inp, filt, param);
  1465. ASSERT_EQ(opr.dtype().enumv(), DTypeEnum::QuantizedS32);
1466. // This has to be EQ instead of NEAR: the output scale is expected to be exactly the product of the input and filter scales
  1467. EXPECT_EQ(opr.dtype().param<dtype::QuantizedS32>().scale, 0.233f * 0.874f);
  1468. inp_host = {cn, {1, 3, 7, 7}, dtype::QuantizedS8(0.1234f)};
  1469. filt_host = {cn, {8, 3, 1, 1}, dtype::QuantizedS8(0.2345f)};
  1470. inp = opr::ImmutableTensor::make(*graph, inp_host);
  1471. filt = opr::ImmutableTensor::make(*graph, filt_host);
  1472. opr = opr::Convolution::make(inp, filt, param);
  1473. ASSERT_EQ(opr.dtype().enumv(), DTypeEnum::QuantizedS32);
  1474. EXPECT_EQ(opr.dtype().param<dtype::QuantizedS32>().scale, 0.1234f * 0.2345f);
  1475. inp_host = {cn, {1, 3, 7, 7}, dtype::Int8()};
  1476. filt_host = {cn, {8, 3, 1, 1}, dtype::Int8()};
  1477. inp = opr::ImmutableTensor::make(*graph, inp_host);
  1478. filt = opr::ImmutableTensor::make(*graph, filt_host);
  1479. opr = opr::Convolution::make(inp, filt, param);
  1480. ASSERT_EQ(opr.dtype().enumv(), DTypeEnum::Int32);
  1481. }
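// Dtype inference rule exercised above: quantized 8-bit inputs (Quantized8Asymm
// or QuantizedS8) accumulate into QuantizedS32 whose scale is the product of
// the input and filter scales, e.g. dtype::QuantizedS32(0.233f * 0.874f) for
// the first case, while plain Int8 inputs accumulate into Int32.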
  1482. TEST(TestOprDNN, ConvBiasINT8x8xXDTypeInference) {
  1483. float inp_scale = 1.926f;
  1484. float filt_scale = 0.817f;
  1485. float bias_scale = inp_scale * filt_scale;
  1486. opr::ConvBias::Param param;
  1487. param.mode = Mode::CONVOLUTION;
  1488. auto cn = CompNode::load("cpu0");
  1489. auto graph = ComputingGraph::make();
  1490. HostTensorND inp_host{cn, {1, 3, 7, 7}, dtype::QuantizedS8(inp_scale)};
  1491. HostTensorND filt_host{cn, {8, 3, 1, 1}, dtype::QuantizedS8(filt_scale)};
  1492. DType output_dtype = dtype::QuantizedS8(bias_scale);
  1493. HostTensorND bias_host{cn, {1, 3, 7, 7}, dtype::QuantizedS32(bias_scale)};
  1494. auto inp = opr::ImmutableTensor::make(*graph, inp_host);
  1495. auto filt = opr::ImmutableTensor::make(*graph, filt_host);
1496. auto bias = opr::ImmutableTensor::make(*graph, bias_host);
  1497. auto opr = opr::ConvBiasForward::make(
  1498. inp, filt, bias, param, {}, OperatorNodeConfig{output_dtype});
  1499. ASSERT_EQ(opr.dtype().enumv(), DTypeEnum::QuantizedS8);
  1500. EXPECT_EQ(opr.dtype().param<dtype::QuantizedS8>().scale, bias_scale);
  1501. }
  1502. TEST(TestOprDNN, ConvBiasINT8x8xXSerialization) {
  1503. using namespace serialization;
  1504. float inp_scale = 1.926f;
  1505. float filt_scale = 0.817f;
  1506. float bias_scale = inp_scale * filt_scale;
  1507. DType output_dtype = dtype::QuantizedS8(bias_scale);
  1508. auto fname = output_file("ConvBiasINT8x8xXTest");
  1509. auto dump = [&]() {
  1510. opr::ConvBias::Param param;
  1511. param.mode = Mode::CONVOLUTION;
  1512. auto cn = CompNode::load("cpu0");
  1513. auto graph = ComputingGraph::make();
  1514. HostTensorND inp_host{cn, {1, 3, 7, 7}, dtype::QuantizedS8(inp_scale)};
  1515. HostTensorND filt_host{cn, {8, 3, 1, 1}, dtype::QuantizedS8(filt_scale)};
  1516. HostTensorND bias_host{cn, {1, 3, 7, 7}, dtype::QuantizedS32(bias_scale)};
  1517. auto inp = opr::ImmutableTensor::make(*graph, inp_host);
  1518. auto filt = opr::ImmutableTensor::make(*graph, filt_host);
1519. auto bias = opr::ImmutableTensor::make(*graph, bias_host);
  1520. auto opr = opr::ConvBiasForward::make(
  1521. inp, filt, bias, param, {}, OperatorNodeConfig{output_dtype});
  1522. auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
  1523. auto rst = dumper->dump({opr});
  1524. ASSERT_EQ(rst.outputs.size(), 1u);
  1525. };
  1526. auto load = [&]() {
  1527. auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
  1528. auto rst = loader->load();
  1529. ASSERT_EQ(rst.output_var_list.size(), 1u);
  1530. EXPECT_EQ(rst.output_var_list[0].dtype(), output_dtype);
  1531. };
  1532. dump();
  1533. load();
  1534. }
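// The serialization tests share one pattern: dump() builds a small graph on
// ImmutableTensor inputs and writes it via GraphDumper to a temporary file,
// then load() reads it back via GraphLoader and asserts that exactly one
// output var survives (here also that it kept the requested QuantizedS8
// output dtype).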
  1535. TEST(TestOprDNN, LocalShareForward) {
  1536. REQUIRE_GPU(1);
  1537. using Checker = AutoOprChecker<2, 1>;
  1538. using Param = opr::LocalShare::Param;
  1539. Param param;
  1540. param.mode = Param::Mode::CROSS_CORRELATION;
  1541. param.sparse = Param::Sparse::DENSE;
  1542. auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  1543. return {opr::LocalShare::make(inputs[0], inputs[1], param)};
  1544. };
  1545. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1546. mgb_assert(inp.size() == 2);
  1547. mgb_assert(dest.size() == 1);
  1548. std::shared_ptr<HostTensorND> out;
  1549. local_share_brute({inp[0], inp[1]}, out, param);
  1550. dest[0] = *out;
  1551. };
  1552. auto run_with_param = [&](size_t fh = 3, size_t fw = 3, size_t sh = 1,
  1553. size_t sw = 1, size_t sgh = 3, size_t sgw = 3) {
  1554. size_t ph = fh / 2, pw = fw / 2;
  1555. param.pad_h = ph, param.pad_w = pw;
  1556. param.stride_h = sh, param.stride_w = sw, param.spatial_groups_h = sgh,
  1557. param.spatial_groups_w = sgw;
  1558. Checker checker{make_graph, fwd};
  1559. Checker::RunOptions opt;
  1560. checker.set_output_allow_grad(0, false);
  1561. checker.set_input_dtype(0, dtype::Float32());
  1562. checker.set_input_dtype(1, dtype::Float32());
  1563. auto run = [&](size_t n, size_t ci, size_t co, size_t hi, size_t wi) {
  1564. size_t ho = (hi + 2 * ph - fh) / sh + 1;
  1565. size_t wo = (wi + 2 * pw - fw) / sw + 1;
  1566. if (ho % sgh != 0 || wo % sgw != 0)
  1567. return;
  1568. checker.run(
  1569. {TensorShape{n, ci, hi, wi}, TensorShape{sgh, sgw, ci, fh, fw, co}},
  1570. opt);
  1571. };
  1572. run(32, 2, 7, 24, 24);
  1573. run(16, 2, 7, 24, 24);
  1574. run(32, 2, 8, 12, 12);
  1575. run(16, 2, 9, 6, 6);
  1576. };
  1577. run_with_param(1, 1, 1, 1, 3, 3);
  1578. run_with_param(3, 3, 1, 1, 2, 2);
  1579. run_with_param(5, 5, 1, 1, 2, 2);
  1580. run_with_param(7, 7, 1, 1, 2, 2);
  1581. run_with_param(1, 1, 2, 2, 3, 3);
  1582. run_with_param(3, 3, 2, 2, 2, 2);
  1583. run_with_param(5, 5, 1, 1, 2, 2);
  1584. run_with_param(7, 7, 1, 1, 2, 2);
  1585. }
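// LocalShare layout used above: the filter is {sgh, sgw, ci, fh, fw, co}, i.e.
// the output plane is split into sgh x sgw spatial groups and each group uses
// its own fh x fw kernels. The helper therefore skips shapes whose output size
// is not divisible by the group count:
//   ho = (hi + 2 * ph - fh) / sh + 1;   // must satisfy ho % sgh == 0
//   wo = (wi + 2 * pw - fw) / sw + 1;   // must satisfy wo % sgw == 0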
  1586. TEST(TestOprDNN, LocalShareForwardGrad) {
  1587. REQUIRE_GPU(1);
  1588. using Checker = AutoOprChecker<2, 1>;
  1589. using Param = opr::LocalShare::Param;
  1590. Param param;
  1591. param.mode = Param::Mode::CROSS_CORRELATION;
  1592. param.sparse = Param::Sparse::DENSE;
  1593. auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  1594. return {opr::LocalShare::make(inputs[0], inputs[1], param)};
  1595. };
  1596. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1597. mgb_assert(inp.size() == 2);
  1598. mgb_assert(dest.size() == 1);
  1599. std::shared_ptr<HostTensorND> out;
  1600. local_share_brute({inp[0], inp[1]}, out, param);
  1601. dest[0] = *out;
  1602. };
  1603. auto run_with_param = [&](size_t fh = 3, size_t fw = 3, size_t sh = 1,
  1604. size_t sw = 1, size_t sgh = 3, size_t sgw = 3) {
  1605. size_t ph = fh / 2, pw = fw / 2;
  1606. param.pad_h = ph, param.pad_w = pw;
  1607. param.stride_h = sh, param.stride_w = sw, param.spatial_groups_h = sgh,
  1608. param.spatial_groups_w = sgw;
  1609. Checker checker{make_graph, fwd};
  1610. Checker::RunOptions opt;
  1611. checker.set_output_allow_grad(0, true);
  1612. opt.numdiff_max_err = 1e-1;
  1613. checker.set_input_dtype(0, dtype::Float32());
  1614. checker.set_input_dtype(1, dtype::Float32());
  1615. auto run = [&](size_t n, size_t ci, size_t co, size_t hi, size_t wi) {
  1616. size_t ho = (hi + 2 * ph - fh) / sh + 1;
  1617. size_t wo = (wi + 2 * pw - fw) / sw + 1;
  1618. if (ho % sgh != 0 || wo % sgw != 0)
  1619. return;
  1620. checker.run(
  1621. {TensorShape{n, ci, hi, wi}, TensorShape{sgh, sgw, ci, fh, fw, co}},
  1622. opt);
  1623. };
  1624. run(4, 2, 8, 24, 24);
  1625. run(8, 2, 4, 6, 6);
  1626. run(16, 4, 8, 12, 12);
  1627. run(4, 4, 8, 12, 12);
  1628. };
  1629. run_with_param(1, 1, 1, 1, 3, 3);
  1630. run_with_param(1, 1, 2, 2, 3, 3);
  1631. run_with_param(3, 3, 2, 2, 2, 2);
  1632. }
  1633. TEST(TestOprDNN, LocalShareForwardExecPolicy) {
  1634. REQUIRE_GPU(1);
  1635. using Checker = AutoOprChecker<2, 1>;
  1636. using Policy = opr::LocalShare::ExecutionPolicy;
  1637. using S = Policy::Strategy;
  1638. using Param = opr::LocalShare::Param;
  1639. Param param;
  1640. param.mode = Param::Mode::CROSS_CORRELATION;
  1641. param.sparse = Param::Sparse::DENSE;
  1642. int nr_get = 0;
  1643. auto on_get = [&nr_get](
  1644. const std::string&, const void*, size_t, const void*,
  1645. size_t) { ++nr_get; };
  1646. PersistentCacheHook cache_hook{on_get};
  1647. #if MGB_ENABLE_FASTRUN
  1648. for (auto strategy : SmallVector<S>{
  1649. S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
  1650. S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTIMIZED}) {
  1651. #else
1652. for (auto strategy : SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
  1653. #endif
  1654. auto make_graph =
  1655. [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  1656. Policy policy;
  1657. policy.strategy = strategy;
  1658. return {opr::LocalShare::make(inputs[0], inputs[1], param, policy)};
  1659. };
  1660. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1661. mgb_assert(inp.size() == 2);
  1662. mgb_assert(dest.size() == 1);
  1663. std::shared_ptr<HostTensorND> out;
  1664. local_share_brute({inp[0], inp[1]}, out, param);
  1665. dest[0] = *out;
  1666. };
  1667. auto run_with_param = [&](size_t fh = 3, size_t fw = 3, size_t sh = 1,
  1668. size_t sw = 1, size_t sgh = 3, size_t sgw = 3) {
  1669. megdnn::AlgorithmCache::instance().clear();
  1670. size_t ph = fh / 2, pw = fw / 2;
  1671. param.pad_h = ph, param.pad_w = pw;
  1672. param.stride_h = sh, param.stride_w = sw, param.spatial_groups_h = sgh,
  1673. param.spatial_groups_w = sgw;
  1674. Checker checker{make_graph, fwd};
  1675. Checker::RunOptions opt;
  1676. checker.set_output_allow_grad(0, false);
  1677. checker.set_input_dtype(0, dtype::Float32());
  1678. checker.set_input_dtype(1, dtype::Float32());
  1679. nr_get = 0;
  1680. opt.outputs_max_err = 1e-3;
  1681. auto run = [&](size_t n, size_t ci, size_t co, size_t hi, size_t wi) {
  1682. size_t ho = (hi + 2 * ph - fh) / sh + 1;
  1683. size_t wo = (wi + 2 * pw - fw) / sw + 1;
  1684. if (ho % sgh != 0 || wo % sgw != 0)
  1685. return;
  1686. checker.run(
  1687. {TensorShape{n, ci, hi, wi},
  1688. TensorShape{sgh, sgw, ci, fh, fw, co}},
  1689. opt);
  1690. };
  1691. run(32, 4, 8, 24, 24);
  1692. run(32, 4, 8, 12, 12);
  1693. run(16, 4, 8, 12, 12);
  1694. run(32, 4, 8, 6, 6);
  1695. if (strategy == S::HEURISTIC) {
  1696. ASSERT_EQ(0, nr_get);
  1697. } else {
  1698. ASSERT_LT(0, nr_get);
  1699. }
  1700. };
  1701. run_with_param(1, 1, 1, 1, 3, 3);
  1702. run_with_param(3, 3, 1, 1, 2, 2);
  1703. run_with_param(5, 5, 1, 1, 2, 2);
  1704. run_with_param(7, 7, 1, 1, 2, 2);
  1705. run_with_param(1, 1, 2, 2, 3, 3);
  1706. run_with_param(3, 3, 2, 2, 2, 2);
  1707. run_with_param(5, 5, 1, 1, 2, 2);
  1708. run_with_param(7, 7, 1, 1, 2, 2);
  1709. }
  1710. }
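// Cache accounting above: PersistentCacheHook counts every lookup into the
// persistent algorithm cache. A pure HEURISTIC strategy is expected to pick an
// algorithm without consulting that cache (nr_get stays 0), whereas any
// PROFILE-based strategy queries it, so nr_get must grow; clearing
// megdnn::AlgorithmCache at the start of run_with_param keeps the counts
// per parameter set.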
  1711. TEST(TestOprDNN, LocalShareSerialization) {
  1712. using namespace serialization;
  1713. auto fname = output_file("LocalShareForwardTest");
  1714. auto dump = [&]() {
  1715. opr::LocalShare::Param param;
  1716. param.mode = Mode::CROSS_CORRELATION;
  1717. param.stride_h = param.stride_w = 1;
  1718. param.pad_h = param.pad_w = 0;
  1719. param.spatial_groups_h = param.spatial_groups_w = 3;
  1720. auto cn = CompNode::load("cpu0");
  1721. auto graph = ComputingGraph::make();
  1722. HostTensorND inp_host{cn, {32, 4, 24, 24}, dtype::Float32()};
  1723. HostTensorND filt_host{cn, {3, 3, 4, 1, 1, 8}, dtype::Float32()};
  1724. auto inp = opr::ImmutableTensor::make(*graph, inp_host);
  1725. auto filt = opr::ImmutableTensor::make(*graph, filt_host);
  1726. auto opr = opr::LocalShareForward::make(inp, filt, param, {});
  1727. auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
  1728. auto rst = dumper->dump({opr});
  1729. ASSERT_EQ(rst.outputs.size(), 1u);
  1730. };
  1731. auto load = [&]() {
  1732. auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
  1733. auto rst = loader->load();
  1734. ASSERT_EQ(rst.output_var_list.size(), 1u);
  1735. };
  1736. dump();
  1737. load();
  1738. }
  1739. TEST(TestOprDNN, DeformableConvForward) {
  1740. REQUIRE_GPU(1);
  1741. using Checker = AutoOprChecker<4, 1>;
  1742. using Policy = opr::DeformableConvForward::ExecutionPolicy;
  1743. using S = Policy::Strategy;
  1744. using Param = opr::DeformableConvForward::Param;
  1745. Param param;
  1746. #if MGB_ENABLE_FASTRUN
  1747. for (auto strategy : SmallVector<S>{
  1748. S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
  1749. S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTIMIZED}) {
  1750. #else
1751. for (auto strategy : SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
  1752. #endif
  1753. auto make_graph =
  1754. [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  1755. Policy policy;
  1756. policy.strategy = strategy;
  1757. return {opr::DeformableConvForward::make(
  1758. inputs[0], inputs[1], inputs[2], inputs[3], param, policy)};
  1759. };
  1760. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1761. auto opr = megdnn_naive_handle()
  1762. ->create_operator<megdnn::DeformableConvForward>();
  1763. opr->param() = param;
  1764. TensorLayout dest_layout;
  1765. opr->deduce_layout(
  1766. inp[0]->layout(), inp[1]->layout(), inp[2]->layout(),
  1767. inp[3]->layout(), dest_layout);
  1768. std::vector<dt_byte> workspace(opr->get_workspace_in_bytes(
  1769. inp[0]->layout(), inp[1]->layout(), inp[2]->layout(),
  1770. inp[3]->layout(), dest_layout));
  1771. dest[0].dtype(dtype::Float32())
  1772. .comp_node(inp[0]->comp_node())
  1773. .resize(dest_layout);
  1774. opr->exec(
  1775. inp[0]->as_megdnn(), inp[1]->as_megdnn(), inp[2]->as_megdnn(),
  1776. inp[3]->as_megdnn(), dest[0].as_megdnn(),
  1777. {workspace.data(), workspace.size()});
  1778. };
  1779. auto run_with_param = [&](size_t fh, size_t fw, size_t sh, size_t sw, size_t dh,
  1780. size_t dw, size_t group, size_t deformable_group) {
  1781. Checker checker{make_graph, fwd};
  1782. size_t ph = fh / 2, pw = fw / 2;
  1783. param.pad_h = ph, param.pad_w = pw;
  1784. param.stride_h = sh, param.stride_w = sw;
  1785. param.dilate_h = dh, param.dilate_w = dw;
  1786. param.format = Param::Format::NCHW;
  1787. param.mode = Param::Mode::CROSS_CORRELATION;
  1788. param.sparse = Param::Sparse::DENSE;
  1789. if (group > 1)
  1790. param.sparse = Param::Sparse::GROUP;
  1791. Checker::RunOptions opt;
  1792. float DELTA = 1e-3;
  1793. opt.numdiff_eps = DELTA;
  1794. opt.numdiff_max_err = 1e-1;
  1795. auto gen_off = [DELTA](HostTensorND& off, float l = -2.f, float h = 2.f) {
  1796. RNGxorshf rng{next_rand_seed()};
  1797. auto elems = off.shape().total_nr_elems();
  1798. auto ptr = off.ptr<float>();
  1799. auto rand_real = [](RNGxorshf& rng, float lo, float hi) {
  1800. std::uniform_real_distribution<float> dist(lo, hi);
  1801. return dist(rng);
  1802. };
  1803. for (size_t i = 0; i < elems; ++i) {
  1804. do {
  1805. float val = rand_real(rng, l, h);
  1806. if (abs(floor(val + 2 * DELTA) - floor(val)) <= 1e-6f &&
  1807. abs(floor(val - 2 * DELTA) - floor(val)) <= 1e-6f) {
  1808. ptr[i] = val;
  1809. break;
  1810. }
  1811. } while (true);
  1812. }
  1813. };
1814. //! generate offsets that avoid values near integers, because the
1815. //! bilinear sampling function is not differentiable there
  1816. checker.set_input_generator(2, gen_off);
  1817. checker.set_input_dtype(0, dtype::Float32());
  1818. checker.set_input_dtype(1, dtype::Float32());
  1819. checker.set_input_dtype(2, dtype::Float32());
  1820. checker.set_input_dtype(3, dtype::Float32());
  1821. auto run = [&](size_t n, size_t ih, size_t iw, size_t icpg, size_t ocpg) {
  1822. size_t oh = (ih + 2 * ph - fh) / sh + 1;
  1823. size_t ow = (iw + 2 * pw - fw) / sw + 1;
  1824. checker.run(
  1825. {TensorShape{n, group * icpg, ih, iw},
  1826. (param.sparse == Param::Sparse::GROUP)
  1827. ? TensorShape{group, ocpg, icpg, fh, fw}
  1828. : TensorShape{group * ocpg, group * icpg, fh, fw},
  1829. {n, 2 * deformable_group * fh * fw, oh, ow},
  1830. {n, deformable_group * fh * fw, oh, ow}},
  1831. opt);
  1832. };
  1833. run(1, 3, 3, 2, 1);
  1834. run(2, 3, 3, 2, 2);
  1835. run(1, 5, 5, 2, 1);
  1836. };
  1837. // run_with_param(1, 1, 1, 1, 1, 1, 1, 1);
  1838. run_with_param(3, 3, 1, 1, 1, 1, 2, 2);
  1839. // run_with_param(5, 5, 1, 1, 1, 1, 2, 2);
  1840. }
  1841. }
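// Offset generator rationale (gen_off above): numerical differentiation
// perturbs each offset by +/- DELTA, and bilinear sampling is not
// differentiable exactly at integer coordinates, so candidate values are
// redrawn until floor(val + 2 * DELTA) and floor(val - 2 * DELTA) both agree
// with floor(val), i.e. the whole perturbation interval stays inside one cell.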
  1842. TEST(TestOprDNN, DeformableConvSerialization) {
  1843. using namespace serialization;
  1844. auto fname = output_file("DeformableConvTest");
  1845. auto dump = [&]() {
  1846. using Param = opr::DeformableConvForward::Param;
  1847. Param param;
  1848. size_t n = 16, ocpg = 2, icpg = 4;
  1849. size_t ih = 24, iw = 24, fh = 3, fw = 3, ph = 2, pw = 2, sh = 1, sw = 1, dh = 1,
  1850. dw = 1;
  1851. size_t group = 1, deformable_group = 1;
  1852. size_t oh = (ih + 2 * ph - fh) / sh + 1;
  1853. size_t ow = (iw + 2 * pw - fw) / sw + 1;
  1854. param.pad_h = ph, param.pad_w = pw;
  1855. param.stride_h = sh, param.stride_w = sw;
  1856. param.dilate_h = dh, param.dilate_w = dw;
  1857. param.format = Param::Format::NCHW;
  1858. param.mode = Param::Mode::CROSS_CORRELATION;
  1859. param.sparse = Param::Sparse::DENSE;
  1860. auto cn = CompNode::load("cpu0");
  1861. auto graph = ComputingGraph::make();
  1862. HostTensorND inp_host{cn, {n, group * icpg, ih, iw}, dtype::Float32()};
  1863. HostTensorND filt_host{
  1864. cn, {group * ocpg, group * icpg, fh, fw}, dtype::Float32()};
  1865. HostTensorND offset_host{
  1866. cn, {n, 2 * deformable_group * fh * fw, oh, ow}, dtype::Float32()};
  1867. HostTensorND mask_host{
  1868. cn, {n, deformable_group * fh * fw, oh, ow}, dtype::Float32()};
  1869. auto inp = opr::ImmutableTensor::make(*graph, inp_host);
  1870. auto filt = opr::ImmutableTensor::make(*graph, filt_host);
  1871. auto offset = opr::ImmutableTensor::make(*graph, offset_host);
  1872. auto mask = opr::ImmutableTensor::make(*graph, mask_host);
  1873. auto opr = opr::DeformableConvForward::make(
  1874. inp, filt, offset, mask, param, {}, {});
  1875. auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
  1876. auto rst = dumper->dump({opr});
  1877. ASSERT_EQ(rst.outputs.size(), 1u);
  1878. };
  1879. auto load = [&]() {
  1880. auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
  1881. auto rst = loader->load();
  1882. ASSERT_EQ(rst.output_var_list.size(), 1u);
  1883. };
  1884. dump();
  1885. load();
  1886. }
  1887. #if MGB_CUDA
  1888. TEST(TestOprDNN, BatchConvBiasForward) {
  1889. REQUIRE_GPU(1);
  1890. auto cn = CompNode::load("gpu0");
  1891. cn.activate();
  1892. REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);
  1893. using Checker = AutoOprChecker<3, 1>;
  1894. using Policy = opr::BatchConvBiasForward::ExecutionPolicy;
  1895. using S = Policy::Strategy;
  1896. using Param = opr::BatchConvBiasForward::Param;
  1897. Param param;
  1898. param.format = Param::Format::NCHW4;
  1899. param.mode = Param::Mode::CROSS_CORRELATION;
  1900. param.sparse = Param::Sparse::DENSE;
  1901. #if MGB_ENABLE_FASTRUN
  1902. for (auto strategy : SmallVector<S>{
  1903. S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
  1904. S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTIMIZED}) {
  1905. #else
1906. for (auto strategy : SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
  1907. #endif
  1908. auto make_quantized = [&](SymbolVar x, const DType& dtype) {
  1909. return opr::TypeCvt::make(x, dtype);
  1910. };
  1911. auto make_graph =
  1912. [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  1913. Policy policy;
  1914. policy.strategy = strategy;
  1915. auto conv_bias = opr::BatchConvBiasForward::make(
  1916. make_quantized(inputs[0], dtype::QuantizedS8{1.1f}),
  1917. make_quantized(inputs[1], dtype::QuantizedS8{1.2f}),
  1918. make_quantized(inputs[2], dtype::QuantizedS32{1.1f * 1.2f}), param,
  1919. policy, OperatorNodeConfig{dtype::QuantizedS8{1.3f}});
  1920. return {opr::TypeCvt::make(conv_bias, dtype::Float32())};
  1921. };
  1922. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1923. mgb_assert(inp.size() == 3);
  1924. mgb_assert(dest.size() == 1);
  1925. auto graph = ComputingGraph::make();
  1926. Checker::SymInpArray inputs;
  1927. for (size_t i = 0; i < inp.size(); ++i) {
  1928. inputs[i] = opr::Host2DeviceCopy::make(*graph, inp[i]);
  1929. }
  1930. auto src = make_quantized(inputs[0], dtype::QuantizedS8{1.1f}),
  1931. filter = make_quantized(inputs[1], dtype::QuantizedS8{1.2f}),
  1932. bias = make_quantized(inputs[2], dtype::QuantizedS32{1.1f * 1.2f});
  1933. {
  1934. auto xshp = opr::GetVarShape::make(src);
  1935. auto cv = [&src](int v) { return src.make_scalar(v); };
  1936. auto sub = [&xshp, &cv](int idx) {
  1937. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  1938. };
  1939. auto tshp = opr::Concat::make(
  1940. {cv(1), sub(0) * sub(1), sub(2), sub(3), sub(4)}, 0);
  1941. src = opr::Reshape::make(src, tshp);
  1942. }
  1943. auto conv_param = convert_to_conv_param(param);
  1944. conv_param.sparse = opr::BatchConvBias::Param::Sparse::GROUP;
  1945. auto y = opr::Convolution::make(src, filter, conv_param);
  1946. {
  1947. auto fshp = opr::GetVarShape::make(filter);
  1948. auto batch = opr::IndexAt::make(fshp, {{0, filter.make_scalar(0)}});
  1949. auto xshp = opr::GetVarShape::make(y);
  1950. auto cv = [&y](int v) { return y.make_scalar(v); };
  1951. auto sub = [&xshp, &cv](int idx) {
  1952. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  1953. };
  1954. auto tshp = opr::Concat::make(
  1955. {batch, sub(1) / batch, sub(2), sub(3), sub(4)}, 0);
  1956. y = opr::Reshape::make(y, tshp);
  1957. }
  1958. y = y + bias;
  1959. y = opr::TypeCvt::make(y, dtype::QuantizedS8{1.3f});
  1960. y = opr::TypeCvt::make(y, dtype::Float32());
  1961. auto func = graph->compile({make_callback_copy(y, dest[0])});
  1962. func->execute();
  1963. func->wait();
  1964. };
  1965. auto run_with_param = [&](size_t sh = 1, size_t sw = 1) {
  1966. size_t fh = 1;
  1967. size_t fw = 1;
  1968. size_t ph = fh / 2, pw = fw / 2;
  1969. param.pad_h = ph, param.pad_w = pw;
  1970. param.stride_h = sh, param.stride_w = sw;
  1971. Checker checker{make_graph, fwd, cn};
  1972. Checker::RunOptions opt;
  1973. checker.set_output_allow_grad(0, false);
  1974. checker.set_input_dtype(0, dtype::Float32());
  1975. checker.set_input_dtype(1, dtype::Float32());
  1976. checker.set_input_dtype(2, dtype::Float32());
  1977. auto run = [&](size_t n, size_t ci, size_t co, size_t hi, size_t wi) {
  1978. checker.run(
  1979. {TensorShape{n, ci / 4, hi, wi, 4},
  1980. TensorShape{n, co, ci / 4, fh, fw, 4},
  1981. TensorShape{1, co / 4, 1, 1, 4}},
  1982. opt);
  1983. };
  1984. run(32, 16, 32, 24, 24);
  1985. run(16, 16, 32, 24, 24);
  1986. run(32, 16, 64, 12, 12);
  1987. run(16, 16, 64, 6, 6);
  1988. };
  1989. run_with_param(1, 1);
  1990. run_with_param(2, 2);
  1991. }
  1992. }
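// Reference construction above: batch convolution (one filter set per sample)
// is emulated by folding the batch into the channel axis and running a GROUP
// convolution with group == N, roughly
//   src    {N, C/4, H, W, 4}       -> reshape to {1, N * C/4, H, W, 4}
//   filter {N, OC, C/4, FH, FW, 4} -> treated as N groups
// after which the result is reshaped back to a per-sample layout, the
// quantized bias is added, and the sum is re-quantized to QuantizedS8{1.3f}.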
  1993. #endif
  1994. TEST(TestOprDNN, BatchConvBiasSerialization) {
  1995. using namespace serialization;
  1996. auto fname = output_file("BatchConvBiasForwardTest");
  1997. auto dump = [&]() {
  1998. opr::BatchConvBias::Param param;
  1999. param.mode = Mode::CROSS_CORRELATION;
  2000. param.format = opr::BatchConvBias::Param::Format::NCHW4;
  2001. param.stride_h = param.stride_w = 1;
  2002. param.pad_h = param.pad_w = 0;
  2003. auto cn = CompNode::load("cpu0");
  2004. auto graph = ComputingGraph::make();
  2005. HostTensorND inp_host{cn, {32, 1, 24, 24, 4}, dtype::QuantizedS8{1.1f}};
  2006. HostTensorND filt_host{cn, {32, 8, 1, 1, 1, 4}, dtype::QuantizedS8{1.2f}};
  2007. auto inp = opr::ImmutableTensor::make(*graph, inp_host);
  2008. auto filt = opr::ImmutableTensor::make(*graph, filt_host);
  2009. auto opr = opr::BatchConvBiasForward::make(
  2010. inp, filt, param, {}, OperatorNodeConfig{dtype::QuantizedS8{1.3f}});
  2011. auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
  2012. auto rst = dumper->dump({opr});
  2013. ASSERT_EQ(rst.outputs.size(), 1u);
  2014. };
  2015. auto load = [&]() {
  2016. auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
  2017. auto rst = loader->load();
  2018. ASSERT_EQ(rst.output_var_list.size(), 1u);
  2019. };
  2020. dump();
  2021. load();
  2022. }
  2023. TEST(TestOprDNN, HeuristicReproducible) {
  2024. using Policy = opr::ConvolutionBackwardFilter::ExecutionPolicy;
  2025. using S = Policy::Strategy;
  2026. using Checker = AutoOprChecker<3, 1>;
  2027. constexpr size_t PH = 1, PW = 1, SH = 1, SW = 1;
  2028. for (auto strategy : SmallVector<S>{S::HEURISTIC, S::HEURISTIC | S::REPRODUCIBLE}) {
  2029. VarNode* bwd_flt;
  2030. auto make_graph =
  2031. [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  2032. Param param{Mode::CROSS_CORRELATION, PH, PW, SH, SW};
  2033. Policy policy;
  2034. policy.strategy = strategy;
  2035. auto out = opr::ConvolutionBackwardFilter::make(
  2036. inputs[0], inputs[1], inputs[2], param, policy);
  2037. bwd_flt = out.node();
  2038. return {out};
  2039. };
  2040. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  2041. std::shared_ptr<HostTensorND> out;
  2042. conv_bwd_flt_brute(
  2043. {inp[0], inp[1], inp[2]}, out,
  2044. Param{Mode::CROSS_CORRELATION, PH, PW, SH, SW});
  2045. dest[0] = *out;
  2046. };
  2047. #define get_shp(N, P, S, F) ((N + 2 * P - F) / S + 1)
  2048. #define inp_tensor(N, IC, OC, IH, IW, FH, FW) \
  2049. { \
  2050. TensorShape{N, IC, IH, IW}, \
  2051. {N, OC, get_shp(IH, PH, SH, FH), get_shp(IW, PW, SW, FW)}, { \
  2052. OC, IC, FH, FW \
  2053. } \
  2054. }
  2055. Checker::RunOptions opt;
  2056. opt.numdiff_eps = 1;
  2057. opt.outputs_max_err = 1e-3;
  2058. std::string algo_name0, algo_name1;
  2059. {
  2060. Checker checker(make_graph, fwd);
  2061. checker.run(inp_tensor(2, 3, 4, 9, 8, 3, 3), opt)
  2062. .run(inp_tensor(1, 5, 3, 7, 9, 3, 3), opt)
  2063. .run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt);
  2064. auto&& megdnn_opr = static_cast<megdnn::ConvolutionBackwardFilter*>(
  2065. static_cast<opr::ConvolutionBackwardFilter*>(bwd_flt->owner_opr())
  2066. ->megdnn_opr());
  2067. auto&& algo = megdnn_opr->execution_policy().algo;
  2068. megdnn::Algorithm* palgo = megdnn_opr->get_algorithm_from_desc(algo);
  2069. mgb_assert(palgo, "Unknown algo description");
  2070. if (strategy == S(S::HEURISTIC | S::REPRODUCIBLE)) {
  2071. EXPECT_TRUE(palgo->contain_attribute_all(
  2072. megdnn::AlgoAttribute::REPRODUCIBLE));
  2073. }
  2074. algo_name0 = palgo->name();
  2075. }
  2076. megdnn::AlgorithmCache::instance().clear();
  2077. {
  2078. Checker checker(make_graph, fwd);
  2079. checker.run(inp_tensor(2, 3, 4, 9, 8, 3, 3), opt)
  2080. .run(inp_tensor(1, 5, 3, 7, 9, 3, 3), opt)
  2081. .run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt);
  2082. auto&& megdnn_opr = static_cast<megdnn::ConvolutionBackwardFilter*>(
  2083. static_cast<opr::ConvolutionBackwardFilter*>(bwd_flt->owner_opr())
  2084. ->megdnn_opr());
  2085. auto&& algo = megdnn_opr->execution_policy().algo;
  2086. megdnn::Algorithm* palgo = megdnn_opr->get_algorithm_from_desc(algo);
  2087. mgb_assert(palgo, "Unknown algo description");
  2088. algo_name1 = palgo->name();
  2089. }
  2090. EXPECT_TRUE(algo_name0 == algo_name1);
  2091. megdnn::AlgorithmCache::instance().clear();
  2092. }
  2093. #undef inp_tensor
  2094. #undef get_shp
  2095. }
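// HeuristicReproducible rationale: the same ConvolutionBackwardFilter graph is
// built twice with the AlgorithmCache cleared in between, and both runs must
// report an algorithm with the same name; when REPRODUCIBLE is or-ed into the
// strategy, the chosen algorithm must additionally carry
// megdnn::AlgoAttribute::REPRODUCIBLE.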
  2096. #if MGB_CUDA
  2097. TEST(TestOprDNN, ConvolutionMultiCompNode) {
  2098. REQUIRE_GPU(1);
  2099. auto cn0 = CompNode::load("gpu0:0"), cn1 = CompNode::load("gpu0:1");
  2100. cn0.activate();
  2101. auto&& prop = CompNodeEnv::from_comp_node(cn0).cuda_env().device_prop;
  2102. auto sm_ver = prop.major * 10 + prop.minor;
  2103. if (sm_ver < 61) {
  2104. printf("This testcast ignored due to insufficient cuda cap(got: %d, "
  2105. "expected: %d)\n",
  2106. sm_ver, 61);
  2107. return;
  2108. }
  2109. HostTensorGenerator<dtype::Int8> gen;
  2110. auto mkvar = [&gen](const char* name, const TensorShape& shp, const DType& dtype,
  2111. std::shared_ptr<ComputingGraph> graph, const CompNode& cn) {
  2112. return opr::TypeCvt::make(
  2113. opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
  2114. };
  2115. auto mkcvar = [&gen](const char* name, const TensorShape& shp, const DType& dtype,
  2116. std::shared_ptr<ComputingGraph> graph, const CompNode& cn) {
  2117. return opr::TypeCvt::make(
  2118. opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
  2119. dtype);
  2120. };
  2121. auto graph0 = ComputingGraph::make();
  2122. graph0->options().graph_opt_level = 0;
  2123. auto graph1 = ComputingGraph::make();
  2124. graph1->options().graph_opt_level = 0;
  2125. auto make_func = [&gen, &mkvar, &mkcvar](
  2126. std::shared_ptr<ComputingGraph> graph,
  2127. const CompNode& cn) {
  2128. using Policy = opr::ConvBias::ExecutionPolicy;
  2129. using S = Policy::Strategy;
  2130. auto x = mkvar("x", {64, 32, 28, 28, 4}, dtype::QuantizedS8(2.5f), graph, cn),
  2131. w1 = mkcvar("w1", {256, 32, 5, 5, 4}, dtype::QuantizedS8(2.5f), graph, cn),
  2132. b1 = mkcvar("b1", {1, 64, 1, 1, 4}, dtype::QuantizedS32(6.25f), graph, cn),
  2133. w2 = mkcvar("w2", {256, 64, 3, 3, 4}, dtype::QuantizedS8(2.5f), graph, cn),
  2134. b2 = mkcvar("b2", {1, 64, 1, 1, 4}, dtype::QuantizedS32(6.25f), graph, cn);
  2135. opr::ConvBias::Param param;
  2136. param.format = opr::ConvBias::Param::Format::NCHW4;
  2137. param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
  2138. param.stride_h = param.stride_w = 2;
  2139. param.pad_h = param.pad_w = 2;
  2140. Policy policy;
  2141. policy.strategy = S::PROFILE;
  2142. auto y = opr::ConvBias::make(
  2143. x, w1, b1, param, policy, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  2144. param.stride_h = param.stride_w = 1;
  2145. param.pad_h = param.pad_w = 1;
  2146. y = opr::ConvBias::make(
  2147. y, w2, b2, param, policy, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  2148. return y;
  2149. };
  2150. auto y0 = make_func(graph0, cn0);
  2151. auto y1 = make_func(graph1, cn1);
  2152. HostTensorND host_y0, host_y1;
  2153. auto func0 = graph0->compile({make_callback_copy(y0, host_y0)});
  2154. auto func1 = graph1->compile({make_callback_copy(y1, host_y1)});
  2155. auto worker = [&func0, &func1](int wid) {
  2156. static const int iter_num = 1000;
  2157. if (wid == 0) {
  2158. for (int i = 0; i < iter_num; ++i)
  2159. func0->execute();
  2160. } else {
  2161. for (int i = 0; i < iter_num; ++i)
2162. func1->execute();
  2164. }
  2165. };
  2166. std::thread worker0(worker, 0);
  2167. std::thread worker1(worker, 1);
  2168. worker0.join();
  2169. worker1.join();
  2170. }
  2171. #endif
  2172. } // anonymous namespace
  2173. #ifndef _WIN32
  2174. namespace mgb {
  2175. namespace opr {
  2176. namespace testing {
  2177. class ConvolutionTestingPeer {
  2178. opr::ConvolutionForward& m_conv_opr;
  2179. public:
  2180. explicit ConvolutionTestingPeer(cg::OperatorNodeBase* opr)
  2181. : m_conv_opr(opr->cast_final_safe<opr::ConvolutionForward>()) {}
  2182. void set_megdnn_opr(std::unique_ptr<megdnn::ConvolutionForward> megdnn_opr) {
  2183. m_conv_opr.set_megdnn_opr(std::move(megdnn_opr));
  2184. }
  2185. };
  2186. } // namespace testing
  2187. } // namespace opr
  2188. } // namespace mgb
  2189. namespace {
  2190. using megdnn::TensorND;
  2191. using megdnn::Workspace;
  2192. using opr::testing::ConvolutionTestingPeer;
  2193. class MockConvolutionForward : public megdnn::ConvolutionForward {
  2194. const char* m_algorithm_set_name;
  2195. public:
  2196. MockConvolutionForward(megdnn::ConvolutionForward* orig, const char* algo_set_name)
  2197. : megdnn::ConvolutionForward(orig->handle()),
  2198. m_algorithm_set_name(algo_set_name) {}
  2199. MOCK_METHOD5(
  2200. exec,
  2201. void(_megdnn_tensor_in src, _megdnn_tensor_in filter,
  2202. _megdnn_tensor_out dst, const PreprocessedFilter* preprocessed_filter,
  2203. _megdnn_workspace workspace));
  2204. MOCK_METHOD5(
  2205. exec_preprocess,
  2206. void(const TensorLayout& src_layout, _megdnn_tensor_in filter,
  2207. const TensorLayout& dst_layout,
  2208. PreprocessedFilter* preprocessed_filter, _megdnn_workspace workspace));
  2209. MOCK_METHOD4(
  2210. get_workspace_in_bytes,
  2211. size_t(const TensorLayout& src, const TensorLayout& filter,
  2212. const TensorLayout& dst,
  2213. const PreprocessedFilter* preprocessed_filter));
  2214. MOCK_METHOD3(
  2215. deduce_preprocessed_filter_layout,
  2216. SmallVector<TensorLayout>(
  2217. const TensorLayout& src, const TensorLayout& filter,
  2218. const TensorLayout& dst));
  2219. MOCK_METHOD3(
  2220. get_preprocess_workspace_in_bytes,
  2221. size_t(const TensorLayout& src, const TensorLayout& filter,
  2222. const TensorLayout& dst));
  2223. MOCK_METHOD3(
  2224. get_all_algorithms_info,
  2225. std::vector<AlgorithmInfo>(
  2226. const TensorLayout& p0, const TensorLayout& p1,
  2227. const TensorLayout& p2));
  2228. MOCK_METHOD3(
  2229. get_all_algorithms_info_safe,
  2230. std::vector<AlgorithmInfo>(
  2231. const TensorLayout& p0, const TensorLayout& p1,
  2232. const TensorLayout& p2));
  2233. MOCK_METHOD6(
  2234. get_algorithm_info_heuristic,
  2235. AlgorithmInfo(
  2236. const TensorLayout& p0, const TensorLayout& p1,
  2237. const TensorLayout& p2, size_t workspace_limit_in_bytes,
  2238. const AlgoAttribute& positive_attr,
  2239. const AlgoAttribute& negative_attr));
  2240. MOCK_METHOD3(
  2241. get_all_algorithms, std::vector<Algorithm*>(
  2242. const TensorLayout& p0, const TensorLayout& p1,
  2243. const TensorLayout& p2));
  2244. MOCK_METHOD3(
  2245. get_all_algorithms_safe,
  2246. std::vector<Algorithm*>(
  2247. const TensorLayout& p0, const TensorLayout& p1,
  2248. const TensorLayout& p2));
  2249. MOCK_METHOD6(
  2250. get_algorithm_heuristic,
  2251. Algorithm*(
  2252. const TensorLayout& p0, const TensorLayout& p1,
  2253. const TensorLayout& p2, size_t workspace_limit_in_bytes,
  2254. const AlgoAttribute& positive_attr,
  2255. const AlgoAttribute& negative_attr));
  2256. MOCK_METHOD1(get_algorithm_from_desc, Algorithm*(const AlgorithmDesc&));
  2257. protected:
  2258. const char* get_algorithm_set_name() const override { return m_algorithm_set_name; }
  2259. };
  2260. class MockAlgorithm : public megdnn::detail::Algorithm {
  2261. const char* m_name;
  2262. public:
  2263. MockAlgorithm(const char* name = "NotImportant") : m_name(name) {}
  2264. Attribute attribute() const override { return Attribute::REPRODUCIBLE; }
  2265. const char* name() const override { return m_name; }
  2266. uint32_t type() const override {
  2267. return megdnn::detail::Algorithm::INVALID_ALGO_TYPE;
  2268. }
  2269. virtual ~MockAlgorithm() = default;
  2270. };
  2271. class TestWeightPreprocess : public ::testing::Test {
  2272. protected:
  2273. CompNode comp_node;
  2274. std::shared_ptr<ComputingGraph> graph;
  2275. std::shared_ptr<HostTensorND> x_host;
  2276. MockConvolutionForward* mock_conv_ptr;
  2277. SymbolVar y;
  2278. HostTensorND y_host;
  2279. std::unique_ptr<cg::AsyncExecutable> func;
  2280. MockConvolutionForward& mock_conv() { return *mock_conv_ptr; }
  2281. void SetUp() override {
  2282. constexpr uint32_t ih = 10, ic = 16, oc = 32, ph = 0, sh = 1, fh = 2, iw = ih;
  2283. comp_node = CompNode::load("cpux");
  2284. graph = ComputingGraph::make();
  2285. graph->options().graph_opt.weight_preprocess = is_weight_preprocess();
  2286. TensorShape x_shape{1, ic, ih, iw}, w_shape{oc, ic, fh, fh};
  2287. x_host = std::make_shared<HostTensorND>(comp_node, x_shape);
  2288. auto x = opr::Host2DeviceCopy::make(*graph, x_host);
  2289. auto w = opr::ImmutableTensor::make(*graph, {comp_node, w_shape});
  2290. Param param;
  2291. param.pad_h = param.pad_w = ph;
  2292. param.stride_h = param.stride_w = sh;
  2293. param.format = Param::Format::NCHW;
  2294. y = opr::ConvolutionForward::make(x, w, param);
  2295. auto& opr = y.node()->owner_opr()->cast_final<opr::ConvolutionForward>();
  2296. auto mock = std::make_unique<MockConvolutionForward>(
  2297. opr.megdnn_opr(),
  2298. ::testing::UnitTest::GetInstance()->current_test_info()->name());
  2299. mock_conv_ptr = mock.get();
  2300. ConvolutionTestingPeer{&opr}.set_megdnn_opr(std::move(mock));
  2301. func = graph->compile({make_callback_copy(y, y_host)});
  2302. }
  2303. void run() { func->execute().wait(); }
  2304. virtual bool is_weight_preprocess() { return true; }
  2305. void TearDown() override {
  2306. func.reset();
  2307. // Triggers mock check
  2308. graph.reset();
  2309. x_host.reset();
  2310. }
  2311. };
  2312. TEST_F(TestWeightPreprocess, NoPreprocessNeeded) {
  2313. using ::testing::_;
  2314. using ::testing::Return;
  2315. auto& mock = mock_conv();
  2316. MockAlgorithm algo;
  2317. EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _, _))
  2318. .WillRepeatedly(Return(&algo));
  2319. EXPECT_CALL(mock, get_algorithm_from_desc(_)).WillRepeatedly(Return(&algo));
  2320. EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _)).WillRepeatedly(Return(0));
  2321. EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
  2322. .WillRepeatedly(Return(0));
  2323. {
  2324. ::testing::InSequence seq;
  2325. // Return empty preprocess filters, indicating no need to preprocess
  2326. EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _))
  2327. .WillRepeatedly(Return(SmallVector<TensorLayout>{}));
  2328. EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _)).Times(0);
  2329. EXPECT_CALL(mock, exec(_, _, _, nullptr, _));
  2330. run();
  2331. }
  2332. }
  2333. TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) {
  2334. megdnn::AlgorithmCache::instance().clear();
  2335. using ::testing::_;
  2336. using ::testing::Expectation;
  2337. using ::testing::Field;
  2338. using ::testing::Invoke;
  2339. using ::testing::Return;
  2340. using PF = MockConvolutionForward::PreprocessedFilter;
  2341. auto& mock = mock_conv();
  2342. MockAlgorithm algo;
  2343. SmallVector<TensorLayout> filter_layout{
  2344. {{1, 2, 3, 4}, dtype::Float32()}, {{5, 6, 7, 8}, dtype::Float32()}};
  2345. EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _))
  2346. .WillRepeatedly(Return(filter_layout));
  2347. EXPECT_CALL(mock, get_algorithm_from_desc(_)).WillRepeatedly(Return(&algo));
  2348. Expectation algo_call = EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _, _))
  2349. .WillOnce(Return(&algo));
  2350. Expectation ws_call = EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _))
  2351. .After(algo_call)
  2352. .WillOnce(Return(0));
  2353. Expectation pre_ws_call =
  2354. EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
  2355. .After(algo_call)
  2356. .WillOnce(Return(233));
  2357. {
  2358. ::testing::InSequence seq;
  2359. // exec_preprocess should be called only once, with workspace allocated
  2360. int salt = 0;
  2361. EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _))
  2362. .After(ws_call, pre_ws_call)
  2363. .WillOnce(Invoke([&](const TensorLayout&, _megdnn_tensor_in,
  2364. const TensorLayout&, PF* pf,
  2365. _megdnn_workspace workspace) {
  2366. ASSERT_EQ(workspace.size, 233);
  2367. ASSERT_NE(pf, nullptr);
  2368. pf->algorithm_id = &salt;
  2369. ASSERT_EQ(pf->tensors.size(), 2);
  2370. ASSERT_TRUE(pf->tensors[0].layout.eq_shape({1, 2, 3, 4}));
  2371. ASSERT_TRUE(pf->tensors[1].layout.eq_shape({5, 6, 7, 8}));
  2372. ASSERT_NE(pf->tensors[0].raw_ptr(), nullptr);
  2373. ASSERT_NE(pf->tensors[1].raw_ptr(), nullptr);
  2374. pf->tensors[0].ptr<float>()[0] = 114.514f;
  2375. pf->tensors[1].ptr<float>()[0] = 1926.0817f;
  2376. }));
  2377. // Run the graph multiple times.
  2378. for (int i = 0; i < 3; i++) {
  2379. if (i > 0) {
  2380. EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _)).Times(0);
  2381. }
  2382. EXPECT_CALL(mock, exec(_, _, _, _, _))
  2383. .WillOnce(Invoke([&](_megdnn_tensor_in, _megdnn_tensor_in,
  2384. _megdnn_tensor_out, const PF* pf,
  2385. _megdnn_workspace) {
  2386. ASSERT_NE(pf, nullptr);
  2387. ASSERT_EQ(pf->algorithm_id, &salt);
  2388. ASSERT_EQ(pf->tensors[0].ptr<float>()[0], 114.514f);
  2389. ASSERT_EQ(pf->tensors[1].ptr<float>()[0], 1926.0817f);
  2390. }));
  2391. run();
  2392. }
  2393. }
  2394. }
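// Expected weight-preprocess flow verified above: the operator reports the
// preprocessed filter layouts via deduce_preprocessed_filter_layout, the graph
// allocates them plus the 233-byte preprocess workspace, exec_preprocess runs
// exactly once to fill the tensors, and every later exec receives the same
// PreprocessedFilter (checked through the algorithm_id salt and the sentinel
// values 114.514f / 1926.0817f) instead of re-running the preprocessing.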
  2395. class TestNoWeightPreprocess : public TestWeightPreprocess {
  2396. bool is_weight_preprocess() override { return false; }
  2397. };
  2398. TEST_F(TestNoWeightPreprocess, NoPreprocess) {
  2399. using ::testing::_;
  2400. using ::testing::Return;
  2401. auto& mock = mock_conv();
  2402. MockAlgorithm algo;
  2403. EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _, _))
  2404. .WillRepeatedly(Return(&algo));
  2405. EXPECT_CALL(mock, get_algorithm_from_desc(_)).WillRepeatedly(Return(&algo));
  2406. EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _)).WillRepeatedly(Return(0));
  2407. EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
  2408. .WillRepeatedly(Return(0));
  2409. {
  2410. ::testing::InSequence seq;
  2411. // Return empty preprocess filters, indicating no need to preprocess
  2412. EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _)).Times(0);
  2413. EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _)).Times(0);
  2414. EXPECT_CALL(mock, exec(_, _, _, nullptr, _));
  2415. run();
  2416. }
  2417. }
  2418. } // anonymous namespace
  2419. #endif
  2420. namespace {
TEST(TestOprDNN, ConvBiasInt4Serialize) {
    using namespace serialization;
    float inp_scale = 1.20210327f;
    float filt_scale = 1.20210406f;
    float bias_scale = inp_scale * filt_scale;
    DType output_dtype = dtype::QuantizedS4{inp_scale};
    HostTensorGenerator<dtype::Int8> gen;
    std::shared_ptr<HostTensorND> xv;
    auto mkvar = [](const char* name, const DType& dtype,
                    std::shared_ptr<ComputingGraph> graph,
                    std::shared_ptr<HostTensorND> val) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, val).rename(name), dtype);
    };
    auto mkcvar = [&gen](const char* name, const TensorShape& shp, const DType& dtype,
                         std::shared_ptr<ComputingGraph> graph, const CompNode& cn) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto fname = output_file("ConvBiasInt4Serialize");
    HostTensorND y1, y2;
    auto dump = [&]() {
        opr::ConvBias::Param param;
        param.mode = Mode::CONVOLUTION;
        auto cn = CompNode::load("cpu0");
        auto graph = ComputingGraph::make();
        xv = gen({1, 64, 56, 56}, cn);
        auto x = mkvar("x", dtype::QuantizedS4{inp_scale}, graph, xv);
        auto w =
                mkcvar("w", {256, 64, 1, 1}, dtype::QuantizedS4{filt_scale}, graph, cn);
        auto b =
                mkcvar("b", {1, 256, 1, 1}, dtype::QuantizedS32{bias_scale}, graph, cn);
        auto y = opr::ConvBiasForward::make(
                x, w, b, param, {}, OperatorNodeConfig{output_dtype});
        auto w1 = mkcvar(
                "w1", {64, 256, 1, 1}, dtype::QuantizedS4{filt_scale}, graph, cn);
        auto b1 =
                mkcvar("b1", {1, 64, 1, 1}, dtype::QuantizedS32{bias_scale}, graph, cn);
        y = opr::ConvBiasForward::make(
                y, w1, b1, param, {}, OperatorNodeConfig{output_dtype});
        y = opr::TypeCvt::make(y, dtype::Float32());
        auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
        auto func = graph->compile({make_callback_copy(y, y1)});
        func->execute();
        func->wait();
        auto rst = dumper->dump({y});
        ASSERT_EQ(rst.outputs.size(), 1u);
    };
    auto load = [&]() {
        auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
        auto rst = loader->load();
        for (const auto& t : rst.tensor_map) {
            t.second->copy_from(*xv).sync();
        }
        auto func =
                rst.graph->compile({make_callback_copy(rst.output_var_list[0], y2)});
        func->execute();
        func->wait();
        ASSERT_EQ(rst.output_var_list.size(), 1u);
        EXPECT_EQ(rst.output_var_list[0].dtype(), dtype::Float32());
    };
    dump();
    load();
    MGB_ASSERT_TENSOR_NEAR(y1, y2, 1e-3);
}
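
// Note (added comment): same round trip as above, but gopt::ParamFusePass is
// applied before dumping, so constant subgraphs (the TypeCvt-wrapped weights
// and biases) are folded into precomputed parameters in the serialized file.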
TEST(TestOprDNN, ConvBiasInt4SerializeWithParamFuse) {
    using namespace serialization;
    float inp_scale = 1.20210327f;
    float filt_scale = 1.20210406f;
    float bias_scale = inp_scale * filt_scale;
    DType output_dtype = dtype::QuantizedS4{inp_scale};
    HostTensorGenerator<dtype::Int8> gen;
    std::shared_ptr<HostTensorND> xv;
    auto mkvar = [](const char* name, const DType& dtype,
                    std::shared_ptr<ComputingGraph> graph,
                    std::shared_ptr<HostTensorND> val) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, val).rename(name), dtype);
    };
    auto mkcvar = [&gen](const char* name, const TensorShape& shp, const DType& dtype,
                         std::shared_ptr<ComputingGraph> graph, const CompNode& cn) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto fname = output_file("ConvBiasInt4SerializeWithParamFuse");
    HostTensorND y1, y2;
    auto dump = [&]() {
        opr::ConvBias::Param param;
        param.mode = Mode::CONVOLUTION;
        auto cn = CompNode::load("cpu0");
        auto graph = ComputingGraph::make();
        xv = gen({1, 64, 56, 56}, cn);
        auto x = mkvar("x", dtype::QuantizedS4{inp_scale}, graph, xv);
        auto w =
                mkcvar("w", {256, 64, 1, 1}, dtype::QuantizedS4{filt_scale}, graph, cn);
        auto b =
                mkcvar("b", {1, 256, 1, 1}, dtype::QuantizedS32{bias_scale}, graph, cn);
        auto y = opr::ConvBiasForward::make(
                x, w, b, param, {}, OperatorNodeConfig{output_dtype});
        auto w1 = mkcvar(
                "w1", {64, 256, 1, 1}, dtype::QuantizedS4{filt_scale}, graph, cn);
        auto b1 =
                mkcvar("b1", {1, 64, 1, 1}, dtype::QuantizedS32{bias_scale}, graph, cn);
        y = opr::ConvBiasForward::make(
                y, w1, b1, param, {}, OperatorNodeConfig{output_dtype});
        y = opr::TypeCvt::make(y, dtype::Float32());
        SymbolVar y_param_fused;
        unpack_vector(
                gopt::GraphOptimizer{}
                        .add_pass<gopt::ParamFusePass>()
                        .apply({{y}})
                        .endpoint_vars(),
                y_param_fused);
        auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
        auto func = graph->compile({make_callback_copy(y_param_fused, y1)});
        func->execute();
        func->wait();
        auto rst = dumper->dump({y_param_fused});
        ASSERT_EQ(rst.outputs.size(), 1u);
    };
    auto load = [&]() {
        auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
        auto rst = loader->load();
        for (const auto& t : rst.tensor_map) {
            t.second->copy_from(*xv).sync();
        }
        auto func =
                rst.graph->compile({make_callback_copy(rst.output_var_list[0], y2)});
        func->execute();
        func->wait();
        ASSERT_EQ(rst.output_var_list.size(), 1u);
        EXPECT_EQ(rst.output_var_list[0].dtype(), dtype::Float32());
    };
    dump();
    load();
    MGB_ASSERT_TENSOR_NEAR(y1, y2, 1e-3);
}
} // namespace

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}