
convolution.cpp 116 kB

  1. /**
  2. * \file src/opr/test/dnn/convolution.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "megbrain/comp_node_env.h"
  12. #include "./legacy_checker.h"
  13. #include "megbrain/opr/dnn/convolution.h"
  14. #include "megbrain/test/autocheck.h"
  15. #include "megbrain/test/helper.h"
  16. #include "megbrain/test/megdnn_helper.h"
  17. #include "megbrain/serialization/serializer.h"
  18. #include "megbrain/opr/basic_arith.h"
  19. #include "megbrain/gopt/inference.h"
  20. #include "megbrain/opr/tensor_manip.h"
  21. #include "megdnn/dtype.h"
  22. #include "megdnn/oprs/base.h"
  23. #include <gmock/gmock.h>
  24. #include <cmath>
  25. #include <memory>
  26. #include <random>
  27. using namespace mgb;
  28. namespace {
  29. using Param = opr::Convolution::Param;
  30. using Param3D = opr::Convolution3D::Param;
  31. using Mode = Param::Mode;
  32. Mode modes_to_check[] = {Mode::CONVOLUTION, Mode::CROSS_CORRELATION};
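// Brute-force reference for ConvolutionBackwardData (deconvolution) in NCHW layout:
// each input element is scattered into the output window selected by the filter, so the
// output size is (I - 1) * stride + (F - 1) * dilate + 1 - 2 * pad (see get_shp below).
// Supports both DENSE and GROUP sparsity and is used to validate the optimized kernels.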
  33. void conv_bwd_data_brute(const std::vector<std::shared_ptr<HostTensorND>>& inps,
  34. std::shared_ptr<HostTensorND>& dest,
  35. const opr::ConvolutionBackwardData::Param& param) {
  36. mgb_assert(param.format == Param::Format::NCHW);
  37. auto &&data = *inps[0], &&filter = *inps[1];
  38. size_t N = data.shape(0), IH = data.shape(2), IW = data.shape(3);
  39. size_t GROUP, ICPG, OCPG, FH, FW;
  40. if (param.sparse == Param::Sparse::DENSE) {
  41. GROUP = 1, ICPG = filter.shape(0), OCPG = filter.shape(1),
  42. FH = filter.shape(2), FW = filter.shape(3);
  43. } else {
  44. mgb_assert(param.sparse == Param::Sparse::GROUP);
  45. GROUP = filter.shape(0), ICPG = filter.shape(1), OCPG = filter.shape(2),
  46. FH = filter.shape(3), FW = filter.shape(4);
  47. }
  48. auto get_shp = [](size_t inp, size_t filter, size_t stride, size_t pad,
  49. size_t dilate) {
  50. return (inp - 1) * stride + (filter - 1) * dilate + 1 - pad * 2;
  51. };
  52. size_t OH = get_shp(IH, FH, param.stride_h, param.pad_h, param.dilate_h),
  53. OW = get_shp(IW, FW, param.stride_w, param.pad_w, param.dilate_w);
  54. dest = std::make_shared<HostTensorND>(CompNode::load("xpu0"),
  55. TensorShape{N, OCPG * GROUP, OH, OW});
  56. auto&& out = *dest;
  57. auto fptr = filter.ptr<float>(), dptr = data.ptr<float>(),
  58. optr = out.ptr<float>();
  59. memset(optr, 0, sizeof(float) * out.shape().total_nr_elems());
  60. auto ol = out.layout(), fl = filter.layout();
  61. #define FOR2(a, A, b, B) \
  62. for (size_t a = 0; a < A; ++a) \
  63. for (size_t b = 0; b < B; ++b)
  64. #define FOR3(a, A, b, B, c, C) \
  65. FOR2(a, A, b, B) \
  66. for (size_t c = 0; c < C; ++c)
  67. FOR3(n, N, group, GROUP, icg, ICPG)
  68. FOR2(ih, IH, iw, IW) {
  69. float scale = *(dptr++);
  70. FOR3(ocg, OCPG, fh, FH, fw, FW) {
  71. auto oc_tot = group * OCPG + ocg;
  72. int oh = int(ih * param.stride_h + fh * param.dilate_h) -
  73. int(param.pad_h),
  74. ow = int(iw * param.stride_w + fw * param.dilate_w) -
  75. int(param.pad_w);
  76. if (oh >= 0 && ow >= 0 && oh < static_cast<int>(OH) &&
  77. ow < static_cast<int>(OW)) {
  78. auto out_off = n * ol.stride[0] + oc_tot * ol.stride[1] +
  79. oh * ol.stride[2] + ow;
  80. size_t flt_off = 0;
  81. if (param.sparse == Param::Convolution::Sparse::DENSE) {
  82. flt_off = icg * fl.stride[0] +
  83. ocg * fl.stride[1] + fh * fl.stride[2] + fw;
  84. } else {
  85. flt_off = group * fl.stride[0] + icg * fl.stride[1] +
  86. ocg * fl.stride[2] + fh * fl.stride[3] + fw;
  87. }
  88. optr[out_off] += scale * fptr[flt_off];
  89. }
  90. }
  91. }
  92. #undef FOR3
  93. #undef FOR2
  94. }
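// Brute-force reference for ConvolutionBackwardFilter: accumulates
// grad[oc][ic][fh][fw] += diff[n][oc][oh][ow] * src[n][ic][ih][iw] with
// ih = oh * stride_h + fh - pad_h (and likewise for iw). Dense NCHW only;
// out-of-range source positions contribute zero.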
  95. void conv_bwd_flt_brute(const std::vector<std::shared_ptr<HostTensorND>>& inps,
  96. std::shared_ptr<HostTensorND>& out,
  97. const opr::ConvolutionBackwardFilter::Param& param) {
  98. auto &&src = *inps[0], &&diff = *inps[1], &&filter = *inps[2];
  99. size_t N = src.shape(0), IH = src.shape(2), IW = src.shape(3),
  100. OC = filter.shape(0), IC = filter.shape(1), FH = filter.shape(2),
  101. FW = filter.shape(3), OH = diff.shape(2), OW = diff.shape(3);
  102. out = std::make_shared<HostTensorND>(CompNode::load("xpu0"),
  103. TensorShape{OC, IC, FH, FW});
  104. auto&& grad = *out;
  105. auto sptr = src.ptr<float>(), dptr = diff.ptr<float>(),
  106. gptr = grad.ptr<float>();
  107. memset(gptr, 0, sizeof(float) * grad.shape().total_nr_elems());
  108. auto valid = [&](size_t ih, size_t iw) { return ih < IH && iw < IW; };
  109. for (size_t n = 0; n < N; ++n)
  110. for (size_t oc = 0; oc < OC; ++oc)
  111. for (size_t ic = 0; ic < IC; ++ic) {
  112. for (size_t oh = 0; oh < OH; ++oh)
  113. for (size_t ow = 0; ow < OW; ++ow) {
  114. for (size_t fh = 0; fh < FH; ++fh)
  115. for (size_t fw = 0; fw < FW; ++fw) {
  116. size_t ih = oh * param.stride_h + fh -
  117. param.pad_h,
  118. iw = ow * param.stride_w + fw -
  119. param.pad_w;
  120. auto src_data =
  121. valid(ih, iw)
  122. ? sptr[(n * IC + ic) * IH * IW +
  123. ih * IW + iw]
  124. : 0;
  125. gptr[(oc * IC + ic) * FH * FW + fh * FW + fw] +=
  126. dptr[(n * OC + oc) * OH * OW + oh * OW +
  127. ow] *
  128. src_data;
  129. }
  130. }
  131. }
  132. }
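// Brute-force reference for LocalShare (locally connected convolution with spatially
// shared filters): the output plane is split into sgh x sgw spatial groups and each
// group uses its own filter slice, indexed by (oh / grp_ho, ow / grp_wo). Only
// dilation 1 and CROSS_CORRELATION mode are handled here.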
  133. void local_share_brute(const std::vector<std::shared_ptr<HostTensorND>>& inps,
  134. std::shared_ptr<HostTensorND>& out,
  135. const opr::LocalShare::Param& param) {
  136. auto in = inps[0], filter = inps[1];
  137. mgb_assert(in->shape().ndim == 4);
  138. mgb_assert(filter->shape().ndim == 6);
  139. int batch_size = in->shape()[0], ci = in->shape()[1], hi = in->shape()[2],
  140. wi = in->shape()[3];
  141. int fh = filter->shape()[3], fw = filter->shape()[4];
  142. int ph = param.pad_h, pw = param.pad_w;
  143. int sh = param.stride_h, sw = param.stride_w;
  144. int dh = param.dilate_h, dw = param.dilate_w;
  145. int sgh = filter->shape()[0], sgw = filter->shape()[1];
  146. mgb_assert(dh == 1 && dw == 1);
  147. mgb_assert(static_cast<uint32_t>(sgh) == param.spatial_groups_h &&
  148. static_cast<uint32_t>(sgw) == param.spatial_groups_w);
  149. int ho = (hi + 2 * ph - fh) / sh + 1;
  150. int wo = (wi + 2 * pw - fw) / sw + 1;
  151. mgb_assert(ho % sgh == 0 && wo % sgw == 0);
  152. int grp_ho = ho / sgh, grp_wo = wo / sgw;
  153. int co = filter->shape()[5];
  154. size_t u_batch = batch_size, u_co = co, u_ho = ho, u_wo = wo;
  155. out = std::make_shared<HostTensorND>(
  156. CompNode::load("xpu0"), TensorShape{u_batch, u_co, u_ho, u_wo});
  157. mgb_assert(param.mode == Param::Mode::CROSS_CORRELATION);
  158. for (int n = 0; n < batch_size; ++n) {
  159. for (int oc = 0; oc < co; ++oc) {
  160. for (int oh = 0; oh < ho; ++oh) {
  161. for (int ow = 0; ow < wo; ++ow) {
  162. size_t u_n = n, u_oc = oc, u_oh = oh, u_ow = ow;
  163. float& dval = out->ptr<float>({u_n, u_oc, u_oh, u_ow})[0];
  164. dval = 0;
  165. int grp_oh_idx = oh / grp_ho;
  166. int grp_ow_idx = ow / grp_wo;
  167. for (int ic = 0; ic < ci; ++ic) {
  168. for (int kh = 0; kh < fh; ++kh) {
  169. for (int kw = 0; kw < fw; ++kw) {
  170. int ih = oh * sh - ph + kh;
  171. int iw = ow * sw - pw + kw;
  172. float sval = 0.f;
  173. float fval = 0.f;
  174. if (ih >= 0 && ih < hi && iw >= 0 && iw < wi) {
  175. sval = in->ptr<float>(
  176. {static_cast<size_t>(n),
  177. static_cast<size_t>(ic),
  178. static_cast<size_t>(ih),
  179. static_cast<size_t>(iw)})[0];
  180. }
  181. fval = filter->ptr<float>(
  182. {static_cast<size_t>(grp_oh_idx),
  183. static_cast<size_t>(grp_ow_idx),
  184. static_cast<size_t>(ic),
  185. static_cast<size_t>(kh),
  186. static_cast<size_t>(kw),
  187. static_cast<size_t>(oc)})[0];
  188. dval += fval * sval;
  189. }
  190. }
  191. }
  192. }
  193. }
  194. }
  195. }
  196. }
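// Naive NCHW forward convolution used as the ground truth throughout this file.
// Handles Mode::CONVOLUTION (filter flipped) and Mode::CROSS_CORRELATION with
// arbitrary stride/padding/dilation; the output size is
// (I + 2 * pad - ((F - 1) * dilate + 1)) / stride + 1.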
  197. void convolution_brute(const std::vector<std::shared_ptr<HostTensorND>> &in_tensor,
  198. std::shared_ptr<HostTensorND> &out_tensor,
  199. const opr::Convolution::Param &param)
  200. {
  201. mgb_assert(in_tensor.size() == 2);
  202. auto in = in_tensor[0], filter = in_tensor[1];
  203. mgb_assert(in->shape().ndim == 4);
  204. mgb_assert(filter->shape().ndim == 4);
  205. int batch_size = in->shape().shape[0];
  206. int ic = in->shape().shape[1];
  207. int ih = in->shape().shape[2];
  208. int iw = in->shape().shape[3];
  209. int fh = filter->shape().shape[2];
  210. int fw = filter->shape().shape[3];
  211. int ph = param.pad_h;
  212. int pw = param.pad_w;
  213. int sh = param.stride_h;
  214. int sw = param.stride_w;
  215. int dh = param.dilate_h;
  216. int dw = param.dilate_w;
  217. mgb_assert(ih + 2*ph >= (fh - 1) * dh + 1);
  218. mgb_assert(iw + 2*pw >= (fw - 1) * dw + 1);
  219. int oh = (ih + 2*ph - ((fh - 1) * dh + 1)) / sh + 1;
  220. int ow = (iw + 2*pw - ((fw - 1) * dw + 1)) / sw + 1;
  221. mgb_assert(static_cast<size_t>(ic) == filter->shape().shape[1]);
  222. int oc = filter->shape().shape[0];
  223. out_tensor = std::make_shared<HostTensorND>(CompNode::load("xpu0"),
  224. TensorShape{
  225. static_cast<size_t>(batch_size),
  226. static_cast<size_t>(oc),
  227. static_cast<size_t>(oh),
  228. static_cast<size_t>(ow)});
  229. int pn, poc, poh, pow, pih, piw, pic, pfh, pfw;
  230. for (pn = 0; pn < batch_size; ++pn)
  231. for (poc = 0; poc < oc; ++poc)
  232. for (poh = 0, pih = -ph; poh < oh; ++poh, pih += sh)
  233. for (pow = 0, piw = -pw; pow < ow; ++pow, piw += sw)
  234. {
  235. float &target = out_tensor->ptr<float>({
  236. static_cast<size_t>(pn),
  237. static_cast<size_t>(poc),
  238. static_cast<size_t>(poh),
  239. static_cast<size_t>(pow)})[0];
  240. target = 0;
  241. for (pic = 0; pic < ic; ++pic)
  242. for (pfh = 0; pfh < fh; ++pfh)
  243. for (pfw = 0; pfw < fw; ++pfw)
  244. {
  245. int prih, priw;
  246. float img_data, filter_data;
  247. if (param.mode == Param::Mode::CONVOLUTION) {
  248. prih = pih + (fh - pfh - 1) * dh;
  249. priw = piw + (fw - pfw - 1) * dw;
  250. } else {
  251. mgb_assert(param.mode == Param::Mode::CROSS_CORRELATION);
  252. prih = pih + pfh * dh;
  253. priw = piw + pfw * dw;
  254. }
  255. if (prih >= 0 && prih < ih &&
  256. priw >= 0 && priw < iw) {
  257. img_data = in_tensor[0]->ptr<float>({
  258. static_cast<size_t>(pn),
  259. static_cast<size_t>(pic),
  260. static_cast<size_t>(prih),
  261. static_cast<size_t>(priw)})[0];
  262. } else {
  263. img_data = 0;
  264. }
  265. filter_data = filter->ptr<float>({
  266. static_cast<size_t>(poc),
  267. static_cast<size_t>(pic),
  268. static_cast<size_t>(pfh),
  269. static_cast<size_t>(pfw)})[0];
  270. target += img_data * filter_data;
  271. }
  272. }
  273. }
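// ConvBias and BatchConvBias share the convolution hyper-parameters, so their params
// are copied field-by-field into a plain Convolution::Param and checked against the
// same brute-force reference.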
  274. opr::Convolution::Param convert_to_conv_param(
  275. const opr::ConvBiasForward::Param& param) {
  276. return opr::Convolution::Param{
  277. param.mode, param.pad_h, param.pad_w,
  278. param.stride_h, param.stride_w, param.dilate_h,
  279. param.dilate_w, param.sparse, param.format};
  280. };
  281. #if MGB_CUDA
  282. opr::Convolution::Param convert_to_conv_param(
  283. const opr::BatchConvBiasForward::Param& param) {
  284. return opr::Convolution::Param{
  285. param.mode, param.pad_h, param.pad_w,
  286. param.stride_h, param.stride_w, param.dilate_h,
  287. param.dilate_w, param.sparse, param.format};
  288. };
  289. #endif
  290. TEST(TestOprDNN, ConvolutionForward) {
  291. uint32_t ih = 10, ic = 16, oc = 32, ph = 0, sh = 1, fh = 2;
  292. for (auto mode: modes_to_check) {
  293. uint32_t iw = ih + 1, fw = fh + 1, pw = ph + 1, sw = sh + 1;
  294. Param param{mode, ph, pw, sh, sw};
  295. size_t batch_size = 32;
  296. // !!! DEPRECATED. use AutoOprChecker instead.
  297. opr::test::ForwardChecker<opr::Convolution, 2> forward_checker({
  298. {batch_size, ic, ih, iw},
  299. {oc, ic, fh, fw}},
  300. convolution_brute, param);
  301. forward_checker.run();
  302. }
  303. }
  304. TEST(TestOprDNN, ConvolutionBackward) {
  305. uint32_t ih = 10, ic = 16, oc = 32, ph = 0, sh = 1, fh = 2;
  306. for (auto mode: modes_to_check) {
  307. uint32_t iw = 11, fw = 4, pw = 1, sw = 3;
  308. Param param{mode, ph, pw, sh, sw};
  309. size_t batch_size = 32;
  310. // !!! DEPRECATED. use AutoOprChecker instead.
  311. opr::test::BackwardChecker<opr::Convolution, 2> backward_checker({
  312. {batch_size, ic, ih, iw},
  313. {oc, ic, fh, fw}}, param, 1e-2, 1);
  314. backward_checker.run();
  315. }
  316. }
  317. TEST(TestOprDNN, ConvBiasExePolicy) {
  318. using Param = opr::ConvBias::Param;
  319. Param param;
  320. using Policy = opr::ConvBias::ExecutionPolicy;
  321. using S = Policy::Strategy;
  322. auto cn = CompNode::load("cpux");
  323. auto orig_impl = PersistentCache::set_impl(
  324. std::make_shared<InMemoryPersistentCache>());
  325. auto run = [&](S strategy) {
  326. auto graph = ComputingGraph::make();
  327. HostTensorGenerator<> gen;
  328. auto mkvar = [&](const char* name, const TensorShape& shp,
  329. const DType& dtype) {
  330. return opr::TypeCvt::make(
  331. opr::Host2DeviceCopy::make(*graph, gen(shp), cn)
  332. .rename(name),
  333. dtype);
  334. };
  335. auto x = mkvar("x", {20, 50, 50, 16}, dtype::QuantizedS8(2.5f));
  336. auto w = mkvar("w", {24, 3, 3, 16}, dtype::QuantizedS8(2.5f));
  337. auto bias = mkvar("bias", {1, 1, 1, 24}, dtype::QuantizedS32(6.25f));
  338. param.nonlineMode = Param::NonlineMode::RELU;
  339. param.format = Param::Format::NHWC;
  340. Policy policy;
  341. policy.strategy = strategy;
  342. auto conv_bias = opr::ConvBias::make(
  343. x, w, bias, param, policy,
  344. OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  345. HostTensorND host_y;
  346. auto func = graph->compile({make_callback_copy(conv_bias, host_y)});
  347. func->execute();
  348. //! set a new cache
  349. PersistentCache::set_impl(std::make_shared<InMemoryPersistentCache>());
  350. };
  351. #if MGB_ENABLE_FASTRUN
  352. for (auto strategy :
  353. SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
  354. S::PROFILE | S::HEURISTIC}) {
  355. #else
  356. for (auto strategy :
  357. SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
  358. #endif
  359. run(strategy);
  360. }
  361. ASSERT_THROW(run(S::OPTIMIZED | S::PROFILE), MegBrainError);
  362. PersistentCache::set_impl(orig_impl);
  363. }
  364. TEST(TestOprDNN, ConvBiasExePolicy_Quantized8Asym) {
  365. using Param = opr::ConvBias::Param;
  366. Param param;
  367. using Policy = opr::ConvBias::ExecutionPolicy;
  368. using S = Policy::Strategy;
  369. auto cn = CompNode::load("cpux");
  370. for (auto strategy :
  371. SmallVector<S>{S::PROFILE, S::PROFILE | S::REPRODUCIBLE}) {
  372. auto graph = ComputingGraph::make();
  373. HostTensorGenerator<> gen;
  374. auto mkvar = [&](const char* name, const TensorShape& shp,
  375. const DType& dtype) {
  376. return opr::TypeCvt::make(
  377. opr::Host2DeviceCopy::make(*graph, gen(shp), cn)
  378. .rename(name),
  379. dtype);
  380. };
  381. auto x = mkvar("x", {20, 50, 50, 16},
  382. dtype::Quantized8Asymm(2.5f, static_cast<uint8_t>(0)));
  383. auto w = mkvar("w", {24, 3, 3, 16},
  384. dtype::Quantized8Asymm(2.5f, static_cast<uint8_t>(0)));
  385. auto bias = mkvar("bias", {1, 1, 1, 24}, dtype::QuantizedS32(6.25f));
  386. param.nonlineMode = Param::NonlineMode::RELU;
  387. param.format = Param::Format::NHWC;
  388. Policy policy;
  389. policy.strategy = strategy;
  390. auto conv_bias = opr::ConvBias::make(
  391. x, w, bias, param, policy,
  392. OperatorNodeConfig{dtype::Quantized8Asymm(2.5f, static_cast<uint8_t>(0))});
  393. HostTensorND host_y;
  394. auto func = graph->compile({make_callback_copy(conv_bias, host_y)});
  395. func->execute();
  396. }
  397. }
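// nr_get counts lookups in the persistent algorithm cache. A pure HEURISTIC strategy
// must not consult the cache, while any PROFILE-based strategy should record and later
// query profiling results; the assertions at the end of the test check exactly that.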
  398. TEST(TestOprDNN, ConvolutionExePolicy) {
  399. Param param{Mode::CONVOLUTION};
  400. using Policy = opr::Convolution::ExecutionPolicy;
  401. using S = Policy::Strategy;
  402. int nr_get = 0;
  403. auto on_get = [&nr_get](const std::string&, const void*, size_t,
  404. const void*, size_t) { ++nr_get; };
  405. PersistentCacheHook cache_hook{on_get};
  406. #if MGB_ENABLE_FASTRUN
  407. for (auto strategy :
  408. SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
  409. S::PROFILE | S::HEURISTIC}) {
  410. #else
  411. for (auto strategy :
  412. SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
  413. #endif
  414. using Checker = AutoOprChecker<2, 1>;
  415. auto make_graph = [&](const Checker::SymInpArray& inputs)
  416. -> Checker::SymOutArray {
  417. Policy policy;
  418. policy.strategy = strategy;
  419. auto out =
  420. opr::Convolution::make(inputs[0], inputs[1], param, policy);
  421. return {out};
  422. };
  423. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  424. std::shared_ptr<HostTensorND> sh_out;
  425. convolution_brute({inp.begin(), inp.end()}, sh_out, param);
  426. dest[0] = *sh_out;
  427. };
  428. Checker::RunOptions opt;
  429. opt.numdiff_eps = 1;
  430. nr_get = 0;
  431. Checker(make_graph, fwd)
  432. .run({TensorShape{3, 2, 10, 6}, {4, 2, 3, 2}}, opt)
  433. .run({TensorShape{6, 3, 8, 13}, {2, 3, 2, 13}}, opt)
  434. .run({TensorShape{1, 1, 10, 10}, {2, 1, 3, 3}}, opt);
  435. if (strategy == S::HEURISTIC) {
  436. ASSERT_EQ(0, nr_get);
  437. } else {
  438. ASSERT_LT(0, nr_get);
  439. }
  440. }
  441. }
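// The bfloat16 deconvolution test below generates bf16 inputs, widens them to float32
// for the brute-force reference (conv_bwd_data_brute), then narrows the result back to
// bf16; gradient checking is disabled and only forward outputs are compared.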
  442. TEST(TestOprDNN, ConvolutionBackwardDataBfloat16ExePolicy) {
  443. REQUIRE_GPU(1);
  444. Param param{Mode::CROSS_CORRELATION, 1, 1, 1, 1};
  445. param.compute_mode = Param::ComputeMode::FLOAT32;
  446. using Policy = opr::Convolution::ExecutionPolicy;
  447. using S = Policy::Strategy;
  448. auto gen_bfp16 = [](HostTensorND& dest) {
  449. RNGxorshf rng{next_rand_seed()};
  450. auto rand_real = [&rng]() {
  451. std::uniform_real_distribution<float> dist(-1, 1);
  452. return dist(rng);
  453. };
  454. auto ptr = dest.ptr<dt_bfloat16>();
  455. size_t elems = dest.shape().total_nr_elems();
  456. for (size_t i = 0; i < elems; i++) {
  457. ptr[i] = dt_bfloat16(rand_real());
  458. }
  459. };
  460. auto f32_to_bf16 = [](const std::shared_ptr<HostTensorND>& src)
  461. -> std::shared_ptr<HostTensorND> {
  462. auto ret = std::make_shared<HostTensorND>(
  463. src->comp_node(), src->shape(), dtype::BFloat16{});
  464. for (size_t i = 0; i < src->layout().total_nr_elems(); i++) {
  465. ret->ptr<dt_bfloat16>()[i] = src->ptr<dt_float32>()[i];
  466. }
  467. return ret;
  468. };
  469. auto bf16_to_f32 = [](const std::shared_ptr<HostTensorND>& src)
  470. -> std::shared_ptr<HostTensorND> {
  471. auto ret = std::make_shared<HostTensorND>(
  472. src->comp_node(), src->shape(), dtype::Float32{});
  473. for (size_t i = 0; i < src->layout().total_nr_elems(); i++) {
  474. ret->ptr<dt_float32>()[i] = src->ptr<dt_bfloat16>()[i];
  475. }
  476. return ret;
  477. };
  478. int nr_get = 0;
  479. auto on_get = [&nr_get](const std::string&, const void*, size_t,
  480. const void*, size_t) { ++nr_get; };
  481. PersistentCacheHook cache_hook{on_get};
  482. #if MGB_ENABLE_FASTRUN
  483. for (auto strategy :
  484. {S::PROFILE, S::HEURISTIC, S(S::PROFILE | S::REPRODUCIBLE),
  485. S(S::PROFILE | S::HEURISTIC)}) {
  486. #else
  487. for (auto strategy: {S::HEURISTIC, S(S::PROFILE | S::HEURISTIC)}) {
  488. #endif
  489. using Checker = AutoOprChecker<2, 1>;
  490. auto make_graph = [&](const Checker::SymInpArray& inputs)
  491. -> Checker::SymOutArray {
  492. Policy policy;
  493. policy.strategy = strategy;
  494. return {opr::ConvolutionBackwardData::make_deconv(
  495. inputs[0], inputs[1], param, policy)};
  496. };
  497. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  498. std::shared_ptr<HostTensorND> out;
  499. conv_bwd_data_brute(
  500. {bf16_to_f32(inp[0]), bf16_to_f32(inp[1])}, out,
  501. param);
  502. dest[0] = *f32_to_bf16(out);
  503. };
  504. Checker::RunOptions opt;
  505. opt.outputs_max_err = 1e-3;
  506. nr_get = 0;
  507. Checker(make_graph, fwd)
  508. .disable_grad_check()
  509. .set_input_dtype(0, dtype::BFloat16{})
  510. .set_input_dtype(1, dtype::BFloat16{})
  511. .set_input_generator(0, gen_bfp16)
  512. .set_input_generator(1, gen_bfp16)
  513. .run({TensorShape{3, 4, 10, 6}, {4, 2, 3, 3}}, opt)
  514. .run({TensorShape{2, 2, 4, 3}, {2, 2, 3, 3}}, opt)
  515. .run({TensorShape{1, 3, 10, 6}, {3, 2, 3, 3}}, opt);
  516. if (strategy == S::HEURISTIC) {
  517. ASSERT_EQ(0, nr_get);
  518. } else {
  519. ASSERT_LT(0, nr_get);
  520. }
  521. }
  522. }
  523. TEST(TestOprDNN, Deconvolution) {
  524. // dilated grouped deconv
  525. using Checker = AutoOprChecker<2, 1>;
  526. Param param{Mode::CROSS_CORRELATION, 0, 1, 1, 2};
  527. param.dilate_h = 2;
  528. param.sparse = Param::Sparse::GROUP;
  529. auto make_graph = [&](
  530. const Checker::SymInpArray &inputs) -> Checker::SymOutArray {
  531. return {opr::ConvolutionBackwardData::make_deconv(
  532. inputs[0], inputs[1], param)};
  533. };
  534. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  535. std::shared_ptr<HostTensorND> out;
  536. conv_bwd_data_brute({inp[0], inp[1]}, out, param);
  537. dest[0] = *out;
  538. };
  539. Checker::RunOptions opt;
  540. opt.numdiff_eps = 1;
  541. Checker(make_graph, fwd).
  542. run({TensorShape{2, 4, 6, 8}, {1, 4, 5, 3, 2}}, opt).
  543. run({TensorShape{3, 2, 1, 1}, {2, 1, 1, 4, 3}}, opt).
  544. run({TensorShape{4, 6, 7, 2}, {2, 3, 4, 8, 13}}, opt);
  545. }
  546. TEST(TestOprDNN, DeconvolutionExePolicy_QuantizedS8) {
  547. REQUIRE_GPU(1);
  548. auto cn = CompNode::load("gpu0");
  549. cn.activate();
  550. REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);
  551. Param param;
  552. using Policy = opr::ConvolutionBackwardData::ExecutionPolicy;
  553. using S = Policy::Strategy;
  554. #if MGB_ENABLE_FASTRUN
  555. for (auto strategy :
  556. {S::PROFILE, S::HEURISTIC, S(S::PROFILE | S::REPRODUCIBLE),
  557. S(S::PROFILE | S::HEURISTIC)}) {
  558. #else
  559. for (auto strategy: {S::HEURISTIC, S(S::PROFILE | S::HEURISTIC)}) {
  560. #endif
  561. auto graph = ComputingGraph::make();
  562. HostTensorGenerator<> gen;
  563. auto mkvar = [&](const char* name, const TensorShape& shp,
  564. const DType& dtype) {
  565. return opr::TypeCvt::make(
  566. opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name),
  567. dtype);
  568. };
  569. auto x = mkvar("x", {16, 4, 50, 50, 4}, dtype::QuantizedS8(1.2f));
  570. auto w = mkvar("w", {16, 4, 4, 4, 4}, dtype::QuantizedS8(1.3f));
  571. param.format = Param::Format::NCHW4;
  572. param.pad_h = param.pad_w = 2;
  573. param.stride_h = param.stride_w = 2;
  574. Policy policy;
  575. policy.strategy = strategy;
  576. auto deconv = opr::ConvolutionBackwardData::make_deconv(
  577. x, w, param, policy,
  578. OperatorNodeConfig{dtype::QuantizedS8(1.2f)});
  579. HostTensorND host_y;
  580. auto func = graph->compile({make_callback_copy(deconv, host_y)});
  581. func->execute();
  582. }
  583. }
  584. TEST(TestOprDNN, ConvolutionBackwardFilter) {
  585. using Checker = AutoOprChecker<3, 1>;
  586. constexpr size_t PH = 0, PW = 1, SH = 1, SW = 2;
  587. auto make_graph = [&](
  588. const Checker::SymInpArray &inputs) -> Checker::SymOutArray {
  589. Param param{Mode::CROSS_CORRELATION, PH, PW, SH, SW};
  590. return {opr::ConvolutionBackwardFilter::make(
  591. inputs[0], inputs[1], inputs[2], param)};
  592. };
  593. auto fwd = [&](Checker::NumOutArray &dest, Checker::NumInpArray inp) {
  594. std::shared_ptr<HostTensorND> out;
  595. conv_bwd_flt_brute({inp[0], inp[1], inp[2]}, out,
  596. Param{Mode::CROSS_CORRELATION, PH, PW, SH, SW});
  597. dest[0] = *out;
  598. };
  599. #define get_shp(N, P, S, F) ((N + 2 * P - F) / S + 1)
  600. #define inp_tensor(N, IC, OC, IH, IW, FH, FW) \
  601. { TensorShape{N, IC, IH, IW}, \
  602. {N, OC, get_shp(IH, PH, SH, FH), get_shp(IW, PW, SW, FW)}, \
  603. {OC, IC, FH, FW} }
  604. Checker::RunOptions opt;
  605. opt.numdiff_eps = 1;
  606. Checker(make_graph, fwd).
  607. run(inp_tensor(2, 3, 4, 9, 8, 4, 3), opt).
  608. run(inp_tensor(1, 5, 3, 7, 9, 3, 4), opt).
  609. run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt);
  610. #undef inp_tensor
  611. #undef get_shp
  612. }
  613. TEST(TestOprDNN, DilatedConvolution) {
  614. using Checker = AutoOprChecker<2, 1>;
  615. opr::ConvolutionForward::Param param;
  616. param.pad_h = 5;
  617. param.pad_w = 2;
  618. param.stride_w = 2;
  619. param.dilate_h = 2;
  620. auto make_graph = [&](const Checker::SymInpArray &inputs) ->
  621. Checker::SymOutArray {
  622. return {opr::Convolution::make(inputs[0], inputs[1], param)};
  623. };
  624. auto fwd = [&](Checker::NumOutArray &dest, Checker::NumInpArray inp) {
  625. auto opr = megdnn_naive_handle()->create_operator<
  626. megdnn::Convolution>();
  627. opr->param() = param;
  628. TensorLayout dest_layout;
  629. opr->deduce_layout(inp[0]->layout(), inp[1]->layout(), dest_layout);
  630. std::vector<dt_byte> workspace(opr->get_workspace_in_bytes(
  631. inp[0]->layout(), inp[1]->layout(), dest_layout, nullptr));
  632. dest[0].dtype(dtype::Float32()).
  633. comp_node(inp[0]->comp_node()).resize(dest_layout);
  634. opr->exec(inp[0]->as_megdnn(), inp[1]->as_megdnn(), dest[0].as_megdnn(),
  635. nullptr, {workspace.data(), workspace.size()});
  636. };
  637. Checker::RunOptions option;
  638. option.numdiff_eps = 0.1;
  639. Checker(make_graph, fwd).
  640. run({TensorShape{2, 3, 8, 7}, TensorShape{4, 3, 2, 2}}, option).
  641. run({TensorShape{2, 3, 8, 7}, TensorShape{4, 3, 3, 2}}, option).
  642. run({TensorShape{2, 3, 8, 9}, TensorShape{4, 3, 3, 2}}, option);
  643. }
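// Grouped convolution is checked against a dense convolution executed once per group:
// the input channels and the matching filter slice of each group are copied out, run
// through func_raw, and the per-group outputs are stitched back into the full tensor.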
  644. TEST(TestOprDNN, GroupConv) {
  645. using Checker = AutoOprChecker<2, 1>;
  646. opr::Convolution::Param param;
  647. param.pad_h = 1;
  648. param.pad_w = 2;
  649. param.stride_h = 2;
  650. auto make_graph = [&](
  651. const Checker::SymInpArray &inputs) -> Checker::SymOutArray {
  652. auto p1 = param;
  653. p1.sparse = opr::Convolution::Param::Sparse::GROUP;
  654. return {opr::Convolution::make(inputs[0], inputs[1], p1)};
  655. };
  656. auto cn = CompNode::load("xpux");
  657. auto inp0 = std::make_shared<HostTensorND>(cn, dtype::Float32()),
  658. inp1 = std::make_shared<HostTensorND>(cn, dtype::Float32());
  659. HostTensorND out_raw;
  660. auto graph_raw = ComputingGraph::make();
  661. auto func_raw = graph_raw->compile({
  662. make_callback_copy(
  663. opr::Convolution::make(
  664. opr::Host2DeviceCopy::make(*graph_raw, inp0),
  665. opr::Host2DeviceCopy::make(*graph_raw, inp1),
  666. param),
  667. out_raw)});
  668. auto fwd = [&](Checker::NumOutArray &dest, Checker::NumInpArray inp) {
  669. auto &&out = dest[0];
  670. auto sl = inp[0]->layout(),
  671. fl = inp[1]->layout().remove_axis(0);
  672. TensorLayout ol;
  673. auto group = inp[1]->layout()[0];
  674. sl.shape[1] /= group;
  675. for (size_t i = 0; i < group; ++ i) {
  676. inp0->copy_from(inp[0]->sub(SubTensorSpec::make_from_offset_elem(
  677. sl, i * sl[1] * sl[2] * sl[3])));
  678. inp1->copy_from(inp[1]->sub(SubTensorSpec::make_from_offset_elem(
  679. fl, i * fl.total_nr_elems())));
  680. func_raw->execute();
  681. if (!i) {
  682. auto oshp = out_raw.shape();
  683. oshp[1] *= group;
  684. out.resize(oshp);
  685. ol = out.layout();
  686. ol[1] /= group;
  687. }
  688. out.sub(SubTensorSpec::make_from_offset_elem(
  689. ol, i * ol[1] * ol[2] * ol[3])).copy_from_fixlayout(
  690. out_raw);
  691. }
  692. };
  693. Checker::RunOptions opt;
  694. opt.numdiff_eps = 1;
  695. opt.outputs_max_err = 5e-5;
  696. Checker checker{make_graph, fwd};
  697. auto run = [&](const TensorShape &ishp,
  698. size_t fh, size_t fw, size_t oc, size_t group) {
  699. size_t ic = ishp[1];
  700. TensorShape flt{group, oc/group, ic/group, fh, fw};
  701. checker.run({ishp, flt}, opt);
  702. };
  703. run({1, 2, 1, 1}, 1, 1, 2, 2);
  704. run({3, 9, 5, 4}, 1, 2, 6, 3);
  705. run({3, 6, 8, 9}, 3, 1, 4, 2);
  706. run({2, 5, 3, 6}, 2, 3, 5, 1);
  707. run({2, 6, 3, 6}, 2, 3, 6, 6);
  708. }
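// MaskConvolution: the reference runs an ordinary convolution and then zeroes every
// output position whose int8 mask entry is 0, matching the masked-output semantics
// exercised by this test.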
  709. TEST(TestOprDNN, MaskConvolution) {
  710. using Checker = AutoOprChecker<3, 1>;
  711. opr::Convolution::Param param;
  712. auto make_graph =
  713. [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  714. return {opr::MaskConvolution::make(inputs[0], inputs[1], inputs[2],
  715. param)};
  716. };
  717. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  718. std::shared_ptr<HostTensorND> sh_out;
  719. convolution_brute({inp[0], inp[1]}, sh_out, param);
  720. dest[0] = *sh_out;
  721. size_t N = dest[0].shape()[0];
  722. size_t OC = dest[0].shape()[1];
  723. size_t OH = dest[0].shape()[2];
  724. size_t OW = dest[0].shape()[3];
  725. auto mask_ptr = inp[2]->ptr<int8_t>();
  726. auto dest_ptr = dest[0].ptr<float>();
  727. for (size_t i = 0; i < N * OC; ++i) {
  728. for (size_t mask_idx = 0; mask_idx < OH * OW; ++mask_idx) {
  729. if (mask_ptr[mask_idx] == 0) {
  730. dest_ptr[i * OH * OW + mask_idx] = 0;
  731. }
  732. }
  733. }
  734. };
  735. auto gen_mask = [](HostTensorND& dest) {
  736. HostTensorGenerator<dtype::Int8, RandomDistribution::UNIFORM>
  737. mask_generator{0, 1};
  738. dest = *mask_generator(dest.shape(), dest.comp_node());
  739. };
  740. auto run_with_param = [&](size_t SH = 1, size_t SW = 1, size_t PH = 0,
  741. size_t PW = 0) {
  742. param.pad_h = PH;
  743. param.pad_w = PW;
  744. param.stride_h = SH;
  745. param.stride_w = SW;
  746. Checker checker{make_graph, fwd};
  747. Checker::RunOptions opt;
  748. checker.set_output_allow_grad(0, false);
  749. checker.set_input_dtype(2, dtype::Int8());
  750. checker.set_input_generator(2, gen_mask);
  751. auto run = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW,
  752. size_t FH, size_t FW) {
  753. size_t OH = (IH + 2 * PH - FH) / SH + 1;
  754. size_t OW = (IW + 2 * PW - FW) / SW + 1;
  755. checker.run(
  756. {TensorShape{N, IC, IH, IW}, {OC, IC, FH, FW}, {OH, OW}},
  757. opt);
  758. };
  759. run(1, 1, 1, 5, 5, 3, 3);
  760. run(2, 3, 4, 5, 5, 3, 3);
  761. run(3, 3, 4, 224, 223, 3, 3);
  762. run(3, 3, 4, 224, 223, 2, 2);
  763. };
  764. run_with_param();
  765. run_with_param(2, 2, 3, 3);
  766. run_with_param(3, 2, 1, 2);
  767. run_with_param(2, 3, 2, 2);
  768. }
  769. TEST(TestOprDNN, MaskPropagate) {
  770. using Checker = AutoOprChecker<3, 1>;
  771. opr::MaskPropagate::Param mask_param;
  772. opr::Convolution::Param conv_param;
  773. auto make_graph =
  774. [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  775. auto inp_mask = inputs[2];
  776. auto out_mask = opr::MaskPropagate::make(inp_mask, mask_param);
  777. return {opr::MaskConvolution::make(inputs[0], inputs[1], out_mask,
  778. conv_param)};
  779. };
  780. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  781. auto& src = *inp[0];
  782. auto& mask = *inp[2];
  783. auto src_ptr = inp[0]->ptr<float>();
  784. auto mask_ptr = inp[2]->ptr<int>();
  785. mgb_assert(src.shape()[2] == mask.shape()[0] &&
  786. src.shape()[3] == mask.shape()[1]);
  787. for (size_t i = 0; i < src.shape()[0] * src.shape()[1]; ++i) {
  788. for (size_t mask_idx = 0;
  789. mask_idx < src.shape()[2] * src.shape()[3]; ++mask_idx) {
  790. if (mask_ptr[mask_idx] == 0) {
  791. src_ptr[i * src.layout().stride[1] + mask_idx] = 0;
  792. }
  793. }
  794. }
  795. std::shared_ptr<HostTensorND> sh_out;
  796. convolution_brute({inp[0], inp[1]}, sh_out, conv_param);
  797. dest[0] = *sh_out;
  798. };
  799. auto gen_mask = [](HostTensorND& dest) {
  800. HostTensorGenerator<dtype::Int32, RandomDistribution::UNIFORM>
  801. mask_generator{0, 1};
  802. dest = *mask_generator(dest.shape(), dest.comp_node());
  803. };
  804. auto run_with_param = [&](size_t FH, size_t FW, size_t SH = 1,
  805. size_t SW = 1, size_t PH = 0, size_t PW = 0,
  806. size_t DH = 1, size_t DW = 1) {
  807. conv_param.pad_h = PH;
  808. conv_param.pad_w = PW;
  809. conv_param.stride_h = SH;
  810. conv_param.stride_w = SW;
  811. conv_param.dilate_h = DH;
  812. conv_param.dilate_w = DW;
  813. mask_param.pad_h = PH;
  814. mask_param.pad_w = PW;
  815. mask_param.stride_h = SH;
  816. mask_param.stride_w = SW;
  817. mask_param.kernel_h = FH;
  818. mask_param.kernel_w = FW;
  819. mask_param.dilate_h = DH;
  820. mask_param.dilate_w = DW;
  821. Checker checker{make_graph, fwd};
  822. Checker::RunOptions opt;
  823. checker.set_output_allow_grad(0, false);
  824. checker.set_input_dtype(2, dtype::Int32());
  825. checker.set_input_generator(2, gen_mask);
  826. auto run = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW) {
  827. checker.run(
  828. {TensorShape{N, IC, IH, IW}, {OC, IC, FH, FW}, {IH, IW}},
  829. opt);
  830. };
  831. run(1, 1, 1, 5, 5);
  832. run(2, 3, 4, 5, 5);
  833. run(3, 3, 4, 224, 223);
  834. run(3, 3, 4, 224, 223);
  835. };
  836. run_with_param(3, 3, 1, 1, 0, 0, 2, 2);
  837. run_with_param(3, 3, 2, 2, 3, 3);
  838. run_with_param(4, 2, 3, 2, 1, 2);
  839. run_with_param(2, 4, 2, 3, 2, 2);
  840. run_with_param(4, 2, 3, 2, 1, 2, 2, 2);
  841. run_with_param(2, 4, 2, 3, 2, 2, 2, 1);
  842. }
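// 3D counterpart of convolution_brute: naive NCDHW forward convolution covering both
// CONVOLUTION and CROSS_CORRELATION modes with stride, padding and dilation.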
  843. void convolution3d_brute(const std::vector<std::shared_ptr<HostTensorND>> &in_tensor,
  844. std::shared_ptr<HostTensorND> &out_tensor,
  845. const opr::Convolution3D::Param &param)
  846. {
  847. mgb_assert(in_tensor.size() == 2);
  848. auto in = in_tensor[0], filter = in_tensor[1];
  849. mgb_assert(in->shape().ndim == 5);
  850. mgb_assert(filter->shape().ndim == 5);
  851. int batch_size = in->shape().shape[0];
  852. int ic = in->shape().shape[1];
  853. int id = in->shape().shape[2];
  854. int ih = in->shape().shape[3];
  855. int iw = in->shape().shape[4];
  856. int fd = filter->shape().shape[2];
  857. int fh = filter->shape().shape[3];
  858. int fw = filter->shape().shape[4];
  859. int pd = param.pad_d;
  860. int ph = param.pad_h;
  861. int pw = param.pad_w;
  862. int sd = param.stride_d;
  863. int sh = param.stride_h;
  864. int sw = param.stride_w;
  865. int dd = param.dilate_d;
  866. int dh = param.dilate_h;
  867. int dw = param.dilate_w;
  868. mgb_assert(id + 2*pd >= (fd - 1) * dd + 1);
  869. mgb_assert(ih + 2*ph >= (fh - 1) * dh + 1);
  870. mgb_assert(iw + 2*pw >= (fw - 1) * dw + 1);
  871. int od = (id + 2*pd - ((fd - 1) * dd + 1)) / sd + 1;
  872. int oh = (ih + 2*ph - ((fh - 1) * dh + 1)) / sh + 1;
  873. int ow = (iw + 2*pw - ((fw - 1) * dw + 1)) / sw + 1;
  874. mgb_assert(static_cast<size_t>(ic) == filter->shape().shape[1]);
  875. int oc = filter->shape().shape[0];
  876. out_tensor = std::make_shared<HostTensorND>(CompNode::load("xpu0"),
  877. TensorShape{
  878. static_cast<size_t>(batch_size),
  879. static_cast<size_t>(oc),
  880. static_cast<size_t>(od),
  881. static_cast<size_t>(oh),
  882. static_cast<size_t>(ow)});
  883. int pn, poc, pod, poh, pow,
  884. pic, pid, pih, piw,
  885. pfd, pfh, pfw;
  886. for (pn = 0; pn < batch_size; ++pn)
  887. for (poc = 0; poc < oc; ++poc)
  888. for (pod = 0, pid = -pd; pod < od; ++pod, pid += sd)
  889. for (poh = 0, pih = -ph; poh < oh; ++poh, pih += sh)
  890. for (pow = 0, piw = -pw; pow < ow; ++pow, piw += sw)
  891. {
  892. float &target = out_tensor->ptr<float>({
  893. static_cast<size_t>(pn),
  894. static_cast<size_t>(poc),
  895. static_cast<size_t>(pod),
  896. static_cast<size_t>(poh),
  897. static_cast<size_t>(pow)})[0];
  898. target = 0;
  899. for (pic = 0; pic < ic; ++pic)
  900. for (pfd = 0; pfd < fd; ++pfd)
  901. for (pfh = 0; pfh < fh; ++pfh)
  902. for (pfw = 0; pfw < fw; ++pfw)
  903. {
  904. int prid, prih, priw;
  905. float img_data, filter_data;
  906. if (param.mode == opr::Convolution3D::Param::Mode::CONVOLUTION) {
  907. prid = pid + (fd - pfd - 1) * dd;
  908. prih = pih + (fh - pfh - 1) * dh;
  909. priw = piw + (fw - pfw - 1) * dw;
  910. } else {
  911. mgb_assert(param.mode == opr::Convolution3D::Param::Mode::CROSS_CORRELATION);
  912. prid = pid + pfd * dd;
  913. prih = pih + pfh * dh;
  914. priw = piw + pfw * dw;
  915. }
  916. if (prid >= 0 && prid < id &&
  917. prih >= 0 && prih < ih &&
  918. priw >= 0 && priw < iw) {
  919. img_data = in_tensor[0]->ptr<float>({
  920. static_cast<size_t>(pn),
  921. static_cast<size_t>(pic),
  922. static_cast<size_t>(prid),
  923. static_cast<size_t>(prih),
  924. static_cast<size_t>(priw)})[0];
  925. } else {
  926. img_data = 0;
  927. }
  928. filter_data = filter->ptr<float>({
  929. static_cast<size_t>(poc),
  930. static_cast<size_t>(pic),
  931. static_cast<size_t>(pfd),
  932. static_cast<size_t>(pfh),
  933. static_cast<size_t>(pfw)})[0];
  934. target += img_data * filter_data;
  935. }
  936. }
  937. }
  938. TEST(TestOprDNN, Convolution3DForward) {
  939. for (uint32_t batch_size : {8})
  940. for (uint32_t id : {12})
  941. for (uint32_t fd : {1, 3})
  942. for (uint32_t ic : {4})
  943. for (uint32_t oc : {ic})
  944. for (uint32_t pd : {0, 2})
  945. for (uint32_t sd : {1, 3})
  946. for (uint32_t dd : {1, 3})
  947. for (bool xcorr : {0, 1}) {
  948. uint32_t ih = id + 1, fh = fd, ph = pd + 1, sh = sd + 1;
  949. uint32_t iw = ih + 1, fw = fh, pw = ph + 1, sw = sh + 1;
  950. Param3D param{xcorr ? Param3D::Mode::CROSS_CORRELATION :
  951. Param3D::Mode::CONVOLUTION , pd, ph, pw,
  952. sd, sh, sw, dd, dd, dd};
  953. // !!! DEPRECATED. use AutoOprChecker instead.
  954. opr::test::ForwardChecker<opr::Convolution3D, 2> forward_checker({
  955. {batch_size, ic, id, ih, iw},
  956. {oc, ic, fd, fh, fw}},
  957. convolution3d_brute, param);
  958. forward_checker.run();
  959. }
  960. }
  961. TEST(TestOprDNN, Convolution3DBackward) {
  962. for (uint32_t batch_size : {8})
  963. for (uint32_t id : {12})
  964. for (uint32_t fd : {1, 3})
  965. for (uint32_t ic : {4})
  966. for (uint32_t oc : {ic})
  967. for (uint32_t pd : {0, 2})
  968. for (uint32_t sd : {1, 3})
  969. for (uint32_t dd : {1, 3})
  970. for (bool xcorr : {0, 1}) {
  971. uint32_t ih = id + 1, fh = fd, ph = pd + 1, sh = sd + 1;
  972. uint32_t iw = ih + 1, fw = fh, pw = ph + 1, sw = sh + 1;
  973. Param3D param{xcorr ? Param3D::Mode::CROSS_CORRELATION :
  974. Param3D::Mode::CONVOLUTION,
  975. pd, ph, pw, sd, sh, sw, dd, dd, dd};
  976. // !!! DEPRECATED. use AutoOprChecker instead.
  977. opr::test::BackwardChecker<opr::Convolution3D, 2> backward_checker(
  978. {{batch_size, ic, id, ih, iw},
  979. {oc, ic, fd, fh, fw}}, param, 1e-2, 1);
  980. backward_checker.run();
  981. }
  982. }
  983. TEST(TestOprDNN, GroupConv3D) {
  984. using Checker = AutoOprChecker<2, 1>;
  985. opr::Convolution3D::Param param;
  986. param.pad_d = 0;
  987. param.pad_h = 1;
  988. param.pad_w = 0;
  989. param.stride_d = 1;
  990. param.stride_h = 2;
  991. auto make_graph = [&](
  992. const Checker::SymInpArray &inputs) -> Checker::SymOutArray {
  993. auto p1 = param;
  994. p1.sparse = opr::Convolution3D::Param::Sparse::GROUP;
  995. return {opr::Convolution3D::make(inputs[0], inputs[1], p1)};
  996. };
  997. auto cn = CompNode::load("xpux");
  998. auto inp0 = std::make_shared<HostTensorND>(cn, dtype::Float32()),
  999. inp1 = std::make_shared<HostTensorND>(cn, dtype::Float32());
  1000. HostTensorND out_raw;
  1001. auto graph_raw = ComputingGraph::make();
  1002. auto func_raw = graph_raw->compile({
  1003. make_callback_copy(
  1004. opr::Convolution3D::make(
  1005. opr::Host2DeviceCopy::make(*graph_raw, inp0),
  1006. opr::Host2DeviceCopy::make(*graph_raw, inp1),
  1007. param),
  1008. out_raw)});
  1009. auto fwd = [&](Checker::NumOutArray &dest, Checker::NumInpArray inp) {
  1010. auto &&out = dest[0];
  1011. auto sl = inp[0]->layout(),
  1012. fl = inp[1]->layout().remove_axis(0);
  1013. TensorLayout ol;
  1014. auto group = inp[1]->layout()[0];
  1015. sl.shape[1] /= group;
  1016. for (size_t i = 0; i < group; ++ i) {
  1017. inp0->copy_from(inp[0]->sub(SubTensorSpec::make_from_offset_elem(
  1018. sl, i * sl[1] * sl[2] * sl[3] * sl[4])));
  1019. inp1->copy_from(inp[1]->sub(SubTensorSpec::make_from_offset_elem(
  1020. fl, i * fl.total_nr_elems())));
  1021. func_raw->execute();
  1022. if (!i) {
  1023. auto oshp = out_raw.shape();
  1024. oshp[1] *= group;
  1025. out.resize(oshp);
  1026. ol = out.layout();
  1027. ol[1] /= group;
  1028. }
  1029. out.sub(SubTensorSpec::make_from_offset_elem(
  1030. ol, i * ol[1] * ol[2] * ol[3] * ol[4])).
  1031. copy_from_fixlayout(out_raw);
  1032. }
  1033. };
  1034. Checker::RunOptions opt;
  1035. opt.numdiff_eps = 1;
  1036. opt.outputs_max_err = 5e-5;
  1037. Checker checker{make_graph, fwd};
  1038. auto run = [&](const TensorShape &ishp,
  1039. size_t fd, size_t fh, size_t fw, size_t oc, size_t group) {
  1040. size_t ic = ishp[1];
  1041. TensorShape flt{group, oc/group, ic/group, fd, fh, fw};
  1042. checker.
  1043. run({ishp, flt}, opt);
  1044. };
  1045. run({1, 2, 1, 1, 1}, 1, 1, 1, 2, 2);
  1046. run({3, 9, 5, 4, 3}, 1, 2, 3, 6, 3);
  1047. run({2, 1, 3, 6, 9}, 2, 3, 3, 5, 1);
  1048. run({2, 1, 3, 6, 9}, 2, 3, 3, 5, 1);
  1049. }
  1050. TEST(TestOprDNN, Deconvolution3D) {
  1051. using Checker = AutoOprChecker<2, 1>;
  1052. Param3D param{Param3D::Mode::CROSS_CORRELATION, 0, 1, 1, 1, 2, 2};
  1053. param.sparse = Param3D::Sparse::GROUP;
  1054. auto make_graph = [&](
  1055. const Checker::SymInpArray &inputs) -> Checker::SymOutArray {
  1056. return {opr::Convolution3DBackwardData::make_deconv(
  1057. inputs[0], inputs[1], param)};
  1058. };
  1059. auto fwd = [&](Checker::NumOutArray &dest, Checker::NumInpArray inp) {
  1060. auto &&data = *inp[0], &&filter = *inp[1];
  1061. size_t N = data.shape(0),
  1062. ID = data.shape(2), IH = data.shape(3), IW = data.shape(4),
  1063. GROUP = filter.shape(0),
  1064. ICPG = filter.shape(1), OCPG = filter.shape(2),
  1065. FD = filter.shape(3), FH = filter.shape(4), FW = filter.shape(5);
  1066. auto &&out = dest[0];
  1067. auto get_shp = [](
  1068. size_t inp, size_t filter, size_t stride, size_t pad,
  1069. size_t dilate) {
  1070. return (inp - 1) * stride + (filter - 1) * dilate + 1 - pad * 2;
  1071. };
  1072. size_t OD = get_shp(ID, FD,
  1073. param.stride_d, param.pad_d, param.dilate_d),
  1074. OH = get_shp(IH, FH,
  1075. param.stride_h, param.pad_h, param.dilate_h),
  1076. OW = get_shp(IW, FW,
  1077. param.stride_w, param.pad_w, param.dilate_w);
  1078. out.resize({N, OCPG * GROUP, OD, OH, OW});
  1079. auto fptr = filter.ptr<float>(),
  1080. dptr = data.ptr<float>(),
  1081. optr = out.ptr<float>();
  1082. memset(optr, 0, sizeof(float) * out.shape().total_nr_elems());
  1083. auto ol = out.layout(), fl = filter.layout();
  1084. #define FOR2(a, A, b, B) \
  1085. for (size_t a = 0; a < A; ++ a) \
  1086. for (size_t b = 0; b < B; ++ b)
  1087. #define FOR3(a, A, b, B, c, C) \
  1088. FOR2(a, A, b, B) \
  1089. for (size_t c = 0; c < C; ++ c)
  1090. #define FOR4(a, A, b, B, c, C, d, D) \
  1091. FOR3(a, A, b, B, c, C) \
  1092. for (size_t d = 0; d < D; ++ d)
  1093. FOR3(n, N, group, GROUP, icg, ICPG)
  1094. FOR3(id, ID, ih, IH, iw, IW) {
  1095. float scale = *(dptr ++);
  1096. FOR4(ocg, OCPG, fd, FD, fh, FH, fw, FW) {
  1097. auto oc_tot = group * OCPG + ocg;
  1098. int od = int(id * param.stride_d +
  1099. fd * param.dilate_d) - int(param.pad_d),
  1100. oh = int(ih * param.stride_h +
  1101. fh * param.dilate_h) - int(param.pad_h),
  1102. ow = int(iw * param.stride_w +
  1103. fw * param.dilate_w) - int(param.pad_w);
  1104. if (od >= 0 && oh >= 0 && ow >= 0 &&
  1105. od < static_cast<int>(OD) &&
  1106. oh < static_cast<int>(OH) &&
  1107. ow < static_cast<int>(OW)) {
  1108. auto out_off = n * ol.stride[0] + oc_tot * ol.stride[1] +
  1109. od * ol.stride[2] + oh * ol.stride[3] + ow,
  1110. flt_off = group * fl.stride[0] + icg * fl.stride[1] +
  1111. ocg * fl.stride[2] + fd * fl.stride[3] +
  1112. fh * fl.stride[4] + fw;
  1113. optr[out_off] += scale * fptr[flt_off];
  1114. }
  1115. }
  1116. }
  1117. #undef FOR4
  1118. #undef FOR3
  1119. #undef FOR2
  1120. };
  1121. Checker::RunOptions opt;
  1122. opt.numdiff_eps = 1;
  1123. Checker(make_graph, fwd).
  1124. run({TensorShape{2, 4, 3, 3, 2}, {1, 4, 5, 3, 2, 2}}, opt).
  1125. run({TensorShape{3, 2, 1, 1, 1}, {2, 1, 1, 4, 3, 3}}, opt).
  1126. run({TensorShape{4, 6, 2, 2, 2}, {2, 3, 4, 6, 5, 4}}, opt);
  1127. }
  1128. TEST(TestOprDNN, Convolution3DExePolicy) {
  1129. Param3D param{Param3D::Mode::CONVOLUTION};
  1130. using Policy = opr::Convolution3D::ExecutionPolicy;
  1131. using S = Policy::Strategy;
  1132. #if MGB_ENABLE_FASTRUN
  1133. for (auto strategy :
  1134. SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
  1135. S::PROFILE | S::HEURISTIC}) {
  1136. #else
  1137. for (auto strategy :
  1138. SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
  1139. #endif
  1140. using Checker = AutoOprChecker<2, 1>;
  1141. auto make_graph = [&](const Checker::SymInpArray &inputs) ->
  1142. Checker::SymOutArray {
  1143. Policy policy;
  1144. policy.strategy = strategy;
  1145. auto out = opr::Convolution3D::make(
  1146. inputs[0], inputs[1], param, policy);
  1147. return {out};
  1148. };
  1149. auto fwd = [&](Checker::NumOutArray &dest, Checker::NumInpArray inp) {
  1150. std::shared_ptr<HostTensorND> sh_out;
  1151. convolution3d_brute({inp.begin(), inp.end()}, sh_out, param);
  1152. dest[0] = *sh_out;
  1153. };
  1154. Checker::RunOptions opt;
  1155. opt.numdiff_eps = 1;
  1156. Checker(make_graph, fwd).
  1157. run({TensorShape{3, 2, 3, 4, 1}, {4, 2, 2, 2, 1}}, opt).
  1158. run({TensorShape{3, 3, 2, 6, 2}, {2, 3, 1, 4, 1}}, opt).
  1159. run({TensorShape{1, 1, 4, 4, 4}, {2, 1, 3, 3, 3}}, opt);
  1160. }
  1161. }
  1162. TEST(TestOprDNN, ConvBiasForward) {
  1163. using Checker2 = AutoOprChecker<2, 1>;
  1164. using Checker3 = AutoOprChecker<3, 1>;
  1165. opr::ConvBiasForward::Param param;
  1166. auto make_graph2 =
  1167. [&](const Checker2::SymInpArray& inputs) -> Checker2::SymOutArray {
  1168. return {opr::ConvBiasForward::make(inputs[0], inputs[1], param)};
  1169. };
  1170. auto make_graph3 =
  1171. [&](const Checker3::SymInpArray& inputs) -> Checker3::SymOutArray {
  1172. return {opr::ConvBiasForward::make(inputs[0], inputs[1], inputs[2],
  1173. param)};
  1174. };
  1175. auto fwd2 = [&](Checker2::NumOutArray& dest, Checker2::NumInpArray inp) {
  1176. std::shared_ptr<HostTensorND> sh_out;
  1177. convolution_brute({inp[0], inp[1]}, sh_out,
  1178. convert_to_conv_param(param));
  1179. dest[0] = *sh_out;
  1180. };
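// fwd3: same reference convolution, then add the per-channel bias broadcast over N, H and W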
  1181. auto fwd3 = [&](Checker3::NumOutArray& dest, Checker3::NumInpArray inp) {
  1182. std::shared_ptr<HostTensorND> sh_out;
  1183. convolution_brute({inp[0], inp[1]}, sh_out,
  1184. convert_to_conv_param(param));
  1185. dest[0] = *sh_out;
  1186. size_t N = dest[0].shape()[0];
  1187. size_t OC = dest[0].shape()[1];
  1188. size_t OH = dest[0].shape()[2];
  1189. size_t OW = dest[0].shape()[3];
  1190. auto dest_ptr = dest[0].ptr<float>();
  1191. for (size_t i = 0; i < N; i++) {
  1192. auto bias_ptr = inp[2]->ptr<float>();
  1193. for (size_t c = 0; c < OC; c++) {
  1194. for (size_t hw = 0; hw < OH * OW; hw++) {
  1195. *(dest_ptr++) += *(bias_ptr);
  1196. }
  1197. bias_ptr++;
  1198. }
  1199. }
  1200. };
  1201. auto run_with_param = [&](size_t SH = 1, size_t SW = 1, size_t PH = 0,
  1202. size_t PW = 0) {
  1203. param.pad_h = PH;
  1204. param.pad_w = PW;
  1205. param.stride_h = SH;
  1206. param.stride_w = SW;
  1207. Checker2 checker2{make_graph2, fwd2};
  1208. Checker2::RunOptions opt2;
  1209. checker2.set_output_allow_grad(0, false);
  1210. Checker3 checker3{make_graph3, fwd3};
  1211. Checker3::RunOptions opt3;
  1212. checker3.set_output_allow_grad(0, false);
  1213. auto run = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW,
  1214. size_t FH, size_t FW) {
  1215. auto opr = megdnn_naive_handle()
  1216. ->create_operator<megdnn::ConvolutionForward>();
  1217. opr->param() = convert_to_conv_param(param);
  1218. TensorLayout dest_layout;
  1219. opr->deduce_layout({{N, IC, IH, IW}, dtype::Float32()},
  1220. {{OC, IC, FH, FW}, dtype::Float32()},
  1221. dest_layout);
  1222. checker2.run({TensorShape{N, IC, IH, IW}, {OC, IC, FH, FW}}, opt2);
  1223. checker3.run({TensorShape{N, IC, IH, IW},
  1224. {OC, IC, FH, FW},
  1225. {1, OC, 1, 1}},
  1226. opt3);
  1227. };
  1228. run(1, 1, 1, 5, 5, 1, 1);
  1229. run(1, 1, 1, 5, 5, 3, 3);
  1230. run(2, 3, 4, 5, 5, 3, 3);
  1231. run(3, 3, 4, 224, 223, 3, 3);
  1232. run(3, 3, 4, 224, 223, 2, 2);
  1233. };
  1234. run_with_param();
  1235. run_with_param(2, 2, 3, 3);
  1236. run_with_param(3, 2, 1, 2);
  1237. run_with_param(2, 3, 2, 2);
  1238. }
  1239. TEST(TestOprDNN, ConvBiasForwardWithZ) {
  1240. REQUIRE_GPU(1);
  1241. using Checker4 = AutoOprChecker<4, 1>;
  1242. opr::ConvBiasForward::Param param;
  1243. auto make_graph4 =
  1244. [&](const Checker4::SymInpArray& inputs) -> Checker4::SymOutArray {
  1245. return {opr::ConvBiasForward::make(inputs[0], inputs[1], inputs[2],
  1246. inputs[3], param)};
  1247. };
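// fwd4: reference convolution plus per-channel bias and the elementwise z input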
  1248. auto fwd4 = [&](Checker4::NumOutArray& dest, Checker4::NumInpArray inp) {
  1249. std::shared_ptr<HostTensorND> sh_out;
  1250. convolution_brute({inp[0], inp[1]}, sh_out,
  1251. convert_to_conv_param(param));
  1252. dest[0] = *sh_out;
  1253. size_t N = dest[0].shape()[0];
  1254. size_t OC = dest[0].shape()[1];
  1255. size_t OH = dest[0].shape()[2];
  1256. size_t OW = dest[0].shape()[3];
  1257. auto dest_ptr = dest[0].ptr<float>();
  1258. float* z_ptr = inp[3]->ptr<float>();
  1259. for (size_t i = 0; i < N; i++) {
  1260. auto bias_ptr = inp[2]->ptr<float>();
  1261. for (size_t c = 0; c < OC; c++) {
  1262. for (size_t hw = 0; hw < OH * OW; hw++) {
  1263. *(dest_ptr++) += *(bias_ptr) + *(z_ptr++);
  1264. }
  1265. bias_ptr++;
  1266. }
  1267. }
  1268. };
  1269. auto run_with_param = [&](size_t SH = 1, size_t SW = 1, size_t PH = 0,
  1270. size_t PW = 0) {
  1271. param.pad_h = PH;
  1272. param.pad_w = PW;
  1273. param.stride_h = SH;
  1274. param.stride_w = SW;
  1275. Checker4 checker4{make_graph4, fwd4};
  1276. Checker4::RunOptions opt4;
  1277. checker4.set_output_allow_grad(0, false);
  1278. auto run = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW,
  1279. size_t FH, size_t FW) {
  1280. auto opr = megdnn_naive_handle()
  1281. ->create_operator<megdnn::ConvolutionForward>();
  1282. opr->param() = convert_to_conv_param(param);
  1283. TensorLayout dest_layout;
  1284. opr->deduce_layout({{N, IC, IH, IW}, dtype::Float32()},
  1285. {{OC, IC, FH, FW}, dtype::Float32()},
  1286. dest_layout);
  1287. checker4.run({TensorShape{N, IC, IH, IW},
  1288. {OC, IC, FH, FW},
  1289. {1, OC, 1, 1},
  1290. {N, OC, dest_layout[2], dest_layout[3]}},
  1291. opt4);
  1292. };
  1293. run(1, 1, 1, 5, 5, 3, 3);
  1294. run(2, 3, 4, 5, 5, 3, 3);
  1295. run(3, 3, 4, 224, 223, 3, 3);
  1296. run(3, 3, 4, 224, 223, 2, 2);
  1297. };
  1298. run_with_param();
  1299. run_with_param(2, 2, 3, 3);
  1300. run_with_param(3, 2, 1, 2);
  1301. run_with_param(2, 3, 2, 2);
  1302. }
  1303. TEST(TestOprDNN, ConvBiasINT8x8xX_NCHW4) {
  1304. using Checker = AutoOprChecker<3, 1>;
  1305. using Param = opr::ConvBias::Param;
  1306. opr::ConvBiasForward::Param param;
  1307. auto make_quantized = [&](SymbolVar x, const DType& dtype) {
  1308. return opr::TypeCvt::make(x, dtype);
  1309. };
  1310. auto make_graph =
  1311. [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  1312. auto conv_param = convert_to_conv_param(param);
  1313. auto y = opr::Convolution::make(
  1314. make_quantized(inputs[0], dtype::QuantizedS8(0.3f)),
  1315. make_quantized(inputs[1], dtype::QuantizedS8(0.1f)), conv_param);
  1316. y = y + make_quantized(inputs[2], dtype::QuantizedS32(0.03f));
  1317. if (param.nonlineMode == Param::NonlineMode::RELU)
  1318. y = opr::Elemwise::make(
  1319. {y}, {opr::Elemwise::Mode::RELU});
  1320. y = opr::TypeCvt::make(y, dtype::QuantizedS8(0.5f));
  1321. return {opr::TypeCvt::make(y, dtype::Float32())};
  1322. };
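// expected output: rebuild the same quantized graph and let the inference optimizer
// fuse conv, bias and the nonlinearity before executing it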
  1323. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1324. auto graph = ComputingGraph::make();
  1325. Checker::SymInpArray inputs;
  1326. for (size_t i = 0; i < inp.size(); ++i) {
  1327. inputs[i] = opr::Host2DeviceCopy::make(
  1328. *graph, inp[i]);
  1329. }
  1330. auto options = gopt::OptimizeForInferenceOptions{};
  1331. options.enable_fuse_conv_bias_nonlinearity();
  1332. auto y = gopt::optimize_for_inference({make_graph(inputs)[0]},
  1333. options)[0];
  1334. auto func = graph->compile({make_callback_copy(y, dest[0])});
  1335. func->execute();
  1336. func->wait();
  1337. };
  1338. auto run_with_param = [&](size_t SH = 1, size_t SW = 1, size_t PH = 0,
  1339. size_t PW = 0, size_t group = 1) {
  1340. param.pad_h = PH;
  1341. param.pad_w = PW;
  1342. param.stride_h = SH;
  1343. param.stride_w = SW;
  1344. param.format = Param::Format::NCHW4;
  1345. if (group != 1)
  1346. param.sparse = Param::Sparse::GROUP;
  1347. Checker checker{make_graph, fwd, CompNode::load("cpu0")};
  1348. Checker::RunOptions opt;
  1349. checker.set_output_allow_grad(0, false);
  1350. auto run = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW,
  1351. size_t FH, size_t FW) {
  1352. mgb_assert(IC % 4 == 0 && OC % 4 == 0);
  1353. checker.run({TensorShape{N, group * IC / 4, IH, IW, 4},
  1354. {group, OC, IC / 4, FH, FW, 4},
  1355. {1, group * OC / 4, 1, 1, 4}},
  1356. opt);
  1357. };
  1358. run(1, 8, 8, 56, 56, 3, 3);
  1359. run(1, 8, 8, 56, 56, 3, 3);
  1360. run(1, 8, 8, 56, 56, 3, 3);
  1361. };
  1362. run_with_param(1, 1, 1, 1, 8);
  1363. run_with_param();
  1364. run_with_param(2, 2, 3, 3);
  1365. run_with_param(3, 2, 1, 2);
  1366. run_with_param(2, 3, 2, 2);
  1367. }
  1368. TEST(TestOprDNN, ConvolutionDTypeInference) {
  1369. Param param;
  1370. param.mode = Mode::CONVOLUTION;
  1371. auto cn = CompNode::load("cpu0");
  1372. auto graph = ComputingGraph::make();
  1373. HostTensorND inp_host{
  1374. cn, {1, 3, 7, 7}, dtype::Quantized8Asymm(0.233f, (uint8_t)123)};
  1375. HostTensorND filt_host{
  1376. cn, {8, 3, 1, 1}, dtype::Quantized8Asymm(0.874f, (uint8_t)234)};
  1377. auto inp = opr::ImmutableTensor::make(*graph, inp_host);
  1378. auto filt = opr::ImmutableTensor::make(*graph, filt_host);
  1379. auto opr = opr::Convolution::make(inp, filt, param);
  1380. ASSERT_EQ(opr.dtype().enumv(), DTypeEnum::QuantizedS32);
1381. // the deduced scale must match exactly, so use EQ instead of NEAR
  1382. EXPECT_EQ(opr.dtype().param<dtype::QuantizedS32>().scale, 0.233f * 0.874f);
  1383. inp_host = {cn, {1, 3, 7, 7}, dtype::QuantizedS8(0.1234f)};
  1384. filt_host = {cn, {8, 3, 1, 1}, dtype::QuantizedS8(0.2345f)};
  1385. inp = opr::ImmutableTensor::make(*graph, inp_host);
  1386. filt = opr::ImmutableTensor::make(*graph, filt_host);
  1387. opr = opr::Convolution::make(inp, filt, param);
  1388. ASSERT_EQ(opr.dtype().enumv(), DTypeEnum::QuantizedS32);
  1389. EXPECT_EQ(opr.dtype().param<dtype::QuantizedS32>().scale,
  1390. 0.1234f * 0.2345f);
  1391. inp_host = {cn, {1, 3, 7, 7}, dtype::Int8()};
  1392. filt_host = {cn, {8, 3, 1, 1}, dtype::Int8()};
  1393. inp = opr::ImmutableTensor::make(*graph, inp_host);
  1394. filt = opr::ImmutableTensor::make(*graph, filt_host);
  1395. opr = opr::Convolution::make(inp, filt, param);
  1396. ASSERT_EQ(opr.dtype().enumv(), DTypeEnum::Int32);
  1397. }
  1398. TEST(TestOprDNN, ConvBiasINT8x8xXDTypeInference) {
  1399. float inp_scale = 1.926f;
  1400. float filt_scale = 0.817f;
  1401. float bias_scale = inp_scale * filt_scale;
  1402. opr::ConvBias::Param param;
  1403. param.mode = Mode::CONVOLUTION;
  1404. auto cn = CompNode::load("cpu0");
  1405. auto graph = ComputingGraph::make();
  1406. HostTensorND inp_host{cn, {1, 3, 7, 7}, dtype::QuantizedS8(inp_scale)};
  1407. HostTensorND filt_host{cn, {8, 3, 1, 1}, dtype::QuantizedS8(filt_scale)};
  1408. DType output_dtype = dtype::QuantizedS8(bias_scale);
1409. HostTensorND bias_host{cn, {1, 8, 1, 1}, dtype::QuantizedS32(bias_scale)};
  1410. auto inp = opr::ImmutableTensor::make(*graph, inp_host);
  1411. auto filt = opr::ImmutableTensor::make(*graph, filt_host);
1412. auto bias = opr::ImmutableTensor::make(*graph, bias_host);
  1413. auto opr = opr::ConvBiasForward::make(inp, filt, bias, param,
  1414. {}, OperatorNodeConfig{output_dtype});
  1415. ASSERT_EQ(opr.dtype().enumv(), DTypeEnum::QuantizedS8);
  1416. EXPECT_EQ(opr.dtype().param<dtype::QuantizedS8>().scale, bias_scale);
  1417. }
  1418. TEST(TestOprDNN, ConvBiasINT8x8xXSerialization) {
  1419. using namespace serialization;
  1420. float inp_scale = 1.926f;
  1421. float filt_scale = 0.817f;
  1422. float bias_scale = inp_scale * filt_scale;
  1423. DType output_dtype = dtype::QuantizedS8(bias_scale);
  1424. auto fname = output_file("ConvBiasINT8x8xXTest");
  1425. auto dump = [&]() {
  1426. opr::ConvBias::Param param;
  1427. param.mode = Mode::CONVOLUTION;
  1428. auto cn = CompNode::load("cpu0");
  1429. auto graph = ComputingGraph::make();
  1430. HostTensorND inp_host{cn, {1, 3, 7, 7}, dtype::QuantizedS8(inp_scale)};
  1431. HostTensorND filt_host{
  1432. cn, {8, 3, 1, 1}, dtype::QuantizedS8(filt_scale)};
  1433. HostTensorND bias_host{
1434. cn, {1, 8, 1, 1}, dtype::QuantizedS32(bias_scale)};
  1435. auto inp = opr::ImmutableTensor::make(*graph, inp_host);
  1436. auto filt = opr::ImmutableTensor::make(*graph, filt_host);
1437. auto bias = opr::ImmutableTensor::make(*graph, bias_host);
  1438. auto opr = opr::ConvBiasForward::make(inp, filt, bias, param,
  1439. {},
  1440. OperatorNodeConfig{output_dtype});
  1441. auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
  1442. auto rst = dumper->dump({opr});
  1443. ASSERT_EQ(rst.outputs.size(), 1u);
  1444. };
  1445. auto load = [&]() {
  1446. auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
  1447. auto rst = loader->load();
  1448. ASSERT_EQ(rst.output_var_list.size(), 1u);
  1449. EXPECT_EQ(rst.output_var_list[0].dtype(), output_dtype);
  1450. };
  1451. dump();
  1452. load();
  1453. }
  1454. TEST(TestOprDNN, LocalShareForward) {
  1455. REQUIRE_GPU(1);
  1456. using Checker = AutoOprChecker<2, 1>;
  1457. using Param = opr::LocalShare::Param;
  1458. Param param;
  1459. param.mode = Param::Mode::CROSS_CORRELATION;
  1460. param.sparse = Param::Sparse::DENSE;
  1461. auto make_graph =
  1462. [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  1463. return {opr::LocalShare::make(inputs[0], inputs[1], param)};
  1464. };
  1465. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1466. mgb_assert(inp.size() == 2);
  1467. mgb_assert(dest.size() == 1);
  1468. std::shared_ptr<HostTensorND> out;
  1469. local_share_brute({inp[0], inp[1]}, out, param);
  1470. dest[0] = *out;
  1471. };
  1472. auto run_with_param = [&](size_t fh = 3, size_t fw = 3, size_t sh = 1,
  1473. size_t sw = 1, size_t sgh = 3, size_t sgw = 3) {
  1474. size_t ph = fh / 2, pw = fw / 2;
  1475. param.pad_h = ph, param.pad_w = pw;
  1476. param.stride_h = sh, param.stride_w = sw, param.spatial_groups_h = sgh,
  1477. param.spatial_groups_w = sgw;
  1478. Checker checker{make_graph, fwd};
  1479. Checker::RunOptions opt;
  1480. checker.set_output_allow_grad(0, false);
  1481. checker.set_input_dtype(0, dtype::Float32());
  1482. checker.set_input_dtype(1, dtype::Float32());
  1483. auto run = [&](size_t n, size_t ci, size_t co, size_t hi, size_t wi) {
  1484. size_t ho = (hi + 2 * ph - fh) / sh + 1;
  1485. size_t wo = (wi + 2 * pw - fw) / sw + 1;
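// skip shapes whose output cannot be divided evenly into spatial groups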
  1486. if (ho % sgh != 0 || wo % sgw != 0)
  1487. return;
  1488. checker.run({TensorShape{n, ci, hi, wi},
  1489. TensorShape{sgh, sgw, ci, fh, fw, co}},
  1490. opt);
  1491. };
  1492. run(32, 2, 7, 24, 24);
  1493. run(16, 2, 7, 24, 24);
  1494. run(32, 2, 8, 12, 12);
  1495. run(16, 2, 9, 6, 6);
  1496. };
  1497. run_with_param(1, 1, 1, 1, 3, 3);
  1498. run_with_param(3, 3, 1, 1, 2, 2);
  1499. run_with_param(5, 5, 1, 1, 2, 2);
  1500. run_with_param(7, 7, 1, 1, 2, 2);
  1501. run_with_param(1, 1, 2, 2, 3, 3);
  1502. run_with_param(3, 3, 2, 2, 2, 2);
  1503. run_with_param(5, 5, 1, 1, 2, 2);
  1504. run_with_param(7, 7, 1, 1, 2, 2);
  1505. }
  1506. TEST(TestOprDNN, LocalShareForwardGrad) {
  1507. REQUIRE_GPU(1);
  1508. using Checker = AutoOprChecker<2, 1>;
  1509. using Param = opr::LocalShare::Param;
  1510. Param param;
  1511. param.mode = Param::Mode::CROSS_CORRELATION;
  1512. param.sparse = Param::Sparse::DENSE;
  1513. auto make_graph =
  1514. [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
  1515. return {opr::LocalShare::make(inputs[0], inputs[1], param)};
  1516. };
  1517. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1518. mgb_assert(inp.size() == 2);
  1519. mgb_assert(dest.size() == 1);
  1520. std::shared_ptr<HostTensorND> out;
  1521. local_share_brute({inp[0], inp[1]}, out, param);
  1522. dest[0] = *out;
  1523. };
  1524. auto run_with_param = [&](size_t fh = 3, size_t fw = 3, size_t sh = 1,
  1525. size_t sw = 1, size_t sgh = 3, size_t sgw = 3) {
  1526. size_t ph = fh / 2, pw = fw / 2;
  1527. param.pad_h = ph, param.pad_w = pw;
  1528. param.stride_h = sh, param.stride_w = sw, param.spatial_groups_h = sgh,
  1529. param.spatial_groups_w = sgw;
  1530. Checker checker{make_graph, fwd};
  1531. Checker::RunOptions opt;
  1532. checker.set_output_allow_grad(0, true);
  1533. opt.numdiff_max_err = 1e-1;
  1534. checker.set_input_dtype(0, dtype::Float32());
  1535. checker.set_input_dtype(1, dtype::Float32());
  1536. auto run = [&](size_t n, size_t ci, size_t co, size_t hi, size_t wi) {
  1537. size_t ho = (hi + 2 * ph - fh) / sh + 1;
  1538. size_t wo = (wi + 2 * pw - fw) / sw + 1;
  1539. if (ho % sgh != 0 || wo % sgw != 0)
  1540. return;
  1541. checker.run({TensorShape{n, ci, hi, wi},
  1542. TensorShape{sgh, sgw, ci, fh, fw, co}},
  1543. opt);
  1544. };
  1545. run(4, 2, 8, 24, 24);
  1546. run(8, 2, 4, 6, 6);
  1547. run(16, 4, 8, 12, 12);
  1548. run(4, 4, 8, 12, 12);
  1549. };
  1550. run_with_param(1, 1, 1, 1, 3, 3);
  1551. run_with_param(1, 1, 2, 2, 3, 3);
  1552. run_with_param(3, 3, 2, 2, 2, 2);
  1553. }
  1554. TEST(TestOprDNN, LocalShareForwardExecPolicy) {
  1555. REQUIRE_GPU(1);
  1556. using Checker = AutoOprChecker<2, 1>;
  1557. using Policy = opr::LocalShare::ExecutionPolicy;
  1558. using S = Policy::Strategy;
  1559. using Param = opr::LocalShare::Param;
  1560. Param param;
  1561. param.mode = Param::Mode::CROSS_CORRELATION;
  1562. param.sparse = Param::Sparse::DENSE;
  1563. int nr_get = 0;
  1564. auto on_get = [&nr_get](const std::string&, const void*, size_t,
  1565. const void*, size_t) { ++nr_get; };
  1566. PersistentCacheHook cache_hook{on_get};
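// count persistent-cache lookups: profiling strategies are expected to query the
// cache, while the plain heuristic should not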
  1567. #if MGB_ENABLE_FASTRUN
  1568. for (auto strategy :
  1569. SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
  1570. S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTIMIZED}) {
  1571. #else
  1572. for (auto strategy :
1573. SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
  1574. #endif
  1575. auto make_graph = [&](const Checker::SymInpArray& inputs)
  1576. -> Checker::SymOutArray {
  1577. Policy policy;
  1578. policy.strategy = strategy;
  1579. return {opr::LocalShare::make(inputs[0], inputs[1], param, policy)};
  1580. };
  1581. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1582. mgb_assert(inp.size() == 2);
  1583. mgb_assert(dest.size() == 1);
  1584. std::shared_ptr<HostTensorND> out;
  1585. local_share_brute({inp[0], inp[1]}, out, param);
  1586. dest[0] = *out;
  1587. };
  1588. auto run_with_param = [&](size_t fh = 3, size_t fw = 3, size_t sh = 1,
  1589. size_t sw = 1, size_t sgh = 3,
  1590. size_t sgw = 3) {
  1591. size_t ph = fh / 2, pw = fw / 2;
  1592. param.pad_h = ph, param.pad_w = pw;
  1593. param.stride_h = sh, param.stride_w = sw,
  1594. param.spatial_groups_h = sgh, param.spatial_groups_w = sgw;
  1595. Checker checker{make_graph, fwd};
  1596. Checker::RunOptions opt;
  1597. checker.set_output_allow_grad(0, false);
  1598. checker.set_input_dtype(0, dtype::Float32());
  1599. checker.set_input_dtype(1, dtype::Float32());
  1600. nr_get = 0;
  1601. opt.outputs_max_err = 1e-3;
  1602. auto run = [&](size_t n, size_t ci, size_t co, size_t hi,
  1603. size_t wi) {
  1604. size_t ho = (hi + 2 * ph - fh) / sh + 1;
  1605. size_t wo = (wi + 2 * pw - fw) / sw + 1;
  1606. if (ho % sgh != 0 || wo % sgw != 0)
  1607. return;
  1608. checker.run({TensorShape{n, ci, hi, wi},
  1609. TensorShape{sgh, sgw, ci, fh, fw, co}},
  1610. opt);
  1611. };
  1612. run(32, 4, 8, 24, 24);
  1613. run(32, 4, 8, 12, 12);
  1614. run(16, 4, 8, 12, 12);
  1615. run(32, 4, 8, 6, 6);
  1616. if (strategy == S::HEURISTIC) {
  1617. ASSERT_EQ(0, nr_get);
  1618. } else {
  1619. ASSERT_LT(0, nr_get);
  1620. }
  1621. };
  1622. run_with_param(1, 1, 1, 1, 3, 3);
  1623. run_with_param(3, 3, 1, 1, 2, 2);
  1624. run_with_param(5, 5, 1, 1, 2, 2);
  1625. run_with_param(7, 7, 1, 1, 2, 2);
  1626. run_with_param(1, 1, 2, 2, 3, 3);
  1627. run_with_param(3, 3, 2, 2, 2, 2);
  1628. run_with_param(5, 5, 1, 1, 2, 2);
  1629. run_with_param(7, 7, 1, 1, 2, 2);
  1630. }
  1631. }
  1632. TEST(TestOprDNN, LocalShareSerialization) {
  1633. using namespace serialization;
  1634. auto fname = output_file("LocalShareForwardTest");
  1635. auto dump = [&]() {
  1636. opr::LocalShare::Param param;
  1637. param.mode = Mode::CROSS_CORRELATION;
  1638. param.stride_h = param.stride_w = 1;
  1639. param.pad_h = param.pad_w = 0;
  1640. param.spatial_groups_h = param.spatial_groups_w = 3;
  1641. auto cn = CompNode::load("cpu0");
  1642. auto graph = ComputingGraph::make();
  1643. HostTensorND inp_host{cn, {32, 4, 24, 24}, dtype::Float32()};
  1644. HostTensorND filt_host{
  1645. cn, {3, 3, 4, 1, 1, 8}, dtype::Float32()};
  1646. auto inp = opr::ImmutableTensor::make(*graph, inp_host);
  1647. auto filt = opr::ImmutableTensor::make(*graph, filt_host);
  1648. auto opr = opr::LocalShareForward::make(inp, filt, param,
  1649. {});
  1650. auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
  1651. auto rst = dumper->dump({opr});
  1652. ASSERT_EQ(rst.outputs.size(), 1u);
  1653. };
  1654. auto load = [&]() {
  1655. auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
  1656. auto rst = loader->load();
  1657. ASSERT_EQ(rst.output_var_list.size(), 1u);
  1658. };
  1659. dump();
  1660. load();
  1661. }
  1662. TEST(TestOprDNN, DeformableConvForward) {
  1663. REQUIRE_GPU(1);
  1664. using Checker = AutoOprChecker<4, 1>;
  1665. using Policy = opr::DeformableConvForward::ExecutionPolicy;
  1666. using S = Policy::Strategy;
  1667. using Param = opr::DeformableConvForward::Param;
  1668. Param param;
  1669. #if MGB_ENABLE_FASTRUN
  1670. for (auto strategy :
  1671. SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
  1672. S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTIMIZED}) {
  1673. #else
  1674. for (auto strategy :
1675. SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
  1676. #endif
  1677. auto make_graph = [&](const Checker::SymInpArray& inputs)
  1678. -> Checker::SymOutArray {
  1679. Policy policy;
  1680. policy.strategy = strategy;
  1681. return {opr::DeformableConvForward::make(
  1682. inputs[0], inputs[1], inputs[2], inputs[3], param, policy)};
  1683. };
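// reference implementation: run the naive megdnn DeformableConvForward kernel directly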
  1684. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1685. auto opr =
  1686. megdnn_naive_handle()
  1687. ->create_operator<megdnn::DeformableConvForward>();
  1688. opr->param() = param;
  1689. TensorLayout dest_layout;
  1690. opr->deduce_layout(inp[0]->layout(), inp[1]->layout(),
  1691. inp[2]->layout(), inp[3]->layout(), dest_layout);
  1692. std::vector<dt_byte> workspace(opr->get_workspace_in_bytes(
  1693. inp[0]->layout(), inp[1]->layout(), inp[2]->layout(),
  1694. inp[3]->layout(), dest_layout));
  1695. dest[0].dtype(dtype::Float32())
  1696. .comp_node(inp[0]->comp_node())
  1697. .resize(dest_layout);
  1698. opr->exec(inp[0]->as_megdnn(), inp[1]->as_megdnn(),
  1699. inp[2]->as_megdnn(), inp[3]->as_megdnn(),
  1700. dest[0].as_megdnn(),
  1701. {workspace.data(), workspace.size()});
  1702. };
  1703. auto run_with_param = [&](size_t fh, size_t fw, size_t sh, size_t sw,
  1704. size_t dh, size_t dw, size_t group,
  1705. size_t deformable_group) {
  1706. Checker checker{make_graph, fwd};
  1707. size_t ph = fh / 2, pw = fw / 2;
  1708. param.pad_h = ph, param.pad_w = pw;
  1709. param.stride_h = sh, param.stride_w = sw;
  1710. param.dilate_h = dh, param.dilate_w = dw;
  1711. param.format = Param::Format::NCHW;
  1712. param.mode = Param::Mode::CROSS_CORRELATION;
  1713. param.sparse = Param::Sparse::DENSE;
  1714. if (group > 1)
  1715. param.sparse = Param::Sparse::GROUP;
  1716. Checker::RunOptions opt;
  1717. float DELTA = 1e-3;
  1718. opt.numdiff_eps = DELTA;
  1719. opt.numdiff_max_err = 1e-1;
  1720. auto gen_off = [DELTA](HostTensorND& off, float l = -2.f, float h = 2.f) {
  1721. RNGxorshf rng{next_rand_seed()};
  1722. auto elems = off.shape().total_nr_elems();
  1723. auto ptr = off.ptr<float>();
  1724. auto rand_real = [](RNGxorshf& rng, float lo, float hi) {
  1725. std::uniform_real_distribution<float> dist(lo, hi);
  1726. return dist(rng);
  1727. };
  1728. for (size_t i = 0; i < elems; ++i) {
  1729. do {
  1730. float val = rand_real(rng, l, h);
  1731. if (abs(floor(val + 2 * DELTA) - floor(val)) <= 1e-6f &&
  1732. abs(floor(val - 2 * DELTA) - floor(val)) <= 1e-6f) {
  1733. ptr[i] = val;
  1734. break;
  1735. }
  1736. } while (true);
  1737. }
  1738. };
1739. //! generate offsets that stay away from integer values,
1740. //! because the bilinear sampling function is not differentiable there
  1741. checker.set_input_generator(2, gen_off);
  1742. checker.set_input_dtype(0, dtype::Float32());
  1743. checker.set_input_dtype(1, dtype::Float32());
  1744. checker.set_input_dtype(2, dtype::Float32());
  1745. checker.set_input_dtype(3, dtype::Float32());
  1746. auto run = [&](size_t n, size_t ih, size_t iw, size_t icpg,
  1747. size_t ocpg) {
  1748. size_t oh = (ih + 2 * ph - fh) / sh + 1;
  1749. size_t ow = (iw + 2 * pw - fw) / sw + 1;
  1750. checker.run({TensorShape{n, group * icpg, ih, iw},
  1751. (param.sparse == Param::Sparse::GROUP)
  1752. ? TensorShape{group, ocpg, icpg, fh, fw}
  1753. : TensorShape{group * ocpg, group * icpg,
  1754. fh, fw},
  1755. {n, 2 * deformable_group * fh * fw, oh, ow},
  1756. {n, deformable_group * fh * fw, oh, ow}},
  1757. opt);
  1758. };
  1759. run(1, 3, 3, 2, 1);
  1760. run(2, 3, 3, 2, 2);
  1761. run(1, 5, 5, 2, 1);
  1762. };
  1763. // run_with_param(1, 1, 1, 1, 1, 1, 1, 1);
  1764. run_with_param(3, 3, 1, 1, 1, 1, 2, 2);
  1765. // run_with_param(5, 5, 1, 1, 1, 1, 2, 2);
  1766. }
  1767. }
  1768. TEST(TestOprDNN, DeformableConvSerialization) {
  1769. using namespace serialization;
  1770. auto fname = output_file("DeformableConvTest");
  1771. auto dump = [&]() {
  1772. using Param = opr::DeformableConvForward::Param;
  1773. Param param;
  1774. size_t n = 16, ocpg = 2, icpg = 4;
  1775. size_t ih = 24, iw = 24, fh = 3, fw = 3, ph = 2, pw = 2, sh = 1, sw = 1, dh = 1, dw = 1;
1776. size_t group = 1, deformable_group = 1;
  1777. size_t oh = (ih + 2 * ph - fh) / sh + 1;
  1778. size_t ow = (iw + 2 * pw - fw) / sw + 1;
  1779. param.pad_h = ph, param.pad_w = pw;
  1780. param.stride_h = sh, param.stride_w = sw;
  1781. param.dilate_h = dh, param.dilate_w = dw;
  1782. param.format = Param::Format::NCHW;
  1783. param.mode = Param::Mode::CROSS_CORRELATION;
  1784. param.sparse = Param::Sparse::DENSE;
  1785. auto cn = CompNode::load("cpu0");
  1786. auto graph = ComputingGraph::make();
  1787. HostTensorND inp_host{cn, {n, group * icpg, ih, iw}, dtype::Float32()};
  1788. HostTensorND filt_host{
  1789. cn, {group * ocpg, group * icpg, fh, fw}, dtype::Float32()};
  1790. HostTensorND offset_host{
  1791. cn, {n, 2 * deformable_group * fh * fw, oh, ow}, dtype::Float32()};
  1792. HostTensorND mask_host{
  1793. cn, {n, deformable_group * fh * fw, oh, ow}, dtype::Float32()};
  1794. auto inp = opr::ImmutableTensor::make(*graph, inp_host);
  1795. auto filt = opr::ImmutableTensor::make(*graph, filt_host);
  1796. auto offset = opr::ImmutableTensor::make(*graph, offset_host);
  1797. auto mask = opr::ImmutableTensor::make(*graph, mask_host);
  1798. auto opr = opr::DeformableConvForward::make(inp, filt, offset, mask,
  1799. param, {}, {});
  1800. auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
  1801. auto rst = dumper->dump({opr});
  1802. ASSERT_EQ(rst.outputs.size(), 1u);
  1803. };
  1804. auto load = [&]() {
  1805. auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
  1806. auto rst = loader->load();
  1807. ASSERT_EQ(rst.output_var_list.size(), 1u);
  1808. };
  1809. dump();
  1810. load();
  1811. }
  1812. #if MGB_CUDA
  1813. TEST(TestOprDNN, BatchConvBiasForward) {
  1814. REQUIRE_GPU(1);
  1815. auto cn = CompNode::load("gpu0");
  1816. cn.activate();
  1817. REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);
  1818. using Checker = AutoOprChecker<3, 1>;
  1819. using Policy = opr::BatchConvBiasForward::ExecutionPolicy;
  1820. using S = Policy::Strategy;
  1821. using Param = opr::BatchConvBiasForward::Param;
  1822. Param param;
  1823. param.format = Param::Format::NCHW4;
  1824. param.mode = Param::Mode::CROSS_CORRELATION;
  1825. param.sparse = Param::Sparse::DENSE;
  1826. #if MGB_ENABLE_FASTRUN
  1827. for (auto strategy :
  1828. SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
  1829. S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTIMIZED}) {
  1830. #else
  1831. for (auto strategy :
1832. SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
  1833. #endif
  1834. auto make_quantized = [&](SymbolVar x, const DType& dtype) {
  1835. return opr::TypeCvt::make(x, dtype);
  1836. };
  1837. auto make_graph = [&](const Checker::SymInpArray& inputs)
  1838. -> Checker::SymOutArray {
  1839. Policy policy;
  1840. policy.strategy = strategy;
  1841. auto conv_bias = opr::BatchConvBiasForward::make(
  1842. make_quantized(inputs[0], dtype::QuantizedS8{1.1f}),
  1843. make_quantized(inputs[1], dtype::QuantizedS8{1.2f}),
  1844. make_quantized(inputs[2], dtype::QuantizedS32{1.1f * 1.2f}),
  1845. param, policy,
  1846. OperatorNodeConfig{dtype::QuantizedS8{1.3f}});
  1847. return {opr::TypeCvt::make(conv_bias, dtype::Float32())};
  1848. };
  1849. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1850. mgb_assert(inp.size() == 3);
  1851. mgb_assert(dest.size() == 1);
  1852. auto graph = ComputingGraph::make();
  1853. Checker::SymInpArray inputs;
  1854. for (size_t i = 0; i < inp.size(); ++i) {
  1855. inputs[i] = opr::Host2DeviceCopy::make(*graph, inp[i]);
  1856. }
  1857. auto src = make_quantized(inputs[0], dtype::QuantizedS8{1.1f}),
  1858. filter = make_quantized(inputs[1], dtype::QuantizedS8{1.2f}),
  1859. bias = make_quantized(inputs[2],
  1860. dtype::QuantizedS32{1.1f * 1.2f});
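// emulate batch convolution: fold the batch dimension into the channel axis, run a
// group convolution, then reshape the result back to per-sample outputs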
  1861. {
  1862. auto xshp = opr::GetVarShape::make(src);
  1863. auto cv = [&src](int v) { return src.make_scalar(v); };
  1864. auto sub = [&xshp, &cv](int idx) {
  1865. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  1866. };
  1867. auto tshp = opr::Concat::make(
  1868. {cv(1), sub(0) * sub(1), sub(2), sub(3), sub(4)}, 0);
  1869. src = opr::Reshape::make(src, tshp);
  1870. }
  1871. auto conv_param = convert_to_conv_param(param);
  1872. conv_param.sparse = opr::BatchConvBias::Param::Sparse::GROUP;
  1873. auto y = opr::Convolution::make(src, filter, conv_param);
  1874. {
  1875. auto fshp = opr::GetVarShape::make(filter);
  1876. auto batch =
  1877. opr::IndexAt::make(fshp, {{0, filter.make_scalar(0)}});
  1878. auto xshp = opr::GetVarShape::make(y);
  1879. auto cv = [&y](int v) { return y.make_scalar(v); };
  1880. auto sub = [&xshp, &cv](int idx) {
  1881. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  1882. };
  1883. auto tshp = opr::Concat::make(
  1884. {batch, sub(1) / batch, sub(2), sub(3), sub(4)}, 0);
  1885. y = opr::Reshape::make(y, tshp);
  1886. }
  1887. y = y + bias;
  1888. y = opr::TypeCvt::make(y, dtype::QuantizedS8{1.3f});
  1889. y = opr::TypeCvt::make(y, dtype::Float32());
  1890. auto func = graph->compile({make_callback_copy(y, dest[0])});
  1891. func->execute();
  1892. func->wait();
  1893. };
  1894. auto run_with_param = [&](size_t sh = 1, size_t sw = 1) {
  1895. size_t fh = 1;
  1896. size_t fw = 1;
  1897. size_t ph = fh / 2, pw = fw / 2;
  1898. param.pad_h = ph, param.pad_w = pw;
  1899. param.stride_h = sh, param.stride_w = sw;
  1900. Checker checker{make_graph, fwd, cn};
  1901. Checker::RunOptions opt;
  1902. checker.set_output_allow_grad(0, false);
  1903. checker.set_input_dtype(0, dtype::Float32());
  1904. checker.set_input_dtype(1, dtype::Float32());
  1905. checker.set_input_dtype(2, dtype::Float32());
  1906. auto run = [&](size_t n, size_t ci, size_t co, size_t hi,
  1907. size_t wi) {
  1908. checker.run({TensorShape{n, ci / 4, hi, wi, 4},
  1909. TensorShape{n, co, ci / 4, fh, fw, 4},
  1910. TensorShape{1, co / 4, 1, 1, 4}},
  1911. opt);
  1912. };
  1913. run(32, 16, 32, 24, 24);
  1914. run(16, 16, 32, 24, 24);
  1915. run(32, 16, 64, 12, 12);
  1916. run(16, 16, 64, 6, 6);
  1917. };
  1918. run_with_param(1, 1);
  1919. run_with_param(2, 2);
  1920. }
  1921. }
  1922. #endif
  1923. TEST(TestOprDNN, BatchConvBiasSerialization) {
  1924. using namespace serialization;
  1925. auto fname = output_file("BatchConvBiasForwardTest");
  1926. auto dump = [&]() {
  1927. opr::BatchConvBias::Param param;
  1928. param.mode = Mode::CROSS_CORRELATION;
  1929. param.format = opr::BatchConvBias::Param::Format::NCHW4;
  1930. param.stride_h = param.stride_w = 1;
  1931. param.pad_h = param.pad_w = 0;
  1932. auto cn = CompNode::load("cpu0");
  1933. auto graph = ComputingGraph::make();
  1934. HostTensorND inp_host{cn, {32, 1, 24, 24, 4}, dtype::QuantizedS8{1.1f}};
  1935. HostTensorND filt_host{cn, {32, 8, 1, 1, 1, 4}, dtype::QuantizedS8{1.2f}};
  1936. auto inp = opr::ImmutableTensor::make(*graph, inp_host);
  1937. auto filt = opr::ImmutableTensor::make(*graph, filt_host);
  1938. auto opr = opr::BatchConvBiasForward::make(
  1939. inp, filt, param, {},
  1940. OperatorNodeConfig{dtype::QuantizedS8{1.3f}});
  1941. auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
  1942. auto rst = dumper->dump({opr});
  1943. ASSERT_EQ(rst.outputs.size(), 1u);
  1944. };
  1945. auto load = [&]() {
  1946. auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
  1947. auto rst = loader->load();
  1948. ASSERT_EQ(rst.output_var_list.size(), 1u);
  1949. };
  1950. dump();
  1951. load();
  1952. }
  1953. TEST(TestOprDNN, HeuristicReproducible) {
  1954. using Policy = opr::ConvolutionBackwardFilter::ExecutionPolicy;
  1955. using S = Policy::Strategy;
  1956. using Checker = AutoOprChecker<3, 1>;
  1957. constexpr size_t PH = 1, PW = 1, SH = 1, SW = 1;
  1958. for (auto strategy :
  1959. SmallVector<S>{S::HEURISTIC, S::HEURISTIC | S::REPRODUCIBLE}) {
  1960. VarNode* bwd_flt;
  1961. auto make_graph = [&](const Checker::SymInpArray& inputs)
  1962. -> Checker::SymOutArray {
  1963. Param param{Mode::CROSS_CORRELATION, PH, PW, SH, SW};
  1964. Policy policy;
  1965. policy.strategy = strategy;
  1966. auto out = opr::ConvolutionBackwardFilter::make(
  1967. inputs[0], inputs[1], inputs[2], param, policy);
  1968. bwd_flt = out.node();
  1969. return {out};
  1970. };
  1971. auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
  1972. std::shared_ptr<HostTensorND> out;
  1973. conv_bwd_flt_brute({inp[0], inp[1], inp[2]}, out,
  1974. Param{Mode::CROSS_CORRELATION, PH, PW, SH, SW});
  1975. dest[0] = *out;
  1976. };
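// helpers building the (src, out-diff, filter) input shapes for ConvolutionBackwardFilter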
  1977. #define get_shp(N, P, S, F) ((N + 2 * P - F) / S + 1)
  1978. #define inp_tensor(N, IC, OC, IH, IW, FH, FW) \
  1979. { \
  1980. TensorShape{N, IC, IH, IW}, \
  1981. {N, OC, get_shp(IH, PH, SH, FH), get_shp(IW, PW, SW, FW)}, { \
  1982. OC, IC, FH, FW \
  1983. } \
  1984. }
  1985. Checker::RunOptions opt;
  1986. opt.numdiff_eps = 1;
  1987. opt.outputs_max_err = 1e-3;
  1988. std::string algo_name0, algo_name1;
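// run the same shapes through two independent checkers; the heuristic must pick the
// same algorithm both times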
  1989. {
  1990. Checker checker(make_graph, fwd);
  1991. checker.run(inp_tensor(2, 3, 4, 9, 8, 3, 3), opt)
  1992. .run(inp_tensor(1, 5, 3, 7, 9, 3, 3), opt)
  1993. .run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt);
  1994. auto&& megdnn_opr = static_cast<megdnn::ConvolutionBackwardFilter*>(
  1995. static_cast<opr::ConvolutionBackwardFilter*>(
  1996. bwd_flt->owner_opr())
  1997. ->megdnn_opr());
  1998. auto&& algo = megdnn_opr->execution_policy().algo;
  1999. megdnn::Algorithm* palgo =
  2000. megdnn_opr->get_algorithm_from_desc(algo);
  2001. mgb_assert(palgo, "Unknown algo description");
  2002. if (strategy == S(S::HEURISTIC | S::REPRODUCIBLE)) {
  2003. EXPECT_TRUE(palgo->contain_attribute_all(
  2004. megdnn::AlgoAttribute::REPRODUCIBLE));
  2005. }
  2006. algo_name0 = palgo->name();
  2007. }
  2008. {
  2009. Checker checker(make_graph, fwd);
  2010. checker.run(inp_tensor(2, 3, 4, 9, 8, 3, 3), opt)
  2011. .run(inp_tensor(1, 5, 3, 7, 9, 3, 3), opt)
  2012. .run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt);
  2013. auto&& megdnn_opr = static_cast<megdnn::ConvolutionBackwardFilter*>(
  2014. static_cast<opr::ConvolutionBackwardFilter*>(
  2015. bwd_flt->owner_opr())
  2016. ->megdnn_opr());
  2017. auto&& algo = megdnn_opr->execution_policy().algo;
  2018. megdnn::Algorithm* palgo =
  2019. megdnn_opr->get_algorithm_from_desc(algo);
  2020. mgb_assert(palgo, "Unknown algo description");
  2021. algo_name1 = palgo->name();
  2022. }
2023. EXPECT_EQ(algo_name0, algo_name1);
  2024. }
  2025. #undef inp_tensor
  2026. #undef get_shp
  2027. }
  2028. #if MGB_CUDA
  2029. TEST(TestOprDNN, ConvolutionMultiCompNode) {
  2030. REQUIRE_GPU(1);
  2031. auto cn0 = CompNode::load("gpu0:0"), cn1 = CompNode::load("gpu0:1");
  2032. cn0.activate();
  2033. auto&& prop = CompNodeEnv::from_comp_node(cn0).cuda_env().device_prop;
  2034. auto sm_ver = prop.major * 10 + prop.minor;
  2035. if (sm_ver < 61) {
  2036. printf("This testcast ignored due to insufficient cuda cap(got: %d, "
  2037. "expected: %d)\n",
  2038. sm_ver, 61);
  2039. return;
  2040. }
  2041. HostTensorGenerator<dtype::Int8> gen;
  2042. auto mkvar = [&gen](const char* name, const TensorShape& shp,
  2043. const DType& dtype,
  2044. std::shared_ptr<ComputingGraph> graph,
  2045. const CompNode& cn) {
  2046. return opr::TypeCvt::make(
  2047. opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
  2048. dtype);
  2049. };
  2050. auto mkcvar = [&gen](const char* name, const TensorShape& shp,
  2051. const DType& dtype,
  2052. std::shared_ptr<ComputingGraph> graph,
  2053. const CompNode& cn) {
  2054. return opr::TypeCvt::make(
  2055. opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  2056. .rename(name),
  2057. dtype);
  2058. };
  2059. auto graph0 = ComputingGraph::make();
  2060. graph0->options().graph_opt_level = 0;
  2061. auto graph1 = ComputingGraph::make();
  2062. graph1->options().graph_opt_level = 0;
  2063. auto make_func = [&gen, &mkvar, &mkcvar](
  2064. std::shared_ptr<ComputingGraph> graph,
  2065. const CompNode& cn) {
  2066. using Policy = opr::ConvBias::ExecutionPolicy;
  2067. using S = Policy::Strategy;
  2068. auto x = mkvar("x", {64, 32, 28, 28, 4}, dtype::QuantizedS8(2.5f),
  2069. graph, cn),
  2070. w1 = mkcvar("w1", {256, 32, 5, 5, 4}, dtype::QuantizedS8(2.5f),
  2071. graph, cn),
  2072. b1 = mkcvar("b1", {1, 64, 1, 1, 4}, dtype::QuantizedS32(6.25f),
  2073. graph, cn),
  2074. w2 = mkcvar("w2", {256, 64, 3, 3, 4}, dtype::QuantizedS8(2.5f),
  2075. graph, cn),
  2076. b2 = mkcvar("b2", {1, 64, 1, 1, 4}, dtype::QuantizedS32(6.25f),
  2077. graph, cn);
  2078. opr::ConvBias::Param param;
  2079. param.format = opr::ConvBias::Param::Format::NCHW4;
  2080. param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
  2081. param.stride_h = param.stride_w = 2;
  2082. param.pad_h = param.pad_w = 2;
  2083. Policy policy;
  2084. policy.strategy = S::PROFILE;
  2085. auto y = opr::ConvBias::make(
  2086. x, w1, b1, param, policy,
  2087. OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  2088. param.stride_h = param.stride_w = 1;
  2089. param.pad_h = param.pad_w = 1;
  2090. y = opr::ConvBias::make(y, w2, b2, param, policy,
  2091. OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  2092. return y;
  2093. };
  2094. auto y0 = make_func(graph0, cn0);
  2095. auto y1 = make_func(graph1, cn1);
  2096. HostTensorND host_y0, host_y1;
  2097. auto func0 = graph0->compile({make_callback_copy(y0, host_y0)});
  2098. auto func1 = graph1->compile({make_callback_copy(y1, host_y1)});
  2099. auto worker = [&func0, &func1](int wid) {
  2100. static const int iter_num = 1000;
  2101. if (wid == 0) {
  2102. for (int i = 0; i < iter_num; ++i)
  2103. func0->execute();
  2104. } else {
  2105. for (int i = 0; i < iter_num; ++i)
  2106. func1->execute();
  2107. }
  2108. };
  2109. std::thread worker0(worker, 0);
  2110. std::thread worker1(worker, 1);
  2111. worker0.join();
  2112. worker1.join();
  2113. }
  2114. #endif
  2115. } // anonymous namespace
  2116. #ifndef _WIN32
  2117. namespace mgb {
  2118. namespace opr {
  2119. namespace testing {
  2120. class ConvolutionTestingPeer {
  2121. opr::ConvolutionForward& m_conv_opr;
  2122. public:
  2123. explicit ConvolutionTestingPeer(cg::OperatorNodeBase* opr)
  2124. : m_conv_opr(opr->cast_final_safe<opr::ConvolutionForward>()) {}
  2125. void set_megdnn_opr(
  2126. std::unique_ptr<megdnn::ConvolutionForward> megdnn_opr) {
  2127. m_conv_opr.set_megdnn_opr(std::move(megdnn_opr));
  2128. }
  2129. };
  2130. } // namespace testing
  2131. } // namespace opr
  2132. } // namespace mgb
  2133. namespace {
  2134. using megdnn::TensorND;
  2135. using megdnn::Workspace;
  2136. using opr::testing::ConvolutionTestingPeer;
  2137. class MockConvolutionForward : public megdnn::ConvolutionForward {
  2138. const char* m_algorithm_set_name;
  2139. public:
  2140. MockConvolutionForward(megdnn::ConvolutionForward* orig,
  2141. const char* algo_set_name)
  2142. : megdnn::ConvolutionForward(orig->handle()),
  2143. m_algorithm_set_name(algo_set_name) {}
  2144. MOCK_METHOD5(exec, void(_megdnn_tensor_in src, _megdnn_tensor_in filter,
  2145. _megdnn_tensor_out dst,
  2146. const PreprocessedFilter* preprocessed_filter,
  2147. _megdnn_workspace workspace));
  2148. MOCK_METHOD5(exec_preprocess,
  2149. void(const TensorLayout& src_layout, _megdnn_tensor_in filter,
  2150. const TensorLayout& dst_layout,
  2151. PreprocessedFilter* preprocessed_filter,
  2152. _megdnn_workspace workspace));
  2153. MOCK_METHOD4(get_workspace_in_bytes,
  2154. size_t(const TensorLayout& src, const TensorLayout& filter,
  2155. const TensorLayout& dst,
  2156. const PreprocessedFilter* preprocessed_filter));
  2157. MOCK_METHOD3(deduce_preprocessed_filter_layout,
  2158. SmallVector<TensorLayout>(const TensorLayout& src,
  2159. const TensorLayout& filter,
  2160. const TensorLayout& dst));
  2161. MOCK_METHOD3(get_preprocess_workspace_in_bytes,
  2162. size_t(const TensorLayout& src, const TensorLayout& filter,
  2163. const TensorLayout& dst));
  2164. MOCK_METHOD3(get_all_algorithms_info,
  2165. std::vector<AlgorithmInfo>(const TensorLayout& p0,
  2166. const TensorLayout& p1,
  2167. const TensorLayout& p2));
  2168. MOCK_METHOD6(get_algorithm_info_heuristic,
  2169. AlgorithmInfo(const TensorLayout& p0, const TensorLayout& p1,
  2170. const TensorLayout& p2,
  2171. size_t workspace_limit_in_bytes,
  2172. const AlgoAttribute& positive_attr,
  2173. const AlgoAttribute& negative_attr));
  2174. MOCK_METHOD3(get_all_algorithms,
  2175. std::vector<Algorithm*>(const TensorLayout& p0,
  2176. const TensorLayout& p1,
  2177. const TensorLayout& p2));
  2178. MOCK_METHOD6(get_algorithm_heuristic,
  2179. Algorithm*(const TensorLayout& p0, const TensorLayout& p1,
  2180. const TensorLayout& p2,
  2181. size_t workspace_limit_in_bytes,
  2182. const AlgoAttribute& positive_attr,
  2183. const AlgoAttribute& negative_attr));
  2184. MOCK_METHOD1(get_algorithm_from_desc,
  2185. Algorithm*(const AlgorithmDesc&));
  2186. protected:
  2187. const char* get_algorithm_set_name() const override {
  2188. return m_algorithm_set_name;
  2189. }
  2190. };
  2191. class MockAlgorithm : public megdnn::detail::Algorithm {
  2192. const char* m_name;
  2193. public:
  2194. MockAlgorithm(const char* name = "NotImportant") : m_name(name) {}
  2195. Attribute attribute() const override {
  2196. return Attribute::REPRODUCIBLE;
  2197. }
  2198. const char* name() const override { return m_name; }
  2199. uint32_t type() const override {
  2200. return megdnn::detail::Algorithm::INVALID_ALGO_TYPE;
  2201. }
  2202. virtual ~MockAlgorithm() = default;
  2203. };
  2204. class TestWeightPreprocess : public ::testing::Test {
  2205. protected:
  2206. CompNode comp_node;
  2207. std::shared_ptr<ComputingGraph> graph;
  2208. std::shared_ptr<HostTensorND> x_host;
  2209. MockConvolutionForward* mock_conv_ptr;
  2210. SymbolVar y;
  2211. HostTensorND y_host;
  2212. std::unique_ptr<cg::AsyncExecutable> func;
  2213. MockConvolutionForward& mock_conv() { return *mock_conv_ptr; }
  2214. void SetUp() override {
  2215. constexpr uint32_t ih = 10, ic = 16, oc = 32, ph = 0, sh = 1, fh = 2,
  2216. iw = ih;
  2217. comp_node = CompNode::load("cpux");
  2218. graph = ComputingGraph::make();
  2219. graph->options().graph_opt.weight_preprocess = is_weight_preprocess();
  2220. TensorShape x_shape{1, ic, ih, iw}, w_shape{oc, ic, fh, fh};
  2221. x_host = std::make_shared<HostTensorND>(comp_node, x_shape);
  2222. auto x = opr::Host2DeviceCopy::make(*graph, x_host);
  2223. auto w = opr::ImmutableTensor::make(*graph, {comp_node, w_shape});
  2224. Param param;
  2225. param.pad_h = param.pad_w = ph;
  2226. param.stride_h = param.stride_w = sh;
  2227. param.format = Param::Format::NCHW;
  2228. y = opr::ConvolutionForward::make(x, w, param);
  2229. auto& opr =
  2230. y.node()->owner_opr()->cast_final<opr::ConvolutionForward>();
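// swap in a mock megdnn operator (named after the current test) so preprocess/exec
// calls can be observed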
  2231. auto mock = std::make_unique<MockConvolutionForward>(
  2232. opr.megdnn_opr(), ::testing::UnitTest::GetInstance()
  2233. ->current_test_info()
  2234. ->name());
  2235. mock_conv_ptr = mock.get();
  2236. ConvolutionTestingPeer{&opr}.set_megdnn_opr(std::move(mock));
  2237. func = graph->compile({make_callback_copy(y, y_host)});
  2238. }
  2239. void run() { func->execute().wait(); }
  2240. virtual bool is_weight_preprocess() { return true; }
  2241. void TearDown() override {
  2242. func.reset();
  2243. // Triggers mock check
  2244. graph.reset();
  2245. x_host.reset();
  2246. }
  2247. };
  2248. TEST_F(TestWeightPreprocess, NoPreprocessNeeded) {
  2249. using ::testing::_;
  2250. using ::testing::Return;
  2251. auto& mock = mock_conv();
  2252. MockAlgorithm algo;
  2253. EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _, _))
  2254. .WillRepeatedly(Return(&algo));
  2255. EXPECT_CALL(mock, get_algorithm_from_desc(_))
  2256. .WillRepeatedly(Return(&algo));
  2257. EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _))
  2258. .WillRepeatedly(Return(0));
  2259. EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
  2260. .WillRepeatedly(Return(0));
  2261. {
  2262. ::testing::InSequence seq;
  2263. // Return empty preprocess filters, indicating no need to preprocess
  2264. EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _))
  2265. .WillRepeatedly(Return(SmallVector<TensorLayout>{}));
  2266. EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _)).Times(0);
  2267. EXPECT_CALL(mock, exec(_, _, _, nullptr, _));
  2268. run();
  2269. }
  2270. }
  2271. TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) {
  2272. using ::testing::_;
  2273. using ::testing::Return;
  2274. using ::testing::Field;
  2275. using ::testing::Invoke;
  2276. using ::testing::Expectation;
  2277. using PF = MockConvolutionForward::PreprocessedFilter;
  2278. auto& mock = mock_conv();
  2279. MockAlgorithm algo;
  2280. SmallVector<TensorLayout> filter_layout{{{1, 2, 3, 4}, dtype::Float32()},
  2281. {{5, 6, 7, 8}, dtype::Float32()}};
  2282. EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _))
  2283. .WillRepeatedly(Return(filter_layout));
  2284. EXPECT_CALL(mock, get_algorithm_from_desc(_))
  2285. .WillRepeatedly(Return(&algo));
  2286. Expectation algo_call =
  2287. EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _, _))
  2288. .WillOnce(Return(&algo));
  2289. Expectation ws_call = EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _))
  2290. .After(algo_call)
  2291. .WillOnce(Return(0));
  2292. Expectation pre_ws_call =
  2293. EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
  2294. .After(algo_call)
  2295. .WillOnce(Return(233));
  2296. {
  2297. ::testing::InSequence seq;
  2298. // exec_preprocess should be called only once, with workspace allocated
  2299. int salt = 0;
  2300. EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _))
  2301. .After(ws_call, pre_ws_call)
  2302. .WillOnce(Invoke([&](const TensorLayout&, _megdnn_tensor_in,
  2303. const TensorLayout&, PF* pf,
  2304. _megdnn_workspace workspace) {
  2305. ASSERT_EQ(workspace.size, 233);
  2306. ASSERT_NE(pf, nullptr);
  2307. pf->algorithm_id = &salt;
  2308. ASSERT_EQ(pf->tensors.size(), 2);
  2309. ASSERT_TRUE(pf->tensors[0].layout.eq_shape({1, 2, 3, 4}));
  2310. ASSERT_TRUE(pf->tensors[1].layout.eq_shape({5, 6, 7, 8}));
  2311. ASSERT_NE(pf->tensors[0].raw_ptr, nullptr);
  2312. ASSERT_NE(pf->tensors[1].raw_ptr, nullptr);
  2313. pf->tensors[0].ptr<float>()[0] = 114.514f;
  2314. pf->tensors[1].ptr<float>()[0] = 1926.0817f;
  2315. }));
  2316. // Run the graph multiple times.
  2317. for (int i = 0; i < 3; i++) {
  2318. if (i > 0) {
  2319. EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _)).Times(0);
  2320. }
  2321. EXPECT_CALL(mock, exec(_, _, _, _, _))
  2322. .WillOnce(Invoke([&](_megdnn_tensor_in, _megdnn_tensor_in,
  2323. _megdnn_tensor_out, const PF* pf,
  2324. _megdnn_workspace) {
  2325. ASSERT_NE(pf, nullptr);
  2326. ASSERT_EQ(pf->algorithm_id, &salt);
  2327. ASSERT_EQ(pf->tensors[0].ptr<float>()[0], 114.514f);
  2328. ASSERT_EQ(pf->tensors[1].ptr<float>()[0], 1926.0817f);
  2329. }));
  2330. run();
  2331. }
  2332. }
  2333. }
  2334. class TestNoWeightPreprocess : public TestWeightPreprocess {
  2335. bool is_weight_preprocess() override { return false; }
  2336. };
  2337. TEST_F(TestNoWeightPreprocess, NoPreprocess) {
  2338. using ::testing::_;
  2339. using ::testing::Return;
  2340. auto& mock = mock_conv();
  2341. MockAlgorithm algo;
  2342. EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _, _))
  2343. .WillRepeatedly(Return(&algo));
  2344. EXPECT_CALL(mock, get_algorithm_from_desc(_))
  2345. .WillRepeatedly(Return(&algo));
  2346. EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _))
  2347. .WillRepeatedly(Return(0));
  2348. EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
  2349. .WillRepeatedly(Return(0));
  2350. {
  2351. ::testing::InSequence seq;
2352. // weight preprocessing is disabled, so layout deduction must never be called
  2353. EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _)).Times(0);
  2354. EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _)).Times(0);
  2355. EXPECT_CALL(mock, exec(_, _, _, nullptr, _));
  2356. run();
  2357. }
  2358. }
  2359. } // anonymous namespace
  2360. #endif
  2361. namespace {
  2362. TEST(TestOprDNN, ConvBiasInt4NCHW) {
  2363. REQUIRE_GPU(1);
  2364. auto cn = CompNode::load("gpu0");
  2365. cn.activate();
  2366. auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
  2367. auto sm_ver = prop.major * 10 + prop.minor;
  2368. if (sm_ver != 75) {
  2369. printf("This testcast ignored due to insufficient cuda cap(got: %d, "
  2370. "expected: %d)\n",
  2371. sm_ver, 75);
  2372. return;
  2373. }
  2374. auto run = [&cn](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S,
  2375. size_t P) {
  2376. auto graph = ComputingGraph::make();
  2377. HostTensorGenerator<dtype::Int8> gen;
  2378. auto mkvar = [&gen](const char* name, const TensorShape& shp,
  2379. const DType& dtype,
  2380. std::shared_ptr<ComputingGraph> graph,
  2381. const CompNode& cn) {
  2382. return opr::TypeCvt::make(
  2383. opr::Host2DeviceCopy::make(*graph, gen(shp, cn))
  2384. .rename(name),
  2385. dtype);
  2386. };
  2387. auto mkcvar = [&gen](const char* name, const TensorShape& shp,
  2388. const DType& dtype,
  2389. std::shared_ptr<ComputingGraph> graph,
  2390. const CompNode& cn) {
  2391. return opr::TypeCvt::make(
  2392. opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  2393. .rename(name),
  2394. dtype);
  2395. };
  2396. using Policy = opr::ConvBias::ExecutionPolicy;
  2397. using Strategy = Policy::Strategy;
  2398. auto x = mkvar("x", {N, C * 4, H, W}, dtype::QuantizedS4(1.19960327f),
  2399. graph, cn),
  2400. w = mkcvar("w1", {C, C * 4, F, F}, dtype::QuantizedS4(1.19970327f),
  2401. graph, cn),
  2402. b = mkcvar("b1", {1, C, 1, 1},
  2403. dtype::QuantizedS32(1.19960327f * 1.19970327f), graph,
  2404. cn);
  2405. opr::ConvBias::Param param;
  2406. param.format = opr::ConvBias::Param::Format::NCHW;
  2407. param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
  2408. param.stride_h = param.stride_w = S;
  2409. param.pad_h = param.pad_w = P;
  2410. Policy policy;
  2411. policy.strategy = Strategy::PROFILE;
  2412. auto y = opr::ConvBias::make(
  2413. x, w, b, param, policy,
  2414. OperatorNodeConfig{dtype::QuantizedS4(11.9960501f)});
  2415. y = opr::TypeCvt::make(y, dtype::Float32());
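// float reference: run the same conv-bias in fp32, then round-trip through
// QuantizedS4 to mimic the quantized output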
  2416. auto x_f32 = opr::TypeCvt::make(x, dtype::Float32()),
  2417. w_f32 = opr::TypeCvt::make(w, dtype::Float32()),
  2418. b_f32 = opr::TypeCvt::make(b, dtype::Float32());
  2419. auto y_f32 = opr::ConvBias::make(x_f32, w_f32, b_f32, param, policy);
  2420. auto y_q4 = opr::TypeCvt::make(y_f32, dtype::QuantizedS4{11.9960501f});
  2421. y_q4 = opr::TypeCvt::make(y_q4, dtype::Float32());
  2422. HostTensorND host_y, host_y_q4;
  2423. auto func = graph->compile({make_callback_copy(y, host_y),
  2424. make_callback_copy(y_q4, host_y_q4)});
  2425. func->execute();
  2426. MGB_ASSERT_TENSOR_NEAR(host_y, host_y_q4, 1e-3);
  2427. };
  2428. run(2, 64, 14, 14, 3, 2, 1);
  2429. run(2, 64, 7, 7, 3, 1, 1);
  2430. run(2, 64, 14, 14, 1, 2, 0);
  2431. run(2, 64, 7, 7, 1, 1, 0);
  2432. }
  2433. TEST(TestOprDNN, ConvBiasInt4NCHW64) {
  2434. REQUIRE_GPU(1);
  2435. auto cn = CompNode::load("gpu0");
  2436. cn.activate();
  2437. auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
  2438. auto sm_ver = prop.major * 10 + prop.minor;
  2439. if (sm_ver != 75) {
  2440. printf("This testcast ignored due to insufficient cuda cap(got: %d, "
  2441. "expected: %d)\n",
  2442. sm_ver, 75);
  2443. return;
  2444. }
  2445. auto nchw2nchw64 = [](SymbolVar x) {
  2446. auto y = opr::RelayoutFormat::make(
  2447. x, opr::RelayoutFormat::Param::Mode::NCHW_NCHW64);
  2448. return y;
  2449. };
  2450. auto nchw642nchw = [](SymbolVar x) {
  2451. auto y = opr::RelayoutFormat::make(
  2452. x, opr::RelayoutFormat::Param::Mode::NCHW64_NCHW);
  2453. return y;
  2454. };
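// compare the NCHW64 int4 conv-bias against an fp32 NCHW reference built from
// relayouted tensors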
  2455. auto run = [&](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S,
  2456. size_t P) {
  2457. auto graph = ComputingGraph::make();
  2458. HostTensorGenerator<dtype::Int8> gen;
  2459. auto mkvar = [&gen](const char* name, const TensorShape& shp,
  2460. const DType& dtype,
  2461. std::shared_ptr<ComputingGraph> graph,
  2462. const CompNode& cn) {
  2463. return opr::TypeCvt::make(
  2464. opr::Host2DeviceCopy::make(*graph, gen(shp, cn))
  2465. .rename(name),
  2466. dtype);
  2467. };
  2468. auto mkcvar = [&gen](const char* name, const TensorShape& shp,
  2469. const DType& dtype,
  2470. std::shared_ptr<ComputingGraph> graph,
  2471. const CompNode& cn) {
  2472. return opr::TypeCvt::make(
  2473. opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  2474. .rename(name),
  2475. dtype);
  2476. };
  2477. using Policy = opr::ConvBias::ExecutionPolicy;
  2478. using Strategy = Policy::Strategy;
  2479. auto x = mkvar("x", {N, C / 16, H, W, 64},
  2480. dtype::QuantizedS4(1.19960327f), graph, cn),
  2481. w = mkcvar("w1", {C, C / 16, F, F, 64},
  2482. dtype::QuantizedS4(1.19970327f), graph, cn),
  2483. b = mkcvar("b1", {1, C / 64, 1, 1, 64},
  2484. dtype::QuantizedS32(1.19960327f * 1.19970327f), graph,
  2485. cn);
  2486. opr::ConvBias::Param param;
  2487. param.format = opr::ConvBias::Param::Format::NCHW64;
  2488. param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
  2489. param.stride_h = param.stride_w = S;
  2490. param.pad_h = param.pad_w = P;
  2491. Policy policy;
  2492. policy.strategy = Strategy::PROFILE;
  2493. auto y = opr::ConvBias::make(
  2494. x, w, b, param, policy,
  2495. OperatorNodeConfig{dtype::QuantizedS4(11.9960501f)});
  2496. y = opr::TypeCvt::make(y, dtype::Float32());
  2497. x = nchw642nchw(x);
  2498. w = nchw642nchw(w);
  2499. b = nchw642nchw(b);
  2500. auto x_f32 = opr::TypeCvt::make(x, dtype::Float32()),
  2501. w_f32 = opr::TypeCvt::make(w, dtype::Float32()),
  2502. b_f32 = opr::TypeCvt::make(b, dtype::Float32());
  2503. param.format = opr::ConvBias::Param::Format::NCHW;
  2504. auto y_f32 = opr::ConvBias::make(x_f32, w_f32, b_f32, param, policy);
  2505. auto y_q4 = opr::TypeCvt::make(y_f32, dtype::QuantizedS4{11.9960501f});
  2506. y_q4 = opr::TypeCvt::make(y_q4, dtype::Float32());
  2507. y_q4 = nchw2nchw64(y_q4);
  2508. HostTensorND host_y, host_y_q4;
  2509. auto func = graph->compile({make_callback_copy(y, host_y),
  2510. make_callback_copy(y_q4, host_y_q4)});
  2511. func->execute();
  2512. MGB_ASSERT_TENSOR_NEAR(host_y, host_y_q4, 1e-3);
  2513. };
  2514. run(2, 64, 14, 14, 3, 2, 1);
  2515. run(2, 64, 7, 7, 3, 1, 1);
  2516. run(2, 64, 14, 14, 1, 2, 0);
  2517. run(2, 64, 7, 7, 1, 1, 0);
  2518. }
TEST(TestOprDNN, ConvBiasInt4Serialize) {
    using namespace serialization;
    float inp_scale = 1.20210327f;
    float filt_scale = 1.20210406f;
    float bias_scale = inp_scale * filt_scale;
    DType output_dtype = dtype::QuantizedS4{inp_scale};
    HostTensorGenerator<dtype::Int8> gen;
    std::shared_ptr<HostTensorND> xv;
    auto mkvar = [&gen](const char* name, const DType& dtype,
                        std::shared_ptr<ComputingGraph> graph,
                        std::shared_ptr<HostTensorND> val) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, val).rename(name), dtype);
    };
    auto mkcvar =
            [&gen](const char* name, const TensorShape& shp, const DType& dtype,
                   std::shared_ptr<ComputingGraph> graph, const CompNode& cn) {
                return opr::TypeCvt::make(
                        opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                                .rename(name),
                        dtype);
            };
    auto fname = output_file("ConvBiasInt4Serialize");
    HostTensorND y1, y2;
    auto dump = [&]() {
        opr::ConvBias::Param param;
        param.mode = Mode::CONVOLUTION;
        auto cn = CompNode::load("cpu0");
        auto graph = ComputingGraph::make();
        xv = gen({1, 64, 56, 56}, cn);
        auto x = mkvar("x", dtype::QuantizedS4{inp_scale}, graph, xv);
        auto w = mkcvar("w", {256, 64, 1, 1}, dtype::QuantizedS4{filt_scale},
                        graph, cn);
        auto b = mkcvar("b", {1, 256, 1, 1}, dtype::QuantizedS32{bias_scale},
                        graph, cn);
        auto y = opr::ConvBiasForward::make(x, w, b, param, {},
                                            OperatorNodeConfig{output_dtype});
        auto w1 = mkcvar("w1", {64, 256, 1, 1}, dtype::QuantizedS4{filt_scale},
                         graph, cn);
        auto b1 = mkcvar("b1", {1, 64, 1, 1}, dtype::QuantizedS32{bias_scale},
                         graph, cn);
        y = opr::ConvBiasForward::make(y, w1, b1, param, {},
                                       OperatorNodeConfig{output_dtype});
        y = opr::TypeCvt::make(y, dtype::Float32());
        auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
        auto func = graph->compile({make_callback_copy(y, y1)});
        func->execute();
        func->wait();
        auto rst = dumper->dump({y});
        ASSERT_EQ(rst.outputs.size(), 1u);
    };
    auto load = [&]() {
        auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
        auto rst = loader->load();
        for (const auto& t : rst.tensor_map) {
            t.second->copy_from(*xv).sync();
        }
        auto func = rst.graph->compile(
                {make_callback_copy(rst.output_var_list[0], y2)});
        func->execute();
        func->wait();
        ASSERT_EQ(rst.output_var_list.size(), 1u);
        EXPECT_EQ(rst.output_var_list[0].dtype(), dtype::Float32());
    };
    dump();
    load();
    MGB_ASSERT_TENSOR_NEAR(y1, y2, 1e-3);
}
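
// Same dump/load round trip as above, but gopt::ParamFusePass is applied
// before dumping so that constant parameter expressions are folded into
// precomputed tensors in the serialized graph.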
TEST(TestOprDNN, ConvBiasInt4SerializeWithParamFuse) {
    using namespace serialization;
    float inp_scale = 1.20210327f;
    float filt_scale = 1.20210406f;
    float bias_scale = inp_scale * filt_scale;
    DType output_dtype = dtype::QuantizedS4{inp_scale};
    HostTensorGenerator<dtype::Int8> gen;
    std::shared_ptr<HostTensorND> xv;
    auto mkvar = [&gen](const char* name, const DType& dtype,
                        std::shared_ptr<ComputingGraph> graph,
                        std::shared_ptr<HostTensorND> val) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, val).rename(name), dtype);
    };
    auto mkcvar =
            [&gen](const char* name, const TensorShape& shp, const DType& dtype,
                   std::shared_ptr<ComputingGraph> graph, const CompNode& cn) {
                return opr::TypeCvt::make(
                        opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                                .rename(name),
                        dtype);
            };
    auto fname = output_file("ConvBiasInt4SerializeWithParamFuse");
    HostTensorND y1, y2;
    auto dump = [&]() {
        opr::ConvBias::Param param;
        param.mode = Mode::CONVOLUTION;
        auto cn = CompNode::load("cpu0");
        auto graph = ComputingGraph::make();
        xv = gen({1, 64, 56, 56}, cn);
        auto x = mkvar("x", dtype::QuantizedS4{inp_scale}, graph, xv);
        auto w = mkcvar("w", {256, 64, 1, 1}, dtype::QuantizedS4{filt_scale},
                        graph, cn);
        auto b = mkcvar("b", {1, 256, 1, 1}, dtype::QuantizedS32{bias_scale},
                        graph, cn);
        auto y = opr::ConvBiasForward::make(x, w, b, param, {},
                                            OperatorNodeConfig{output_dtype});
        auto w1 = mkcvar("w1", {64, 256, 1, 1}, dtype::QuantizedS4{filt_scale},
                         graph, cn);
        auto b1 = mkcvar("b1", {1, 64, 1, 1}, dtype::QuantizedS32{bias_scale},
                         graph, cn);
        y = opr::ConvBiasForward::make(y, w1, b1, param, {},
                                       OperatorNodeConfig{output_dtype});
        y = opr::TypeCvt::make(y, dtype::Float32());
        SymbolVar y_param_fused;
        unpack_vector(gopt::GraphOptimizer{}
                              .add_pass<gopt::ParamFusePass>()
                              .apply({{y}})
                              .endpoint_vars(),
                      y_param_fused);
        auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
        auto func = graph->compile({make_callback_copy(y_param_fused, y1)});
        func->execute();
        func->wait();
        auto rst = dumper->dump({y_param_fused});
        ASSERT_EQ(rst.outputs.size(), 1u);
    };
    auto load = [&]() {
        auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
        auto rst = loader->load();
        for (const auto& t : rst.tensor_map) {
            t.second->copy_from(*xv).sync();
        }
        auto func = rst.graph->compile(
                {make_callback_copy(rst.output_var_list[0], y2)});
        func->execute();
        func->wait();
        ASSERT_EQ(rst.output_var_list.size(), 1u);
        EXPECT_EQ(rst.output_var_list[0].dtype(), dtype::Float32());
    };
    dump();
    load();
    MGB_ASSERT_TENSOR_NEAR(y1, y2, 1e-3);
}
}  // namespace

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
