

/**
 * \file src/gopt/test/inference.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "megbrain/opr/dnn/local.h"
#include "megbrain/test/helper.h"

#include "megbrain/gopt/basic_arith.h"
#include "megbrain/gopt/gtrans.h"
#include "megbrain/gopt/inference.h"
#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/blas.h"
#include "megbrain/opr/dnn/batch_norm.h"
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/imgproc.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/nn_int.h"
#include "megbrain/opr/tensor_gen.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/opr/utility.h"

#include "./helper.h"
#include "megbrain/comp_node_env.h"
#include "megdnn/tensor_format.h"

#include <random>

using namespace mgb;
namespace {
//! find the first operator of a specific type; raise an exception if not found
template <typename T>
T& find_opr(SymbolVar endpoint) {
    T* found = nullptr;
    auto cb = [&found](cg::OperatorNodeBase* opr) {
        if (!found && opr->same_type<T>()) {
            found = &opr->cast_final_safe<T>();
        }
    };
    cg::DepOprIter{cb}.add(endpoint.node()->owner_opr());
    mgb_assert(found, "not found opr from %s", endpoint.node()->name().c_str());
    return *found;
}
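
//! like find_opr above, but additionally requires the operator's name to
//! match \p node_name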
template <typename T>
T& find_opr(SymbolVar endpoint, const std::string& node_name) {
    T* found = nullptr;
    auto cb = [&found, &node_name](cg::OperatorNodeBase* opr) {
        if (!found && opr->same_type<T>() && opr->name() == node_name) {
            found = &opr->cast_final_safe<T>();
        }
    };
    cg::DepOprIter{cb}.add(endpoint.node()->owner_opr());
    mgb_assert(found, "not found opr %s from %s", node_name.c_str(),
               endpoint.node()->name().c_str());
    return *found;
}
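
//! count the operators of a specific type in the dependency chain of
//! \p endpoint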
template <typename T>
size_t find_opr_num(SymbolVar endpoint) {
    size_t opr_num = 0;
    auto cb = [&opr_num](cg::OperatorNodeBase* opr) {
        if (opr->same_type<T>()) {
            opr_num++;
        }
    };
    cg::DepOprIter{cb}.add(endpoint.node()->owner_opr());
    return opr_num;
}
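
//! RAII scope that switches MegDNN to the naive (debug level 2) handle and
//! restores the original debug level on destruction; CompNode::finalize() is
//! called on both transitions so that cached handles are rebuilt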
class NaiveMegDNNHandleScope {
    int m_orig_level;

public:
    NaiveMegDNNHandleScope()
            : m_orig_level{MegDNNHandle::exchange_default_dbg_level(2)} {
        CompNode::finalize();
    }
    ~NaiveMegDNNHandleScope() {
        auto set = MegDNNHandle::exchange_default_dbg_level(m_orig_level);
        mgb_assert(set == 2);
        CompNode::finalize();
    }
};

#if MGB_CUDA
//! this function is only used in TestGoptInference.EnableCHWN4...
void warp_perspective_mat_gen(HostTensorND& mat, size_t N, size_t INP_H,
                              size_t INP_W) {
    static std::mt19937 rng(next_rand_seed());
    auto rand_real = [&](double lo, double hi) {
        return rng() / (std::mt19937::max() + 1.0) * (hi - lo) + lo;
    };
    auto rand_real2 = [&](double range) { return rand_real(-range, range); };
    auto ptr = mat.ptr<float>();
    for (size_t i = 0; i < N; ++i) {
        auto rot = rand_real(0, M_PI * 2), scale = rand_real(0.8, 1.2),
             sheer = rand_real(0.9, 1.1), dy = rand_real2(INP_H * 0.5),
             dx = rand_real2(INP_W * 0.5), ky = rand_real2(0.1 / INP_H),
             kx = rand_real2(0.1 / INP_W), kb = rand_real2(0.1) + 1;
        ptr[0] = ptr[4] = cos(rot) * scale;
        ptr[1] = -(ptr[3] = sin(rot) * scale);
        ptr[3] *= sheer;
        ptr[4] *= sheer;
        ptr[2] = dx;
        ptr[5] = dy;
        ptr[6] = kx;
        ptr[7] = ky;
        ptr[8] = kb;
        ptr += 9;
    }
    mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems());
}
#endif
}  // namespace
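
// Each test below follows the same pattern: build a small graph (usually
// with graph_opt_level set to 0 to disable implicit optimizations), run one
// or more gopt passes or gopt::optimize_for_inference on the endpoints,
// assert the expected structure of the rewritten graph, then compile and
// execute both versions and compare the numerical results.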
TEST(TestGoptInference, ParamFuseConstEndPoint) {
    constexpr size_t SIZE = 23;
    HostTensorGenerator<> gen;
    auto host_x = gen({SIZE}), host_y = gen({1}), host_p = gen({1});
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto x = opr::SharedDeviceTensor::make(*graph, *host_x),
         y = opr::SharedDeviceTensor::make(*graph, *host_y),
         p = opr::Host2DeviceCopy::make(*graph, host_p), q = p + x, a = y + 3,
         z0 = a + q, z1 = a + 4;
    HostTensorND host_z0, host_z1;
    SymbolVar z0_1, z1_1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{z1, z0}})
                          .endpoint_vars(),
                  z1_1, z0_1);
    auto func = graph->compile({make_callback_copy(z0_1, host_z0),
                                make_callback_copy(z1_1, host_z1)});
    func->to_json()->writeto_fpath(
            output_file("TestGoptInference.ParamFuseEndPoint.json"));
    func->execute();
    int nr_opr = 0;
    func->iter_opr_seq([&](cg::OperatorNodeBase*) {
        ++nr_opr;
        return true;
    });
    ASSERT_EQ(8, nr_opr);
    auto px = host_x->ptr<float>(), pz0 = host_z0.ptr<float>();
    auto yv = host_y->ptr<float>()[0], pv = host_p->ptr<float>()[0],
         pz1 = host_z1.ptr<float>()[0];
    for (size_t i = 0; i < SIZE; ++i) {
        MGB_ASSERT_FLOAT_EQ(px[i] + yv + 3 + pv, pz0[i]);
    }
    MGB_ASSERT_FLOAT_EQ(yv + 7, pz1);
}

TEST(TestGoptInference, ParamFuse) {
    constexpr size_t SIZE = 23;
    HostTensorGenerator<> gen;
    auto host_x = gen({SIZE}), host_y = gen({1}), host_p = gen({1});
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto x = opr::SharedDeviceTensor::make(*graph, *host_x),
         y = opr::SharedDeviceTensor::make(*graph, *host_y),
         p = opr::Host2DeviceCopy::make(*graph, host_p),
         z = x + y,      // endpoint
         q = x * y + p;  // middle point
    SymbolVar z1, q1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{z, q}})
                          .endpoint_vars(),
                  z1, q1);
    ASSERT_TRUE(z1.node()->owner_opr()->same_type<opr::SharedDeviceTensor>());
    ASSERT_NE(q1.node()->owner_opr(), q.node()->owner_opr());
    ASSERT_EQ(q1.node()->owner_opr()->dyn_typeinfo(),
              q.node()->owner_opr()->dyn_typeinfo());
    HostTensorND host_z, host_q;
    auto func = graph->compile(
            {make_callback_copy(z1, host_z), make_callback_copy(q1, host_q)});
    func->execute();
    int nr_opr = 0;
    func->iter_opr_seq([&](cg::OperatorNodeBase*) {
        ++nr_opr;
        return true;
    });
    ASSERT_EQ(6, nr_opr);
    auto px = host_x->ptr<float>(), pz = host_z.ptr<float>(),
         pq = host_q.ptr<float>();
    auto yv = host_y->ptr<float>()[0], pv = host_p->ptr<float>()[0];
    for (size_t i = 0; i < SIZE; ++i) {
        MGB_ASSERT_FLOAT_EQ(px[i] + yv, pz[i]);
        MGB_ASSERT_FLOAT_EQ(px[i] * yv + pv, pq[i]);
    }
}
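
// ParamMergePass, exercised in the next test, merges separate
// SharedDeviceTensor parameters into a single MultipleDeviceTensorHolder,
// which the first assertion after the pass checks for.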
TEST(TestGoptInference, ParamFuseMultiDeviceTensorHolder) {
    constexpr size_t SIZE = 23;
    HostTensorGenerator<> gen;
    auto host_x = gen({SIZE}), host_y = gen({1}), host_p = gen({1});
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto x = opr::SharedDeviceTensor::make(*graph, *host_x),
         y = opr::SharedDeviceTensor::make(*graph, *host_y),
         p = opr::Host2DeviceCopy::make(*graph, host_p),
         z = x + y,      //! endpoint
         q = x * y + p;  //! middle point
    SymbolVar z1, q1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamMergePass>()
                          .apply({{z}})
                          .endpoint_vars(),
                  z1);
    ASSERT_TRUE(z1.node()
                        ->owner_opr()
                        ->input(0)
                        ->owner_opr()
                        ->same_type<opr::MultipleDeviceTensorHolder>());
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamMergePass>()
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{z, q}})
                          .endpoint_vars(),
                  z1, q1);
    ASSERT_TRUE(z1.node()->owner_opr()->same_type<opr::SharedDeviceTensor>());
    ASSERT_NE(q1.node()->owner_opr(), q.node()->owner_opr());
    ASSERT_EQ(q1.node()->owner_opr()->dyn_typeinfo(),
              q.node()->owner_opr()->dyn_typeinfo());
    HostTensorND host_z, host_q;
    auto func = graph->compile(
            {make_callback_copy(z1, host_z), make_callback_copy(q1, host_q)});
    func->execute();
    int nr_opr = 0;
    func->iter_opr_seq([&](cg::OperatorNodeBase* op) {
        ++nr_opr;
        return true;
    });
    ASSERT_EQ(6, nr_opr);
    auto px = host_x->ptr<float>(), pz = host_z.ptr<float>(),
         pq = host_q.ptr<float>();
    auto yv = host_y->ptr<float>()[0], pv = host_p->ptr<float>()[0];
    for (size_t i = 0; i < SIZE; ++i) {
        MGB_ASSERT_FLOAT_EQ(px[i] + yv, pz[i]);
        MGB_ASSERT_FLOAT_EQ(px[i] * yv + pv, pq[i]);
    }
}

TEST(TestGoptInference, ParamFuseMultiRead) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    auto x = mkvar("x", {23}), p0 = mkcvar("p0", {1}), p1 = mkcvar("p1", {1}),
         z0 = x * (p0 + p1) + x / (p0 + p1);
    SymbolVar z1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{z0}})
                          .endpoint_vars(),
                  z1);
    ASSERT_NE(z0.node(), z1.node());
    ASSERT_TRUE(z1.node()
                        ->owner_opr()
                        ->input(0)
                        ->owner_opr()
                        ->input(1)
                        ->owner_opr()
                        ->same_type<opr::SharedDeviceTensor>());
    ASSERT_TRUE(z1.node()
                        ->owner_opr()
                        ->input(1)
                        ->owner_opr()
                        ->input(1)
                        ->owner_opr()
                        ->same_type<opr::SharedDeviceTensor>());
    HostTensorND host_z0, host_z1;
    graph->compile({make_callback_copy(z0, host_z0),
                    make_callback_copy(z1, host_z1)})
            ->execute();
    MGB_ASSERT_TENSOR_EQ(host_z0, host_z1);
}

TEST(TestGoptInference, ParamFuseStaticInfer) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    auto a = mkvar("x", {4}),
         b = a.reshape(opr::GetVarShape::make(mkcvar("tshp", {2, 2})));
    SymbolVar b1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{b}})
                          .endpoint_vars(),
                  b1);
    ASSERT_EQ(b1, a.reshape({2, 2}));
}
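
// The ParamRedistribute* tests below check that parameter multiplications
// and additions surrounding a convolution can be redistributed into the
// convolution's parameters (and then folded by ParamFusePass), so that the
// optimized graph feeds the original input directly into the convolution.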
TEST(TestGoptInference, ParamRedistributeConvMul) {
    constexpr size_t N = 4, IC = 3, IH = 5, IW = 4, OC = 4, KH = 3, KW = 2;
    HostTensorGenerator<> gen;
    auto host_x = gen({N, IC, IH, IW}), host_k = gen({IC}),
         host_w = gen({OC, IC, KH, KW});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         k = opr::Dimshuffle::make(
                 opr::SharedDeviceTensor::make(*graph, *host_k),
                 {-1, 0, -1, -1}),
         w = opr::SharedDeviceTensor::make(*graph, *host_w),
         y0 = opr::Convolution::make(x * k, w);
    SymbolVar y1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamRedistributePass>()
                          .apply({{y0}})
                          .endpoint_vars(),
                  y1);
    ASSERT_NE(y0.node(), y1.node());
    HostTensorND host_y0, host_y1;
    auto func = graph->compile({make_callback_copy(y0, host_y0),
                                make_callback_copy(y1, host_y1)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y0, host_y1);
}

TEST(TestGoptInference, ParamRedistributeConvMulUniqReader) {
    constexpr size_t N = 4, C = 3, IH = 5, IW = 4, KH = 1, KW = 1;
    HostTensorGenerator<> gen;
    auto host_x = gen({N, C, IH, IW}), host_k = gen({C}),
         host_w = gen({C, C, KH, KW});
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         k = opr::Dimshuffle::make(
                 opr::SharedDeviceTensor::make(*graph, *host_k) + 2,
                 {-1, 0, -1, -1}),
         w = opr::SharedDeviceTensor::make(*graph, *host_w),
         // y0 should be replaced
         y0 = opr::powf(opr::Convolution::make(x * k, w).rename("y0") + 2,
                        2),
         y0k = (y0 * k).rename("y0k"),
         // y0k is accessed twice, so it should not be replaced
         y1 = opr::Convolution::make(y0k, w).rename("y1"), z0 = y1 / y0k;
    SymbolVar z1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamRedistributePass>()
                          .apply({{z0}})
                          .endpoint_vars(),
                  z1);
    ASSERT_NE(z0.node(), z1.node());
    auto y1_repl = z1.node()->owner_opr()->input(0)->owner_opr();
    ASSERT_TRUE(y1_repl->same_type<opr::Convolution>());
    ASSERT_EQ(y1_repl->input(0), z1.node()->owner_opr()->input(1));
    HostTensorND host_z0, host_z1;
    auto func = graph->compile({make_callback_copy(z0, host_z0),
                                make_callback_copy(z1, host_z1)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_z0, host_z1, 5e-5);
}

TEST(TestGoptInference, ParamRedistributeMulConvMul) {
    constexpr size_t N = 4, IC = 3, IH = 5, IW = 4, OC = 4, KH = 3, KW = 2;
    HostTensorGenerator<> gen;
    auto host_x = gen({N, IC, IH, IW}), host_k1 = gen({IC}),
         host_k2 = gen({1, OC, 1, 1}), host_w = gen({OC, IC, KH, KW});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         k1 = opr::Dimshuffle::make(
                 opr::SharedDeviceTensor::make(*graph, *host_k1),
                 {-1, 0, -1, -1}),
         k2 = opr::SharedDeviceTensor::make(*graph, *host_k2),
         w = opr::SharedDeviceTensor::make(*graph, *host_w),
         y0 = opr::Convolution::make(x * k1, w) * k2;
    SymbolVar y1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamRedistributePass>()
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{y0}})
                          .endpoint_vars(),
                  y1);
    auto y1opr = y1.node()->owner_opr();
    ASSERT_TRUE(y1opr->same_type<opr::Convolution>());
    ASSERT_EQ(y1opr->input(0), x.node());
    HostTensorND host_y0, host_y1;
    auto func = graph->compile({make_callback_copy(y0, host_y0),
                                make_callback_copy(y1, host_y1)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y0, host_y1, 5e-6);
}

TEST(TestGoptInference, ParamRedistributeConvAdd) {
    constexpr size_t N = 4, IC = 3, IH = 5, IW = 4, OC = 4, KH = 3, KW = 2;
    HostTensorGenerator<> gen;
    auto host_x = gen({N, IC, IH, IW}), host_b = gen({IC}),
         host_w = gen({OC, IC, KH, KW});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         b = opr::Dimshuffle::make(
                 opr::SharedDeviceTensor::make(*graph, *host_b),
                 {-1, 0, -1, -1}),
         w = opr::SharedDeviceTensor::make(*graph, *host_w),
         y0 = opr::Convolution::make(x + b, w);
    SymbolVar y1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamRedistributePass>()
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{y0}})
                          .endpoint_vars(),
                  y1);
    ASSERT_NE(y0.node(), y1.node());
    HostTensorND host_y0, host_y1;
    auto func = graph->compile({make_callback_copy(y0, host_y0),
                                make_callback_copy(y1, host_y1)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y0, host_y1, 1e-5);
}

TEST(TestGoptInference, ParamRedistributeDistThenReasso) {
    constexpr size_t N = 4, IC0 = 3, IC1 = 6, IH = 5, IW = 4, OC = 4, KH = 3,
                     KW = 2;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    auto x0 = mkvar("x0", {N, IC0, IH, IW}),
         x1 = mkvar("x1", {N, IC1, IH, IW}),
         k0 = opr::Dimshuffle::make(mkcvar("x1_", {IC0}), {-1, 0, -1, -1})
                      .rename("x1"),
         w0 = mkcvar("w0", {OC, IC0, KH, KW}),
         k1 = mkcvar("k1", {1, IC1, 1, 1}),
         w1 = mkcvar("w1", {OC, IC1, KH, KW}), b0 = mkvar("b0", {1, OC, 1, 1}),
         b1 = mkcvar("b1", {1}), k2 = mkcvar("k2", {1}),
         y0 = (opr::Convolution::make(x0 * k0, w0) +
               opr::Convolution::make(x1 + k1, w1) + b0 + b1) *
              k2;
    SymbolVar y1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamRedistributePass>()
                          .add_pass<gopt::ReorderArithChainPass>(
                                  gopt::ConstVarType::IMMUTABLE_AND_PARAM)
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{y0}})
                          .endpoint_vars(),
                  y1);
    ASSERT_NE(y0.node(), y1.node());
    HostTensorND host_y0, host_y1;
    auto func = graph->compile({make_callback_copy(y0, host_y0),
                                make_callback_copy(y1, host_y1)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y0, host_y1, 1e-5);
    auto chain =
            gopt::extract_opr_leaves(y1.node(), [](cg::OperatorNodeBase* opr) {
                return gopt::as_elem_opr(opr, opr::Elemwise::Mode::ADD);
            });
    size_t nr_conv = 0;
    for (auto i : chain) {
        auto opr = i->owner_opr();
        if (opr->same_type<opr::Convolution>()) {
            ++nr_conv;
            ASSERT_TRUE(opr->input(0)
                                ->owner_opr()
                                ->same_type<opr::Host2DeviceCopy>());
            ASSERT_TRUE(opr->input(1)
                                ->owner_opr()
                                ->same_type<opr::SharedDeviceTensor>());
        }
    }
    ASSERT_EQ(2u, nr_conv);
    ASSERT_EQ(4u, chain.size());
}

TEST(TestGoptInference, ParamRedistributeMultiChange) {
    constexpr size_t N = 4, IC = 3, IH = 5, IW = 4, OC = 4, KH = 3, KW = 2;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    auto x = mkvar("x", {N, IC, IH, IW}), k0 = mkcvar("k0", {1, IC, 1, 1}),
         b0 = mkcvar("b0", {1, IC, 1, 1}), k1 = mkcvar("k0", {1}),
         b1 = mkcvar("b0", {1}), w = mkcvar("w", {OC, IC, KH, KW}),
         y0 = (opr::Convolution::make(x * k0 + b0, w) + b1) * k1;
    SymbolVar y1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamRedistributePass>()
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{y0}})
                          .endpoint_vars(),
                  y1);
    ASSERT_NE(y0.node(), y1.node());
    HostTensorND host_y0, host_y1;
    auto func = graph->compile({make_callback_copy(y0, host_y0),
                                make_callback_copy(y1, host_y1)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y0, host_y1, 1e-5);
    auto y1elem = gopt::as_elem_opr(y1.node(), opr::Elemwise::Mode::ADD);
    ASSERT_TRUE(y1elem);
    auto yconv = y1elem->input(0)->owner_opr();
    if (!yconv->same_type<opr::Convolution>())
        yconv = y1elem->input(1)->owner_opr();
    ASSERT_TRUE(yconv->same_type<opr::Convolution>());
    ASSERT_EQ(x.node(), yconv->input(0));
}

TEST(TestGoptInference, ParamRedistributeMultiReader) {
    constexpr size_t N = 4, IC = 3, IH = 5, IW = 4, OC = 4, KH = 3, KW = 2;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    auto x = mkvar("x", {N, IC, IH, IW}), k = mkcvar("k", {1, OC, 1, 1}),
         w = mkcvar("w", {OC, IC, KH, KW});
    auto conv = opr::Convolution::make(x, w);
    auto t = conv * k;
    auto y0 = t * 4.2f + t * 2.4f;
    SymbolVar y1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamRedistributePass>()
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{y0}})
                          .endpoint_vars(),
                  y1);
    ASSERT_NE(y0.node(), y1.node());
    HostTensorND host_y0, host_y1;
    auto func = graph->compile({make_callback_copy(y0, host_y0),
                                make_callback_copy(y1, host_y1)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y0, host_y1, 1e-5);
    auto y1elem = gopt::as_elem_opr(y1.node(), opr::Elemwise::Mode::ADD);
    ASSERT_TRUE(y1elem);
    auto ymul0 = gopt::as_elem_opr(y1elem->input(0), opr::Elemwise::Mode::MUL),
         ymul1 = gopt::as_elem_opr(y1elem->input(1), opr::Elemwise::Mode::MUL);
    ASSERT_TRUE(ymul0);
    ASSERT_TRUE(ymul1);
    auto yconv = ymul0->input(0)->owner_opr();
    if (!yconv->same_type<opr::Convolution>()) {
        yconv = ymul0->input(1)->owner_opr();
    }
    ASSERT_TRUE(yconv->same_type<opr::Convolution>());
    if (ymul1->input(0) != yconv->output(0)) {
        ASSERT_EQ(yconv->output(0), ymul1->input(1));
    }
    ASSERT_EQ(x.node(), yconv->input(0));
}

TEST(TestGoptInference, ParamFuseBiasMerge) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    auto x = mkvar("x", {6, 3, 8, 8}), w1 = mkcvar("w1", {4, 3, 3, 3}),
         w2 = mkcvar("w2", {4, 3, 3, 3}), b1 = mkcvar("b1", {1, 4, 1, 1}),
         b2 = mkcvar("b2", {1, 4, 1, 1}),
         y1 = opr::Convolution::make(x, w1) + b1,
         y2 = opr::Convolution::make(x, w2) + b2, y = y1 + y2;
    SymbolVar y_opt;
    unpack_vector(gopt::optimize_for_inference({y}), y_opt);
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.ParamFuseConvMerge.json"));
    auto chain = gopt::extract_opr_leaves(
            y_opt.node(), [](cg::OperatorNodeBase* opr) {
                return gopt::as_elem_opr(opr, opr::Elemwise::Mode::ADD);
            });
    ASSERT_EQ(3u, chain.size());
}
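
// The Float16 / Uint8 tests below verify the dtype-conversion options:
// enable_f16_io_comp() converts both I/O and computation to float16, while
// enable_f16_io_f32_comp() keeps computation in float32 (checked via
// ComputeMode::FLOAT32); the Float32TOFloat16* tests additionally compare
// against manually built reference graphs with explicit TypeCvt operators.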
TEST(TestGoptInference, Float16IOFloat32Compute) {
    constexpr size_t INP_H = 10, INP_W = 10;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    graph->options().graph_opt_level = 0;
    auto a = mkvar("a", {1, 4, INP_H, INP_W}),
         s0 = mkvar("s0", {20, 3, INP_H, INP_W}),
         s1 = mkvar("s1", {4, 3, 1, 1});
    auto b = opr::Convolution::make(s0, s1, {}, {});
    auto y = a + b;
    y = opr::Concat::make({y, -y}, 0);
    y = opr::Reduce::make(y, {}, y.make_scalar(1));
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_f16_io_f32_comp();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(y_opt.dtype(), dtype::Float32());
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}

TEST(TestGoptInference, Float16IOFloat32ComputeDeConv) {
    constexpr size_t INP_H = 10, INP_W = 10;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    graph->options().graph_opt_level = 0;
    auto s0 = mkvar("s0", {5, 5, 3, 3}), s1 = mkvar("s1", {1, 5, INP_H, INP_W});
    auto y = opr::ConvolutionBackwardData::make(s0, s1, {}, {});
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_f16_io_f32_comp();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(
            find_opr<opr::ConvolutionBackwardData>(y_opt).param().compute_mode,
            opr::ConvBias::Param::ConvBias::ComputeMode::FLOAT32);
    ASSERT_EQ(y_opt.dtype(), dtype::Float32());
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-2);
}

TEST(TestGoptInference, Float16IOFloat32ComputeWarpPerspective) {
    constexpr size_t INP_H = 10, INP_W = 10, N = 2;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    graph->options().graph_opt_level = 0;
    auto a = mkvar("a", {N, 4, INP_H, INP_W});
    float value1 = M_PI, value2 = 0.6;
    auto gen_mat = [&](HostTensorND& mat) {
        auto ptr = mat.ptr<float>();
        for (size_t i = 0; i < N; ++i) {
            auto rot = value1, scale = value2, sheer = value1, dy = value2,
                 dx = value2, ky = value2, kx = value2, kb = value2;
            ptr[0] = ptr[4] = cos(rot) * scale;
            ptr[1] = -(ptr[3] = sin(rot) * scale);
            ptr[3] *= sheer;
            ptr[4] *= sheer;
            ptr[2] = dx;
            ptr[5] = dy;
            ptr[6] = kx;
            ptr[7] = ky;
            ptr[8] = kb;
            ptr += 9;
        }
        mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems());
    };
    auto mat_host = std::make_shared<HostTensorND>(
            a.node()->comp_node(), TensorShape{N, 3, 3}, dtype::Float32());
    gen_mat(*mat_host);
    auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat");
    TensorShape out_shp{20, 20};
    auto y = opr::WarpPerspective::make(a, mat, out_shp);
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_f16_io_f32_comp();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(y_opt.dtype(), dtype::Float32());
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}

TEST(TestGoptInference, Float16IOFloat32ComputeRemap) {
    auto cn = CompNode::load("cpu1");
    constexpr size_t INP_H = 10, INP_W = 10, N = 2;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    graph->options().graph_opt_level = 0;
    auto a = mkvar("a", {N, 4, INP_H, INP_W});
    auto gen_map = [&](HostTensorND& mat) {
        auto ptr = mat.ptr<float>();
        for (size_t n = 0; n < N; ++n) {
            for (int h = 0; h < 5; ++h) {
                for (int w = 0; w < 5; ++w) {
                    *ptr++ = (h * 5 * 2) + 5 * 2 + 0;
                    *ptr++ = (h * 5 * 2) + 5 * 2 + 1;
                }
            }
        }
        mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems());
    };
    auto map_host = std::make_shared<HostTensorND>(
            a.node()->comp_node(), TensorShape{N, 5, 5, 2}, dtype::Float32());
    gen_map(*map_host);
    auto map = opr::Host2DeviceCopy::make(*graph, map_host).rename("map");
    auto y = opr::Remap::make(a, map);
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_f16_io_f32_comp();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(y_opt.dtype(), dtype::Float32());
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}

TEST(TestGoptInference, Uint8IOFloat16ComputeWarpPerspective) {
    constexpr size_t INP_H = 10, INP_W = 10, N = 2;
    HostTensorGenerator<dtype::Uint8> gen_uint8;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen_uint8(shp)).rename(name);
    };
    graph->options().graph_opt_level = 0;
    auto a = mkvar("a", {N, 4, INP_H, INP_W});
    float value1 = M_PI, value2 = 0.6;
    auto gen_mat = [&](HostTensorND& mat) {
        auto ptr = mat.ptr<float>();
        for (size_t i = 0; i < N; ++i) {
            auto rot = value1, scale = value2, sheer = value1, dy = value2,
                 dx = value2, ky = value2, kx = value2, kb = value2;
            ptr[0] = ptr[4] = cos(rot) * scale;
            ptr[1] = -(ptr[3] = sin(rot) * scale);
            ptr[3] *= sheer;
            ptr[4] *= sheer;
            ptr[2] = dx;
            ptr[5] = dy;
            ptr[6] = kx;
            ptr[7] = ky;
            ptr[8] = kb;
            ptr += 9;
        }
        mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems());
    };
    auto mat_host = std::make_shared<HostTensorND>(
            a.node()->comp_node(), TensorShape{N, 3, 3}, dtype::Float32());
    gen_mat(*mat_host);
    auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat");
    TensorShape out_shp{20, 20};
    auto y = opr::WarpPerspective::make(a, mat, out_shp);
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_f16_io_comp();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(y_opt.dtype(), dtype::Uint8());
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}

TEST(TestGoptInference, Float32TOFloat16) {
    CompNode cn = CompNode::load("cpu0");
    HostTensorGenerator<> gen(0, 1, 0);
    auto host_x0 = gen({1, 4, 16, 8}, cn), host_x1 = gen({2, 3, 16, 8}, cn),
         host_x2 = gen({4, 3, 1, 1}, cn);
    auto graph = ComputingGraph::make();
    auto make_f32_to_f16_graph = [&]() {
        graph->options().graph_opt_level = 0;
        auto d0 = opr::Host2DeviceCopy::make(*graph, host_x0),
             d1 = opr::Host2DeviceCopy::make(*graph, host_x1),
             d2 = opr::SharedDeviceTensor::make(*graph, *host_x2);
        auto b = opr::Convolution::make(d1, d2, {}, {});
        auto y = d0 + b;
        y = opr::Reduce::make(y, {}, y.make_scalar(1));
        SymbolVar y_opt;
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_f16_io_comp();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
        return y_opt;
    };
    auto make_f16_graph = [&]() {
        auto d0 = opr::TypeCvt::make(
                     opr::Host2DeviceCopy::make(*graph, host_x0),
                     dtype::Float16{}),
             d1 = opr::TypeCvt::make(
                     opr::Host2DeviceCopy::make(*graph, host_x1),
                     dtype::Float16{}),
             d2 = opr::TypeCvt::make(
                     opr::SharedDeviceTensor::make(*graph, *host_x2),
                     dtype::Float16{});
        auto b = opr::Convolution::make(d1, d2, {}, {});
        SymbolVar y = d0 + b;
        y = opr::Reduce::make(y, {}, y.make_scalar(1));
        y = opr::TypeCvt::make(y, dtype::Float32{});
        return y;
    };
    auto y_opt = make_f32_to_f16_graph();
    auto y = make_f16_graph();
    ASSERT_EQ(y_opt.dtype(), dtype::Float32{});
    ASSERT_EQ(y.dtype(), dtype::Float32{});
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}

TEST(TestGoptInference, Float32TOFloat16C32) {
    CompNode cn = CompNode::load("cpu0");
    HostTensorGenerator<> gen(0, 1, 0);
    auto host_x0 = gen({1, 4, 1, 1}, cn), host_x1 = gen({2, 3, 16, 8}, cn),
         host_x2 = gen({4, 3, 1, 1}, cn);
    auto graph = ComputingGraph::make();
    auto make_f32_to_f16_graph = [&]() {
        graph->options().graph_opt_level = 0;
        auto d0 = opr::Host2DeviceCopy::make(*graph, host_x0),
             d1 = opr::Host2DeviceCopy::make(*graph, host_x1),
             d2 = opr::SharedDeviceTensor::make(*graph, *host_x2);
        auto y = opr::ConvBias::make(d1, d2, d0);
        y = opr::Reduce::make(y, {}, y.make_scalar(1));
        SymbolVar y_opt;
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_f16_io_f32_comp();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
        return y_opt;
    };
    auto make_f16_graph = [&]() {
        auto d0 = opr::TypeCvt::make(
                     opr::TypeCvt::make(
                             opr::Host2DeviceCopy::make(*graph, host_x0),
                             dtype::Float16{}),
                     dtype::Float32{}),
             d1 = opr::TypeCvt::make(
                     opr::TypeCvt::make(
                             opr::Host2DeviceCopy::make(*graph, host_x1),
                             dtype::Float16{}),
                     dtype::Float32{}),
             d2 = opr::TypeCvt::make(
                     opr::TypeCvt::make(
                             opr::SharedDeviceTensor::make(*graph, *host_x2),
                             dtype::Float16{}),
                     dtype::Float32{});
        auto y = opr::ConvBias::make(d1, d2, d0);
        y = opr::Reduce::make(y, {}, y.make_scalar(1));
        y = opr::TypeCvt::make(opr::TypeCvt::make(y, dtype::Float16{}),
                               dtype::Float32{});
        return y;
    };
    auto y_opt = make_f32_to_f16_graph();
    auto y = make_f16_graph();
    ASSERT_EQ(find_opr<opr::ConvBias>(y_opt).param().compute_mode,
              opr::ConvBias::Param::ConvBias::ComputeMode::FLOAT32);
    ASSERT_EQ(y_opt.dtype(), dtype::Float32{});
    ASSERT_EQ(y.dtype(), dtype::Float32{});
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}

TEST(TestGoptInference, Float32TOFloat16EndpointElemwise) {
    CompNode cn = CompNode::load("cpu0");
    HostTensorGenerator<> gen(0, 1, 0);
    auto host_x0 = gen({1, 4, 16, 8}, cn), host_x1 = gen({2, 3, 16, 8}, cn),
         host_x2 = gen({4, 3, 1, 1}, cn);
    auto graph = ComputingGraph::make();
    auto make_f32_to_f16_graph = [&]() {
        graph->options().graph_opt_level = 0;
        auto d0 = opr::Host2DeviceCopy::make(*graph, host_x0),
             d1 = opr::Host2DeviceCopy::make(*graph, host_x1),
             d2 = opr::SharedDeviceTensor::make(*graph, *host_x2);
        auto b = opr::Convolution::make(d1, d2, {}, {});
        auto y = d0 + b;
        SymbolVar y_opt;
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_f16_io_comp();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
        return y_opt;
    };
    auto make_f16_graph = [&]() {
        auto d0 = opr::TypeCvt::make(
                     opr::Host2DeviceCopy::make(*graph, host_x0),
                     dtype::Float16{}),
             d1 = opr::TypeCvt::make(
                     opr::Host2DeviceCopy::make(*graph, host_x1),
                     dtype::Float16{}),
             d2 = opr::TypeCvt::make(
                     opr::SharedDeviceTensor::make(*graph, *host_x2),
                     dtype::Float16{});
        auto b = opr::Convolution::make(d1, d2, {}, {});
        SymbolVar y = d0 + b;
        y = opr::TypeCvt::make(y, dtype::Float32{});
        return y;
    };
    auto y_opt = make_f32_to_f16_graph();
    auto y = make_f16_graph();
    ASSERT_EQ(y_opt.dtype(), dtype::Float32{});
    ASSERT_EQ(y.dtype(), dtype::Float32{});
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}

TEST(TestGoptInference, Float32TOFloat16Linspace) {
    CompNode cn = CompNode::load("cpu0");
    HostTensorGenerator<> gen(0, 1, 0);
    auto host_x = gen({3, 1}, cn);
    auto graph = ComputingGraph::make();
    auto make_f32_to_f16_graph = [&]() {
        graph->options().graph_opt_level = 0;
        auto x = opr::Host2DeviceCopy::make(*graph, host_x);
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto lin = opr::Linspace::make(cv(0), sub(0) - 1, sub(0), {}, {});
        auto shp = opr::Concat::make({sub(1), sub(0)}, 0);
        auto y = opr::Reshape::make(lin, shp);
        auto mm = opr::MatrixMul::make(x, y);
        SymbolVar mm_opt;
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_f16_io_comp();
        unpack_vector(gopt::optimize_for_inference({mm}, options), mm_opt);
        return mm_opt;
    };
    auto make_f16_graph = [&]() {
        auto x = opr::TypeCvt::make(opr::Host2DeviceCopy::make(*graph, host_x),
                                    dtype::Float16());
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto lin = opr::Linspace::make(cv(0), sub(0) - 1, sub(0), {}, {});
        lin = opr::TypeCvt::make(lin, dtype::Float16());
        auto shp = opr::Concat::make({sub(1), sub(0)}, 0);
        auto y = opr::Reshape::make(lin, shp);
        auto mm = opr::MatrixMul::make(x, y);
        mm = opr::TypeCvt::make(mm, dtype::Float32{});
        return mm;
    };
    auto y_opt = make_f32_to_f16_graph();
    auto y = make_f16_graph();
    ASSERT_EQ(y_opt.dtype(), dtype::Float32{});
    ASSERT_EQ(y.dtype(), dtype::Float32{});
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}

TEST(TestGoptInference, Float32TOFloat16Endpoints) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    graph->options().graph_opt_level = 0;
    opr::Convolution::Param param;
    param.pad_h = param.pad_w = 0;
    auto x = mkvar("x", {8, 8, 8, 8}), y = mkvar("y", {8, 8, 8, 8}),
         w = mkcvar("w", {4, 8, 3, 3}),
         z = opr::Convolution::make(x + y, w, param);
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_f16_io_f32_comp();
    SymbolVarArray out = gopt::optimize_for_inference({x + y, z}, options);
    ASSERT_EQ(out[0].dtype(), dtype::Float32());
    ASSERT_EQ(out[1].dtype(), dtype::Float32());
    ASSERT_EQ(out[0].node()->owner_opr()->input(0)->dtype(), dtype::Float16());
    ASSERT_EQ(out[1].node()->owner_opr()->input(0)->dtype(), dtype::Float16());
}
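
// The ConvertFormat* tests rewrite NCHW graphs into the NHWCD4 layout and
// check that convolutions end up with Format::NHWCD4, while endpoints that
// cannot use that layout (e.g. the AxisAddRemove below) fall back to the
// default tensor format.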
  973. TEST(TestGoptInference, ConvertFormatNHWCD4) {
974. // the HWCD4 format is only supported by the naive MegDNN handle
  975. NaiveMegDNNHandleScope naive_megdnn_handle;
  976. HostTensorGenerator<> gen;
  977. auto cn = CompNode::load("cpu0");
  978. auto graph = ComputingGraph::make();
  979. graph->options().graph_opt_level = 0;
  980. auto mkvar = [&](const char* name, const TensorShape& shp) {
  981. return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
  982. };
  983. auto mkcvar = [&](const char* name, const TensorShape& shp) {
  984. return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  985. .rename(name);
  986. };
  987. auto host_x = gen({8, 8, 8, 8}, cn);
  988. auto x = opr::Host2DeviceCopy::make(*graph, host_x);
  989. opr::Convolution::Param param;
  990. param.pad_h = param.pad_w = 0;
  991. auto w1 = mkcvar("w1", {4, 8, 3, 3}),
  992. conv = opr::Convolution::make(x, w1, param);
  993. auto shape_of = opr::GetVarShape::make(conv);
  994. auto subtensor = opr::Subtensor::make(
  995. shape_of, {opr::Subtensor::AxisIndexer::make_interval(
  996. 0, x.make_scalar(2), None, x.make_scalar(1))});
  997. opr::Resize::Param param_resize;
  998. param_resize.format = opr::Resize::Param::Format::NCHW;
  999. auto resize = opr::ResizeForward::make(conv, subtensor * 2, param_resize);
  1000. auto mat = mkcvar("mat", {8, 3, 3}),
  1001. warp = opr::WarpPerspectiveForward::make(
  1002. resize, mat, nullptr, cg::var_from_tensor_shape(x, {4, 4}));
  1003. auto b = mkvar("b", {1, 4, 1, 1}),
  1004. elem = opr::Elemwise::make({warp + b},
  1005. opr::Elemwise::Param::Mode::RELU);
  1006. param.pad_h = param.pad_w = 1;
  1007. auto w2 = mkcvar("w2", {4, 4, 3, 3}),
  1008. y = opr::Convolution::make(elem, w2, param),
  1009. z = opr::AxisAddRemove::make(
  1010. y, {opr::AxisAddRemove::AxisDesc::make_add(0)});
  1011. SymbolVar y_opt, z_opt;
  1012. auto options = gopt::OptimizeForInferenceOptions{};
  1013. options.enable_nhwcd4();
  1014. unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
  1015. unpack_vector(gopt::optimize_for_inference({z}, options), z_opt);
  1016. ASSERT_EQ(opr::Convolution::Param::Format::NHWCD4,
  1017. find_opr<opr::Convolution>(y_opt).param().format);
  1018. ASSERT_EQ(TensorFormat::Type::DEFAULT,
  1019. find_opr<opr::AxisAddRemove>(z_opt).input(0)->format().type());
  1020. ASSERT_EQ(4, find_opr<opr::AxisAddRemove>(z_opt).input(0)->shape().ndim);
  1021. graph->compile({{y_opt, {}}})
  1022. ->to_json()
  1023. ->writeto_fpath(
  1024. output_file("TestGoptInference.ConvertFormatNHWCD4.json"));
  1025. HostTensorND host_y_opt, host_y;
  1026. auto func = graph->compile({make_callback_copy(y, host_y),
  1027. make_callback_copy(y_opt, host_y_opt)});
  1028. func->execute();
  1029. MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
  1030. *host_x = *gen({8, 8, 16, 16}, cn);
  1031. func->execute();
  1032. MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
  1033. }
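// NHWCD4 conversion with elemwise oprs between the convolutions, covering
// both a broadcast bias and a scalar addend.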
  1034. TEST(TestGoptInference, ConvertFormatNHWCD4Elemwise) {
1035. // the HWCD4 format is only supported by the naive MegDNN handle
  1036. NaiveMegDNNHandleScope naive_megdnn_handle;
  1037. HostTensorGenerator<> gen;
  1038. auto cn = CompNode::load("cpu0");
  1039. auto graph = ComputingGraph::make();
  1040. graph->options().graph_opt_level = 0;
  1041. auto mkvar = [&](const char* name, const TensorShape& shp) {
  1042. return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
  1043. };
  1044. auto mkcvar = [&](const char* name, const TensorShape& shp) {
  1045. return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  1046. .rename(name);
  1047. };
  1048. auto host_x = gen({8, 8, 8, 8}, cn);
  1049. auto x = opr::Host2DeviceCopy::make(*graph, host_x);
  1050. opr::Convolution::Param param;
  1051. param.pad_h = param.pad_w = 0;
  1052. auto w1 = mkcvar("w1", {8, 8, 3, 3}),
  1053. conv = opr::Convolution::make(x, w1, param);
  1054. auto b = mkvar("b", {1, 1, 1, 1}),
  1055. elem = opr::Elemwise::make({conv + b},
  1056. opr::Elemwise::Param::Mode::RELU);
  1057. param.pad_h = param.pad_w = 1;
  1058. auto w2 = mkcvar("w2", {8, 8, 3, 3}),
  1059. conv2 = opr::Convolution::make(elem, w2, param);
1060. auto b_scalar = mkvar("b_scalar", {1}), elem2 = conv2 + b_scalar;
  1061. param.pad_h = param.pad_w = 1;
  1062. auto w3 = mkcvar("w2", {8, 8, 3, 3}),
  1063. y = opr::Convolution::make(elem2, w3, param);
  1064. SymbolVar y_opt;
  1065. auto options = gopt::OptimizeForInferenceOptions{};
  1066. options.enable_nhwcd4();
  1067. unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
  1068. ASSERT_EQ(opr::Convolution::Param::Format::NHWCD4,
  1069. find_opr<opr::Convolution>(y_opt).param().format);
  1070. graph->compile({{y_opt, {}}})
  1071. ->to_json()
  1072. ->writeto_fpath(output_file(
  1073. "TestGoptInference.ConvertFormatNHWCD4Elemwise.json"));
  1074. HostTensorND host_y_opt, host_y;
  1075. auto func = graph->compile({make_callback_copy(y, host_y),
  1076. make_callback_copy(y_opt, host_y_opt)});
  1077. func->execute();
  1078. MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
  1079. *host_x = *gen({8, 8, 16, 16}, cn);
  1080. func->execute();
  1081. MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
  1082. }
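// NHWCD4 conversion when the conv outputs pass through TypeCvt (f32 -> f16)
// before being added; the results are required to match exactly.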
  1083. TEST(TestGoptInference, ConvertFormatNHWCD4TypeCvt) {
  1084. NaiveMegDNNHandleScope naive_megdnn_handle;
  1085. HostTensorGenerator<> gen;
  1086. auto cn = CompNode::load("cpu0");
  1087. auto graph = ComputingGraph::make();
  1088. graph->options().graph_opt_level = 0;
  1089. auto mkcvar = [&](const char* name, const TensorShape& shp) {
  1090. return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  1091. .rename(name);
  1092. };
  1093. auto host_x = gen({8, 8, 8, 8}, cn);
  1094. auto x = opr::Host2DeviceCopy::make(*graph, host_x);
  1095. opr::Convolution::Param param;
  1096. param.pad_h = param.pad_w = 0;
  1097. auto w1 = mkcvar("w1", {8, 8, 3, 3}),
  1098. conv1 = opr::Convolution::make(x, w1, param),
  1099. tcvt1 = opr::TypeCvt::make(conv1, dtype::Float16());
  1100. auto w2 = mkcvar("w2", {8, 8, 3, 3}),
  1101. conv2 = opr::Convolution::make(x, w2, param),
  1102. tcvt2 = opr::TypeCvt::make(conv2, dtype::Float16());
  1103. auto y = opr::Elemwise::make({tcvt1, tcvt2}, opr::Elemwise::Param::Mode::ADD);
  1104. SymbolVar y_opt;
  1105. auto options = gopt::OptimizeForInferenceOptions{};
  1106. options.enable_nhwcd4();
  1107. unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
  1108. ASSERT_EQ(opr::Convolution::Param::Format::NHWCD4,
  1109. find_opr<opr::Convolution>(y_opt).param().format);
  1110. graph->compile({{y_opt, {}}})
  1111. ->to_json()
  1112. ->writeto_fpath(output_file(
  1113. "TestGoptInference.ConvertFormatNHWCD4TypeCvt.json"));
  1114. HostTensorND host_y_opt, host_y;
  1115. auto func = graph->compile({make_callback_copy(y, host_y),
  1116. make_callback_copy(y_opt, host_y_opt)});
  1117. func->execute();
  1118. MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
  1119. *host_x = *gen({8, 8, 16, 16}, cn);
  1120. func->execute();
  1121. MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
  1122. }
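// Local and GroupLocal have no NHWCD4 implementation, so they are expected
// to stay NCHW while the surrounding convolutions convert to NHWCD4.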
  1123. TEST(TestGoptInference, ConvertFormatNHWCD4LOCAL) {
1124. // the HWCD4 format is only supported by the naive MegDNN handle
  1125. NaiveMegDNNHandleScope naive_megdnn_handle;
  1126. HostTensorGenerator<> gen;
  1127. auto cn = CompNode::load("cpu0");
  1128. auto graph = ComputingGraph::make();
  1129. graph->options().graph_opt_level = 0;
  1130. auto mkcvar = [&](const char* name, const TensorShape& shp) {
  1131. return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  1132. .rename(name);
  1133. };
  1134. auto host_x = gen({2, 8, 8, 16}, cn);
  1135. auto x = opr::Host2DeviceCopy::make(*graph, host_x);
  1136. opr::Convolution::Param param;
  1137. param.pad_h = param.pad_w = 1;
  1138. auto w1 = mkcvar("w1", {4, 8, 3, 3}),
  1139. conv1 = opr::Convolution::make(x, w1, param);
  1140. auto w2 = mkcvar("w2", {8, 16, 4, 3, 3, 4}),
  1141. local = opr::Local::make(conv1, w2, param);
  1142. auto w3 = mkcvar("w3", {4, 4, 3, 3}),
  1143. conv2 = opr::Convolution::make(local, w3, param);
  1144. opr::GroupLocal::Param param_group_local;
  1145. param_group_local.pad_h = param_group_local.pad_w = 1;
  1146. auto w4 = mkcvar("w4", {2, 8, 16, 2, 3, 3, 2}),
  1147. group_local = opr::GroupLocal::make(conv2, w4, param_group_local);
  1148. auto w5 = mkcvar("w5", {4, 4, 3, 3}),
  1149. y = opr::Convolution::make(group_local, w5, param);
  1150. SymbolVar y_opt;
  1151. auto options = gopt::OptimizeForInferenceOptions{};
  1152. options.enable_nhwcd4();
  1153. unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
  1154. ASSERT_EQ(opr::Convolution::Param::Format::NHWCD4,
  1155. find_opr<opr::Convolution>(y_opt).param().format);
  1156. ASSERT_EQ(opr::Local::Param::Format::NCHW,
  1157. find_opr<opr::Local>(y_opt).param().format);
  1158. ASSERT_EQ(opr::GroupLocal::Param::Format::NCHW,
  1159. find_opr<opr::GroupLocal>(y_opt).param().format);
  1160. graph->compile({{y_opt, {}}})
  1161. ->to_json()
  1162. ->writeto_fpath(output_file(
  1163. "TestGoptInference.ConvertFormatNHWCD4LOCAL.json"));
  1164. HostTensorND host_y_opt, host_y;
  1165. auto func = graph->compile({make_callback_copy(y, host_y),
  1166. make_callback_copy(y_opt, host_y_opt)});
  1167. func->execute();
  1168. MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
  1169. }
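// ConvolutionBackwardData (deconv) is expected to stay NCHW while the
// preceding forward convolution converts to NHWCD4.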
  1170. TEST(TestGoptInference, ConvertFormatNHWCD4Deconv) {
1171. // the HWCD4 format is only supported by the naive MegDNN handle
  1172. NaiveMegDNNHandleScope naive_megdnn_handle;
  1173. HostTensorGenerator<> gen;
  1174. auto cn = CompNode::load("cpu0");
  1175. auto graph = ComputingGraph::make();
  1176. graph->options().graph_opt_level = 0;
  1177. auto mkcvar = [&](const char* name, const TensorShape& shp) {
  1178. return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  1179. .rename(name);
  1180. };
  1181. auto host_x = gen({8, 8, 8, 8}, cn);
  1182. auto x = opr::Host2DeviceCopy::make(*graph, host_x);
  1183. opr::Convolution::Param param;
  1184. param.pad_h = param.pad_w = 0;
  1185. auto w0 = mkcvar("w1", {4, 8, 2, 2}),
  1186. conv = opr::Convolution::make(x, w0, param);
  1187. auto w1 = mkcvar("w1", {4, 1, 2, 2}),
  1188. y = opr::ConvolutionBackwardData::make(w1, conv, param, {}, {});
  1189. SymbolVar y_opt;
  1190. auto options = gopt::OptimizeForInferenceOptions{};
  1191. options.enable_nhwcd4();
  1192. unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
  1193. ASSERT_EQ(opr::Convolution::Param::Format::NCHW,
  1194. find_opr<opr::ConvolutionBackwardData>(y_opt).param().format);
  1195. ASSERT_EQ(opr::Convolution::Param::Format::NHWCD4,
  1196. find_opr<opr::Convolution>(y_opt).param().format);
  1197. HostTensorND host_y_opt, host_y;
  1198. auto func = graph->compile({make_callback_copy(y, host_y),
  1199. make_callback_copy(y_opt, host_y_opt)});
  1200. func->execute();
  1201. MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
  1202. }
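// Quantized (QuantizedS8) ConvBias should convert to NHWCD4 as well; both
// outputs are cast back to float before comparison.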
  1203. TEST(TestGoptInference, ConvertFormatNHWCD4Qint8) {
1204. // the HWCD4 format is only supported by the naive MegDNN handle
  1205. NaiveMegDNNHandleScope naive_megdnn_handle;
  1206. HostTensorGenerator<> gen;
  1207. auto cn = CompNode::load("cpu0");
  1208. auto graph = ComputingGraph::make();
  1209. graph->options().graph_opt_level = 0;
  1210. auto mkcvar = [&](const char* name, const TensorShape& shp,
  1211. const DType& dtype) {
  1212. return opr::TypeCvt::make(
  1213. opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  1214. .rename(name),
  1215. dtype);
  1216. };
  1217. auto host_x = gen({8, 8, 8, 8}, cn);
  1218. auto _x = opr::Host2DeviceCopy::make(*graph, host_x),
  1219. x = opr::TypeCvt::make(_x, dtype::QuantizedS8(0.2f));
  1220. opr::ConvBias::Param param;
  1221. param.pad_h = param.pad_w = 0;
  1222. auto w = mkcvar("w", {4, 8, 3, 3}, dtype::QuantizedS8(0.1f)),
  1223. b = mkcvar("b", {1, 4, 1, 1}, dtype::QuantizedS32(0.02f)),
  1224. y = opr::ConvBias::make(x, w, b, param, {},
  1225. OperatorNodeConfig{dtype::QuantizedS8(0.2f)});
  1226. SymbolVar y_opt;
  1227. auto options = gopt::OptimizeForInferenceOptions{};
  1228. options.enable_nhwcd4();
  1229. unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
  1230. ASSERT_EQ(opr::ConvBias::Param::Format::NHWCD4,
  1231. find_opr<opr::ConvBias>(y_opt).param().format);
  1232. graph->compile({{y_opt, {}}})
  1233. ->to_json()
  1234. ->writeto_fpath(output_file(
  1235. "TestGoptInference.ConvertFormatNHWCD4Qint8.json"));
  1236. auto float_y = opr::TypeCvt::make(y, dtype::Float32()),
  1237. float_y_opt = opr::TypeCvt::make(y_opt, dtype::Float32());
  1238. HostTensorND host_y_opt, host_y;
  1239. auto func = graph->compile({make_callback_copy(float_y, host_y),
  1240. make_callback_copy(float_y_opt, host_y_opt)});
  1241. func->execute();
  1242. MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
  1243. }
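// Regression test for input-channel handling during NHWCD4 conversion: the
// conv input is assembled via Resize + Concat of two 6-channel tensors, and
// the optimized graph must stay numerically consistent.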
  1244. TEST(TestGoptInference, ConvertFormatPadIC) {
1245. // the HWCD4 format is only supported by the naive MegDNN handle
  1246. NaiveMegDNNHandleScope naive_megdnn_handle;
  1247. HostTensorGenerator<> gen;
  1248. auto cn = CompNode::load("cpu0");
  1249. auto graph = ComputingGraph::make();
  1250. graph->options().graph_opt_level = 0;
  1251. auto mkcvar = [&](const char* name, const TensorShape& shp) {
  1252. return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  1253. .rename(name);
  1254. };
  1255. auto host_inp1 = gen({1, 6, 128, 128}, cn),
  1256. host_inp2 = gen({1, 6, 256, 256}, cn);
  1257. auto inp1 = opr::Host2DeviceCopy::make(*graph, host_inp1),
  1258. inp2 = opr::Host2DeviceCopy::make(*graph, host_inp2);
  1259. auto shape_tmp = mkcvar("tmp", {256, 256});
  1260. auto shape_of = opr::GetVarShape::make(shape_tmp);
  1261. opr::Resize::Param param_resize;
  1262. param_resize.format = opr::Resize::Param::Format::NCHW;
  1263. auto resize = opr::ResizeForward::make(inp1, shape_of, param_resize);
  1264. auto concat = opr::Concat::make({inp2, resize}, 1);
  1265. opr::Convolution::Param param;
  1266. param.pad_h = param.pad_w = 1;
  1267. param.sparse = opr::Convolution::Param::Sparse::DENSE;
  1268. auto w1 = mkcvar("w1", {12, 12, 3, 3});
  1269. auto y = opr::Convolution::make(concat, w1, param);
  1270. SymbolVar y_opt;
  1271. auto options = gopt::OptimizeForInferenceOptions{};
  1272. options.enable_nhwcd4();
  1273. unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
  1274. HostTensorND host_y_opt, host_y;
  1275. auto func = graph->compile({make_callback_copy(y, host_y),
  1276. make_callback_copy(y_opt, host_y_opt)});
  1277. func->execute();
  1278. MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
  1279. }
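// Inference-mode BatchNorm should be folded into per-channel elemwise
// arithmetic: y = scale * (x - mean) / sqrt(variance + eps) + bias is an
// affine transform, so no BatchNorm opr may remain after the pass.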
  1280. TEST(TestGoptInference, ConvertBatchNormPass) {
  1281. auto cn = CompNode::load("cpu0");
  1282. HostTensorGenerator<> gen(0, 1, 0);
  1283. auto graph = ComputingGraph::make();
  1284. graph->options().graph_opt_level = 0;
  1285. auto mkvar = [&](const char* name, const TensorShape& shp) {
  1286. return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
  1287. };
  1288. auto mkcvar = [&](const char* name, const TensorShape& shp) {
  1289. return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  1290. .rename(name);
  1291. };
  1292. using Param = opr::BatchNorm::Param;
  1293. Param param(Param::ParamDim::DIM_1C11, Param::FwdMode::INFERENCE);
  1294. TensorShape shp = {1, 3, 1, 1};
  1295. auto x = mkvar("x", {2, 3, 16, 24}), scale = mkcvar("scale", shp),
  1296. bias = mkcvar("bias", shp), mean = mkcvar("mean", shp);
  1297. auto host_variance = gen(shp, cn);
  1298. for (size_t i = 0; i < shp.total_nr_elems(); ++i) {
  1299. host_variance->ptr<float>()[i] =
  1300. std::abs(host_variance->ptr<float>()[i]);
  1301. }
  1302. auto variance = opr::SharedDeviceTensor::make(*graph, *host_variance)
  1303. .rename("variance");
  1304. auto y = opr::BatchNorm::make(x, scale, bias, mean, variance, param)[4];
  1305. SymbolVar y_opt;
  1306. unpack_vector(gopt::optimize_for_inference(
  1307. {y}, gopt::OptimizeForInferenceOptions{}),
  1308. y_opt);
  1309. ASSERT_EQ(0u, find_opr_num<opr::BatchNorm>(y_opt));
  1310. graph->compile({{y_opt, {}}})
  1311. ->to_json()
  1312. ->writeto_fpath(
  1313. output_file("TestGoptInference.ConvertBatchNormPass.json"));
  1314. HostTensorND host_y, host_y_opt;
  1315. auto func = graph->compile({make_callback_copy(y, host_y),
  1316. make_callback_copy(y_opt, host_y_opt)});
  1317. func->execute();
  1318. MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
  1319. }
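// conv + bias-add + activation chains should fuse into ConvBias; y_cut below
// has a second consumer (y_expand), so that conv result must remain
// available after fusion.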
  1320. TEST(TestGoptInference, ConvBiasNonlinearityFusePass) {
1321. // the HWCD4 format is only supported by the naive MegDNN handle
  1322. NaiveMegDNNHandleScope naive_megdnn_handle;
  1323. auto cn = CompNode::load("cpu0");
  1324. HostTensorGenerator<> gen;
  1325. auto graph = ComputingGraph::make();
  1326. graph->options().graph_opt_level = 0;
  1327. auto mkvar = [&](const char* name, const TensorShape& shp) {
  1328. return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
  1329. };
  1330. auto mkcvar = [&](const char* name, const TensorShape& shp) {
  1331. return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  1332. .rename(name);
  1333. };
  1334. opr::Convolution::Param param;
  1335. auto x = mkvar("x", {5, 8, 16, 24}), w1 = mkcvar("w1", {4, 8, 1, 1}),
  1336. w2 = mkcvar("w2", {4, 4, 3, 3}), b1 = mkcvar("b1", {1, 4, 1, 1}),
  1337. b2 = mkcvar("b2", {1, 4, 1, 1}), w3 = mkcvar("w3", {8, 4, 1, 1}),
  1338. y_cut = opr::Convolution::make(x, w1, param),
  1339. y1 = opr::Elemwise::make({y_cut + b1},
  1340. opr::Elemwise::Param::Mode::RELU);
  1341. param.pad_w = param.pad_h = 1;
  1342. auto y2 = opr::Elemwise::make({opr::Convolution::make(y1, w2, param) + b2},
  1343. opr::Elemwise::Param::Mode::SIGMOID);
  1344. param.pad_w = param.pad_h = 0;
  1345. auto y3 = opr::Convolution::make(y2, w3, param), y_tmp = y3 + x,
  1346. y_expand =
  1347. opr::Elemwise::make({y_cut}, opr::Elemwise::Param::Mode::RELU),
  1348. y_y = opr::Convolution::make(y_expand, w3, param), y = y_y + y_tmp;
  1349. SymbolVar y_opt;
  1350. auto options = gopt::OptimizeForInferenceOptions{};
  1351. options.enable_nhwcd4().enable_fuse_conv_bias_nonlinearity();
  1352. unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
  1353. ASSERT_EQ(3u, find_opr<opr::ConvBias>(y_opt).input().size());
  1354. graph->compile({{y_opt, {}}})
  1355. ->to_json()
  1356. ->writeto_fpath(output_file(
  1357. "TestGoptInference.FuseConvBiasNonlinPass.json"));
  1358. HostTensorND host_y, host_y_opt;
  1359. auto func = graph->compile({make_callback_copy(y, host_y),
  1360. make_callback_copy(y_opt, host_y_opt)});
  1361. func->execute();
  1362. MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-4);
  1363. }
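// Same fusion with a full-size (non-broadcast) bias, supplied either as a
// SharedDeviceTensor or as an ImmutableTensor depending on the loop index.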
  1364. TEST(TestGoptInference, ConvBiasNonlinearityFusePass_FullBias) {
  1365. NaiveMegDNNHandleScope naive_megdnn_handle;
  1366. for (int i = 0; i < 2; i++) {
  1367. auto graph = ComputingGraph::make();
  1368. auto cn = CompNode::load("cpu0");
  1369. HostTensorGenerator<> gen;
  1370. auto mkImvar = [&](const char* name, const TensorShape& shp) {
  1371. return opr::ImmutableTensor::make(*graph, *gen(shp, cn))
  1372. .rename(name);
  1373. };
  1374. graph->options().graph_opt_level = 0;
  1375. auto mkcvar = [&](const char* name, const TensorShape& shp) {
  1376. return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  1377. .rename(name);
  1378. };
  1379. opr::Convolution::Param param;
  1380. auto host_x = gen({1, 8, 16, 24}, cn);
  1381. auto x = opr::Host2DeviceCopy::make(*graph, host_x),
  1382. w1 = mkcvar("w1", {4, 8, 1, 1}), w2 = mkcvar("w2", {4, 8, 3, 3}),
  1383. w3 = mkcvar("w3", {4, 4, 1, 1}),
  1384. b = i == 0 ? mkcvar("b", {1, 4, 16, 24})
  1385. : mkImvar("bias", {1, 4, 16, 24}),
  1386. y_cut0 = opr::Convolution::make(x, w1, param);
  1387. param.pad_w = param.pad_h = 1;
  1388. auto y_cut1 = opr::Convolution::make(x, w2, param);
  1389. auto y1 = opr::Elemwise::make({y_cut0 + y_cut1},
  1390. opr::Elemwise::Param::Mode::RELU);
  1391. param.pad_w = param.pad_h = 0;
  1392. auto y2 = opr::Convolution::make(y1, w3, param);
  1393. auto y =
  1394. opr::Elemwise::make({y2 + b}, opr::Elemwise::Param::Mode::RELU);
  1395. SymbolVar y_opt;
  1396. auto options = gopt::OptimizeForInferenceOptions{};
  1397. options.enable_fuse_conv_bias_nonlinearity();
  1398. unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
  1399. ASSERT_EQ(3u, find_opr<opr::ConvBias>(y_opt).input().size());
  1400. graph->compile({{y_opt, {}}})
  1401. ->to_json()
  1402. ->writeto_fpath(
  1403. output_file("TestGoptInference.FuseConvBiasNonlinPass_"
  1404. "FulBias.json"));
  1405. HostTensorND host_y, host_y_opt;
  1406. auto func = graph->compile({make_callback_copy(y, host_y),
  1407. make_callback_copy(y_opt, host_y_opt)});
  1408. func->execute();
  1409. MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-4);
  1410. *host_x = *gen({4, 8, 16, 24}, cn);
  1411. func->execute();
  1412. MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-4);
  1413. }
  1414. }
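// ParamMergePass should merge the two SharedDeviceTensors (on different
// comp nodes) into a single MultipleDeviceTensorHolder with two outputs.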
  1415. TEST(TestGoptInference, ParamMerge) {
  1416. auto cns = load_multiple_xpus(2);
  1417. HostTensorGenerator<> gen;
  1418. auto graph = ComputingGraph::make();
  1419. auto var0 = opr::SharedDeviceTensor::make(*graph, *gen({2, 3}, cns[0])),
  1420. var1 = opr::SharedDeviceTensor::make(*graph, *gen({1, 3}, cns[1])),
  1421. y = var0 + opr::Copy::make(var1, {cns[0]});
  1422. HostTensorND y_expected_val;
  1423. graph->compile({make_callback_copy(y, y_expected_val)})->execute();
  1424. SymbolVar y_opt;
  1425. unpack_vector(gopt::GraphOptimizer{}
  1426. .add_pass<gopt::ParamMergePass>()
  1427. .apply({{y}})
  1428. .endpoint_vars(),
  1429. y_opt);
  1430. auto opr = y_opt.node()->owner_opr();
  1431. ASSERT_EQ(2u, opr->input().size());
  1432. ASSERT_EQ(2u,
  1433. find_opr<opr::MultipleDeviceTensorHolder>(y_opt).output().size());
  1434. HostTensorND y_got_val;
  1435. graph->compile({make_callback_copy(y_opt, y_got_val)})->execute();
  1436. MGB_ASSERT_TENSOR_EQ(y_expected_val, y_got_val);
  1437. }
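// Same merge for tensors carrying a non-default Image2DPack4 format, which
// must be preserved through MultipleDeviceTensorWithFormatHolder.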
  1438. TEST(TestGoptInference, ParamMergeFormat) {
  1439. auto cns = load_multiple_xpus(2);
  1440. auto make_dv = [](const HostTensorND& hv) {
  1441. TensorLayout layout{hv.layout(), hv.layout().dtype,
  1442. megdnn::Image2DPack4TensorFormat::make_raw(1, 64)};
  1443. auto ret = std::make_shared<DeviceTensorND>(hv.comp_node(), layout);
  1444. ret->copy_from_fixlayout(hv).sync();
  1445. return ret;
  1446. };
  1447. HostTensorGenerator<> gen;
  1448. auto graph = ComputingGraph::make();
  1449. auto var0 = opr::SharedDeviceTensorWithFormat::make(
  1450. *graph, make_dv(*gen({2, 32}, cns[0]))),
  1451. var1 = opr::SharedDeviceTensorWithFormat::make(
  1452. *graph, make_dv(*gen({1, 32}, cns[1]))),
  1453. y = var0 + opr::Copy::make(var1, {cns[0]});
  1454. HostTensorND y_expected_val;
  1455. graph->compile({make_callback_copy(y, y_expected_val)})->execute();
  1456. SymbolVar y_opt;
  1457. unpack_vector(gopt::GraphOptimizer{}
  1458. .add_pass<gopt::ParamMergePass>()
  1459. .apply({{y}})
  1460. .endpoint_vars(),
  1461. y_opt);
  1462. auto opr = y_opt.node()->owner_opr();
  1463. ASSERT_EQ(2u, opr->input().size());
  1464. ASSERT_EQ(2u, find_opr<opr::MultipleDeviceTensorWithFormatHolder>(y_opt)
  1465. .output()
  1466. .size());
  1467. HostTensorND y_got_val;
  1468. graph->compile({make_callback_copy(y_opt, y_got_val)})->execute();
  1469. MGB_ASSERT_TENSOR_EQ(y_expected_val, y_got_val);
  1470. }
  1471. #if MGB_ENABLE_FASTRUN
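// enable_opr_algo_profiling_inplace should switch the convolution strategy
// from the default HEURISTIC to PROFILE.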
  1472. TEST(TestGoptInference, AlgoProfile) {
  1473. HostTensorGenerator<> gen;
  1474. auto graph = ComputingGraph::make();
  1475. auto host_x = gen({4, 3, 8, 9}), host_y = gen({2, 3, 3, 3});
  1476. auto x = opr::Host2DeviceCopy::make(*graph, host_x),
  1477. y = opr::Host2DeviceCopy::make(*graph, host_y),
  1478. z = opr::Convolution::make(x, y);
  1479. auto&& conv = z.node()->owner_opr()->cast_final_safe<opr::Convolution>();
  1480. using S = opr::Convolution::ExecutionPolicy::Strategy;
  1481. ASSERT_EQ(S::HEURISTIC, conv.execution_policy_transient().strategy);
  1482. gopt::enable_opr_algo_profiling_inplace({z + 2.3f});
  1483. ASSERT_EQ(S::PROFILE, conv.execution_policy().strategy);
  1484. }
  1485. #endif
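// The profiling cache keeps HEURISTIC as a fallback, so the resulting
// strategy is PROFILE | HEURISTIC rather than plain PROFILE.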
  1486. TEST(TestGoptInference, ProfileCache) {
  1487. HostTensorGenerator<> gen;
  1488. auto graph = ComputingGraph::make();
  1489. auto host_x = gen({4, 3, 8, 9}), host_y = gen({2, 3, 3, 3});
  1490. auto x = opr::Host2DeviceCopy::make(*graph, host_x),
  1491. y = opr::Host2DeviceCopy::make(*graph, host_y),
  1492. z = opr::Convolution::make(x, y);
  1493. auto&& conv = z.node()->owner_opr()->cast_final_safe<opr::Convolution>();
  1494. using S = opr::Convolution::ExecutionPolicy::Strategy;
  1495. ASSERT_EQ(S::HEURISTIC, conv.execution_policy_transient().strategy);
  1496. gopt::enable_opr_use_profiling_cache_inplace({z + 2.3f});
  1497. ASSERT_EQ(S::PROFILE | S::HEURISTIC, conv.execution_policy().strategy);
  1498. }
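// modify_opr_algo_strategy_inplace accepts arbitrary strategy combinations;
// note that OPTMIZED is the enum's actual spelling in the ExecutionPolicy API.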
  1499. TEST(TestGoptInference, FastProfileCache) {
  1500. HostTensorGenerator<> gen;
  1501. auto graph = ComputingGraph::make();
  1502. auto host_x = gen({4, 3, 8, 9}), host_y = gen({2, 3, 3, 3});
  1503. auto x = opr::Host2DeviceCopy::make(*graph, host_x),
  1504. y = opr::Host2DeviceCopy::make(*graph, host_y),
  1505. z = opr::Convolution::make(x, y);
  1506. auto&& conv = z.node()->owner_opr()->cast_final_safe<opr::Convolution>();
  1507. using S = opr::Convolution::ExecutionPolicy::Strategy;
  1508. ASSERT_EQ(S::HEURISTIC, conv.execution_policy_transient().strategy);
  1509. gopt::modify_opr_algo_strategy_inplace({z + 2.3f},
  1510. S::PROFILE | S::OPTMIZED);
  1511. ASSERT_EQ(S::PROFILE | S::OPTMIZED, conv.execution_policy().strategy);
  1512. }
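// The workspace limit defaults to unlimited (uint64_t max) and can be
// tightened in place on an already-built graph.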
  1513. TEST(TestGoptInference, AlgoWorkspaceLimit) {
  1514. HostTensorGenerator<> gen;
  1515. auto graph = ComputingGraph::make();
  1516. auto host_x = gen({4, 3, 8, 9}), host_y = gen({2, 3, 3, 3});
  1517. auto x = opr::Host2DeviceCopy::make(*graph, host_x),
  1518. y = opr::Host2DeviceCopy::make(*graph, host_y),
  1519. z = opr::Convolution::make(x, y);
  1520. auto&& conv = z.node()->owner_opr()->cast_final_safe<opr::Convolution>();
  1521. ASSERT_EQ(std::numeric_limits<uint64_t>::max(),
  1522. conv.execution_policy_transient().workspace_limit);
  1523. gopt::set_opr_algo_workspace_limit_inplace({z + 2.3f}, 10000u);
  1524. ASSERT_EQ(10000u, conv.execution_policy().workspace_limit);
  1525. }
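// TEST_PASS-style check: a quantized conv + bias + ReLU + requantize chain
// must rewrite to the equivalent ConvBias opr in NCHW, NHWC and NCHW4.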
  1526. TEST_PASS(FuseConvBiasNonlinPass, Basic) {
  1527. auto cn = CompNode::load("xpux");
  1528. HostTensorGenerator<dtype::Int8> gen;
  1529. auto graph = ComputingGraph::make();
  1530. graph->options().graph_opt_level = 0;
  1531. auto mkvar = [&](const char* name, const TensorShape& shp,
  1532. const DType& dtype) {
  1533. return opr::TypeCvt::make(
  1534. opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
  1535. dtype);
  1536. };
  1537. auto mkcvar = [&](const char* name, const TensorShape& shp,
  1538. const DType& dtype) {
  1539. return opr::TypeCvt::make(
  1540. opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  1541. .rename(name),
  1542. dtype);
  1543. };
  1544. for (auto format : {opr::Convolution::Param::Format::NCHW,
  1545. opr::Convolution::Param::Format::NHWC,
  1546. opr::Convolution::Param::Format::NCHW4}) {
  1547. opr::Convolution::Param param;
  1548. param.format = format;
  1549. SymbolVar x, w, b;
  1550. if (format == opr::Convolution::Param::Format::NHWC) {
  1551. x = mkvar("x", {20, 20, 20, 4}, dtype::QuantizedS8(2.5f)),
  1552. w = mkcvar("w1", {24, 1, 1, 4}, dtype::QuantizedS8(2.5f)),
  1553. b = mkcvar("b", {1, 1, 1, 24}, dtype::QuantizedS32(6.25f));
  1554. } else if (format == opr::Convolution::Param::Format::NCHW) {
  1555. x = mkvar("x", {20, 4, 20, 20}, dtype::QuantizedS8(2.5f)),
  1556. w = mkcvar("w1", {24, 4, 1, 1}, dtype::QuantizedS8(2.5f)),
  1557. b = mkcvar("b", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f));
  1558. } else {
  1559. mgb_assert(format == opr::Convolution::Param::Format::NCHW4);
  1560. x = mkvar("x", {20, 1, 20, 20, 4}, dtype::QuantizedS8(2.5f)),
  1561. w = mkcvar("w1", {24, 1, 1, 1, 4}, dtype::QuantizedS8(2.5f)),
  1562. b = mkcvar("b", {1, 6, 1, 1, 4}, dtype::QuantizedS32(6.25f));
  1563. }
  1564. auto y = opr::Convolution::make(x, w, param);
  1565. y = opr::Elemwise::make({y + b}, opr::Elemwise::Param::Mode::RELU);
  1566. y = opr::TypeCvt::make(y, dtype::QuantizedS8(2.5f));
  1567. opr::ConvBias::Param conv_bias_param;
  1568. conv_bias_param.format = format;
  1569. conv_bias_param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
1570. auto concrete_y = opr::ConvBias::make(
1571. x, w, b, conv_bias_param, {},
1572. OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
1573. check(concrete_y, y);
  1574. }
  1575. }
  1576. #if MGB_CUDA
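// Tensor-core (NCHW32) conversion where strided convs shrink the spatial
// size; requires an NVIDIA GPU with compute capability >= 7.5.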
  1577. TEST(TestEnableTensorCore, SmallInputShape) {
  1578. REQUIRE_GPU(1);
  1579. auto cn = CompNode::load("gpu0");
  1580. cn.activate();
  1581. auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
  1582. auto sm_ver = prop.major * 10 + prop.minor;
  1583. if (sm_ver < 75) {
  1584. printf("This testcast ignored due to insufficient cuda cap(got: %d, "
  1585. "expected: %d)\n",
  1586. sm_ver, 75);
  1587. return;
  1588. }
  1589. HostTensorGenerator<dtype::Int8> gen;
  1590. auto graph = ComputingGraph::make();
  1591. graph->options().graph_opt_level = 0;
  1592. auto mkvar = [&](const char* name, const TensorShape& shp,
  1593. const DType& dtype) {
  1594. return opr::TypeCvt::make(
  1595. opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
  1596. dtype);
  1597. };
  1598. auto mkcvar = [&](const char* name, const TensorShape& shp,
  1599. const DType& dtype) {
  1600. return opr::TypeCvt::make(
  1601. opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  1602. .rename(name),
  1603. dtype);
  1604. };
  1605. auto x = mkvar("x", {32, 16, 4, 8, 4}, dtype::QuantizedS8(2.5f)),
  1606. w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
  1607. b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
  1608. z = mkcvar("b1", {32, 16, 2, 4, 4}, dtype::QuantizedS8(2.5f));
  1609. opr::ConvBias::Param param;
  1610. param.format = opr::ConvBias::Param::Format::NCHW4;
  1611. param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
  1612. param.stride_h = param.stride_w = 2;
  1613. param.pad_h = param.pad_w = 1;
  1614. auto y = opr::ConvBias::make(x, w, b, z, param, {},
  1615. OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  1616. y = opr::ConvBias::make(y, w, b, param, {},
  1617. OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  1618. y = opr::TypeCvt::make(y, dtype::Float32());
  1619. SymbolVar y_opt;
  1620. SymbolVar y_no_tc;
  1621. {
  1622. auto options = gopt::OptimizeForInferenceOptions{};
  1623. options.enable_nchw32().enable_fuse_conv_bias_nonlinearity();
  1624. unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
  1625. }
  1626. {
  1627. auto options = gopt::OptimizeForInferenceOptions{};
  1628. options.enable_fuse_conv_bias_nonlinearity();
  1629. unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc);
  1630. }
  1631. auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
  1632. ASSERT_EQ(2u, nr_dimshuffle);
  1633. HostTensorND host_y, host_y_opt;
  1634. auto func = graph->compile({make_callback_copy(y_no_tc, host_y),
  1635. make_callback_copy(y_opt, host_y_opt)});
  1636. func->execute();
  1637. MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
  1638. }
1639. //! closed for the cu111 CI; reopen once the bug is fixed
  1640. #if CUDA_VERSION < 11000
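// Mixed NCHW / NCHW4 graphs under the tensor-core pass; the expected
// Dimshuffle count depends on the CUDA version (see the ifdefs below).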
  1641. TEST(TestEnableTensorCore, Nchw4Nchw) {
  1642. REQUIRE_GPU(1);
  1643. auto cn = CompNode::load("gpu0");
  1644. cn.activate();
  1645. auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
  1646. auto sm_ver = prop.major * 10 + prop.minor;
  1647. if (sm_ver < 75) {
  1648. printf("This testcast ignored due to insufficient cuda cap(got: %d, "
  1649. "expected: %d)\n",
  1650. sm_ver, 75);
  1651. return;
  1652. }
  1653. HostTensorGenerator<dtype::Int8> gen;
  1654. auto graph = ComputingGraph::make();
  1655. graph->options().graph_opt_level = 0;
  1656. auto mkvar = [&](const char* name, const TensorShape& shp,
  1657. const DType& dtype) {
  1658. return opr::TypeCvt::make(
  1659. opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
  1660. dtype);
  1661. };
  1662. auto mkcvar = [&](const char* name, const TensorShape& shp,
  1663. const DType& dtype) {
  1664. return opr::TypeCvt::make(
  1665. opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  1666. .rename(name),
  1667. dtype);
  1668. };
  1669. auto mkshape = [](opr::ConvBias::Param::Format format, size_t N, size_t C,
  1670. size_t H, size_t W) -> TensorShape {
  1671. mgb_assert(C % 4 == 0);
  1672. if (format == opr::ConvBias::Param::Format::NCHW4) {
  1673. return {N, C / 4, H, W, 4};
  1674. } else {
  1675. mgb_assert(format == opr::ConvBias::Param::Format::NCHW);
  1676. return {N, C, H, W};
  1677. }
  1678. };
  1679. for (auto format : {opr::ConvBias::Param::Format::NCHW,
  1680. opr::ConvBias::Param::Format::NCHW4}) {
  1681. auto x = mkvar("x", mkshape(format, 32, 64, 16, 16),
  1682. dtype::QuantizedS8(2.5f)),
  1683. w = mkcvar("w1", mkshape(format, 64, 64, 3, 3),
  1684. dtype::QuantizedS8(2.5f)),
  1685. b = mkcvar("b", mkshape(format, 1, 64, 1, 1),
  1686. dtype::QuantizedS32(6.25f)),
  1687. z = mkcvar("b1", mkshape(format, 32, 64, 8, 8),
  1688. dtype::QuantizedS8(2.5f));
  1689. opr::ConvBias::Param param;
  1690. param.format = format;
  1691. param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
  1692. param.stride_h = param.stride_w = 2;
  1693. param.pad_h = param.pad_w = 1;
  1694. auto y = opr::ConvBias::make(
  1695. x, w, b, z, param, {},
  1696. OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  1697. y = opr::ConvBias::make(y, w, b, param, {},
  1698. OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  1699. y = opr::TypeCvt::make(y, dtype::Float32());
  1700. SymbolVar y_opt;
  1701. SymbolVar y_no_tc;
  1702. {
  1703. auto options = gopt::OptimizeForInferenceOptions{};
  1704. options.enable_nchw32().enable_fuse_conv_bias_nonlinearity();
  1705. unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
  1706. }
  1707. {
  1708. auto options = gopt::OptimizeForInferenceOptions{};
  1709. options.enable_fuse_conv_bias_nonlinearity();
  1710. unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc);
  1711. }
  1712. auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
  1713. if (format == opr::ConvBias::Param::Format::NCHW4) {
  1714. #if CUDA_VERSION >= 10020
  1715. //! try_conv_reformat_nchw322nchw4 used when cuda_version >= 10020
  1716. ASSERT_EQ(1u, nr_dimshuffle);
  1717. #else
  1718. ASSERT_EQ(2u, nr_dimshuffle);
  1719. #endif
  1720. } else {
  1721. ASSERT_EQ(2u, nr_dimshuffle);
  1722. }
  1723. std::string json_name;
  1724. if (format == opr::ConvBias::Param::Format::NCHW4) {
  1725. json_name = "TestGoptInference.Nchw4Nchw.NCHW4.json";
  1726. } else {
  1727. mgb_assert(format == opr::ConvBias::Param::Format::NCHW);
  1728. json_name = "TestGoptInference.Nchw4Nchw.NCHW.json";
  1729. }
  1730. graph->compile({{y_opt, {}}})
  1731. ->to_json()
  1732. ->writeto_fpath(output_file(json_name.c_str()));
  1733. HostTensorND host_y, host_y_opt;
  1734. auto func = graph->compile({make_callback_copy(y_no_tc, host_y),
  1735. make_callback_copy(y_opt, host_y_opt)});
  1736. func->execute();
  1737. MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
  1738. }
  1739. }
  1740. #endif
1741. //! closed for the cu111 CI; reopen once the bug is fixed
  1742. #if CUDA_VERSION < 11000
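// ConvBias with an extra z (residual) input must survive the NCHW32
// conversion and match the non-tensor-core reference exactly.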
  1743. TEST(TestEnableTensorCore, ConvBiasWithZ) {
  1744. REQUIRE_GPU(1);
  1745. auto cn = CompNode::load("gpu0");
  1746. cn.activate();
  1747. auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
  1748. auto sm_ver = prop.major * 10 + prop.minor;
  1749. if (sm_ver < 75) {
  1750. printf("This testcast ignored due to insufficient cuda cap(got: %d, "
  1751. "expected: %d)\n",
  1752. sm_ver, 75);
  1753. return;
  1754. }
  1755. HostTensorGenerator<dtype::Int8> gen;
  1756. auto graph = ComputingGraph::make();
  1757. graph->options().graph_opt_level = 0;
  1758. auto mkvar = [&](const char* name, const TensorShape& shp,
  1759. const DType& dtype) {
  1760. return opr::TypeCvt::make(
  1761. opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
  1762. dtype);
  1763. };
  1764. auto mkcvar = [&](const char* name, const TensorShape& shp,
  1765. const DType& dtype) {
  1766. return opr::TypeCvt::make(
  1767. opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  1768. .rename(name),
  1769. dtype);
  1770. };
  1771. auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
  1772. w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
  1773. b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
  1774. z = mkvar("b1", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f));
  1775. opr::ConvBias::Param param;
  1776. param.format = opr::ConvBias::Param::Format::NCHW4;
  1777. param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
  1778. param.stride_h = param.stride_w = 1;
  1779. param.pad_h = param.pad_w = 1;
  1780. auto y = opr::ConvBias::make(x, w, b, z, param, {},
  1781. OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  1782. y = opr::TypeCvt::make(y, dtype::Float32());
  1783. SymbolVar y_opt;
  1784. SymbolVar y_no_tc;
  1785. {
  1786. auto options = gopt::OptimizeForInferenceOptions{};
  1787. options.enable_fuse_conv_bias_nonlinearity().enable_nchw32();
  1788. unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
  1789. }
  1790. {
  1791. auto options = gopt::OptimizeForInferenceOptions{};
  1792. options.enable_fuse_conv_bias_nonlinearity();
  1793. unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc);
  1794. }
  1795. HostTensorND host_y, host_y_opt;
  1796. auto func = graph->compile({make_callback_copy(y_no_tc, host_y),
  1797. make_callback_copy(y_opt, host_y_opt)});
  1798. func->execute();
  1799. MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
  1800. }
  1801. #endif
1802. //! closed for the cu111 CI; reopen once the bug is fixed
  1803. #if CUDA_VERSION < 11000
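// Pooling between NCHW4 ConvBias oprs should be converted to NCHW32
// together with the convolutions.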
  1804. TEST(TestEnableTensorCore, Pooling) {
  1805. REQUIRE_GPU(1);
  1806. auto cn = CompNode::load("gpu0");
  1807. cn.activate();
  1808. auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
  1809. auto sm_ver = prop.major * 10 + prop.minor;
  1810. if (sm_ver < 75) {
  1811. printf("This testcast ignored due to insufficient cuda cap(got: %d, "
  1812. "expected: %d)\n",
  1813. sm_ver, 75);
  1814. return;
  1815. }
  1816. HostTensorGenerator<dtype::Int8> gen;
  1817. auto graph = ComputingGraph::make();
  1818. graph->options().graph_opt_level = 0;
  1819. auto mkvar = [&](const char* name, const TensorShape& shp,
  1820. const DType& dtype) {
  1821. return opr::TypeCvt::make(
  1822. opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
  1823. dtype);
  1824. };
  1825. auto mkcvar = [&](const char* name, const TensorShape& shp,
  1826. const DType& dtype) {
  1827. return opr::TypeCvt::make(
  1828. opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  1829. .rename(name),
  1830. dtype);
  1831. };
  1832. auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
  1833. w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
  1834. b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
  1835. z = mkvar("b1", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f));
  1836. opr::ConvBias::Param param;
  1837. param.format = opr::ConvBias::Param::Format::NCHW4;
  1838. param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
  1839. param.stride_h = param.stride_w = 1;
  1840. param.pad_h = param.pad_w = 1;
  1841. auto y = opr::ConvBias::make(x, w, b, z, param, {},
  1842. OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  1843. opr::Pooling::Param pool_param;
  1844. pool_param.format = opr::Pooling::Param::Format::NCHW4;
  1845. y = opr::Pooling::make(y, pool_param);
  1846. y = opr::TypeCvt::make(y, dtype::Float32());
  1847. SymbolVar y_opt;
  1848. SymbolVar y_no_tc;
  1849. {
  1850. auto options = gopt::OptimizeForInferenceOptions{};
  1851. options.enable_fuse_conv_bias_nonlinearity().enable_nchw32();
  1852. unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
  1853. }
  1854. ASSERT_EQ(opr::Pooling::Param::Format::NCHW32,
  1855. find_opr<opr::Pooling>(y_opt).param().format);
  1856. {
  1857. auto options = gopt::OptimizeForInferenceOptions{};
  1858. options.enable_fuse_conv_bias_nonlinearity();
  1859. unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc);
  1860. }
  1861. HostTensorND host_y, host_y_opt;
  1862. auto func = graph->compile({make_callback_copy(y_no_tc, host_y),
  1863. make_callback_copy(y_opt, host_y_opt)});
  1864. func->execute();
  1865. MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
  1866. }
  1867. #endif
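// A diamond-shaped quantized graph (y feeds three branches) exercises
// relayout placement; exactly three Dimshuffles should remain.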
  1868. TEST(TestGoptInference, EnableTensorCore) {
  1869. REQUIRE_GPU(1);
  1870. auto cn = CompNode::load("gpu0");
  1871. cn.activate();
  1872. auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
  1873. auto sm_ver = prop.major * 10 + prop.minor;
  1874. if (sm_ver < 75) {
  1875. printf("This testcast ignored due to insufficient cuda cap(got: %d, "
  1876. "expected: %d)\n",
  1877. sm_ver, 75);
  1878. return;
  1879. }
  1880. HostTensorGenerator<dtype::Int8> gen;
  1881. auto graph = ComputingGraph::make();
  1882. graph->options().graph_opt_level = 0;
  1883. auto mkvar = [&](const char* name, const TensorShape& shp,
  1884. const DType& dtype) {
  1885. return opr::TypeCvt::make(
  1886. opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
  1887. dtype);
  1888. };
  1889. auto mkcvar = [&](const char* name, const TensorShape& shp,
  1890. const DType& dtype) {
  1891. return opr::TypeCvt::make(
  1892. opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  1893. .rename(name),
  1894. dtype);
  1895. };
  1896. auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
  1897. w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
  1898. b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
  1899. b1 = mkvar("b1", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f));
  1900. opr::Convolution::Param param;
  1901. param.format = opr::Convolution::Param::Format::NCHW4;
  1902. param.stride_h = param.stride_w = 1;
  1903. param.pad_h = param.pad_w = 1;
  1904. auto y = opr::Convolution::make(x, w, param);
  1905. y = opr::Elemwise::make({y + b}, opr::Elemwise::Param::Mode::RELU);
  1906. y = opr::TypeCvt::make(y, dtype::QuantizedS8(2.5f));
  1907. auto y1 = y + b1, y2 = opr::Convolution::make(y, w, param),
  1908. y3 = opr::Elemwise::make({y - b1}, opr::Elemwise::Param::Mode::RELU);
  1909. y2 = opr::Elemwise::make({y2 + b}, opr::Elemwise::Param::Mode::RELU),
  1910. y2 = opr::TypeCvt::make(y2, dtype::QuantizedS8(2.5f));
  1911. auto y4 = y1 + y2 + y3;
  1912. y4 = opr::TypeCvt::make(y4, dtype::Float32());
  1913. SymbolVar y_opt;
  1914. SymbolVar y_no_tc;
  1915. {
  1916. auto options = gopt::OptimizeForInferenceOptions{};
  1917. options.enable_fuse_conv_bias_nonlinearity().enable_nchw32();
  1918. unpack_vector(gopt::optimize_for_inference({y4}, options), y_opt);
  1919. }
  1920. {
  1921. auto options = gopt::OptimizeForInferenceOptions{};
1922. options.enable_fuse_conv_bias_nonlinearity();
  1923. unpack_vector(gopt::optimize_for_inference({y4}, options), y_no_tc);
  1924. }
  1925. auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
  1926. ASSERT_EQ(3u, nr_dimshuffle);
  1927. graph->compile({{y_opt, {}}})
  1928. ->to_json()
  1929. ->writeto_fpath(
  1930. output_file("TestGoptInference.EnableTensorCorePass.json"));
  1931. HostTensorND host_y, host_y_opt;
  1932. auto func = graph->compile({make_callback_copy(y_no_tc, host_y),
  1933. make_callback_copy(y_opt, host_y_opt)});
  1934. func->execute();
  1935. MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
  1936. }
1937. //! closed for the cu111 CI; reopen once the bug is fixed
  1938. #if CUDA_VERSION < 11000
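// Conv-bias chains joined by quantized fused-add elemwise oprs should
// collapse into ConvBias-with-z, matching a manually fused reference for
// both QFUSE_ADD_RELU and QFUSE_ADD_H_SWISH.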
  1939. TEST(FuseConvBiasZPass, BlockFuse) {
  1940. REQUIRE_GPU(1);
  1941. auto cn = CompNode::load("gpu0");
  1942. cn.activate();
  1943. auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
  1944. auto sm_ver = prop.major * 10 + prop.minor;
  1945. if (sm_ver < 61) {
  1946. printf("This testcast ignored due to insufficient cuda cap(got: %d, "
  1947. "expected: %d)\n",
  1948. sm_ver, 61);
  1949. return;
  1950. }
  1951. HostTensorGenerator<dtype::Int8> gen;
  1952. auto graph = ComputingGraph::make();
  1953. graph->options().graph_opt_level = 0;
  1954. auto mkvar = [&](const char* name, const TensorShape& shp,
  1955. const DType& dtype) {
  1956. return opr::TypeCvt::make(
  1957. opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
  1958. dtype);
  1959. };
  1960. auto mkcvar = [&](const char* name, const TensorShape& shp,
  1961. const DType& dtype) {
  1962. return opr::TypeCvt::make(
  1963. opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  1964. .rename(name),
  1965. dtype);
  1966. };
  1967. using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode;
  1968. using NonlineMode = opr::ConvBias::Param::NonlineMode;
  1969. for (auto mode :
  1970. {ElemMultiMode::QFUSE_ADD_RELU, ElemMultiMode::QFUSE_ADD_H_SWISH}) {
  1971. auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
  1972. w1 = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
  1973. b1 = mkcvar("b1", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
  1974. w2 = mkcvar("w2", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
  1975. b2 = mkcvar("b2", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
  1976. w3 = mkcvar("w3", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
  1977. b3 = mkcvar("b3", {1, 16, 1, 1, 4}, dtype::QuantizedS32(3.0f));
  1978. NonlineMode nonline_mode = NonlineMode::RELU;
  1979. if (mode == ElemMultiMode::QFUSE_ADD_H_SWISH) {
  1980. nonline_mode = NonlineMode::H_SWISH;
  1981. }
  1982. opr::ConvBias::Param param;
  1983. param.format = opr::Convolution::Param::Format::NCHW4;
  1984. param.nonlineMode = nonline_mode;
  1985. param.stride_h = param.stride_w = 1;
  1986. param.pad_h = param.pad_w = 1;
  1987. auto y1 = opr::ConvBias::make(
  1988. x, w1, b1, param, {},
  1989. OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  1990. param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
  1991. auto y2 = opr::ConvBias::make(
  1992. y1, w2, b2, param, {},
  1993. OperatorNodeConfig{dtype::QuantizedS8(2.5f)}),
  1994. y3 = opr::ElemwiseMultiType::make(
  1995. {y1, y2}, {mode},
  1996. OperatorNodeConfig{dtype::QuantizedS8(1.2f)});
  1997. param.nonlineMode = nonline_mode;
  1998. auto y4 = opr::ConvBias::make(
  1999. y3, w3, b3, param, {},
  2000. OperatorNodeConfig{dtype::QuantizedS8(2.5f)}),
  2001. z = opr::ElemwiseMultiType::make(
  2002. {y3, y4}, {opr::ElemwiseMultiType::Param::Mode::QADD},
  2003. OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  2004. z = opr::TypeCvt::make(z, dtype::Float32());
2005. //! fuse z manually
  2006. auto z0 = opr::ConvBias::make(
  2007. x, w1, b1, param, {},
  2008. OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  2009. auto z1 = opr::ConvBias::make(
  2010. z0, w2, b2, z0, param, {},
  2011. OperatorNodeConfig{dtype::QuantizedS8(1.2f)}),
  2012. z2 = opr::ConvBias::make(
  2013. z1, w3, b3, param, {},
  2014. OperatorNodeConfig{dtype::QuantizedS8(2.5f)}),
  2015. z4 = opr::ElemwiseMultiType::make(
  2016. {z1, z2}, {opr::ElemwiseMultiType::Mode::QADD},
  2017. OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  2018. z4 = opr::TypeCvt::make(z4, dtype::Float32());
  2019. SymbolVar z_fuse;
  2020. SymbolVar z_nonfuse;
  2021. {
  2022. auto options = gopt::OptimizeForInferenceOptions{};
  2023. options.enable_fuse_conv_bias_nonlinearity()
  2024. .enable_fuse_conv_bias_with_z();
  2025. unpack_vector(gopt::optimize_for_inference({z}, options), z_fuse);
  2026. }
  2027. {
  2028. auto options = gopt::OptimizeForInferenceOptions{};
  2029. options.enable_fuse_conv_bias_nonlinearity();
  2030. unpack_vector(gopt::optimize_for_inference({z4}, options),
  2031. z_nonfuse);
  2032. }
  2033. auto nr_elem_multi_type =
  2034. find_opr_num<mgb::opr::ElemwiseMultiType>(z_fuse);
  2035. MGB_MARK_USED_VAR(nr_elem_multi_type);
  2036. ASSERT_EQ(1u, nr_elem_multi_type);
  2037. graph->compile({{z_fuse, {}}})
  2038. ->to_json()
  2039. ->writeto_fpath(
  2040. output_file("FuseConvBiasZPass.BlockFuse_fuse.json"));
  2041. graph->compile({{z_nonfuse, {}}})
  2042. ->to_json()
  2043. ->writeto_fpath(output_file(
  2044. "FuseConvBiasZPass.BlockFuse_nonfuse.json"));
  2045. HostTensorND host_z_fuse, host_z_nonfuse;
  2046. auto func =
  2047. graph->compile({make_callback_copy(z_nonfuse, host_z_nonfuse),
  2048. make_callback_copy(z_fuse, host_z_fuse)});
  2049. func->execute();
  2050. MGB_ASSERT_TENSOR_EQ(host_z_fuse, host_z_nonfuse);
  2051. }
  2052. }
  2053. #endif
2054. //! closed for the cu111 CI; reopen once the bug is fixed
  2055. #if CUDA_VERSION < 11000
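// Reshape/Dimshuffle chains implementing NCHW <-> NCHW4, i.e.
// (n, c, h, w) -> (n, c/4, 4, h, w) -> shuffle to (n, c/4, h, w, 4),
// should be merged by the pass instead of accumulating extra relayouts.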
  2056. TEST(TestEnableTensorCore, ShuffleMerge) {
  2057. REQUIRE_GPU(1);
  2058. auto cn = CompNode::load("gpu0");
  2059. cn.activate();
  2060. auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
  2061. auto sm_ver = prop.major * 10 + prop.minor;
  2062. if (sm_ver < 75) {
  2063. printf("This testcast ignored due to insufficient cuda cap(got: %d, "
  2064. "expected: %d)\n",
  2065. sm_ver, 75);
  2066. return;
  2067. }
  2068. HostTensorGenerator<dtype::Int8> gen;
  2069. auto graph = ComputingGraph::make();
  2070. graph->options().graph_opt_level = 0;
  2071. auto mkvar = [&](const char* name, const TensorShape& shp,
  2072. const DType& dtype) {
  2073. return opr::TypeCvt::make(
  2074. opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
  2075. dtype);
  2076. };
  2077. auto mkcvar = [&](const char* name, const TensorShape& shp,
  2078. const DType& dtype) {
  2079. return opr::TypeCvt::make(
  2080. opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  2081. .rename(name),
  2082. dtype);
  2083. };
  2084. auto nchw2nchw4 = [](SymbolVar x) {
  2085. auto xshp = opr::GetVarShape::make(x);
  2086. auto cv = [&x](int v) { return x.make_scalar(v); };
  2087. auto sub = [&xshp, &cv](int idx) {
  2088. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  2089. };
  2090. auto tshp = opr::Concat::make(
  2091. {sub(0), sub(1) / 4, cv(4), sub(2), sub(3)}, 0);
  2092. auto y0 = opr::Reshape::make(x, tshp);
  2093. auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 4, 2});
  2094. return y1;
  2095. };
  2096. auto nchw42nchw = [](SymbolVar x) {
  2097. auto xshp = opr::GetVarShape::make(x);
  2098. auto cv = [&x](int v) { return x.make_scalar(v); };
  2099. auto sub = [&xshp, &cv](int idx) {
  2100. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  2101. };
  2102. auto tshp = opr::Concat::make({sub(0), sub(1) * 4, sub(2), sub(3)}, 0);
  2103. auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
  2104. auto y1 = opr::Reshape::make(y0, tshp);
  2105. return y1;
  2106. };
  2107. auto x = mkvar("x", {32, 64, 16, 16}, dtype::QuantizedS8(2.5f)),
  2108. w = mkcvar("w1", {64, 64, 3, 3}, dtype::QuantizedS8(2.5f)),
  2109. b = mkcvar("b", {1, 64, 1, 1}, dtype::QuantizedS32(6.25f)),
  2110. z = mkvar("b1", {32, 64, 16, 16}, dtype::QuantizedS8(2.5f));
  2111. x = nchw2nchw4(x), w = nchw2nchw4(w), b = nchw2nchw4(b), z = nchw2nchw4(z);
  2112. opr::ConvBias::Param param;
  2113. param.format = opr::ConvBias::Param::Format::NCHW4;
  2114. param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
  2115. param.stride_h = param.stride_w = 1;
  2116. param.pad_h = param.pad_w = 1;
  2117. auto y = opr::ConvBias::make(x, w, b, z, param, {},
  2118. OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  2119. y = nchw42nchw(y);
  2120. y = opr::TypeCvt::make(y, dtype::Float32());
  2121. SymbolVar y_opt;
  2122. SymbolVar y_no_tc;
  2123. {
  2124. auto options = gopt::OptimizeForInferenceOptions{};
  2125. options.enable_fuse_conv_bias_nonlinearity().enable_nchw32();
  2126. unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
  2127. }
  2128. {
  2129. auto options = gopt::OptimizeForInferenceOptions{};
  2130. options.enable_fuse_conv_bias_nonlinearity();
  2131. unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc);
  2132. }
  2133. auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
  2134. ASSERT_EQ(3u, nr_dimshuffle);
  2135. HostTensorND host_y, host_y_opt;
  2136. auto func = graph->compile({make_callback_copy(y_no_tc, host_y),
  2137. make_callback_copy(y_opt, host_y_opt)});
  2138. func->execute();
  2139. MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
  2140. }
  2141. #endif
  2142. #endif
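// z-fusion is expected for QADD and QFUSE_ADD_RELU but not for QMUL, and a
// second add on top of an already fused conv-bias-z must not fuse again.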
  2143. TEST(FuseConvBiasZPass, Basic) {
  2144. REQUIRE_GPU(1);
  2145. auto cn = CompNode::load("gpu0");
  2146. HostTensorGenerator<dtype::Int8> gen;
  2147. auto graph = ComputingGraph::make();
  2148. graph->options().graph_opt_level = 0;
  2149. auto mkvar = [&](const char* name, const TensorShape& shp,
  2150. const DType& dtype) {
  2151. return opr::TypeCvt::make(
  2152. opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
  2153. dtype);
  2154. };
  2155. auto mkcvar = [&](const char* name, const TensorShape& shp,
  2156. const DType& dtype) {
  2157. return opr::TypeCvt::make(
  2158. opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  2159. .rename(name),
  2160. dtype);
  2161. };
  2162. auto format = opr::Convolution::Param::Format::NCHW4;
  2163. auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
  2164. w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
  2165. b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
  2166. b1 = mkvar("b1", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
  2167. b2 = mkvar("b2", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f));
  2168. opr::ConvBias::Param conv_bias_param;
  2169. conv_bias_param.format = format;
  2170. conv_bias_param.stride_h = conv_bias_param.stride_w = 1;
  2171. conv_bias_param.pad_h = conv_bias_param.pad_w = 1;
  2172. auto y = opr::ConvBias::make(x, w, b, conv_bias_param, {},
  2173. OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  2174. SymbolVar y_opt;
  2175. // check fuse mode
  2176. for (auto mode : {opr::ElemwiseMultiType::Param::Mode::QADD,
  2177. opr::ElemwiseMultiType::Param::Mode::QMUL,
  2178. opr::ElemwiseMultiType::Param::Mode::QFUSE_ADD_RELU}) {
  2179. auto y1 = opr::ElemwiseMultiType::make(
  2180. {y, b1}, {mode}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  2181. {
  2182. auto options = gopt::OptimizeForInferenceOptions{};
  2183. options.enable_fuse_conv_bias_nonlinearity()
  2184. .enable_fuse_conv_bias_with_z()
  2185. .enable_nchw32();
  2186. unpack_vector(gopt::optimize_for_inference({y1}, options), y_opt);
  2187. }
  2188. auto nr_elemwisemultitype = find_opr_num<opr::ElemwiseMultiType>(y_opt);
  2189. if (mode == opr::ElemwiseMultiType::Param::Mode::QMUL) {
  2190. ASSERT_NE(0u, nr_elemwisemultitype);
  2191. } else
  2192. ASSERT_EQ(0u, nr_elemwisemultitype);
  2193. // fuse convbiasz and z
  2194. if (mode == opr::ElemwiseMultiType::Param::Mode::QADD) {
  2195. auto y2 = opr::ElemwiseMultiType::make(
  2196. {y1, b2}, {mode},
  2197. OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
  2198. {
  2199. auto options = gopt::OptimizeForInferenceOptions{};
  2200. options.enable_fuse_conv_bias_nonlinearity()
  2201. .enable_fuse_conv_bias_with_z()
  2202. .enable_nchw32();
  2203. unpack_vector(gopt::optimize_for_inference({y2}, options),
  2204. y_opt);
  2205. }
  2206. auto nr_elemwisemultitype =
  2207. find_opr_num<opr::ElemwiseMultiType>(y_opt);
  2208. ASSERT_NE(0u, nr_elemwisemultitype);
  2209. }
  2210. }
  2211. }
  2212. #if MGB_CUDA
2213. //! closed for the cu111 CI; reopen once the bug is fixed
  2214. #if CUDA_VERSION < 11000
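// CHWN4 conversion for quantized graphs, validated against a cuDNN-style
// reference built with only the conv-bias fusion passes.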
  2215. TEST(TestGoptInference, EnableCHWN4) {
  2216. REQUIRE_GPU(1);
  2217. auto cn = CompNode::load("gpu0");
  2218. cn.activate();
  2219. auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
  2220. auto sm_ver = prop.major * 10 + prop.minor;
  2221. if (sm_ver < 61) {
  2222. printf("This testcast ignored due to insufficient cuda cap(got: %d, "
  2223. "expected: %d)\n",
  2224. sm_ver, 61);
  2225. return;
  2226. }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto mkshape = [](opr::ConvBias::Param::Format format, size_t N, size_t C,
                      size_t H, size_t W) -> TensorShape {
        mgb_assert(C % 4 == 0);
        if (format == opr::ConvBias::Param::Format::NCHW4) {
            return {N, C / 4, H, W, 4};
        } else {
            mgb_assert(format == opr::ConvBias::Param::Format::NCHW);
            return {N, C, H, W};
        }
    };
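    // mkshape above reflects how NCHW4 packs every 4 channels into a
    // trailing axis: an NCHW tensor of shape (32, 64, 16, 16) becomes
    // (32, 16, 16, 16, 4), so the same graph can be built in either format.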
    for (auto format : {opr::ConvBias::Param::Format::NCHW,
                        opr::ConvBias::Param::Format::NCHW4}) {
        auto x = mkvar("x", mkshape(format, 32, 64, 16, 16),
                       dtype::QuantizedS8(2.5f)),
             w = mkcvar("w1", mkshape(format, 64, 64, 3, 3),
                        dtype::QuantizedS8(2.5f)),
             b = mkcvar("b", mkshape(format, 1, 64, 1, 1),
                        dtype::QuantizedS32(6.25f)),
             b1 = mkvar("b1", mkshape(format, 32, 64, 16, 16),
                        dtype::QuantizedS8(2.5f));
        opr::ConvBias::Param param;
        param.format = format;
        param.stride_h = param.stride_w = 1;
        param.pad_h = param.pad_w = 1;
        param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
        auto y = opr::ConvBiasForward::make(
                x, w, b, param, {},
                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
        auto y1 = opr::ElemwiseMultiType::make(
                {y, b1}, opr::ElemwiseMultiType::Mode::QFUSE_ADD_RELU,
                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
        auto y2 = opr::ConvBiasForward::make(
                y, w, b, param, {},
                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
        auto y3 = opr::ElemwiseMultiType::make(
                {y, b1}, opr::ElemwiseMultiType::Param::Mode::QSUB,
                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
        auto y4 = opr::ElemwiseMultiType::make(
                {y1, y2}, opr::ElemwiseMultiType::Param::Mode::QADD,
                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
        y4 = opr::ElemwiseMultiType::make(
                {y3, y4}, opr::ElemwiseMultiType::Param::Mode::QADD,
                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
        y4 = opr::TypeCvt::make(y4, dtype::Float32());
        SymbolVar y_opt;
        SymbolVar y_cudnn;
        {
            auto options = gopt::OptimizeForInferenceOptions{};
            options.enable_chwn4();
            unpack_vector(gopt::optimize_for_inference({y4}, options), y_opt);
        }
        unpack_vector(gopt::GraphOptimizer{}
                              .add_pass<gopt::FuseConvBiasNonlinPass>()
                              .add_pass<gopt::FuseConvBiasZPass>()
                              .apply({{y4}})
                              .endpoint_vars(),
                      y_cudnn);
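        // y_cudnn is the reference: it receives the same fusion passes but
        // no layout conversion, so the comparison below isolates the
        // numerical effect of the CHWN4 transform.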
        ASSERT_EQ(opr::ConvBias::Param::Format::CHWN4,
                  find_opr<opr::ConvBias>(y_opt).param().format);
        HostTensorND host_y, host_y_opt;
        auto func = graph->compile({make_callback_copy(y_cudnn, host_y),
                                    make_callback_copy(y_opt, host_y_opt)});
        func->execute();
        MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
    }
}
#endif
//! disabled for the CUDA 11.1 CI; re-enable when the bug is fixed
#if CUDA_VERSION < 11000
TEST(TestGoptInference, EnableCHWN4WarpPerspective) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient CUDA compute "
               "capability (got: %d, expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    std::shared_ptr<HostTensorND> mat = std::make_shared<HostTensorND>(
            cn, TensorShape{32, 3, 3}, dtype::Float32());
    warp_perspective_mat_gen(*mat, 32, 16, 16);
    auto mat_var = opr::Host2DeviceCopy::make(*graph, mat).rename("mat");
    auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    auto y = opr::ConvBiasForward::make(
            x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    opr::WarpPerspective::Param warp_param;
    warp_param.format = opr::WarpPerspective::Param::Format::NCHW4;
    auto y1 = opr::WarpPerspective::make(y, mat_var, TensorShape{16, 16},
                                         warp_param);
    y1 = opr::TypeCvt::make(y1, dtype::Float32());
    auto nchw42nchw = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp = opr::Concat::make({sub(0), sub(1) * 4, sub(2), sub(3)}, 0);
        auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
        auto y1 = opr::Reshape::make(y0, tshp);
        return y1;
    };
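    // nchw42nchw above undoes NCHW4 packing: dimshuffle {0, 1, 4, 2, 3}
    // turns (N, C/4, H, W, 4) into (N, C/4, 4, H, W) and the reshape merges
    // the two channel axes, e.g. (32, 16, 16, 16, 4) -> (32, 64, 16, 16).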
    y1 = nchw42nchw(y1);
    warp_param.format = opr::WarpPerspective::Param::Format::NCHW;
    auto y2 = opr::WarpPerspective::make(y1, mat_var, TensorShape{16, 16},
                                         warp_param);
    SymbolVar y_opt;
    SymbolVar y_cudnn;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_chwn4();
        unpack_vector(gopt::optimize_for_inference({y2}, options), y_opt);
    }
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::FuseConvBiasNonlinPass>()
                          .add_pass<gopt::FuseConvBiasZPass>()
                          .apply({{y2}})
                          .endpoint_vars(),
                  y_cudnn);
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y_cudnn, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
#endif
TEST(TestGoptInference, EnableCHWN4Pooling) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient CUDA compute "
               "capability (got: %d, expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    auto y = opr::ConvBiasForward::make(
            x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    opr::Pooling::Param pool_param;
    pool_param.format = opr::Pooling::Param::Format::NCHW4;
    y = opr::Pooling::make(y, pool_param);
    y = opr::TypeCvt::make(y, dtype::Float32());
    auto nchw42nchw = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp = opr::Concat::make({sub(0), sub(1) * 4, sub(2), sub(3)}, 0);
        auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
        auto y1 = opr::Reshape::make(y0, tshp);
        return y1;
    };
    y = nchw42nchw(y);
    pool_param.format = opr::Pooling::Param::Format::NCHW;
    auto y1 = opr::Pooling::make(y, pool_param);
    SymbolVar y_opt;
    SymbolVar y_cudnn;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::FuseConvBiasNonlinPass>()
                    .add_pass(gopt::EnableCHWN4Pass::make_chwn4_converter())
                    .add_pass<gopt::FuseConvBiasZPass>()
                    .apply({{y1}})
                    .endpoint_vars(),
            y_opt);
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::FuseConvBiasNonlinPass>()
                          .add_pass<gopt::FuseConvBiasZPass>()
                          .apply({{y1}})
                          .endpoint_vars(),
                  y_cudnn);
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y_cudnn, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
//! disabled for the CUDA 11.1 CI; re-enable when the bug is fixed
#if CUDA_VERSION < 11000
TEST(TestGoptInference, EnableCHWN4ShuffleRemove) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient CUDA compute "
               "capability (got: %d, expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto nchw2nchw4 = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp = opr::Concat::make(
                {sub(0), sub(1) / 4, cv(4), sub(2), sub(3)}, 0);
        auto y0 = opr::Reshape::make(x, tshp);
        auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 4, 2});
        return y1;
    };
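    // nchw2nchw4 above is the manual NCHW -> NCHW4 packing: the reshape
    // splits C into (C/4, 4) and dimshuffle {0, 1, 3, 4, 2} moves the
    // 4-element group last, e.g. (32, 64, 16, 16) -> (32, 16, 16, 16, 4).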
    auto nchw42nchw = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp = opr::Concat::make({sub(0), sub(1) * 4, sub(2), sub(3)}, 0);
        auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
        auto y1 = opr::Reshape::make(y0, tshp);
        return y1;
    };
    auto x = mkvar("x", {32, 64, 16, 16}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
         b1 = mkcvar("b1", {32, 16, 16, 16, 4}, dtype::QuantizedS8{2.5f});
    x = nchw2nchw4(x);
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    auto y = opr::ConvBiasForward::make(
            x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto y1 = opr::ElemwiseMultiType::make(
            {y, b1}, opr::ElemwiseMultiType::Mode::QFUSE_ADD_RELU,
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto y2 = opr::ConvBiasForward::make(
            y, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto y3 = opr::ElemwiseMultiType::make(
            {y, b1}, opr::ElemwiseMultiType::Param::Mode::QSUB,
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto y4 = opr::ElemwiseMultiType::make(
            {y1, y2}, opr::ElemwiseMultiType::Param::Mode::QADD,
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    y4 = opr::ElemwiseMultiType::make(
            {y3, y4}, opr::ElemwiseMultiType::Param::Mode::QADD,
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    y4 = opr::TypeCvt::make(y4, dtype::Float32());
    y4 = nchw42nchw(y4);
    SymbolVar y_opt;
    SymbolVar y_cudnn;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::ParamRedistributePass>()
                    .add_pass<gopt::ParamFusePass>()
                    .add_pass<gopt::FuseConvBiasNonlinPass>()
                    .add_pass<gopt::FuseConvBiasZPass>()
                    .add_pass(gopt::EnableCHWN4Pass::make_chwn4_converter())
                    .add_pass<gopt::ShuffleShuffleRemovePass>()
                    .add_pass<gopt::ParamFusePass>()
                    .apply({{y4}})
                    .endpoint_vars(),
            y_opt);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.EnableCHWN4ShuffleRemove.json"));
    auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
    ASSERT_EQ(2u, nr_dimshuffle);
    auto nr_reformat = find_opr_num<mgb::opr::RelayoutFormat>(y_opt);
    ASSERT_EQ(0u, nr_reformat);
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::FuseConvBiasNonlinPass>()
                          .add_pass<gopt::FuseConvBiasZPass>()
                          .apply({{y4}})
                          .endpoint_vars(),
                  y_cudnn);
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y_cudnn, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
#endif
TEST(TestGoptInference, ConvertFormatNCHW4GPU) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient CUDA compute "
               "capability (got: %d, expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto x = mkvar("x", {2, 4, 16, 16}, dtype::QuantizedS8(2.5f));
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.format = opr::ConvBias::Param::Format::NCHW;
    param_conv_bias.stride_h = param_conv_bias.stride_w = 1;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    // dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w1 = mkcvar("w1", {8, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv1 = opr::ConvBiasForward::make(
            x, w1, b1, param_conv_bias, {},
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    // group
    // icpg != 1 && ocpg != 1
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {2, 4, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b2 = mkcvar("b2", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv2 = opr::ConvBiasForward::make(
            conv1, w2, b2, param_conv_bias, {},
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    opr::Convolution::Param param_deconv;
    param_deconv.format = opr::Convolution::Param::Format::NCHW;
    param_deconv.stride_h = param_deconv.stride_w = 2;
    param_deconv.pad_h = param_deconv.pad_w = 2;
    // dense
    param_deconv.sparse = opr::Convolution::Param::Sparse::DENSE;
    auto w3 = mkcvar("w3", {8, 8, 4, 4}, dtype::QuantizedS8(2.5f));
    auto deconv1 = opr::ConvolutionBackwardData::make_deconv(
            conv2, w3, param_deconv, {},
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto deconv1_fp32 = opr::TypeCvt::make(deconv1, dtype::Float32());
    auto y = deconv1_fp32 + opr::TypeCvt::make(b2, dtype::Float32());
    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw4();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW4,
              find_opr<opr::ConvBias>(y_opt).param().format);
    ASSERT_EQ(opr::ConvolutionBackwardData::Param::Format::NCHW4,
              find_opr<opr::ConvolutionBackwardData>(y_opt).param().format);
    auto nr_reshape = find_opr_num<mgb::opr::Reshape>(y_opt);
    ASSERT_EQ(2u, nr_reshape);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.ConvertFormatNCHW4GPU.json"));
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
#endif
TEST(TestGoptInference, ConvertFormatNCHW4NonConvOpr) {
    auto cn = CompNode::load("xpu0");
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto mkcvarf32 = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto x = mkvar("x", {2, 4, 16, 16}, dtype::QuantizedS8(2.5f));
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.format = opr::ConvBias::Param::Format::NCHW;
    param_conv_bias.stride_h = param_conv_bias.stride_w = 1;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    // dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w1 = mkcvar("w1", {8, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv1 = opr::ConvBiasForward::make(
            x, w1, b1, param_conv_bias, {},
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    // test Resize
    auto shape_of = opr::GetVarShape::make(x);
    auto subtensor = opr::Subtensor::make(
            shape_of, {opr::Subtensor::AxisIndexer::make_interval(
                              0, x.make_scalar(2), None, x.make_scalar(1))});
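    // subtensor is shape_of(x)[2:], i.e. the spatial (H, W) part of the
    // shape vector; doubled below, it serves as the Resize target size.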
    opr::Resize::Param param_resize;
    param_resize.format = opr::Resize::Param::Format::NCHW;
    auto resize = opr::ResizeForward::make(conv1, subtensor * 2, param_resize);
    // test WarpPerspective
    auto mat = mkcvarf32("mat", {2, 3, 3}),
         warp = opr::WarpPerspectiveForward::make(
                 resize, mat, nullptr, cg::var_from_tensor_shape(x, {32, 32}));
    opr::Pooling::Param pool_param;
    pool_param.format = opr::Pooling::Param::Format::NCHW;
    // test Pooling
    auto pool = opr::Pooling::make(warp, pool_param);
    // group
    // icpg != 1 && ocpg != 1
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {2, 4, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b2 = mkcvar("b2", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv2 = opr::ConvBiasForward::make(
            pool, w2, b2, param_conv_bias, {},
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto add = opr::ElemwiseMultiType::make(
            {conv1, conv2}, {opr::ElemwiseMultiType::Param::Mode::QADD},
            OperatorNodeConfig{dtype::QuantizedS8{1.2f}});
    auto y = opr::TypeCvt::make(add, dtype::Float32());
    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw4();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
    ASSERT_EQ(2u, nr_dimshuffle);
    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW4,
              find_opr<opr::ConvBias>(y_opt).param().format);
    ASSERT_EQ(opr::ResizeForward::Param::Format::NCHW4,
              find_opr<opr::ResizeForward>(y_opt).param().format);
    ASSERT_EQ(opr::WarpPerspectiveForward::Param::Format::NCHW4,
              find_opr<opr::WarpPerspectiveForward>(y_opt).param().format);
    ASSERT_EQ(opr::PoolingForward::Param::Format::NCHW4,
              find_opr<opr::PoolingForward>(y_opt).param().format);
}
TEST(TestGoptInference, ConvertFormatNCHW4) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto x = mkvar("x", {2, 4, 16, 16});
    // ConvBias test dense
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w1 = mkcvar("w1", {8, 4, 3, 3}), b1 = mkcvar("b1", {1, 8, 1, 1});
    auto conv1 = opr::ConvBias::make(x, w1, b1, param_conv_bias);
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {2, 4, 4, 3, 3}), b2 = mkcvar("b2", {1, 8, 1, 1});
    auto conv2 = opr::ConvBias::make(conv1, w2, b2, param_conv_bias);
    // Convolution
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    param_conv.sparse = opr::Convolution::Param::Sparse::DENSE;
    auto w3 = mkcvar("w3", {8, 8, 3, 3});
    auto y = opr::Convolution::make(conv2, w3, param_conv);
    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw4();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW,
              find_opr<opr::ConvBias>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.ConvertFormatNCHW4.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
TEST(TestGoptInference, ConvertFormatNCHW4Ic3) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);
    HostTensorGenerator<dtype::Float32, RandomDistribution::UNIFORM> gen{
            1.2f, 127 * 127};
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name),
                dtype);
    };
    auto x = mkvar("x", {2, 3, 16, 16}, dtype::QuantizedS8(2.5f));
    // ConvBias test dense
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w1 = mkcvar("w1", {8, 3, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv1 =
            opr::ConvBias::make(x, w1, b1, param_conv_bias, {},
                                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {2, 4, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b2 = mkcvar("b2", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv2 =
            opr::ConvBias::make(conv1, w2, b2, param_conv_bias, {},
                                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto y = opr::TypeCvt::make(conv2, dtype::Float32());
    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw4();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW4,
              find_opr<opr::ConvBias>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.ConvertFormatNCHW4Ic3.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
TEST(TestGoptInference, ConvertFormatNCHW88) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto host_x = gen({2, 3, 16, 16}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    //! Hybrid nchw88 mode
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    auto w1 = mkcvar("w1", {8, 3, 3, 3}),
         conv1 = opr::Convolution::make(x, w1, param_conv, {},
                                        OperatorNodeConfig("conv1"));
    //! channel wise
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {8, 1, 1, 3, 3}), b2 = mkcvar("b2", {1, 8, 1, 1}),
         conv2 = opr::ConvBias::make(conv1, w2, b2, param_conv_bias);
    //! group
    auto w3 = mkcvar("w3", {1, 8, 8, 3, 3}), b3 = mkcvar("b3", {1, 8, 1, 1}),
         conv3 = opr::ConvBias::make(conv2, w3, b3, param_conv_bias);
    auto shape_of = opr::GetVarShape::make(conv3);
    auto subtensor = opr::Subtensor::make(
            shape_of, {opr::Subtensor::AxisIndexer::make_interval(
                              0, x.make_scalar(2), None, x.make_scalar(1))});
    opr::Resize::Param param_resize;
    param_resize.format = opr::Resize::Param::Format::NCHW;
    auto resize = opr::ResizeForward::make(conv3, subtensor * 2, param_resize);
    auto mat = mkcvar("mat", {2, 3, 3}),
         warp = opr::WarpPerspectiveForward::make(
                 resize, mat, nullptr, cg::var_from_tensor_shape(x, {4, 4}));
    auto b = mkvar("b", {1, 8, 1, 1}),
         elem = opr::Elemwise::make({warp + b},
                                    opr::Elemwise::Param::Mode::RELU);
    //! Dense
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    auto w4 = mkcvar("w4", {2, 6, 4, 3, 3}), b4 = mkcvar("b4", {1, 12, 1, 1}),
         conv4 = opr::ConvBias::make(elem, w4, b4, param_conv_bias);
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w5 = mkcvar("w5", {8, 12, 3, 3}), b5 = mkcvar("b5", {1, 8, 1, 1}),
         conv5 = opr::ConvBias::make(conv4, w5, b5, param_conv_bias);
    auto w6 = mkcvar("w6", {8, 8, 3, 3}), b6 = mkcvar("b6", {1, 8, 1, 1}),
         y = opr::ConvBias::make(conv5, w6, b6, param_conv_bias);
    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw88();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW88,
              find_opr<opr::Convolution>(y_opt, "conv1").param().format);
    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW88,
              find_opr<opr::ConvBias>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.ConvertFormatNCHW88.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    //! may fall back to winograd on x86-32, so relax the error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
    *host_x = *gen({2, 3, 32, 32}, cn);
    func->execute();
    //! may fall back to winograd on x86-32, so relax the error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
}
TEST(TestGoptInference, ConvertFormatNCHW44) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto mkcvar_dtype = [&](const char* name, const TensorShape& shp,
                            const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto host_x = gen({2, 3, 16, 16}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    //! Hybrid nchw44 mode
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    auto w1 = mkcvar("w1", {8, 3, 3, 3}),
         conv1 = opr::Convolution::make(x, w1, param_conv, {},
                                        OperatorNodeConfig("conv1"));
    //! hybrid nchw44 is not supported for this conv, so it stays NCHW
    opr::ConvBias::Param param_conv_bias_pad0;
    param_conv_bias_pad0.pad_h = param_conv_bias_pad0.pad_w = 0;
    auto w1_f1 = mkcvar("w1_1", {8, 3, 1, 1});
    auto conv1_f1 = opr::ConvBias::make(x, w1_f1, param_conv_bias_pad0, {},
                                        OperatorNodeConfig("conv1_f1"));
    auto conv1_add = conv1_f1 * conv1;
    auto conv_1_q8 = opr::TypeCvt::make(conv1_add, dtype::QuantizedS8(2.5f));
    //! s8 dense conv
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    auto w1_2 = mkcvar_dtype("w1_2", {8, 8, 3, 3}, dtype::QuantizedS8(2.5f));
    auto b1_2 = mkcvar_dtype("b1_2", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv_1_2 = opr::ConvBias::make(
            conv_1_q8, w1_2, b1_2, param_conv_bias, {},
            OperatorNodeConfig{"conv_1_2", cn, dtype::QuantizedS8{6.25f}});
    auto conv_1_2_fp32 = opr::TypeCvt::make(conv_1_2, dtype::Float32());
    //! channel wise
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {8, 1, 1, 3, 3}), b2 = mkcvar("b2", {1, 8, 1, 1}),
         conv2 = opr::ConvBias::make(conv_1_2_fp32, w2, b2, param_conv_bias);
    //! group
    auto w3 = mkcvar("w3", {2, 4, 4, 3, 3}), b3 = mkcvar("b3", {1, 8, 1, 1}),
         conv3 = opr::ConvBias::make(conv2, w3, b3, param_conv_bias);
    auto shape_of = opr::GetVarShape::make(conv3);
    auto subtensor = opr::Subtensor::make(
            shape_of, {opr::Subtensor::AxisIndexer::make_interval(
                              0, x.make_scalar(2), None, x.make_scalar(1))});
    opr::Resize::Param param_resize;
    param_resize.format = opr::Resize::Param::Format::NCHW;
    auto resize = opr::ResizeForward::make(conv3, subtensor * 2, param_resize);
    auto mat = mkcvar("mat", {2, 3, 3}),
         warp = opr::WarpPerspectiveForward::make(
                 resize, mat, nullptr, cg::var_from_tensor_shape(x, {4, 4}));
    auto b = mkvar("b", {1, 8, 1, 1}),
         elem = opr::Elemwise::make({warp + b},
                                    opr::Elemwise::Param::Mode::RELU);
    //! Dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    auto w3_2 = mkcvar("w3_2", {16, 8, 3, 3}),
         b3_2 = mkcvar("b3_2", {1, 16, 1, 1}),
         conv3_2 = opr::ConvBias::make(elem, w3_2, b3_2, param_conv_bias, {},
                                       OperatorNodeConfig("conv3_2"));
    //! s8 group conv
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto conv3_2_q8 = opr::TypeCvt::make(conv3_2, dtype::QuantizedS8(2.5f));
    auto w3_3 = mkcvar_dtype("w3_3", {4, 8, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b3_3 = mkcvar_dtype("b3_3", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f)),
         conv3_3_q = opr::ConvBias::make(
                 conv3_2_q8, w3_3, b3_3, param_conv_bias, {},
                 OperatorNodeConfig{"conv_3_3_q", cn,
                                    dtype::QuantizedS8{6.25f}});
    auto conv3_3 = opr::TypeCvt::make(conv3_3_q, dtype::Float32());
    //! Dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w4 = mkcvar("w4", {16, 32, 3, 3}), b4 = mkcvar("b4", {1, 16, 1, 1}),
         conv4 = opr::ConvBias::make(conv3_3, w4, b4, param_conv_bias, {},
                                     OperatorNodeConfig("conv4"));
    auto w4_1 = mkcvar("w4_1", {16, 32, 1, 1}),
         b4_1 = mkcvar("b4_1", {2, 16, 4, 4}),
         conv4_1 =
                 opr::ConvBias::make(conv3_3, w4_1, b4_1, param_conv_bias_pad0,
                                     {}, OperatorNodeConfig("conv4_1"));
    auto conv4_add = conv4 + conv4_1;
    auto w5 = mkcvar("w5", {6, 16, 3, 3}), b5 = mkcvar("b5", {1, 6, 1, 1}),
         conv5 = opr::ConvBias::make(conv4_add, w5, b5, param_conv_bias, {},
                                     OperatorNodeConfig("conv5"));
    auto w6 = mkcvar("w6", {4, 6, 3, 3}), b6 = mkcvar("b6", {1, 4, 1, 1}),
         y = opr::ConvBias::make(conv5, w6, b6, param_conv_bias, {},
                                 OperatorNodeConfig("conv6"));
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_fuse_conv_bias_nonlinearity();
    options.enable_nchw44();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::Convolution>(y_opt, "conv1").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW,
              find_opr<opr::ConvBias>(y_opt, "conv1_f1").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::ConvBias>(y_opt, "conv_1_2").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::ConvBias>(y_opt, "conv3_2").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::ConvBias>(y_opt, "conv_3_3_q").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::ConvBias>(y_opt, "conv4").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW,
              find_opr<opr::ConvBias>(y_opt, "conv5").param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.ConvertFormatNCHW44.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    //! may fall back to winograd on x86-32, so relax the error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
    *host_x = *gen({2, 3, 32, 32}, cn);
    func->execute();
    //! may fall back to winograd on x86-32, so relax the error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
}
TEST(TestGoptInference, ConvertFormatNCHW44MultiInput) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto host_x1 = gen({1, 8, 16, 16}, cn);
    auto host_x2 = gen({1, 1, 16, 16}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    auto w1 = mkcvar("w1", {8, 8, 3, 3}),
         conv1 = opr::Convolution::make(x, w1, param_conv);
    auto b = mkvar("b", {1, 1, 16, 16}),
         elem0 = opr::Elemwise::make({conv1 + b + b},
                                     opr::Elemwise::Param::Mode::RELU);
    auto w2 = mkcvar("w2", {8, 8, 3, 3}),
         conv2 = opr::Convolution::make(elem0, w2, param_conv);
    auto b1 = mkvar("b1", {1}),
         y = opr::Elemwise::make({conv2 + b1 + b},
                                 opr::Elemwise::Param::Mode::RELU);
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nchw44();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::Convolution>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.ConvertFormatNCHW44MultiInput.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    //! may fall back to winograd on x86-32, so relax the error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
}
TEST(TestGoptInference, ConvertFormatNCHW44Reshape) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto host_x1 = gen({1, 8, 16, 16}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    auto w1 = mkcvar("w1", {8, 8, 3, 3}),
         conv1 = opr::Convolution::make(x, w1, param_conv);
    auto y = opr::Reshape::make(conv1, {8, 16 * 16});
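    // The trailing Reshape consumes a plain NCHW tensor, so the NCHW44 pass
    // presumably has to insert a layout conversion in front of it rather
    // than skip the conversion entirely; the assert below checks that the
    // convolution itself is still converted.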
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nchw44();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::Convolution>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.ConvertFormatNCHW44Reshape.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    //! may fall back to winograd on x86-32, so relax the error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
}
TEST(TestGoptInference, ConvertFormatNCHW44_DOT) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto mkcvar_dtype = [&](const char* name, const TensorShape& shp,
                            const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto host_x = gen({2, 3, 16, 16}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    //! Hybrid nchw44 mode
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    auto w1 = mkcvar("w1", {8, 3, 3, 3}),
         conv1 = opr::Convolution::make(x, w1, param_conv, {},
                                        OperatorNodeConfig("conv1"));
    printf("create conv1 %s\n",
           conv1.node()->owner_opr()->dyn_typeinfo()->name);
    param_conv.pad_h = param_conv.pad_w = 1;
    //! hybrid nchw44 is not supported for this conv, so it stays NCHW
    opr::ConvBias::Param param_conv_bias_pad0;
    param_conv_bias_pad0.pad_h = param_conv_bias_pad0.pad_w = 0;
    auto b1 = mkcvar("b1", {1, 8, 1, 1});
    auto w1_f1 = mkcvar("w1_1", {8, 3, 1, 1});
    auto conv1_f1 = opr::ConvBias::make(x, w1_f1, b1, param_conv_bias_pad0, {},
                                        OperatorNodeConfig("conv1_f1"));
    //! hybrid dot
    auto x_s = opr::TypeCvt::make(x, dtype::QuantizedS8(2.5f));
    auto w1_3 = mkcvar_dtype("w1_3", {8, 3, 3, 3}, dtype::QuantizedS8(2.5f));
    auto conv1_3_q = opr::Convolution::make(
            x_s, w1_3, param_conv, {},
            OperatorNodeConfig{"conv1_3_q", cn, dtype::QuantizedS8{6.25f}});
    auto conv1_3 = opr::TypeCvt::make(conv1_3_q, dtype::Float32());
    auto conv1_add = conv1_f1 * conv1 * conv1_3;
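    // conv1_add multiplies the outputs of a hybrid float conv, a conv that
    // cannot use hybrid NCHW44 and a dequantized dot conv, so the converter
    // has to reconcile several layouts feeding a single elemwise op.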
    auto conv_1_q8 = opr::TypeCvt::make(conv1_add, dtype::QuantizedS8(2.5f));
    //! s8 dense conv
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    auto w1_2 = mkcvar_dtype("w1_2", {8, 8, 3, 3}, dtype::QuantizedS8(2.5f));
    auto conv_1_2 = opr::ConvBias::make(
            conv_1_q8, w1_2, param_conv_bias, {},
            OperatorNodeConfig{"conv_1_2", cn, dtype::QuantizedS8{6.25f}});
    auto conv_1_2_fp32 = opr::TypeCvt::make(conv_1_2, dtype::Float32());
    //! channel wise
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {8, 1, 1, 3, 3}), b2 = mkcvar("b2", {1, 8, 1, 1}),
         conv2 = opr::ConvBias::make(conv_1_2_fp32, w2, b2, param_conv_bias);
    //! group
    auto w3 = mkcvar("w3", {2, 4, 4, 3, 3}), b3 = mkcvar("b3", {1, 8, 1, 1}),
         conv3 = opr::ConvBias::make(conv2, w3, b3, param_conv_bias);
    auto shape_of = opr::GetVarShape::make(conv3);
    auto subtensor = opr::Subtensor::make(
            shape_of, {opr::Subtensor::AxisIndexer::make_interval(
                              0, x.make_scalar(2), None, x.make_scalar(1))});
    opr::Resize::Param param_resize;
    param_resize.format = opr::Resize::Param::Format::NCHW;
    auto resize = opr::ResizeForward::make(conv3, subtensor * 2, param_resize);
    auto mat = mkcvar("mat", {2, 3, 3}),
         warp = opr::WarpPerspectiveForward::make(
                 resize, mat, nullptr, cg::var_from_tensor_shape(x, {4, 4}));
    auto b = mkvar("b", {1, 8, 1, 1}),
         elem = opr::Elemwise::make({warp + b},
                                    opr::Elemwise::Param::Mode::RELU);
    //! Dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    auto w3_2 = mkcvar("w3_2", {16, 8, 3, 3}),
         b3_2 = mkcvar("b3_2", {1, 16, 1, 1}),
         conv3_2 = opr::ConvBias::make(elem, w3_2, b3_2, param_conv_bias, {},
                                       OperatorNodeConfig("conv3_2"));
    //! s8 group conv
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto conv3_2_q8 = opr::TypeCvt::make(conv3_2, dtype::QuantizedS8(2.5f));
    auto w3_3 = mkcvar_dtype("w3_3", {4, 8, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b3_3 = mkcvar_dtype("b3_3", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f)),
         conv3_3_q = opr::ConvBias::make(
                 conv3_2_q8, w3_3, b3_3, param_conv_bias, {},
                 OperatorNodeConfig{"conv_3_3_q", cn,
                                    dtype::QuantizedS8{6.25f}});
    auto conv3_3 = opr::TypeCvt::make(conv3_3_q, dtype::Float32());
    //! Dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w4 = mkcvar("w4", {4, 32, 3, 3}), b4 = mkcvar("b4", {1, 4, 1, 1}),
         conv4 = opr::ConvBias::make(conv3_3, w4, b4, param_conv_bias, {},
                                     OperatorNodeConfig("conv4"));
    auto w5 = mkcvar("w5", {6, 4, 3, 3}), b5 = mkcvar("b5", {1, 6, 1, 1}),
         conv5 = opr::ConvBias::make(conv4, w5, b5, param_conv_bias, {},
                                     OperatorNodeConfig("conv5"));
    auto w6 = mkcvar("w6", {4, 6, 3, 3}), b6 = mkcvar("b6", {1, 4, 1, 1}),
         y = opr::ConvBias::make(conv5, w6, b6, param_conv_bias, {},
                                 OperatorNodeConfig("conv6"));
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_fuse_conv_bias_nonlinearity();
    options.enable_nchw44_dot();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::Convolution>(y_opt, "conv1").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44_DOT,
              find_opr<opr::Convolution>(y_opt, "conv1_3_q").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW,
              find_opr<opr::ConvBias>(y_opt, "conv1_f1").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44_DOT,
              find_opr<opr::ConvBias>(y_opt, "conv_1_2").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::ConvBias>(y_opt, "conv3_2").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44_DOT,
              find_opr<opr::ConvBias>(y_opt, "conv_3_3_q").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::ConvBias>(y_opt, "conv4").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW,
              find_opr<opr::ConvBias>(y_opt, "conv5").param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.ConvertFormatNCHW44_DOT.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    //! may fall back to winograd on x86-32, so relax the error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
    *host_x = *gen({2, 3, 32, 32}, cn);
    func->execute();
    //! may fall back to winograd on x86-32, so relax the error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
}
TEST(TestGoptInference, ConvertFormatCD4GroupOneConv) {
    // hwcd4 is only supported in naive handle
    NaiveMegDNNHandleScope naive_megdnn_handle;
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto x = mkvar("x", {1, 3, 128, 128});
    // ConvBias
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w1 = mkcvar("w1", {1, 16, 3, 3, 3}), b1 = mkcvar("b1", {1, 16, 1, 1});
    auto conv1 = opr::ConvBias::make(x, w1, b1, param_conv_bias);
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    // Convolution
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    param_conv.sparse = opr::Convolution::Param::Sparse::GROUP;
    auto w3 = mkcvar("w3", {1, 16, 16, 3, 3});
    auto y = opr::Convolution::make(conv1, w3, param_conv);
    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nhwcd4();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
#if MGB_CUDA
TEST(TestGoptInference, PreProcessCase0) {
    REQUIRE_GPU(1);
    HostTensorGenerator<dtype::Quantized8Asymm, RandomDistribution::UNIFORM>
            gen(dt_quint8(0), dt_quint8(50), 1, 128, 1234);
    auto cn = CompNode::load("gpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    size_t n = 1;
    size_t c = 3;
    size_t h = 16;
    size_t w = 16;
    auto host_x1 = gen({n, c, h, w}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    auto x_q8 = opr::TypeCvt::make(x, dtype::QuantizedS8(1.f), cn);
    auto zero = DTypeScalar(dtype::QuantizedS8(1.f));
    auto zero_tensor = opr::ImmutableTensor::make(*graph, zero, cn);
    auto pad_channel_tensor =
            opr::Broadcast::make(zero_tensor, {n, 1, h, w}, cn);
    auto padded_x = opr::Concat::make({x_q8, pad_channel_tensor}, 1, cn)
                            .reshape({n, 1, 4, h, w});
    auto result = opr::Dimshuffle::make(padded_x, {0, 1, 3, 4, 2}, 5, cn);
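    // The pad-to-4-channels concat, the reshape to (n, 1, 4, h, w) and the
    // dimshuffle together hand-implement an NCHW -> NCHW4 relayout; the
    // fuse_preprocess pass should collapse this chain into one
    // RelayoutFormat opr, which the ASSERT at the end verifies.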
    auto y = result;
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_fuse_preprocess();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.PreProcessCase0.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
    ASSERT_TRUE(y_opt.node()->owner_opr()->same_type<opr::RelayoutFormat>());
}
TEST(TestGoptInference, PreProcessCase1) {
    REQUIRE_GPU(1);
    HostTensorGenerator<dtype::Uint8, RandomDistribution::UNIFORM> gen(0, 255);
    auto cn = CompNode::load("gpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    size_t n = 1;
    size_t c = 3;
    size_t h = 16;
    size_t w = 16;
    auto host_x1 = gen({n, c, h, w}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    auto x_u8 = opr::TypeCvt::make(x, dtype::Float32(), cn);
    auto x_s8 = x_u8 - 128;
    auto zero = DTypeScalar(dtype::Float32());
    auto zero_tensor = opr::ImmutableTensor::make(*graph, zero, cn);
    auto pad_channel_tensor =
            opr::Broadcast::make(zero_tensor, {n, 1, h, w}, cn);
    auto padded_x = opr::Concat::make({x_s8, pad_channel_tensor}, 1, cn)
                            .reshape({n, 1, 4, h, w});
    auto nchw4_out = opr::Dimshuffle::make(padded_x, {0, 1, 3, 4, 2}, 5, cn);
    auto result = opr::TypeCvt::make(nchw4_out, dtype::QuantizedS8(1.f));
    auto y = result;
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_fuse_preprocess();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.PreProcessCase1.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
    ASSERT_TRUE(y_opt.node()->owner_opr()->same_type<opr::RelayoutFormat>());
}
TEST(TestGoptInference, WarpAndPreProcessCase0) {
    REQUIRE_GPU(1);
    HostTensorGenerator<dtype::Uint8, RandomDistribution::UNIFORM> gen(0, 255);
    auto cn = CompNode::load("gpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    size_t n = 1;
    size_t c = 3;
    size_t h = 16;
    size_t w = 16;
    auto host_x1 = gen({n, h, w, c}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    auto mat_host = std::make_shared<HostTensorND>(cn, TensorShape{n, 3, 3},
                                                   dtype::Float32());
    warp_perspective_mat_gen(*mat_host, n, h, w);
    auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat");
    opr::WarpPerspective::Param warp_param;
    warp_param.format = opr::WarpPerspective::Param::Format::NHWC;
    auto x_warp =
            opr::WarpPerspective::make(x, mat, TensorShape{h, w}, warp_param);
    auto x_nchw = opr::Dimshuffle::make(x_warp, {0, 3, 1, 2}, 4, cn);
    auto x_u8 = opr::TypeCvt::make(x_nchw, dtype::Float32(), cn);
    auto x_s8 = x_u8 - 128;
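    // Standard uint8 preprocessing: convert to float and subtract 128 to
    // center the data. Together with the padding and dimshuffle below, this
    // should be folded into the NHWC_NCHW4_IC_SMALL WarpPerspective that the
    // asserts after optimization check for.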
  3416. auto zero = DTypeScalar(dtype::Float32());
  3417. auto zero_tensor = opr::ImmutableTensor::make(*graph, zero, cn);
  3418. auto pad_channel_tensor =
  3419. opr::Broadcast::make(zero_tensor, {n, 1, h, w}, cn);
  3420. auto paded_x = opr::Concat::make({x_s8, pad_channel_tensor}, 1, cn)
  3421. .reshape({n, 1, 4, h, w});
  3422. auto nchw4_out = opr::Dimshuffle::make(paded_x, {0, 1, 3, 4, 2}, 5, cn);
  3423. auto result = opr::TypeCvt::make(nchw4_out, dtype::QuantizedS8(1.f));
  3424. auto y = result;
  3425. SymbolVar y_opt;
  3426. auto options = gopt::OptimizeForInferenceOptions{};
  3427. options.enable_fuse_preprocess();
  3428. unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
  3429. ASSERT_TRUE(y_opt.node()->owner_opr()->same_type<opr::WarpPerspective>());
  3430. ASSERT_EQ(opr::WarpPerspective::Param::Format::NHWC_NCHW4_IC_SMALL,
  3431. find_opr<opr::WarpPerspective>(y_opt).param().format);
  3432. graph->compile({{y_opt, {}}})
  3433. ->to_json()
  3434. ->writeto_fpath(output_file(
  3435. "TestGoptInference.WarpAndPreProcessCase0.json"));
  3436. HostTensorND host_y_opt, host_y;
  3437. auto func = graph->compile({make_callback_copy(y, host_y),
  3438. make_callback_copy(y_opt, host_y_opt)});
  3439. func->execute();
  3440. MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
  3441. }
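
// WarpAndPreProcessCase1 is the float variant: the warp output is only
// dimshuffled to NCHW and converted to Float32, so the pass is expected to
// fuse the layout change into a WarpPerspective with the NHWC_NCHW format.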
TEST(TestGoptInference, WarpAndPreProcessCase1) {
    REQUIRE_GPU(1);
    HostTensorGenerator<dtype::Uint8, RandomDistribution::UNIFORM> gen(0, 255);
    auto cn = CompNode::load("gpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    size_t n = 1;
    size_t c = 3;
    size_t h = 16;
    size_t w = 16;
    auto host_x1 = gen({n, h, w, c}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    auto mat_host = std::make_shared<HostTensorND>(cn, TensorShape{n, 3, 3},
                                                   dtype::Float32());
    warp_perspective_mat_gen(*mat_host, n, h, w);
    auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat");
    opr::WarpPerspective::Param warp_param;
    warp_param.format = opr::WarpPerspective::Param::Format::NHWC;
    auto x_warp =
            opr::WarpPerspective::make(x, mat, TensorShape{h, w}, warp_param);
    auto x_nchw = opr::Dimshuffle::make(x_warp, {0, 3, 1, 2}, 4, cn);
    auto result = opr::TypeCvt::make(x_nchw, dtype::Float32(), cn);
    auto y = result;
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_fuse_preprocess();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_TRUE(y_opt.node()->owner_opr()->same_type<opr::WarpPerspective>());
    ASSERT_EQ(opr::WarpPerspective::Param::Format::NHWC_NCHW,
              find_opr<opr::WarpPerspective>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.WarpAndPreProcessCase1.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
}
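
// FoldingConvDimshuffle builds an NCHW4 ConvBias followed by an explicit
// NCHW4 -> NCHW shuffle+reshape, then checks that ShuffleShuffleRemovePass
// and FoldingConvBiasDimshufflePass absorb the layout change into the conv
// (format NCHW4_NCHW), leaving no standalone Dimshuffle in the graph.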
TEST(TestGoptInference, FoldingConvDimshuffle) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient CUDA capability "
               "(got: %d, expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
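    // nchw42nchw undoes NCHW4 packing: the dimshuffle moves the inner
    // 4-channel axis next to the channel dim, (N, C/4, H, W, 4) ->
    // (N, C/4, 4, H, W), and the reshape merges the two channel axes back
    // into a plain (N, C, H, W) tensor.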
    auto nchw42nchw = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp0 = opr::Concat::make({sub(0), sub(1) * 4, sub(2), sub(3)}, 0);
        auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
        auto y1 = opr::Reshape::make(y0, tshp0);
        return y1;
    };
    auto x = mkvar("x", {32, 16, 4, 8, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(x, w, b, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y = opr::TypeCvt::make(y, dtype::Float32());
    y = nchw42nchw(y);
    SymbolVar y_fuse, y_non_fuse;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ShuffleShuffleRemovePass>()
                          .add_pass<gopt::FoldingConvBiasDimshufflePass>()
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{y}})
                          .endpoint_vars(),
                  y_fuse);
    gopt::modify_opr_algo_strategy_inplace(
            {y_fuse},
            opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy::PROFILE);
    graph->compile({{y_fuse, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.FoldingConvDimshuffle.json"));
    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW4_NCHW,
              find_opr<opr::ConvBias>(y_fuse).param().format);
    ASSERT_EQ(0u, find_opr_num<opr::Dimshuffle>(y_fuse));
    unpack_vector(gopt::GraphOptimizer{}.apply({{y}}).endpoint_vars(),
                  y_non_fuse);
    HostTensorND host_y_fuse, host_y_non_fuse;
    auto func =
            graph->compile({make_callback_copy(y_fuse, host_y_fuse),
                            make_callback_copy(y_non_fuse, host_y_non_fuse)});
    func->execute();
    // compare fused and non-fused outputs, as the sibling tests below do
    MGB_ASSERT_TENSOR_EQ(host_y_fuse, host_y_non_fuse);
}
//! disabled for the cu111 CI; reopen once the bug is fixed
#if CUDA_VERSION < 11000
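// FoldingConvDimshuffleNCHW4NCHW32 checks the NCHW4 -> NCHW32 direction: the
// reshape/dimshuffle/reshape chain after the conv should be folded into a
// ConvBias with the NCHW4_NCHW32 format, again with no standalone Dimshuffle.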
TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NCHW32) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient CUDA capability "
               "(got: %d, expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
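    // nchw42nchw32 regroups NCHW4 into NCHW32: (N, C/4, H, W, 4) is reshaped
    // to (N, C/32, 8, H, W, 4), the factor-8 axis is shuffled next to the
    // inner 4, and the final reshape merges them into one 32-channel block,
    // yielding (N, C/32, H, W, 32).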
    auto nchw42nchw32 = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp0 = opr::Concat::make(
                     {sub(0), sub(1) / 8, cv(8), sub(2), sub(3), sub(4)}, 0),
             tshp1 = opr::Concat::make(
                     {sub(0), sub(1) / 8, sub(2), sub(3), sub(4) * 8}, 0);
        auto y0 = opr::Reshape::make(x, tshp0);
        auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 4, 2, 5});
        auto y2 = opr::Reshape::make(y1, tshp1);
        return y2;
    };
    auto x = mkvar("x", {32, 16, 4, 8, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(x, w, b, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y = nchw42nchw32(y);
    y = opr::TypeCvt::make(y, dtype::Float32());
    SymbolVar y_fuse, y_non_fuse;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::FoldingConvBiasDimshufflePass>()
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{y}})
                          .endpoint_vars(),
                  y_fuse);
    gopt::modify_opr_algo_strategy_inplace(
            {y_fuse},
            opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy::PROFILE);
    graph->compile({{y_fuse, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.FoldingConvDimshuffleNCHW4NCHW32.json"));
    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW4_NCHW32,
              find_opr<opr::ConvBias>(y_fuse).param().format);
    ASSERT_EQ(0u, find_opr_num<opr::Dimshuffle>(y_fuse));
    unpack_vector(gopt::GraphOptimizer{}.apply({{y}}).endpoint_vars(),
                  y_non_fuse);
    HostTensorND host_y_fuse, host_y_non_fuse;
    auto func =
            graph->compile({make_callback_copy(y_fuse, host_y_fuse),
                            make_callback_copy(y_non_fuse, host_y_non_fuse)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y_fuse, host_y_non_fuse);
}
#endif

#if CUDA_VERSION >= 10020
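// FoldingConvDimshuffleNCHW32NCHW4 goes the other way: with enable_nchw32()
// the first conv runs in NCHW32, and the layout change back to NCHW4 for the
// second conv should be folded into a ConvBias with the NCHW32_NCHW4 format.
// The test requires compute capability 7.5 or newer.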
TEST(TestGoptInference, FoldingConvDimshuffleNCHW32NCHW4) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 75) {
        printf("This testcase is ignored due to insufficient CUDA capability "
               "(got: %d, expected: %d)\n",
               sm_ver, 75);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto x = mkvar("x", {32, 16, 4, 8, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
         w1 = mkcvar("w1", {16, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 4, 1, 1, 4}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(x, w, b, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    param.stride_h = param.stride_w = 1;
    y = opr::ConvBias::make(y, w1, b1, param, {},
                            OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y = opr::TypeCvt::make(y, dtype::Float32());
    SymbolVar y_fuse, y_non_fuse;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw32().enable_fuse_conv_bias_nonlinearity();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_fuse);
    }
    graph->compile({{y_fuse, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.FoldingConvDimshuffleNCHW32NCHW4.json"));
    ASSERT_EQ(1u, find_opr_num<opr::Dimshuffle>(y_fuse));
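    // walk the whole dependency graph from the endpoint with DepOprIter and
    // check that at least one ConvBias ended up with the NCHW32_NCHW4 format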
    bool found = false;
    cg::DepOprIter{[&found](cg::OperatorNodeBase* opr) {
        if (!found && opr->same_type<opr::ConvBias>()) {
            opr::ConvBias* cb = &opr->cast_final_safe<opr::ConvBias>();
            if (cb->param().format ==
                opr::ConvBias::Param::Format::NCHW32_NCHW4)
                found = true;
        }
    }}
            .add(y_fuse.node()->owner_opr());
    EXPECT_TRUE(found);
    unpack_vector(gopt::GraphOptimizer{}.apply({{y}}).endpoint_vars(),
                  y_non_fuse);
    HostTensorND host_y_fuse, host_y_non_fuse;
    auto func =
            graph->compile({make_callback_copy(y_fuse, host_y_fuse),
                            make_callback_copy(y_non_fuse, host_y_non_fuse)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y_fuse, host_y_non_fuse);
}
#endif
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
