/**
 * \file src/gopt/test/inference.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

#include "megbrain/opr/dnn/local.h"
#include "megbrain/test/helper.h"

#include "megbrain/gopt/basic_arith.h"
#include "megbrain/gopt/gtrans.h"
#include "megbrain/gopt/inference.h"
#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/blas.h"
#include "megbrain/opr/dnn/batch_norm.h"
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/imgproc.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/nn_int.h"
#include "megbrain/opr/tensor_gen.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/opr/utility.h"

#include "./helper.h"
#include "megbrain/comp_node_env.h"
#include "megdnn/tensor_format.h"

#include <random>

using namespace mgb;

namespace {
//! find the first operator of a specific type; raise an exception if not found
template <typename T>
T& find_opr(SymbolVar endpoint) {
    T* found = nullptr;
    auto cb = [&found](cg::OperatorNodeBase* opr) {
        if (!found && opr->same_type<T>()) {
            found = &opr->cast_final_safe<T>();
        }
    };
    cg::DepOprIter{cb}.add(endpoint.node()->owner_opr());
    mgb_assert(found, "not found opr from %s", endpoint.node()->name().c_str());
    return *found;
}

//! find the first operator of a specific type with the given name; raise an
//! exception if not found
template <typename T>
T& find_opr(SymbolVar endpoint, const std::string& node_name) {
    T* found = nullptr;
    auto cb = [&found, &node_name](cg::OperatorNodeBase* opr) {
        if (!found && opr->same_type<T>() && opr->name() == node_name) {
            found = &opr->cast_final_safe<T>();
        }
    };
    cg::DepOprIter{cb}.add(endpoint.node()->owner_opr());
    mgb_assert(found, "not found opr %s from %s", node_name.c_str(),
               endpoint.node()->name().c_str());
    return *found;
}

//! count the operators of a specific type that the endpoint depends on
template <typename T>
size_t find_opr_num(SymbolVar endpoint) {
    size_t opr_num = 0;
    auto cb = [&opr_num](cg::OperatorNodeBase* opr) {
        if (opr->same_type<T>()) {
            opr_num++;
        }
    };
    cg::DepOprIter{cb}.add(endpoint.node()->owner_opr());
    return opr_num;
}
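
// Typical use of these helpers in the tests below: after an optimization pass
// has rewritten the graph, the relevant operator is looked up by type on the
// optimized endpoint and its parameters are checked, e.g.
//     ASSERT_EQ(opr::Convolution::Param::Format::NHWCD4,
//               find_opr<opr::Convolution>(y_opt).param().format);
// The lookup walks every operator the endpoint depends on via cg::DepOprIter.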

//! RAII scope that switches MegDNN to the naive handle (debug level 2) for
//! the lifetime of a test; some tensor formats used below (e.g. NHWCD4) are
//! only implemented there, see the ConvertFormatNHWCD4 tests
class NaiveMegDNNHandleScope {
    int m_orig_level;

public:
    NaiveMegDNNHandleScope()
            : m_orig_level{MegDNNHandle::exchange_default_dbg_level(2)} {
        CompNode::finalize();
    }
    ~NaiveMegDNNHandleScope() {
        auto set = MegDNNHandle::exchange_default_dbg_level(m_orig_level);
        mgb_assert(set == 2);
        CompNode::finalize();
    }
};

#if MGB_CUDA
//! this function is only used in TestGoptInference.EnableCHWN4...
void warp_perspective_mat_gen(HostTensorND& mat, size_t N, size_t INP_H,
                              size_t INP_W) {
    static std::mt19937 rng(next_rand_seed());
    auto rand_real = [&](double lo, double hi) {
        return rng() / (std::mt19937::max() + 1.0) * (hi - lo) + lo;
    };
    auto rand_real2 = [&](double range) { return rand_real(-range, range); };
    auto ptr = mat.ptr<float>();
    for (size_t i = 0; i < N; ++i) {
        auto rot = rand_real(0, M_PI * 2), scale = rand_real(0.8, 1.2),
             sheer = rand_real(0.9, 1.1), dy = rand_real2(INP_H * 0.5),
             dx = rand_real2(INP_W * 0.5), ky = rand_real2(0.1 / INP_H),
             kx = rand_real2(0.1 / INP_W), kb = rand_real2(0.1) + 1;
        // fill one 3x3 projective matrix: rotation/scale/sheer block,
        // translation (dx, dy), and perspective terms (kx, ky, kb)
        ptr[0] = ptr[4] = cos(rot) * scale;
        ptr[1] = -(ptr[3] = sin(rot) * scale);
        ptr[3] *= sheer;
        ptr[4] *= sheer;
        ptr[2] = dx;
        ptr[5] = dy;
        ptr[6] = kx;
        ptr[7] = ky;
        ptr[8] = kb;
        ptr += 9;
    }
    mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems());
}
#endif
}  // namespace
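
// Common structure of the tests below: build a small graph with
// graph_opt_level = 0 so that nothing is rewritten implicitly, apply an
// explicit gopt::GraphOptimizer pass chain (or gopt::optimize_for_inference),
// assert on the structure of the optimized graph (operator types, dtypes,
// formats), then compile both the original and the optimized endpoints and
// check that their outputs still match numerically.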

TEST(TestGoptInference, ParamFuseConstEndPoint) {
    constexpr size_t SIZE = 23;
    HostTensorGenerator<> gen;
    auto host_x = gen({SIZE}), host_y = gen({1}), host_p = gen({1});
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto x = opr::SharedDeviceTensor::make(*graph, *host_x),
         y = opr::SharedDeviceTensor::make(*graph, *host_y),
         p = opr::Host2DeviceCopy::make(*graph, host_p), q = p + x, a = y + 3,
         z0 = a + q, z1 = a + 4;
    HostTensorND host_z0, host_z1;
    SymbolVar z0_1, z1_1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{z1, z0}})
                          .endpoint_vars(),
                  z1_1, z0_1);
    auto func = graph->compile({make_callback_copy(z0_1, host_z0),
                                make_callback_copy(z1_1, host_z1)});
    func->to_json()->writeto_fpath(
            output_file("TestGoptInference.ParamFuseEndPoint.json"));
    func->execute();
    int nr_opr = 0;
    func->iter_opr_seq([&](cg::OperatorNodeBase*) {
        ++nr_opr;
        return true;
    });
    ASSERT_EQ(8, nr_opr);
    auto px = host_x->ptr<float>(), pz0 = host_z0.ptr<float>();
    auto yv = host_y->ptr<float>()[0], pv = host_p->ptr<float>()[0],
         pz1 = host_z1.ptr<float>()[0];
    for (size_t i = 0; i < SIZE; ++i) {
        MGB_ASSERT_FLOAT_EQ(px[i] + yv + 3 + pv, pz0[i]);
    }
    MGB_ASSERT_FLOAT_EQ(yv + 7, pz1);
}

TEST(TestGoptInference, ParamFuse) {
    constexpr size_t SIZE = 23;
    HostTensorGenerator<> gen;
    auto host_x = gen({SIZE}), host_y = gen({1}), host_p = gen({1});
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto x = opr::SharedDeviceTensor::make(*graph, *host_x),
         y = opr::SharedDeviceTensor::make(*graph, *host_y),
         p = opr::Host2DeviceCopy::make(*graph, host_p),
         z = x + y,      // endpoint
         q = x * y + p;  // middle point
    SymbolVar z1, q1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{z, q}})
                          .endpoint_vars(),
                  z1, q1);
    ASSERT_TRUE(z1.node()->owner_opr()->same_type<opr::SharedDeviceTensor>());
    ASSERT_NE(q1.node()->owner_opr(), q.node()->owner_opr());
    ASSERT_EQ(q1.node()->owner_opr()->dyn_typeinfo(),
              q.node()->owner_opr()->dyn_typeinfo());
    HostTensorND host_z, host_q;
    auto func = graph->compile(
            {make_callback_copy(z1, host_z), make_callback_copy(q1, host_q)});
    func->execute();
    int nr_opr = 0;
    func->iter_opr_seq([&](cg::OperatorNodeBase*) {
        ++nr_opr;
        return true;
    });
    ASSERT_EQ(6, nr_opr);
    auto px = host_x->ptr<float>(), pz = host_z.ptr<float>(),
         pq = host_q.ptr<float>();
    auto yv = host_y->ptr<float>()[0], pv = host_p->ptr<float>()[0];
    for (size_t i = 0; i < SIZE; ++i) {
        MGB_ASSERT_FLOAT_EQ(px[i] + yv, pz[i]);
        MGB_ASSERT_FLOAT_EQ(px[i] * yv + pv, pq[i]);
    }
}

TEST(TestGoptInference, ParamFuseMultiDeviceTensorHolder) {
    constexpr size_t SIZE = 23;
    HostTensorGenerator<> gen;
    auto host_x = gen({SIZE}), host_y = gen({1}), host_p = gen({1});
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto x = opr::SharedDeviceTensor::make(*graph, *host_x),
         y = opr::SharedDeviceTensor::make(*graph, *host_y),
         p = opr::Host2DeviceCopy::make(*graph, host_p),
         z = x + y,      //! endpoint
         q = x * y + p;  //! middle point
    SymbolVar z1, q1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamMergePass>()
                          .apply({{z}})
                          .endpoint_vars(),
                  z1);
    ASSERT_TRUE(z1.node()
                        ->owner_opr()
                        ->input(0)
                        ->owner_opr()
                        ->same_type<opr::MultipleDeviceTensorHolder>());
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamMergePass>()
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{z, q}})
                          .endpoint_vars(),
                  z1, q1);
    ASSERT_TRUE(z1.node()->owner_opr()->same_type<opr::SharedDeviceTensor>());
    ASSERT_NE(q1.node()->owner_opr(), q.node()->owner_opr());
    ASSERT_EQ(q1.node()->owner_opr()->dyn_typeinfo(),
              q.node()->owner_opr()->dyn_typeinfo());
    HostTensorND host_z, host_q;
    auto func = graph->compile(
            {make_callback_copy(z1, host_z), make_callback_copy(q1, host_q)});
    func->execute();
    int nr_opr = 0;
    func->iter_opr_seq([&](cg::OperatorNodeBase* op) {
        ++nr_opr;
        return true;
    });
    ASSERT_EQ(6, nr_opr);
    auto px = host_x->ptr<float>(), pz = host_z.ptr<float>(),
         pq = host_q.ptr<float>();
    auto yv = host_y->ptr<float>()[0], pv = host_p->ptr<float>()[0];
    for (size_t i = 0; i < SIZE; ++i) {
        MGB_ASSERT_FLOAT_EQ(px[i] + yv, pz[i]);
        MGB_ASSERT_FLOAT_EQ(px[i] * yv + pv, pq[i]);
    }
}

TEST(TestGoptInference, ParamFuseMultiRead) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    auto x = mkvar("x", {23}), p0 = mkcvar("p0", {1}), p1 = mkcvar("p1", {1}),
         z0 = x * (p0 + p1) + x / (p0 + p1);
    SymbolVar z1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{z0}})
                          .endpoint_vars(),
                  z1);
    ASSERT_NE(z0.node(), z1.node());
    ASSERT_TRUE(z1.node()
                        ->owner_opr()
                        ->input(0)
                        ->owner_opr()
                        ->input(1)
                        ->owner_opr()
                        ->same_type<opr::SharedDeviceTensor>());
    ASSERT_TRUE(z1.node()
                        ->owner_opr()
                        ->input(1)
                        ->owner_opr()
                        ->input(1)
                        ->owner_opr()
                        ->same_type<opr::SharedDeviceTensor>());
    HostTensorND host_z0, host_z1;
    graph->compile({make_callback_copy(z0, host_z0),
                    make_callback_copy(z1, host_z1)})
            ->execute();
    MGB_ASSERT_TENSOR_EQ(host_z0, host_z1);
}

TEST(TestGoptInference, ParamFuseStaticInfer) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    auto a = mkvar("x", {4}),
         b = a.reshape(opr::GetVarShape::make(mkcvar("tshp", {2, 2})));
    SymbolVar b1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{b}})
                          .endpoint_vars(),
                  b1);
    ASSERT_EQ(b1, a.reshape({2, 2}));
}
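
// The test above relies on static shape inference: GetVarShape of the
// constant "tshp" tensor is statically known to be {2, 2}, so ParamFusePass
// can fold the whole shape subgraph into a constant, making the endpoint
// equivalent to a.reshape({2, 2}).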

TEST(TestGoptInference, ParamRedistributeConvMul) {
    constexpr size_t N = 4, IC = 3, IH = 5, IW = 4, OC = 4, KH = 3, KW = 2;
    HostTensorGenerator<> gen;
    auto host_x = gen({N, IC, IH, IW}), host_k = gen({IC}),
         host_w = gen({OC, IC, KH, KW});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         k = opr::Dimshuffle::make(
                 opr::SharedDeviceTensor::make(*graph, *host_k),
                 {-1, 0, -1, -1}),
         w = opr::SharedDeviceTensor::make(*graph, *host_w),
         y0 = opr::Convolution::make(x * k, w);
    SymbolVar y1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamRedistributePass>()
                          .apply({{y0}})
                          .endpoint_vars(),
                  y1);
    ASSERT_NE(y0.node(), y1.node());
    HostTensorND host_y0, host_y1;
    auto func = graph->compile(
            {make_callback_copy(y0, host_y0), make_callback_copy(y1, host_y1)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y0, host_y1);
}

TEST(TestGoptInference, ParamRedistributeConvMulUniqReader) {
    constexpr size_t N = 4, C = 3, IH = 5, IW = 4, KH = 1, KW = 1;
    HostTensorGenerator<> gen;
    auto host_x = gen({N, C, IH, IW}), host_k = gen({C}),
         host_w = gen({C, C, KH, KW});
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         k = opr::Dimshuffle::make(
                 opr::SharedDeviceTensor::make(*graph, *host_k) + 2,
                 {-1, 0, -1, -1}),
         w = opr::SharedDeviceTensor::make(*graph, *host_w),
         // y0 should be replaced
         y0 = opr::powf(opr::Convolution::make(x * k, w).rename("y0") + 2,
                        2),
         y0k = (y0 * k).rename("y0k"),
         // y0k is accessed twice, so it should not be replaced
         y1 = opr::Convolution::make(y0k, w).rename("y1"), z0 = y1 / y0k;
    SymbolVar z1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamRedistributePass>()
                          .apply({{z0}})
                          .endpoint_vars(),
                  z1);
    ASSERT_NE(z0.node(), z1.node());
    auto y1_repl = z1.node()->owner_opr()->input(0)->owner_opr();
    ASSERT_TRUE(y1_repl->same_type<opr::Convolution>());
    ASSERT_EQ(y1_repl->input(0), z1.node()->owner_opr()->input(1));
    HostTensorND host_z0, host_z1;
    auto func = graph->compile(
            {make_callback_copy(z0, host_z0), make_callback_copy(z1, host_z1)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_z0, host_z1, 5e-5);
}

TEST(TestGoptInference, ParamRedistributeMulConvMul) {
    constexpr size_t N = 4, IC = 3, IH = 5, IW = 4, OC = 4, KH = 3, KW = 2;
    HostTensorGenerator<> gen;
    auto host_x = gen({N, IC, IH, IW}), host_k1 = gen({IC}),
         host_k2 = gen({1, OC, 1, 1}), host_w = gen({OC, IC, KH, KW});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         k1 = opr::Dimshuffle::make(
                 opr::SharedDeviceTensor::make(*graph, *host_k1),
                 {-1, 0, -1, -1}),
         k2 = opr::SharedDeviceTensor::make(*graph, *host_k2),
         w = opr::SharedDeviceTensor::make(*graph, *host_w),
         y0 = opr::Convolution::make(x * k1, w) * k2;
    SymbolVar y1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamRedistributePass>()
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{y0}})
                          .endpoint_vars(),
                  y1);
    auto y1opr = y1.node()->owner_opr();
    ASSERT_TRUE(y1opr->same_type<opr::Convolution>());
    ASSERT_EQ(y1opr->input(0), x.node());
    HostTensorND host_y0, host_y1;
    auto func = graph->compile(
            {make_callback_copy(y0, host_y0), make_callback_copy(y1, host_y1)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y0, host_y1, 5e-6);
}

TEST(TestGoptInference, ParamRedistributeConvAdd) {
    constexpr size_t N = 4, IC = 3, IH = 5, IW = 4, OC = 4, KH = 3, KW = 2;
    HostTensorGenerator<> gen;
    auto host_x = gen({N, IC, IH, IW}), host_b = gen({IC}),
         host_w = gen({OC, IC, KH, KW});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         b = opr::Dimshuffle::make(
                 opr::SharedDeviceTensor::make(*graph, *host_b),
                 {-1, 0, -1, -1}),
         w = opr::SharedDeviceTensor::make(*graph, *host_w),
         y0 = opr::Convolution::make(x + b, w);
    SymbolVar y1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamRedistributePass>()
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{y0}})
                          .endpoint_vars(),
                  y1);
    ASSERT_NE(y0.node(), y1.node());
    HostTensorND host_y0, host_y1;
    auto func = graph->compile(
            {make_callback_copy(y0, host_y0), make_callback_copy(y1, host_y1)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y0, host_y1, 1e-5);
}

TEST(TestGoptInference, ParamRedistributeDistThenReasso) {
    constexpr size_t N = 4, IC0 = 3, IC1 = 6, IH = 5, IW = 4, OC = 4, KH = 3,
                     KW = 2;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    auto x0 = mkvar("x0", {N, IC0, IH, IW}), x1 = mkvar("x1", {N, IC1, IH, IW}),
         k0 = opr::Dimshuffle::make(mkcvar("x1_", {IC0}), {-1, 0, -1, -1})
                      .rename("x1"),
         w0 = mkcvar("w0", {OC, IC0, KH, KW}),
         k1 = mkcvar("k1", {1, IC1, 1, 1}),
         w1 = mkcvar("w1", {OC, IC1, KH, KW}), b0 = mkvar("b0", {1, OC, 1, 1}),
         b1 = mkcvar("b1", {1}), k2 = mkcvar("k2", {1}),
         y0 = (opr::Convolution::make(x0 * k0, w0) +
               opr::Convolution::make(x1 + k1, w1) + b0 + b1) *
              k2;
    SymbolVar y1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamRedistributePass>()
                          .add_pass<gopt::ReorderArithChainPass>(
                                  gopt::ConstVarType::IMMUTABLE_AND_PARAM)
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{y0}})
                          .endpoint_vars(),
                  y1);
    ASSERT_NE(y0.node(), y1.node());
    HostTensorND host_y0, host_y1;
    auto func = graph->compile(
            {make_callback_copy(y0, host_y0), make_callback_copy(y1, host_y1)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y0, host_y1, 1e-5);
    auto chain =
            gopt::extract_opr_leaves(y1.node(), [](cg::OperatorNodeBase* opr) {
                return gopt::as_elem_opr(opr, opr::Elemwise::Mode::ADD);
            });
    size_t nr_conv = 0;
    for (auto i : chain) {
        auto opr = i->owner_opr();
        if (opr->same_type<opr::Convolution>()) {
            ++nr_conv;
            ASSERT_TRUE(opr->input(0)
                                ->owner_opr()
                                ->same_type<opr::Host2DeviceCopy>());
            ASSERT_TRUE(opr->input(1)
                                ->owner_opr()
                                ->same_type<opr::SharedDeviceTensor>());
        }
    }
    ASSERT_EQ(2u, nr_conv);
    ASSERT_EQ(4u, chain.size());
}

TEST(TestGoptInference, ParamRedistributeMultiChange) {
    constexpr size_t N = 4, IC = 3, IH = 5, IW = 4, OC = 4, KH = 3, KW = 2;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    auto x = mkvar("x", {N, IC, IH, IW}), k0 = mkcvar("k0", {1, IC, 1, 1}),
         b0 = mkcvar("b0", {1, IC, 1, 1}), k1 = mkcvar("k0", {1}),
         b1 = mkcvar("b0", {1}), w = mkcvar("w", {OC, IC, KH, KW}),
         y0 = (opr::Convolution::make(x * k0 + b0, w) + b1) * k1;
    SymbolVar y1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamRedistributePass>()
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{y0}})
                          .endpoint_vars(),
                  y1);
    ASSERT_NE(y0.node(), y1.node());
    HostTensorND host_y0, host_y1;
    auto func = graph->compile(
            {make_callback_copy(y0, host_y0), make_callback_copy(y1, host_y1)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y0, host_y1, 1e-5);
    auto y1elem = gopt::as_elem_opr(y1.node(), opr::Elemwise::Mode::ADD);
    ASSERT_TRUE(y1elem);
    auto yconv = y1elem->input(0)->owner_opr();
    if (!yconv->same_type<opr::Convolution>())
        yconv = y1elem->input(1)->owner_opr();
    ASSERT_TRUE(yconv->same_type<opr::Convolution>());
    ASSERT_EQ(x.node(), yconv->input(0));
}

TEST(TestGoptInference, ParamRedistributeMultiReader) {
    constexpr size_t N = 4, IC = 3, IH = 5, IW = 4, OC = 4, KH = 3, KW = 2;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    auto x = mkvar("x", {N, IC, IH, IW}), k = mkcvar("k", {1, OC, 1, 1}),
         w = mkcvar("w", {OC, IC, KH, KW});
    auto conv = opr::Convolution::make(x, w);
    auto t = conv * k;
    auto y0 = t * 4.2f + t * 2.4f;
    SymbolVar y1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamRedistributePass>()
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{y0}})
                          .endpoint_vars(),
                  y1);
    ASSERT_NE(y0.node(), y1.node());
    HostTensorND host_y0, host_y1;
    auto func = graph->compile(
            {make_callback_copy(y0, host_y0), make_callback_copy(y1, host_y1)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y0, host_y1, 1e-5);
    auto y1elem = gopt::as_elem_opr(y1.node(), opr::Elemwise::Mode::ADD);
    ASSERT_TRUE(y1elem);
    auto ymul0 = gopt::as_elem_opr(y1elem->input(0), opr::Elemwise::Mode::MUL),
         ymul1 = gopt::as_elem_opr(y1elem->input(1), opr::Elemwise::Mode::MUL);
    ASSERT_TRUE(ymul0);
    ASSERT_TRUE(ymul1);
    auto yconv = ymul0->input(0)->owner_opr();
    if (!yconv->same_type<opr::Convolution>()) {
        yconv = ymul0->input(1)->owner_opr();
    }
    ASSERT_TRUE(yconv->same_type<opr::Convolution>());
    if (ymul1->input(0) != yconv->output(0)) {
        ASSERT_EQ(yconv->output(0), ymul1->input(1));
    }
    ASSERT_EQ(x.node(), yconv->input(0));
}

TEST(TestGoptInference, ParamFuseBiasMerge) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    auto x = mkvar("x", {6, 3, 8, 8}), w1 = mkcvar("w1", {4, 3, 3, 3}),
         w2 = mkcvar("w2", {4, 3, 3, 3}), b1 = mkcvar("b1", {1, 4, 1, 1}),
         b2 = mkcvar("b2", {1, 4, 1, 1}),
         y1 = opr::Convolution::make(x, w1) + b1,
         y2 = opr::Convolution::make(x, w2) + b2, y = y1 + y2;
    SymbolVar y_opt;
    unpack_vector(gopt::optimize_for_inference({y}), y_opt);
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.ParamFuseConvMerge.json"));
    auto chain = gopt::extract_opr_leaves(
            y_opt.node(), [](cg::OperatorNodeBase* opr) {
                return gopt::as_elem_opr(opr, opr::Elemwise::Mode::ADD);
            });
    ASSERT_EQ(3u, chain.size());
}

TEST(TestGoptInference, Float16IOFloat32Compute) {
    constexpr size_t INP_H = 10, INP_W = 10;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    graph->options().graph_opt_level = 0;
    auto a = mkvar("a", {1, 4, INP_H, INP_W}),
         s0 = mkvar("s0", {20, 3, INP_H, INP_W}),
         s1 = mkvar("s1", {4, 3, 1, 1});
    auto b = opr::Convolution::make(s0, s1, {}, {});
    auto y = a + b;
    y = opr::Concat::make({y, -y}, 0);
    y = opr::Reduce::make(y, {}, y.make_scalar(1));
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_f16_io_f32_comp();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(y_opt.dtype(), dtype::Float32());
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
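
// A note on the two float16 knobs exercised by these tests (summary inferred
// from the assertions below, not an authoritative spec):
// enable_f16_io_f32_comp() stores tensors as float16 but keeps
// ComputeMode::FLOAT32 on operators such as ConvolutionBackwardData and
// ConvBias, while enable_f16_io_comp() converts both storage and computation
// to float16; in both modes endpoints are cast back to the original dtype.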

TEST(TestGoptInference, Float16IOFloat32ComputeDeConv) {
    constexpr size_t INP_H = 10, INP_W = 10;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    graph->options().graph_opt_level = 0;
    auto s0 = mkvar("s0", {5, 5, 3, 3}), s1 = mkvar("s1", {1, 5, INP_H, INP_W});
    auto y = opr::ConvolutionBackwardData::make(s0, s1, {}, {});
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_f16_io_f32_comp();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(
            find_opr<opr::ConvolutionBackwardData>(y_opt).param().compute_mode,
            opr::ConvBias::Param::ConvBias::ComputeMode::FLOAT32);
    ASSERT_EQ(y_opt.dtype(), dtype::Float32());
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-2);
}

TEST(TestGoptInference, Float16IOFloat32ComputeWarpPerspective) {
    constexpr size_t INP_H = 10, INP_W = 10, N = 2;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    graph->options().graph_opt_level = 0;
    auto a = mkvar("a", {N, 4, INP_H, INP_W});
    float value1 = M_PI, value2 = 0.6;
    auto gen_mat = [&](HostTensorND& mat) {
        auto ptr = mat.ptr<float>();
        for (size_t i = 0; i < N; ++i) {
            auto rot = value1, scale = value2, sheer = value1, dy = value2,
                 dx = value2, ky = value2, kx = value2, kb = value2;
            ptr[0] = ptr[4] = cos(rot) * scale;
            ptr[1] = -(ptr[3] = sin(rot) * scale);
            ptr[3] *= sheer;
            ptr[4] *= sheer;
            ptr[2] = dx;
            ptr[5] = dy;
            ptr[6] = kx;
            ptr[7] = ky;
            ptr[8] = kb;
            ptr += 9;
        }
        mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems());
    };
    auto mat_host = std::make_shared<HostTensorND>(
            a.node()->comp_node(), TensorShape{N, 3, 3}, dtype::Float32());
    gen_mat(*mat_host);
    auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat");
    TensorShape out_shp{20, 20};
    auto y = opr::WarpPerspective::make(a, mat, out_shp);
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_f16_io_f32_comp();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(y_opt.dtype(), dtype::Float32());
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}

TEST(TestGoptInference, Float16IOFloat32ComputeRemap) {
    auto cn = CompNode::load("cpu1");
    constexpr size_t INP_H = 10, INP_W = 10, N = 2;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    graph->options().graph_opt_level = 0;
    auto a = mkvar("a", {N, 4, INP_H, INP_W});
    auto gen_map = [&](HostTensorND& mat) {
        auto ptr = mat.ptr<float>();
        for (size_t n = 0; n < N; ++n) {
            for (int h = 0; h < 5; ++h) {
                for (int w = 0; w < 5; ++w) {
                    *ptr++ = (h * 5 * 2) + 5 * 2 + 0;
                    *ptr++ = (h * 5 * 2) + 5 * 2 + 1;
                }
            }
        }
        mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems());
    };
    auto map_host = std::make_shared<HostTensorND>(
            a.node()->comp_node(), TensorShape{N, 5, 5, 2}, dtype::Float32());
    gen_map(*map_host);
    auto map = opr::Host2DeviceCopy::make(*graph, map_host).rename("map");
    auto y = opr::Remap::make(a, map);
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_f16_io_f32_comp();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(y_opt.dtype(), dtype::Float32());
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}

TEST(TestGoptInference, Uint8IOFloat16ComputeWarpPerspective) {
    constexpr size_t INP_H = 10, INP_W = 10, N = 2;
    HostTensorGenerator<dtype::Uint8> gen_uint8;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen_uint8(shp)).rename(name);
    };
    graph->options().graph_opt_level = 0;
    auto a = mkvar("a", {N, 4, INP_H, INP_W});
    float value1 = M_PI, value2 = 0.6;
    auto gen_mat = [&](HostTensorND& mat) {
        auto ptr = mat.ptr<float>();
        for (size_t i = 0; i < N; ++i) {
            auto rot = value1, scale = value2, sheer = value1, dy = value2,
                 dx = value2, ky = value2, kx = value2, kb = value2;
            ptr[0] = ptr[4] = cos(rot) * scale;
            ptr[1] = -(ptr[3] = sin(rot) * scale);
            ptr[3] *= sheer;
            ptr[4] *= sheer;
            ptr[2] = dx;
            ptr[5] = dy;
            ptr[6] = kx;
            ptr[7] = ky;
            ptr[8] = kb;
            ptr += 9;
        }
        mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems());
    };
    auto mat_host = std::make_shared<HostTensorND>(
            a.node()->comp_node(), TensorShape{N, 3, 3}, dtype::Float32());
    gen_mat(*mat_host);
    auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat");
    TensorShape out_shp{20, 20};
    auto y = opr::WarpPerspective::make(a, mat, out_shp);
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_f16_io_comp();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(y_opt.dtype(), dtype::Uint8());
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}

TEST(TestGoptInference, Float32TOFloat16) {
    CompNode cn = CompNode::load("cpu0");
    HostTensorGenerator<> gen(0, 1, 0);
    auto host_x0 = gen({1, 4, 16, 8}, cn), host_x1 = gen({2, 3, 16, 8}, cn),
         host_x2 = gen({4, 3, 1, 1}, cn);
    auto graph = ComputingGraph::make();
    auto make_f32_to_f16_graph = [&]() {
        graph->options().graph_opt_level = 0;
        auto d0 = opr::Host2DeviceCopy::make(*graph, host_x0),
             d1 = opr::Host2DeviceCopy::make(*graph, host_x1),
             d2 = opr::SharedDeviceTensor::make(*graph, *host_x2);
        auto b = opr::Convolution::make(d1, d2, {}, {});
        auto y = d0 + b;
        y = opr::Reduce::make(y, {}, y.make_scalar(1));
        SymbolVar y_opt;
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_f16_io_comp();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
        return y_opt;
    };
    auto make_f16_graph = [&]() {
        auto d0 = opr::TypeCvt::make(
                     opr::Host2DeviceCopy::make(*graph, host_x0),
                     dtype::Float16{}),
             d1 = opr::TypeCvt::make(
                     opr::Host2DeviceCopy::make(*graph, host_x1),
                     dtype::Float16{}),
             d2 = opr::TypeCvt::make(
                     opr::SharedDeviceTensor::make(*graph, *host_x2),
                     dtype::Float16{});
        auto b = opr::Convolution::make(d1, d2, {}, {});
        SymbolVar y = d0 + b;
        y = opr::Reduce::make(y, {}, y.make_scalar(1));
        y = opr::TypeCvt::make(y, dtype::Float32{});
        return y;
    };
    auto y_opt = make_f32_to_f16_graph();
    auto y = make_f16_graph();
    ASSERT_EQ(y_opt.dtype(), dtype::Float32{});
    ASSERT_EQ(y.dtype(), dtype::Float32{});
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}

TEST(TestGoptInference, Float32TOFloat16C32) {
    CompNode cn = CompNode::load("cpu0");
    HostTensorGenerator<> gen(0, 1, 0);
    auto host_x0 = gen({1, 4, 1, 1}, cn), host_x1 = gen({2, 3, 16, 8}, cn),
         host_x2 = gen({4, 3, 1, 1}, cn);
    auto graph = ComputingGraph::make();
    auto make_f32_to_f16_graph = [&]() {
        graph->options().graph_opt_level = 0;
        auto d0 = opr::Host2DeviceCopy::make(*graph, host_x0),
             d1 = opr::Host2DeviceCopy::make(*graph, host_x1),
             d2 = opr::SharedDeviceTensor::make(*graph, *host_x2);
        auto y = opr::ConvBias::make(d1, d2, d0);
        y = opr::Reduce::make(y, {}, y.make_scalar(1));
        SymbolVar y_opt;
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_f16_io_f32_comp();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
        return y_opt;
    };
    auto make_f16_graph = [&]() {
        auto d0 = opr::TypeCvt::make(
                     opr::TypeCvt::make(
                             opr::Host2DeviceCopy::make(*graph, host_x0),
                             dtype::Float16{}),
                     dtype::Float32{}),
             d1 = opr::TypeCvt::make(
                     opr::TypeCvt::make(
                             opr::Host2DeviceCopy::make(*graph, host_x1),
                             dtype::Float16{}),
                     dtype::Float32{}),
             d2 = opr::TypeCvt::make(
                     opr::TypeCvt::make(
                             opr::SharedDeviceTensor::make(*graph, *host_x2),
                             dtype::Float16{}),
                     dtype::Float32{});
        auto y = opr::ConvBias::make(d1, d2, d0);
        y = opr::Reduce::make(y, {}, y.make_scalar(1));
        y = opr::TypeCvt::make(opr::TypeCvt::make(y, dtype::Float16{}),
                               dtype::Float32{});
        return y;
    };
    auto y_opt = make_f32_to_f16_graph();
    auto y = make_f16_graph();
    ASSERT_EQ(find_opr<opr::ConvBias>(y_opt).param().compute_mode,
              opr::ConvBias::Param::ConvBias::ComputeMode::FLOAT32);
    ASSERT_EQ(y_opt.dtype(), dtype::Float32{});
    ASSERT_EQ(y.dtype(), dtype::Float32{});
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}

TEST(TestGoptInference, Float32TOFloat16EndpointElemwise) {
    CompNode cn = CompNode::load("cpu0");
    HostTensorGenerator<> gen(0, 1, 0);
    auto host_x0 = gen({1, 4, 16, 8}, cn), host_x1 = gen({2, 3, 16, 8}, cn),
         host_x2 = gen({4, 3, 1, 1}, cn);
    auto graph = ComputingGraph::make();
    auto make_f32_to_f16_graph = [&]() {
        graph->options().graph_opt_level = 0;
        auto d0 = opr::Host2DeviceCopy::make(*graph, host_x0),
             d1 = opr::Host2DeviceCopy::make(*graph, host_x1),
             d2 = opr::SharedDeviceTensor::make(*graph, *host_x2);
        auto b = opr::Convolution::make(d1, d2, {}, {});
        auto y = d0 + b;
        SymbolVar y_opt;
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_f16_io_comp();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
        return y_opt;
    };
    auto make_f16_graph = [&]() {
        auto d0 = opr::TypeCvt::make(
                     opr::Host2DeviceCopy::make(*graph, host_x0),
                     dtype::Float16{}),
             d1 = opr::TypeCvt::make(
                     opr::Host2DeviceCopy::make(*graph, host_x1),
                     dtype::Float16{}),
             d2 = opr::TypeCvt::make(
                     opr::SharedDeviceTensor::make(*graph, *host_x2),
                     dtype::Float16{});
        auto b = opr::Convolution::make(d1, d2, {}, {});
        SymbolVar y = d0 + b;
        y = opr::TypeCvt::make(y, dtype::Float32{});
        return y;
    };
    auto y_opt = make_f32_to_f16_graph();
    auto y = make_f16_graph();
    ASSERT_EQ(y_opt.dtype(), dtype::Float32{});
    ASSERT_EQ(y.dtype(), dtype::Float32{});
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}

TEST(TestGoptInference, Float32TOFloat16Linspace) {
    CompNode cn = CompNode::load("cpu0");
    HostTensorGenerator<> gen(0, 1, 0);
    auto host_x = gen({3, 1}, cn);
    auto graph = ComputingGraph::make();
    auto make_f32_to_f16_graph = [&]() {
        graph->options().graph_opt_level = 0;
        auto x = opr::Host2DeviceCopy::make(*graph, host_x);
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto lin = opr::Linspace::make(cv(0), sub(0) - 1, sub(0), {}, {});
        auto shp = opr::Concat::make({sub(1), sub(0)}, 0);
        auto y = opr::Reshape::make(lin, shp);
        auto mm = opr::MatrixMul::make(x, y);
        SymbolVar mm_opt;
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_f16_io_comp();
        unpack_vector(gopt::optimize_for_inference({mm}, options), mm_opt);
        return mm_opt;
    };
    auto make_f16_graph = [&]() {
        auto x = opr::TypeCvt::make(opr::Host2DeviceCopy::make(*graph, host_x),
                                    dtype::Float16());
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto lin = opr::Linspace::make(cv(0), sub(0) - 1, sub(0), {}, {});
        lin = opr::TypeCvt::make(lin, dtype::Float16());
        auto shp = opr::Concat::make({sub(1), sub(0)}, 0);
        auto y = opr::Reshape::make(lin, shp);
        auto mm = opr::MatrixMul::make(x, y);
        mm = opr::TypeCvt::make(mm, dtype::Float32{});
        return mm;
    };
    auto y_opt = make_f32_to_f16_graph();
    auto y = make_f16_graph();
    ASSERT_EQ(y_opt.dtype(), dtype::Float32{});
    ASSERT_EQ(y.dtype(), dtype::Float32{});
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}

TEST(TestGoptInference, Float32TOFloat16Endpoints) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    graph->options().graph_opt_level = 0;
    opr::Convolution::Param param;
    param.pad_h = param.pad_w = 0;
    auto x = mkvar("x", {8, 8, 8, 8}), y = mkvar("y", {8, 8, 8, 8}),
         w = mkcvar("w", {4, 8, 3, 3}),
         z = opr::Convolution::make(x + y, w, param);
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_f16_io_f32_comp();
    SymbolVarArray out = gopt::optimize_for_inference({x + y, z}, options);
    ASSERT_EQ(out[0].dtype(), dtype::Float32());
    ASSERT_EQ(out[1].dtype(), dtype::Float32());
    ASSERT_EQ(out[0].node()->owner_opr()->input(0)->dtype(), dtype::Float16());
    ASSERT_EQ(out[1].node()->owner_opr()->input(0)->dtype(), dtype::Float16());
}

TEST(TestGoptInference, ConvertFormatNHWCD4) {
    // hwcd4 is only supported in naive handle
    NaiveMegDNNHandleScope naive_megdnn_handle;
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto host_x = gen({8, 8, 8, 8}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    opr::Convolution::Param param;
    param.pad_h = param.pad_w = 0;
    auto w1 = mkcvar("w1", {4, 8, 3, 3}),
         conv = opr::Convolution::make(x, w1, param);
    auto shape_of = opr::GetVarShape::make(conv);
    auto subtensor = opr::Subtensor::make(
            shape_of, {opr::Subtensor::AxisIndexer::make_interval(
                              0, x.make_scalar(2), None, x.make_scalar(1))});
    opr::Resize::Param param_resize;
    param_resize.format = opr::Resize::Param::Format::NCHW;
    auto resize = opr::ResizeForward::make(conv, subtensor * 2, param_resize);
    auto mat = mkcvar("mat", {8, 3, 3}),
         warp = opr::WarpPerspectiveForward::make(
                 resize, mat, nullptr, cg::var_from_tensor_shape(x, {4, 4}));
    auto b = mkvar("b", {1, 4, 1, 1}),
         elem = opr::Elemwise::make({warp + b},
                                    opr::Elemwise::Param::Mode::RELU);
    param.pad_h = param.pad_w = 1;
    auto w2 = mkcvar("w2", {4, 4, 3, 3}),
         y = opr::Convolution::make(elem, w2, param),
         z = opr::AxisAddRemove::make(
                 y, {opr::AxisAddRemove::AxisDesc::make_add(0)});
    SymbolVar y_opt, z_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nhwcd4();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    unpack_vector(gopt::optimize_for_inference({z}, options), z_opt);
    ASSERT_EQ(opr::Convolution::Param::Format::NHWCD4,
              find_opr<opr::Convolution>(y_opt).param().format);
    ASSERT_EQ(TensorFormat::Type::DEFAULT,
              find_opr<opr::AxisAddRemove>(z_opt).input(0)->format().type());
    ASSERT_EQ(4, find_opr<opr::AxisAddRemove>(z_opt).input(0)->shape().ndim);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.ConvertFormatNHWCD4.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
    *host_x = *gen({8, 8, 16, 16}, cn);
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
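
// NHWCD4 packs an NCHW tensor as (N, H, C/4, W, 4): channels are split into
// groups of four that become the innermost axis. For the {8, 8, 8, 8} input
// above this corresponds to an {8, 8, 2, 8, 4} layout. Operators that have no
// NHWCD4 implementation keep the default format, which is what the
// AxisAddRemove assertions verify.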

TEST(TestGoptInference, ConvertFormatNHWCD4Elemwise) {
    // hwcd4 is only supported in naive handle
    NaiveMegDNNHandleScope naive_megdnn_handle;
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto host_x = gen({8, 8, 8, 8}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    opr::Convolution::Param param;
    param.pad_h = param.pad_w = 0;
    auto w1 = mkcvar("w1", {8, 8, 3, 3}),
         conv = opr::Convolution::make(x, w1, param);
    auto b = mkvar("b", {1, 1, 1, 1}),
         elem = opr::Elemwise::make({conv + b},
                                    opr::Elemwise::Param::Mode::RELU);
    param.pad_h = param.pad_w = 1;
    auto w2 = mkcvar("w2", {8, 8, 3, 3}),
         conv2 = opr::Convolution::make(elem, w2, param);
    auto b_scalar = mkvar("b", {1}), elem2 = conv2 + b_scalar;
    param.pad_h = param.pad_w = 1;
    auto w3 = mkcvar("w3", {8, 8, 3, 3}),
         y = opr::Convolution::make(elem2, w3, param);
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nhwcd4();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(opr::Convolution::Param::Format::NHWCD4,
              find_opr<opr::Convolution>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.ConvertFormatNHWCD4Elemwise.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
    *host_x = *gen({8, 8, 16, 16}, cn);
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}

TEST(TestGoptInference, ConvertFormatNHWCD4LOCAL) {
    // hwcd4 is only supported in naive handle
    NaiveMegDNNHandleScope naive_megdnn_handle;
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto host_x = gen({2, 8, 8, 16}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    opr::Convolution::Param param;
    param.pad_h = param.pad_w = 1;
    auto w1 = mkcvar("w1", {4, 8, 3, 3}),
         conv1 = opr::Convolution::make(x, w1, param);
    auto w2 = mkcvar("w2", {8, 16, 4, 3, 3, 4}),
         local = opr::Local::make(conv1, w2, param);
    auto w3 = mkcvar("w3", {4, 4, 3, 3}),
         conv2 = opr::Convolution::make(local, w3, param);
    opr::GroupLocal::Param param_group_local;
    param_group_local.pad_h = param_group_local.pad_w = 1;
    auto w4 = mkcvar("w4", {2, 8, 16, 2, 3, 3, 2}),
         group_local = opr::GroupLocal::make(conv2, w4, param_group_local);
    auto w5 = mkcvar("w5", {4, 4, 3, 3}),
         y = opr::Convolution::make(group_local, w5, param);
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nhwcd4();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(opr::Convolution::Param::Format::NHWCD4,
              find_opr<opr::Convolution>(y_opt).param().format);
    ASSERT_EQ(opr::Local::Param::Format::NCHW,
              find_opr<opr::Local>(y_opt).param().format);
    ASSERT_EQ(opr::GroupLocal::Param::Format::NCHW,
              find_opr<opr::GroupLocal>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.ConvertFormatNHWCD4LOCAL.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
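
// Local and GroupLocal have no NHWCD4 kernels, so the pass is expected to
// leave them in NCHW (with layout conversions inserted around them) while the
// plain Convolution operators are rewritten to NHWCD4; the three format
// assertions above check exactly that split.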

TEST(TestGoptInference, ConvertFormatNHWCD4Deconv) {
    // hwcd4 is only supported in naive handle
    NaiveMegDNNHandleScope naive_megdnn_handle;
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto host_x = gen({8, 8, 8, 8}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    opr::Convolution::Param param;
    param.pad_h = param.pad_w = 0;
    auto w0 = mkcvar("w0", {4, 8, 2, 2}),
         conv = opr::Convolution::make(x, w0, param);
    auto w1 = mkcvar("w1", {4, 1, 2, 2}),
         y = opr::ConvolutionBackwardData::make(w1, conv, param, {}, {});
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nhwcd4();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW,
              find_opr<opr::ConvolutionBackwardData>(y_opt).param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NHWCD4,
              find_opr<opr::Convolution>(y_opt).param().format);
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
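
// Same idea as the Local test above: ConvolutionBackwardData (deconvolution)
// is expected to stay in NCHW, while the preceding forward Convolution is
// still converted to NHWCD4.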

TEST(TestGoptInference, ConvertFormatNHWCD4Qint8) {
    // hwcd4 is only supported in naive handle
    NaiveMegDNNHandleScope naive_megdnn_handle;
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto host_x = gen({8, 8, 8, 8}, cn);
    auto _x = opr::Host2DeviceCopy::make(*graph, host_x),
         x = opr::TypeCvt::make(_x, dtype::QuantizedS8(0.2f));
    opr::ConvBias::Param param;
    param.pad_h = param.pad_w = 0;
    auto w = mkcvar("w", {4, 8, 3, 3}, dtype::QuantizedS8(0.1f)),
         b = mkcvar("b", {1, 4, 1, 1}, dtype::QuantizedS32(0.02f)),
         y = opr::ConvBias::make(x, w, b, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(0.2f)});
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nhwcd4();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(opr::ConvBias::Param::Format::NHWCD4,
              find_opr<opr::ConvBias>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.ConvertFormatNHWCD4Qint8.json"));
    auto float_y = opr::TypeCvt::make(y, dtype::Float32()),
         float_y_opt = opr::TypeCvt::make(y_opt, dtype::Float32());
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(float_y, host_y),
                                make_callback_copy(float_y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}

TEST(TestGoptInference, ConvertFormatPadIC) {
    // hwcd4 is only supported in naive handle
    NaiveMegDNNHandleScope naive_megdnn_handle;
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto host_inp1 = gen({1, 6, 128, 128}, cn),
         host_inp2 = gen({1, 6, 256, 256}, cn);
    auto inp1 = opr::Host2DeviceCopy::make(*graph, host_inp1),
         inp2 = opr::Host2DeviceCopy::make(*graph, host_inp2);
    auto shape_tmp = mkcvar("tmp", {256, 256});
    auto shape_of = opr::GetVarShape::make(shape_tmp);
    opr::Resize::Param param_resize;
    param_resize.format = opr::Resize::Param::Format::NCHW;
    auto resize = opr::ResizeForward::make(inp1, shape_of, param_resize);
    auto concat = opr::Concat::make({inp2, resize}, 1);
    opr::Convolution::Param param;
    param.pad_h = param.pad_w = 1;
    param.sparse = opr::Convolution::Param::Sparse::DENSE;
    auto w1 = mkcvar("w1", {12, 12, 3, 3});
    auto y = opr::Convolution::make(concat, w1, param);
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nhwcd4();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}

TEST(TestGoptInference, ConvertBatchNormPass) {
    auto cn = CompNode::load("cpu0");
    HostTensorGenerator<> gen(0, 1, 0);
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    using Param = opr::BatchNorm::Param;
    Param param(Param::ParamDim::DIM_1C11, Param::FwdMode::INFERENCE);
    TensorShape shp = {1, 3, 1, 1};
    auto x = mkvar("x", {2, 3, 16, 24}), scale = mkcvar("scale", shp),
         bias = mkcvar("bias", shp), mean = mkcvar("mean", shp);
    auto host_variance = gen(shp, cn);
    for (size_t i = 0; i < shp.total_nr_elems(); ++i) {
        host_variance->ptr<float>()[i] =
                std::abs(host_variance->ptr<float>()[i]);
    }
    auto variance = opr::SharedDeviceTensor::make(*graph, *host_variance)
                            .rename("variance");
    auto y = opr::BatchNorm::make(x, scale, bias, mean, variance, param)[4];
    SymbolVar y_opt;
    unpack_vector(gopt::optimize_for_inference(
                          {y}, gopt::OptimizeForInferenceOptions{}),
                  y_opt);
    ASSERT_EQ(0u, find_opr_num<opr::BatchNorm>(y_opt));
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.ConvertBatchNormPass.json"));
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
}
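
// BatchNorm can be removed in inference mode because, once the statistics are
// frozen, the whole normalization is an affine function of x:
//     y = scale * (x - mean) / sqrt(variance + eps) + bias
//       = k * x + b',  with k = scale / sqrt(variance + eps),
//                           b' = bias - k * mean,
// so it folds into an elementwise multiply-add. The
// find_opr_num<opr::BatchNorm>(y_opt) == 0 assertion verifies that no
// BatchNorm operator survives the pass.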

TEST(TestGoptInference, ConvBiasNonlinearityFusePass) {
    // hwcd4 is only supported in naive handle
    NaiveMegDNNHandleScope naive_megdnn_handle;
    auto cn = CompNode::load("cpu0");
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    opr::Convolution::Param param;
    auto x = mkvar("x", {5, 8, 16, 24}), w1 = mkcvar("w1", {4, 8, 1, 1}),
         w2 = mkcvar("w2", {4, 4, 3, 3}), b1 = mkcvar("b1", {1, 4, 1, 1}),
         b2 = mkcvar("b2", {1, 4, 1, 1}), w3 = mkcvar("w3", {8, 4, 1, 1}),
         y_cut = opr::Convolution::make(x, w1, param),
         y1 = opr::Elemwise::make({y_cut + b1},
                                  opr::Elemwise::Param::Mode::RELU);
    param.pad_w = param.pad_h = 1;
    auto y2 = opr::Elemwise::make({opr::Convolution::make(y1, w2, param) + b2},
                                  opr::Elemwise::Param::Mode::SIGMOID);
    param.pad_w = param.pad_h = 0;
    auto y3 = opr::Convolution::make(y2, w3, param), y_tmp = y3 + x,
         y_expand =
                 opr::Elemwise::make({y_cut}, opr::Elemwise::Param::Mode::RELU),
         y_y = opr::Convolution::make(y_expand, w3, param), y = y_y + y_tmp;
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nhwcd4().enable_fuse_conv_bias_nonlinearity();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(3u, find_opr<opr::ConvBias>(y_opt).input().size());
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.FuseConvBiasNonlinPass.json"));
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-4);
}
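
// A fused ConvBias with three inputs corresponds to (x, w, b), i.e. the
// Convolution + bias-add + activation chain collapsed into a single operator;
// a four-input ConvBias would additionally carry a z term for residual adds
// (see FuseConvBiasZPass below).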

TEST(TestGoptInference, ConvBiasNonlinearityFusePass_FullBias) {
    NaiveMegDNNHandleScope naive_megdnn_handle;
    for (int i = 0; i < 2; i++) {
        auto graph = ComputingGraph::make();
        auto cn = CompNode::load("cpu0");
        HostTensorGenerator<> gen;
        auto mkImvar = [&](const char* name, const TensorShape& shp) {
            return opr::ImmutableTensor::make(*graph, *gen(shp, cn))
                    .rename(name);
        };
        graph->options().graph_opt_level = 0;
        auto mkcvar = [&](const char* name, const TensorShape& shp) {
            return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                    .rename(name);
        };
        opr::Convolution::Param param;
        auto host_x = gen({1, 8, 16, 24}, cn);
        auto x = opr::Host2DeviceCopy::make(*graph, host_x),
             w1 = mkcvar("w1", {4, 8, 1, 1}), w2 = mkcvar("w2", {4, 8, 3, 3}),
             w3 = mkcvar("w3", {4, 4, 1, 1}),
             b = i == 0 ? mkcvar("b", {1, 4, 16, 24})
                        : mkImvar("bias", {1, 4, 16, 24}),
             y_cut0 = opr::Convolution::make(x, w1, param);
        param.pad_w = param.pad_h = 1;
        auto y_cut1 = opr::Convolution::make(x, w2, param);
        auto y1 = opr::Elemwise::make({y_cut0 + y_cut1},
                                      opr::Elemwise::Param::Mode::RELU);
        param.pad_w = param.pad_h = 0;
        auto y2 = opr::Convolution::make(y1, w3, param);
        auto y =
                opr::Elemwise::make({y2 + b}, opr::Elemwise::Param::Mode::RELU);
        SymbolVar y_opt;
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
        ASSERT_EQ(3u, find_opr<opr::ConvBias>(y_opt).input().size());
        graph->compile({{y_opt, {}}})
                ->to_json()
                ->writeto_fpath(
                        output_file("TestGoptInference.FuseConvBiasNonlinPass_"
                                    "FulBias.json"));
        HostTensorND host_y, host_y_opt;
        auto func = graph->compile({make_callback_copy(y, host_y),
                                    make_callback_copy(y_opt, host_y_opt)});
        func->execute();
        MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-4);
        *host_x = *gen({4, 8, 16, 24}, cn);
        func->execute();
        MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-4);
    }
}

TEST(TestGoptInference, ParamMerge) {
    auto cns = load_multiple_xpus(2);
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto var0 = opr::SharedDeviceTensor::make(*graph, *gen({2, 3}, cns[0])),
         var1 = opr::SharedDeviceTensor::make(*graph, *gen({1, 3}, cns[1])),
         y = var0 + opr::Copy::make(var1, {cns[0]});
    HostTensorND y_expected_val;
    graph->compile({make_callback_copy(y, y_expected_val)})->execute();
    SymbolVar y_opt;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamMergePass>()
                          .apply({{y}})
                          .endpoint_vars(),
                  y_opt);
    auto opr = y_opt.node()->owner_opr();
    ASSERT_EQ(2u, opr->input().size());
    ASSERT_EQ(2u,
              find_opr<opr::MultipleDeviceTensorHolder>(y_opt).output().size());
    HostTensorND y_got_val;
    graph->compile({make_callback_copy(y_opt, y_got_val)})->execute();
    MGB_ASSERT_TENSOR_EQ(y_expected_val, y_got_val);
}
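
// ParamMergePass replaces the individual SharedDeviceTensor parameters with a
// single MultipleDeviceTensorHolder whose outputs carry the original values
// (two outputs here, one per parameter), reducing per-parameter operator
// overhead in models with many weights; the numeric result must be unchanged.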

TEST(TestGoptInference, ParamMergeFormat) {
    auto cns = load_multiple_xpus(2);
    auto make_dv = [](const HostTensorND& hv) {
        TensorLayout layout{hv.layout(), hv.layout().dtype,
                            megdnn::Image2DPack4TensorFormat::make_raw(1, 64)};
        auto ret = std::make_shared<DeviceTensorND>(hv.comp_node(), layout);
        ret->copy_from_fixlayout(hv).sync();
        return ret;
    };
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto var0 = opr::SharedDeviceTensorWithFormat::make(
                 *graph, make_dv(*gen({2, 32}, cns[0]))),
         var1 = opr::SharedDeviceTensorWithFormat::make(
                 *graph, make_dv(*gen({1, 32}, cns[1]))),
         y = var0 + opr::Copy::make(var1, {cns[0]});
    HostTensorND y_expected_val;
    graph->compile({make_callback_copy(y, y_expected_val)})->execute();
    SymbolVar y_opt;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamMergePass>()
                          .apply({{y}})
                          .endpoint_vars(),
                  y_opt);
    auto opr = y_opt.node()->owner_opr();
    ASSERT_EQ(2u, opr->input().size());
    ASSERT_EQ(2u, find_opr<opr::MultipleDeviceTensorWithFormatHolder>(y_opt)
                          .output()
                          .size());
    HostTensorND y_got_val;
    graph->compile({make_callback_copy(y_opt, y_got_val)})->execute();
    MGB_ASSERT_TENSOR_EQ(y_expected_val, y_got_val);
}

#if MGB_ENABLE_FASTRUN
TEST(TestGoptInference, AlgoProfile) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto host_x = gen({4, 3, 8, 9}), host_y = gen({2, 3, 3, 3});
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::Host2DeviceCopy::make(*graph, host_y),
         z = opr::Convolution::make(x, y);
    auto&& conv = z.node()->owner_opr()->cast_final_safe<opr::Convolution>();
    using S = opr::Convolution::ExecutionPolicy::Strategy;
    ASSERT_EQ(S::HEURISTIC, conv.execution_policy_transient().strategy);
    gopt::enable_opr_algo_profiling_inplace({z + 2.3f});
    ASSERT_EQ(S::PROFILE, conv.execution_policy().strategy);
}
#endif

TEST(TestGoptInference, ProfileCache) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto host_x = gen({4, 3, 8, 9}), host_y = gen({2, 3, 3, 3});
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::Host2DeviceCopy::make(*graph, host_y),
         z = opr::Convolution::make(x, y);
    auto&& conv = z.node()->owner_opr()->cast_final_safe<opr::Convolution>();
    using S = opr::Convolution::ExecutionPolicy::Strategy;
    ASSERT_EQ(S::HEURISTIC, conv.execution_policy_transient().strategy);
    gopt::enable_opr_use_profiling_cache_inplace({z + 2.3f});
    ASSERT_EQ(S::PROFILE_HEURISTIC, conv.execution_policy().strategy);
}
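
// As the strategy names suggest: HEURISTIC picks an algorithm by rules,
// PROFILE benchmarks candidates when the function is compiled, and
// PROFILE_HEURISTIC presumably reuses cached profiling results when available,
// falling back to the heuristic choice otherwise.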

TEST(TestGoptInference, AlgoWorkspaceLimit) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto host_x = gen({4, 3, 8, 9}), host_y = gen({2, 3, 3, 3});
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::Host2DeviceCopy::make(*graph, host_y),
         z = opr::Convolution::make(x, y);
    auto&& conv = z.node()->owner_opr()->cast_final_safe<opr::Convolution>();
    ASSERT_EQ(std::numeric_limits<uint64_t>::max(),
              conv.execution_policy_transient().workspace_limit);
    gopt::set_opr_algo_workspace_limit_inplace({z + 2.3f}, 10000u);
    ASSERT_EQ(10000u, conv.execution_policy().workspace_limit);
}
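
// The workspace limit defaults to "unlimited" (max uint64).
// set_opr_algo_workspace_limit_inplace caps it for every operator with an
// execution policy reachable from the given endpoints; algorithms needing
// more scratch memory than the cap are then excluded from selection.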

TEST_PASS(FuseConvBiasNonlinPass, Basic) {
    auto cn = CompNode::load("xpux");
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    for (auto format : {opr::Convolution::Param::Format::NCHW,
                        opr::Convolution::Param::Format::NHWC,
                        opr::Convolution::Param::Format::NCHW4}) {
        opr::Convolution::Param param;
        param.format = format;
        SymbolVar x, w, b;
        if (format == opr::Convolution::Param::Format::NHWC) {
            x = mkvar("x", {20, 20, 20, 4}, dtype::QuantizedS8(2.5f)),
            w = mkcvar("w1", {24, 1, 1, 4}, dtype::QuantizedS8(2.5f)),
            b = mkcvar("b", {1, 1, 1, 24}, dtype::QuantizedS32(6.25f));
        } else if (format == opr::Convolution::Param::Format::NCHW) {
            x = mkvar("x", {20, 4, 20, 20}, dtype::QuantizedS8(2.5f)),
            w = mkcvar("w1", {24, 4, 1, 1}, dtype::QuantizedS8(2.5f)),
            b = mkcvar("b", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f));
        } else {
            mgb_assert(format == opr::Convolution::Param::Format::NCHW4);
            x = mkvar("x", {20, 1, 20, 20, 4}, dtype::QuantizedS8(2.5f)),
            w = mkcvar("w1", {24, 1, 1, 1, 4}, dtype::QuantizedS8(2.5f)),
            b = mkcvar("b", {1, 6, 1, 1, 4}, dtype::QuantizedS32(6.25f));
        }
        auto y = opr::Convolution::make(x, w, param);
        y = opr::Elemwise::make({y + b}, opr::Elemwise::Param::Mode::RELU);
        y = opr::TypeCvt::make(y, dtype::QuantizedS8(2.5f));
        opr::ConvBias::Param conv_bias_param;
        conv_bias_param.format = format;
        conv_bias_param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
        auto concrete_y = opr::ConvBias::make(
                x, w, b, conv_bias_param, {},
                OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        check(concrete_y, y);
    }
}

#if MGB_CUDA
TEST(TestEnableTensorCore, SmallInputShape) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 75) {
        printf("This test case is ignored due to insufficient CUDA compute "
               "capability (got: %d, expected: %d)\n",
               sm_ver, 75);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto x = mkvar("x", {32, 16, 4, 8, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
         z = mkcvar("b1", {32, 16, 2, 4, 4}, dtype::QuantizedS8(2.5f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(x, w, b, z, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y = opr::ConvBias::make(y, w, b, param, {},
                            OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y = opr::TypeCvt::make(y, dtype::Float32());
    SymbolVar y_opt;
    SymbolVar y_no_tc;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw32().enable_fuse_conv_bias_nonlinearity();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc);
    }
    auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
    ASSERT_EQ(2u, nr_dimshuffle);
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y_no_tc, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
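
// enable_nchw32() converts eligible quantized ConvBias operators to the
// NCHW32 layout used by int8 TensorCore kernels (hence the sm_ver >= 75
// gate). The Dimshuffle count checks that only the expected NCHW4 <-> NCHW32
// relayouts are inserted at the boundaries of the converted region.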

//! closed for the cu111 CI; reopen once the bug is fixed
#if 0
TEST(TestEnableTensorCore, Nchw4Nchw) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 75) {
        printf("This test case is ignored due to insufficient CUDA compute "
               "capability (got: %d, expected: %d)\n",
               sm_ver, 75);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto mkshape = [](opr::ConvBias::Param::Format format, size_t N, size_t C,
                      size_t H, size_t W) -> TensorShape {
        mgb_assert(C % 4 == 0);
        if (format == opr::ConvBias::Param::Format::NCHW4) {
            return {N, C / 4, H, W, 4};
        } else {
            mgb_assert(format == opr::ConvBias::Param::Format::NCHW);
            return {N, C, H, W};
        }
    };
    for (auto format : {opr::ConvBias::Param::Format::NCHW,
                        opr::ConvBias::Param::Format::NCHW4}) {
        auto x = mkvar("x", mkshape(format, 32, 64, 16, 16),
                       dtype::QuantizedS8(2.5f)),
             w = mkcvar("w1", mkshape(format, 64, 64, 3, 3),
                        dtype::QuantizedS8(2.5f)),
             b = mkcvar("b", mkshape(format, 1, 64, 1, 1),
                        dtype::QuantizedS32(6.25f)),
             z = mkcvar("b1", mkshape(format, 32, 64, 8, 8),
                        dtype::QuantizedS8(2.5f));
        opr::ConvBias::Param param;
        param.format = format;
        param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
        param.stride_h = param.stride_w = 2;
        param.pad_h = param.pad_w = 1;
        auto y = opr::ConvBias::make(
                x, w, b, z, param, {},
                OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        y = opr::ConvBias::make(y, w, b, param, {},
                                OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        y = opr::TypeCvt::make(y, dtype::Float32());
        SymbolVar y_opt;
        SymbolVar y_no_tc;
        {
            auto options = gopt::OptimizeForInferenceOptions{};
            options.enable_nchw32().enable_fuse_conv_bias_nonlinearity();
            unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
        }
        {
            auto options = gopt::OptimizeForInferenceOptions{};
            options.enable_fuse_conv_bias_nonlinearity();
            unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc);
        }
        auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
        std::string json_name;
        ASSERT_EQ(2u, nr_dimshuffle);
        if (format == opr::ConvBias::Param::Format::NCHW4) {
            json_name = "TestGoptInference.Nchw4Nchw.NCHW4.json";
        } else {
            mgb_assert(format == opr::ConvBias::Param::Format::NCHW);
            json_name = "TestGoptInference.Nchw4Nchw.NCHW.json";
        }
        graph->compile({{y_opt, {}}})
                ->to_json()
                ->writeto_fpath(output_file(json_name.c_str()));
        HostTensorND host_y, host_y_opt;
        auto func = graph->compile({make_callback_copy(y_no_tc, host_y),
                                    make_callback_copy(y_opt, host_y_opt)});
        func->execute();
        MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
    }
}
#endif

//! closed for the cu111 CI; reopen once the bug is fixed
#if 0
TEST(TestEnableTensorCore, ConvBiasWithZ) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 75) {
        printf("This test case is ignored due to insufficient CUDA compute "
               "capability (got: %d, expected: %d)\n",
               sm_ver, 75);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
         z = mkvar("b1", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(x, w, b, z, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y = opr::TypeCvt::make(y, dtype::Float32());
    SymbolVar y_opt;
    SymbolVar y_no_tc;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity().enable_nchw32();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc);
    }
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y_no_tc, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
#endif

//! closed for the cu111 CI; reopen once the bug is fixed
#if 0
TEST(TestEnableTensorCore, Pooling) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 75) {
        printf("This test case is ignored due to insufficient CUDA compute "
               "capability (got: %d, expected: %d)\n",
               sm_ver, 75);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
         z = mkvar("b1", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(x, w, b, z, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    opr::Pooling::Param pool_param;
    pool_param.format = opr::Pooling::Param::Format::NCHW4;
    y = opr::Pooling::make(y, pool_param);
    y = opr::TypeCvt::make(y, dtype::Float32());
    SymbolVar y_opt;
    SymbolVar y_no_tc;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity().enable_nchw32();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    ASSERT_EQ(opr::Pooling::Param::Format::NCHW32,
              find_opr<opr::Pooling>(y_opt).param().format);
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc);
    }
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y_no_tc, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
#endif

TEST(TestGoptInference, EnableTensorCore) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 75) {
        printf("This test case is ignored due to insufficient CUDA compute "
               "capability (got: %d, expected: %d)\n",
               sm_ver, 75);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
         b1 = mkvar("b1", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f));
    opr::Convolution::Param param;
    param.format = opr::Convolution::Param::Format::NCHW4;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    auto y = opr::Convolution::make(x, w, param);
    y = opr::Elemwise::make({y + b}, opr::Elemwise::Param::Mode::RELU);
    y = opr::TypeCvt::make(y, dtype::QuantizedS8(2.5f));
    auto y1 = y + b1, y2 = opr::Convolution::make(y, w, param),
         y3 = opr::Elemwise::make({y - b1}, opr::Elemwise::Param::Mode::RELU);
    y2 = opr::Elemwise::make({y2 + b}, opr::Elemwise::Param::Mode::RELU),
    y2 = opr::TypeCvt::make(y2, dtype::QuantizedS8(2.5f));
    auto y4 = y1 + y2 + y3;
    y4 = opr::TypeCvt::make(y4, dtype::Float32());
    SymbolVar y_opt;
    SymbolVar y_no_tc;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity().enable_nchw32();
        unpack_vector(gopt::optimize_for_inference({y4}, options), y_opt);
    }
    {
        // note: no enable_nchw32() here, so y_no_tc really is the
        // non-TensorCore reference graph
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity();
        unpack_vector(gopt::optimize_for_inference({y4}, options), y_no_tc);
    }
    auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
    ASSERT_EQ(3u, nr_dimshuffle);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.EnableTensorCorePass.json"));
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y_no_tc, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
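
// Unlike NCHW4 (channels packed by 4), NCHW32 packs 32 channels into the
// innermost axis to match the int8 TensorCore tile shape. The three remaining
// Dimshuffle operators counted above are expected to come from the relayouts
// at the boundaries of the converted region that the pass cannot eliminate.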

//! closed for the cu111 CI; reopen once the bug is fixed
#if 0
TEST(FuseConvBiasZPass, BlockFuse) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This test case is ignored due to insufficient CUDA compute "
               "capability (got: %d, expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode;
    using NonlineMode = opr::ConvBias::Param::NonlineMode;
    for (auto mode :
         {ElemMultiMode::QFUSE_ADD_RELU, ElemMultiMode::QFUSE_ADD_H_SWISH}) {
        auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
             w1 = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
             b1 = mkcvar("b1", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
             w2 = mkcvar("w2", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
             b2 = mkcvar("b2", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
             w3 = mkcvar("w3", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
             b3 = mkcvar("b3", {1, 16, 1, 1, 4}, dtype::QuantizedS32(3.0f));
        NonlineMode nonline_mode = NonlineMode::RELU;
        if (mode == ElemMultiMode::QFUSE_ADD_H_SWISH) {
            nonline_mode = NonlineMode::H_SWISH;
        }
        opr::ConvBias::Param param;
        param.format = opr::Convolution::Param::Format::NCHW4;
        param.nonlineMode = nonline_mode;
        param.stride_h = param.stride_w = 1;
        param.pad_h = param.pad_w = 1;
        auto y1 = opr::ConvBias::make(
                x, w1, b1, param, {},
                OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
        auto y2 = opr::ConvBias::make(
                     y1, w2, b2, param, {},
                     OperatorNodeConfig{dtype::QuantizedS8(2.5f)}),
             y3 = opr::ElemwiseMultiType::make(
                     {y1, y2}, {mode},
                     OperatorNodeConfig{dtype::QuantizedS8(1.2f)});
        param.nonlineMode = nonline_mode;
        auto y4 = opr::ConvBias::make(
                     y3, w3, b3, param, {},
                     OperatorNodeConfig{dtype::QuantizedS8(2.5f)}),
             z = opr::ElemwiseMultiType::make(
                     {y3, y4}, {opr::ElemwiseMultiType::Param::Mode::QADD},
                     OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        z = opr::TypeCvt::make(z, dtype::Float32());
        //! fuse z manually
        auto z0 = opr::ConvBias::make(
                x, w1, b1, param, {},
                OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        auto z1 = opr::ConvBias::make(
                     z0, w2, b2, z0, param, {},
                     OperatorNodeConfig{dtype::QuantizedS8(1.2f)}),
             z2 = opr::ConvBias::make(
                     z1, w3, b3, param, {},
                     OperatorNodeConfig{dtype::QuantizedS8(2.5f)}),
             z4 = opr::ElemwiseMultiType::make(
                     {z1, z2}, {opr::ElemwiseMultiType::Mode::QADD},
                     OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        z4 = opr::TypeCvt::make(z4, dtype::Float32());
        SymbolVar z_fuse;
        SymbolVar z_nonfuse;
        {
            auto options = gopt::OptimizeForInferenceOptions{};
            options.enable_fuse_conv_bias_nonlinearity()
                    .enable_fuse_conv_bias_with_z();
            unpack_vector(gopt::optimize_for_inference({z}, options), z_fuse);
        }
        {
            auto options = gopt::OptimizeForInferenceOptions{};
            options.enable_fuse_conv_bias_nonlinearity();
            unpack_vector(gopt::optimize_for_inference({z4}, options),
                          z_nonfuse);
        }
        auto nr_elem_multi_type =
                find_opr_num<mgb::opr::ElemwiseMultiType>(z_fuse);
        MGB_MARK_USED_VAR(nr_elem_multi_type);
        ASSERT_EQ(1u, nr_elem_multi_type);
        graph->compile({{z_fuse, {}}})
                ->to_json()
                ->writeto_fpath(
                        output_file("FuseConvBiasZPass.BlockFuse_fuse.json"));
        graph->compile({{z_nonfuse, {}}})
                ->to_json()
                ->writeto_fpath(output_file(
                        "FuseConvBiasZPass.BlockFuse_nonfuse.json"));
        HostTensorND host_z_fuse, host_z_nonfuse;
        auto func =
                graph->compile({make_callback_copy(z_nonfuse, host_z_nonfuse),
                                make_callback_copy(z_fuse, host_z_fuse)});
        func->execute();
        MGB_ASSERT_TENSOR_EQ(host_z_fuse, host_z_nonfuse);
    }
}
#endif

//! closed for the cu111 CI; reopen once the bug is fixed
#if 0
TEST(TestEnableTensorCore, ShuffleMerge) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 75) {
        printf("This test case is ignored due to insufficient CUDA compute "
               "capability (got: %d, expected: %d)\n",
               sm_ver, 75);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto nchw2nchw4 = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp = opr::Concat::make(
                {sub(0), sub(1) / 4, cv(4), sub(2), sub(3)}, 0);
        auto y0 = opr::Reshape::make(x, tshp);
        auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 4, 2});
        return y1;
    };
    auto nchw42nchw = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp = opr::Concat::make({sub(0), sub(1) * 4, sub(2), sub(3)}, 0);
        auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
        auto y1 = opr::Reshape::make(y0, tshp);
        return y1;
    };
    auto x = mkvar("x", {32, 64, 16, 16}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 64, 3, 3}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 64, 1, 1}, dtype::QuantizedS32(6.25f)),
         z = mkvar("b1", {32, 64, 16, 16}, dtype::QuantizedS8(2.5f));
    x = nchw2nchw4(x), w = nchw2nchw4(w), b = nchw2nchw4(b), z = nchw2nchw4(z);
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(x, w, b, z, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y = nchw42nchw(y);
    y = opr::TypeCvt::make(y, dtype::Float32());
    SymbolVar y_opt;
    SymbolVar y_no_tc;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity().enable_nchw32();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc);
    }
    auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
    ASSERT_EQ(3u, nr_dimshuffle);
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y_no_tc, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
#endif
#endif

TEST(FuseConvBiasZPass, Basic) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto format = opr::Convolution::Param::Format::NCHW4;
    auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
         b1 = mkvar("b1", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
         b2 = mkvar("b2", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f));
    opr::ConvBias::Param conv_bias_param;
    conv_bias_param.format = format;
    conv_bias_param.stride_h = conv_bias_param.stride_w = 1;
    conv_bias_param.pad_h = conv_bias_param.pad_w = 1;
    auto y = opr::ConvBias::make(x, w, b, conv_bias_param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    SymbolVar y_opt;
    // check fuse mode
    for (auto mode : {opr::ElemwiseMultiType::Param::Mode::QADD,
                      opr::ElemwiseMultiType::Param::Mode::QMUL,
                      opr::ElemwiseMultiType::Param::Mode::QFUSE_ADD_RELU}) {
        auto y1 = opr::ElemwiseMultiType::make(
                {y, b1}, {mode}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        {
            auto options = gopt::OptimizeForInferenceOptions{};
            options.enable_fuse_conv_bias_nonlinearity()
                    .enable_fuse_conv_bias_with_z()
                    .enable_nchw32();
            unpack_vector(gopt::optimize_for_inference({y1}, options), y_opt);
        }
        auto nr_elemwisemultitype = find_opr_num<opr::ElemwiseMultiType>(y_opt);
        if (mode == opr::ElemwiseMultiType::Param::Mode::QMUL) {
            ASSERT_NE(0u, nr_elemwisemultitype);
        } else {
            ASSERT_EQ(0u, nr_elemwisemultitype);
        }
        // fuse convbiasz and z
        if (mode == opr::ElemwiseMultiType::Param::Mode::QADD) {
            auto y2 = opr::ElemwiseMultiType::make(
                    {y1, b2}, {mode},
                    OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
            {
                auto options = gopt::OptimizeForInferenceOptions{};
                options.enable_fuse_conv_bias_nonlinearity()
                        .enable_fuse_conv_bias_with_z()
                        .enable_nchw32();
                unpack_vector(gopt::optimize_for_inference({y2}, options),
                              y_opt);
            }
            auto nr_elemwisemultitype =
                    find_opr_num<opr::ElemwiseMultiType>(y_opt);
            ASSERT_NE(0u, nr_elemwisemultitype);
        }
    }
}
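
// Only additive residual patterns (QADD, QFUSE_ADD_RELU) can be folded into
// the z input of ConvBias, so the ElemwiseMultiType count drops to zero for
// them. QMUL has no fused form and must remain, and a ConvBias whose z slot
// is already occupied cannot absorb a second addend, which is why the nested
// QADD case still leaves an ElemwiseMultiType behind.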
  2149. #if MGB_CUDA
  2150. //! close for cu111 ci, reopen it when bug fixed
  2151. #if 0
  2152. TEST(TestGoptInference, EnableCHWN4) {
  2153. REQUIRE_GPU(1);
  2154. auto cn = CompNode::load("gpu0");
  2155. cn.activate();
  2156. auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
  2157. auto sm_ver = prop.major * 10 + prop.minor;
  2158. if (sm_ver < 61) {
  2159. printf("This testcast ignored due to insufficient cuda cap(got: %d, "
  2160. "expected: %d)\n",
  2161. sm_ver, 61);
  2162. return;
  2163. }
  2164. HostTensorGenerator<dtype::Int8> gen;
  2165. auto graph = ComputingGraph::make();
  2166. graph->options().graph_opt_level = 0;
  2167. auto mkvar = [&](const char* name, const TensorShape& shp,
  2168. const DType& dtype) {
  2169. return opr::TypeCvt::make(
  2170. opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
  2171. dtype);
  2172. };
  2173. auto mkcvar = [&](const char* name, const TensorShape& shp,
  2174. const DType& dtype) {
  2175. return opr::TypeCvt::make(
  2176. opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  2177. .rename(name),
  2178. dtype);
  2179. };
  2180. auto mkshape = [](opr::ConvBias::Param::Format format, size_t N, size_t C,
  2181. size_t H, size_t W) -> TensorShape {
  2182. mgb_assert(C % 4 == 0);
  2183. if (format == opr::ConvBias::Param::Format::NCHW4) {
  2184. return {N, C / 4, H, W, 4};
  2185. } else {
  2186. mgb_assert(format == opr::ConvBias::Param::Format::NCHW);
  2187. return {N, C, H, W};
  2188. }
  2189. };
  2190. for (auto format : {opr::ConvBias::Param::Format::NCHW,
  2191. opr::ConvBias::Param::Format::NCHW4}) {
  2192. auto x = mkvar("x", mkshape(format, 32, 64, 16, 16),
  2193. dtype::QuantizedS8(2.5f)),
  2194. w = mkcvar("w1", mkshape(format, 64, 64, 3, 3),
  2195. dtype::QuantizedS8(2.5f)),
  2196. b = mkcvar("b", mkshape(format, 1, 64, 1, 1),
  2197. dtype::QuantizedS32(6.25f)),
  2198. b1 = mkvar("b1", mkshape(format, 32, 64, 16, 16),
  2199. dtype::QuantizedS8(2.5f));
  2200. opr::ConvBias::Param param;
  2201. param.format = format;
  2202. param.stride_h = param.stride_w = 1;
  2203. param.pad_h = param.pad_w = 1;
  2204. param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
  2205. auto y = opr::ConvBiasForward::make(
  2206. x, w, b, param, {},
  2207. OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
  2208. auto y1 = opr::ElemwiseMultiType::make(
  2209. {y, b1}, opr::ElemwiseMultiType::Mode::QFUSE_ADD_RELU,
  2210. OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
  2211. auto y2 = opr::ConvBiasForward::make(
  2212. y, w, b, param, {},
  2213. OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
  2214. auto y3 = opr::ElemwiseMultiType::make(
  2215. {y, b1}, opr::ElemwiseMultiType::Param::Mode::QSUB,
  2216. OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
  2217. auto y4 = opr::ElemwiseMultiType::make(
  2218. {y1, y2}, opr::ElemwiseMultiType::Param::Mode::QADD,
  2219. OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
  2220. y4 = opr::ElemwiseMultiType::make(
  2221. {y3, y4}, opr::ElemwiseMultiType::Param::Mode::QADD,
  2222. OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
  2223. y4 = opr::TypeCvt::make(y4, dtype::Float32());
        SymbolVar y_opt;
        SymbolVar y_cudnn;
        {
            auto options = gopt::OptimizeForInferenceOptions{};
            options.enable_chwn4();
            unpack_vector(gopt::optimize_for_inference({y4}, options), y_opt);
        }
        unpack_vector(gopt::GraphOptimizer{}
                              .add_pass<gopt::FuseConvBiasNonlinPass>()
                              .add_pass<gopt::FuseConvBiasZPass>()
                              .apply({{y4}})
                              .endpoint_vars(),
                      y_cudnn);
        ASSERT_EQ(opr::ConvBias::Param::Format::CHWN4,
                  find_opr<opr::ConvBias>(y_opt).param().format);
        HostTensorND host_y, host_y_opt;
        auto func = graph->compile({make_callback_copy(y_cudnn, host_y),
                                    make_callback_copy(y_opt, host_y_opt)});
        func->execute();
        MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
    }
}
#endif
//! closed for cu111 CI; reopen when the bug is fixed
#if 0
TEST(TestGoptInference, EnableCHWN4WarpPerspective) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient CUDA compute "
               "capability (got: %d, expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    std::shared_ptr<HostTensorND> mat = std::make_shared<HostTensorND>(
            cn, TensorShape{32, 3, 3}, dtype::Float32());
    warp_perspective_mat_gen(*mat, 32, 16, 16);
    auto mat_var = opr::Host2DeviceCopy::make(*graph, mat).rename("mat");
    auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    auto y = opr::ConvBiasForward::make(
            x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    opr::WarpPerspective::Param warp_param;
    warp_param.format = opr::WarpPerspective::Param::Format::NCHW4;
    auto y1 = opr::WarpPerspective::make(y, mat_var, TensorShape{16, 16},
                                         warp_param);
    y1 = opr::TypeCvt::make(y1, dtype::Float32());
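    // Helper: convert an NCHW4 var (N, C/4, H, W, 4) back to plain NCHW by
    // moving the inner 4-channel axis next to C and reshaping.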
    auto nchw42nchw = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp = opr::Concat::make({sub(0), sub(1) * 4, sub(2), sub(3)}, 0);
        auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
        auto y1 = opr::Reshape::make(y0, tshp);
        return y1;
    };
    y1 = nchw42nchw(y1);
    warp_param.format = opr::WarpPerspective::Param::Format::NCHW;
    auto y2 = opr::WarpPerspective::make(y1, mat_var, TensorShape{16, 16},
                                         warp_param);
    SymbolVar y_opt;
    SymbolVar y_cudnn;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_chwn4();
        unpack_vector(gopt::optimize_for_inference({y2}, options), y_opt);
    }
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::FuseConvBiasNonlinPass>()
                          .add_pass<gopt::FuseConvBiasZPass>()
                          .apply({{y2}})
                          .endpoint_vars(),
                  y_cudnn);
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y_cudnn, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
#endif
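//! check that the CHWN4 converter also rewrites Pooling oprs following a
//! quantized ConvBias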
TEST(TestGoptInference, EnableCHWN4Pooling) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient CUDA compute "
               "capability (got: %d, expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    auto y = opr::ConvBiasForward::make(
            x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    opr::Pooling::Param pool_param;
    pool_param.format = opr::Pooling::Param::Format::NCHW4;
    y = opr::Pooling::make(y, pool_param);
    y = opr::TypeCvt::make(y, dtype::Float32());
    auto nchw42nchw = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp = opr::Concat::make({sub(0), sub(1) * 4, sub(2), sub(3)}, 0);
        auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
        auto y1 = opr::Reshape::make(y0, tshp);
        return y1;
    };
    y = nchw42nchw(y);
    pool_param.format = opr::Pooling::Param::Format::NCHW;
    auto y1 = opr::Pooling::make(y, pool_param);
    SymbolVar y_opt;
    SymbolVar y_cudnn;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::FuseConvBiasNonlinPass>()
                    .add_pass(gopt::EnableCHWN4Pass::make_chwn4_converter())
                    .add_pass<gopt::FuseConvBiasZPass>()
                    .apply({{y1}})
                    .endpoint_vars(),
            y_opt);
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::FuseConvBiasNonlinPass>()
                          .add_pass<gopt::FuseConvBiasZPass>()
                          .apply({{y1}})
                          .endpoint_vars(),
                  y_cudnn);
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y_cudnn, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
//! closed for cu111 CI; reopen when the bug is fixed
#if 0
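//! ShuffleShuffleRemovePass should cancel the explicit NCHW <-> NCHW4
//! relayouts around the quantized subgraph, leaving only the two boundary
//! dimshuffles and no RelayoutFormat oprs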
TEST(TestGoptInference, EnableCHWN4ShuffleRemove) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient CUDA compute "
               "capability (got: %d, expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto nchw2nchw4 = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp = opr::Concat::make(
                {sub(0), sub(1) / 4, cv(4), sub(2), sub(3)}, 0);
        auto y0 = opr::Reshape::make(x, tshp);
        auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 4, 2});
        return y1;
    };
    auto nchw42nchw = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp = opr::Concat::make({sub(0), sub(1) * 4, sub(2), sub(3)}, 0);
        auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
        auto y1 = opr::Reshape::make(y0, tshp);
        return y1;
    };
    auto x = mkvar("x", {32, 64, 16, 16}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
         b1 = mkcvar("b1", {32, 16, 16, 16, 4}, dtype::QuantizedS8{2.5f});
    x = nchw2nchw4(x);
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    auto y = opr::ConvBiasForward::make(
            x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto y1 = opr::ElemwiseMultiType::make(
            {y, b1}, opr::ElemwiseMultiType::Mode::QFUSE_ADD_RELU,
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto y2 = opr::ConvBiasForward::make(
            y, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto y3 = opr::ElemwiseMultiType::make(
            {y, b1}, opr::ElemwiseMultiType::Param::Mode::QSUB,
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto y4 = opr::ElemwiseMultiType::make(
            {y1, y2}, opr::ElemwiseMultiType::Param::Mode::QADD,
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    y4 = opr::ElemwiseMultiType::make(
            {y3, y4}, opr::ElemwiseMultiType::Param::Mode::QADD,
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    y4 = opr::TypeCvt::make(y4, dtype::Float32());
    y4 = nchw42nchw(y4);
    SymbolVar y_opt;
    SymbolVar y_cudnn;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::ParamRedistributePass>()
                    .add_pass<gopt::ParamFusePass>()
                    .add_pass<gopt::FuseConvBiasNonlinPass>()
                    .add_pass<gopt::FuseConvBiasZPass>()
                    .add_pass(gopt::EnableCHWN4Pass::make_chwn4_converter())
                    .add_pass<gopt::ShuffleShuffleRemovePass>()
                    .add_pass<gopt::ParamFusePass>()
                    .apply({{y4}})
                    .endpoint_vars(),
            y_opt);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.EnableCHWN4ShuffleRemove.json"));
    auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
    ASSERT_EQ(2u, nr_dimshuffle);
    auto nr_reformat = find_opr_num<mgb::opr::RelayoutFormat>(y_opt);
    ASSERT_EQ(0u, nr_reformat);
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::FuseConvBiasNonlinPass>()
                          .add_pass<gopt::FuseConvBiasZPass>()
                          .apply({{y4}})
                          .endpoint_vars(),
                  y_cudnn);
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y_cudnn, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
#endif
//! closed for cu111 CI; reopen when the bug is fixed
#if 0
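//! enable_nchw4() should convert both the dense and the group quantized
//! ConvBias to the NCHW4 format on CUDA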
TEST(TestGoptInference, ConvertFormatNCHW4GPU) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient CUDA compute "
               "capability (got: %d, expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto x = mkvar("x", {2, 4, 16, 16}, dtype::QuantizedS8(2.5f));
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.format = opr::ConvBias::Param::Format::NCHW;
    param_conv_bias.stride_h = param_conv_bias.stride_w = 1;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    // dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w1 = mkcvar("w1", {8, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv1 = opr::ConvBiasForward::make(
            x, w1, b1, param_conv_bias, {},
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    // group
    // icpg != 1 && ocpg != 1
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {2, 4, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b2 = mkcvar("b2", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv2 = opr::ConvBiasForward::make(
            conv1, w2, b2, param_conv_bias, {},
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto y = opr::TypeCvt::make(conv2, dtype::Float32());
    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw4();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW4,
              find_opr<opr::ConvBias>(y_opt).param().format);
    auto nr_reshape = find_opr_num<mgb::opr::Reshape>(y_opt);
    ASSERT_EQ(2u, nr_reshape);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.ConvertFormatNCHW4GPU.json"));
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
#endif
#endif
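//! besides convolutions, the NCHW4 converter must also rewrite the formats
//! of Resize, WarpPerspective and Pooling between them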
TEST(TestGoptInference, ConvertFormatNCHW4NonConvOpr) {
    auto cn = CompNode::load("xpu0");
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto mkcvarf32 = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto x = mkvar("x", {2, 4, 16, 16}, dtype::QuantizedS8(2.5f));
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.format = opr::ConvBias::Param::Format::NCHW;
    param_conv_bias.stride_h = param_conv_bias.stride_w = 1;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    // dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w1 = mkcvar("w1", {8, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv1 = opr::ConvBiasForward::make(
            x, w1, b1, param_conv_bias, {},
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    // test Resize
    auto shape_of = opr::GetVarShape::make(x);
    auto subtensor = opr::Subtensor::make(
            shape_of, {opr::Subtensor::AxisIndexer::make_interval(
                              0, x.make_scalar(2), None, x.make_scalar(1))});
    opr::Resize::Param param_resize;
    param_resize.format = opr::Resize::Param::Format::NCHW;
    auto resize = opr::ResizeForward::make(conv1, subtensor * 2, param_resize);
    // test WarpPerspective
    auto mat = mkcvarf32("mat", {2, 3, 3}),
         warp = opr::WarpPerspectiveForward::make(
                 resize, mat, nullptr, cg::var_from_tensor_shape(x, {32, 32}));
    opr::Pooling::Param pool_param;
    pool_param.format = opr::Pooling::Param::Format::NCHW;
    // test Pooling
    auto pool = opr::Pooling::make(warp, pool_param);
    // group
    // icpg != 1 && ocpg != 1
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {2, 4, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b2 = mkcvar("b2", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv2 = opr::ConvBiasForward::make(
            pool, w2, b2, param_conv_bias, {},
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto add = opr::ElemwiseMultiType::make(
            {conv1, conv2}, {opr::ElemwiseMultiType::Param::Mode::QADD},
            OperatorNodeConfig{dtype::QuantizedS8{1.2f}});
    auto y = opr::TypeCvt::make(add, dtype::Float32());
    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw4();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
    ASSERT_EQ(2u, nr_dimshuffle);
    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW4,
              find_opr<opr::ConvBias>(y_opt).param().format);
    ASSERT_EQ(opr::ResizeForward::Param::Format::NCHW4,
              find_opr<opr::ResizeForward>(y_opt).param().format);
    ASSERT_EQ(opr::WarpPerspectiveForward::Param::Format::NCHW4,
              find_opr<opr::WarpPerspectiveForward>(y_opt).param().format);
    ASSERT_EQ(opr::PoolingForward::Param::Format::NCHW4,
              find_opr<opr::PoolingForward>(y_opt).param().format);
}
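//! on a float32 CPU graph the convolutions are expected to stay in NCHW
//! after enable_nchw4(), and the result must still match the reference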
TEST(TestGoptInference, ConvertFormatNCHW4) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto x = mkvar("x", {2, 4, 16, 16});
    // ConvBias test dense
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w1 = mkcvar("w1", {8, 4, 3, 3}), b1 = mkcvar("b1", {1, 8, 1, 1});
    auto conv1 = opr::ConvBias::make(x, w1, b1, param_conv_bias);
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {2, 4, 4, 3, 3}), b2 = mkcvar("b2", {1, 8, 1, 1});
    auto conv2 = opr::ConvBias::make(conv1, w2, b2, param_conv_bias);
    // Convolution
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    param_conv.sparse = opr::Convolution::Param::Sparse::DENSE;
    auto w3 = mkcvar("w3", {8, 8, 3, 3});
    auto y = opr::Convolution::make(conv2, w3, param_conv);
    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw4();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW,
              find_opr<opr::ConvBias>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.ConvertFormatNCHW4.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
//! closed for cu111 CI; reopen when the bug is fixed
#if 0
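//! quantized graph whose first convolution has only 3 input channels; the
//! NCHW4 converter should still handle it and produce NCHW4 convs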
TEST(TestGoptInference, ConvertFormatNCHW4Ic3) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);
    HostTensorGenerator<dtype::Float32, RandomDistribution::UNIFORM> gen{
            1.2f, 127 * 127};
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name),
                dtype);
    };
    auto x = mkvar("x", {2, 3, 16, 16}, dtype::QuantizedS8(2.5f));
    // ConvBias test dense
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w1 = mkcvar("w1", {8, 3, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv1 =
            opr::ConvBias::make(x, w1, b1, param_conv_bias, {},
                                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {2, 4, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b2 = mkcvar("b2", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv2 =
            opr::ConvBias::make(conv1, w2, b2, param_conv_bias, {},
                                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto y = opr::TypeCvt::make(conv2, dtype::Float32());
    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw4();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW4,
              find_opr<opr::ConvBias>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.ConvertFormatNCHW4Ic3.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
#endif
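//! NCHW88 conversion: covers a hybrid first conv (ic = 3), channel-wise,
//! group and dense ConvBias, with Resize/WarpPerspective/Elemwise between
//! them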
TEST(TestGoptInference, ConvertFormatNCHW88) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto host_x = gen({2, 3, 16, 16}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    //! Hybrid nchw88 mode
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    auto w1 = mkcvar("w1", {8, 3, 3, 3}),
         conv1 = opr::Convolution::make(x, w1, param_conv, {},
                                        OperatorNodeConfig("conv1"));
    //! channel wise
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {8, 1, 1, 3, 3}), b2 = mkcvar("b2", {1, 8, 1, 1}),
         conv2 = opr::ConvBias::make(conv1, w2, b2, param_conv_bias);
    //! group
    auto w3 = mkcvar("w3", {1, 8, 8, 3, 3}), b3 = mkcvar("b3", {1, 8, 1, 1}),
         conv3 = opr::ConvBias::make(conv2, w3, b3, param_conv_bias);
    auto shape_of = opr::GetVarShape::make(conv3);
    auto subtensor = opr::Subtensor::make(
            shape_of, {opr::Subtensor::AxisIndexer::make_interval(
                              0, x.make_scalar(2), None, x.make_scalar(1))});
    opr::Resize::Param param_resize;
    param_resize.format = opr::Resize::Param::Format::NCHW;
    auto resize = opr::ResizeForward::make(conv3, subtensor * 2, param_resize);
    auto mat = mkcvar("mat", {2, 3, 3}),
         warp = opr::WarpPerspectiveForward::make(
                 resize, mat, nullptr, cg::var_from_tensor_shape(x, {4, 4}));
    auto b = mkvar("b", {1, 8, 1, 1}),
         elem = opr::Elemwise::make({warp + b},
                                    opr::Elemwise::Param::Mode::RELU);
    //! Dense
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    auto w4 = mkcvar("w4", {2, 6, 4, 3, 3}), b4 = mkcvar("b4", {1, 12, 1, 1}),
         conv4 = opr::ConvBias::make(elem, w4, b4, param_conv_bias);
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w5 = mkcvar("w5", {8, 12, 3, 3}), b5 = mkcvar("b5", {1, 8, 1, 1}),
         conv5 = opr::ConvBias::make(conv4, w5, b5, param_conv_bias);
    auto w6 = mkcvar("w6", {8, 8, 3, 3}), b6 = mkcvar("b6", {1, 8, 1, 1}),
         y = opr::ConvBias::make(conv5, w6, b6, param_conv_bias);
    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw88();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW88,
              find_opr<opr::Convolution>(y_opt, "conv1").param().format);
    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW88,
              find_opr<opr::ConvBias>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.ConvertFormatNCHW88.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    //! may go through winograd on x86-32, so relax the error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
    *host_x = *gen({2, 3, 32, 32}, cn);
    func->execute();
    //! may go through winograd on x86-32, so relax the error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
}
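//! NCHW44 conversion on a mixed float/quantized graph; some convs
//! (conv1_f1, conv5) are expected to stay in NCHW, as the assertions below
//! check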
TEST(TestGoptInference, ConvertFormatNCHW44) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto mkcvar_dtype = [&](const char* name, const TensorShape& shp,
                            const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto host_x = gen({2, 3, 16, 16}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    //! Hybrid nchw44 mode
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    auto w1 = mkcvar("w1", {8, 3, 3, 3}),
         conv1 = opr::Convolution::make(x, w1, param_conv, {},
                                        OperatorNodeConfig("conv1"));
    //! hybrid nchw44 not supported here
    opr::ConvBias::Param param_conv_bias_pad0;
    param_conv_bias_pad0.pad_h = param_conv_bias_pad0.pad_w = 0;
    auto w1_f1 = mkcvar("w1_1", {8, 3, 1, 1});
    auto conv1_f1 = opr::ConvBias::make(x, w1_f1, param_conv_bias_pad0, {},
                                        OperatorNodeConfig("conv1_f1"));
    auto conv1_add = conv1_f1 * conv1;
    auto conv_1_q8 = opr::TypeCvt::make(conv1_add, dtype::QuantizedS8(2.5f));
    //! s8 dense conv
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    auto w1_2 = mkcvar_dtype("w1_2", {8, 8, 3, 3}, dtype::QuantizedS8(2.5f));
    auto b1_2 = mkcvar_dtype("b1_2", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv_1_2 = opr::ConvBias::make(
            conv_1_q8, w1_2, b1_2, param_conv_bias, {},
            OperatorNodeConfig{"conv_1_2", cn, dtype::QuantizedS8{6.25f}});
    auto conv_1_2_fp32 = opr::TypeCvt::make(conv_1_2, dtype::Float32());
    //! channel wise
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {8, 1, 1, 3, 3}), b2 = mkcvar("b2", {1, 8, 1, 1}),
         conv2 = opr::ConvBias::make(conv_1_2_fp32, w2, b2, param_conv_bias);
    //! group
    auto w3 = mkcvar("w3", {2, 4, 4, 3, 3}), b3 = mkcvar("b3", {1, 8, 1, 1}),
         conv3 = opr::ConvBias::make(conv2, w3, b3, param_conv_bias);
    auto shape_of = opr::GetVarShape::make(conv3);
    auto subtensor = opr::Subtensor::make(
            shape_of, {opr::Subtensor::AxisIndexer::make_interval(
                              0, x.make_scalar(2), None, x.make_scalar(1))});
    opr::Resize::Param param_resize;
    param_resize.format = opr::Resize::Param::Format::NCHW;
    auto resize = opr::ResizeForward::make(conv3, subtensor * 2, param_resize);
    auto mat = mkcvar("mat", {2, 3, 3}),
         warp = opr::WarpPerspectiveForward::make(
                 resize, mat, nullptr, cg::var_from_tensor_shape(x, {4, 4}));
    auto b = mkvar("b", {1, 8, 1, 1}),
         elem = opr::Elemwise::make({warp + b},
                                    opr::Elemwise::Param::Mode::RELU);
    //! Dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    auto w3_2 = mkcvar("w3_2", {16, 8, 3, 3}),
         b3_2 = mkcvar("b3_2", {1, 16, 1, 1}),
         conv3_2 = opr::ConvBias::make(elem, w3_2, b3_2, param_conv_bias, {},
                                       OperatorNodeConfig("conv3_2"));
    //! s8 group conv
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto conv3_2_q8 = opr::TypeCvt::make(conv3_2, dtype::QuantizedS8(2.5f));
    auto w3_3 = mkcvar_dtype("w3_3", {4, 8, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b3_3 = mkcvar_dtype("b3_3", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f)),
         conv3_3_q = opr::ConvBias::make(
                 conv3_2_q8, w3_3, b3_3, param_conv_bias, {},
                 OperatorNodeConfig{"conv_3_3_q", cn,
                                    dtype::QuantizedS8{6.25f}});
    auto conv3_3 = opr::TypeCvt::make(conv3_3_q, dtype::Float32());
    //! Dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w4 = mkcvar("w4", {16, 32, 3, 3}), b4 = mkcvar("b4", {1, 16, 1, 1}),
         conv4 = opr::ConvBias::make(conv3_3, w4, b4, param_conv_bias, {},
                                     OperatorNodeConfig("conv4"));
    auto w4_1 = mkcvar("w4_1", {16, 32, 1, 1}),
         b4_1 = mkcvar("b4_1", {2, 16, 4, 4}),
         conv4_1 =
                 opr::ConvBias::make(conv3_3, w4_1, b4_1, param_conv_bias_pad0,
                                     {}, OperatorNodeConfig("conv4_1"));
    auto conv4_add = conv4 + conv4_1;
    auto w5 = mkcvar("w5", {6, 16, 3, 3}), b5 = mkcvar("b5", {1, 6, 1, 1}),
         conv5 = opr::ConvBias::make(conv4_add, w5, b5, param_conv_bias, {},
                                     OperatorNodeConfig("conv5"));
    auto w6 = mkcvar("w6", {4, 6, 3, 3}), b6 = mkcvar("b6", {1, 4, 1, 1}),
         y = opr::ConvBias::make(conv5, w6, b6, param_conv_bias, {},
                                 OperatorNodeConfig("conv6"));
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_fuse_conv_bias_nonlinearity();
    options.enable_nchw44();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::Convolution>(y_opt, "conv1").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW,
              find_opr<opr::ConvBias>(y_opt, "conv1_f1").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::ConvBias>(y_opt, "conv_1_2").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::ConvBias>(y_opt, "conv3_2").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::ConvBias>(y_opt, "conv_3_3_q").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::ConvBias>(y_opt, "conv4").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW,
              find_opr<opr::ConvBias>(y_opt, "conv5").param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.ConvertFormatNCHW44.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    //! may go through winograd on x86-32, so relax the error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
    *host_x = *gen({2, 3, 32, 32}, cn);
    func->execute();
    //! may go through winograd on x86-32, so relax the error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
}
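//! elemwise inputs with broadcast shapes (per-pixel bias b, scalar b1) must
//! survive the NCHW44 conversion unchanged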
TEST(TestGoptInference, ConvertFormatNCHW44MultiInput) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto host_x1 = gen({1, 8, 16, 16}, cn);
    auto host_x2 = gen({1, 1, 16, 16}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    auto w1 = mkcvar("w1", {8, 8, 3, 3}),
         conv1 = opr::Convolution::make(x, w1, param_conv);
    auto b = mkvar("b", {1, 1, 16, 16}),
         elem0 = opr::Elemwise::make({conv1 + b + b},
                                     opr::Elemwise::Param::Mode::RELU);
    auto w2 = mkcvar("w2", {8, 8, 3, 3}),
         conv2 = opr::Convolution::make(elem0, w2, param_conv);
    auto b1 = mkvar("b1", {1}),
         y = opr::Elemwise::make({conv2 + b1 + b},
                                 opr::Elemwise::Param::Mode::RELU);
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nchw44();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::Convolution>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.ConvertFormatNCHW44MultiInput.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    //! may go through winograd on x86-32, so relax the error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
}
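//! a Reshape consumes the conv output, so the layout has to be converted
//! back before it; the conv itself should still become NCHW44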
TEST(TestGoptInference, ConvertFormatNCHW44Reshape) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto host_x1 = gen({1, 8, 16, 16}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    auto w1 = mkcvar("w1", {8, 8, 3, 3}),
         conv1 = opr::Convolution::make(x, w1, param_conv);
    auto y = opr::Reshape::make(conv1, {8, 16 * 16});
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nchw44();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::Convolution>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.ConvertFormatNCHW44Reshape.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    //! may go through winograd on x86-32, so relax the error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
}
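//! NCHW44_DOT is expected only on the quantized convs; float convs in the
//! same graph fall back to NCHW44 or NCHW, as the per-opr assertions below
//! spell out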
TEST(TestGoptInference, ConvertFormatNCHW44_DOT) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto mkcvar_dtype = [&](const char* name, const TensorShape& shp,
                            const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto host_x = gen({2, 3, 16, 16}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    //! Hybrid nchw44 mode
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    auto w1 = mkcvar("w1", {8, 3, 3, 3}),
         conv1 = opr::Convolution::make(x, w1, param_conv, {},
                                        OperatorNodeConfig("conv1"));
    printf("create conv1 %s\n",
           conv1.node()->owner_opr()->dyn_typeinfo()->name);
    param_conv.pad_h = param_conv.pad_w = 1;
    //! hybrid nchw44 not supported here
    opr::ConvBias::Param param_conv_bias_pad0;
    param_conv_bias_pad0.pad_h = param_conv_bias_pad0.pad_w = 0;
    auto b1 = mkcvar("b1", {1, 8, 1, 1});
    auto w1_f1 = mkcvar("w1_1", {8, 3, 1, 1});
    auto conv1_f1 = opr::ConvBias::make(x, w1_f1, b1, param_conv_bias_pad0, {},
                                        OperatorNodeConfig("conv1_f1"));
    //! hybrid dot
    auto x_s = opr::TypeCvt::make(x, dtype::QuantizedS8(2.5f));
    auto w1_3 = mkcvar_dtype("w1_3", {8, 3, 3, 3}, dtype::QuantizedS8(2.5f));
    auto conv1_3_q = opr::Convolution::make(
            x_s, w1_3, param_conv, {},
            OperatorNodeConfig{"conv1_3_q", cn, dtype::QuantizedS8{6.25f}});
    auto conv1_3 = opr::TypeCvt::make(conv1_3_q, dtype::Float32());
    auto conv1_add = conv1_f1 * conv1 * conv1_3;
    auto conv_1_q8 = opr::TypeCvt::make(conv1_add, dtype::QuantizedS8(2.5f));
    //! s8 dense conv
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    auto w1_2 = mkcvar_dtype("w1_2", {8, 8, 3, 3}, dtype::QuantizedS8(2.5f));
    auto conv_1_2 = opr::ConvBias::make(
            conv_1_q8, w1_2, param_conv_bias, {},
            OperatorNodeConfig{"conv_1_2", cn, dtype::QuantizedS8{6.25f}});
    auto conv_1_2_fp32 = opr::TypeCvt::make(conv_1_2, dtype::Float32());
    //! channel wise
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {8, 1, 1, 3, 3}), b2 = mkcvar("b2", {1, 8, 1, 1}),
         conv2 = opr::ConvBias::make(conv_1_2_fp32, w2, b2, param_conv_bias);
    //! group
    auto w3 = mkcvar("w3", {2, 4, 4, 3, 3}), b3 = mkcvar("b3", {1, 8, 1, 1}),
         conv3 = opr::ConvBias::make(conv2, w3, b3, param_conv_bias);
    auto shape_of = opr::GetVarShape::make(conv3);
    auto subtensor = opr::Subtensor::make(
            shape_of, {opr::Subtensor::AxisIndexer::make_interval(
                              0, x.make_scalar(2), None, x.make_scalar(1))});
    opr::Resize::Param param_resize;
    param_resize.format = opr::Resize::Param::Format::NCHW;
    auto resize = opr::ResizeForward::make(conv3, subtensor * 2, param_resize);
    auto mat = mkcvar("mat", {2, 3, 3}),
         warp = opr::WarpPerspectiveForward::make(
                 resize, mat, nullptr, cg::var_from_tensor_shape(x, {4, 4}));
    auto b = mkvar("b", {1, 8, 1, 1}),
         elem = opr::Elemwise::make({warp + b},
                                    opr::Elemwise::Param::Mode::RELU);
    //! Dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    auto w3_2 = mkcvar("w3_2", {16, 8, 3, 3}),
         b3_2 = mkcvar("b3_2", {1, 16, 1, 1}),
         conv3_2 = opr::ConvBias::make(elem, w3_2, b3_2, param_conv_bias, {},
                                       OperatorNodeConfig("conv3_2"));
    //! s8 group conv
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto conv3_2_q8 = opr::TypeCvt::make(conv3_2, dtype::QuantizedS8(2.5f));
    auto w3_3 = mkcvar_dtype("w3_3", {4, 8, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b3_3 = mkcvar_dtype("b3_3", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f)),
         conv3_3_q = opr::ConvBias::make(
                 conv3_2_q8, w3_3, b3_3, param_conv_bias, {},
                 OperatorNodeConfig{"conv_3_3_q", cn,
                                    dtype::QuantizedS8{6.25f}});
    auto conv3_3 = opr::TypeCvt::make(conv3_3_q, dtype::Float32());
    //! Dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w4 = mkcvar("w4", {4, 32, 3, 3}), b4 = mkcvar("b4", {1, 4, 1, 1}),
         conv4 = opr::ConvBias::make(conv3_3, w4, b4, param_conv_bias, {},
                                     OperatorNodeConfig("conv4"));
    auto w5 = mkcvar("w5", {6, 4, 3, 3}), b5 = mkcvar("b5", {1, 6, 1, 1}),
         conv5 = opr::ConvBias::make(conv4, w5, b5, param_conv_bias, {},
                                     OperatorNodeConfig("conv5"));
    auto w6 = mkcvar("w6", {4, 6, 3, 3}), b6 = mkcvar("b6", {1, 4, 1, 1}),
         y = opr::ConvBias::make(conv5, w6, b6, param_conv_bias, {},
                                 OperatorNodeConfig("conv6"));
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_fuse_conv_bias_nonlinearity();
    options.enable_nchw44_dot();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::Convolution>(y_opt, "conv1").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44_DOT,
              find_opr<opr::Convolution>(y_opt, "conv1_3_q").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW,
              find_opr<opr::ConvBias>(y_opt, "conv1_f1").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44_DOT,
              find_opr<opr::ConvBias>(y_opt, "conv_1_2").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::ConvBias>(y_opt, "conv3_2").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44_DOT,
              find_opr<opr::ConvBias>(y_opt, "conv_3_3_q").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::ConvBias>(y_opt, "conv4").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW,
              find_opr<opr::ConvBias>(y_opt, "conv5").param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.ConvertFormatNCHW44_DOT.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    //! may go through winograd on x86-32, so relax the error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
    *host_x = *gen({2, 3, 32, 32}, cn);
    func->execute();
    //! may go through winograd on x86-32, so relax the error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
}
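//! enable_nhwcd4() on GROUP-sparse convolutions with group == 1; the
//! converted graph must match the plain NCHW reference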
TEST(TestGoptInference, ConvertFormatCD4GroupOneConv) {
    // hwcd4 is only supported in naive handle
    NaiveMegDNNHandleScope naive_megdnn_handle;
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto x = mkvar("x", {1, 3, 128, 128});
    // ConvBias
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w1 = mkcvar("w1", {1, 16, 3, 3, 3}), b1 = mkcvar("b1", {1, 16, 1, 1});
    auto conv1 = opr::ConvBias::make(x, w1, b1, param_conv_bias);
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    // Convolution
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    param_conv.sparse = opr::Convolution::Param::Sparse::GROUP;
    auto w3 = mkcvar("w3", {1, 16, 16, 3, 3});
    auto y = opr::Convolution::make(conv1, w3, param_conv);
    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nhwcd4();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
#if MGB_CUDA
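//! fuse-preprocess: the pad-to-4-channels Concat + Reshape + Dimshuffle
//! pattern on a quantized input should collapse into a single
//! RelayoutFormat opr, as asserted at the end of the test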
TEST(TestGoptInference, PreProcessCase0) {
    REQUIRE_GPU(1);
    HostTensorGenerator<dtype::Quantized8Asymm, RandomDistribution::UNIFORM>
            gen(dt_quint8(0), dt_quint8(50), 1, 128, 1234);
    auto cn = CompNode::load("gpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    size_t n = 1;
    size_t c = 3;
    size_t h = 16;
    size_t w = 16;
    auto host_x1 = gen({n, c, h, w}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    auto x_q8 = opr::TypeCvt::make(x, dtype::QuantizedS8(1.f), cn);
    auto zero = DTypeScalar(dtype::QuantizedS8(1.f));
    auto zero_tensor = opr::ImmutableTensor::make(*graph, zero, cn);
    auto pad_channel_tensor =
            opr::Broadcast::make(zero_tensor, {n, 1, h, w}, cn);
    auto padded_x = opr::Concat::make({x_q8, pad_channel_tensor}, 1, cn)
                            .reshape({n, 1, 4, h, w});
    auto result = opr::Dimshuffle::make(padded_x, {0, 1, 3, 4, 2}, 5, cn);
    auto y = result;
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_fuse_preprocess();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.PreProcessCase0.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
    ASSERT_TRUE(y_opt.node()->owner_opr()->same_type<opr::RelayoutFormat>());
}
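//! same fusion as above, but starting from a Uint8 input that is converted
//! to float and shifted by -128 before the channel padding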
TEST(TestGoptInference, PreProcessCase1) {
    REQUIRE_GPU(1);
    HostTensorGenerator<dtype::Uint8, RandomDistribution::UNIFORM> gen(0, 255);
    auto cn = CompNode::load("gpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    size_t n = 1;
    size_t c = 3;
    size_t h = 16;
    size_t w = 16;
    auto host_x1 = gen({n, c, h, w}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    auto x_u8 = opr::TypeCvt::make(x, dtype::Float32(), cn);
    auto x_s8 = x_u8 - 128;
    auto zero = DTypeScalar(dtype::Float32());
    auto zero_tensor = opr::ImmutableTensor::make(*graph, zero, cn);
    auto pad_channel_tensor =
            opr::Broadcast::make(zero_tensor, {n, 1, h, w}, cn);
    auto padded_x = opr::Concat::make({x_s8, pad_channel_tensor}, 1, cn)
                            .reshape({n, 1, 4, h, w});
    auto nchw4_out = opr::Dimshuffle::make(padded_x, {0, 1, 3, 4, 2}, 5, cn);
    auto result = opr::TypeCvt::make(nchw4_out, dtype::QuantizedS8(1.f));
    auto y = result;
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_fuse_preprocess();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.PreProcessCase1.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
    ASSERT_TRUE(y_opt.node()->owner_opr()->same_type<opr::RelayoutFormat>());
}
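//! WarpPerspective on an NHWC uint8 input followed by the preprocess
//! pattern should fold into a single WarpPerspective with the
//! NHWC_NCHW4_IC_SMALL format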
TEST(TestGoptInference, WarpAndPreProcessCase0) {
    REQUIRE_GPU(1);
    HostTensorGenerator<dtype::Uint8, RandomDistribution::UNIFORM> gen(0, 255);
    auto cn = CompNode::load("gpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    size_t n = 1;
    size_t c = 3;
    size_t h = 16;
    size_t w = 16;
    auto host_x1 = gen({n, h, w, c}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    auto mat_host = std::make_shared<HostTensorND>(cn, TensorShape{n, 3, 3},
                                                   dtype::Float32());
    warp_perspective_mat_gen(*mat_host, n, h, w);
    auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat");
    opr::WarpPerspective::Param warp_param;
    warp_param.format = opr::WarpPerspective::Param::Format::NHWC;
    auto x_warp =
            opr::WarpPerspective::make(x, mat, TensorShape{h, w}, warp_param);
    auto x_nchw = opr::Dimshuffle::make(x_warp, {0, 3, 1, 2}, 4, cn);
    auto x_u8 = opr::TypeCvt::make(x_nchw, dtype::Float32(), cn);
    auto x_s8 = x_u8 - 128;
    auto zero = DTypeScalar(dtype::Float32());
    auto zero_tensor = opr::ImmutableTensor::make(*graph, zero, cn);
    auto pad_channel_tensor =
            opr::Broadcast::make(zero_tensor, {n, 1, h, w}, cn);
    auto padded_x = opr::Concat::make({x_s8, pad_channel_tensor}, 1, cn)
                            .reshape({n, 1, 4, h, w});
    auto nchw4_out = opr::Dimshuffle::make(padded_x, {0, 1, 3, 4, 2}, 5, cn);
    auto result = opr::TypeCvt::make(nchw4_out, dtype::QuantizedS8(1.f));
    auto y = result;
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_fuse_preprocess();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_TRUE(y_opt.node()->owner_opr()->same_type<opr::WarpPerspective>());
    ASSERT_EQ(opr::WarpPerspective::Param::Format::NHWC_NCHW4_IC_SMALL,
              find_opr<opr::WarpPerspective>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.WarpAndPreProcessCase0.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
}
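//! variant without channel padding: warp + NHWC->NCHW dimshuffle + float
//! TypeCvt should fuse into a WarpPerspective with the NHWC_NCHW format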
TEST(TestGoptInference, WarpAndPreProcessCase1) {
    REQUIRE_GPU(1);
    HostTensorGenerator<dtype::Uint8, RandomDistribution::UNIFORM> gen(0, 255);
    auto cn = CompNode::load("gpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    size_t n = 1;
    size_t c = 3;
    size_t h = 16;
    size_t w = 16;
    auto host_x1 = gen({n, h, w, c}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    auto mat_host = std::make_shared<HostTensorND>(cn, TensorShape{n, 3, 3},
                                                   dtype::Float32());
    warp_perspective_mat_gen(*mat_host, n, h, w);
    auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat");
    opr::WarpPerspective::Param warp_param;
    warp_param.format = opr::WarpPerspective::Param::Format::NHWC;
    auto x_warp =
            opr::WarpPerspective::make(x, mat, TensorShape{h, w}, warp_param);
    auto x_nchw = opr::Dimshuffle::make(x_warp, {0, 3, 1, 2}, 4, cn);
    auto result = opr::TypeCvt::make(x_nchw, dtype::Float32(), cn);
    auto y = result;
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_fuse_preprocess();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_TRUE(y_opt.node()->owner_opr()->same_type<opr::WarpPerspective>());
    ASSERT_EQ(opr::WarpPerspective::Param::Format::NHWC_NCHW,
              find_opr<opr::WarpPerspective>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.WarpAndPreProcessCase1.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
}
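//! FoldingConvBiasDimshufflePass: an NCHW4 ConvBias followed by the
//! NCHW4->NCHW dimshuffle + reshape should fold into a single conv with the
//! NCHW4_NCHW format, leaving no standalone Dimshuffle behind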
  3412. TEST(TestGoptInference, FoldingConvDimshuffle) {
  3413. REQUIRE_GPU(1);
  3414. auto cn = CompNode::load("gpu0");
  3415. cn.activate();
  3416. auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
  3417. auto sm_ver = prop.major * 10 + prop.minor;
  3418. if (sm_ver < 61) {
  3419. printf("This testcast ignored due to insufficient cuda cap(got: %d, "
  3420. "expected: %d)\n",
  3421. sm_ver, 61);
  3422. return;
  3423. }
  3424. HostTensorGenerator<dtype::Int8> gen;
  3425. auto graph = ComputingGraph::make();
  3426. graph->options().graph_opt_level = 0;
  3427. auto mkvar = [&](const char* name, const TensorShape& shp,
  3428. const DType& dtype) {
  3429. return opr::TypeCvt::make(
  3430. opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
  3431. dtype);
  3432. };
  3433. auto mkcvar = [&](const char* name, const TensorShape& shp,
  3434. const DType& dtype) {
  3435. return opr::TypeCvt::make(
  3436. opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  3437. .rename(name),
  3438. dtype);
  3439. };
    auto nchw42nchw = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp0 = opr::Concat::make({sub(0), sub(1) * 4, sub(2), sub(3)}, 0);
        auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
        auto y1 = opr::Reshape::make(y0, tshp0);
        return y1;
    };
    auto x = mkvar("x", {32, 16, 4, 8, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(x, w, b, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y = opr::TypeCvt::make(y, dtype::Float32());
    y = nchw42nchw(y);
    SymbolVar y_fuse, y_non_fuse;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ShuffleShuffleRemovePass>()
                          .add_pass<gopt::FoldingConvBiasDimshufflePass>()
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{y}})
                          .endpoint_vars(),
                  y_fuse);
    gopt::modify_opr_algo_strategy_inplace(
            {y_fuse},
            opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy::PROFILE);
    graph->compile({{y_fuse, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.FoldingConvDimshuffle.json"));
    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW4_NCHW,
              find_opr<opr::ConvBias>(y_fuse).param().format);
    ASSERT_EQ(0u, find_opr_num<opr::Dimshuffle>(y_fuse));
    unpack_vector(gopt::GraphOptimizer{}.apply({{y}}).endpoint_vars(),
                  y_non_fuse);
    HostTensorND host_y_fuse, host_y_non_fuse;
    auto func =
            graph->compile({make_callback_copy(y_fuse, host_y_fuse),
                            make_callback_copy(y_non_fuse, host_y_non_fuse)});
    func->execute();
    //! the fused graph must agree numerically with the non-fused one
    MGB_ASSERT_TENSOR_EQ(host_y_fuse, host_y_non_fuse);
}
//! disabled for the cu111 CI; re-enable once the underlying bug is fixed
#if 0
TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NCHW32) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient cuda cap (got: "
               "%d, expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
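    //! nchw42nchw32 converts a (N, C/4, H, W, 4) NCHW4 tensor to NCHW32
    //! (N, C/32, H, W, 32): split the C/4 axis into (C/32, 8), move the new
    //! 8-wide axis next to the inner 4-wide one, then merge 8 * 4 into 32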
    auto nchw42nchw32 = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp0 = opr::Concat::make(
                     {sub(0), sub(1) / 8, cv(8), sub(2), sub(3), sub(4)}, 0),
             tshp1 = opr::Concat::make(
                     {sub(0), sub(1) / 8, sub(2), sub(3), sub(4) * 8}, 0);
        auto y0 = opr::Reshape::make(x, tshp0);
        auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 4, 2, 5});
        auto y2 = opr::Reshape::make(y1, tshp1);
        return y2;
    };
    auto x = mkvar("x", {32, 16, 4, 8, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(x, w, b, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y = nchw42nchw32(y);
    y = opr::TypeCvt::make(y, dtype::Float32());
    SymbolVar y_fuse, y_non_fuse;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::FoldingConvBiasDimshufflePass>()
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{y}})
                          .endpoint_vars(),
                  y_fuse);
    gopt::modify_opr_algo_strategy_inplace(
            {y_fuse},
            opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy::PROFILE);
    graph->compile({{y_fuse, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.FoldingConvDimshuffleNCHW4NCHW32.json"));
    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW4_NCHW32,
              find_opr<opr::ConvBias>(y_fuse).param().format);
    ASSERT_EQ(0u, find_opr_num<opr::Dimshuffle>(y_fuse));
    unpack_vector(gopt::GraphOptimizer{}.apply({{y}}).endpoint_vars(),
                  y_non_fuse);
    HostTensorND host_y_fuse, host_y_non_fuse;
    auto func =
            graph->compile({make_callback_copy(y_fuse, host_y_fuse),
                            make_callback_copy(y_non_fuse, host_y_non_fuse)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y_fuse, host_y_non_fuse);
}
#endif
#if CUDA_VERSION >= 10020
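//! check that enable_nchw32 converts the first ConvBias to NCHW32 and emits a
//! Format::NCHW32_NCHW4 ConvBias for the second one (its 16 output channels
//! are too few for an NCHW32 output), so exactly one Dimshuffle remains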
TEST(TestGoptInference, FoldingConvDimshuffleNCHW32NCHW4) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 75) {
        printf("This testcase is ignored due to insufficient cuda cap (got: "
               "%d, expected: %d)\n",
               sm_ver, 75);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto x = mkvar("x", {32, 16, 4, 8, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
         w1 = mkcvar("w1", {16, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 4, 1, 1, 4}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(x, w, b, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    param.stride_h = param.stride_w = 1;
    y = opr::ConvBias::make(y, w1, b1, param, {},
                            OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y = opr::TypeCvt::make(y, dtype::Float32());
    SymbolVar y_fuse, y_non_fuse;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw32().enable_fuse_conv_bias_nonlinearity();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_fuse);
    }
    graph->compile({{y_fuse, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.FoldingConvDimshuffleNCHW32NCHW4.json"));
    ASSERT_EQ(1u, find_opr_num<opr::Dimshuffle>(y_fuse));
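    //! walk the optimized graph with DepOprIter to verify that at least one
    //! ConvBias was converted to Format::NCHW32_NCHW4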
    bool found = false;
    cg::DepOprIter{[&found](cg::OperatorNodeBase* opr) {
        if (!found && opr->same_type<opr::ConvBias>()) {
            opr::ConvBias* cb = &opr->cast_final_safe<opr::ConvBias>();
            if (cb->param().format ==
                opr::ConvBias::Param::Format::NCHW32_NCHW4)
                found = true;
        }
    }}
            .add(y_fuse.node()->owner_opr());
    EXPECT_TRUE(found);
    unpack_vector(gopt::GraphOptimizer{}.apply({{y}}).endpoint_vars(),
                  y_non_fuse);
    HostTensorND host_y_fuse, host_y_non_fuse;
    auto func =
            graph->compile({make_callback_copy(y_fuse, host_y_fuse),
                            make_callback_copy(y_non_fuse, host_y_non_fuse)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y_fuse, host_y_non_fuse);
}
#endif
#endif
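
//! To run a single case from this file (the test binary name depends on the
//! build; megbrain_test is assumed here):
//!     ./megbrain_test --gtest_filter='TestGoptInference.FoldingConvDimshuffle'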
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
