From 6b4738df30bb85b7f9a7cb4281f03dba75031c86 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 19 Mar 2019 11:26:25 +0100 Subject: [PATCH] rearrange pygraph/kernels file. --- notebooks/check_gm.py | 4 +- notebooks/preimage/results.gm.npz | Bin 0 -> 17288 bytes notebooks/run_marginalizedkernel.py | 24 +- notebooks/run_structuralspkernel.py | 2 +- preimage/preimage.py | 141 ++++ .../kernels/{ => building}/cyclicPatternKernel.py | 0 pygraph/kernels/{ => building}/pathKernel.py | 0 .../kernels/{ => building}/treePatternKernel.py | 0 pygraph/kernels/{ => building}/treeletKernel.py | 0 .../{ => building}/weisfeilerLehmanKernel.py | 0 pygraph/kernels/else/rwalk_sym.py | 842 +++++++++++++++++++++ pygraph/kernels/else/sp_sym.py | 200 +++++ pygraph/kernels/else/ssp_sym.py | 464 ++++++++++++ pygraph/kernels/marginalizedKernel.py | 33 +- pygraph/kernels/spKernel.py | 5 +- pygraph/kernels/structuralspKernel.py | 2 +- pygraph/utils/model_selection_precomputed.py | 14 +- pygraph/utils/parallel.py | 18 +- 18 files changed, 1705 insertions(+), 44 deletions(-) create mode 100644 notebooks/preimage/results.gm.npz create mode 100644 preimage/preimage.py rename pygraph/kernels/{ => building}/cyclicPatternKernel.py (100%) rename pygraph/kernels/{ => building}/pathKernel.py (100%) rename pygraph/kernels/{ => building}/treePatternKernel.py (100%) rename pygraph/kernels/{ => building}/treeletKernel.py (100%) rename pygraph/kernels/{ => building}/weisfeilerLehmanKernel.py (100%) create mode 100644 pygraph/kernels/else/rwalk_sym.py create mode 100644 pygraph/kernels/else/sp_sym.py create mode 100644 pygraph/kernels/else/ssp_sym.py diff --git a/notebooks/check_gm.py b/notebooks/check_gm.py index d301996..e778751 100644 --- a/notebooks/check_gm.py +++ b/notebooks/check_gm.py @@ -12,8 +12,8 @@ import matplotlib.pyplot as plt from numpy.linalg import eig # read gram matrices from file. 
-results_dir = 'results/marginalizedkernel'
-ds_name = 'Letter-med'
+results_dir = 'results/marginalizedkernel/myria'
+ds_name = 'ENZYMES'
 gmfile = np.load(results_dir + '/' + ds_name + '.gm.npz')
 #print('gm time: ', gmfile['gmtime'])
 # a list to store gram matrices for all param_grid_precomputed
diff --git a/notebooks/preimage/results.gm.npz b/notebooks/preimage/results.gm.npz
new file mode 100644
index 0000000000000000000000000000000000000000..beef66d9590a2c0dd00191a3acb80215253414d3
GIT binary patch
literal 17288
[17288 bytes of base85-encoded binary patch data omitted]
literal 0
KcmV+b0RR6000031

diff --git a/notebooks/run_marginalizedkernel.py b/notebooks/run_marginalizedkernel.py
index df37cb9..e7b7ffd 100644
--- a/notebooks/run_marginalizedkernel.py
+++ b/notebooks/run_marginalizedkernel.py
@@ -12,17 +12,17 @@ import multiprocessing
 from pygraph.kernels.marginalizedKernel import marginalizedkernel

 dslist = [
-#    {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
-#     'task': 'regression'},  # node symb
-#    {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
-#     'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', },
-#    # contains single node graph, node symb
-#    {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', },  # node/edge symb
-#    {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', },  # unlabeled
-#    {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
-#     'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}},  # node/edge symb
-#    {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
-#    # node nsymb
+    {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
+     'task': 'regression'},  # node symb
+    {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
+     'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', },
+    # contains single node graph, node symb
+    {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', },  # node/edge symb
+    {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', },  # unlabeled
+    {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
+     'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}},  # node/edge symb
+    {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
+    # node nsymb
     {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
     # node symb/nsymb
#     {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
@@ -81,5 +81,5 @@ for ds in dslist:
         extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
         ds_name=ds['name'], n_jobs=multiprocessing.cpu_count(),
-        read_gm_from_file=True)
+        read_gm_from_file=False)
 print()
diff --git a/notebooks/run_structuralspkernel.py b/notebooks/run_structuralspkernel.py
index f8fd1a0..339a9aa 100644
--- a/notebooks/run_structuralspkernel.py
+++ b/notebooks/run_structuralspkernel.py
@@ -65,7 +65,7 @@ param_grid_precomputed = {'node_kernels': [{'symb': deltakernel,
                                             'nsymb': gaussiankernel, 'mix': mixkernel}],
                           'edge_kernels': [{'symb': deltakernel,
                                             'nsymb': gaussiankernel, 'mix': mixkernel}],
-                          'compute_method': ['trie']}
+                          'compute_method': ['naive']}
 param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
               {'alpha': np.logspace(-10, 10, num=41, base=10)}]
diff --git a/preimage/preimage.py b/preimage/preimage.py
new file mode 100644
index 0000000..c466087
--- /dev/null
+++ b/preimage/preimage.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Mar  6 16:03:11 2019
+
+pre-image
+@author: ljia
+"""
+
+import sys
+import numpy as np
+import multiprocessing
+from tqdm import tqdm
+import networkx as nx
+import matplotlib.pyplot as plt
+
+sys.path.insert(0, "../")
+from pygraph.kernels.marginalizedKernel import marginalizedkernel
+from pygraph.utils.graphfiles import loadDataset
+
+ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
+      'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}  # node/edge symb
+
+DN, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
+DN = DN[0:10]
+
+lmbda = 0.03  # termination probability of the random walks
+r_max = 10  # maximum number of search iterations
+l = 500  # number of edge-edit trials per candidate graph
+alpha_range = np.linspace(0.1, 0.9, 9)
+k = 5  # k nearest neighbors
+
+# randomly select two molecules
+np.random.seed(1)
+idx1, idx2 = np.random.randint(0, len(DN), 2)
+g1 = DN[idx1]
+g2 = DN[idx2]
+
+# precompute kernels
+k_list = []  # kernel between each graph and itself.
+k_g1_list = []  # kernel between each graph and g1
+k_g2_list = []  # kernel between each graph and g2
+for ig, g in tqdm(enumerate(DN), desc='computing self kernels', file=sys.stdout):
+    ktemp = marginalizedkernel([g, g1, g2], node_label='atom', edge_label=None,
+                               p_quit=lmbda, n_iteration=20, remove_totters=False,
+                               n_jobs=multiprocessing.cpu_count(), verbose=False)
+    k_list.append(ktemp[0][0, 0])
+    k_g1_list.append(ktemp[0][0, 1])
+    k_g2_list.append(ktemp[0][0, 2])
+
+g_best = []
+dis_best = []
+# for each alpha
+for alpha in alpha_range:
+    print('alpha =', alpha)
+    # compute the k nearest neighbors of phi in DN.
+    dis_list = []  # distance between g_star and each graph.
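+    # The loop below evaluates the squared distance in the kernel's feature
+    # space between phi = alpha * phi(g1) + (1 - alpha) * phi(g2) and each
+    # phi(g), expanded purely in kernel values (a sketch of the identity
+    # being implemented):
+    #   d(phi, g)^2 = k(g, g) - 2 * [alpha * k(g, g1) + (1 - alpha) * k(g, g2)]
+    #                 + alpha^2 * k(g1, g1) + 2 * alpha * (1 - alpha) * k(g1, g2)
+    #                 + (1 - alpha)^2 * k(g2, g2),
+    # where k(g1, g2) appears below as k_g2_list[idx1] and k_g1_list[idx2].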
+    for ig, g in tqdm(enumerate(DN), desc='computing distances', file=sys.stdout):
+        dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
+                k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
+                (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
+                k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
+        dis_list.append(dtemp)
+
+    # sort
+    sort_idx = np.argsort(dis_list)
+    dis_gs = [dis_list[idis] for idis in sort_idx[0:k]]
+    g0hat = DN[sort_idx[0]]  # the nearest neighbor of phi in DN
+    if dis_gs[0] == 0:  # the exact pre-image.
+        print('The exact pre-image is found from the input dataset.')
+        g_pimg = g0hat
+        break
+    dhat = dis_gs[0]  # the nearest distance
+    Dk = [DN[ig] for ig in sort_idx[0:k]]  # the k nearest neighbors
+    gihat_list = []
+
+    i = 1
+    r = 1
+    while r < r_max:
+        print('r =', r)
+        found = False
+        for ig, gs in enumerate(Dk + gihat_list):
+#            nx.draw_networkx(gs)
+#            plt.show()
+            fdgs = int(np.abs(np.ceil(np.log(alpha * dis_gs[ig]))))  # @todo ???
+            for trial in tqdm(range(0, l), desc='l loop', file=sys.stdout):
+                # add and delete edges.
+                gtemp = gs.copy()
+                np.random.seed()
+                # which edges to change.
+                idx_change = np.random.randint(0, nx.number_of_nodes(gs) *
+                                               (nx.number_of_nodes(gs) - 1), fdgs)
+                for item in idx_change:
+                    node1 = int(item / (nx.number_of_nodes(gs) - 1))
+                    node2 = (item - node1 * (nx.number_of_nodes(gs) - 1))
+                    if node2 >= node1:
+                        node2 += 1
+                    # @todo: is the randomness correct?
+                    if not gtemp.has_edge(node1, node2):
+                        gtemp.add_edges_from([(node1, node2, {'bond_type': 0})])
+#                        nx.draw_networkx(gs)
+#                        plt.show()
+#                        nx.draw_networkx(gtemp)
+#                        plt.show()
+                    else:
+                        gtemp.remove_edge(node1, node2)
+#                        nx.draw_networkx(gs)
+#                        plt.show()
+#                        nx.draw_networkx(gtemp)
+#                        plt.show()
+
+                # compute distance between phi and the newly generated graph.
+                knew = marginalizedkernel([gtemp, g1, g2], node_label='atom', edge_label=None,
+                                          p_quit=lmbda, n_iteration=20, remove_totters=False,
+                                          n_jobs=multiprocessing.cpu_count(), verbose=False)
+                dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) *
+                       knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
+                       (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
+                       k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
+                if dnew <= dhat:  # the new distance is smaller
+                    print('I am smaller!')
+                    dhat = dnew
+                    gnew = gtemp.copy()
+                    found = True  # found a better graph.
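+        # If any trial produced a graph closer to phi than the current best,
+        # continue the search from that graph; otherwise increase r, giving
+        # up once r_max is reached.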
+        if found:
+            gihat_list = [gnew]
+            dis_gs.append(dhat)
+        else:
+            r += 1
+    dis_best.append(dhat)
+    g_best += ([g0hat] if len(gihat_list) == 0 else gihat_list)
+
+for idx, item in enumerate(alpha_range):
+    print('when alpha is', item, 'the shortest distance is', dis_best[idx])
+    print('the corresponding pre-image is')
+    nx.draw_networkx(g_best[idx])
+    plt.show()
\ No newline at end of file
diff --git a/pygraph/kernels/cyclicPatternKernel.py b/pygraph/kernels/building/cyclicPatternKernel.py
similarity index 100%
rename from pygraph/kernels/cyclicPatternKernel.py
rename to pygraph/kernels/building/cyclicPatternKernel.py
diff --git a/pygraph/kernels/pathKernel.py b/pygraph/kernels/building/pathKernel.py
similarity index 100%
rename from pygraph/kernels/pathKernel.py
rename to pygraph/kernels/building/pathKernel.py
diff --git a/pygraph/kernels/treePatternKernel.py b/pygraph/kernels/building/treePatternKernel.py
similarity index 100%
rename from pygraph/kernels/treePatternKernel.py
rename to pygraph/kernels/building/treePatternKernel.py
diff --git a/pygraph/kernels/treeletKernel.py b/pygraph/kernels/building/treeletKernel.py
similarity index 100%
rename from pygraph/kernels/treeletKernel.py
rename to pygraph/kernels/building/treeletKernel.py
diff --git a/pygraph/kernels/weisfeilerLehmanKernel.py b/pygraph/kernels/building/weisfeilerLehmanKernel.py
similarity index 100%
rename from pygraph/kernels/weisfeilerLehmanKernel.py
rename to pygraph/kernels/building/weisfeilerLehmanKernel.py
diff --git a/pygraph/kernels/else/rwalk_sym.py b/pygraph/kernels/else/rwalk_sym.py
new file mode 100644
index 0000000..f45cd9a
--- /dev/null
+++ b/pygraph/kernels/else/rwalk_sym.py
@@ -0,0 +1,842 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Dec 23 16:53:57 2018
+
+@author: ljia
+@references: S. V. N. Vishwanathan, Nicol N. Schraudolph, Risi Kondor, and
+Karsten M. Borgwardt. Graph kernels. Journal of Machine Learning Research,
+11(Apr):1201–1242, 2010.
+"""
+
+import sys
+sys.path.insert(0, "../")
+import time
+from functools import partial
+from tqdm import tqdm
+
+import networkx as nx
+import numpy as np
+from scipy.sparse import identity, kron
+from scipy.sparse.linalg import cg
+from scipy.optimize import fixed_point
+
+from pygraph.utils.graphdataset import get_dataset_attributes
+from pygraph.utils.parallel import parallel_gm
+
+
+def randomwalkkernel(*args,
+                     # params for all methods.
+                     compute_method=None,
+                     weight=1,
+                     p=None,
+                     q=None,
+                     edge_weight=None,
+                     # params for the conjugate and fp methods.
+                     node_kernels=None,
+                     edge_kernels=None,
+                     node_label='atom',
+                     edge_label='bond_type',
+                     # params for the spectral method.
+                     sub_kernel=None,
+                     n_jobs=None):
+    """Calculate random walk graph kernels.
+
+    Parameters
+    ----------
+    Gn : List of NetworkX graph
+        List of graphs between which the kernels are calculated.
+    /
+    G1, G2 : NetworkX graphs
+        2 graphs between which the kernel is calculated.
+    node_label : string
+        Node attribute used as label. The default node label is 'atom'.
+    edge_label : string
+        Edge attribute used as label. The default edge label is 'bond_type'.
+    weight : float
+        Decay/weight parameter of the walks (the lambda passed to the
+        underlying solvers).
+    compute_method : string
+        Method used to compute the random walk kernel. Available methods are
+        'sylvester', 'conjugate', 'fp', 'spectral' and 'kron'.
+
+    Return
+    ------
+    Kmatrix : Numpy matrix
+        Kernel matrix, each element of which is the random walk kernel between
+        2 graphs.
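+
+    Example
+    -------
+    A minimal usage sketch (illustrative only; ``sub_kernels`` and the
+    parameter values here are assumptions, not part of this patch)::
+
+        Kmatrix, run_time, idx = randomwalkkernel(
+            Gn, compute_method='conjugate', weight=1e-3,
+            node_kernels=sub_kernels, edge_kernels=sub_kernels,
+            n_jobs=multiprocessing.cpu_count())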
+    """
+    compute_method = compute_method.lower()
+    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
+
+    eweight = None
+    if edge_weight is None:
+        print('\n None edge weight specified. Set all weight to 1.\n')
+    else:
+        try:
+            some_weight = list(
+                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
+            if isinstance(some_weight, (float, int)):
+                eweight = edge_weight
+            else:
+                print('\n Edge weight with name %s is not float or integer. '
+                      'Set all weight to 1.\n' % edge_weight)
+        except Exception:
+            print('\n Edge weight with name "%s" is not found in the edge '
+                  'attributes. Set all weight to 1.\n' % edge_weight)
+
+    ds_attrs = get_dataset_attributes(
+        Gn,
+        attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
+                    'edge_attr_dim', 'is_directed'],
+        node_label=node_label,
+        edge_label=edge_label)
+    ds_attrs['node_attr_dim'] = 0
+    ds_attrs['edge_attr_dim'] = 0
+
+    # remove graphs with no edges, as no walk can be found in their structures,
+    # so the weight matrix between such a graph and itself might be zero.
+    len_gn = len(Gn)
+    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
+    idx = [G[0] for G in Gn]
+    Gn = [G[1] for G in Gn]
+    if len(Gn) != len_gn:
+        print('\n %d graphs are removed as they don\'t contain edges.\n' %
+              (len_gn - len(Gn)))
+
+    start_time = time.time()
+
+#    # get vertex and edge concatenated labels for each graph
+#    label_list, d = getLabels(Gn, node_label, edge_label, ds_attrs['is_directed'])
+#    gmf = filterGramMatrix(A_wave_list[0], label_list[0], ('C', '0', 'O'), ds_attrs['is_directed'])
+
+    if compute_method == 'sylvester':
+        import warnings
+        warnings.warn('All labels are ignored.')
+        Kmatrix = _sylvester_equation(Gn, weight, p, q, eweight, n_jobs)
+
+    elif compute_method == 'conjugate':
+        Kmatrix = _conjugate_gradient(Gn, weight, p, q, ds_attrs,
+                                      node_kernels, edge_kernels,
+                                      node_label, edge_label, eweight, n_jobs)
+
+    elif compute_method == 'fp':
+        Kmatrix = _fixed_point(Gn, weight, p, q, ds_attrs, node_kernels,
+                               edge_kernels, node_label, edge_label,
+                               eweight, n_jobs)
+
+    elif compute_method == 'spectral':
+        import warnings
+        warnings.warn('All labels are ignored. Only works for undirected graphs.')
+        Kmatrix = _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs)
+
+    elif compute_method == 'kron':
+        # initialize the kernel matrix here; unlike the other branches, this
+        # one fills it entry by entry instead of receiving it from a helper.
+        Kmatrix = np.zeros((len(Gn), len(Gn)))
+        for i in range(0, len(Gn)):
+            for j in range(i, len(Gn)):
+                Kmatrix[i][j] = _randomwalkkernel_kron(Gn[i], Gn[j],
+                                                       node_label, edge_label)
+                Kmatrix[j][i] = Kmatrix[i][j]
+    else:
+        raise Exception(
+            'compute method name incorrect. Available methods: "sylvester", '
+            '"conjugate", "fp", "spectral" and "kron".')
+
+    run_time = time.time() - start_time
+    print("\n --- kernel matrix of random walk kernel of size %d built in %s seconds ---"
+          % (len(Gn), run_time))
+
+    return Kmatrix, run_time, idx
+
+
+###############################################################################
+def _sylvester_equation(Gn, lmda, p, q, eweight, n_jobs):
+    """Calculate walk graph kernels up to n between 2 graphs using the
+    Sylvester equation method.
+
+    Parameters
+    ----------
+    G1, G2 : NetworkX graph
+        Graphs between which the kernel is calculated.
+    node_label : string
+        Node attribute used as label.
+    edge_label : string
+        Edge attribute used as label.
+
+    Return
+    ------
+    kernel : float
+        Kernel between 2 graphs.
+    """
+    Kmatrix = np.zeros((len(Gn), len(Gn)))
+
+    if q is None:
+        # don't normalize adjacency matrices if q is a uniform vector. Note
+        # A_wave_list actually contains the transposes of the adjacency matrices.
+        A_wave_list = [
+            nx.adjacency_matrix(G, eweight).todense().transpose() for G in tqdm(
+                Gn, desc='compute adjacency matrices', file=sys.stdout)
+        ]
+#        # normalized adjacency matrices
+#        A_wave_list = []
+#        for G in tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout):
+#            A_tilde = nx.adjacency_matrix(G, eweight).todense().transpose()
+#            norm = A_tilde.sum(axis=0)
+#            norm[norm == 0] = 1
+#            A_wave_list.append(A_tilde / norm)
+        if p is None:  # p is uniform distribution as default.
+            def init_worker(Awl_toshare):
+                global G_Awl
+                G_Awl = Awl_toshare
+            do_partial = partial(wrapper_se_do, lmda)
+            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
+                        glbv=(A_wave_list,), n_jobs=n_jobs)
+
+#            pbar = tqdm(
+#                total=(1 + len(Gn)) * len(Gn) / 2,
+#                desc='calculating kernels',
+#                file=sys.stdout)
+#            for i in range(0, len(Gn)):
+#                for j in range(i, len(Gn)):
+#                    S = lmda * A_wave_list[j]
+#                    T_t = A_wave_list[i]
+#                    # use uniform distribution if there is no prior knowledge.
+#                    nb_pd = len(A_wave_list[i]) * len(A_wave_list[j])
+#                    p_times_uni = 1 / nb_pd
+#                    M0 = np.full((len(A_wave_list[j]), len(A_wave_list[i])), p_times_uni)
+#                    X = dlyap(S, T_t, M0)
+#                    X = np.reshape(X, (-1, 1), order='F')
+#                    # use uniform distribution if there is no prior knowledge.
+#                    q_times = np.full((1, nb_pd), p_times_uni)
+#                    Kmatrix[i][j] = np.dot(q_times, X)
+#                    Kmatrix[j][i] = Kmatrix[i][j]
+#                    pbar.update(1)
+
+    return Kmatrix
+
+
+def wrapper_se_do(lmda, itr):
+    i = itr[0]
+    j = itr[1]
+    return i, j, _se_do(G_Awl[i], G_Awl[j], lmda)
+
+
+def _se_do(A_wave1, A_wave2, lmda):
+    from control import dlyap
+    S = lmda * A_wave2
+    T_t = A_wave1
+    # use uniform distribution if there is no prior knowledge.
+    nb_pd = len(A_wave1) * len(A_wave2)
+    p_times_uni = 1 / nb_pd
+    M0 = np.full((len(A_wave2), len(A_wave1)), p_times_uni)
+    X = dlyap(S, T_t, M0)
+    X = np.reshape(X, (-1, 1), order='F')
+    # use uniform distribution if there is no prior knowledge.
+    q_times = np.full((1, nb_pd), p_times_uni)
+    return np.dot(q_times, X)
+
+
+###############################################################################
+def _conjugate_gradient(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels,
+                        node_label, edge_label, eweight, n_jobs):
+    """Calculate walk graph kernels up to n between 2 graphs using the
+    conjugate gradient method.
+
+    Parameters
+    ----------
+    G1, G2 : NetworkX graph
+        Graphs between which the kernel is calculated.
+    node_label : string
+        Node attribute used as label.
+    edge_label : string
+        Edge attribute used as label.
+
+    Return
+    ------
+    kernel : float
+        Kernel between 2 graphs.
+    """
+    Kmatrix = np.zeros((len(Gn), len(Gn)))
+
+#    if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \
+#        not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] < 1:
+#        # this is faster for unlabeled graphs. @todo: why?
+#        if q == None:
+#            # don't normalize adjacency matrices if q is a uniform vector. Note
+#            # A_wave_list actually contains the transposes of the adjacency matrices.
+#            A_wave_list = [
+#                nx.adjacency_matrix(G, eweight).todense().transpose() for G in
+#                tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout)
+#            ]
+#            if p == None:  # p is uniform distribution as default.
+#                def init_worker(Awl_toshare):
+#                    global G_Awl
+#                    G_Awl = Awl_toshare
+#                do_partial = partial(wrapper_cg_unlabled_do, lmda)
+#                parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
+#                            glbv=(A_wave_list,), n_jobs=n_jobs)
+#    else:
+    # reindex nodes using consecutive integers for the convenience of kernel
+    # calculation.
+    Gn = [nx.convert_node_labels_to_integers(
+        g, first_label=0, label_attribute='label_orignal') for g in tqdm(
+            Gn, desc='reindex vertices', file=sys.stdout)]
+
+    if p is None and q is None:  # p and q are uniform distributions as default.
+        def init_worker(gn_toshare):
+            global G_gn
+            G_gn = gn_toshare
+        do_partial = partial(wrapper_cg_labled_do, ds_attrs, node_kernels,
+                             node_label, edge_kernels, edge_label, lmda)
+        parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
+                    glbv=(Gn,), n_jobs=n_jobs)
+
+#        pbar = tqdm(
+#            total=(1 + len(Gn)) * len(Gn) / 2,
+#            desc='calculating kernels',
+#            file=sys.stdout)
+#        for i in range(0, len(Gn)):
+#            for j in range(i, len(Gn)):
+#                result = _cg_labled_do(Gn[i], Gn[j], ds_attrs, node_kernels,
+#                                       node_label, edge_kernels, edge_label, lmda)
+#                Kmatrix[i][j] = result
+#                Kmatrix[j][i] = Kmatrix[i][j]
+#                pbar.update(1)
    return Kmatrix
+
+
+def wrapper_cg_unlabled_do(lmda, itr):
+    i = itr[0]
+    j = itr[1]
+    return i, j, _cg_unlabled_do(G_Awl[i], G_Awl[j], lmda)
+
+
+def _cg_unlabled_do(A_wave1, A_wave2, lmda):
+    nb_pd = len(A_wave1) * len(A_wave2)
+    p_times_uni = 1 / nb_pd
+    w_times = kron(A_wave1, A_wave2).todense()
+    A = identity(w_times.shape[0]) - w_times * lmda
+    b = np.full((nb_pd, 1), p_times_uni)
+    x, _ = cg(A, b)
+    # use uniform distribution if there is no prior knowledge.
+    q_times = np.full((1, nb_pd), p_times_uni)
+    return np.dot(q_times, x)
+
+
+def wrapper_cg_labled_do(ds_attrs, node_kernels, node_label, edge_kernels,
+                         edge_label, lmda, itr):
+    i = itr[0]
+    j = itr[1]
+    return i, j, _cg_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels,
+                               node_label, edge_kernels, edge_label, lmda)
+
+
+def _cg_labled_do(g1, g2, ds_attrs, node_kernels, node_label,
+                  edge_kernels, edge_label, lmda):
+    # First, compute kernels between all pairs of nodes, a method borrowed
+    # from FCSP. It is faster than directly computing all edge kernels
+    # when $d_1 d_2 > 2$, where $d_1$ and $d_2$ are the vertex degrees of the
+    # graphs compared, which covers most of the cases we encountered. For very
+    # sparse graphs, this would be slow.
+    vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label)
+
+    # Compute the weight matrix of the direct product graph.
+    w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs,
+                              edge_kernels, edge_label)
+    # use uniform distribution if there is no prior knowledge.
+    p_times_uni = 1 / w_dim
+    A = identity(w_times.shape[0]) - w_times * lmda
+    b = np.full((w_dim, 1), p_times_uni)
+    x, _ = cg(A, b)
+    # use uniform distribution if there is no prior knowledge.
+    q_times = np.full((1, w_dim), p_times_uni)
+    return np.dot(q_times, x)
+
+
+###############################################################################
+def _fixed_point(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels,
+                 node_label, edge_label, eweight, n_jobs):
+    """Calculate walk graph kernels up to n between 2 graphs using the
+    fixed-point method.
+
+    Parameters
+    ----------
+    G1, G2 : NetworkX graph
+        Graphs between which the kernel is calculated.
+    node_label : string
+        Node attribute used as label.
+    edge_label : string
+        Edge attribute used as label.
+
+    Return
+    ------
+    kernel : float
+        Kernel between 2 graphs.
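+
+    Notes
+    -----
+    A sketch of the computation (inferred from ``func_fp`` and the helper
+    functions below): with W the weight matrix of the direct product graph
+    and p, q uniform, the method solves x = p + lmda * W x by fixed-point
+    iteration and returns the kernel value q^T x.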
+    """
+    Kmatrix = np.zeros((len(Gn), len(Gn)))
+
+#    if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \
+#        not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] > 1:
+#        # this is faster for unlabeled graphs. @todo: why?
+#        if q == None:
+#            # don't normalize adjacency matrices if q is a uniform vector. Note
+#            # A_wave_list actually contains the transposes of the adjacency matrices.
+#            A_wave_list = [
+#                nx.adjacency_matrix(G, eweight).todense().transpose() for G in
+#                tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout)
+#            ]
+#            if p == None:  # p is uniform distribution as default.
+#                pbar = tqdm(
+#                    total=(1 + len(Gn)) * len(Gn) / 2,
+#                    desc='calculating kernels',
+#                    file=sys.stdout)
+#                for i in range(0, len(Gn)):
+#                    for j in range(i, len(Gn)):
+#                        # use uniform distribution if there is no prior knowledge.
+#                        nb_pd = len(A_wave_list[i]) * len(A_wave_list[j])
+#                        p_times_uni = 1 / nb_pd
+#                        w_times = kron(A_wave_list[i], A_wave_list[j]).todense()
+#                        p_times = np.full((nb_pd, 1), p_times_uni)
+#                        x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times))
+#                        # use uniform distribution if there is no prior knowledge.
+#                        q_times = np.full((1, nb_pd), p_times_uni)
+#                        Kmatrix[i][j] = np.dot(q_times, x)
+#                        Kmatrix[j][i] = Kmatrix[i][j]
+#                        pbar.update(1)
+#    else:
+    # reindex nodes using consecutive integers for the convenience of kernel
+    # calculation.
+    Gn = [nx.convert_node_labels_to_integers(
+        g, first_label=0, label_attribute='label_orignal') for g in tqdm(
+            Gn, desc='reindex vertices', file=sys.stdout)]
+
+    if p is None and q is None:  # p and q are uniform distributions as default.
+        def init_worker(gn_toshare):
+            global G_gn
+            G_gn = gn_toshare
+        do_partial = partial(wrapper_fp_labled_do, ds_attrs, node_kernels,
+                             node_label, edge_kernels, edge_label, lmda)
+        parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
+                    glbv=(Gn,), n_jobs=n_jobs)
+    return Kmatrix
+
+
+def wrapper_fp_labled_do(ds_attrs, node_kernels, node_label, edge_kernels,
+                         edge_label, lmda, itr):
+    i = itr[0]
+    j = itr[1]
+    return i, j, _fp_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels,
+                               node_label, edge_kernels, edge_label, lmda)
+
+
+def _fp_labled_do(g1, g2, ds_attrs, node_kernels, node_label,
+                  edge_kernels, edge_label, lmda):
+    # First, compute kernels between all pairs of nodes, a method borrowed
+    # from FCSP. It is faster than directly computing all edge kernels
+    # when $d_1 d_2 > 2$, where $d_1$ and $d_2$ are the vertex degrees of the
+    # graphs compared, which covers most of the cases we encountered. For very
+    # sparse graphs, this would be slow.
+    vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label)
+
+    # Compute the weight matrix of the direct product graph.
+    w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs,
+                              edge_kernels, edge_label)
+    # use uniform distribution if there is no prior knowledge.
+    p_times_uni = 1 / w_dim
+    p_times = np.full((w_dim, 1), p_times_uni)
+    x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times),
+                    xtol=1e-06, maxiter=1000)
+    # use uniform distribution if there is no prior knowledge.
+    q_times = np.full((1, w_dim), p_times_uni)
+    return np.dot(q_times, x)
+
+
+def func_fp(x, p_times, lmda, w_times):
+    # one step of the fixed-point iteration: x <- p + lmda * W x.
+    return p_times + lmda * np.dot(w_times, x)
+
+
+###############################################################################
+def _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs):
+    """Calculate walk graph kernels up to n between 2 unlabeled graphs using
+    the spectral decomposition method. Labels will be ignored.
+
+    Parameters
+    ----------
+    G1, G2 : NetworkX graph
+        Graphs between which the kernel is calculated.
+    node_label : string
+        Node attribute used as label.
+    edge_label : string
+        Edge attribute used as label.
+
+    Return
+    ------
+    kernel : float
+        Kernel between 2 graphs.
+    """
+    Kmatrix = np.zeros((len(Gn), len(Gn)))
+
+    if q is None:
+        # precompute the spectral decomposition of each graph.
+        P_list = []
+        D_list = []
+        for G in tqdm(Gn, desc='spectral decompose', file=sys.stdout):
+            # don't normalize adjacency matrices if q is a uniform vector. Note
+            # A actually is the transpose of the adjacency matrix.
+            A = nx.adjacency_matrix(G, eweight).todense().transpose()
+            ew, ev = np.linalg.eig(A)
+            D_list.append(ew)
+            P_list.append(ev)
+#        P_inv_list = [p.T for p in P_list]  # @todo: also works for directed graphs?
+
+        if p is None:  # p is uniform distribution as default.
+            q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in Gn]
+#            q_T_list = [q.T for q in q_list]
+            def init_worker(q_T_toshare, P_toshare, D_toshare):
+                global G_q_T, G_P, G_D
+                G_q_T = q_T_toshare
+                G_P = P_toshare
+                G_D = D_toshare
+            do_partial = partial(wrapper_sd_do, weight, sub_kernel)
+            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
+                        glbv=(q_T_list, P_list, D_list), n_jobs=n_jobs)
+
+#            pbar = tqdm(
+#                total=(1 + len(Gn)) * len(Gn) / 2,
+#                desc='calculating kernels',
+#                file=sys.stdout)
+#            for i in range(0, len(Gn)):
+#                for j in range(i, len(Gn)):
+#                    result = _sd_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j],
+#                                    D_list[i], D_list[j], weight, sub_kernel)
+#                    Kmatrix[i][j] = result
+#                    Kmatrix[j][i] = Kmatrix[i][j]
+#                    pbar.update(1)
+    return Kmatrix
+
+
+def wrapper_sd_do(weight, sub_kernel, itr):
+    i = itr[0]
+    j = itr[1]
+    return i, j, _sd_do(G_q_T[i], G_q_T[j], G_P[i], G_P[j], G_D[i], G_D[j],
+                        weight, sub_kernel)
+
+
+def _sd_do(q_T1, q_T2, P1, P2, D1, D2, weight, sub_kernel):
+    # use uniform distribution if there is no prior knowledge.
+    kl = kron(np.dot(q_T1, P1), np.dot(q_T2, P2)).todense()
+    # @todo: this is not needed when p = q (kr = kl.T) for undirected graphs
+#    kr = kron(np.dot(P_inv_list[i], q_list[i]), np.dot(P_inv_list[j], q_list[j])).todense()
+    if sub_kernel == 'exp':
+        D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2])
+        kmiddle = np.diag(np.exp(weight * D_diag))
+    elif sub_kernel == 'geo':
+        D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2])
+        kmiddle = np.diag(weight * D_diag)
+        kmiddle = np.identity(len(kmiddle)) - weight * kmiddle
+        kmiddle = np.linalg.inv(kmiddle)
+    return np.dot(np.dot(kl, kmiddle), kl.T)[0, 0]
+
+
+###############################################################################
+def _randomwalkkernel_kron(G1, G2, node_label, edge_label):
+    """Calculate walk graph kernels up to n between 2 graphs using the nearest
+    Kronecker product approximation method.
+
+    Parameters
+    ----------
+    G1, G2 : NetworkX graph
+        Graphs between which the kernel is calculated.
+    node_label : string
+        Node attribute used as label.
+    edge_label : string
+        Edge attribute used as label.
+
+    Return
+    ------
+    kernel : float
+        Kernel between 2 graphs.
+    """
+    pass  # @todo: not implemented yet.
+
+
+###############################################################################
+def getLabels(Gn, node_label, edge_label, directed):
+    """Get symbolic labels of a graph dataset, where vertex labels are dealt
+    with by concatenating them to the edge labels of adjacent edges.
+    """
+    label_list = []
+    label_set = set()
+    for g in Gn:
+        label_g = {}
+        for e in g.edges(data=True):
+            nl1 = g.node[e[0]][node_label]
+            nl2 = g.node[e[1]][node_label]
+            if not directed and nl1 > nl2:
+                nl1, nl2 = nl2, nl1
+            label = (nl1, e[2][edge_label], nl2)
+            label_g[(e[0], e[1])] = label
+        label_list.append(label_g)
+    label_set = set([l for lg in label_list for l in lg.values()])
+    return label_list, len(label_set)
+
+
+def filterGramMatrix(gmt, label_dict, label, directed):
+    """Compute (the transpose of) the Gram matrix filtered by a label.
+    """
+    gmf = np.zeros(gmt.shape)
+    for (n1, n2), l in label_dict.items():
+        if l == label:
+            gmf[n2, n1] = gmt[n2, n1]
+            if not directed:
+                gmf[n1, n2] = gmt[n1, n2]
+    return gmf
+
+
+def computeVK(g1, g2, ds_attrs, node_kernels, node_label):
+    '''Compute vertex kernels between all pairs of vertices of two graphs.
+    '''
+    vk_dict = {}  # vertex kernels dict
+    if ds_attrs['node_labeled']:
+        # node symb and non-symb labeled
+        if ds_attrs['node_attr_dim'] > 0:
+            kn = node_kernels['mix']
+            for n1 in g1.nodes(data=True):
+                for n2 in g2.nodes(data=True):
+                    vk_dict[(n1[0], n2[0])] = kn(
+                        n1[1][node_label], n2[1][node_label],
+                        n1[1]['attributes'], n2[1]['attributes'])
+        # node symb labeled
+        else:
+            kn = node_kernels['symb']
+            for n1 in g1.nodes(data=True):
+                for n2 in g2.nodes(data=True):
+                    vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
+                                                 n2[1][node_label])
+    else:
+        # node non-symb labeled
+        if ds_attrs['node_attr_dim'] > 0:
+            kn = node_kernels['nsymb']
+            for n1 in g1.nodes(data=True):
+                for n2 in g2.nodes(data=True):
+                    vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
+                                                 n2[1]['attributes'])
+        # node unlabeled
+        else:
+            pass
+    return vk_dict
+
+
+def computeW(g1, g2, vk_dict, ds_attrs, edge_kernels, edge_label):
+    '''Compute the weight matrix of the direct product graph.
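+
+    Each row/column of the returned matrix corresponds to a pair of vertices
+    (u1, u2) of g1 x g2. A sketch of the entries filled in below: for every
+    pair of edges (u1, v1) in g1 and (u2, v2) in g2,
+        W[(u1, u2), (v1, v2)] = k_v(u1, u2) * k_e((u1, v1), (u2, v2)) * k_v(v1, v2),
+    where vertex kernels k_v (edge kernels k_e) are replaced by 1 when
+    vertices (edges) are unlabeled; for undirected graphs the symmetric index
+    combinations are filled with the same value, and when vertices are labeled
+    the contribution of the swapped vertex pairing is added as well.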
+    '''
+    w_dim = nx.number_of_nodes(g1) * nx.number_of_nodes(g2)
+    w_times = np.zeros((w_dim, w_dim))
+    if vk_dict:  # node labeled
+        if ds_attrs['is_directed']:
+            if ds_attrs['edge_labeled']:
+                # edge symb and non-symb labeled
+                if ds_attrs['edge_attr_dim'] > 0:
+                    ke = edge_kernels['mix']
+                    for e1 in g1.edges(data=True):
+                        for e2 in g2.edges(data=True):
+                            ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
+                                         e1[2]['attributes'], e2[2]['attributes'])
+                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
+                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
+                            w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
+                                * ek_temp * vk_dict[(e1[1], e2[1])]
+                # edge symb labeled
+                else:
+                    ke = edge_kernels['symb']
+                    for e1 in g1.edges(data=True):
+                        for e2 in g2.edges(data=True):
+                            ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
+                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
+                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
+                            w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
+                                * ek_temp * vk_dict[(e1[1], e2[1])]
+            else:
+                # edge non-symb labeled
+                if ds_attrs['edge_attr_dim'] > 0:
+                    ke = edge_kernels['nsymb']
+                    for e1 in g1.edges(data=True):
+                        for e2 in g2.edges(data=True):
+                            ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
+                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
+                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
+                            w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
+                                * ek_temp * vk_dict[(e1[1], e2[1])]
+                # edge unlabeled
+                else:
+                    for e1 in g1.edges(data=True):
+                        for e2 in g2.edges(data=True):
+                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
+                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
+                            w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
+                                * vk_dict[(e1[1], e2[1])]
+        else:  # undirected
+            if ds_attrs['edge_labeled']:
+                # edge symb and non-symb labeled
+                if ds_attrs['edge_attr_dim'] > 0:
+                    ke = edge_kernels['mix']
+                    for e1 in g1.edges(data=True):
+                        for e2 in g2.edges(data=True):
+                            ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
+                                         e1[2]['attributes'], e2[2]['attributes'])
+                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
+                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
+                            w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
+                                * ek_temp * vk_dict[(e1[1], e2[1])] \
+                                + vk_dict[(e1[0], e2[1])] \
+                                * ek_temp * vk_dict[(e1[1], e2[0])]
+                            w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
+                            w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
+                                      e1[1] * nx.number_of_nodes(g2) + e2[0])
+                            w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
+                            w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
+                # edge symb labeled
+                else:
+                    ke = edge_kernels['symb']
+                    for e1 in g1.edges(data=True):
+                        for e2 in g2.edges(data=True):
+                            ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
+                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
+                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
+                            w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
+                                * ek_temp * vk_dict[(e1[1], e2[1])] \
+                                + vk_dict[(e1[0], e2[1])] \
+                                * ek_temp * vk_dict[(e1[1], e2[0])]
+                            w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
+                            w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
+                                      e1[1] * nx.number_of_nodes(g2) + e2[0])
+                            w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
+                            w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
+            else:
+                # edge non-symb labeled
+                if ds_attrs['edge_attr_dim'] > 0:
+                    ke = edge_kernels['nsymb']
+                    for e1 in g1.edges(data=True):
+                        for e2 in g2.edges(data=True):
+                            ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
+                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
+                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
+                            w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
+                                * ek_temp * vk_dict[(e1[1], e2[1])] \
+                                + vk_dict[(e1[0], e2[1])] \
+                                * ek_temp * vk_dict[(e1[1], e2[0])]
+                            w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
+                            w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
+                                      e1[1] * nx.number_of_nodes(g2) + e2[0])
+                            w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
+                            w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
+                # edge unlabeled
+                else:
+                    for e1 in g1.edges(data=True):
+                        for e2 in g2.edges(data=True):
+                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
+                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
+                            w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
+                                * vk_dict[(e1[1], e2[1])] \
+                                + vk_dict[(e1[0], e2[1])] \
+                                * vk_dict[(e1[1], e2[0])]
+                            w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
+                            w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
+                                      e1[1] * nx.number_of_nodes(g2) + e2[0])
+                            w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
+                            w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
+    else:  # node unlabeled
+        if ds_attrs['is_directed']:
+            if ds_attrs['edge_labeled']:
+                # edge symb and non-symb labeled
+                if ds_attrs['edge_attr_dim'] > 0:
+                    ke = edge_kernels['mix']
+                    for e1 in g1.edges(data=True):
+                        for e2 in g2.edges(data=True):
+                            ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
+                                         e1[2]['attributes'], e2[2]['attributes'])
+                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
+                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
+                            w_times[w_idx] = ek_temp
+                # edge symb labeled
+                else:
+                    ke = edge_kernels['symb']
+                    for e1 in g1.edges(data=True):
+                        for e2 in g2.edges(data=True):
+                            ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
+                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
+                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
+                            w_times[w_idx] = ek_temp
+            else:
+                # edge non-symb labeled
+                if ds_attrs['edge_attr_dim'] > 0:
+                    ke = edge_kernels['nsymb']
+                    for e1 in g1.edges(data=True):
+                        for e2 in g2.edges(data=True):
+                            ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
+                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
+                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
+                            w_times[w_idx] = ek_temp
+                # edge unlabeled
+                else:
+                    for e1 in g1.edges(data=True):
+                        for e2 in g2.edges(data=True):
+                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
+                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
+                            w_times[w_idx] = 1
+        else:  # undirected
+            if ds_attrs['edge_labeled']:
+                # edge symb and non-symb labeled
+                if ds_attrs['edge_attr_dim'] > 0:
+                    ke = edge_kernels['mix']
+                    for e1 in g1.edges(data=True):
+                        for e2 in g2.edges(data=True):
+                            ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
+                                         e1[2]['attributes'], e2[2]['attributes'])
+                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
+                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
+                            w_times[w_idx] = ek_temp
+                            w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
+                            w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
+                                      e1[1] * nx.number_of_nodes(g2) + e2[0])
+                            w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
+                            w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
+                # edge symb labeled
+                else:
+                    ke = edge_kernels['symb']
+                    for e1 in g1.edges(data=True):
+                        for e2 in g2.edges(data=True):
+                            ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
+                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
+                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
+                            w_times[w_idx] = ek_temp
+                            w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
+                            w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
+                                      e1[1] * nx.number_of_nodes(g2) + e2[0])
+                            w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
+                            w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
+            else:
+                # edge non-symb labeled
+                if ds_attrs['edge_attr_dim'] > 0:
+                    ke = edge_kernels['nsymb']
+                    for e1 in g1.edges(data=True):
+                        for e2 in g2.edges(data=True):
+                            ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
+                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
+                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
+                            w_times[w_idx] = ek_temp
+                            w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
+                            w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
+                                      e1[1] * nx.number_of_nodes(g2) + e2[0])
+                            w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
+                            w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
+                # edge unlabeled
+                else:
+                    for e1 in g1.edges(data=True):
+                        for e2 in g2.edges(data=True):
+                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
+                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
+                            w_times[w_idx] = 1
+                            w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
+                            w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
+                                      e1[1] * nx.number_of_nodes(g2) + e2[0])
+                            w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
+                            w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
+    return w_times, w_dim
\ No newline at end of file
diff --git a/pygraph/kernels/else/sp_sym.py b/pygraph/kernels/else/sp_sym.py
new file mode 100644
index 0000000..649c65f
--- /dev/null
+++ b/pygraph/kernels/else/sp_sym.py
@@ -0,0 +1,200 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Dec 21 18:02:00 2018
+
+@author: ljia
+"""
+
+import sys
+import time
+from itertools import product
+from functools import partial
+from multiprocessing import Pool
+from tqdm import tqdm
+
+import networkx as nx
+import numpy as np
+
+from pygraph.utils.utils import getSPGraph
+from pygraph.utils.graphdataset import get_dataset_attributes
+from pygraph.utils.parallel import parallel_gm
+sys.path.insert(0, "../")
+
+
+def spkernel(*args,
+             node_label='atom',
+             edge_weight=None,
+             node_kernels=None,
+             n_jobs=None):
+    """Calculate shortest-path kernels between graphs.
+
+    Parameters
+    ----------
+    Gn : List of NetworkX graph
+        List of graphs between which the kernels are calculated.
+    /
+    G1, G2 : NetworkX graphs
+        2 graphs between which the kernel is calculated.
+    node_label : string
+        Node attribute used as label. The default node label is 'atom'.
+    edge_weight : string
+        Edge attribute name corresponding to the edge weight.
+    node_kernels : dict
+        A dictionary of kernel functions for nodes, including 3 items: 'symb'
+        for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
+        for both labels. The first 2 functions take two node labels as
+        parameters, and the 'mix' function takes 4 parameters, a symbolic and
+        a non-symbolic label for each of the two nodes. Each label is in the
+        form of a 2-D array (n_samples, n_features). Each function returns a
+        number as the kernel value. Ignored when nodes are unlabeled.
+
+    Return
+    ------
+    Kmatrix : Numpy matrix
+        Kernel matrix, each element of which is the sp kernel between 2 graphs.
+    """
+    # pre-process
+    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
+    weight = None
+    if edge_weight is None:
+        print('\n None edge weight specified. Set all weight to 1.\n')
+    else:
+        try:
+            some_weight = list(
+                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
+            if isinstance(some_weight, (float, int)):
+                weight = edge_weight
+            else:
+                print('\n Edge weight with name %s is not float or integer. '
+                      'Set all weight to 1.\n' % edge_weight)
+                    % edge_weight)
+        except:
+            print(
+                '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
+                % edge_weight)
+    ds_attrs = get_dataset_attributes(
+        Gn,
+        attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
+        node_label=node_label)
+    ds_attrs['node_attr_dim'] = 0
+
+    # remove graphs with no edges, as no sp can be found in their structures,
+    # so the kernel between such a graph and itself will be zero.
+    len_gn = len(Gn)
+    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
+    idx = [G[0] for G in Gn]
+    Gn = [G[1] for G in Gn]
+    if len(Gn) != len_gn:
+        print('\n %d graphs are removed as they don\'t contain edges.\n' %
+              (len_gn - len(Gn)))
+
+    start_time = time.time()
+
+    pool = Pool(n_jobs)
+    # get shortest path graphs of Gn
+    getsp_partial = partial(wrapper_getSPGraph, weight)
+    itr = zip(Gn, range(0, len(Gn)))
+    if len(Gn) < 100 * n_jobs:
+#        # use default chunksize as pool.map when iterable is less than 100
+#        chunksize, extra = divmod(len(Gn), n_jobs * 4)
+#        if extra:
+#            chunksize += 1
+        chunksize = int(len(Gn) / n_jobs) + 1
+    else:
+        chunksize = 100
+    for i, g in tqdm(
+            pool.imap_unordered(getsp_partial, itr, chunksize),
+            desc='getting sp graphs', file=sys.stdout):
+        Gn[i] = g
+    pool.close()
+    pool.join()
+
+    Kmatrix = np.zeros((len(Gn), len(Gn)))
+
+    # ---- use pool.imap_unordered to parallel and track progress. ----
+    def init_worker(gn_toshare):
+        global G_gn
+        G_gn = gn_toshare
+    do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
+    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
+                glbv=(Gn,), n_jobs=n_jobs)
+
+    run_time = time.time() - start_time
+    print(
+        "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
+        % (len(Gn), run_time))
+
+    return Kmatrix, run_time, idx
+
+
+def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels):
+
+    kernel = 0
+
+    # First, compute kernels between all pairs of nodes; the technique is
+    # borrowed from FCSP.
+    vk_dict = {}  # vertex kernel dict
+    if ds_attrs['node_labeled']:
+        # node symb and non-symb labeled
+        if ds_attrs['node_attr_dim'] > 0:
+            kn = node_kernels['mix']
+            for n1, n2 in product(
+                    g1.nodes(data=True), g2.nodes(data=True)):
+                vk_dict[(n1[0], n2[0])] = kn(
+                    n1[1][node_label], n2[1][node_label],
+                    n1[1]['attributes'], n2[1]['attributes'])
+        # node symb labeled
+        else:
+            kn = node_kernels['symb']
+            for n1 in g1.nodes(data=True):
+                for n2 in g2.nodes(data=True):
+                    vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
+                                                 n2[1][node_label])
+    else:
+        # node non-symb labeled
+        if ds_attrs['node_attr_dim'] > 0:
+            kn = node_kernels['nsymb']
+            for n1 in g1.nodes(data=True):
+                for n2 in g2.nodes(data=True):
+                    vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
+                                                 n2[1]['attributes'])
+        # node unlabeled
+        else:
+            for e1, e2 in product(
+                    g1.edges(data=True), g2.edges(data=True)):
+                if e1[2]['cost'] == e2[2]['cost']:
+                    kernel += 1
+            return kernel
+
+    # compute graph kernels
+    if ds_attrs['is_directed']:
+        for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
+            if e1[2]['cost'] == e2[2]['cost']:
+                nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1], e2[1])]
+                kn1 = nk11 * nk22
+                kernel += kn1
+    else:
+        for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
+            if e1[2]['cost'] == e2[2]['cost']:
+                # each edge walk is counted twice, starting from both its extreme nodes.
+                nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], \
+                    vk_dict[(e1[0], e2[1])], vk_dict[(e1[1], e2[0])], \
+                    vk_dict[(e1[1], e2[1])]
+                kn1 = nk11 * nk22
+                kn2 = nk12 * nk21
+                kernel += kn1 + kn2
+
+    return kernel
+
+
+def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr):
+    i = itr[0]
+    j = itr[1]
+    return i, j, spkernel_do(G_gn[i], G_gn[j], ds_attrs, node_label, node_kernels)
+
+
+def wrapper_getSPGraph(weight, itr_item):
+    g = itr_item[0]
+    i = itr_item[1]
+    return i, getSPGraph(g, edge_weight=weight)
\ No newline at end of file
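The node_kernels argument documented in spkernel above can be assembled from any positive semi-definite functions. A minimal sketch, assuming a delta kernel for symbolic labels and a Gaussian kernel for attribute vectors (the helper names and the graph list Gn are placeholders, not part of the patch):

    import numpy as np

    def deltakernel(x, y):
        # 1 if the two symbolic labels are equal, 0 otherwise
        return 1 if x == y else 0

    def gaussiankernel(x, y, gamma=1.0):
        # RBF kernel between two non-symbolic attribute vectors
        d = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
        return np.exp(-gamma * np.dot(d, d))

    node_kernels = {
        'symb': deltakernel,
        'nsymb': gaussiankernel,
        'mix': lambda ls1, ls2, la1, la2:
            deltakernel(ls1, ls2) * gaussiankernel(la1, la2),
    }

    # hypothetical call; Gn is a list of NetworkX graphs labeled with 'atom'
    # Kmatrix, run_time, idx = spkernel(Gn, node_label='atom',
    #                                   node_kernels=node_kernels, n_jobs=4)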
diff --git a/pygraph/kernels/else/ssp_sym.py b/pygraph/kernels/else/ssp_sym.py
new file mode 100644
index 0000000..2ce50ca
--- /dev/null
+++ b/pygraph/kernels/else/ssp_sym.py
@@ -0,0 +1,464 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Dec 23 16:42:48 2018
+
+@author: ljia
+"""
+
+import sys
+import time
+from itertools import combinations, product
+from functools import partial
+from multiprocessing import Pool
+from tqdm import tqdm
+
+import networkx as nx
+import numpy as np
+
+from pygraph.utils.graphdataset import get_dataset_attributes
+from pygraph.utils.parallel import parallel_gm
+
+sys.path.insert(0, "../")
+
+
+def structuralspkernel(*args,
+                       node_label='atom',
+                       edge_weight=None,
+                       edge_label='bond_type',
+                       node_kernels=None,
+                       edge_kernels=None,
+                       n_jobs=None):
+    """Calculate mean average structural shortest path kernels between graphs.
+
+    Parameters
+    ----------
+    Gn : List of NetworkX graph
+        List of graphs between which the kernels are calculated.
+    /
+    G1, G2 : NetworkX graphs
+        2 graphs between which the kernel is calculated.
+    node_label : string
+        Node attribute used as label. The default node label is 'atom'.
+    edge_weight : string
+        Edge attribute name corresponding to the edge weight.
+    edge_label : string
+        Edge attribute used as label. The default edge label is 'bond_type'.
+    node_kernels : dict
+        A dictionary of kernel functions for nodes, including 3 items: 'symb'
+        for symbolic node labels, 'nsymb' for non-symbolic node labels, and
+        'mix' for both labels. The first 2 functions take two node labels as
+        parameters, and the 'mix' function takes 4 parameters: a symbolic and
+        a non-symbolic label for each of the two nodes. Each label is in the
+        form of a 2-D array (n_samples, n_features). Each function returns a
+        number as the kernel value. Ignored when nodes are unlabeled.
+    edge_kernels : dict
+        A dictionary of kernel functions for edges, including 3 items: 'symb'
+        for symbolic edge labels, 'nsymb' for non-symbolic edge labels, and
+        'mix' for both labels. The first 2 functions take two edge labels as
+        parameters, and the 'mix' function takes 4 parameters: a symbolic and
+        a non-symbolic label for each of the two edges. Each label is in the
+        form of a 2-D array (n_samples, n_features). Each function returns a
+        number as the kernel value. Ignored when edges are unlabeled.
+
+    Return
+    ------
+    Kmatrix : Numpy matrix
+        Kernel matrix, each element of which is the mean average structural
+        shortest path kernel between 2 graphs.
+    """
+    # pre-process
+    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
+    weight = None
+    if edge_weight is None:
+        print('\n None edge weight specified. Set all weight to 1.\n')
+    else:
+        try:
+            some_weight = list(
+                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
+            if isinstance(some_weight, (float, int)):
+                weight = edge_weight
+            else:
+                print(
+                    '\n Edge weight with name %s is not float or integer. 
Set all weight to 1.\n' + % edge_weight) + except: + print( + '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n' + % edge_weight) + ds_attrs = get_dataset_attributes( + Gn, + attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled', + 'edge_attr_dim', 'is_directed'], + node_label=node_label, edge_label=edge_label) + ds_attrs['node_attr_dim'] = 0 + ds_attrs['edge_attr_dim'] = 0 + + start_time = time.time() + + # get shortest paths of each graph in Gn + splist = [None] * len(Gn) + pool = Pool(n_jobs) + # get shortest path graphs of Gn + getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed']) + itr = zip(Gn, range(0, len(Gn))) + if len(Gn) < 100 * n_jobs: + chunksize = int(len(Gn) / n_jobs) + 1 + else: + chunksize = 100 + # chunksize = 300 # int(len(list(itr)) / n_jobs) + for i, sp in tqdm( + pool.imap_unordered(getsp_partial, itr, chunksize), + desc='getting shortest paths', + file=sys.stdout): + splist[i] = sp +# time.sleep(10) + pool.close() + pool.join() + + +# # get shortest paths of each graph in Gn +# splist = [[] for _ in range(len(Gn))] +# # get shortest path graphs of Gn +# getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed']) +# itr = zip(Gn, range(0, len(Gn))) +# if len(Gn) < 1000 * n_jobs: +# chunksize = int(len(Gn) / n_jobs) + 1 +# else: +# chunksize = 1000 +# # chunksize = 300 # int(len(list(itr)) / n_jobs) +# from contextlib import closing +# with closing(Pool(n_jobs)) as pool: +## for i, sp in tqdm( +# res = pool.imap_unordered(getsp_partial, itr, 10) +## desc='getting shortest paths', +## file=sys.stdout): +## splist[i] = sp +## time.sleep(10) +# pool.close() +# pool.join() + +# ss = 0 +# ss += sys.getsizeof(splist) +# for spss in splist: +# ss += sys.getsizeof(spss) +# for spp in spss: +# ss += sys.getsizeof(spp) + + +# time.sleep(20) + +# # ---- direct running, normally use single CPU core. ---- +# splist = [] +# for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout): +# splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed'])) + + # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP) + # sp_ml = [0] * len(Gn) # shortest path matrices + # for i in result_sp: + # sp_ml[i[0]] = i[1] + # edge_x_g = [[] for i in range(len(sp_ml))] + # edge_y_g = [[] for i in range(len(sp_ml))] + # edge_w_g = [[] for i in range(len(sp_ml))] + # for idx, item in enumerate(sp_ml): + # for i1 in range(len(item)): + # for i2 in range(i1 + 1, len(item)): + # if item[i1, i2] != np.inf: + # edge_x_g[idx].append(i1) + # edge_y_g[idx].append(i2) + # edge_w_g[idx].append(item[i1, i2]) + # print(len(edge_x_g[0])) + # print(len(edge_y_g[0])) + # print(len(edge_w_g[0])) + + Kmatrix = np.zeros((len(Gn), len(Gn))) + + # ---- use pool.imap_unordered to parallel and track progress. ---- + def init_worker(spl_toshare, gs_toshare): + global G_spl, G_gs + G_spl = spl_toshare + G_gs = gs_toshare + do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, + node_kernels, edge_kernels) + parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, + glbv=(splist, Gn), n_jobs=n_jobs) + + +# # ---- use pool.imap_unordered to parallel and track progress. 
----
+#    pool = Pool(n_jobs)
+#    do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
+#                         node_kernels, edge_kernels)
+#    itr = zip(combinations_with_replacement(Gn, 2),
+#              combinations_with_replacement(splist, 2),
+#              combinations_with_replacement(range(0, len(Gn)), 2))
+#    len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
+#    if len_itr < 1000 * n_jobs:
+#        chunksize = int(len_itr / n_jobs) + 1
+#    else:
+#        chunksize = 1000
+#    for i, j, kernel in tqdm(
+#            pool.imap_unordered(do_partial, itr, chunksize),
+#            desc='calculating kernels',
+#            file=sys.stdout):
+#        Kmatrix[i][j] = kernel
+#        Kmatrix[j][i] = kernel
+#    pool.close()
+#    pool.join()

+#    # ---- use pool.map to parallel. ----
+#    pool = Pool(n_jobs)
+#    do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
+#                         node_kernels, edge_kernels)
+#    itr = zip(combinations_with_replacement(Gn, 2),
+#              combinations_with_replacement(splist, 2),
+#              combinations_with_replacement(range(0, len(Gn)), 2))
+#    for i, j, kernel in tqdm(
+#            pool.map(do_partial, itr), desc='calculating kernels',
+#            file=sys.stdout):
+#        Kmatrix[i][j] = kernel
+#        Kmatrix[j][i] = kernel
+#    pool.close()
+#    pool.join()

+#    # ---- use pool.imap_unordered to parallel and track progress. ----
+#    do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
+#                         node_kernels, edge_kernels)
+#    itr = zip(combinations_with_replacement(Gn, 2),
+#              combinations_with_replacement(splist, 2),
+#              combinations_with_replacement(range(0, len(Gn)), 2))
+#    len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
+#    if len_itr < 1000 * n_jobs:
+#        chunksize = int(len_itr / n_jobs) + 1
+#    else:
+#        chunksize = 1000
+#    from contextlib import closing
+#    with closing(Pool(n_jobs)) as pool:
+#        for i, j, kernel in tqdm(
+#                pool.imap_unordered(do_partial, itr, 1000),
+#                desc='calculating kernels',
+#                file=sys.stdout):
+#            Kmatrix[i][j] = kernel
+#            Kmatrix[j][i] = kernel
+#    pool.close()
+#    pool.join()


+#    # ---- direct running, normally use single CPU core. ----
+#    from itertools import combinations_with_replacement
+#    itr = combinations_with_replacement(range(0, len(Gn)), 2)
+#    for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
+#        kernel = structuralspkernel_do(Gn[i], Gn[j], splist[i], splist[j],
+#                ds_attrs, node_label, edge_label, node_kernels, edge_kernels)
+##        if(kernel > 1):
+##            print("error here ")
+#        Kmatrix[i][j] = kernel
+#        Kmatrix[j][i] = kernel

+    run_time = time.time() - start_time
+    print(
+        "\n --- structural shortest path kernel matrix of size %d built in %s seconds ---"
+        % (len(Gn), run_time))

+    return Kmatrix, run_time
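The init_worker/glbv mechanism used by structuralspkernel above hands large read-only objects (the graph list and the shortest-path lists) to each worker process once, as module globals, instead of pickling them into every task. A stripped-down sketch of the same pattern with a toy pair function (all names are illustrative only):

    from multiprocessing import Pool

    def init_worker(data_toshare):
        # runs once per worker; stash shared data in a worker-local global
        global G_data
        G_data = data_toshare

    def do_pair(ij):
        i, j = ij
        # toy computation that reads the shared data
        return i, j, len(G_data[i]) * len(G_data[j])

    if __name__ == '__main__':
        data = [[0] * (n + 1) for n in range(5)]
        pairs = [(i, j) for i in range(5) for j in range(i, 5)]
        with Pool(processes=2, initializer=init_worker,
                  initargs=(data,)) as pool:
            for i, j, v in pool.imap_unordered(do_pair, pairs):
                print(i, j, v)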
+def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label,
+                          node_kernels, edge_kernels):

+    kernel = 0

+    # First, compute kernels between all pairs of nodes; the technique is
+    # borrowed from FCSP.
+    vk_dict = {}  # vertex kernel dict
+    if ds_attrs['node_labeled']:
+        # node symb and non-symb labeled
+        if ds_attrs['node_attr_dim'] > 0:
+            kn = node_kernels['mix']
+            for n1, n2 in product(
+                    g1.nodes(data=True), g2.nodes(data=True)):
+                vk_dict[(n1[0], n2[0])] = kn(
+                    n1[1][node_label], n2[1][node_label],
+                    n1[1]['attributes'], n2[1]['attributes'])
+        # node symb labeled
+        else:
+            kn = node_kernels['symb']
+            for n1 in g1.nodes(data=True):
+                for n2 in g2.nodes(data=True):
+                    vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
+                                                 n2[1][node_label])
+    else:
+        # node non-symb labeled
+        if ds_attrs['node_attr_dim'] > 0:
+            kn = node_kernels['nsymb']
+            for n1 in g1.nodes(data=True):
+                for n2 in g2.nodes(data=True):
+                    vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
+                                                 n2[1]['attributes'])
+        # node unlabeled
+        else:
+            pass

+    # Then, compute kernels between all pairs of edges; the idea is an
+    # extension of FCSP. It suits sparse graphs, which cover most cases we
+    # encounter. For dense graphs, this would be slow.
+    ek_dict = {}  # dict of edge kernels
+    if ds_attrs['edge_labeled']:
+        # edge symb and non-symb labeled
+        if ds_attrs['edge_attr_dim'] > 0:
+            ke = edge_kernels['mix']
+            for e1, e2 in product(
+                    g1.edges(data=True), g2.edges(data=True)):
+                ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
+                             e1[2]['attributes'], e2[2]['attributes'])
+                ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
+                ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
+                ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
+                ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
+        # edge symb labeled
+        else:
+            ke = edge_kernels['symb']
+            for e1 in g1.edges(data=True):
+                for e2 in g2.edges(data=True):
+                    ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
+                    ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
+                    ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
+                    ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
+                    ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
+    else:
+        # edge non-symb labeled
+        if ds_attrs['edge_attr_dim'] > 0:
+            ke = edge_kernels['nsymb']
+            for e1 in g1.edges(data=True):
+                for e2 in g2.edges(data=True):
+                    ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
+                    ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
+                    ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
+                    ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
+                    ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
+        # edge unlabeled
+        else:
+            pass

+    # compute graph kernels
+    if vk_dict:
+        if ek_dict:
+            for p1, p2 in product(spl1, spl2):
+                if len(p1) == len(p2):
+                    kpath = vk_dict[(p1[0], p2[0])]
+                    if kpath:
+                        for idx in range(1, len(p1)):
+                            kpath *= vk_dict[(p1[idx], p2[idx])] * \
+                                ek_dict[((p1[idx-1], p1[idx]),
+                                         (p2[idx-1], p2[idx]))]
+                            if not kpath:
+                                break
+                        kernel += kpath  # add up kernels of all paths
+        else:
+            for p1, p2 in product(spl1, spl2):
+                if len(p1) == len(p2):
+                    kpath = vk_dict[(p1[0], p2[0])]
+                    if kpath:
+                        for idx in range(1, len(p1)):
+                            kpath *= vk_dict[(p1[idx], p2[idx])]
+                            if not kpath:
+                                break
+                        kernel += kpath  # add up kernels of all paths
+    else:
+        if ek_dict:
+            for p1, p2 in product(spl1, spl2):
+                if len(p1) == len(p2):
+                    if len(p1) == 0:
+                        kernel += 1
+                    else:
+                        kpath = 1
+                        for idx in range(0, len(p1) - 1):
+                            kpath *= ek_dict[((p1[idx], p1[idx+1]),
+                                              (p2[idx], p2[idx+1]))]
+                            if not kpath:
+                                break
+                        kernel += kpath  # add up kernels of all paths
+        else:
+            for p1, p2 in product(spl1, spl2):
+                if len(p1) == len(p2):
+                    kernel += 1

+    kernel = kernel / (len(spl1) * len(spl2))  # calculate mean average
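In other words, for two equal-length paths the inner loop forms a product of vertex kernels along the paths interleaved with edge kernels, aborting early once the running product hits zero, and the final division turns the sum over all path pairs into a mean. A tiny sketch of that per-path product with hypothetical kernels kv and ke (not part of the patch):

    def path_pair_kernel(p1, p2, kv, ke):
        # product of vertex kernels along two equal-length paths,
        # interleaved with edge kernels; mirrors the loop above
        if len(p1) != len(p2):
            return 0
        kpath = kv(p1[0], p2[0])
        for t in range(1, len(p1)):
            kpath *= kv(p1[t], p2[t]) * ke((p1[t-1], p1[t]),
                                           (p2[t-1], p2[t]))
            if not kpath:
                break  # early exit: the product can never recover from zero
        return kpath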
+    # # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2]; sadly it is slower than the current implementation
+    # # compute vertex kernel matrix
+    # try:
+    #     vk_mat = np.zeros((nx.number_of_nodes(g1),
+    #                        nx.number_of_nodes(g2)))
+    #     g1nl = enumerate(g1.nodes(data=True))
+    #     g2nl = enumerate(g2.nodes(data=True))
+    #     for i1, n1 in g1nl:
+    #         for i2, n2 in g2nl:
+    #             vk_mat[i1][i2] = kn(
+    #                 n1[1][node_label], n2[1][node_label],
+    #                 [n1[1]['attributes']], [n2[1]['attributes']])

+    #     range1 = range(0, len(edge_w_g[i]))
+    #     range2 = range(0, len(edge_w_g[j]))
+    #     for i1 in range1:
+    #         x1 = edge_x_g[i][i1]
+    #         y1 = edge_y_g[i][i1]
+    #         w1 = edge_w_g[i][i1]
+    #         for i2 in range2:
+    #             x2 = edge_x_g[j][i2]
+    #             y2 = edge_y_g[j][i2]
+    #             w2 = edge_w_g[j][i2]
+    #             ke = (w1 == w2)
+    #             if ke > 0:
+    #                 kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
+    #                 kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
+    #                 Kmatrix += kn1 + kn2
+    return kernel


+def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels,
+                   edge_kernels, itr):
+    i = itr[0]
+    j = itr[1]
+    return i, j, structuralspkernel_do(G_gs[i], G_gs[j], G_spl[i], G_spl[j],
+                                       ds_attrs, node_label, edge_label,
+                                       node_kernels, edge_kernels)


+def get_shortest_paths(G, weight, directed):
+    """Get all shortest paths of a graph.

+    Parameters
+    ----------
+    G : NetworkX graph
+        The graph whose paths are calculated.
+    weight : string/None
+        Edge attribute used as weight to calculate the shortest path.
+    directed : boolean
+        Whether the graph is directed.

+    Return
+    ------
+    sp : list of list
+        List of shortest paths of the graph, where each path is represented
+        by a list of nodes.
+    """
+    sp = []
+    for n1, n2 in combinations(G.nodes(), 2):
+        try:
+            spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
+        except nx.NetworkXNoPath:  # nodes not connected
+            # sp.append([])
+            pass
+        else:
+            sp += spltemp
+            # each path is counted twice, traversed from both of its end nodes.
+            if not directed:
+                sp += [sptemp[::-1] for sptemp in spltemp]

+    # add single nodes as length-0 paths.
+    sp += [[n] for n in G.nodes()]
+    return sp


+def wrapper_getSP(weight, directed, itr_item):
+    g = itr_item[0]
+    i = itr_item[1]
+    return i, get_shortest_paths(g, weight, directed)
\ No newline at end of file
diff --git a/pygraph/kernels/marginalizedKernel.py b/pygraph/kernels/marginalizedKernel.py
index 38a0edc..a00850a 100644
--- a/pygraph/kernels/marginalizedKernel.py
+++ b/pygraph/kernels/marginalizedKernel.py
@@ -33,8 +33,9 @@ def marginalizedkernel(*args,
                        edge_label='bond_type',
                        p_quit=0.5,
                        n_iteration=20,
-                       remove_totters=True,
-                       n_jobs=None):
+                       remove_totters=False,
+                       n_jobs=None,
+                       verbose=True):
     """Calculate marginalized graph kernels between graphs.
Parameters @@ -63,16 +64,18 @@ def marginalizedkernel(*args, """ # pre-process n_iteration = int(n_iteration) - Gn = args[0] if len(args) == 1 else [args[0], args[1]] + Gn = args[0][:] if len(args) == 1 else [args[0].copy(), args[1].copy()] ds_attrs = get_dataset_attributes( Gn, attr_names=['node_labeled', 'edge_labeled', 'is_directed'], node_label=node_label, edge_label=edge_label) - if not ds_attrs['node_labeled']: + if not ds_attrs['node_labeled'] or node_label == None: + node_label = 'atom' for G in Gn: nx.set_node_attributes(G, '0', 'atom') - if not ds_attrs['edge_labeled']: + if not ds_attrs['edge_labeled'] or edge_label == None: + edge_label = 'bond_type' for G in Gn: nx.set_edge_attributes(G, '0', 'bond_type') @@ -110,26 +113,26 @@ def marginalizedkernel(*args, do_partial = partial(wrapper_marg_do, node_label, edge_label, p_quit, n_iteration) parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, - glbv=(Gn,), n_jobs=n_jobs) + glbv=(Gn,), n_jobs=n_jobs, verbose=verbose) # # ---- direct running, normally use single CPU core. ---- -# pbar = tqdm( -# total=(1 + len(Gn)) * len(Gn) / 2, -# desc='calculating kernels', -# file=sys.stdout) +## pbar = tqdm( +## total=(1 + len(Gn)) * len(Gn) / 2, +## desc='calculating kernels', +## file=sys.stdout) # for i in range(0, len(Gn)): # for j in range(i, len(Gn)): -# print(i, j) +## print(i, j) # Kmatrix[i][j] = _marginalizedkernel_do(Gn[i], Gn[j], node_label, # edge_label, p_quit, n_iteration) # Kmatrix[j][i] = Kmatrix[i][j] -# pbar.update(1) +## pbar.update(1) run_time = time.time() - start_time - print( - "\n --- marginalized kernel matrix of size %d built in %s seconds ---" - % (len(Gn), run_time)) + if verbose: + print("\n --- marginalized kernel matrix of size %d built in %s seconds ---" + % (len(Gn), run_time)) return Kmatrix, run_time diff --git a/pygraph/kernels/spKernel.py b/pygraph/kernels/spKernel.py index cea91bb..1a2bd82 100644 --- a/pygraph/kernels/spKernel.py +++ b/pygraph/kernels/spKernel.py @@ -23,7 +23,8 @@ def spkernel(*args, node_label='atom', edge_weight=None, node_kernels=None, - n_jobs=None): + n_jobs=None, + verbose=True): """Calculate shortest-path kernels between graphs. Parameters @@ -148,7 +149,7 @@ def spkernel(*args, G_gn = gn_toshare do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels) parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, - glbv=(Gn,), n_jobs=n_jobs) + glbv=(Gn,), n_jobs=n_jobs, verbose=verbose) # # ---- use pool.map to parallel. ---- diff --git a/pygraph/kernels/structuralspKernel.py b/pygraph/kernels/structuralspKernel.py index 0925179..2f9d8a3 100644 --- a/pygraph/kernels/structuralspKernel.py +++ b/pygraph/kernels/structuralspKernel.py @@ -31,7 +31,7 @@ def structuralspkernel(*args, edge_label='bond_type', node_kernels=None, edge_kernels=None, - compute_method='trie', + compute_method='naive', n_jobs=None): """Calculate mean average structural shortest path kernels between graphs. 
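With these changes, the verbose switch threads from the public kernel functions down through parallel_gm, so callers can silence both the progress bars and the timing printouts. A minimal sketch of a quiet run, assuming a list Gn of NetworkX graphs (placeholder data, not part of the patch):

    from pygraph.kernels.marginalizedKernel import marginalizedkernel

    # verbose=False suppresses the tqdm progress bar and the timing line
    Kmatrix, run_time = marginalizedkernel(Gn, node_label='atom',
                                           edge_label='bond_type',
                                           p_quit=0.5, n_iteration=20,
                                           remove_totters=False,
                                           n_jobs=4, verbose=False)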
diff --git a/pygraph/utils/model_selection_precomputed.py b/pygraph/utils/model_selection_precomputed.py index ab67cbe..6192371 100644 --- a/pygraph/utils/model_selection_precomputed.py +++ b/pygraph/utils/model_selection_precomputed.py @@ -242,7 +242,7 @@ def model_selection_for_precomputed_kernel(datafile, # gms_array = Array('d', np.reshape(gram_matrices.copy(), -1, order='C')) # pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gms_array, gms_shape)) pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gram_matrices,)) - trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, y, model_type) + trial_do_partial = partial(parallel_trial_do, param_list_pre_revised, param_list, y, model_type) train_pref = [] val_pref = [] test_pref = [] @@ -473,7 +473,7 @@ def model_selection_for_precomputed_kernel(datafile, G_gms = gms_toshare pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gram_matrices,)) - trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, y, model_type) + trial_do_partial = partial(parallel_trial_do, param_list_pre_revised, param_list, y, model_type) train_pref = [] val_pref = [] test_pref = [] @@ -656,7 +656,7 @@ def model_selection_for_precomputed_kernel(datafile, f.write(str_fw + '\n\n\n' + content) -def trial_do(param_list_pre_revised, param_list, y, model_type, trial): # Test set level +def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, trial): # Test set level # # get gram matrices from global variables. # gram_matrices = np.reshape(G_gms.copy(), G_gms_shape, order='C') @@ -679,7 +679,7 @@ def trial_do(param_list_pre_revised, param_list, y, model_type, trial): # Test s # get gram matrices from global variables. # gm_now = G_gms[index_out * G_gms_shape[1] * G_gms_shape[2]:(index_out + 1) * G_gms_shape[1] * G_gms_shape[2]] # gm_now = np.reshape(gm_now.copy(), (G_gms_shape[1], G_gms_shape[2]), order='C') - gm_now = G_gms[index_out].copy() + gm_now = gram_matrices[index_out].copy() # split gram matrix and y to app and test sets. 
indices = range(len(y)) @@ -822,6 +822,12 @@ def trial_do(param_list_pre_revised, param_list, y, model_type, trial): # Test s return train_pref, val_pref, test_pref +def parallel_trial_do(param_list_pre_revised, param_list, y, model_type, trial): + train_pref, val_pref, test_pref = trial_do(param_list_pre_revised, + param_list, G_gms, y, + model_type, trial) + return train_pref, val_pref, test_pref + def compute_gram_matrices(dataset, y, estimator, param_list_precomputed, results_dir, ds_name, diff --git a/pygraph/utils/parallel.py b/pygraph/utils/parallel.py index bca9304..fdf2244 100644 --- a/pygraph/utils/parallel.py +++ b/pygraph/utils/parallel.py @@ -11,7 +11,8 @@ from tqdm import tqdm import sys def parallel_me(func, func_assign, var_to_assign, itr, len_itr=None, init_worker=None, - glbv=None, method=None, n_jobs=None, chunksize=None, itr_desc=''): + glbv=None, method=None, n_jobs=None, chunksize=None, itr_desc='', + verbose=True): ''' ''' if method == 'imap_unordered': @@ -29,8 +30,9 @@ def parallel_me(func, func_assign, var_to_assign, itr, len_itr=None, init_worker chunksize = int(len_itr / n_jobs) + 1 else: chunksize = 100 - for result in tqdm(pool.imap_unordered(func, itr, chunksize), - desc=itr_desc, file=sys.stdout): + for result in (tqdm(pool.imap_unordered(func, itr, chunksize), + desc=itr_desc, file=sys.stdout) if verbose else + pool.imap_unordered(func, itr, chunksize)): func_assign(result, var_to_assign) else: with Pool(processes=n_jobs) as pool: @@ -41,14 +43,16 @@ def parallel_me(func, func_assign, var_to_assign, itr, len_itr=None, init_worker chunksize = int(len_itr / n_jobs) + 1 else: chunksize = 100 - for result in tqdm(pool.imap_unordered(func, itr, chunksize), - desc=itr_desc, file=sys.stdout): + for result in (tqdm(pool.imap_unordered(func, itr, chunksize), + desc=itr_desc, file=sys.stdout) if verbose else + pool.imap_unordered(func, itr, chunksize)): func_assign(result, var_to_assign) def parallel_gm(func, Kmatrix, Gn, init_worker=None, glbv=None, - method='imap_unordered', n_jobs=None, chunksize=None): + method='imap_unordered', n_jobs=None, chunksize=None, + verbose=True): from itertools import combinations_with_replacement def func_assign(result, var_to_assign): var_to_assign[result[0]][result[1]] = result[2] @@ -57,4 +61,4 @@ def parallel_gm(func, Kmatrix, Gn, init_worker=None, glbv=None, len_itr = int(len(Gn) * (len(Gn) + 1) / 2) parallel_me(func, func_assign, Kmatrix, itr, len_itr=len_itr, init_worker=init_worker, glbv=glbv, method=method, n_jobs=n_jobs, - chunksize=chunksize, itr_desc='calculating kernels') \ No newline at end of file + chunksize=chunksize, itr_desc='calculating kernels', verbose=verbose) \ No newline at end of file
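For reference, parallel_gm expects a pair function that maps an index tuple (i, j) to (i, j, value) and fills the Gram matrix in place over all combinations with replacement. A minimal end-to-end sketch under the signatures shown in this patch (the toy kernel and graphs are placeholders):

    import networkx as nx
    import numpy as np
    from pygraph.utils.parallel import parallel_gm

    def init_worker(gn_toshare):
        # share the graph list with each worker process
        global G_gn
        G_gn = gn_toshare

    def do_pair(ij):
        i, j = ij
        # toy 'kernel': product of the two node counts
        return i, j, nx.number_of_nodes(G_gn[i]) * nx.number_of_nodes(G_gn[j])

    if __name__ == '__main__':
        Gn = [nx.path_graph(n) for n in range(2, 7)]
        Kmatrix = np.zeros((len(Gn), len(Gn)))
        parallel_gm(do_pair, Kmatrix, Gn, init_worker=init_worker,
                    glbv=(Gn,), n_jobs=2, verbose=False)
        print(Kmatrix)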