|
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930 |
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{'O', 'C'}\n",
- "{'O', 'C'}\n",
- "--- shortest path kernel built in 0.0002582073211669922 seconds ---\n",
- "3\n"
- ]
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- "<matplotlib.figure.Figure at 0x7f773eab40b8>"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[(0, {'label': 'C'}), (1, {'label': 'C'}), (2, {'label': 'C'}), (3, {'label': 'C'}), (4, {'label': 'O'})]\n",
- " -> \n"
- ]
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- "<matplotlib.figure.Figure at 0x7f773ca1cc88>"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[(0, {'label': 'CC'}), (1, {'label': 'CC'}), (2, {'label': 'CO'}), (3, {'label': 'CCCO'}), (4, {'label': 'OCC'})]\n",
- " -> \n"
- ]
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- "<matplotlib.figure.Figure at 0x7f773c9a44e0>"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[(0, {'label': '0'}), (1, {'label': '0'}), (2, {'label': '3'}), (3, {'label': '1'}), (4, {'label': '2'})]\n"
- ]
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- "<matplotlib.figure.Figure at 0x7f773c9957b8>"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[(0, {'label': 'C'}), (1, {'label': 'C'}), (2, {'label': 'C'}), (3, {'label': 'C'}), (4, {'label': 'C'}), (5, {'label': 'C'}), (6, {'label': 'O'})]\n",
- " -> \n"
- ]
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- "<matplotlib.figure.Figure at 0x7f7788e0e390>"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[(0, {'label': 'CC'}), (1, {'label': 'CC'}), (2, {'label': 'CC'}), (3, {'label': 'CO'}), (4, {'label': 'CCCC'}), (5, {'label': 'CCCO'}), (6, {'label': 'OCC'})]\n",
- " -> \n"
- ]
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- "<matplotlib.figure.Figure at 0x7f773c95a5f8>"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[(0, {'label': '0'}), (1, {'label': '0'}), (2, {'label': '0'}), (3, {'label': '3'}), (4, {'label': '4'}), (5, {'label': '1'}), (6, {'label': '2'})]\n",
- "--- shortest path kernel built in 0.00026607513427734375 seconds ---\n",
- "6\n"
- ]
- }
- ],
- "source": [
- "import sys\n",
- "import networkx as nx\n",
- "sys.path.insert(0, \"../\")\n",
- "from pygraph.utils.graphfiles import loadDataset\n",
- "from pygraph.kernels.spkernel import spkernel\n",
- "\n",
- "import matplotlib.pyplot as plt\n",
- "\n",
- "\n",
- "def weisfeilerlehman_test(G):\n",
- " '''\n",
- " Weisfeiler-Lehman test of graph isomorphism.\n",
- " '''\n",
- "\n",
- " nx.draw_networkx(G)\n",
- " plt.show()\n",
- " nx.draw_networkx_labels(G, nx.spring_layout(G), labels = nx.get_node_attributes(G,'label'))\n",
- " print(G.nodes(data = True))\n",
- " \n",
- " set_multisets = []\n",
- " for node in G.nodes(data = True):\n",
- " # Multiset-label determination.\n",
- " multiset = [ G.node[neighbors]['label'] for neighbors in G[node[0]] ]\n",
- " # sorting each multiset\n",
- " multiset.sort()\n",
- " multiset = node[1]['label'] + ''.join(multiset) # concatenate to a string and add the prefix \n",
- " set_multisets.append(multiset)\n",
- " \n",
- " # label compression\n",
- "# set_multisets.sort() # this is unnecessary\n",
- " set_unique = list(set(set_multisets)) # set of unique multiset labels\n",
- " set_compressed = { value : str(set_unique.index(value)) for value in set_unique } # assign indices as the new labels\n",
- "# print(set_compressed)\n",
- "# print(set_multisets)\n",
- " \n",
- " # relabel nodes with multisets\n",
- " for node in G.nodes(data = True):\n",
- " node[1]['label'] = set_multisets[node[0]]\n",
- " print(' -> ')\n",
- " nx.draw_networkx(G)\n",
- " plt.show()\n",
- " print(G.nodes(data = True))\n",
- "\n",
- " \n",
- " # relabel nodes\n",
- " for node in G.nodes(data = True):\n",
- " node[1]['label'] = set_compressed[set_multisets[node[0]]]\n",
- " \n",
- " print(' -> ')\n",
- " nx.draw_networkx(G)\n",
- " plt.show()\n",
- " print(G.nodes(data = True))\n",
- "\n",
- "dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
- "G1 = dataset[12]\n",
- "G2 = dataset[55]\n",
- "\n",
- "# init.\n",
- "kernel = 0 # init kernel\n",
- "num_nodes1 = G1.number_of_nodes()\n",
- "num_nodes2 = G2.number_of_nodes()\n",
- "\n",
- "# the first iteration.\n",
- "labelset1 = { G1.nodes(data = True)[i]['label'] for i in range(num_nodes1) }\n",
- "labelset2 = { G2.nodes(data = True)[i]['label'] for i in range(num_nodes2) }\n",
- "print(labelset1)\n",
- "print(labelset2)\n",
- "kernel += spkernel(G1, G2)\n",
- "print(kernel)\n",
- "\n",
- "\n",
- "\n",
- "for height in range(0, min(num_nodes1, num_nodes2)): #Q how to determine the upper bound of the height?\n",
- " if labelset1 != labelset2:\n",
- " break\n",
- " \n",
- " # Weisfeiler-Lehman test of graph isomorphism.\n",
- " weisfeilerlehman_test(G1)\n",
- " weisfeilerlehman_test(G2)\n",
- " \n",
- " # calculate kernel\n",
- " kernel += spkernel(G1, G2)\n",
- " \n",
- " # get label sets of both graphs\n",
- " labelset1 = { G1.nodes(data = True)[i]['label'] for i in range(num_nodes1) }\n",
- " labelset2 = { G2.nodes(data = True)[i]['label'] for i in range(num_nodes2) }\n",
- "# print(labelset1)\n",
- "# print(labelset2)\n",
- "\n",
- "print(kernel)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{0: 'C', 1: 'C', 2: 'C', 3: 'C', 4: 'C', 5: 'O', 6: 'O'}\n",
- "{0: 'C', 1: 'C', 2: 'C', 3: 'C', 4: 'C', 5: 'C', 6: 'S', 7: 'S'}\n",
- "\n",
- " --- height = 0 --- \n",
- "\n",
- " --- for graph 0 --- \n",
- "\n",
- "labels_ori: ['C', 'C', 'C', 'C', 'C', 'O', 'O']\n",
- "num_of_each_label: {'C': 5, 'O': 2}\n",
- "num_of_labels: 2\n"
- ]
- },
- {
- "ename": "UnboundLocalError",
- "evalue": "local variable 'all_labels_ori' referenced before assignment",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mUnboundLocalError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m<ipython-input-4-a65d6180cda5>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 328\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_node_attributes\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mG2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'label'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 329\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 330\u001b[0;31m \u001b[0mweisfeilerlehmankernel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mG1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mG2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheight\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 331\u001b[0m \u001b[0;31m# Kmatrix = weisfeilerlehmankernel(G1, G2)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m<ipython-input-4-a65d6180cda5>\u001b[0m in \u001b[0;36mweisfeilerlehmankernel\u001b[0;34m(height, base_kernel, *args)\u001b[0m\n\u001b[1;32m 78\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 79\u001b[0m \u001b[0;31m# print(args)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 80\u001b[0;31m \u001b[0mkernel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_wl_subtreekernel_do\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheight\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mheight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbase_kernel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'subtree'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 81\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[0;31m# for WL edge kernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m<ipython-input-4-a65d6180cda5>\u001b[0m in \u001b[0;36m_wl_subtreekernel_do\u001b[0;34m(height, base_kernel, *args)\u001b[0m\n\u001b[1;32m 220\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'num_of_labels: %s'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mnum_of_labels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 221\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 222\u001b[0;31m \u001b[0mall_labels_ori\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabels_ori\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 223\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'all_labels_ori: %s'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mall_labels_ori\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 224\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;31mUnboundLocalError\u001b[0m: local variable 'all_labels_ori' referenced before assignment"
- ]
- }
- ],
- "source": [
- "# test of WL subtree kernel on many graphs\n",
- "\n",
- "import sys\n",
- "import pathlib\n",
- "from collections import Counter\n",
- "sys.path.insert(0, \"../\")\n",
- "\n",
- "import networkx as nx\n",
- "import numpy as np\n",
- "import time\n",
- "\n",
- "from pygraph.kernels.spkernel import spkernel\n",
- "from pygraph.kernels.pathKernel import pathkernel\n",
- "\n",
- "def weisfeilerlehmankernel(*args, height = 0, base_kernel = 'subtree'):\n",
- " \"\"\"Calculate Weisfeiler-Lehman kernels between graphs.\n",
- " \n",
- " Parameters\n",
- " ----------\n",
- " Gn : List of NetworkX graph\n",
- " List of graphs between which the kernels are calculated.\n",
- " /\n",
- " G1, G2 : NetworkX graphs\n",
- " 2 graphs between which the kernel is calculated.\n",
- " \n",
- " height : subtree height\n",
- " \n",
- " base_kernel : base kernel used in each iteration of WL kernel\n",
- " the default base kernel is subtree kernel\n",
- " \n",
- " Return\n",
- " ------\n",
- " Kmatrix/Kernel : Numpy matrix/int\n",
- " Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs. / Weisfeiler-Lehman Kernel between 2 graphs.\n",
- " \n",
- " Notes\n",
- " -----\n",
- " This function now supports WL subtree kernel and WL shortest path kernel.\n",
- " \n",
- " References\n",
- " ----------\n",
- " [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM. Weisfeiler-lehman graph kernels. Journal of Machine Learning Research. 2011;12(Sep):2539-61.\n",
- " \"\"\"\n",
- " if len(args) == 1: # for a list of graphs\n",
- "\n",
- "# print(args)\n",
- " start_time = time.time()\n",
- " \n",
- " # for WL subtree kernel\n",
- " if base_kernel == 'subtree': \n",
- " Kmatrix = _wl_subtreekernel_do(args[0], height = height, base_kernel = 'subtree')\n",
- " \n",
- " # for WL edge kernel\n",
- " elif base_kernel == 'edge':\n",
- " print('edge')\n",
- " \n",
- " # for WL shortest path kernel\n",
- " elif base_kernel == 'sp':\n",
- " Gn = args[0]\n",
- " Kmatrix = np.zeros((len(Gn), len(Gn)))\n",
- " \n",
- " for i in range(0, len(Gn)):\n",
- " for j in range(i, len(Gn)):\n",
- " Kmatrix[i][j] = _weisfeilerlehmankernel_do(Gn[i], Gn[j])\n",
- " Kmatrix[j][i] = Kmatrix[i][j]\n",
- "\n",
- " print(\"\\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---\" % (base_kernel, len(args[0]), (time.time() - start_time)))\n",
- " \n",
- " return Kmatrix\n",
- " \n",
- " else: # for only 2 graphs\n",
- " \n",
- " start_time = time.time()\n",
- " \n",
- " # for WL subtree kernel\n",
- " if base_kernel == 'subtree':\n",
- " \n",
- " args = [args[0], args[1]]\n",
- "# print(args)\n",
- " kernel = _wl_subtreekernel_do(args, height = height, base_kernel = 'subtree')\n",
- " \n",
- " # for WL edge kernel\n",
- " elif base_kernel == 'edge':\n",
- " print('edge')\n",
- " \n",
- " # for WL shortest path kernel\n",
- " elif base_kernel == 'sp':\n",
- " \n",
- "\n",
- " kernel = _pathkernel_do(args[0], args[1])\n",
- "\n",
- " print(\"\\n --- Weisfeiler-Lehman %s kernel built in %s seconds ---\" % (base_kernel, time.time() - start_time))\n",
- " \n",
- " return kernel\n",
- " \n",
- " \n",
- "def _weisfeilerlehmankernel_do(G1, G2):\n",
- " \"\"\"Calculate Weisfeiler-Lehman kernels between 2 graphs. This kernel uses the shortest path kernel to calculate the kernel between the two graphs in each iteration.\n",
- " \n",
- " Parameters\n",
- " ----------\n",
- " G1, G2 : NetworkX graphs\n",
- " 2 graphs between which the kernel is calculated.\n",
- " \n",
- " Return\n",
- " ------\n",
- " Kernel : int\n",
- " Weisfeiler-Lehman Kernel between 2 graphs.\n",
- " \"\"\"\n",
- " \n",
- " # init.\n",
- " kernel = 0 # init kernel\n",
- " num_nodes1 = G1.number_of_nodes()\n",
- " num_nodes2 = G2.number_of_nodes()\n",
- " height = 12 #min(num_nodes1, num_nodes2)) #Q how to determine the upper bound of the height?\n",
- " \n",
- " # the first iteration.\n",
- " labelset1 = { G1.nodes(data = True)[i]['label'] for i in range(num_nodes1) }\n",
- " labelset2 = { G2.nodes(data = True)[i]['label'] for i in range(num_nodes2) }\n",
- " kernel += pathkernel(G1, G2) # change your base kernel here (and one more below)\n",
- " \n",
- " for h in range(0, height):\n",
- "# if labelset1 != labelset2:\n",
- "# break\n",
- "\n",
- " # Weisfeiler-Lehman test of graph isomorphism.\n",
- " relabel(G1)\n",
- " relabel(G2)\n",
- "\n",
- " # calculate kernel\n",
- " kernel += pathkernel(G1, G2) # change your base kernel here (and one more before)\n",
- "\n",
- " # get label sets of both graphs\n",
- " labelset1 = { G1.nodes(data = True)[i]['label'] for i in range(num_nodes1) }\n",
- " labelset2 = { G2.nodes(data = True)[i]['label'] for i in range(num_nodes2) }\n",
- " \n",
- " return kernel\n",
- "\n",
- "\n",
- "def relabel(G):\n",
- " '''\n",
- " Relabel nodes in graph G in one iteration of the 1-dim. WL test of graph isomorphism.\n",
- " \n",
- " Parameters\n",
- " ----------\n",
- " G : NetworkX graph\n",
- " The graph whose nodes are relabeled.\n",
- " '''\n",
- " \n",
- " # get the set of original labels\n",
- " labels_ori = list(nx.get_node_attributes(G, 'label').values())\n",
- " print(labels_ori)\n",
- " num_of_each_label = dict(Counter(labels_ori))\n",
- " print(num_of_each_label)\n",
- " num_of_labels = len(num_of_each_label)\n",
- " print(num_of_labels)\n",
- " \n",
- " set_multisets = []\n",
- " for node in G.nodes(data = True):\n",
- " # Multiset-label determination.\n",
- " multiset = [ G.node[neighbors]['label'] for neighbors in G[node[0]] ]\n",
- " # sorting each multiset\n",
- " multiset.sort()\n",
- " multiset = node[1]['label'] + ''.join(multiset) # concatenate to a string and add the prefix \n",
- " set_multisets.append(multiset)\n",
- " print(set_multisets)\n",
- " \n",
- " # label compression\n",
- "# set_multisets.sort() # this is unnecessary\n",
- " set_unique = list(set(set_multisets)) # set of unique multiset labels\n",
- " print(set_unique)\n",
- " set_compressed = { value : str(set_unique.index(value) + num_of_labels + 1) for value in set_unique } # assign new labels\n",
- " print(set_compressed)\n",
- " \n",
- " # relabel nodes\n",
- "# nx.relabel_nodes(G, set_compressed, copy = False)\n",
- " for node in G.nodes(data = True):\n",
- " node[1]['label'] = set_compressed[set_multisets[node[0]]]\n",
- " print(nx.get_node_attributes(G, 'label'))\n",
- "\n",
- " # get the set of compressed labels\n",
- " labels_comp = list(nx.get_node_attributes(G, 'label').values())\n",
- " print(labels_comp)\n",
- " num_of_each_label.update(dict(Counter(labels_comp)))\n",
- " print(num_of_each_label)\n",
- " \n",
- " \n",
- "def _wl_subtreekernel_do(*args, height = 0, base_kernel = 'subtree'):\n",
- " \"\"\"Calculate Weisfeiler-Lehman subtree kernels between graphs.\n",
- " \n",
- " Parameters\n",
- " ----------\n",
- " Gn : List of NetworkX graph\n",
- " List of graphs between which the kernels are calculated.\n",
- " \n",
- " Return\n",
- " ------\n",
- " Kmatrix/Kernel : Numpy matrix/int\n",
- " Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.\n",
- " \"\"\"\n",
- " \n",
- "# print(args)\n",
- " Gn = args[0]\n",
- "# print(Gn)\n",
- "\n",
- " Kmatrix = np.zeros((len(Gn), len(Gn)))\n",
- " all_num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs\n",
- " \n",
- " # initial for height = 0\n",
- " print('\\n --- height = 0 --- ')\n",
- " all_labels_ori = set() # all unique original labels in all graphs in this iteration\n",
- " all_num_of_each_label = [] # number of occurrences of each label in each graph in this iteration\n",
- " all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration\n",
- " num_of_labels_occured = all_num_of_labels_occured # number of the set of letters that occur before as node labels at least once in all graphs\n",
- "\n",
- " # for each graph\n",
- " for idx, G in enumerate(Gn):\n",
- " # get the set of original labels\n",
- " print('\\n --- for graph %d --- \\n' % (idx))\n",
- " labels_ori = list(nx.get_node_attributes(G, 'label').values())\n",
- " print('labels_ori: %s' % (labels_ori))\n",
- " num_of_each_label = dict(Counter(labels_ori)) # number of occurrences of each label in graph\n",
- " print('num_of_each_label: %s' % (num_of_each_label))\n",
- " all_num_of_each_label.append(num_of_each_label)\n",
- " print('all_num_of_each_label: %s' % (all_num_of_each_label))\n",
- " num_of_labels = len(num_of_each_label) # number of all unique labels\n",
- " print('num_of_labels: %s' % (num_of_labels))\n",
- " \n",
- "\n",
- " all_labels_ori.update(labels_ori)\n",
- " print('all_labels_ori: %s' % (all_labels_ori))\n",
- " \n",
- " # calculate subtree kernel with the 0th iteration and add it to the final kernel\n",
- " for i in range(0, len(Gn)):\n",
- " for j in range(i, len(Gn)):\n",
- " labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys()))\n",
- " print('\\n labels: %s' % (labels))\n",
- " vector1 = np.matrix([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ])\n",
- " vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ])\n",
- " print('\\n vector1: %s' % (vector1))\n",
- " print('\\n vector2: %s' % (vector2))\n",
- " Kmatrix[i][j] += np.dot(vector1, vector2.transpose())\n",
- " Kmatrix[j][i] = Kmatrix[i][j]\n",
- " \n",
- " \n",
- " \n",
- " # iterate each height\n",
- " for h in range(height + 1):\n",
- " print('\\n --- height = %d --- ' % (h))\n",
- " all_labels_ori = set() # all unique original labels in all graphs in this iteration\n",
- "# all_labels_comp = set() # all unique compressed labels in all graphs in this iteration\n",
- " all_num_of_each_label = [] # number of occurrences of each label in each graph in this iteration\n",
- " all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration\n",
- " num_of_labels_occured = all_num_of_labels_occured # number of the set of letters that occur before as node labels at least once in all graphs\n",
- " \n",
- " # for each graph\n",
- " for idx, G in enumerate(Gn):\n",
- " # get the set of original labels\n",
- " print('\\n --- for graph %d --- \\n' % (idx))\n",
- " labels_ori = list(nx.get_node_attributes(G, 'label').values())\n",
- " print('labels_ori: %s' % (labels_ori))\n",
- " num_of_each_label = dict(Counter(labels_ori)) # number of occurrences of each label in graph\n",
- " print('num_of_each_label: %s' % (num_of_each_label))\n",
- " num_of_labels = len(num_of_each_label) # number of all unique labels\n",
- " print('num_of_labels: %s' % (num_of_labels))\n",
- " \n",
- " all_labels_ori.update(labels_ori)\n",
- " print('all_labels_ori: %s' % (all_labels_ori))\n",
- " # num_of_labels_occured += num_of_labels #@todo not precise\n",
- " num_of_labels_occured = all_num_of_labels_occured + len(all_labels_ori) + len(all_set_compressed)\n",
- " print('num_of_labels_occured: %s' % (num_of_labels_occured))\n",
- " \n",
- " set_multisets = []\n",
- " for node in G.nodes(data = True):\n",
- " # Multiset-label determination.\n",
- " multiset = [ G.node[neighbors]['label'] for neighbors in G[node[0]] ]\n",
- " # sorting each multiset\n",
- " multiset.sort()\n",
- " multiset = node[1]['label'] + ''.join(multiset) # concatenate to a string and add the prefix \n",
- " set_multisets.append(multiset)\n",
- " print('multiset: %s' % (set_multisets))\n",
- "\n",
- " # label compression\n",
- " # set_multisets.sort() # this is unnecessary\n",
- " set_unique = list(set(set_multisets)) # set of unique multiset labels\n",
- " print('set_unique: %s' % (set_unique))\n",
- " # a dictionary mapping original labels to new ones. \n",
- " set_compressed = {}\n",
- " # if a label occurred before, assign its former compressed label; otherwise assign the number of labels occurred + 1 as the compressed label \n",
- " for value in set_unique:\n",
- " if value in all_set_compressed.keys():\n",
- " set_compressed.update({ value : all_set_compressed[value] })\n",
- " else:\n",
- " set_compressed.update({ value : str(num_of_labels_occured + 1) })\n",
- " num_of_labels_occured += 1\n",
- "# set_compressed = { value : (all_set_compressed[value] if value in all_set_compressed.keys() else str(set_unique.index(value) + num_of_labels_occured + 1)) for value in set_unique }\n",
- " print('set_compressed: %s' % (set_compressed))\n",
- " \n",
- " all_set_compressed.update(set_compressed)\n",
- " print('all_set_compressed: %s' % (all_set_compressed))\n",
- "# num_of_labels_occured += len(set_compressed) #@todo not precise\n",
- " print('num_of_labels_occured: %s' % (num_of_labels_occured))\n",
- " \n",
- "\n",
- " # relabel nodes\n",
- " # nx.relabel_nodes(G, set_compressed, copy = False)\n",
- " for node in G.nodes(data = True):\n",
- " node[1]['label'] = set_compressed[set_multisets[node[0]]]\n",
- " print('\\n compressed labels: %s' % (nx.get_node_attributes(G, 'label')))\n",
- "\n",
- " # get the set of compressed labels\n",
- " labels_comp = list(nx.get_node_attributes(G, 'label').values())\n",
- " print('labels_comp: %s' % (labels_comp))\n",
- " num_of_each_label.update(dict(Counter(labels_comp)))\n",
- " print('num_of_each_label: %s' % (num_of_each_label))\n",
- " all_num_of_each_label.append(num_of_each_label)\n",
- " print('all_num_of_each_label: %s' % (all_num_of_each_label))\n",
- " \n",
- " # calculate subtree kernel with h iterations and add it to the final kernel\n",
- " for i in range(0, len(Gn)):\n",
- " for j in range(i, len(Gn)):\n",
- " labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys()))\n",
- " print('\\n labels: %s' % (labels))\n",
- " vector1 = np.matrix([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ])\n",
- " vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ])\n",
- " print('\\n vector1: %s' % (vector1))\n",
- " print('\\n vector2: %s' % (vector2))\n",
- " Kmatrix[i][j] += np.dot(vector1, vector2.transpose())\n",
- " Kmatrix[j][i] = Kmatrix[i][j]\n",
- " \n",
- " all_num_of_labels_occured += len(all_labels_ori)\n",
- " print('\\n all_num_of_labels_occured: %s' % (all_num_of_labels_occured))\n",
- " print('\\n Kmatrix: %s' % (Kmatrix))\n",
- "\n",
- " return Kmatrix\n",
- "\n",
- " \n",
- "# main\n",
- "import sys\n",
- "from collections import Counter\n",
- "import networkx as nx\n",
- "sys.path.insert(0, \"../\")\n",
- "from pygraph.utils.graphfiles import loadDataset\n",
- "from pygraph.kernels.spkernel import spkernel\n",
- "\n",
- "dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
- "G1 = dataset[15]\n",
- "print(nx.get_node_attributes(G1, 'label'))\n",
- "G2 = dataset[80]\n",
- "print(nx.get_node_attributes(G2, 'label'))\n",
- "\n",
- "weisfeilerlehmankernel(G1, G2, height = 1)\n",
- "# Kmatrix = weisfeilerlehmankernel(G1, G2)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "- This script take as input a kernel matrix\n",
- "and returns the classification or regression performance\n",
- "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
- "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
- "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
- "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
- "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
- "correspond to the average of the performances on the test sets. \n",
- "\n",
- "@references\n",
- " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
- "\n",
- "\n",
- " --- calculating kernel matrix when subtree height = 0 ---\n",
- "\n",
- " Loading dataset from file...\n",
- "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n",
- " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n",
- " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n",
- " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n",
- " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n",
- " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n",
- " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n",
- " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n",
- " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n",
- " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n",
- " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n",
- " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n",
- " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n",
- " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n",
- " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n",
- " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n",
- " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n",
- " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 0.49373626708984375 seconds ---\n",
- "[[ 10. 10. 4. ..., 20. 20. 20.]\n",
- " [ 10. 16. 4. ..., 20. 20. 20.]\n",
- " [ 4. 4. 10. ..., 22. 22. 24.]\n",
- " ..., \n",
- " [ 20. 20. 22. ..., 130. 130. 122.]\n",
- " [ 20. 20. 22. ..., 130. 130. 122.]\n",
- " [ 20. 20. 24. ..., 122. 122. 154.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on val set: 12.761978\n",
- "With standard deviation: 10.086502\n",
- "\n",
- " Mean performance on test set: 9.014031\n",
- "With standard deviation: 6.357865\n",
- "\n",
- " --- calculating kernel matrix when subtree height = 1 ---\n",
- "\n",
- " Loading dataset from file...\n",
- "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n",
- " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n",
- " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n",
- " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n",
- " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n",
- " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n",
- " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n",
- " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n",
- " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n",
- " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n",
- " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n",
- " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n",
- " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n",
- " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n",
- " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n",
- " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n",
- " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n",
- " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 1.0043864250183105 seconds ---\n",
- "[[ 20. 14. 8. ..., 20. 20. 22.]\n",
- " [ 14. 32. 4. ..., 28. 28. 22.]\n",
- " [ 8. 4. 20. ..., 25. 25. 30.]\n",
- " ..., \n",
- " [ 20. 28. 25. ..., 188. 180. 145.]\n",
- " [ 20. 28. 25. ..., 180. 182. 145.]\n",
- " [ 22. 22. 30. ..., 145. 145. 238.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on val set: 22.747869\n",
- "With standard deviation: 7.561365\n",
- "\n",
- " Mean performance on test set: 19.457133\n",
- "With standard deviation: 5.057464\n",
- "\n",
- " --- calculating kernel matrix when subtree height = 2 ---\n",
- "\n",
- " Loading dataset from file...\n",
- "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n",
- " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n",
- " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n",
- " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n",
- " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n",
- " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n",
- " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n",
- " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n",
- " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n",
- " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n",
- " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n",
- " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n",
- " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n",
- " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n",
- " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n",
- " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n",
- " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n",
- " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 1.602942705154419 seconds ---\n",
- "[[ 30. 14. 8. ..., 20. 20. 23.]\n",
- " [ 14. 48. 4. ..., 28. 28. 22.]\n",
- " [ 8. 4. 30. ..., 25. 25. 32.]\n",
- " ..., \n",
- " [ 20. 28. 25. ..., 246. 209. 147.]\n",
- " [ 20. 28. 25. ..., 209. 220. 147.]\n",
- " [ 23. 22. 32. ..., 147. 147. 286.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on val set: 21.407092\n",
- "With standard deviation: 6.415967\n",
- "\n",
- " Mean performance on test set: 23.466810\n",
- "With standard deviation: 5.836831\n",
- "\n",
- " --- calculating kernel matrix when subtree height = 3 ---\n",
- "\n",
- " Loading dataset from file...\n",
- "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n",
- " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n",
- " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n",
- " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n",
- " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n",
- " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n",
- " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n",
- " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n",
- " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n",
- " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n",
- " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n",
- " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n",
- " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n",
- " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n",
- " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n",
- " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n",
- " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n",
- " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 2.2096023559570312 seconds ---\n",
- "[[ 40. 14. 8. ..., 20. 20. 23.]\n",
- " [ 14. 64. 4. ..., 28. 28. 22.]\n",
- " [ 8. 4. 40. ..., 25. 25. 32.]\n",
- " ..., \n",
- " [ 20. 28. 25. ..., 304. 217. 147.]\n",
- " [ 20. 28. 25. ..., 217. 250. 147.]\n",
- " [ 23. 22. 32. ..., 147. 147. 314.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on val set: 24.747018\n",
- "With standard deviation: 6.547340\n",
- "\n",
- " Mean performance on test set: 27.961360\n",
- "With standard deviation: 6.291821\n",
- "\n",
- " --- calculating kernel matrix when subtree height = 4 ---\n",
- "\n",
- " Loading dataset from file...\n",
- "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n",
- " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n",
- " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n",
- " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n",
- " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n",
- " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n",
- " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n",
- " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n",
- " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n",
- " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n",
- " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n",
- " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n",
- " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n",
- " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n",
- " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n",
- " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n",
- " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n",
- " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 2.7832393646240234 seconds ---\n",
- "[[ 50. 14. 8. ..., 20. 20. 23.]\n",
- " [ 14. 80. 4. ..., 28. 28. 22.]\n",
- " [ 8. 4. 50. ..., 25. 25. 32.]\n",
- " ..., \n",
- " [ 20. 28. 25. ..., 362. 217. 151.]\n",
- " [ 20. 28. 25. ..., 217. 280. 147.]\n",
- " [ 23. 22. 32. ..., 151. 147. 336.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on val set: 27.197367\n",
- "With standard deviation: 5.980185\n",
- "\n",
- " Mean performance on test set: 30.614531\n",
- "With standard deviation: 6.852841\n",
- "\n",
- " --- calculating kernel matrix when subtree height = 5 ---\n",
- "\n",
- " Loading dataset from file...\n",
- "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n",
- " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n",
- " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n",
- " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n",
- " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n",
- " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n",
- " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n",
- " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n",
- " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n",
- " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n",
- " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n",
- " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n",
- " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n",
- " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n",
- " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n",
- " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n",
- " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n",
- " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 3.276118040084839 seconds ---\n",
- "[[ 60. 14. 8. ..., 20. 20. 23.]\n",
- " [ 14. 96. 4. ..., 28. 28. 22.]\n",
- " [ 8. 4. 60. ..., 25. 25. 32.]\n",
- " ..., \n",
- " [ 20. 28. 25. ..., 420. 217. 151.]\n",
- " [ 20. 28. 25. ..., 217. 310. 147.]\n",
- " [ 23. 22. 32. ..., 151. 147. 358.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on val set: 29.010593\n",
- "With standard deviation: 6.073672\n",
- "\n",
- " Mean performance on test set: 32.130815\n",
- "With standard deviation: 7.062947\n",
- "\n",
- " --- calculating kernel matrix when subtree height = 6 ---\n",
- "\n",
- " Loading dataset from file...\n",
- "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n",
- " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n",
- " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n",
- " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n",
- " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n",
- " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n",
- " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n",
- " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n",
- " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n",
- " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n",
- " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n",
- " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n",
- " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n",
- " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n",
- " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n",
- " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n",
- " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n",
- " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 3.779860496520996 seconds ---\n",
- "[[ 70. 14. 8. ..., 20. 20. 23.]\n",
- " [ 14. 112. 4. ..., 28. 28. 22.]\n",
- " [ 8. 4. 70. ..., 25. 25. 32.]\n",
- " ..., \n",
- " [ 20. 28. 25. ..., 478. 217. 151.]\n",
- " [ 20. 28. 25. ..., 217. 340. 147.]\n",
- " [ 23. 22. 32. ..., 151. 147. 380.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on val set: 30.909632\n",
- "With standard deviation: 6.490001\n",
- "\n",
- " Mean performance on test set: 33.117974\n",
- "With standard deviation: 7.069399\n",
- "\n",
- " --- calculating kernel matrix when subtree height = 7 ---\n",
- "\n",
- " Loading dataset from file...\n",
- "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n",
- " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n",
- " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n",
- " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n",
- " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n",
- " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n",
- " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n",
- " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n",
- " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n",
- " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n",
- " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n",
- " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n",
- " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n",
- " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n",
- " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n",
- " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n",
- " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n",
- " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 4.446576833724976 seconds ---\n",
- "[[ 80. 14. 8. ..., 20. 20. 23.]\n",
- " [ 14. 128. 4. ..., 28. 28. 22.]\n",
- " [ 8. 4. 80. ..., 25. 25. 32.]\n",
- " ..., \n",
- " [ 20. 28. 25. ..., 536. 217. 151.]\n",
- " [ 20. 28. 25. ..., 217. 370. 147.]\n",
- " [ 23. 22. 32. ..., 151. 147. 402.]]\n",
- "\n",
- " Saving kernel matrix to file...\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- " Mean performance on val set: 31.870406\n",
- "With standard deviation: 6.522032\n",
- "\n",
- " Mean performance on test set: 33.964633\n",
- "With standard deviation: 7.270535\n",
- "\n",
- " --- calculating kernel matrix when subtree height = 8 ---\n",
- "\n",
- " Loading dataset from file...\n",
- "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n",
- " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n",
- " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n",
- " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n",
- " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n",
- " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n",
- " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n",
- " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n",
- " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n",
- " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n",
- " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n",
- " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n",
- " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n",
- " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n",
- " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n",
- " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n",
- " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n",
- " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 4.85552978515625 seconds ---\n",
- "[[ 90. 14. 8. ..., 20. 20. 23.]\n",
- " [ 14. 144. 4. ..., 28. 28. 22.]\n",
- " [ 8. 4. 90. ..., 25. 25. 32.]\n",
- " ..., \n",
- " [ 20. 28. 25. ..., 594. 217. 151.]\n",
- " [ 20. 28. 25. ..., 217. 400. 147.]\n",
- " [ 23. 22. 32. ..., 151. 147. 424.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on val set: 32.192715\n",
- "With standard deviation: 6.389616\n",
- "\n",
- " Mean performance on test set: 34.325288\n",
- "With standard deviation: 7.375800\n",
- "\n",
- " --- calculating kernel matrix when subtree height = 9 ---\n",
- "\n",
- " Loading dataset from file...\n",
- "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n",
- " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n",
- " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n",
- " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n",
- " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n",
- " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n",
- " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n",
- " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n",
- " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n",
- " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n",
- " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n",
- " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n",
- " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n",
- " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n",
- " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n",
- " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n",
- " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n",
- " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 5.650352239608765 seconds ---\n",
- "[[ 100. 14. 8. ..., 20. 20. 23.]\n",
- " [ 14. 160. 4. ..., 28. 28. 22.]\n",
- " [ 8. 4. 100. ..., 25. 25. 32.]\n",
- " ..., \n",
- " [ 20. 28. 25. ..., 652. 217. 151.]\n",
- " [ 20. 28. 25. ..., 217. 430. 147.]\n",
- " [ 23. 22. 32. ..., 151. 147. 446.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on val set: 32.842545\n",
- "With standard deviation: 6.213069\n",
- "\n",
- " Mean performance on test set: 34.675515\n",
- "With standard deviation: 7.314709\n",
- "\n",
- " --- calculating kernel matrix when subtree height = 10 ---\n",
- "\n",
- " Loading dataset from file...\n",
- "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n",
- " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n",
- " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n",
- " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n",
- " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n",
- " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n",
- " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n",
- " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n",
- " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n",
- " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n",
- " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n",
- " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n",
- " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n",
- " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n",
- " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n",
- " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n",
- " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n",
- " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 5.818731069564819 seconds ---\n",
- "[[ 110. 14. 8. ..., 20. 20. 23.]\n",
- " [ 14. 176. 4. ..., 28. 28. 22.]\n",
- " [ 8. 4. 110. ..., 25. 25. 32.]\n",
- " ..., \n",
- " [ 20. 28. 25. ..., 710. 217. 151.]\n",
- " [ 20. 28. 25. ..., 217. 460. 147.]\n",
- " [ 23. 22. 32. ..., 151. 147. 468.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on val set: 33.151974\n",
- "With standard deviation: 6.196320\n",
- "\n",
- " Mean performance on test set: 34.867215\n",
- "With standard deviation: 7.324672\n",
- "\n",
- "\n",
- " std height RMSE\n",
- "------- -------- --------\n",
- "6.35786 1 9.01403\n",
- "5.05746 2.1 19.4571\n",
- "5.83683 3.2 23.4668\n",
- "6.29182 4.3 27.9614\n",
- "6.85284 5.4 30.6145\n",
- "7.06295 6.5 32.1308\n",
- "7.0694 7.6 33.118\n",
- "7.27054 8.7 33.9646\n",
- "7.3758 9.8 34.3253\n",
- "7.31471 10.9 34.6755\n",
- "7.32467 12 34.8672\n"
- ]
- }
- ],
- "source": [
- "# Author: Elisabetta Ghisu\n",
- "# test of WL subtree kernel\n",
- "\n",
- "\"\"\"\n",
- "- This script take as input a kernel matrix\n",
- "and returns the classification or regression performance\n",
- "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
- "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
- "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
- "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
- "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
- "correspond to the average of the performances on the test sets. \n",
- "\n",
- "@references\n",
- " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
- "\"\"\"\n",
- "\n",
- "print(__doc__)\n",
- "\n",
- "import sys\n",
- "import os\n",
- "import pathlib\n",
- "sys.path.insert(0, \"../\")\n",
- "from tabulate import tabulate\n",
- "\n",
- "import random\n",
- "import numpy as np\n",
- "import matplotlib.pyplot as plt\n",
- "\n",
- "from sklearn.kernel_ridge import KernelRidge # 0.17\n",
- "from sklearn.metrics import accuracy_score, mean_squared_error\n",
- "from sklearn import svm\n",
- "\n",
- "from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel\n",
- "from pygraph.utils.graphfiles import loadDataset\n",
- "\n",
- "val_means_height = []\n",
- "val_stds_height = []\n",
- "test_means_height = []\n",
- "test_stds_height = []\n",
- "\n",
- "\n",
- "for height in np.linspace(0, 10, 11):\n",
- " print('\\n --- calculating kernel matrix when subtree height = %d ---' % height)\n",
- "\n",
- " print('\\n Loading dataset from file...')\n",
- " dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
- " y = np.array(y)\n",
- " print(y)\n",
- "\n",
- " # setup the parameters\n",
- " model_type = 'regression' # Regression or classification problem\n",
- " print('\\n --- This is a %s problem ---' % model_type)\n",
- "\n",
- " datasize = len(dataset)\n",
- " trials = 100 # Trials for hyperparameters random search\n",
- " splits = 10 # Number of splits of the data\n",
- " alpha_grid = np.logspace(-10, 10, num = trials, base = 10) # corresponds to (2*C)^-1 in other linear models such as LogisticRegression\n",
- " C_grid = np.logspace(-10, 10, num = trials, base = 10)\n",
- " random.seed(20) # Set the seed for uniform parameter distribution\n",
- "\n",
- " # set the output path\n",
- " kernel_file_path = 'kernelmatrices_weisfeilerlehman_subtree_acyclic/'\n",
- " if not os.path.exists(kernel_file_path):\n",
- " os.makedirs(kernel_file_path)\n",
- "\n",
- " \"\"\"\n",
- " - Here starts the main program\n",
- " - First we permute the data, then for each split we evaluate corresponding performances\n",
- " - In the end, the performances are averaged over the test sets\n",
- " \"\"\"\n",
- "\n",
- " # save kernel matrices to files / read kernel matrices from files\n",
- " kernel_file = kernel_file_path + 'km.ds'\n",
- " path = pathlib.Path(kernel_file)\n",
- " # get train set kernel matrix\n",
- " if path.is_file():\n",
- " print('\\n Loading the kernel matrix from file...')\n",
- " Kmatrix = np.loadtxt(kernel_file)\n",
- " print(Kmatrix)\n",
- " else:\n",
- " print('\\n Calculating kernel matrix, this could take a while...')\n",
- " Kmatrix = weisfeilerlehmankernel(dataset, height = int(height))\n",
- " print(Kmatrix)\n",
- " print('\\n Saving kernel matrix to file...')\n",
- " # np.savetxt(kernel_file, Kmatrix)\n",
- "\n",
- " # Initialize the performance of the best parameter trial on validation with the corresponding performance on test\n",
- " val_split = []\n",
- " test_split = []\n",
- "\n",
- " # For each split of the data\n",
- " for j in range(10, 10 + splits):\n",
- " # print('\\n Starting split %d...' % j)\n",
- "\n",
- " # Set the random seed for data permutation\n",
- " random_state = int(j)\n",
- " np.random.seed(random_state)\n",
- " idx_perm = np.random.permutation(datasize)\n",
- " # print(idx_perm)\n",
- "\n",
- " # Permute the data\n",
- " y_perm = y[idx_perm] # targets permutation\n",
- " # print(y_perm)\n",
- " Kmatrix_perm = Kmatrix[:, idx_perm] # inputs permutation\n",
- " # print(Kmatrix_perm)\n",
- " Kmatrix_perm = Kmatrix_perm[idx_perm, :] # inputs permutation\n",
- "\n",
- " # Set the training, validation and test\n",
- " # Note: the percentage can be set up by the user\n",
- " num_train_val = int((datasize * 90) / 100) # 90% (of entire dataset) for training and validation\n",
- " num_test = datasize - num_train_val # 10% (of entire dataset) for test\n",
- " num_train = int((num_train_val * 90) / 100) # 90% (of train + val) for training\n",
- " num_val = num_train_val - num_train # 10% (of train + val) for validation\n",
- "\n",
- " # Split the kernel matrix\n",
- " Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]\n",
- " Kmatrix_val = Kmatrix_perm[num_train:(num_train + num_val), 0:num_train]\n",
- " Kmatrix_test = Kmatrix_perm[(num_train + num_val):datasize, 0:num_train]\n",
- "\n",
- " # Split the targets\n",
- " y_train = y_perm[0:num_train]\n",
- "\n",
- " # Normalization step (for real valued targets only)\n",
- " if model_type == 'regression':\n",
- " # print('\\n Normalizing output y...')\n",
- " y_train_mean = np.mean(y_train)\n",
- " y_train_std = np.std(y_train)\n",
- " y_train = (y_train - y_train_mean) / float(y_train_std)\n",
- " # print(y)\n",
- "\n",
- " y_val = y_perm[num_train:(num_train + num_val)]\n",
- " y_test = y_perm[(num_train + num_val):datasize]\n",
- "\n",
- " # Record the performance for each parameter trial respectively on validation and test set\n",
- " perf_all_val = []\n",
- " perf_all_test = []\n",
- "\n",
- " # For each parameter trial\n",
- " for i in range(trials):\n",
- " # For regression use the Kernel Ridge method\n",
- " if model_type == 'regression':\n",
- " # print('\\n Starting experiment for trial %d and parameter alpha = %3f\\n ' % (i, alpha_grid[i]))\n",
- "\n",
- " # Fit the kernel ridge model\n",
- " KR = KernelRidge(kernel = 'precomputed', alpha = alpha_grid[i])\n",
- " # KR = svm.SVR(kernel = 'precomputed', C = C_grid[i])\n",
- " KR.fit(Kmatrix_train, y_train)\n",
- "\n",
- " # predict on the validation and test set\n",
- " y_pred = KR.predict(Kmatrix_val)\n",
- " y_pred_test = KR.predict(Kmatrix_test)\n",
- " # print(y_pred)\n",
- "\n",
- " # adjust prediction: needed because the training targets have been normalized\n",
- " y_pred = y_pred * float(y_train_std) + y_train_mean\n",
- " # print(y_pred)\n",
- " y_pred_test = y_pred_test * float(y_train_std) + y_train_mean\n",
- " # print(y_pred_test)\n",
- "\n",
- " # root mean squared error on validation\n",
- " rmse = np.sqrt(mean_squared_error(y_val, y_pred))\n",
- " perf_all_val.append(rmse)\n",
- "\n",
- " # root mean squared error in test \n",
- " rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n",
- " perf_all_test.append(rmse_test)\n",
- "\n",
- " # print('The performance on the validation set is: %3f' % rmse)\n",
- " # print('The performance on the test set is: %3f' % rmse_test)\n",
- "\n",
- " # --- FIND THE OPTIMAL PARAMETERS --- #\n",
- " # For regression: minimise the mean squared error\n",
- " if model_type == 'regression':\n",
- "\n",
- " # get optimal parameter (argmin of RMSE); NOTE(review): intended to be chosen on validation, but the next line takes argmin over perf_all_test - confirm\n",
- " min_idx = np.argmin(perf_all_test)\n",
- " alpha_opt = alpha_grid[min_idx]\n",
- "\n",
- " # performance corresponding to optimal parameter on val\n",
- " perf_val_opt = perf_all_val[min_idx]\n",
- "\n",
- " # corresponding performance on test for the same parameter\n",
- " perf_test_opt = perf_all_test[min_idx]\n",
- "\n",
- " # print('The best performance is for trial %d with parameter alpha = %3f' % (min_idx, alpha_opt))\n",
- " # print('The best performance on the validation set is: %3f' % perf_val_opt)\n",
- " # print('The corresponding performance on test set is: %3f' % perf_test_opt)\n",
- "\n",
- " # append the best performance on validation\n",
- " # at the current split\n",
- " val_split.append(perf_val_opt)\n",
- "\n",
- " # append the corresponding performance on the test set\n",
- " test_split.append(perf_test_opt)\n",
- "\n",
- " # average the results\n",
- " # mean of the validation performances over the splits\n",
- " val_mean = np.mean(np.asarray(val_split))\n",
- " # std deviation of validation over the splits\n",
- " val_std = np.std(np.asarray(val_split))\n",
- "\n",
- " # mean of the test performances over the splits\n",
- " test_mean = np.mean(np.asarray(test_split))\n",
- " # std deviation of the test over the splits\n",
- " test_std = np.std(np.asarray(test_split))\n",
- "\n",
- " print('\\n Mean performance on val set: %3f' % val_mean)\n",
- " print('With standard deviation: %3f' % val_std)\n",
- " print('\\n Mean performance on test set: %3f' % test_mean)\n",
- " print('With standard deviation: %3f' % test_std)\n",
- " \n",
- " val_means_height.append(val_mean)\n",
- " val_stds_height.append(val_std)\n",
- " test_means_height.append(test_mean)\n",
- " test_stds_height.append(test_std)\n",
- " \n",
- "print('\\n') \n",
- "print(tabulate({'height': np.linspace(1, 12, 11), 'RMSE': test_means_height, 'std': test_stds_height}, headers='keys'))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "- This script take as input a kernel matrix\n",
- "and returns the classification or regression performance\n",
- "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
- "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
- "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
- "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
- "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
- "correspond to the average of the performances on the test sets. \n",
- "\n",
- "@references\n",
- " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
- "\n",
- "\n",
- " --- calculating kernel matrix when subtree height = 0 ---\n",
- "\n",
- " Loading dataset from file...\n",
- "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n",
- " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n",
- " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n",
- " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n",
- " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n",
- " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n",
- " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n",
- " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n",
- " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n",
- " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n",
- " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n",
- " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n",
- " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n",
- " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n",
- " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n",
- " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n",
- " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n",
- " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman sp kernel matrix of size 185 built in 47.540945053100586 seconds ---\n",
- "[[ 6. 2. 6. ..., 2. 2. 2.]\n",
- " [ 2. 12. 2. ..., 0. 0. 6.]\n",
- " [ 6. 2. 6. ..., 2. 2. 2.]\n",
- " ..., \n",
- " [ 2. 0. 2. ..., 110. 42. 14.]\n",
- " [ 2. 0. 2. ..., 42. 110. 14.]\n",
- " [ 2. 6. 2. ..., 14. 14. 110.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on val set: 38.533318\n",
- "With standard deviation: 6.213602\n",
- "\n",
- " Mean performance on test set: 36.055557\n",
- "With standard deviation: 5.386696\n",
- "\n",
- " --- calculating kernel matrix when subtree height = 1 ---\n",
- "\n",
- " Loading dataset from file...\n",
- "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n",
- " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n",
- " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n",
- " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n",
- " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n",
- " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n",
- " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n",
- " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n",
- " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n",
- " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n",
- " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n",
- " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n",
- " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n",
- " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n",
- " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n",
- " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n",
- " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n",
- " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman sp kernel matrix of size 185 built in 75.94973611831665 seconds ---\n",
- "[[ 9. 3. 9. ..., 3. 3. 3.]\n",
- " [ 3. 18. 3. ..., 0. 0. 9.]\n",
- " [ 9. 3. 9. ..., 3. 3. 3.]\n",
- " ..., \n",
- " [ 3. 0. 3. ..., 165. 63. 21.]\n",
- " [ 3. 0. 3. ..., 63. 165. 21.]\n",
- " [ 3. 9. 3. ..., 21. 21. 165.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on val set: 38.464684\n",
- "With standard deviation: 6.299737\n",
- "\n",
- " Mean performance on test set: 36.054735\n",
- "With standard deviation: 5.384130\n",
- "\n",
- " --- calculating kernel matrix when subtree height = 2 ---\n",
- "\n",
- " Loading dataset from file...\n",
- "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n",
- " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n",
- " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n",
- " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n",
- " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n",
- " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n",
- " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n",
- " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n",
- " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n",
- " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n",
- " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n",
- " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n",
- " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n",
- " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n",
- " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n",
- " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n",
- " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n",
- " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman sp kernel matrix of size 185 built in 98.63305306434631 seconds ---\n",
- "[[ 12. 4. 12. ..., 4. 4. 4.]\n",
- " [ 4. 24. 4. ..., 0. 0. 12.]\n",
- " [ 12. 4. 12. ..., 4. 4. 4.]\n",
- " ..., \n",
- " [ 4. 0. 4. ..., 220. 84. 28.]\n",
- " [ 4. 0. 4. ..., 84. 220. 28.]\n",
- " [ 4. 12. 4. ..., 28. 28. 220.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on val set: 38.594816\n",
- "With standard deviation: 6.106887\n",
- "\n",
- " Mean performance on test set: 36.069839\n",
- "With standard deviation: 5.406605\n",
- "\n",
- " --- calculating kernel matrix when subtree height = 3 ---\n",
- "\n",
- " Loading dataset from file...\n",
- "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n",
- " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n",
- " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n",
- " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n",
- " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n",
- " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n",
- " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n",
- " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n",
- " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n",
- " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n",
- " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n",
- " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n",
- " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n",
- " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n",
- " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n",
- " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n",
- " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n",
- " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- " --- Weisfeiler-Lehman sp kernel matrix of size 185 built in 126.40115857124329 seconds ---\n",
- "[[ 15. 5. 15. ..., 5. 5. 5.]\n",
- " [ 5. 30. 5. ..., 0. 0. 15.]\n",
- " [ 15. 5. 15. ..., 5. 5. 5.]\n",
- " ..., \n",
- " [ 5. 0. 5. ..., 275. 105. 35.]\n",
- " [ 5. 0. 5. ..., 105. 275. 35.]\n",
- " [ 5. 15. 5. ..., 35. 35. 275.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on val set: 38.545772\n",
- "With standard deviation: 6.200795\n",
- "\n",
- " Mean performance on test set: 36.055164\n",
- "With standard deviation: 5.385283\n",
- "\n",
- " --- calculating kernel matrix when subtree height = 4 ---\n",
- "\n",
- " Loading dataset from file...\n",
- "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n",
- " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n",
- " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n",
- " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n",
- " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n",
- " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n",
- " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n",
- " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n",
- " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n",
- " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n",
- " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n",
- " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n",
- " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n",
- " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n",
- " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n",
- " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n",
- " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n",
- " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n"
- ]
- }
- ],
- "source": [
- "# Author: Elisabetta Ghisu\n",
- "# test of WL subtree kernel\n",
- "\n",
- "\"\"\"\n",
- "- This script takes as input a kernel matrix\n",
- "and returns the classification or regression performance\n",
- "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
- "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
- "- For prediction we divide the data in training, validation and test. For each split, we first train on the train data, \n",
- "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
- "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
- "correspond to the average of the performances on the test sets. \n",
- "\n",
- "@references\n",
- " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
- "\"\"\"\n",
- "\n",
- "print(__doc__)\n",
- "\n",
- "import sys\n",
- "import os\n",
- "import pathlib\n",
- "sys.path.insert(0, \"../\")\n",
- "from tabulate import tabulate\n",
- "\n",
- "import random\n",
- "import numpy as np\n",
- "import matplotlib.pyplot as plt\n",
- "\n",
- "from sklearn.kernel_ridge import KernelRidge # 0.17\n",
- "from sklearn.metrics import accuracy_score, mean_squared_error\n",
- "from sklearn import svm\n",
- "\n",
- "from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel\n",
- "from pygraph.utils.graphfiles import loadDataset\n",
- "\n",
- "val_means_height = []\n",
- "val_stds_height = []\n",
- "test_means_height = []\n",
- "test_stds_height = []\n",
- "\n",
- "\n",
- "for height in np.linspace(0, 10, 11):\n",
- " print('\\n --- calculating kernel matrix when subtree height = %d ---' % height)\n",
- "\n",
- " print('\\n Loading dataset from file...')\n",
- " dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
- " y = np.array(y)\n",
- " print(y)\n",
- "\n",
- " # setup the parameters\n",
- " model_type = 'regression' # Regression or classification problem\n",
- " print('\\n --- This is a %s problem ---' % model_type)\n",
- "\n",
- " datasize = len(dataset)\n",
- " trials = 100 # Trials for hyperparameters random search\n",
- " splits = 10 # Number of splits of the data\n",
- " alpha_grid = np.logspace(-10, 10, num = trials, base = 10) # corresponds to (2*C)^-1 in other linear models such as LogisticRegression\n",
- " C_grid = np.logspace(-10, 10, num = trials, base = 10)\n",
- " random.seed(20) # Set the seed for uniform parameter distribution\n",
- "\n",
- " # set the output path\n",
- " kernel_file_path = 'kernelmatrices_weisfeilerlehman_acyclic/'\n",
- " if not os.path.exists(kernel_file_path):\n",
- " os.makedirs(kernel_file_path)\n",
- "\n",
- "\n",
- " \"\"\"\n",
- " - Here starts the main program\n",
- " - First we permute the data, then for each split we evaluate corresponding performances\n",
- " - In the end, the performances are averaged over the test sets\n",
- " \"\"\"\n",
- "\n",
- " # save kernel matrices to files / read kernel matrices from files\n",
- " kernel_file = kernel_file_path + 'km.ds'\n",
- " path = pathlib.Path(kernel_file)\n",
- " # get train set kernel matrix\n",
- " if path.is_file():\n",
- " print('\\n Loading the kernel matrix from file...')\n",
- " Kmatrix = np.loadtxt(kernel_file)\n",
- " print(Kmatrix)\n",
- " else:\n",
- " print('\\n Calculating kernel matrix, this could take a while...')\n",
- " Kmatrix = weisfeilerlehmankernel(dataset, height = int(height), base_kernel = 'sp')\n",
- " print(Kmatrix)\n",
- " print('\\n Saving kernel matrix to file...')\n",
- "# np.savetxt(kernel_file, Kmatrix)\n",
- "\n",
- " # Initialize the performance of the best parameter trial on validation with the corresponding performance on test\n",
- " val_split = []\n",
- " test_split = []\n",
- "\n",
- " # For each split of the data\n",
- " for j in range(10, 10 + splits):\n",
- " # print('\\n Starting split %d...' % j)\n",
- "\n",
- " # Set the random seed for data permutation\n",
- " random_state = int(j)\n",
- " np.random.seed(random_state)\n",
- " idx_perm = np.random.permutation(datasize)\n",
- " # print(idx_perm)\n",
- "\n",
- " # Permute the data\n",
- " y_perm = y[idx_perm] # targets permutation\n",
- " # print(y_perm)\n",
- " Kmatrix_perm = Kmatrix[:, idx_perm] # inputs permutation\n",
- " # print(Kmatrix_perm)\n",
- " Kmatrix_perm = Kmatrix_perm[idx_perm, :] # inputs permutation\n",
- "\n",
- " # Set the training, validation and test\n",
- " # Note: the percentage can be set up by the user\n",
- " num_train_val = int((datasize * 90) / 100) # 90% (of entire dataset) for training and validation\n",
- " num_test = datasize - num_train_val # 10% (of entire dataset) for test\n",
- " num_train = int((num_train_val * 90) / 100) # 90% (of train + val) for training\n",
- " num_val = num_train_val - num_train # 10% (of train + val) for validation\n",
- "\n",
- " # Split the kernel matrix\n",
- " Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]\n",
- " Kmatrix_val = Kmatrix_perm[num_train:(num_train + num_val), 0:num_train]\n",
- " Kmatrix_test = Kmatrix_perm[(num_train + num_val):datasize, 0:num_train]\n",
- "\n",
- " # Split the targets\n",
- " y_train = y_perm[0:num_train]\n",
- "\n",
- " # Normalization step (for real valued targets only)\n",
- " if model_type == 'regression':\n",
- " # print('\\n Normalizing output y...')\n",
- " y_train_mean = np.mean(y_train)\n",
- " y_train_std = np.std(y_train)\n",
- " y_train = (y_train - y_train_mean) / float(y_train_std)\n",
- " # print(y)\n",
- "\n",
- " y_val = y_perm[num_train:(num_train + num_val)]\n",
- " y_test = y_perm[(num_train + num_val):datasize]\n",
- "\n",
- " # Record the performance for each parameter trial respectively on validation and test set\n",
- " perf_all_val = []\n",
- " perf_all_test = []\n",
- "\n",
- " # For each parameter trial\n",
- " for i in range(trials):\n",
- " # For regression use the Kernel Ridge method\n",
- " if model_type == 'regression':\n",
- " # print('\\n Starting experiment for trial %d and parameter alpha = %3f\\n ' % (i, alpha_grid[i]))\n",
- "\n",
- " # Fit the kernel ridge model\n",
- " KR = KernelRidge(kernel = 'precomputed', alpha = alpha_grid[i])\n",
- " # KR = svm.SVR(kernel = 'precomputed', C = C_grid[i])\n",
- " KR.fit(Kmatrix_train, y_train)\n",
- "\n",
- " # predict on the validation and test set\n",
- " y_pred = KR.predict(Kmatrix_val)\n",
- " y_pred_test = KR.predict(Kmatrix_test)\n",
- " # print(y_pred)\n",
- "\n",
- " # adjust prediction: needed because the training targets have been normalized\n",
- " y_pred = y_pred * float(y_train_std) + y_train_mean\n",
- " # print(y_pred)\n",
- " y_pred_test = y_pred_test * float(y_train_std) + y_train_mean\n",
- " # print(y_pred_test)\n",
- "\n",
- " # root mean squared error on validation\n",
- " rmse = np.sqrt(mean_squared_error(y_val, y_pred))\n",
- " perf_all_val.append(rmse)\n",
- "\n",
- " # root mean squared error in test \n",
- " rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n",
- " perf_all_test.append(rmse_test)\n",
- "\n",
- " # print('The performance on the validation set is: %3f' % rmse)\n",
- " # print('The performance on the test set is: %3f' % rmse_test)\n",
- "\n",
- " # --- FIND THE OPTIMAL PARAMETERS --- #\n",
- " # For regression: minimise the mean squared error\n",
- " if model_type == 'regression':\n",
- "\n",
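- " # NOTE(review): meant to pick the optimal parameter on validation, but the argmin below runs over perf_all_test (test-set RMSE) -- selecting on perf_all_val looks intended; confirm\n",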
- " min_idx = np.argmin(perf_all_test)\n",
- " alpha_opt = alpha_grid[min_idx]\n",
- "\n",
- " # performance corresponding to optimal parameter on val\n",
- " perf_val_opt = perf_all_val[min_idx]\n",
- "\n",
- " # corresponding performance on test for the same parameter\n",
- " perf_test_opt = perf_all_test[min_idx]\n",
- "\n",
- " # print('The best performance is for trial %d with parameter alpha = %3f' % (min_idx, alpha_opt))\n",
- " # print('The best performance on the validation set is: %3f' % perf_val_opt)\n",
- " # print('The corresponding performance on test set is: %3f' % perf_test_opt)\n",
- "\n",
- " # append the best performance on validation\n",
- " # at the current split\n",
- " val_split.append(perf_val_opt)\n",
- "\n",
- " # append the corresponding performance on the test set\n",
- " test_split.append(perf_test_opt)\n",
- "\n",
- " # average the results\n",
- " # mean of the validation performances over the splits\n",
- " val_mean = np.mean(np.asarray(val_split))\n",
- " # std deviation of validation over the splits\n",
- " val_std = np.std(np.asarray(val_split))\n",
- "\n",
- " # mean of the test performances over the splits\n",
- " test_mean = np.mean(np.asarray(test_split))\n",
- " # std deviation of the test over the splits\n",
- " test_std = np.std(np.asarray(test_split))\n",
- "\n",
- " print('\\n Mean performance on val set: %3f' % val_mean)\n",
- " print('With standard deviation: %3f' % val_std)\n",
- " print('\\n Mean performance on test set: %3f' % test_mean)\n",
- " print('With standard deviation: %3f' % test_std)\n",
- " \n",
- " val_means_height.append(val_mean)\n",
- " val_stds_height.append(val_std)\n",
- " test_means_height.append(test_mean)\n",
- " test_stds_height.append(test_std)\n",
- " \n",
- "print('\\n') \n",
- "print(tabulate({'height': np.linspace(1, 12, 11), 'RMSE': test_means_height, 'std': test_stds_height}, headers='keys'))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{0: 'C', 1: 'C', 2: 'C', 3: 'C', 4: 'C', 5: 'O', 6: 'O'}"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# a = [0, 1, 3, 2]\n",
- "# b = [3, 2, 1, 0]\n",
- "# print(1 if a == b else 0)\n",
- "\n",
- "# max(1 ,2)\n",
- "\n",
- "# x = [ 'r', 'a', 's' ]\n",
- "# x.sort()\n",
- "# print(x)\n",
- "\n",
- "# def test1(*args, base = 'subtree'):\n",
- "# if base == 'subtree':\n",
- "# print('subtree')\n",
- "# elif base == 'edge':\n",
- "# print('edge')\n",
- "# else:\n",
- "# print('sp')\n",
- "\n",
- "# # function parameter usage test\n",
- "# test1('hello', 'hi', base = 'edge')\n",
- "\n",
- "# # python matrix calculation speed test\n",
- "# import numpy as np\n",
- "# import time\n",
- "\n",
- "# size = 100\n",
- "# m1 = np.random.random((size, size))\n",
- "# m2 = np.random.random((size, size))\n",
- "# itr = 1\n",
- "\n",
- "# start_time = time.time()\n",
- "# for i in range(itr):\n",
- "# np.dot(m1, m2)\n",
- "# print(time.time() - start_time)\n",
- "\n",
- "# start_time = time.time()\n",
- "# for j in range(itr):\n",
- "# result = np.zeros((size, size))\n",
- "# for i1 in range(size):\n",
- "# for i2 in range(size):\n",
- "# for i3 in range(size):\n",
- "# result[i1][i2] += m1[i1][i3] * m2[i3][i2]\n",
- "# print(time.time() - start_time)\n",
- "\n",
- "# start_time = time.time()\n",
- "# for i in range(itr):\n",
- "# print(np.dot(m1, m2))\n",
- "# print(time.time() - start_time)\n",
- "\n",
- "# start_time = time.time()\n",
- "# for j in range(itr):\n",
- "# result = np.zeros((size, size))\n",
- "# for i1 in range(size):\n",
- "# for i2 in range(size):\n",
- "# for i3 in range(size):\n",
- "# result[i1][i2] += m1[i1][i3] * m2[i3][i2]\n",
- "# print(result)\n",
- "# print(time.time() - start_time)\n",
- "\n",
- "# help(np.sum)\n",
- "\n",
- "# test dict\n",
- "import sys\n",
- "from collections import Counter\n",
- "import networkx as nx\n",
- "sys.path.insert(0, \"../\")\n",
- "from pygraph.utils.graphfiles import loadDataset\n",
- "from pygraph.kernels.spkernel import spkernel\n",
- "\n",
- "dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
- "G1 = dataset[15]\n",
- "nx.get_node_attributes(G1, 'label')\n",
- "listhqhq = list(nx.get_node_attributes(G1, 'label').values())\n",
- "dicthaha = dict(Counter(listhqhq))\n",
- "len(dicthaha)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.5.2"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
|