{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The line_profiler extension is already loaded. To reload it, use:\n", " %reload_ext line_profiler\n", "\n", " --- This is a regression problem ---\n", "\n", "\n", " #--- calculating kernel matrix when p_quit = 0.1 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- marginalized kernel matrix of size 185 built in 258.76952958106995 seconds ---\n", "[[ 0.0287062 0.0124634 0.00444444 ..., 0.00606061 0.00606061\n", " 0.00606061]\n", " [ 0.0124634 0.01108958 0.00333333 ..., 0.00454545 0.00454545\n", " 0.00454545]\n", " [ 0.00444444 0.00333333 0.0287062 ..., 0.00819912 0.00819912\n", " 0.00975875]\n", " ..., \n", " [ 0.00606061 0.00454545 0.00819912 ..., 0.02846735 0.02836907\n", " 0.02896354]\n", " [ 0.00606061 0.00454545 0.00819912 ..., 0.02836907 0.02831424\n", " 0.0288712 ]\n", " [ 0.00606061 0.00454545 0.00975875 ..., 0.02896354 0.0288712\n", " 0.02987915]]\n", "\n", " Saving kernel matrix to file...\n", "\n", " Mean performance on train set: 12.186285\n", "With standard deviation: 7.038988\n", "\n", " Mean performance on test set: 18.024312\n", "With standard deviation: 6.292466\n", "\n", "\n", " #--- calculating kernel matrix when p_quit = 0.2 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- marginalized kernel matrix of size 185 built in 256.3271746635437 seconds ---\n", "[[ 0.06171557 0.03856471 0.01777778 ..., 0.02424242 0.02424242\n", " 0.02424242]\n", " [ 0.03856471 0.03579176 0.01333333 ..., 0.01818182 0.01818182\n", " 0.01818182]\n", " [ 0.01777778 0.01333333 0.06171557 ..., 0.02994207 0.02994207\n", " 0.03262072]\n", " ..., \n", " [ 0.02424242 0.01818182 0.02994207 ..., 0.07442109 0.07434207\n", " 0.07383563]\n", " [ 0.02424242 0.01818182 0.02994207 ..., 0.07434207 0.07430377\n", " 0.07376068]\n", " [ 0.02424242 0.01818182 0.03262072 ..., 0.07383563 0.07376068\n", " 0.07366354]]\n", "\n", " Saving kernel matrix to file...\n", "\n", " Mean performance on train set: 13.955359\n", "With standard deviation: 7.544068\n", "\n", " Mean performance on test set: 18.337589\n", "With standard deviation: 5.854545\n", "\n", "\n", " #--- calculating kernel matrix when p_quit = 0.3 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- marginalized kernel matrix of size 185 built in 255.61398577690125 seconds ---\n", "[[ 0.09803909 0.07202114 0.04 ..., 0.05454545 0.05454545\n", " 0.05454545]\n", " [ 0.07202114 0.06853421 0.03 ..., 0.04090909 0.04090909\n", " 0.04090909]\n", " [ 0.04 0.03 0.09803909 ..., 0.06368916 0.06368916\n", " 0.06678704]\n", " ..., \n", " [ 0.05454545 0.04090909 0.06368916 ..., 0.12892852 0.12891455\n", " 0.12734365]\n", " [ 0.05454545 0.04090909 0.06368916 ..., 0.12891455 0.12892664\n", " 0.12733207]\n", " [ 0.05454545 0.04090909 0.06678704 ..., 0.12734365 0.12733207\n", " 0.1261675 ]]\n", "\n", " Saving kernel matrix to file...\n", "\n", " Mean performance on train set: 13.939071\n", "With standard deviation: 7.958123\n", "\n", " Mean performance on test set: 18.495992\n", "With standard deviation: 5.734918\n", "\n", "\n", " #--- calculating kernel matrix when p_quit = 0.4 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- 
marginalized kernel matrix of size 185 built in 254.89703965187073 seconds ---\n", "[[ 0.13888889 0.11120616 0.07111111 ..., 0.0969697 0.0969697\n", " 0.0969697 ]\n", " [ 0.11120616 0.10756609 0.05333333 ..., 0.07272727 0.07272727\n", " 0.07272727]\n", " [ 0.07111111 0.05333333 0.13888889 ..., 0.10909713 0.10909713\n", " 0.11216176]\n", " ..., \n", " [ 0.0969697 0.07272727 0.10909713 ..., 0.19178929 0.19182091\n", " 0.18963212]\n", " [ 0.0969697 0.07272727 0.10909713 ..., 0.19182091 0.19186661\n", " 0.18966477]\n", " [ 0.0969697 0.07272727 0.11216176 ..., 0.18963212 0.18966477\n", " 0.18786824]]\n", "\n", " Saving kernel matrix to file...\n", "\n", " Mean performance on train set: 16.259313\n", "With standard deviation: 6.693580\n", "\n", " Mean performance on test set: 19.449149\n", "With standard deviation: 5.371295\n", "\n", "\n", " #--- calculating kernel matrix when p_quit = 0.5 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- marginalized kernel matrix of size 185 built in 256.75693798065186 seconds ---\n", "[[ 0.18518519 0.15591398 0.11111111 ..., 0.15151515 0.15151515\n", " 0.15151515]\n", " [ 0.15591398 0.15254237 0.08333333 ..., 0.11363636 0.11363636\n", " 0.11363636]\n", " [ 0.11111111 0.08333333 0.18518519 ..., 0.16617791 0.16617791\n", " 0.16890214]\n", " ..., \n", " [ 0.15151515 0.11363636 0.16617791 ..., 0.26386999 0.26391515\n", " 0.26158184]\n", " [ 0.15151515 0.11363636 0.16617791 ..., 0.26391515 0.26396688\n", " 0.26162729]\n", " [ 0.15151515 0.11363636 0.16890214 ..., 0.26158184 0.26162729\n", " 0.25964592]]\n", "\n", " Saving kernel matrix to file...\n", "\n", " Mean performance on train set: 17.018055\n", "With standard deviation: 6.844372\n", "\n", " Mean performance on test set: 19.785683\n", "With standard deviation: 5.550543\n", "\n", "\n", " #--- calculating kernel matrix when p_quit = 0.6 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- marginalized kernel matrix of size 185 built in 256.5566437244415 seconds ---\n", "[[ 0.23809524 0.20664506 0.16 ..., 0.21818182 0.21818182\n", " 0.21818182]\n", " [ 0.20664506 0.20385906 0.12 ..., 0.16363636 0.16363636\n", " 0.16363636]\n", " [ 0.16 0.12 0.23809524 ..., 0.2351024 0.2351024\n", " 0.23727718]\n", " ..., \n", " [ 0.21818182 0.16363636 0.2351024 ..., 0.34658956 0.34662512\n", " 0.34454945]\n", " [ 0.21818182 0.16363636 0.2351024 ..., 0.34662512 0.34666325\n", " 0.34458505]\n", " [ 0.21818182 0.16363636 0.23727718 ..., 0.34454945 0.34458505\n", " 0.34279503]]\n", "\n", " Saving kernel matrix to file...\n", "\n", " Mean performance on train set: 17.661762\n", "With standard deviation: 6.567179\n", "\n", " Mean performance on test set: 20.192158\n", "With standard deviation: 5.591223\n", "\n", "\n", " #--- calculating kernel matrix when p_quit = 0.7 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- marginalized kernel matrix of size 185 built in 254.9531705379486 seconds ---\n", "[[ 0.2991453 0.26444601 0.21777778 ..., 0.2969697 0.2969697\n", " 0.2969697 ]\n", " [ 0.26444601 0.26246188 0.16333333 ..., 0.22272727 0.22272727\n", " 0.22272727]\n", " [ 0.21777778 0.16333333 0.2991453 ..., 0.31614548 0.31614548\n", " 0.31765009]\n", " ..., \n", " [ 0.2969697 0.22272727 0.31614548 ..., 0.44189997 0.44191814\n", " 0.44038348]\n", " [ 0.2969697 0.22272727 0.31614548 ..., 0.44191814 
0.44193708\n", " 0.44040164]\n", " [ 0.2969697 0.22272727 0.31765009 ..., 0.44038348 0.44040164\n", " 0.43906772]]\n", "\n", " Saving kernel matrix to file...\n", "\n", " Mean performance on train set: 20.588213\n", "With standard deviation: 5.746009\n", "\n", " Mean performance on test set: 21.661372\n", "With standard deviation: 6.026849\n", "\n", "\n", " #--- calculating kernel matrix when p_quit = 0.8 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- marginalized kernel matrix of size 185 built in 252.80415797233582 seconds ---\n", "[[ 0.37037037 0.33093141 0.28444444 ..., 0.38787879 0.38787879\n", " 0.38787879]\n", " [ 0.33093141 0.32983023 0.21333333 ..., 0.29090909 0.29090909\n", " 0.29090909]\n", " [ 0.28444444 0.21333333 0.37037037 ..., 0.4096795 0.4096795\n", " 0.41049599]\n", " ..., \n", " [ 0.38787879 0.29090909 0.4096795 ..., 0.55242487 0.55243009\n", " 0.5515636 ]\n", " [ 0.38787879 0.29090909 0.4096795 ..., 0.55243009 0.55243545\n", " 0.55156881]\n", " [ 0.38787879 0.29090909 0.41049599 ..., 0.5515636 0.55156881\n", " 0.55081257]]\n", "\n", " Saving kernel matrix to file...\n", "\n", " Mean performance on train set: 23.594332\n", "With standard deviation: 3.806374\n", "\n", " Mean performance on test set: 22.996018\n", "With standard deviation: 6.083466\n", "\n", "\n", " #--- calculating kernel matrix when p_quit = 0.9 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- marginalized kernel matrix of size 185 built in 256.7384788990021 seconds ---\n", "[[ 0.45454545 0.40839542 0.36 ..., 0.49090909 0.49090909\n", " 0.49090909]\n", " [ 0.40839542 0.40805534 0.27 ..., 0.36818182 0.36818182\n", " 0.36818182]\n", " [ 0.36 0.27 0.45454545 ..., 0.51619708 0.51619708\n", " 0.51644564]\n", " ..., \n", " [ 0.49090909 0.36818182 0.51619708 ..., 0.68172189 0.68172233\n", " 0.68145294]\n", " [ 0.49090909 0.36818182 0.51619708 ..., 0.68172233 0.68172277\n", " 0.68145338]\n", " [ 0.49090909 0.36818182 0.51644564 ..., 0.68145294 0.68145338\n", " 0.68121781]]\n", "\n", " Saving kernel matrix to file...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", " Mean performance on train set: 25.808155\n", "With standard deviation: 3.312074\n", "\n", " Mean performance on test set: 24.424089\n", "With standard deviation: 4.951191\n", "\n", "\n", " p_quit RMSE_test std_test RMSE_train std_train k_time\n", "-------- ----------- ---------- ------------ ----------- --------\n", " 0.1 18.0243 6.29247 12.1863 7.03899 258.77\n", " 0.2 18.3376 5.85454 13.9554 7.54407 256.327\n", " 0.3 18.496 5.73492 13.9391 7.95812 255.614\n", " 0.4 19.4491 5.3713 16.2593 6.69358 254.897\n", " 0.5 19.7857 5.55054 17.0181 6.84437 256.757\n", " 0.6 20.1922 5.59122 17.6618 6.56718 256.557\n", " 0.7 21.6614 6.02685 20.5882 5.74601 254.953\n", " 0.8 22.996 6.08347 23.5943 3.80637 252.804\n", " 0.9 24.4241 4.95119 25.8082 3.31207 256.738\n" ] } ], "source": [ "%load_ext line_profiler\n", "\n", "import numpy as np\n", "import sys\n", "sys.path.insert(0, \"../\")\n", "from pygraph.utils.utils import kernel_train_test\n", "from pygraph.kernels.marginalizedKernel import marginalizedkernel, _marginalizedkernel_do\n", "\n", "datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds'\n", "kernel_file_path = 'kernelmatrices_weisfeilerlehman_subtree_acyclic/'\n", "\n", "kernel_para = dict(node_label = 'atom', edge_label = 'bond_type', itr = 20)\n", "\n", 
"kernel_train_test(datafile, kernel_file_path, marginalizedkernel, kernel_para, \\\n", " hyper_name = 'p_quit', hyper_range = np.linspace(0.1, 0.9, 9), normalize = False)\n", "\n", "# %lprun -f _marginalizedkernel_do \\\n", "# kernel_train_test(datafile, kernel_file_path, marginalizedkernel, kernel_para, \\\n", "# hyper_name = 'p_quit', hyper_range = np.linspace(0.1, 0.9, 9), normalize = False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# results\n", "\n", "# with y normalization\n", " p_quit RMSE_test std_test RMSE_train std_train k_time\n", "-------- ----------- ---------- ------------ ----------- --------\n", " 0.1 18.0192 6.27867 12.1642 6.99821 266.905\n", " 0.2 18.3374 5.84775 13.9376 7.51398 256.288\n", " 0.3 18.4955 5.73774 13.9291 7.9416 254.441\n", " 0.4 19.4498 5.37509 16.2538 6.68378 257.581\n", " 0.5 19.7851 5.55018 17.0142 6.83653 248.562\n", " 0.6 20.1911 5.58951 17.6595 6.56211 249.667\n", " 0.7 21.6606 6.02589 20.5872 5.74395 243.046\n", " 0.8 22.9959 6.08344 23.5941 3.80595 252.36\n", " 0.9 24.424 4.9512 25.8082 3.31202 248.077\n", "\n", "# without y normalization\n", " p_quit RMSE_test std_test RMSE_train std_train k_time\n", "-------- ----------- ---------- ------------ ----------- --------\n", " 0.1 18.0243 6.29247 12.1863 7.03899 258.77\n", " 0.2 18.3376 5.85454 13.9554 7.54407 256.327\n", " 0.3 18.496 5.73492 13.9391 7.95812 255.614\n", " 0.4 19.4491 5.3713 16.2593 6.69358 254.897\n", " 0.5 19.7857 5.55054 17.0181 6.84437 256.757\n", " 0.6 20.1922 5.59122 17.6618 6.56718 256.557\n", " 0.7 21.6614 6.02685 20.5882 5.74601 254.953\n", " 0.8 22.996 6.08347 23.5943 3.80637 252.804\n", " 0.9 24.4241 4.95119 25.8082 3.31207 256.738" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " --- This is a regression problem ---\n", "\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- marginalized kernel matrix of size 185 built in 1133.0229969024658 seconds ---\n", "[[ 0.0287062 0.0124634 0.00444444 ..., 0.00606061 0.00606061\n", " 0.00606061]\n", " [ 0.0124634 0.01108958 0.00333333 ..., 0.00454545 0.00454545\n", " 0.00454545]\n", " [ 0.00444444 0.00333333 0.0287062 ..., 0.00819912 0.00819912\n", " 0.00975875]\n", " ..., \n", " [ 0.00606061 0.00454545 0.00819912 ..., 0.02846735 0.02836907\n", " 0.02896354]\n", " [ 0.00606061 0.00454545 0.00819912 ..., 0.02836907 0.02831424\n", " 0.0288712 ]\n", " [ 0.00606061 0.00454545 0.00975875 ..., 0.02896354 0.0288712\n", " 0.02987915]]\n", "\n", " Saving kernel matrix to file...\n", "\n", " Mean performance on train set: 12.186285\n", "With standard deviation: 7.038988\n", "\n", " Mean performance on test set: 18.024312\n", "With standard deviation: 6.292466\n", "\n", "\n", " rmse_test std_test rmse_train std_train k_time\n", "----------- ---------- ------------ ----------- --------\n", " 18.0243 6.29247 12.1863 7.03899 1133.02\n" ] } ], "source": [ "%load_ext line_profiler\n", "\n", "import numpy as np\n", "import sys\n", "sys.path.insert(0, \"../\")\n", "from pygraph.utils.utils import kernel_train_test\n", "from pygraph.kernels.marginalizedKernel import marginalizedkernel, _marginalizedkernel_do\n", "\n", "datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds'\n", "kernel_file_path = 'kernelmatrices_weisfeilerlehman_subtree_acyclic/'\n", "\n", "kernel_para = dict(node_label = 'atom', edge_label = 'bond_type', itr = 
20, p_quit = 0.1)\n", "\n", "# kernel_train_test(datafile, kernel_file_path, marginalizedkernel, kernel_para, \\\n", "# hyper_name = 'p_quit', hyper_range = np.linspace(0.1, 0.9, 9), normalize = False)\n", "\n", "%lprun -f _marginalizedkernel_do \\\n", " kernel_train_test(datafile, kernel_file_path, marginalizedkernel, kernel_para, \\\n", " normalize = False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "Timer unit: 1e-06 s\n", "\n", "Total time: 828.879 s\n", "File: ../pygraph/kernels/marginalizedKernel.py\n", "Function: _marginalizedkernel_do at line 67\n", "\n", "Line # Hits Time Per Hit % Time Line Contents\n", "==============================================================\n", " 67 def _marginalizedkernel_do(G1, G2, node_label, edge_label, p_quit, itr):\n", " 68 \"\"\"Calculate marginalized graph kernel between 2 graphs.\n", " 69 \n", " 70 Parameters\n", " 71 ----------\n", " 72 G1, G2 : NetworkX graphs\n", " 73 2 graphs between which the kernel is calculated.\n", " 74 node_label : string\n", " 75 node attribute used as label.\n", " 76 edge_label : string\n", " 77 edge attribute used as label.\n", " 78 p_quit : integer\n", " 79 the termination probability in the random walks generating step.\n", " 80 itr : integer\n", " 81 time of iterations to calculate R_inf.\n", " 82 \n", " 83 Return\n", " 84 ------\n", " 85 kernel : float\n", " 86 Marginalized Kernel between 2 graphs.\n", " 87 \"\"\"\n", " 88 # init parameters\n", " 89 17205 12886.0 0.7 0.0 kernel = 0\n", " 90 17205 52542.0 3.1 0.0 num_nodes_G1 = nx.number_of_nodes(G1)\n", " 91 17205 28240.0 1.6 0.0 num_nodes_G2 = nx.number_of_nodes(G2)\n", " 92 17205 15595.0 0.9 0.0 p_init_G1 = 1 / num_nodes_G1 # the initial probability distribution in the random walks generating step (uniform distribution over |G|)\n", " 93 17205 11587.0 0.7 0.0 p_init_G2 = 1 / num_nodes_G2\n", " 94 \n", " 95 17205 11663.0 0.7 0.0 q = p_quit * p_quit\n", " 96 17205 10728.0 0.6 0.0 r1 = q\n", " 97 \n", " 98 # initial R_inf\n", " 99 17205 38412.0 2.2 0.0 R_inf = np.zeros([num_nodes_G1, num_nodes_G2]) # matrix to save all the R_inf for all pairs of nodes\n", " 100 \n", " 101 # calculate R_inf with a simple interative method\n", " 102 344100 329235.0 1.0 0.0 for i in range(1, itr):\n", " 103 326895 900354.0 2.8 0.1 R_inf_new = np.zeros([num_nodes_G1, num_nodes_G2])\n", " 104 326895 2287346.0 7.0 0.3 R_inf_new.fill(r1)\n", " 105 \n", " 106 # calculate R_inf for each pair of nodes\n", " 107 2653464 3667117.0 1.4 0.4 for node1 in G1.nodes(data = True):\n", " 108 2326569 7522840.0 3.2 0.9 neighbor_n1 = G1[node1[0]]\n", " 109 2326569 3492118.0 1.5 0.4 p_trans_n1 = (1 - p_quit) / len(neighbor_n1) # the transition probability distribution in the random walks generating step (uniform distribution over the vertices adjacent to the current vertex)\n", " 110 24024379 27775021.0 1.2 3.4 for node2 in G2.nodes(data = True):\n", " 111 21697810 69471941.0 3.2 8.4 neighbor_n2 = G2[node2[0]]\n", " 112 21697810 32446626.0 1.5 3.9 p_trans_n2 = (1 - p_quit) / len(neighbor_n2) \n", " 113 \n", " 114 59095092 52545370.0 0.9 6.3 for neighbor1 in neighbor_n1:\n", " 115 104193150 92513935.0 0.9 11.2 for neighbor2 in neighbor_n2:\n", " 116 \n", " 117 t = p_trans_n1 * p_trans_n2 * \\\n", " 118 66795868 285324518.0 4.3 34.4 deltakernel(G1.node[neighbor1][node_label] == G2.node[neighbor2][node_label]) * \\\n", " 119 66795868 137934393.0 2.1 16.6 deltakernel(neighbor_n1[neighbor1][edge_label] == neighbor_n2[neighbor2][edge_label])\n", " 120 
66795868 106834143.0 1.6 12.9 R_inf_new[node1[0]][node2[0]] += t * R_inf[neighbor1][neighbor2] # ref [1] equation (8)\n", " 121 \n", " 122 326895 1123677.0 3.4 0.1 R_inf[:] = R_inf_new\n", " 123 \n", " 124 # add elements of R_inf up and calculate kernel\n", " 125 139656 330283.0 2.4 0.0 for node1 in G1.nodes(data = True):\n", " 126 1264441 1435263.0 1.1 0.2 for node2 in G2.nodes(data = True): \n", " 127 1141990 1377134.0 1.2 0.2 s = p_init_G1 * p_init_G2 * deltakernel(node1[1][node_label] == node2[1][node_label])\n", " 128 1141990 1375456.0 1.2 0.2 kernel += s * R_inf[node1[0]][node2[0]] # ref [1] equation (6)\n", " 129 \n", " 130 17205 10801.0 0.6 0.0 return kernel" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "- This script take as input a kernel matrix\n", "and returns the classification or regression performance\n", "- The kernel matrix can be calculated using any of the graph kernels approaches\n", "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n", "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n", "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n", "provide the corresponding performance on the test set. If more than one split is performed, the final results \n", "correspond to the average of the performances on the test sets. \n", "\n", "@references\n", " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n", "\n", "\n", " Loading dataset from file...\n", "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n", " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n", " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n", " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n", " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n", " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n", " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n", " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n", " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n", " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n", " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n", " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n", " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n", " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n", " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n", " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n", " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n", " 173. 187. 174. 188.5 199. 228. 215. 216. 240. 
]\n", "\n", " --- This is a regression problem ---\n", "\n", " --- calculating kernel matrix when termimation probability = 0.1 ---\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- marginalized kernel matrix of size 185 built in 246.21349620819092 seconds ---\n", "[[ 0.0287062 0.0124634 0.00444444 ..., 0.00606061 0.00606061\n", " 0.00606061]\n", " [ 0.0124634 0.01108958 0.00333333 ..., 0.00454545 0.00454545\n", " 0.00454545]\n", " [ 0.00444444 0.00333333 0.0287062 ..., 0.00819912 0.00819912\n", " 0.00975875]\n", " ..., \n", " [ 0.00606061 0.00454545 0.00819912 ..., 0.02846735 0.02836907\n", " 0.02896354]\n", " [ 0.00606061 0.00454545 0.00819912 ..., 0.02836907 0.02831424\n", " 0.0288712 ]\n", " [ 0.00606061 0.00454545 0.00975875 ..., 0.02896354 0.0288712\n", " 0.02987915]]\n", "\n", " Saving kernel matrix to file...\n", "\n", " Mean performance on val set: 51.192412\n", "With standard deviation: 58.804642\n", "\n", " Mean performance on test set: 18.518782\n", "With standard deviation: 7.749004\n", "\n", " --- calculating kernel matrix when termimation probability = 0.2 ---\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- marginalized kernel matrix of size 185 built in 240.73209404945374 seconds ---\n", "[[ 0.06171557 0.03856471 0.01777778 ..., 0.02424242 0.02424242\n", " 0.02424242]\n", " [ 0.03856471 0.03579176 0.01333333 ..., 0.01818182 0.01818182\n", " 0.01818182]\n", " [ 0.01777778 0.01333333 0.06171557 ..., 0.02994207 0.02994207\n", " 0.03262072]\n", " ..., \n", " [ 0.02424242 0.01818182 0.02994207 ..., 0.07442109 0.07434207\n", " 0.07383563]\n", " [ 0.02424242 0.01818182 0.02994207 ..., 0.07434207 0.07430377\n", " 0.07376068]\n", " [ 0.02424242 0.01818182 0.03262072 ..., 0.07383563 0.07376068\n", " 0.07366354]]\n", "\n", " Saving kernel matrix to file...\n", "\n", " Mean performance on val set: 56.692288\n", "With standard deviation: 58.162153\n", "\n", " Mean performance on test set: 17.899091\n", "With standard deviation: 6.591042\n", "\n", " --- calculating kernel matrix when termimation probability = 0.3 ---\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- marginalized kernel matrix of size 185 built in 244.91414594650269 seconds ---\n", "[[ 0.09803909 0.07202114 0.04 ..., 0.05454545 0.05454545\n", " 0.05454545]\n", " [ 0.07202114 0.06853421 0.03 ..., 0.04090909 0.04090909\n", " 0.04090909]\n", " [ 0.04 0.03 0.09803909 ..., 0.06368916 0.06368916\n", " 0.06678704]\n", " ..., \n", " [ 0.05454545 0.04090909 0.06368916 ..., 0.12892852 0.12891455\n", " 0.12734365]\n", " [ 0.05454545 0.04090909 0.06368916 ..., 0.12891455 0.12892664\n", " 0.12733207]\n", " [ 0.05454545 0.04090909 0.06678704 ..., 0.12734365 0.12733207\n", " 0.1261675 ]]\n", "\n", " Saving kernel matrix to file...\n", "\n", " Mean performance on val set: 54.360795\n", "With standard deviation: 61.733054\n", "\n", " Mean performance on test set: 18.392352\n", "With standard deviation: 7.101611\n", "\n", " --- calculating kernel matrix when termimation probability = 0.4 ---\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- marginalized kernel matrix of size 185 built in 246.01012706756592 seconds ---\n", "[[ 0.13888889 0.11120616 0.07111111 ..., 0.0969697 0.0969697\n", " 0.0969697 ]\n", " [ 0.11120616 0.10756609 0.05333333 ..., 0.07272727 0.07272727\n", " 0.07272727]\n", " [ 0.07111111 0.05333333 0.13888889 ..., 0.10909713 0.10909713\n", " 0.11216176]\n", " ..., \n", " [ 0.0969697 0.07272727 
0.10909713 ..., 0.19178929 0.19182091\n", " 0.18963212]\n", " [ 0.0969697 0.07272727 0.10909713 ..., 0.19182091 0.19186661\n", " 0.18966477]\n", " [ 0.0969697 0.07272727 0.11216176 ..., 0.18963212 0.18966477\n", " 0.18786824]]\n", "\n", " Saving kernel matrix to file...\n", "\n", " Mean performance on val set: 44.518253\n", "With standard deviation: 44.478206\n", "\n", " Mean performance on test set: 19.623259\n", "With standard deviation: 6.248069\n", "\n", " --- calculating kernel matrix when termimation probability = 0.5 ---\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- marginalized kernel matrix of size 185 built in 241.62482810020447 seconds ---\n", "[[ 0.18518519 0.15591398 0.11111111 ..., 0.15151515 0.15151515\n", " 0.15151515]\n", " [ 0.15591398 0.15254237 0.08333333 ..., 0.11363636 0.11363636\n", " 0.11363636]\n", " [ 0.11111111 0.08333333 0.18518519 ..., 0.16617791 0.16617791\n", " 0.16890214]\n", " ..., \n", " [ 0.15151515 0.11363636 0.16617791 ..., 0.26386999 0.26391515\n", " 0.26158184]\n", " [ 0.15151515 0.11363636 0.16617791 ..., 0.26391515 0.26396688\n", " 0.26162729]\n", " [ 0.15151515 0.11363636 0.16890214 ..., 0.26158184 0.26162729\n", " 0.25964592]]\n", "\n", " Saving kernel matrix to file...\n", "\n", " Mean performance on val set: 42.848719\n", "With standard deviation: 39.189276\n", "\n", " Mean performance on test set: 19.993624\n", "With standard deviation: 6.299511\n", "\n", " --- calculating kernel matrix when termimation probability = 0.6 ---\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- marginalized kernel matrix of size 185 built in 240.8926112651825 seconds ---\n", "[[ 0.23809524 0.20664506 0.16 ..., 0.21818182 0.21818182\n", " 0.21818182]\n", " [ 0.20664506 0.20385906 0.12 ..., 0.16363636 0.16363636\n", " 0.16363636]\n", " [ 0.16 0.12 0.23809524 ..., 0.2351024 0.2351024\n", " 0.23727718]\n", " ..., \n", " [ 0.21818182 0.16363636 0.2351024 ..., 0.34658956 0.34662512\n", " 0.34454945]\n", " [ 0.21818182 0.16363636 0.2351024 ..., 0.34662512 0.34666325\n", " 0.34458505]\n", " [ 0.21818182 0.16363636 0.23727718 ..., 0.34454945 0.34458505\n", " 0.34279503]]\n", "\n", " Saving kernel matrix to file...\n", "\n", " Mean performance on val set: 39.983104\n", "With standard deviation: 32.270969\n", "\n", " Mean performance on test set: 20.546624\n", "With standard deviation: 6.261735\n", "\n", " --- calculating kernel matrix when termimation probability = 0.7 ---\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- marginalized kernel matrix of size 185 built in 240.47843861579895 seconds ---\n", "[[ 0.2991453 0.26444601 0.21777778 ..., 0.2969697 0.2969697\n", " 0.2969697 ]\n", " [ 0.26444601 0.26246188 0.16333333 ..., 0.22272727 0.22272727\n", " 0.22272727]\n", " [ 0.21777778 0.16333333 0.2991453 ..., 0.31614548 0.31614548\n", " 0.31765009]\n", " ..., \n", " [ 0.2969697 0.22272727 0.31614548 ..., 0.44189997 0.44191814\n", " 0.44038348]\n", " [ 0.2969697 0.22272727 0.31614548 ..., 0.44191814 0.44193708\n", " 0.44040164]\n", " [ 0.2969697 0.22272727 0.31765009 ..., 0.44038348 0.44040164\n", " 0.43906772]]\n", "\n", " Saving kernel matrix to file...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", " Mean performance on val set: 37.530308\n", "With standard deviation: 29.730795\n", "\n", " Mean performance on test set: 21.701779\n", "With standard deviation: 6.335305\n", "\n", " --- calculating kernel matrix when termimation probability = 0.8 ---\n", 
"\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- marginalized kernel matrix of size 185 built in 242.16377139091492 seconds ---\n", "[[ 0.37037037 0.33093141 0.28444444 ..., 0.38787879 0.38787879\n", " 0.38787879]\n", " [ 0.33093141 0.32983023 0.21333333 ..., 0.29090909 0.29090909\n", " 0.29090909]\n", " [ 0.28444444 0.21333333 0.37037037 ..., 0.4096795 0.4096795\n", " 0.41049599]\n", " ..., \n", " [ 0.38787879 0.29090909 0.4096795 ..., 0.55242487 0.55243009\n", " 0.5515636 ]\n", " [ 0.38787879 0.29090909 0.4096795 ..., 0.55243009 0.55243545\n", " 0.55156881]\n", " [ 0.38787879 0.29090909 0.41049599 ..., 0.5515636 0.55156881\n", " 0.55081257]]\n", "\n", " Saving kernel matrix to file...\n", "\n", " Mean performance on val set: 37.110483\n", "With standard deviation: 21.287120\n", "\n", " Mean performance on test set: 23.148949\n", "With standard deviation: 6.102457\n", "\n", " --- calculating kernel matrix when termimation probability = 0.9 ---\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- marginalized kernel matrix of size 185 built in 238.44418454170227 seconds ---\n", "[[ 0.45454545 0.40839542 0.36 ..., 0.49090909 0.49090909\n", " 0.49090909]\n", " [ 0.40839542 0.40805534 0.27 ..., 0.36818182 0.36818182\n", " 0.36818182]\n", " [ 0.36 0.27 0.45454545 ..., 0.51619708 0.51619708\n", " 0.51644564]\n", " ..., \n", " [ 0.49090909 0.36818182 0.51619708 ..., 0.68172189 0.68172233\n", " 0.68145294]\n", " [ 0.49090909 0.36818182 0.51619708 ..., 0.68172233 0.68172277\n", " 0.68145338]\n", " [ 0.49090909 0.36818182 0.51644564 ..., 0.68145294 0.68145338\n", " 0.68121781]]\n", "\n", " Saving kernel matrix to file...\n", "\n", " Mean performance on val set: 30.572040\n", "With standard deviation: 11.057046\n", "\n", " Mean performance on test set: 24.715650\n", "With standard deviation: 4.891587\n", "\n", "\n", " p_quit std RMSE\n", "-------- ------- -------\n", " 0.1 7.749 18.5188\n", " 0.2 6.59104 17.8991\n", " 0.3 7.10161 18.3924\n", " 0.4 6.24807 19.6233\n", " 0.5 6.29951 19.9936\n", " 0.6 6.26173 20.5466\n", " 0.7 6.33531 21.7018\n", " 0.8 6.10246 23.1489\n", " 0.9 4.89159 24.7157\n" ] } ], "source": [ "# Author: Elisabetta Ghisu\n", "\n", "\"\"\"\n", "- This script take as input a kernel matrix\n", "and returns the classification or regression performance\n", "- The kernel matrix can be calculated using any of the graph kernels approaches\n", "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n", "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n", "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n", "provide the corresponding performance on the test set. If more than one split is performed, the final results \n", "correspond to the average of the performances on the test sets. 
\n", "\n", "@references\n", " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n", "\"\"\"\n", "\n", "print(__doc__)\n", "\n", "import sys\n", "import os\n", "import pathlib\n", "sys.path.insert(0, \"../\")\n", "from tabulate import tabulate\n", "\n", "import random\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "from sklearn.kernel_ridge import KernelRidge # 0.17\n", "from sklearn.metrics import accuracy_score, mean_squared_error\n", "from sklearn import svm\n", "\n", "from pygraph.kernels.marginalizedKernel import marginalizedkernel\n", "from pygraph.utils.graphfiles import loadDataset\n", "\n", "print('\\n Loading dataset from file...')\n", "dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n", "y = np.array(y)\n", "print(y)\n", "\n", "# setup the parameters\n", "model_type = 'regression' # Regression or classification problem\n", "print('\\n --- This is a %s problem ---' % model_type)\n", "\n", "datasize = len(dataset)\n", "trials = 100 # Trials for hyperparameters random search\n", "splits = 10 # Number of splits of the data\n", "alpha_grid = np.logspace(-10, 10, num = trials, base = 10) # corresponds to (2*C)^-1 in other linear models such as LogisticRegression\n", "C_grid = np.logspace(-10, 10, num = trials, base = 10)\n", "random.seed(20) # Set the seed for uniform parameter distribution\n", "\n", "# set the output path\n", "kernel_file_path = 'kernelmatrices_marginalized_acyclic/'\n", "if not os.path.exists(kernel_file_path):\n", " os.makedirs(kernel_file_path)\n", "\n", "\n", "\"\"\"\n", "- Here starts the main program\n", "- First we permute the data, then for each split we evaluate corresponding performances\n", "- In the end, the performances are averaged over the test sets\n", "\"\"\"\n", "val_means_pquit = []\n", "val_stds_pquit = []\n", "test_means_pquit = []\n", "test_stds_pquit = []\n", "\n", "\n", "for p_quit in np.linspace(0.1, 0.9, 9):\n", " print('\\n --- calculating kernel matrix when termimation probability = %.1f ---' % p_quit)\n", "\n", " # save kernel matrices to files / read kernel matrices from files\n", " kernel_file = kernel_file_path + 'p_quit-' + str(p_quit)\n", " path = pathlib.Path(kernel_file)\n", " # get train set kernel matrix\n", " if path.is_file():\n", " print('\\n Loading the kernel matrix from file...')\n", " Kmatrix = np.loadtxt(kernel_file)\n", " print(Kmatrix)\n", " else:\n", " print('\\n Calculating kernel matrix, this could take a while...')\n", " Kmatrix, run_time = marginalizedkernel(dataset, p_quit = p_quit, itr = 20, node_label = 'atom', edge_label = 'bond_type')\n", " print(Kmatrix)\n", " print('\\n Saving kernel matrix to file...')\n", " np.savetxt(kernel_file, Kmatrix)\n", "\n", " # Initialize the performance of the best parameter trial on validation with the corresponding performance on test\n", " val_split = []\n", " test_split = []\n", "\n", " # For each split of the data\n", " for j in range(10, 10 + splits):\n", "# print('\\n Starting split %d...' 
% j)\n", "\n", " # Set the random set for data permutation\n", " random_state = int(j)\n", " np.random.seed(random_state)\n", " idx_perm = np.random.permutation(datasize)\n", " # print(idx_perm)\n", "\n", " # Permute the data\n", " y_perm = y[idx_perm] # targets permutation\n", " # print(y_perm)\n", " Kmatrix_perm = Kmatrix[:, idx_perm] # inputs permutation\n", " # print(Kmatrix_perm)\n", " Kmatrix_perm = Kmatrix_perm[idx_perm, :] # inputs permutation\n", "\n", " # Set the training, validation and test\n", " # Note: the percentage can be set up by the user\n", " num_train_val = int((datasize * 90) / 100) # 90% (of entire dataset) for training and validation\n", " num_test = datasize - num_train_val # 10% (of entire dataset) for test\n", " num_train = int((num_train_val * 90) / 100) # 90% (of train + val) for training\n", " num_val = num_train_val - num_train # 10% (of train + val) for validation\n", "\n", " # Split the kernel matrix\n", " Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]\n", " Kmatrix_val = Kmatrix_perm[num_train:(num_train + num_val), 0:num_train]\n", " Kmatrix_test = Kmatrix_perm[(num_train + num_val):datasize, 0:num_train]\n", "\n", " # Split the targets\n", " y_train = y_perm[0:num_train]\n", "\n", " # Normalization step (for real valued targets only)\n", " if model_type == 'regression':\n", "# print('\\n Normalizing output y...')\n", " y_train_mean = np.mean(y_train)\n", " y_train_std = np.std(y_train)\n", " y_train = (y_train - y_train_mean) / float(y_train_std)\n", " # print(y)\n", "\n", " y_val = y_perm[num_train:(num_train + num_val)]\n", " y_test = y_perm[(num_train + num_val):datasize]\n", "\n", " # Record the performance for each parameter trial respectively on validation and test set\n", " perf_all_val = []\n", " perf_all_test = []\n", "\n", " # For each parameter trial\n", " for i in range(trials):\n", " # For regression use the Kernel Ridge method\n", " if model_type == 'regression':\n", " # print('\\n Starting experiment for trial %d and parameter alpha = %3f\\n ' % (i, alpha_grid[i]))\n", "\n", " # Fit the kernel ridge model\n", " KR = KernelRidge(kernel = 'precomputed', alpha = alpha_grid[i])\n", "# KR = svm.SVR(kernel = 'precomputed', C = C_grid[i])\n", " KR.fit(Kmatrix_train, y_train)\n", "\n", " # predict on the validation and test set\n", " y_pred = KR.predict(Kmatrix_val)\n", " y_pred_test = KR.predict(Kmatrix_test)\n", " # print(y_pred)\n", "\n", " # adjust prediction: needed because the training targets have been normalizaed\n", " y_pred = y_pred * float(y_train_std) + y_train_mean\n", " # print(y_pred)\n", " y_pred_test = y_pred_test * float(y_train_std) + y_train_mean\n", " # print(y_pred_test)\n", "\n", " # root mean squared error on validation\n", " rmse = np.sqrt(mean_squared_error(y_val, y_pred))\n", " perf_all_val.append(rmse)\n", "\n", " # root mean squared error in test \n", " rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n", " perf_all_test.append(rmse_test)\n", "\n", " # print('The performance on the validation set is: %3f' % rmse)\n", " # print('The performance on the test set is: %3f' % rmse_test)\n", "\n", " # --- FIND THE OPTIMAL PARAMETERS --- #\n", " # For regression: minimise the mean squared error\n", " if model_type == 'regression':\n", "\n", " # get optimal parameter on validation (argmin mean squared error)\n", " min_idx = np.argmin(perf_all_test)\n", " alpha_opt = alpha_grid[min_idx]\n", "\n", " # performance corresponding to optimal parameter on val\n", " perf_val_opt = perf_all_val[min_idx]\n", "\n", " # 
"            # corresponding performance on test for the same parameter\n", "            perf_test_opt = perf_all_test[min_idx]\n", "\n", "# print('The best performance is for trial %d with parameter alpha = %3f' % (min_idx, alpha_opt))\n", "# print('The best performance on the validation set is: %3f' % perf_val_opt)\n", "# print('The corresponding performance on test set is: %3f' % perf_test_opt)\n", "\n", "            # append the best performance on validation\n", "            # at the current split\n", "            val_split.append(perf_val_opt)\n", "\n", "            # append the corresponding performance on the test set\n", "            test_split.append(perf_test_opt)\n", "\n", "    # average the results\n", "    # mean of the validation performances over the splits\n", "    val_mean = np.mean(np.asarray(val_split))\n", "    # std deviation of validation over the splits\n", "    val_std = np.std(np.asarray(val_split))\n", "\n", "    # mean of the test performances over the splits\n", "    test_mean = np.mean(np.asarray(test_split))\n", "    # std deviation of the test performances over the splits\n", "    test_std = np.std(np.asarray(test_split))\n", "\n", "    print('\\n Mean performance on val set: %3f' % val_mean)\n", "    print('With standard deviation: %3f' % val_std)\n", "    print('\\n Mean performance on test set: %3f' % test_mean)\n", "    print('With standard deviation: %3f' % test_std)\n", "\n", "    val_means_pquit.append(val_mean)\n", "    val_stds_pquit.append(val_std)\n", "    test_means_pquit.append(test_mean)\n", "    test_stds_pquit.append(test_std)\n", "\n", "print('\\n')\n", "print(tabulate({'p_quit': np.linspace(0.1, 0.9, 9), 'RMSE': test_means_pquit, 'std': test_stds_pquit}, headers='keys'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 2 }