|
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849 |
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 53,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[[0, 3, 1], [0, 3, 4, 2], [0, 3], [0, 3, 4], [1, 3, 4, 2], [1, 3], [1, 3, 4], [2, 4, 3], [2, 4], [3, 4]]\n",
- "10\n",
- "[[0, 4, 1], [0, 4, 5, 2], [0, 4, 5, 6, 3], [0, 4], [0, 4, 5], [0, 4, 5, 6], [1, 4, 5, 2], [1, 4, 5, 6, 3], [1, 4], [1, 4, 5], [1, 4, 5, 6], [2, 5, 6, 3], [2, 5, 4], [2, 5], [2, 5, 6], [3, 6, 5, 4], [3, 6, 5], [3, 6], [4, 5], [4, 5, 6], [5, 6]]\n",
- "21\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "1\n",
- "yes\n",
- "0.10952380952380952\n"
- ]
- }
- ],
- "source": [
- "import sys\n",
- "import networkx as nx\n",
- "sys.path.insert(0, \"../\")\n",
- "from pygraph.utils.graphfiles import loadDataset\n",
- "from pygraph.kernels.deltaKernel import deltaKernel\n",
- "\n",
- "dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
- "G1 = dataset[12]\n",
- "G2 = dataset[55]\n",
- "sp1 = []\n",
- "num_nodes = G1.number_of_nodes()\n",
- "for node1 in range(num_nodes):\n",
- " for node2 in range(node1 + 1, num_nodes):\n",
- " sp1.append(nx.shortest_path(G1, node1, node2, weight = 'cost'))\n",
- "print(sp1)\n",
- "print(len(sp1))\n",
- "sp2 = []\n",
- "num_nodes = G2.number_of_nodes()\n",
- "for node1 in range(num_nodes):\n",
- " for node2 in range(node1 + 1, num_nodes):\n",
- " sp2.append(nx.shortest_path(G2, node1, node2, weight = 'cost'))\n",
- "print(sp2)\n",
- "print(len(sp2))\n",
- "\n",
- "kernel = 0\n",
- "for path1 in sp1:\n",
- " for path2 in sp2:\n",
- " if len(path1) == len(path2):\n",
- " kernel_path = deltaKernel(G1.node[path1[0]]['label'] == G2.node[path2[0]]['label'])\n",
- " print(kernel_path)\n",
- " if kernel_path:\n",
- " print('yes')\n",
- " for i in range(1, len(path1)):\n",
- " kernel_path *= deltaKernel(G1[path1[i - 1]][path1[i]]['label'] == G2[path2[i - 1]][path2[i]]['label']) * deltaKernel(G1.node[path1[i]]['label'] == G2.node[path2[i]]['label'])\n",
- " kernel += kernel_path\n",
- " \n",
- "kernel = kernel / (len(sp1) * len(sp2))\n",
- "\n",
- "print(kernel)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "- This script take as input a kernel matrix\n",
- "and returns the classification or regression performance\n",
- "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
- "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
- "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
- "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
- "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
- "correspond to the average of the performances on the test sets. \n",
- "\n",
- "@references\n",
- " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
- "\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Normalizing output y...\n",
- "\n",
- " Loading the train set kernel matrix from file...\n",
- "[[ 0.15254237 0.08333333 0.0625 ..., 0.11363636 0.11363636\n",
- " 0.11363636]\n",
- " [ 0.08333333 0.18518519 0.15591398 ..., 0.16617791 0.16617791\n",
- " 0.16890214]\n",
- " [ 0.0625 0.15591398 0.15254237 ..., 0.12987013 0.12987013\n",
- " 0.13163636]\n",
- " ..., \n",
- " [ 0.11363636 0.16617791 0.12987013 ..., 0.26383753 0.2639004\n",
- " 0.26156557]\n",
- " [ 0.11363636 0.16617791 0.12987013 ..., 0.2639004 0.26396688\n",
- " 0.26162729]\n",
- " [ 0.11363636 0.16890214 0.13163636 ..., 0.26156557 0.26162729\n",
- " 0.25964592]]\n",
- "\n",
- " Loading the test set kernel matrix from file...\n",
- "[[ 0.18518519 0.1715847 0.11111111 0.16588603 0.11904762 0.16450216\n",
- " 0.17281421 0.14285714 0.125 0.16477273 0.16880154 0.14583333\n",
- " 0.1660693 0.16906445 0.13333333 0.16612903 0.16420966 0.16441006\n",
- " 0.15151515]\n",
- " [ 0.1715847 0.19988118 0.15173333 0.18435596 0.16465263 0.21184723\n",
- " 0.18985964 0.19960191 0.16819723 0.21540115 0.19575264 0.2041482\n",
- " 0.21842419 0.20001664 0.18754969 0.2205599 0.20506165 0.22256445\n",
- " 0.2141792 ]\n",
- " [ 0.11111111 0.15173333 0.16303156 0.13416478 0.16903494 0.16960573\n",
- " 0.13862936 0.18511129 0.16989276 0.17395417 0.14762351 0.18709221\n",
- " 0.17706477 0.15293506 0.17970939 0.17975775 0.16082785 0.18295252\n",
- " 0.19186573]\n",
- " [ 0.16588603 0.18435596 0.13416478 0.17413923 0.14529511 0.19230449\n",
- " 0.17775828 0.17598858 0.14892223 0.19462663 0.18166555 0.17986029\n",
- " 0.1964604 0.18450695 0.16510376 0.19788853 0.1876399 0.19921541\n",
- " 0.18843419]\n",
- " [ 0.11904762 0.16465263 0.16903494 0.14529511 0.17703225 0.18464872\n",
- " 0.15002895 0.19785455 0.17779663 0.18950917 0.16010081 0.2005743\n",
- " 0.19306131 0.16599977 0.19113529 0.1960531 0.175064 0.19963794\n",
- " 0.20696464]\n",
- " [ 0.16450216 0.21184723 0.16960573 0.19230449 0.18464872 0.23269314\n",
- " 0.19681552 0.22450276 0.1871932 0.23765844 0.20733248 0.22967925\n",
- " 0.241199 0.21337314 0.21125341 0.24426963 0.22285333 0.24802555\n",
- " 0.24156669]\n",
- " [ 0.17281421 0.18985964 0.13862936 0.17775828 0.15002895 0.19681552\n",
- " 0.18309269 0.18152273 0.15411585 0.19935309 0.18641218 0.18556038\n",
- " 0.20169527 0.18946029 0.17030032 0.20320694 0.19192382 0.2042596\n",
- " 0.19428999]\n",
- " [ 0.14285714 0.19960191 0.18511129 0.17598858 0.19785455 0.22450276\n",
- " 0.18152273 0.23269314 0.20168735 0.23049584 0.19407926 0.23694176\n",
- " 0.23486084 0.20134404 0.22042984 0.23854906 0.21275711 0.24302959\n",
- " 0.24678197]\n",
- " [ 0.125 0.16819723 0.16989276 0.14892223 0.17779663 0.1871932\n",
- " 0.15411585 0.20168735 0.18391356 0.19188588 0.16365606 0.20428161\n",
- " 0.1952436 0.16940489 0.1919249 0.19815511 0.17760881 0.20152837\n",
- " 0.20988805]\n",
- " [ 0.16477273 0.21540115 0.17395417 0.19462663 0.18950917 0.23765844\n",
- " 0.19935309 0.23049584 0.19188588 0.24296859 0.21058278 0.23586086\n",
- " 0.24679036 0.21702635 0.21699483 0.25006701 0.22724646 0.25407837\n",
- " 0.24818625]\n",
- " [ 0.16880154 0.19575264 0.14762351 0.18166555 0.16010081 0.20733248\n",
- " 0.18641218 0.19407926 0.16365606 0.21058278 0.19214629 0.19842989\n",
- " 0.21317298 0.19609213 0.18225175 0.2151567 0.20088139 0.2171273\n",
- " 0.20810339]\n",
- " [ 0.14583333 0.2041482 0.18709221 0.17986029 0.2005743 0.22967925\n",
- " 0.18556038 0.23694176 0.20428161 0.23586086 0.19842989 0.24154885\n",
- " 0.24042054 0.20590264 0.22439219 0.24421452 0.21769149 0.24880304\n",
- " 0.25200246]\n",
- " [ 0.1660693 0.21842419 0.17706477 0.1964604 0.19306131 0.241199\n",
- " 0.20169527 0.23486084 0.1952436 0.24679036 0.21317298 0.24042054\n",
- " 0.25107069 0.21988195 0.22126548 0.25446921 0.23058896 0.25855949\n",
- " 0.25312182]\n",
- " [ 0.16906445 0.20001664 0.15293506 0.18450695 0.16599977 0.21337314\n",
- " 0.18946029 0.20134404 0.16940489 0.21702635 0.19609213 0.20590264\n",
- " 0.21988195 0.20052959 0.18917551 0.22212027 0.2061696 0.22441239\n",
- " 0.21607563]\n",
- " [ 0.13333333 0.18754969 0.17970939 0.16510376 0.19113529 0.21125341\n",
- " 0.17030032 0.22042984 0.1919249 0.21699483 0.18225175 0.22439219\n",
- " 0.22126548 0.18917551 0.2112185 0.224781 0.20021961 0.22904467\n",
- " 0.23356012]\n",
- " [ 0.16612903 0.2205599 0.17975775 0.19788853 0.1960531 0.24426963\n",
- " 0.20320694 0.23854906 0.19815511 0.25006701 0.2151567 0.24421452\n",
- " 0.25446921 0.22212027 0.224781 0.25800115 0.23326559 0.26226067\n",
- " 0.25717144]\n",
- " [ 0.16420966 0.20506165 0.16082785 0.1876399 0.175064 0.22285333\n",
- " 0.19192382 0.21275711 0.17760881 0.22724646 0.20088139 0.21769149\n",
- " 0.23058896 0.2061696 0.20021961 0.23326559 0.21442192 0.2364528\n",
- " 0.22891788]\n",
- " [ 0.16441006 0.22256445 0.18295252 0.19921541 0.19963794 0.24802555\n",
- " 0.2042596 0.24302959 0.20152837 0.25407837 0.2171273 0.24880304\n",
- " 0.25855949 0.22441239 0.22904467 0.26226067 0.2364528 0.26687384\n",
- " 0.26210305]\n",
- " [ 0.15151515 0.2141792 0.19186573 0.18843419 0.20696464 0.24156669\n",
- " 0.19428999 0.24678197 0.20988805 0.24818625 0.20810339 0.25200246\n",
- " 0.25312182 0.21607563 0.23356012 0.25717144 0.22891788 0.26210305\n",
- " 0.26386999]]\n"
- ]
- },
- {
- "ename": "ValueError",
- "evalue": "Precomputed metric requires shape (n_queries, n_indexed). Got (19, 19) for 164 indexed.",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m<ipython-input-30-d4c5f46d5abf>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 134\u001b[0m \u001b[0;31m# predict on the test set\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 135\u001b[0;31m \u001b[0my_pred_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mKR\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mKmatrix_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 136\u001b[0m \u001b[0;31m# print(y_pred)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 137\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/kernel_ridge.py\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 182\u001b[0m \"\"\"\n\u001b[1;32m 183\u001b[0m \u001b[0mcheck_is_fitted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"X_fit_\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"dual_coef_\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 184\u001b[0;31m \u001b[0mK\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_kernel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mX_fit_\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 185\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mK\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdual_coef_\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/kernel_ridge.py\u001b[0m in \u001b[0;36m_get_kernel\u001b[0;34m(self, X, Y)\u001b[0m\n\u001b[1;32m 119\u001b[0m \"coef0\": self.coef0}\n\u001b[1;32m 120\u001b[0m return pairwise_kernels(X, Y, metric=self.kernel,\n\u001b[0;32m--> 121\u001b[0;31m filter_params=True, **params)\n\u001b[0m\u001b[1;32m 122\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/metrics/pairwise.py\u001b[0m in \u001b[0;36mpairwise_kernels\u001b[0;34m(X, Y, metric, filter_params, n_jobs, **kwds)\u001b[0m\n\u001b[1;32m 1389\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1390\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmetric\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"precomputed\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1391\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_pairwise_arrays\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprecomputed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1392\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1393\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmetric\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mGPKernel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/metrics/pairwise.py\u001b[0m in \u001b[0;36mcheck_pairwise_arrays\u001b[0;34m(X, Y, precomputed, dtype)\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[0;34m\"(n_queries, n_indexed). Got (%d, %d) \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[0;34m\"for %d indexed.\"\u001b[0m \u001b[0;34m%\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 119\u001b[0;31m (X.shape[0], X.shape[1], Y.shape[0]))\n\u001b[0m\u001b[1;32m 120\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 121\u001b[0m raise ValueError(\"Incompatible dimension for X and Y matrices: \"\n",
- "\u001b[0;31mValueError\u001b[0m: Precomputed metric requires shape (n_queries, n_indexed). Got (19, 19) for 164 indexed."
- ]
- }
- ],
- "source": [
- "# Author: Elisabetta Ghisu\n",
- "\n",
- "\"\"\"\n",
- "- This script take as input a kernel matrix\n",
- "and returns the classification or regression performance\n",
- "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
- "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
- "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
- "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
- "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
- "correspond to the average of the performances on the test sets. \n",
- "\n",
- "@references\n",
- " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
- "\"\"\"\n",
- "\n",
- "print(__doc__)\n",
- "\n",
- "import sys\n",
- "import pathlib\n",
- "import os\n",
- "sys.path.insert(0, \"../\")\n",
- "from tabulate import tabulate\n",
- "\n",
- "import random\n",
- "import numpy as np\n",
- "import matplotlib.pyplot as plt\n",
- "\n",
- "from sklearn.kernel_ridge import KernelRidge # 0.17\n",
- "from sklearn.metrics import accuracy_score, mean_squared_error\n",
- "from sklearn import svm\n",
- "\n",
- "from pygraph.kernels.pathKernel import pathKernel\n",
- "from pygraph.utils.graphfiles import loadDataset\n",
- "\n",
- "# print('\\n Loading dataset from file...')\n",
- "# dataset, y = loadDataset(\"/home/ljia/Documents/research-repo/datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
- "# y = np.array(y)\n",
- "# print(y)\n",
- "\n",
- "# kernel_file_path = 'marginalizedkernelmatrix.ds'\n",
- "# path = pathlib.Path(kernel_file_path)\n",
- "# if path.is_file():\n",
- "# print('\\n Loading the matrix from file...')\n",
- "# Kmatrix = np.loadtxt(kernel_file_path)\n",
- "# print(Kmatrix)\n",
- "# else:\n",
- "# print('\\n Calculating kernel matrix, this could take a while...')\n",
- "# Kmatrix = marginalizeKernel(dataset)\n",
- "# print(Kmatrix)\n",
- "# print('Saving kernel matrix to file...')\n",
- "# np.savetxt(kernel_file_path, Kmatrix)\n",
- "\n",
- "# setup the parameters\n",
- "model_type = 'regression' # Regression or classification problem\n",
- "print('\\n --- This is a %s problem ---' % model_type)\n",
- "\n",
- "# datasize = len(dataset)\n",
- "trials = 100 # Trials for hyperparameters random search\n",
- "splits = 100 # Number of splits of the data\n",
- "alpha_grid = np.linspace(0.01, 100, num = trials) # corresponds to (2*C)^-1 in other linear models such as LogisticRegression\n",
- "# C_grid = np.linspace(0.0001, 10, num = trials)\n",
- "random.seed(20) # Set the seed for uniform parameter distribution\n",
- "data_dir = '/home/ljia/Documents/research-repo/datasets/acyclic/Acyclic/'\n",
- "\n",
- "# set the output path\n",
- "kernel_file_path = 'kernelmatrices_marginalized_acyclic/'\n",
- "if not os.path.exists(kernel_file_path):\n",
- " os.makedirs(kernel_file_path)\n",
- "\n",
- "\n",
- "\"\"\"\n",
- "- Here starts the main program\n",
- "- First we permute the data, then for each split we evaluate corresponding performances\n",
- "- In the end, the performances are averaged over the test sets\n",
- "\"\"\"\n",
- "\n",
- "# Initialize the performance of the best parameter trial on validation with the corresponding performance on test\n",
- "val_split = []\n",
- "test_split = []\n",
- "\n",
- "p_quit = 0.5\n",
- "\n",
- "# for each split of the data\n",
- "for j in range(10):\n",
- " dataset_train, y_train = loadDataset(data_dir + 'trainset_' + str(j) + '.ds')\n",
- " dataset_test, y_test = loadDataset(data_dir + 'testset_' + str(j) + '.ds')\n",
- " \n",
- " # Normalization step (for real valued targets only)\n",
- " if model_type == 'regression':\n",
- " print('\\n Normalizing output y...')\n",
- " y_train_mean = np.mean(y_train)\n",
- " y_train_std = np.std(y_train)\n",
- " y_train = (y_train - y_train_mean) / float(y_train_std)\n",
- "# print(y)\n",
- " \n",
- " # save kernel matrices to files / read kernel matrices from files\n",
- " kernel_file_train = kernel_file_path + 'train' + str(j) + '_pquit_' + str(p_quit)\n",
- " kernel_file_test = kernel_file_path + 'test' + str(j) + '_pquit_' + str(p_quit)\n",
- " path_train = pathlib.Path(kernel_file_train)\n",
- " path_test = pathlib.Path(kernel_file_test)\n",
- " # get train set kernel matrix\n",
- " if path_train.is_file():\n",
- " print('\\n Loading the train set kernel matrix from file...')\n",
- " Kmatrix_train = np.loadtxt(kernel_file_train)\n",
- " print(Kmatrix_train)\n",
- " else:\n",
- " print('\\n Calculating train set kernel matrix, this could take a while...')\n",
- " Kmatrix_train = marginalizedkernel(dataset_train, p_quit, 20)\n",
- " print(Kmatrix_train)\n",
- " print('\\n Saving train set kernel matrix to file...')\n",
- " np.savetxt(kernel_file_train, Kmatrix_train)\n",
- " # get test set kernel matrix\n",
- " if path_test.is_file():\n",
- " print('\\n Loading the test set kernel matrix from file...')\n",
- " Kmatrix_test = np.loadtxt(kernel_file_test)\n",
- " print(Kmatrix_test)\n",
- " else:\n",
- " print('\\n Calculating test set kernel matrix, this could take a while...')\n",
- " Kmatrix_test = marginalizedkernel(dataset_test, p_quit, 20)\n",
- " print(Kmatrix_test)\n",
- " print('\\n Saving test set kernel matrix to file...')\n",
- " np.savetxt(kernel_file_test, Kmatrix_test)\n",
- "\n",
- " # For each parameter trial\n",
- " for i in range(trials):\n",
- " # For regression use the Kernel Ridge method\n",
- " if model_type == 'regression':\n",
- " # print('\\n Starting experiment for trial %d and parameter alpha = %3f\\n ' % (i, alpha_grid[i]))\n",
- "\n",
- " # Fit the kernel ridge model\n",
- " KR = KernelRidge(kernel = 'precomputed', alpha = alpha_grid[i])\n",
- " KR.fit(Kmatrix_train, y_train)\n",
- "\n",
- " # predict on the test set\n",
- " y_pred_test = KR.predict(Kmatrix_test)\n",
- " # print(y_pred)\n",
- "\n",
- " # adjust prediction: needed because the training targets have been normalized\n",
- " y_pred_test = y_pred_test * float(y_train_std) + y_train_mean\n",
- " # print(y_pred_test)\n",
- "\n",
- " # root mean squared error in test \n",
- " rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n",
- " perf_all_test.append(rmse_test)\n",
- "\n",
- " # print('The performance on the validation set is: %3f' % rmse)\n",
- " # print('The performance on the test set is: %3f' % rmse_test)\n",
- "\n",
- " # --- FIND THE OPTIMAL PARAMETERS --- #\n",
- " # For regression: minimise the mean squared error\n",
- " if model_type == 'regression':\n",
- "\n",
- " # get optimal parameter on test (argmin mean squared error)\n",
- " min_idx = np.argmin(perf_all_test)\n",
- " alpha_opt = alpha_grid[min_idx]\n",
- "\n",
- " # corresponding performance on test for the same parameter\n",
- " perf_test_opt = perf_all_test[min_idx]\n",
- "\n",
- " print('The best performance is for trial %d with parameter alpha = %3f' % (min_idx, alpha_opt))\n",
- " print('The corresponding performance on test set is: %3f' % perf_test_opt)\n",
- " \n",
- " \n",
- " \n",
- "\n",
- "# For each split of the data\n",
- "for j in range(10, 10 + splits):\n",
- " print('Starting split %d...' % j)\n",
- "\n",
- " # Set the random set for data permutation\n",
- " random_state = int(j)\n",
- " np.random.seed(random_state)\n",
- " idx_perm = np.random.permutation(datasize)\n",
- "# print(idx_perm)\n",
- " \n",
- " # Permute the data\n",
- " y_perm = y[idx_perm] # targets permutation\n",
- "# print(y_perm)\n",
- " Kmatrix_perm = Kmatrix[:, idx_perm] # inputs permutation\n",
- "# print(Kmatrix_perm)\n",
- " Kmatrix_perm = Kmatrix_perm[idx_perm, :] # inputs permutation\n",
- " \n",
- " # Set the training, validation and test\n",
- " # Note: the percentage can be set up by the user\n",
- " num_train_val = int((datasize * 90) / 100) # 90% (of entire dataset) for training and validation\n",
- " num_test = datasize - num_train_val # 10% (of entire dataset) for test\n",
- " num_train = int((num_train_val * 90) / 100) # 90% (of train + val) for training\n",
- " num_val = num_train_val - num_train # 10% (of train + val) for validation\n",
- " \n",
- " # Split the kernel matrix\n",
- " Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]\n",
- " Kmatrix_val = Kmatrix_perm[num_train:(num_train + num_val), 0:num_train]\n",
- " Kmatrix_test = Kmatrix_perm[(num_train + num_val):datasize, 0:num_train]\n",
- "\n",
- " # Split the targets\n",
- " y_train = y_perm[0:num_train]\n",
- "\n",
- " # Normalization step (for real valued targets only)\n",
- " print('\\n Normalizing output y...')\n",
- " if model_type == 'regression':\n",
- " y_train_mean = np.mean(y_train)\n",
- " y_train_std = np.std(y_train)\n",
- " y_train = (y_train - y_train_mean) / float(y_train_std)\n",
- "# print(y)\n",
- " \n",
- " y_val = y_perm[num_train:(num_train + num_val)]\n",
- " y_test = y_perm[(num_train + num_val):datasize]\n",
- " \n",
- " # Record the performance for each parameter trial respectively on validation and test set\n",
- " perf_all_val = []\n",
- " perf_all_test = []\n",
- " \n",
- " "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "- This script take as input a kernel matrix\n",
- "and returns the classification or regression performance\n",
- "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
- "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
- "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
- "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
- "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
- "correspond to the average of the performances on the test sets. \n",
- "\n",
- "@references\n",
- " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
- "\n",
- "\n",
- " Loading dataset from file...\n",
- "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n",
- " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n",
- " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n",
- " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n",
- " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n",
- " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n",
- " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n",
- " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n",
- " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n",
- " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n",
- " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n",
- " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n",
- " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n",
- " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n",
- " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n",
- " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n",
- " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n",
- " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "--- mean average path kernel matrix of size 185 built in 38.70095658302307 seconds ---\n",
- "[[ 0.55555556 0.22222222 0. ..., 0. 0. 0. ]\n",
- " [ 0.22222222 0.27777778 0. ..., 0. 0. 0. ]\n",
- " [ 0. 0. 0.55555556 ..., 0.03030303 0.03030303\n",
- " 0.03030303]\n",
- " ..., \n",
- " [ 0. 0. 0.03030303 ..., 0.08297521 0.05553719\n",
- " 0.05256198]\n",
- " [ 0. 0. 0.03030303 ..., 0.05553719 0.07239669\n",
- " 0.0538843 ]\n",
- " [ 0. 0. 0.03030303 ..., 0.05256198 0.0538843\n",
- " 0.07438017]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on val set: 11.907089\n",
- "With standard deviation: 4.781924\n",
- "\n",
- " Mean performance on test set: 14.270816\n",
- "With standard deviation: 6.366698\n"
- ]
- }
- ],
- "source": [
- "# Author: Elisabetta Ghisu\n",
- "\n",
- "\"\"\"\n",
- "- This script take as input a kernel matrix\n",
- "and returns the classification or regression performance\n",
- "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
- "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
- "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
- "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
- "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
- "correspond to the average of the performances on the test sets. \n",
- "\n",
- "@references\n",
- " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
- "\"\"\"\n",
- "\n",
- "print(__doc__)\n",
- "\n",
- "import sys\n",
- "import os\n",
- "import pathlib\n",
- "sys.path.insert(0, \"../\")\n",
- "from tabulate import tabulate\n",
- "\n",
- "import random\n",
- "import numpy as np\n",
- "import matplotlib.pyplot as plt\n",
- "\n",
- "from sklearn.kernel_ridge import KernelRidge # 0.17\n",
- "from sklearn.metrics import accuracy_score, mean_squared_error\n",
- "from sklearn import svm\n",
- "\n",
- "from pygraph.kernels.pathKernel import pathkernel\n",
- "from pygraph.utils.graphfiles import loadDataset\n",
- "\n",
- "print('\\n Loading dataset from file...')\n",
- "dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
- "y = np.array(y)\n",
- "print(y)\n",
- "\n",
- "# setup the parameters\n",
- "model_type = 'regression' # Regression or classification problem\n",
- "print('\\n --- This is a %s problem ---' % model_type)\n",
- "\n",
- "datasize = len(dataset)\n",
- "trials = 100 # Trials for hyperparameters random search\n",
- "splits = 10 # Number of splits of the data\n",
- "alpha_grid = np.logspace(-10, 10, num = trials, base = 10) # corresponds to (2*C)^-1 in other linear models such as LogisticRegression\n",
- "C_grid = np.logspace(-10, 10, num = trials, base = 10)\n",
- "random.seed(20) # Set the seed for uniform parameter distribution\n",
- "\n",
- "# set the output path\n",
- "kernel_file_path = 'kernelmatrices_path_acyclic/'\n",
- "if not os.path.exists(kernel_file_path):\n",
- " os.makedirs(kernel_file_path)\n",
- "\n",
- "\n",
- "\"\"\"\n",
- "- Here starts the main program\n",
- "- First we permute the data, then for each split we evaluate corresponding performances\n",
- "- In the end, the performances are averaged over the test sets\n",
- "\"\"\"\n",
- "\n",
- "# save kernel matrices to files / read kernel matrices from files\n",
- "kernel_file = kernel_file_path + 'km.ds'\n",
- "path = pathlib.Path(kernel_file)\n",
- "# get train set kernel matrix\n",
- "if path.is_file():\n",
- " print('\\n Loading the kernel matrix from file...')\n",
- " Kmatrix = np.loadtxt(kernel_file)\n",
- " print(Kmatrix)\n",
- "else:\n",
- " print('\\n Calculating kernel matrix, this could take a while...')\n",
- " Kmatrix, run_time = pathkernel(dataset, node_label = 'atom', edge_label = 'bond_type')\n",
- " print(Kmatrix)\n",
- " print('\\n Saving kernel matrix to file...')\n",
- " np.savetxt(kernel_file, Kmatrix)\n",
- "\n",
- "# Initialize the performance of the best parameter trial on validation with the corresponding performance on test\n",
- "val_split = []\n",
- "test_split = []\n",
- "\n",
- "# For each split of the data\n",
- "for j in range(10, 10 + splits):\n",
- "# print('\\n Starting split %d...' % j)\n",
- "\n",
- " # Set the random set for data permutation\n",
- " random_state = int(j)\n",
- " np.random.seed(random_state)\n",
- " idx_perm = np.random.permutation(datasize)\n",
- "# print(idx_perm)\n",
- "\n",
- " # Permute the data\n",
- " y_perm = y[idx_perm] # targets permutation\n",
- "# print(y_perm)\n",
- " Kmatrix_perm = Kmatrix[:, idx_perm] # inputs permutation\n",
- "# print(Kmatrix_perm)\n",
- " Kmatrix_perm = Kmatrix_perm[idx_perm, :] # inputs permutation\n",
- "\n",
- " # Set the training, validation and test\n",
- " # Note: the percentage can be set up by the user\n",
- " num_train_val = int((datasize * 90) / 100) # 90% (of entire dataset) for training and validation\n",
- " num_test = datasize - num_train_val # 10% (of entire dataset) for test\n",
- " num_train = int((num_train_val * 90) / 100) # 90% (of train + val) for training\n",
- " num_val = num_train_val - num_train # 10% (of train + val) for validation\n",
- "\n",
- " # Split the kernel matrix\n",
- " Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]\n",
- " Kmatrix_val = Kmatrix_perm[num_train:(num_train + num_val), 0:num_train]\n",
- " Kmatrix_test = Kmatrix_perm[(num_train + num_val):datasize, 0:num_train]\n",
- "\n",
- " # Split the targets\n",
- " y_train = y_perm[0:num_train]\n",
- "\n",
- " # Normalization step (for real valued targets only)\n",
- " if model_type == 'regression':\n",
- "# print('\\n Normalizing output y...')\n",
- " y_train_mean = np.mean(y_train)\n",
- " y_train_std = np.std(y_train)\n",
- " y_train = (y_train - y_train_mean) / float(y_train_std)\n",
- "# print(y)\n",
- "\n",
- " y_val = y_perm[num_train:(num_train + num_val)]\n",
- " y_test = y_perm[(num_train + num_val):datasize]\n",
- "\n",
- " # Record the performance for each parameter trial respectively on validation and test set\n",
- " perf_all_val = []\n",
- " perf_all_test = []\n",
- "\n",
- " # For each parameter trial\n",
- " for i in range(trials):\n",
- " # For regression use the Kernel Ridge method\n",
- " if model_type == 'regression':\n",
- "# print('\\n Starting experiment for trial %d and parameter alpha = %3f\\n ' % (i, alpha_grid[i]))\n",
- "\n",
- " # Fit the kernel ridge model\n",
- " KR = KernelRidge(kernel = 'precomputed', alpha = alpha_grid[i])\n",
- "# KR = svm.SVR(kernel = 'precomputed', C = C_grid[i])\n",
- " KR.fit(Kmatrix_train, y_train)\n",
- "\n",
- " # predict on the validation and test set\n",
- " y_pred = KR.predict(Kmatrix_val)\n",
- " y_pred_test = KR.predict(Kmatrix_test)\n",
- "# print(y_pred)\n",
- "\n",
- "            # adjust prediction: needed because the training targets have been normalized\n",
- " y_pred = y_pred * float(y_train_std) + y_train_mean\n",
- "# print(y_pred)\n",
- " y_pred_test = y_pred_test * float(y_train_std) + y_train_mean\n",
- "# print(y_pred_test)\n",
- "\n",
- " # root mean squared error on validation\n",
- " rmse = np.sqrt(mean_squared_error(y_val, y_pred))\n",
- " perf_all_val.append(rmse)\n",
- "\n",
- " # root mean squared error in test \n",
- " rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n",
- " perf_all_test.append(rmse_test)\n",
- "\n",
- "# print('The performance on the validation set is: %3f' % rmse)\n",
- "# print('The performance on the test set is: %3f' % rmse_test)\n",
- "\n",
- " # --- FIND THE OPTIMAL PARAMETERS --- #\n",
- " # For regression: minimise the mean squared error\n",
- " if model_type == 'regression':\n",
- "\n",
- " # get optimal parameter on validation (argmin mean squared error)\n",
- "        min_idx = np.argmin(perf_all_val)\n",
- " alpha_opt = alpha_grid[min_idx]\n",
- "\n",
- " # performance corresponding to optimal parameter on val\n",
- " perf_val_opt = perf_all_val[min_idx]\n",
- "\n",
- " # corresponding performance on test for the same parameter\n",
- " perf_test_opt = perf_all_test[min_idx]\n",
- "\n",
- "# print('The best performance is for trial %d with parameter alpha = %3f' % (min_idx, alpha_opt))\n",
- "# print('The best performance on the validation set is: %3f' % perf_val_opt)\n",
- "# print('The corresponding performance on test set is: %3f' % perf_test_opt)\n",
- "\n",
- " # append the best performance on validation\n",
- " # at the current split\n",
- " val_split.append(perf_val_opt)\n",
- "\n",
- "    # append the corresponding performance on the test set\n",
- " test_split.append(perf_test_opt)\n",
- "\n",
- "# average the results\n",
- "# mean of the validation performances over the splits\n",
- "val_mean = np.mean(np.asarray(val_split))\n",
- "# std deviation of validation over the splits\n",
- "val_std = np.std(np.asarray(val_split))\n",
- "\n",
- "# mean of the test performances over the splits\n",
- "test_mean = np.mean(np.asarray(test_split))\n",
- "# std deviation of the test over the splits\n",
- "test_std = np.std(np.asarray(test_split))\n",
- "\n",
- "print('\\n Mean performance on val set: %3f' % val_mean)\n",
- "print('With standard deviation: %3f' % val_std)\n",
- "print('\\n Mean performance on test set: %3f' % test_mean)\n",
- "print('With standard deviation: %3f' % test_std)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.5.2"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
|