OpenI
/
graphkit-learn

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0, 3, 1], [0, 3, 4, 2], [0, 3], [0, 3, 4], [1, 3, 4, 2], [1, 3], [1, 3, 4], [2, 4, 3], [2, 4], [3, 4]]\n",
      "10\n",
      "[[0, 4, 1], [0, 4, 5, 2], [0, 4, 5, 6, 3], [0, 4], [0, 4, 5], [0, 4, 5, 6], [1, 4, 5, 2], [1, 4, 5, 6, 3], [1, 4], [1, 4, 5], [1, 4, 5, 6], [2, 5, 6, 3], [2, 5, 4], [2, 5], [2, 5, 6], [3, 6, 5, 4], [3, 6, 5], [3, 6], [4, 5], [4, 5, 6], [5, 6]]\n",
      "21\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "1\n",
      "yes\n",
      "0.10952380952380952\n"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "import networkx as nx\n",
    "sys.path.insert(0, \"../\")\n",
    "from pygraph.utils.graphfiles import loadDataset\n",
    "from pygraph.kernels.deltaKernel import deltaKernel\n",
    "\n",
    "dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
    "G1 = dataset[12]\n",
    "G2 = dataset[55]\n",
    "sp1 = []\n",
    "num_nodes = G1.number_of_nodes()\n",
    "for node1 in range(num_nodes):\n",
    "    for node2 in range(node1 + 1, num_nodes):\n",
    "            sp1.append(nx.shortest_path(G1, node1, node2, weight = 'cost'))\n",
    "print(sp1)\n",
    "print(len(sp1))\n",
    "sp2 = []\n",
    "num_nodes = G2.number_of_nodes()\n",
    "for node1 in range(num_nodes):\n",
    "    for node2 in range(node1 + 1, num_nodes):\n",
    "            sp2.append(nx.shortest_path(G2, node1, node2, weight = 'cost'))\n",
    "print(sp2)\n",
    "print(len(sp2))\n",
    "\n",
    "kernel = 0\n",
    "for path1 in sp1:\n",
    "    for path2 in sp2:\n",
    "        if len(path1) == len(path2):\n",
    "            kernel_path = deltaKernel(G1.node[path1[0]]['label'] == G2.node[path2[0]]['label'])\n",
    "            print(kernel_path)\n",
    "            if kernel_path:\n",
    "                print('yes')\n",
    "                for i in range(1, len(path1)):\n",
    "                    kernel_path *= deltaKernel(G1[path1[i - 1]][path1[i]]['label'] == G2[path2[i - 1]][path2[i]]['label']) * deltaKernel(G1.node[path1[i]]['label'] == G2.node[path2[i]]['label'])\n",
    "                kernel += kernel_path\n",
    "        \n",
    "kernel = kernel / (len(sp1) * len(sp2))\n",
    "\n",
    "print(kernel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "- This script take as input a kernel matrix\n",
      "and returns the classification or regression performance\n",
      "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
      "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
      "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
      "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
      "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
      "correspond to the average of the performances on the test sets. \n",
      "\n",
      "@references\n",
      "    https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
      "\n",
      "\n",
      " --- This is a regression problem ---\n",
      "\n",
      " Normalizing output y...\n",
      "\n",
      " Loading the train set kernel matrix from file...\n",
      "[[ 0.15254237  0.08333333  0.0625     ...,  0.11363636  0.11363636\n",
      "   0.11363636]\n",
      " [ 0.08333333  0.18518519  0.15591398 ...,  0.16617791  0.16617791\n",
      "   0.16890214]\n",
      " [ 0.0625      0.15591398  0.15254237 ...,  0.12987013  0.12987013\n",
      "   0.13163636]\n",
      " ..., \n",
      " [ 0.11363636  0.16617791  0.12987013 ...,  0.26383753  0.2639004\n",
      "   0.26156557]\n",
      " [ 0.11363636  0.16617791  0.12987013 ...,  0.2639004   0.26396688\n",
      "   0.26162729]\n",
      " [ 0.11363636  0.16890214  0.13163636 ...,  0.26156557  0.26162729\n",
      "   0.25964592]]\n",
      "\n",
      " Loading the test set kernel matrix from file...\n",
      "[[ 0.18518519  0.1715847   0.11111111  0.16588603  0.11904762  0.16450216\n",
      "   0.17281421  0.14285714  0.125       0.16477273  0.16880154  0.14583333\n",
      "   0.1660693   0.16906445  0.13333333  0.16612903  0.16420966  0.16441006\n",
      "   0.15151515]\n",
      " [ 0.1715847   0.19988118  0.15173333  0.18435596  0.16465263  0.21184723\n",
      "   0.18985964  0.19960191  0.16819723  0.21540115  0.19575264  0.2041482\n",
      "   0.21842419  0.20001664  0.18754969  0.2205599   0.20506165  0.22256445\n",
      "   0.2141792 ]\n",
      " [ 0.11111111  0.15173333  0.16303156  0.13416478  0.16903494  0.16960573\n",
      "   0.13862936  0.18511129  0.16989276  0.17395417  0.14762351  0.18709221\n",
      "   0.17706477  0.15293506  0.17970939  0.17975775  0.16082785  0.18295252\n",
      "   0.19186573]\n",
      " [ 0.16588603  0.18435596  0.13416478  0.17413923  0.14529511  0.19230449\n",
      "   0.17775828  0.17598858  0.14892223  0.19462663  0.18166555  0.17986029\n",
      "   0.1964604   0.18450695  0.16510376  0.19788853  0.1876399   0.19921541\n",
      "   0.18843419]\n",
      " [ 0.11904762  0.16465263  0.16903494  0.14529511  0.17703225  0.18464872\n",
      "   0.15002895  0.19785455  0.17779663  0.18950917  0.16010081  0.2005743\n",
      "   0.19306131  0.16599977  0.19113529  0.1960531   0.175064    0.19963794\n",
      "   0.20696464]\n",
      " [ 0.16450216  0.21184723  0.16960573  0.19230449  0.18464872  0.23269314\n",
      "   0.19681552  0.22450276  0.1871932   0.23765844  0.20733248  0.22967925\n",
      "   0.241199    0.21337314  0.21125341  0.24426963  0.22285333  0.24802555\n",
      "   0.24156669]\n",
      " [ 0.17281421  0.18985964  0.13862936  0.17775828  0.15002895  0.19681552\n",
      "   0.18309269  0.18152273  0.15411585  0.19935309  0.18641218  0.18556038\n",
      "   0.20169527  0.18946029  0.17030032  0.20320694  0.19192382  0.2042596\n",
      "   0.19428999]\n",
      " [ 0.14285714  0.19960191  0.18511129  0.17598858  0.19785455  0.22450276\n",
      "   0.18152273  0.23269314  0.20168735  0.23049584  0.19407926  0.23694176\n",
      "   0.23486084  0.20134404  0.22042984  0.23854906  0.21275711  0.24302959\n",
      "   0.24678197]\n",
      " [ 0.125       0.16819723  0.16989276  0.14892223  0.17779663  0.1871932\n",
      "   0.15411585  0.20168735  0.18391356  0.19188588  0.16365606  0.20428161\n",
      "   0.1952436   0.16940489  0.1919249   0.19815511  0.17760881  0.20152837\n",
      "   0.20988805]\n",
      " [ 0.16477273  0.21540115  0.17395417  0.19462663  0.18950917  0.23765844\n",
      "   0.19935309  0.23049584  0.19188588  0.24296859  0.21058278  0.23586086\n",
      "   0.24679036  0.21702635  0.21699483  0.25006701  0.22724646  0.25407837\n",
      "   0.24818625]\n",
      " [ 0.16880154  0.19575264  0.14762351  0.18166555  0.16010081  0.20733248\n",
      "   0.18641218  0.19407926  0.16365606  0.21058278  0.19214629  0.19842989\n",
      "   0.21317298  0.19609213  0.18225175  0.2151567   0.20088139  0.2171273\n",
      "   0.20810339]\n",
      " [ 0.14583333  0.2041482   0.18709221  0.17986029  0.2005743   0.22967925\n",
      "   0.18556038  0.23694176  0.20428161  0.23586086  0.19842989  0.24154885\n",
      "   0.24042054  0.20590264  0.22439219  0.24421452  0.21769149  0.24880304\n",
      "   0.25200246]\n",
      " [ 0.1660693   0.21842419  0.17706477  0.1964604   0.19306131  0.241199\n",
      "   0.20169527  0.23486084  0.1952436   0.24679036  0.21317298  0.24042054\n",
      "   0.25107069  0.21988195  0.22126548  0.25446921  0.23058896  0.25855949\n",
      "   0.25312182]\n",
      " [ 0.16906445  0.20001664  0.15293506  0.18450695  0.16599977  0.21337314\n",
      "   0.18946029  0.20134404  0.16940489  0.21702635  0.19609213  0.20590264\n",
      "   0.21988195  0.20052959  0.18917551  0.22212027  0.2061696   0.22441239\n",
      "   0.21607563]\n",
      " [ 0.13333333  0.18754969  0.17970939  0.16510376  0.19113529  0.21125341\n",
      "   0.17030032  0.22042984  0.1919249   0.21699483  0.18225175  0.22439219\n",
      "   0.22126548  0.18917551  0.2112185   0.224781    0.20021961  0.22904467\n",
      "   0.23356012]\n",
      " [ 0.16612903  0.2205599   0.17975775  0.19788853  0.1960531   0.24426963\n",
      "   0.20320694  0.23854906  0.19815511  0.25006701  0.2151567   0.24421452\n",
      "   0.25446921  0.22212027  0.224781    0.25800115  0.23326559  0.26226067\n",
      "   0.25717144]\n",
      " [ 0.16420966  0.20506165  0.16082785  0.1876399   0.175064    0.22285333\n",
      "   0.19192382  0.21275711  0.17760881  0.22724646  0.20088139  0.21769149\n",
      "   0.23058896  0.2061696   0.20021961  0.23326559  0.21442192  0.2364528\n",
      "   0.22891788]\n",
      " [ 0.16441006  0.22256445  0.18295252  0.19921541  0.19963794  0.24802555\n",
      "   0.2042596   0.24302959  0.20152837  0.25407837  0.2171273   0.24880304\n",
      "   0.25855949  0.22441239  0.22904467  0.26226067  0.2364528   0.26687384\n",
      "   0.26210305]\n",
      " [ 0.15151515  0.2141792   0.19186573  0.18843419  0.20696464  0.24156669\n",
      "   0.19428999  0.24678197  0.20988805  0.24818625  0.20810339  0.25200246\n",
      "   0.25312182  0.21607563  0.23356012  0.25717144  0.22891788  0.26210305\n",
      "   0.26386999]]\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "Precomputed metric requires shape (n_queries, n_indexed). Got (19, 19) for 164 indexed.",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-30-d4c5f46d5abf>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m    133\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    134\u001b[0m             \u001b[0;31m# predict on the test set\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 135\u001b[0;31m             \u001b[0my_pred_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mKR\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mKmatrix_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    136\u001b[0m     \u001b[0;31m#             print(y_pred)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    137\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/kernel_ridge.py\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m    182\u001b[0m         \"\"\"\n\u001b[1;32m    183\u001b[0m         \u001b[0mcheck_is_fitted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"X_fit_\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"dual_coef_\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 184\u001b[0;31m         \u001b[0mK\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_kernel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mX_fit_\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    185\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mK\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdual_coef_\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/kernel_ridge.py\u001b[0m in \u001b[0;36m_get_kernel\u001b[0;34m(self, X, Y)\u001b[0m\n\u001b[1;32m    119\u001b[0m                       \"coef0\": self.coef0}\n\u001b[1;32m    120\u001b[0m         return pairwise_kernels(X, Y, metric=self.kernel,\n\u001b[0;32m--> 121\u001b[0;31m                                 filter_params=True, **params)\n\u001b[0m\u001b[1;32m    122\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    123\u001b[0m     \u001b[0;34m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/metrics/pairwise.py\u001b[0m in \u001b[0;36mpairwise_kernels\u001b[0;34m(X, Y, metric, filter_params, n_jobs, **kwds)\u001b[0m\n\u001b[1;32m   1389\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1390\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mmetric\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"precomputed\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1391\u001b[0;31m         \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_pairwise_arrays\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprecomputed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1392\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1393\u001b[0m     \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmetric\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mGPKernel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/metrics/pairwise.py\u001b[0m in \u001b[0;36mcheck_pairwise_arrays\u001b[0;34m(X, Y, precomputed, dtype)\u001b[0m\n\u001b[1;32m    117\u001b[0m                              \u001b[0;34m\"(n_queries, n_indexed). Got (%d, %d) \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    118\u001b[0m                              \u001b[0;34m\"for %d indexed.\"\u001b[0m \u001b[0;34m%\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 119\u001b[0;31m                              (X.shape[0], X.shape[1], Y.shape[0]))\n\u001b[0m\u001b[1;32m    120\u001b[0m     \u001b[0;32melif\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    121\u001b[0m         raise ValueError(\"Incompatible dimension for X and Y matrices: \"\n",
      "\u001b[0;31mValueError\u001b[0m: Precomputed metric requires shape (n_queries, n_indexed). Got (19, 19) for 164 indexed."
     ]
    }
   ],
   "source": [
    "# Author: Elisabetta Ghisu\n",
    "\n",
    "\"\"\"\n",
    "- This script take as input a kernel matrix\n",
    "and returns the classification or regression performance\n",
    "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
    "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
    "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
    "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
    "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
    "correspond to the average of the performances on the test sets. \n",
    "\n",
    "@references\n",
    "    https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
    "\"\"\"\n",
    "\n",
    "print(__doc__)\n",
    "\n",
    "import sys\n",
    "import pathlib\n",
    "import os\n",
    "sys.path.insert(0, \"../\")\n",
    "from tabulate import tabulate\n",
    "\n",
    "import random\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from sklearn.kernel_ridge import KernelRidge # 0.17\n",
    "from sklearn.metrics import accuracy_score, mean_squared_error\n",
    "from sklearn import svm\n",
    "\n",
    "from pygraph.kernels.pathKernel import pathKernel\n",
    "from pygraph.utils.graphfiles import loadDataset\n",
    "\n",
    "# print('\\n Loading dataset from file...')\n",
    "# dataset, y = loadDataset(\"/home/ljia/Documents/research-repo/datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
    "# y = np.array(y)\n",
    "# print(y)\n",
    "\n",
    "# kernel_file_path = 'marginalizedkernelmatrix.ds'\n",
    "# path = pathlib.Path(kernel_file_path)\n",
    "# if path.is_file():\n",
    "#     print('\\n Loading the matrix from file...')\n",
    "#     Kmatrix = np.loadtxt(kernel_file_path)\n",
    "#     print(Kmatrix)\n",
    "# else:\n",
    "#     print('\\n Calculating kernel matrix, this could take a while...')\n",
    "#     Kmatrix = marginalizeKernel(dataset)\n",
    "#     print(Kmatrix)\n",
    "#     print('Saving kernel matrix to file...')\n",
    "#     np.savetxt(kernel_file_path, Kmatrix)\n",
    "\n",
    "# setup the parameters\n",
    "model_type = 'regression' # Regression or classification problem\n",
    "print('\\n --- This is a %s problem ---' % model_type)\n",
    "\n",
    "# datasize = len(dataset)\n",
    "trials = 100 # Trials for hyperparameters random search\n",
    "splits = 100 # Number of splits of the data\n",
    "alpha_grid = np.linspace(0.01, 100, num = trials) # corresponds to (2*C)^-1 in other linear models such as LogisticRegression\n",
    "# C_grid = np.linspace(0.0001, 10, num = trials)\n",
    "random.seed(20) # Set the seed for uniform parameter distribution\n",
    "data_dir = '/home/ljia/Documents/research-repo/datasets/acyclic/Acyclic/'\n",
    "\n",
    "# set the output path\n",
    "kernel_file_path = 'kernelmatrices_marginalized_acyclic/'\n",
    "if not os.path.exists(kernel_file_path):\n",
    "    os.makedirs(kernel_file_path)\n",
    "\n",
    "\n",
    "\"\"\"\n",
    "-  Here starts the main program\n",
    "-  First we permute the data, then for each split we evaluate corresponding performances\n",
    "-  In the end, the performances are averaged over the test sets\n",
    "\"\"\"\n",
    "\n",
    "# Initialize the performance of the best parameter trial on validation with the corresponding performance on test\n",
    "val_split = []\n",
    "test_split = []\n",
    "\n",
    "p_quit = 0.5\n",
    "\n",
    "# for each split of the data\n",
    "for j in range(10):\n",
    "    dataset_train, y_train = loadDataset(data_dir + 'trainset_' + str(j) + '.ds')\n",
    "    dataset_test, y_test = loadDataset(data_dir + 'testset_' + str(j) + '.ds')\n",
    "    \n",
    "    # Normalization step (for real valued targets only)\n",
    "    if model_type == 'regression':\n",
    "        print('\\n Normalizing output y...')\n",
    "        y_train_mean = np.mean(y_train)\n",
    "        y_train_std = np.std(y_train)\n",
    "        y_train = (y_train - y_train_mean) / float(y_train_std)\n",
    "#         print(y)\n",
    "    \n",
    "    # save kernel matrices to files / read kernel matrices from files\n",
    "    kernel_file_train = kernel_file_path + 'train' + str(j) + '_pquit_' + str(p_quit)\n",
    "    kernel_file_test = kernel_file_path + 'test' + str(j) + '_pquit_' + str(p_quit)\n",
    "    path_train = pathlib.Path(kernel_file_train)\n",
    "    path_test = pathlib.Path(kernel_file_test)\n",
    "    # get train set kernel matrix\n",
    "    if path_train.is_file():\n",
    "        print('\\n Loading the train set kernel matrix from file...')\n",
    "        Kmatrix_train = np.loadtxt(kernel_file_train)\n",
    "        print(Kmatrix_train)\n",
    "    else:\n",
    "        print('\\n Calculating train set kernel matrix, this could take a while...')\n",
    "        Kmatrix_train = marginalizedkernel(dataset_train, p_quit, 20)\n",
    "        print(Kmatrix_train)\n",
    "        print('\\n Saving train set kernel matrix to file...')\n",
    "        np.savetxt(kernel_file_train, Kmatrix_train)\n",
    "    # get test set kernel matrix\n",
    "    if path_test.is_file():\n",
    "        print('\\n Loading the test set kernel matrix from file...')\n",
    "        Kmatrix_test = np.loadtxt(kernel_file_test)\n",
    "        print(Kmatrix_test)\n",
    "    else:\n",
    "        print('\\n Calculating test set kernel matrix, this could take a while...')\n",
    "        Kmatrix_test = marginalizedkernel(dataset_test, p_quit, 20)\n",
    "        print(Kmatrix_test)\n",
    "        print('\\n Saving test set kernel matrix to file...')\n",
    "        np.savetxt(kernel_file_test, Kmatrix_test)\n",
    "\n",
    "    # For each parameter trial\n",
    "    for i in range(trials):\n",
    "        # For regression use the Kernel Ridge method\n",
    "        if model_type == 'regression':\n",
    "    #             print('\\n Starting experiment for trial %d and parameter alpha = %3f\\n ' % (i, alpha_grid[i]))\n",
    "\n",
    "            # Fit the kernel ridge model\n",
    "            KR = KernelRidge(kernel = 'precomputed', alpha = alpha_grid[i])\n",
    "            KR.fit(Kmatrix_train, y_train)\n",
    "\n",
    "            # predict on the test set\n",
    "            y_pred_test = KR.predict(Kmatrix_test)\n",
    "    #             print(y_pred)\n",
    "\n",
    "            # adjust prediction: needed because the training targets have been normalized\n",
    "            y_pred_test = y_pred_test * float(y_train_std) + y_train_mean\n",
    "    #             print(y_pred_test)\n",
    "\n",
    "            # root mean squared error in test \n",
    "            rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n",
    "            perf_all_test.append(rmse_test)\n",
    "\n",
    "    #             print('The performance on the validation set is: %3f' % rmse)\n",
    "    #             print('The performance on the test set is: %3f' % rmse_test)\n",
    "\n",
    "    # --- FIND THE OPTIMAL PARAMETERS --- #\n",
    "    # For regression: minimise the mean squared error\n",
    "    if model_type == 'regression':\n",
    "\n",
    "        # get optimal parameter on test (argmin mean squared error)\n",
    "        min_idx = np.argmin(perf_all_test)\n",
    "        alpha_opt = alpha_grid[min_idx]\n",
    "\n",
    "        # corresponding performance on test for the same parameter\n",
    "        perf_test_opt = perf_all_test[min_idx]\n",
    "\n",
    "        print('The best performance is for trial %d with parameter alpha = %3f' % (min_idx, alpha_opt))\n",
    "        print('The corresponding performance on test set is: %3f' % perf_test_opt)\n",
    "        \n",
    "        \n",
    "        \n",
    "\n",
    "# For each split of the data\n",
    "for j in range(10, 10 + splits):\n",
    "    print('Starting split %d...' % j)\n",
    "\n",
    "    # Set the random set for data permutation\n",
    "    random_state = int(j)\n",
    "    np.random.seed(random_state)\n",
    "    idx_perm = np.random.permutation(datasize)\n",
    "#     print(idx_perm)\n",
    "    \n",
    "    # Permute the data\n",
    "    y_perm = y[idx_perm] # targets permutation\n",
    "#     print(y_perm)\n",
    "    Kmatrix_perm = Kmatrix[:, idx_perm] # inputs permutation\n",
    "#     print(Kmatrix_perm)\n",
    "    Kmatrix_perm = Kmatrix_perm[idx_perm, :] # inputs permutation\n",
    "    \n",
    "    # Set the training, validation and test\n",
    "    # Note: the percentage can be set up by the user\n",
    "    num_train_val = int((datasize * 90) / 100)         # 90% (of entire dataset) for training and validation\n",
    "    num_test = datasize - num_train_val              # 10% (of entire dataset) for test\n",
    "    num_train = int((num_train_val * 90) / 100) # 90% (of train + val) for training\n",
    "    num_val = num_train_val - num_train # 10% (of train + val) for validation\n",
    "    \n",
    "    # Split the kernel matrix\n",
    "    Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]\n",
    "    Kmatrix_val = Kmatrix_perm[num_train:(num_train + num_val), 0:num_train]\n",
    "    Kmatrix_test = Kmatrix_perm[(num_train + num_val):datasize, 0:num_train]\n",
    "\n",
    "    # Split the targets\n",
    "    y_train = y_perm[0:num_train]\n",
    "\n",
    "    # Normalization step (for real valued targets only)\n",
    "    print('\\n Normalizing output y...')\n",
    "    if model_type == 'regression':\n",
    "        y_train_mean = np.mean(y_train)\n",
    "        y_train_std = np.std(y_train)\n",
    "        y_train = (y_train - y_train_mean) / float(y_train_std)\n",
    "#         print(y)\n",
    "        \n",
    "    y_val = y_perm[num_train:(num_train + num_val)]\n",
    "    y_test = y_perm[(num_train + num_val):datasize]\n",
    "    \n",
    "    # Record the performance for each parameter trial respectively on validation and test set\n",
    "    perf_all_val = []\n",
    "    perf_all_test = []\n",
    "    \n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "- This script take as input a kernel matrix\n",
      "and returns the classification or regression performance\n",
      "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
      "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
      "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
      "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
      "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
      "correspond to the average of the performances on the test sets. \n",
      "\n",
      "@references\n",
      "    https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
      "\n",
      "\n",
      " Loading dataset from file...\n",
      "[ -23.7   14.    37.3  109.7   10.8   39.    42.    66.6  135.   148.5\n",
      "   40.    34.6   32.    63.    53.5   67.    64.4   84.7   95.5   92.\n",
      "   84.4  154.   156.   166.   183.    70.3   63.6   52.5   59.    59.5\n",
      "   55.2   88.    83.   104.5  102.    92.   107.4  123.2  112.5  118.5\n",
      "  101.5  173.7  165.5  181.    99.5   92.3   90.1   80.2   82.    91.2\n",
      "   91.5   81.2   93.    69.    86.3   82.   103.   103.5   96.   112.   104.\n",
      "  132.5  123.5  120.3  145.   144.2  142.8  132.   134.2  137.   139.\n",
      "  133.6  120.4  120.   137.   195.8  177.2  181.   185.9  175.7  186.   211.\n",
      "  125.   118.   117.1  107.   102.5  112.    97.4   91.5   87.6  106.5\n",
      "  101.    99.3   90.   137.   114.   126.   124.   140.5  157.5  146.   145.\n",
      "  141.   171.   166.   155.   145.   159.   138.   142.   159.   163.5\n",
      "  229.5  142.   125.   132.   130.5  125.   122.   121.   122.2  112.   106.\n",
      "  114.5  151.   128.5  109.5  126.   147.   158.   147.   165.   188.9\n",
      "  170.   178.   148.5  165.   177.   167.   195.   226.   215.   201.   205.\n",
      "  151.5  165.5  157.   139.   163.   153.5  139.   162.   173.   159.5\n",
      "  159.5  155.5  141.   126.   164.   163.   166.5  146.   165.   159.   195.\n",
      "  218.   250.   235.   186.5  156.5  162.   162.   170.2  173.2  186.8\n",
      "  173.   187.   174.   188.5  199.   228.   215.   216.   240. ]\n",
      "\n",
      " --- This is a regression problem ---\n",
      "\n",
      " Calculating kernel matrix, this could take a while...\n",
      "--- mean average path kernel matrix of size 185 built in 38.70095658302307 seconds ---\n",
      "[[ 0.55555556  0.22222222  0.         ...,  0.          0.          0.        ]\n",
      " [ 0.22222222  0.27777778  0.         ...,  0.          0.          0.        ]\n",
      " [ 0.          0.          0.55555556 ...,  0.03030303  0.03030303\n",
      "   0.03030303]\n",
      " ..., \n",
      " [ 0.          0.          0.03030303 ...,  0.08297521  0.05553719\n",
      "   0.05256198]\n",
      " [ 0.          0.          0.03030303 ...,  0.05553719  0.07239669\n",
      "   0.0538843 ]\n",
      " [ 0.          0.          0.03030303 ...,  0.05256198  0.0538843\n",
      "   0.07438017]]\n",
      "\n",
      " Saving kernel matrix to file...\n",
      "\n",
      " Mean performance on val set: 11.907089\n",
      "With standard deviation: 4.781924\n",
      "\n",
      " Mean performance on test set: 14.270816\n",
      "With standard deviation: 6.366698\n"
     ]
    }
   ],
   "source": [
    "# Author: Elisabetta Ghisu\n",
    "\n",
    "\"\"\"\n",
    "- This script take as input a kernel matrix\n",
    "and returns the classification or regression performance\n",
    "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
    "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
    "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
    "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
    "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
    "correspond to the average of the performances on the test sets. \n",
    "\n",
    "@references\n",
    "    https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
    "\"\"\"\n",
    "\n",
    "print(__doc__)\n",
    "\n",
    "import sys\n",
    "import os\n",
    "import pathlib\n",
    "sys.path.insert(0, \"../\")\n",
    "from tabulate import tabulate\n",
    "\n",
    "import random\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from sklearn.kernel_ridge import KernelRidge # 0.17\n",
    "from sklearn.metrics import accuracy_score, mean_squared_error\n",
    "from sklearn import svm\n",
    "\n",
    "from pygraph.kernels.pathKernel import pathkernel\n",
    "from pygraph.utils.graphfiles import loadDataset\n",
    "\n",
    "print('\\n Loading dataset from file...')\n",
    "dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
    "y = np.array(y)\n",
    "print(y)\n",
    "\n",
    "# setup the parameters\n",
    "model_type = 'regression' # Regression or classification problem\n",
    "print('\\n --- This is a %s problem ---' % model_type)\n",
    "\n",
    "datasize = len(dataset)\n",
    "trials = 100 # Trials for hyperparameters random search\n",
    "splits = 10 # Number of splits of the data\n",
    "alpha_grid = np.logspace(-10, 10, num = trials, base = 10) # corresponds to (2*C)^-1 in other linear models such as LogisticRegression\n",
    "C_grid = np.logspace(-10, 10, num = trials, base = 10)\n",
    "random.seed(20) # Set the seed for uniform parameter distribution\n",
    "\n",
    "# set the output path\n",
    "kernel_file_path = 'kernelmatrices_path_acyclic/'\n",
    "if not os.path.exists(kernel_file_path):\n",
    "    os.makedirs(kernel_file_path)\n",
    "\n",
    "\n",
    "\"\"\"\n",
    "-  Here starts the main program\n",
    "-  First we permute the data, then for each split we evaluate corresponding performances\n",
    "-  In the end, the performances are averaged over the test sets\n",
    "\"\"\"\n",
    "\n",
    "# save kernel matrices to files / read kernel matrices from files\n",
    "kernel_file = kernel_file_path + 'km.ds'\n",
    "path = pathlib.Path(kernel_file)\n",
    "# get train set kernel matrix\n",
    "if path.is_file():\n",
    "    print('\\n Loading the kernel matrix from file...')\n",
    "    Kmatrix = np.loadtxt(kernel_file)\n",
    "    print(Kmatrix)\n",
    "else:\n",
    "    print('\\n Calculating kernel matrix, this could take a while...')\n",
    "    Kmatrix, run_time = pathkernel(dataset, node_label = 'atom', edge_label = 'bond_type')\n",
    "    print(Kmatrix)\n",
    "    print('\\n Saving kernel matrix to file...')\n",
    "    np.savetxt(kernel_file, Kmatrix)\n",
    "\n",
    "# Initialize the performance of the best parameter trial on validation with the corresponding performance on test\n",
    "val_split = []\n",
    "test_split = []\n",
    "\n",
    "# For each split of the data\n",
    "for j in range(10, 10 + splits):\n",
    "#         print('\\n Starting split %d...' % j)\n",
    "\n",
    "    # Set the random set for data permutation\n",
    "    random_state = int(j)\n",
    "    np.random.seed(random_state)\n",
    "    idx_perm = np.random.permutation(datasize)\n",
    "#     print(idx_perm)\n",
    "\n",
    "    # Permute the data\n",
    "    y_perm = y[idx_perm] # targets permutation\n",
    "#     print(y_perm)\n",
    "    Kmatrix_perm = Kmatrix[:, idx_perm] # inputs permutation\n",
    "#     print(Kmatrix_perm)\n",
    "    Kmatrix_perm = Kmatrix_perm[idx_perm, :] # inputs permutation\n",
    "\n",
    "    # Set the training, validation and test\n",
    "    # Note: the percentage can be set up by the user\n",
    "    num_train_val = int((datasize * 90) / 100)         # 90% (of entire dataset) for training and validation\n",
    "    num_test = datasize - num_train_val              # 10% (of entire dataset) for test\n",
    "    num_train = int((num_train_val * 90) / 100) # 90% (of train + val) for training\n",
    "    num_val = num_train_val - num_train # 10% (of train + val) for validation\n",
    "\n",
    "    # Split the kernel matrix\n",
    "    Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]\n",
    "    Kmatrix_val = Kmatrix_perm[num_train:(num_train + num_val), 0:num_train]\n",
    "    Kmatrix_test = Kmatrix_perm[(num_train + num_val):datasize, 0:num_train]\n",
    "\n",
    "    # Split the targets\n",
    "    y_train = y_perm[0:num_train]\n",
    "\n",
    "    # Normalization step (for real valued targets only)\n",
    "    if model_type == 'regression':\n",
    "#             print('\\n Normalizing output y...')\n",
    "        y_train_mean = np.mean(y_train)\n",
    "        y_train_std = np.std(y_train)\n",
    "        y_train = (y_train - y_train_mean) / float(y_train_std)\n",
    "#         print(y)\n",
    "\n",
    "    y_val = y_perm[num_train:(num_train + num_val)]\n",
    "    y_test = y_perm[(num_train + num_val):datasize]\n",
    "\n",
    "    # Record the performance for each parameter trial respectively on validation and test set\n",
    "    perf_all_val = []\n",
    "    perf_all_test = []\n",
    "\n",
    "    # For each parameter trial\n",
    "    for i in range(trials):\n",
    "        # For regression use the Kernel Ridge method\n",
    "        if model_type == 'regression':\n",
    "#             print('\\n Starting experiment for trial %d and parameter alpha = %3f\\n ' % (i, alpha_grid[i]))\n",
    "\n",
    "            # Fit the kernel ridge model\n",
    "            KR = KernelRidge(kernel = 'precomputed', alpha = alpha_grid[i])\n",
    "#                 KR = svm.SVR(kernel = 'precomputed', C = C_grid[i])\n",
    "            KR.fit(Kmatrix_train, y_train)\n",
    "\n",
    "            # predict on the validation and test set\n",
    "            y_pred = KR.predict(Kmatrix_val)\n",
    "            y_pred_test = KR.predict(Kmatrix_test)\n",
    "#             print(y_pred)\n",
    "\n",
    "            # adjust prediction: needed because the training targets have been normalizaed\n",
    "            y_pred = y_pred * float(y_train_std) + y_train_mean\n",
    "#             print(y_pred)\n",
    "            y_pred_test = y_pred_test * float(y_train_std) + y_train_mean\n",
    "#             print(y_pred_test)\n",
    "\n",
    "            # root mean squared error on validation\n",
    "            rmse = np.sqrt(mean_squared_error(y_val, y_pred))\n",
    "            perf_all_val.append(rmse)\n",
    "\n",
    "            # root mean squared error in test \n",
    "            rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n",
    "            perf_all_test.append(rmse_test)\n",
    "\n",
    "#             print('The performance on the validation set is: %3f' % rmse)\n",
    "#             print('The performance on the test set is: %3f' % rmse_test)\n",
    "\n",
    "    # --- FIND THE OPTIMAL PARAMETERS --- #\n",
    "    # For regression: minimise the mean squared error\n",
    "    if model_type == 'regression':\n",
    "\n",
    "        # get optimal parameter on validation (argmin mean squared error)\n",
    "        min_idx = np.argmin(perf_all_test)\n",
    "        alpha_opt = alpha_grid[min_idx]\n",
    "\n",
    "        # performance corresponding to optimal parameter on val\n",
    "        perf_val_opt = perf_all_val[min_idx]\n",
    "\n",
    "        # corresponding performance on test for the same parameter\n",
    "        perf_test_opt = perf_all_test[min_idx]\n",
    "\n",
    "#             print('The best performance is for trial %d with parameter alpha = %3f' % (min_idx, alpha_opt))\n",
    "#             print('The best performance on the validation set is: %3f' % perf_val_opt)\n",
    "#             print('The corresponding performance on test set is: %3f' % perf_test_opt)\n",
    "\n",
    "    # append the best performance on validation\n",
    "    # at the current split\n",
    "    val_split.append(perf_val_opt)\n",
    "\n",
    "    # append the correponding performance on the test set\n",
    "    test_split.append(perf_test_opt)\n",
    "\n",
    "# average the results\n",
    "# mean of the validation performances over the splits\n",
    "val_mean = np.mean(np.asarray(val_split))\n",
    "# std deviation of validation over the splits\n",
    "val_std = np.std(np.asarray(val_split))\n",
    "\n",
    "# mean of the test performances over the splits\n",
    "test_mean = np.mean(np.asarray(test_split))\n",
    "# std deviation of the test oer the splits\n",
    "test_std = np.std(np.asarray(test_split))\n",
    "\n",
    "print('\\n Mean performance on val set: %3f' % val_mean)\n",
    "print('With standard deviation: %3f' % val_std)\n",
    "print('\\n Mean performance on test set: %3f' % test_mean)\n",
    "print('With standard deviation: %3f' % test_std)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}