|
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976 |
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(__doc__)\n",
- "\n",
- "import numpy as np\n",
- "import matplotlib.pyplot as plt\n",
- "from sklearn import svm, datasets\n",
- "\n",
- "# import some data to play with\n",
- "iris = datasets.load_iris()\n",
- "X = iris.data[:, :2] # we only take the first two features. We could\n",
- " # avoid this ugly slicing by using a two-dim dataset\n",
- "Y = iris.target\n",
- "\n",
- "\n",
- "def my_kernel(X, Y):\n",
- " \"\"\"\n",
- " We create a custom kernel:\n",
- "\n",
- " (2 0)\n",
- " k(X, Y) = X ( ) Y.T\n",
- " (0 1)\n",
- " \"\"\"\n",
- " M = np.array([[2, 0], [0, 1.0]])\n",
- " return np.dot(np.dot(X, M), Y.T)\n",
- "\n",
- "\n",
- "h = .02 # step size in the mesh\n",
- "\n",
- "# we create an instance of SVM and fit out data.\n",
- "clf = svm.SVC(kernel=my_kernel)\n",
- "clf.fit(X, Y)\n",
- "\n",
- "# Plot the decision boundary. For that, we will assign a color to each\n",
- "# point in the mesh [x_min, x_max]x[y_min, y_max].\n",
- "x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n",
- "y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n",
- "xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n",
- "Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n",
- "\n",
- "# Put the result into a color plot\n",
- "Z = Z.reshape(xx.shape)\n",
- "plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)\n",
- "\n",
- "# Plot also the training points\n",
- "plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired, edgecolors='k')\n",
- "plt.title('3-Class classification using Support Vector Machine with custom'\n",
- " ' kernel')\n",
- "plt.axis('tight')\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 69,
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "- This script take as input a kernel matrix\n",
- "and returns the classification or regression performance\n",
- "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
- "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
- "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
- "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
- "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
- "correspond to the average of the performances on the test sets. \n",
- "\n",
- "@references\n",
- " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
- "\n",
- "\n",
- " Loading dataset from file...\n",
- "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n",
- " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n",
- " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n",
- " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n",
- " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n",
- " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n",
- " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n",
- " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n",
- " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n",
- " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n",
- " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n",
- " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n",
- " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n",
- " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n",
- " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n",
- " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n",
- " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n",
- " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n",
- "\n",
- " Loading the matrix from file...\n",
- "[[ 3. 1. 3. ..., 1. 1. 1.]\n",
- " [ 1. 6. 1. ..., 0. 0. 3.]\n",
- " [ 3. 1. 3. ..., 1. 1. 1.]\n",
- " ..., \n",
- " [ 1. 0. 1. ..., 55. 21. 7.]\n",
- " [ 1. 0. 1. ..., 21. 55. 7.]\n",
- " [ 1. 3. 1. ..., 7. 7. 55.]]\n",
- "--- This is a regression problem ---\n",
- "Starting split 10...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 12 with parameter alpha = 12.130000\n",
- "The best performance on the validation set is: 36.004721\n",
- "The corresponding performance on test set is: 47.691725\n",
- "Starting split 11...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 33.084913\n",
- "The corresponding performance on test set is: 35.493699\n",
- "Starting split 12...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 18 with parameter alpha = 18.190000\n",
- "The best performance on the validation set is: 29.476238\n",
- "The corresponding performance on test set is: 36.525571\n",
- "Starting split 13...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 9 with parameter alpha = 9.100000\n",
- "The best performance on the validation set is: 40.272791\n",
- "The corresponding performance on test set is: 37.359205\n",
- "Starting split 14...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 42.973240\n",
- "The corresponding performance on test set is: 53.123785\n",
- "Starting split 15...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 18 with parameter alpha = 18.190000\n",
- "The best performance on the validation set is: 38.216353\n",
- "The corresponding performance on test set is: 37.697069\n",
- "Starting split 16...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 88 with parameter alpha = 88.890000\n",
- "The best performance on the validation set is: 32.988038\n",
- "The corresponding performance on test set is: 37.515000\n",
- "Starting split 17...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 4 with parameter alpha = 4.050000\n",
- "The best performance on the validation set is: 33.530482\n",
- "The corresponding performance on test set is: 43.448861\n",
- "Starting split 18...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 64 with parameter alpha = 64.650000\n",
- "The best performance on the validation set is: 29.671418\n",
- "The corresponding performance on test set is: 29.196786\n",
- "Starting split 19...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 7 with parameter alpha = 7.080000\n",
- "The best performance on the validation set is: 44.854681\n",
- "The corresponding performance on test set is: 36.111594\n",
- "Starting split 20...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 19 with parameter alpha = 19.200000\n",
- "The best performance on the validation set is: 35.660642\n",
- "The corresponding performance on test set is: 38.151790\n",
- "Starting split 21...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 14 with parameter alpha = 14.150000\n",
- "The best performance on the validation set is: 41.607604\n",
- "The corresponding performance on test set is: 32.158764\n",
- "Starting split 22...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 46 with parameter alpha = 46.470000\n",
- "The best performance on the validation set is: 36.461026\n",
- "The corresponding performance on test set is: 36.247837\n",
- "Starting split 23...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 49 with parameter alpha = 49.500000\n",
- "The best performance on the validation set is: 28.540585\n",
- "The corresponding performance on test set is: 37.377972\n",
- "Starting split 24...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 32 with parameter alpha = 32.330000\n",
- "The best performance on the validation set is: 33.397020\n",
- "The corresponding performance on test set is: 35.840386\n",
- "Starting split 25...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 30 with parameter alpha = 30.310000\n",
- "The best performance on the validation set is: 36.391650\n",
- "The corresponding performance on test set is: 36.120189\n",
- "Starting split 26...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 67 with parameter alpha = 67.680000\n",
- "The best performance on the validation set is: 39.558878\n",
- "The corresponding performance on test set is: 35.711036\n",
- "Starting split 27...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 35 with parameter alpha = 35.360000\n",
- "The best performance on the validation set is: 34.563158\n",
- "The corresponding performance on test set is: 51.215777\n",
- "Starting split 28...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 15 with parameter alpha = 15.160000\n",
- "The best performance on the validation set is: 43.309309\n",
- "The corresponding performance on test set is: 40.394297\n",
- "Starting split 29...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 14 with parameter alpha = 14.150000\n",
- "The best performance on the validation set is: 42.797900\n",
- "The corresponding performance on test set is: 34.218103\n",
- "Starting split 30...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 42 with parameter alpha = 42.430000\n",
- "The best performance on the validation set is: 45.422692\n",
- "The corresponding performance on test set is: 27.041917\n",
- "Starting split 31...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 9 with parameter alpha = 9.100000\n",
- "The best performance on the validation set is: 33.447413\n",
- "The corresponding performance on test set is: 38.341333\n",
- "Starting split 32...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 46 with parameter alpha = 46.470000\n",
- "The best performance on the validation set is: 31.638807\n",
- "The corresponding performance on test set is: 43.374635\n",
- "Starting split 33...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 49 with parameter alpha = 49.500000\n",
- "The best performance on the validation set is: 37.702092\n",
- "The corresponding performance on test set is: 31.198701\n",
- "Starting split 34...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 28.013251\n",
- "The corresponding performance on test set is: 30.116903\n",
- "Starting split 35...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 13 with parameter alpha = 13.140000\n",
- "The best performance on the validation set is: 38.520179\n",
- "The corresponding performance on test set is: 37.478691\n",
- "Starting split 36...\n",
- "\n",
- " Normalizing output y...\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 35.836403\n",
- "The corresponding performance on test set is: 37.447219\n",
- "Starting split 37...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 47 with parameter alpha = 47.480000\n",
- "The best performance on the validation set is: 31.172116\n",
- "The corresponding performance on test set is: 39.504962\n",
- "Starting split 38...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 36 with parameter alpha = 36.370000\n",
- "The best performance on the validation set is: 40.025101\n",
- "The corresponding performance on test set is: 41.314650\n",
- "Starting split 39...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 28.474810\n",
- "The corresponding performance on test set is: 38.093995\n",
- "Starting split 40...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 38.056007\n",
- "The corresponding performance on test set is: 33.570513\n",
- "Starting split 41...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 5 with parameter alpha = 5.060000\n",
- "The best performance on the validation set is: 35.329935\n",
- "The corresponding performance on test set is: 40.309342\n",
- "Starting split 42...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 59 with parameter alpha = 59.600000\n",
- "The best performance on the validation set is: 25.235609\n",
- "The corresponding performance on test set is: 36.117043\n",
- "Starting split 43...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 30 with parameter alpha = 30.310000\n",
- "The best performance on the validation set is: 27.596182\n",
- "The corresponding performance on test set is: 39.069843\n",
- "Starting split 44...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 24.493222\n",
- "The corresponding performance on test set is: 34.064025\n",
- "Starting split 45...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 15 with parameter alpha = 15.160000\n",
- "The best performance on the validation set is: 45.540605\n",
- "The corresponding performance on test set is: 33.544310\n",
- "Starting split 46...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 34.661595\n",
- "The corresponding performance on test set is: 26.174480\n",
- "Starting split 47...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 5 with parameter alpha = 5.060000\n",
- "The best performance on the validation set is: 34.837287\n",
- "The corresponding performance on test set is: 45.463855\n",
- "Starting split 48...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 31 with parameter alpha = 31.320000\n",
- "The best performance on the validation set is: 37.528129\n",
- "The corresponding performance on test set is: 51.123083\n",
- "Starting split 49...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 20 with parameter alpha = 20.210000\n",
- "The best performance on the validation set is: 32.480446\n",
- "The corresponding performance on test set is: 31.618253\n",
- "Starting split 50...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 59 with parameter alpha = 59.600000\n",
- "The best performance on the validation set is: 32.584107\n",
- "The corresponding performance on test set is: 31.376594\n",
- "Starting split 51...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 9 with parameter alpha = 9.100000\n",
- "The best performance on the validation set is: 40.867089\n",
- "The corresponding performance on test set is: 32.363239\n",
- "Starting split 52...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 97 with parameter alpha = 97.980000\n",
- "The best performance on the validation set is: 33.801783\n",
- "The corresponding performance on test set is: 41.200644\n",
- "Starting split 53...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 17 with parameter alpha = 17.180000\n",
- "The best performance on the validation set is: 44.010303\n",
- "The corresponding performance on test set is: 34.562120\n",
- "Starting split 54...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 22 with parameter alpha = 22.230000\n",
- "The best performance on the validation set is: 36.759090\n",
- "The corresponding performance on test set is: 38.376060\n",
- "Starting split 55...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 29.830898\n",
- "The corresponding performance on test set is: 24.811584\n",
- "Starting split 56...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 18 with parameter alpha = 18.190000\n",
- "The best performance on the validation set is: 34.627026\n",
- "The corresponding performance on test set is: 46.684129\n",
- "Starting split 57...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 5 with parameter alpha = 5.060000\n",
- "The best performance on the validation set is: 43.343991\n",
- "The corresponding performance on test set is: 41.169814\n",
- "Starting split 58...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 39 with parameter alpha = 39.400000\n",
- "The best performance on the validation set is: 34.908981\n",
- "The corresponding performance on test set is: 48.932907\n",
- "Starting split 59...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 10 with parameter alpha = 10.110000\n",
- "The best performance on the validation set is: 37.767543\n",
- "The corresponding performance on test set is: 28.512235\n",
- "Starting split 60...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 37.890852\n",
- "The corresponding performance on test set is: 28.082837\n",
- "Starting split 61...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 36 with parameter alpha = 36.370000\n",
- "The best performance on the validation set is: 41.402040\n",
- "The corresponding performance on test set is: 31.964262\n",
- "Starting split 62...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 34 with parameter alpha = 34.350000\n",
- "The best performance on the validation set is: 39.216178\n",
- "The corresponding performance on test set is: 48.626836\n",
- "Starting split 63...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 35.222016\n",
- "The corresponding performance on test set is: 50.344625\n",
- "Starting split 64...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 25 with parameter alpha = 25.260000\n",
- "The best performance on the validation set is: 33.803383\n",
- "The corresponding performance on test set is: 40.058257\n",
- "Starting split 65...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 56 with parameter alpha = 56.570000\n",
- "The best performance on the validation set is: 29.170281\n",
- "The corresponding performance on test set is: 36.104372\n",
- "Starting split 66...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 14 with parameter alpha = 14.150000\n",
- "The best performance on the validation set is: 40.405285\n",
- "The corresponding performance on test set is: 39.262782\n",
- "Starting split 67...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 58 with parameter alpha = 58.590000\n",
- "The best performance on the validation set is: 24.638279\n",
- "The corresponding performance on test set is: 35.165763\n",
- "Starting split 68...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 56 with parameter alpha = 56.570000\n",
- "The best performance on the validation set is: 31.681889\n",
- "The corresponding performance on test set is: 44.049377\n",
- "Starting split 69...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 5 with parameter alpha = 5.060000\n",
- "The best performance on the validation set is: 30.735748\n",
- "The corresponding performance on test set is: 50.708019\n",
- "Starting split 70...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 29 with parameter alpha = 29.300000\n",
- "The best performance on the validation set is: 37.273741\n",
- "The corresponding performance on test set is: 39.351135\n",
- "Starting split 71...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 6 with parameter alpha = 6.070000\n",
- "The best performance on the validation set is: 48.033115\n",
- "The corresponding performance on test set is: 55.674648\n",
- "Starting split 72...\n",
- "\n",
- " Normalizing output y...\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 31.903823\n",
- "The corresponding performance on test set is: 32.937886\n",
- "Starting split 73...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 19 with parameter alpha = 19.200000\n",
- "The best performance on the validation set is: 40.825941\n",
- "The corresponding performance on test set is: 38.535950\n",
- "Starting split 74...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 34.181621\n",
- "The corresponding performance on test set is: 34.089714\n",
- "Starting split 75...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 39 with parameter alpha = 39.400000\n",
- "The best performance on the validation set is: 40.264289\n",
- "The corresponding performance on test set is: 47.412526\n",
- "Starting split 76...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 27 with parameter alpha = 27.280000\n",
- "The best performance on the validation set is: 35.842650\n",
- "The corresponding performance on test set is: 34.785447\n",
- "Starting split 77...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 33 with parameter alpha = 33.340000\n",
- "The best performance on the validation set is: 38.896608\n",
- "The corresponding performance on test set is: 39.158479\n",
- "Starting split 78...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 28 with parameter alpha = 28.290000\n",
- "The best performance on the validation set is: 31.053773\n",
- "The corresponding performance on test set is: 33.711541\n",
- "Starting split 79...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 54 with parameter alpha = 54.550000\n",
- "The best performance on the validation set is: 36.129208\n",
- "The corresponding performance on test set is: 34.191692\n",
- "Starting split 80...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 3 with parameter alpha = 3.040000\n",
- "The best performance on the validation set is: 42.796346\n",
- "The corresponding performance on test set is: 40.531343\n",
- "Starting split 81...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 1 with parameter alpha = 1.020000\n",
- "The best performance on the validation set is: 50.420936\n",
- "The corresponding performance on test set is: 43.764477\n",
- "Starting split 82...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 36.720826\n",
- "The corresponding performance on test set is: 40.242670\n",
- "Starting split 83...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 43 with parameter alpha = 43.440000\n",
- "The best performance on the validation set is: 38.601089\n",
- "The corresponding performance on test set is: 46.145483\n",
- "Starting split 84...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 38 with parameter alpha = 38.390000\n",
- "The best performance on the validation set is: 29.823069\n",
- "The corresponding performance on test set is: 27.458317\n",
- "Starting split 85...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 1 with parameter alpha = 1.020000\n",
- "The best performance on the validation set is: 37.295245\n",
- "The corresponding performance on test set is: 41.040827\n",
- "Starting split 86...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 6 with parameter alpha = 6.070000\n",
- "The best performance on the validation set is: 46.779299\n",
- "The corresponding performance on test set is: 36.893783\n",
- "Starting split 87...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 30 with parameter alpha = 30.310000\n",
- "The best performance on the validation set is: 35.570939\n",
- "The corresponding performance on test set is: 31.014527\n",
- "Starting split 88...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 11 with parameter alpha = 11.120000\n",
- "The best performance on the validation set is: 42.125559\n",
- "The corresponding performance on test set is: 34.059925\n",
- "Starting split 89...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 32 with parameter alpha = 32.330000\n",
- "The best performance on the validation set is: 33.276369\n",
- "The corresponding performance on test set is: 34.010431\n",
- "Starting split 90...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 14 with parameter alpha = 14.150000\n",
- "The best performance on the validation set is: 48.542045\n",
- "The corresponding performance on test set is: 51.204813\n",
- "Starting split 91...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 51.240470\n",
- "The corresponding performance on test set is: 25.204665\n",
- "Starting split 92...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 8 with parameter alpha = 8.090000\n",
- "The best performance on the validation set is: 28.042005\n",
- "The corresponding performance on test set is: 40.903728\n",
- "Starting split 93...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 24 with parameter alpha = 24.250000\n",
- "The best performance on the validation set is: 46.378283\n",
- "The corresponding performance on test set is: 38.717776\n",
- "Starting split 94...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 52 with parameter alpha = 52.530000\n",
- "The best performance on the validation set is: 28.152322\n",
- "The corresponding performance on test set is: 30.532819\n",
- "Starting split 95...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 42 with parameter alpha = 42.430000\n",
- "The best performance on the validation set is: 36.952087\n",
- "The corresponding performance on test set is: 33.163953\n",
- "Starting split 96...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 61 with parameter alpha = 61.620000\n",
- "The best performance on the validation set is: 41.047679\n",
- "The corresponding performance on test set is: 27.036643\n",
- "Starting split 97...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 42 with parameter alpha = 42.430000\n",
- "The best performance on the validation set is: 36.858184\n",
- "The corresponding performance on test set is: 40.745694\n",
- "Starting split 98...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 89 with parameter alpha = 89.900000\n",
- "The best performance on the validation set is: 41.871463\n",
- "The corresponding performance on test set is: 37.287950\n",
- "Starting split 99...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 55 with parameter alpha = 55.560000\n",
- "The best performance on the validation set is: 41.040198\n",
- "The corresponding performance on test set is: 34.121258\n",
- "Starting split 100...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 9 with parameter alpha = 9.100000\n",
- "The best performance on the validation set is: 38.421616\n",
- "The corresponding performance on test set is: 43.772366\n",
- "Starting split 101...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 23.640892\n",
- "The corresponding performance on test set is: 41.408082\n",
- "Starting split 102...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 38.527173\n",
- "The corresponding performance on test set is: 52.468792\n",
- "Starting split 103...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 6 with parameter alpha = 6.070000\n",
- "The best performance on the validation set is: 47.154873\n",
- "The corresponding performance on test set is: 39.077319\n",
- "Starting split 104...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 12 with parameter alpha = 12.130000\n",
- "The best performance on the validation set is: 36.653442\n",
- "The corresponding performance on test set is: 47.172066\n",
- "Starting split 105...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 12 with parameter alpha = 12.130000\n",
- "The best performance on the validation set is: 33.976240\n",
- "The corresponding performance on test set is: 40.620368\n",
- "Starting split 106...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 22 with parameter alpha = 22.230000\n",
- "The best performance on the validation set is: 27.799295\n",
- "The corresponding performance on test set is: 38.034978\n",
- "Starting split 107...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 17 with parameter alpha = 17.180000\n",
- "The best performance on the validation set is: 43.202567\n",
- "The corresponding performance on test set is: 36.783012\n",
- "Starting split 108...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 32.312218\n",
- "The corresponding performance on test set is: 38.186940\n",
- "Starting split 109...\n",
- "\n",
- " Normalizing output y...\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "The best performance is for trial 4 with parameter alpha = 4.050000\n",
- "The best performance on the validation set is: 39.485731\n",
- "The corresponding performance on test set is: 50.146953\n"
- ]
- }
- ],
- "source": [
- "# Author: Elisabetta Ghisu\n",
- "\n",
- "\"\"\"\n",
- "- This script takes as input a kernel matrix\n",
- "and returns the classification or regression performance\n",
- "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
- "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
- "- For prediction we divide the data in training, validation and test. For each split, we first train on the train data, \n",
- "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
- "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
- "correspond to the average of the performances on the test sets. \n",
- "\n",
- "@references\n",
- " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
- "\"\"\"\n",
- "\n",
- "print(__doc__)\n",
- "\n",
- "import sys\n",
- "import pathlib\n",
- "sys.path.insert(0, \"../py-graph/\")\n",
- "\n",
- "import random\n",
- "import numpy as np\n",
- "import matplotlib.pyplot as plt\n",
- "\n",
- "from sklearn.kernel_ridge import KernelRidge # 0.17\n",
- "from sklearn.metrics import accuracy_score, mean_squared_error\n",
- "from sklearn import svm\n",
- "\n",
- "from kernels.spkernel import spkernel\n",
- "from utils.graphfiles import loadDataset\n",
- "\n",
- "print('\\n Loading dataset from file...')\n",
- "dataset, y = loadDataset(\"/home/ljia/Documents/research-repo/datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
- "y = np.array(y)\n",
- "print(y)\n",
- "\n",
- "kernel_file_path = 'kernelmatrix.ds'\n",
- "path = pathlib.Path(kernel_file_path)\n",
- "if path.is_file():\n",
- " print('\\n Loading the matrix from file...')\n",
- " Kmatrix = np.loadtxt(kernel_file_path)\n",
- " print(Kmatrix)\n",
- "else:\n",
- " print('\\n Calculating kernel matrix, this could take a while...')\n",
- " Kmatrix = spkernel(dataset)\n",
- " print(Kmatrix)\n",
- " print('Saving kernel matrix to file...')\n",
- " np.savetxt(kernel_file_path, Kmatrix)\n",
- "\n",
- "# setup the parameters\n",
- "model_type = 'regression' # Regression or classification problem\n",
- "print('\\n --- This is a %s problem ---' % model_type)\n",
- "\n",
- "datasize = len(dataset)\n",
- "trials = 100 # Trials for hyperparameters random search\n",
- "splits = 10 # Number of splits of the data\n",
- "alpha_grid = np.linspace(0.01, 100, num = trials) # corresponds to (2*C)^-1 in other linear models such as LogisticRegression\n",
- "# C_grid = np.linspace(0.0001, 10, num = trials)\n",
- "random.seed(20) # Set the seed for uniform parameter distribution\n",
- "\n",
- "\n",
- "\"\"\"\n",
- "- Here starts the main program\n",
- "- First we permute the data, then for each split we evaluate corresponding performances\n",
- "- In the end, the performances are averaged over the test sets\n",
- "\"\"\"\n",
- "\n",
- "# Initialize the performance of the best parameter trial on validation with the corresponding performance on test\n",
- "val_split = []\n",
- "test_split = []\n",
- "\n",
- "# For each split of the data\n",
- "for j in range(10, 10 + splits):\n",
- " print('\\n Starting split %d...' % j)\n",
- "\n",
- " # Set the random set for data permutation\n",
- " random_state = int(j)\n",
- " np.random.seed(random_state)\n",
- " idx_perm = np.random.permutation(datasize)\n",
- "# print(idx_perm)\n",
- " \n",
- " # Permute the data\n",
- " y_perm = y[idx_perm] # targets permutation\n",
- "# print(y_perm)\n",
- " Kmatrix_perm = Kmatrix[:, idx_perm] # inputs permutation\n",
- "# print(Kmatrix_perm)\n",
- " Kmatrix_perm = Kmatrix_perm[idx_perm, :] # inputs permutation\n",
- " \n",
- " # Set the training, validation and test\n",
- " # Note: the percentage can be set up by the user\n",
- " num_train_val = int((datasize * 90) / 100) # 90% (of entire dataset) for training and validation\n",
- " num_test = datasize - num_train_val # 10% (of entire dataset) for test\n",
- " num_train = int((num_train_val * 90) / 100) # 90% (of train + val) for training\n",
- " num_val = num_train_val - num_train # 10% (of train + val) for validation\n",
- " \n",
- " # Split the kernel matrix\n",
- " Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]\n",
- " Kmatrix_val = Kmatrix_perm[num_train:(num_train + num_val), 0:num_train]\n",
- " Kmatrix_test = Kmatrix_perm[(num_train + num_val):datasize, 0:num_train]\n",
- "\n",
- " # Split the targets\n",
- " y_train = y_perm[0:num_train]\n",
- "\n",
- " # Normalization step (for real valued targets only)\n",
- " print('\\n Normalizing output y...')\n",
- " if model_type == 'regression':\n",
- " y_train_mean = np.mean(y_train)\n",
- " y_train_std = np.std(y_train)\n",
- " y_train = (y_train - y_train_mean) / float(y_train_std)\n",
- "# print(y)\n",
- " \n",
- " y_val = y_perm[num_train:(num_train + num_val)]\n",
- " y_test = y_perm[(num_train + num_val):datasize]\n",
- " \n",
- " # Record the performance for each parameter trial respectively on validation and test set\n",
- " perf_all_val = []\n",
- " perf_all_test = []\n",
- " \n",
- " # For each parameter trial\n",
- " for i in range(trials):\n",
- " # For regression use the Kernel Ridge method\n",
- " if model_type == 'regression':\n",
- "# print('\\n Starting experiment for trial %d and parameter alpha = %3f\\n ' % (i, alpha_grid[i]))\n",
- "\n",
- " # Fit the kernel ridge model\n",
- " KR = KernelRidge(kernel = 'precomputed', alpha = alpha_grid[i])\n",
- " KR.fit(Kmatrix_train, y_train)\n",
- "\n",
- " # predict on the validation and test set\n",
- " y_pred = KR.predict(Kmatrix_val)\n",
- " y_pred_test = KR.predict(Kmatrix_test)\n",
- "# print(y_pred)\n",
- "\n",
- "            # adjust prediction: needed because the training targets have been normalized\n",
- " y_pred = y_pred * float(y_train_std) + y_train_mean\n",
- "# print(y_pred)\n",
- " y_pred_test = y_pred_test * float(y_train_std) + y_train_mean\n",
- "# print(y_pred_test)\n",
- "\n",
- " # root mean squared error on validation\n",
- " rmse = np.sqrt(mean_squared_error(y_val, y_pred))\n",
- " perf_all_val.append(rmse)\n",
- "\n",
- " # root mean squared error in test \n",
- " rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n",
- " perf_all_test.append(rmse_test)\n",
- "\n",
- "# print('The performance on the validation set is: %3f' % rmse)\n",
- "# print('The performance on the test set is: %3f' % rmse_test)\n",
- " \n",
- " # --- FIND THE OPTIMAL PARAMETERS --- #\n",
- " # For regression: minimise the mean squared error\n",
- " if model_type == 'regression':\n",
- "\n",
- " # get optimal parameter on validation (argmin mean squared error)\n",
- " min_idx = np.argmin(perf_all_val)\n",
- " alpha_opt = alpha_grid[min_idx]\n",
- "\n",
- " # performance corresponding to optimal parameter on val\n",
- " perf_val_opt = perf_all_val[min_idx]\n",
- "\n",
- " # corresponding performance on test for the same parameter\n",
- " perf_test_opt = perf_all_test[min_idx]\n",
- "\n",
- " print('The best performance is for trial %d with parameter alpha = %3f' % (min_idx, alpha_opt))\n",
- " print('The best performance on the validation set is: %3f' % perf_val_opt)\n",
- " print('The corresponding performance on test set is: %3f' % perf_test_opt)\n",
- "\n",
- "# # we create an instance of SVM and fit out data.\n",
- "# clf = svm.SVC(kernel = 'precomputed')\n",
- "# clf.fit(Kmatrix, )\n",
- "\n",
- "# # predict on validation and test\n",
- "# y_pred = clf.predict(K_val)\n",
- "# y_pred_test = clf.predict(K_test)\n",
- "\n",
- "# # accuracy on validation set\n",
- "# acc = accuracy_score(y_val, y_pred)\n",
- "# perf_all_val.append(acc)\n",
- "\n",
- "# # accuracy on test set\n",
- "# acc_test = accuracy_score(y_test, y_pred_test)\n",
- "# perf_all_test.append(acc_test)\n",
- "\n",
- "# # print \"The performance on the validation set is: %3f\" % acc\n",
- "# # print \"The performance on the test set is: %3f\" % acc_test\n",
- "\n",
- "\n",
- "\n",
- "# # Plot the decision boundary. For that, we will assign a color to each\n",
- "# # point in the mesh [x_min, x_max]x[y_min, y_max].\n",
- "# x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n",
- "# y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n",
- "# xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n",
- "# Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n",
- "\n",
- "# # Put the result into a color plot\n",
- "# Z = Z.reshape(xx.shape)\n",
- "# plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)\n",
- "\n",
- "# # Plot also the training points\n",
- "# plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired, edgecolors='k')\n",
- "# plt.title('3-Class classification using Support Vector Machine with custom'\n",
- "# ' kernel')\n",
- "# plt.axis('tight')\n",
- "# plt.show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "4841564986 / 3"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.5.2"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
|