|
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976 |
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(__doc__)\n",
- "\n",
- "import numpy as np\n",
- "import matplotlib.pyplot as plt\n",
- "from sklearn import svm, datasets\n",
- "\n",
- "# import some data to play with\n",
- "iris = datasets.load_iris()\n",
- "X = iris.data[:, :2] # we only take the first two features. We could\n",
- " # avoid this ugly slicing by using a two-dim dataset\n",
- "Y = iris.target\n",
- "\n",
- "\n",
- "def my_kernel(X, Y):\n",
- " \"\"\"\n",
- " We create a custom kernel:\n",
- "\n",
- " (2 0)\n",
- " k(X, Y) = X ( ) Y.T\n",
- " (0 1)\n",
- " \"\"\"\n",
- " M = np.array([[2, 0], [0, 1.0]])\n",
- " return np.dot(np.dot(X, M), Y.T)\n",
- "\n",
- "\n",
- "h = .02 # step size in the mesh\n",
- "\n",
- "# we create an instance of SVM and fit out data.\n",
- "clf = svm.SVC(kernel=my_kernel)\n",
- "clf.fit(X, Y)\n",
- "\n",
- "# Plot the decision boundary. For that, we will assign a color to each\n",
- "# point in the mesh [x_min, x_max]x[y_min, y_max].\n",
- "x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n",
- "y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n",
- "xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n",
- "Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n",
- "\n",
- "# Put the result into a color plot\n",
- "Z = Z.reshape(xx.shape)\n",
- "plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)\n",
- "\n",
- "# Plot also the training points\n",
- "plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired, edgecolors='k')\n",
- "plt.title('3-Class classification using Support Vector Machine with custom'\n",
- " ' kernel')\n",
- "plt.axis('tight')\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 69,
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "- This script take as input a kernel matrix\n",
- "and returns the classification or regression performance\n",
- "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
- "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
- "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
- "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
- "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
- "correspond to the average of the performances on the test sets. \n",
- "\n",
- "@references\n",
- " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
- "\n",
- "\n",
- " Loading dataset from file...\n",
- "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n",
- " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n",
- " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n",
- " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n",
- " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n",
- " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n",
- " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n",
- " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n",
- " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n",
- " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n",
- " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n",
- " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n",
- " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n",
- " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n",
- " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n",
- " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n",
- " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n",
- " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n",
- "\n",
- " Loading the matrix from file...\n",
- "[[ 3. 1. 3. ..., 1. 1. 1.]\n",
- " [ 1. 6. 1. ..., 0. 0. 3.]\n",
- " [ 3. 1. 3. ..., 1. 1. 1.]\n",
- " ..., \n",
- " [ 1. 0. 1. ..., 55. 21. 7.]\n",
- " [ 1. 0. 1. ..., 21. 55. 7.]\n",
- " [ 1. 3. 1. ..., 7. 7. 55.]]\n",
- "--- This is a regression problem ---\n",
- "Starting split 10...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 12 with parameter alpha = 12.130000\n",
- "The best performance on the validation set is: 36.004721\n",
- "The corresponding performance on test set is: 47.691725\n",
- "Starting split 11...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 33.084913\n",
- "The corresponding performance on test set is: 35.493699\n",
- "Starting split 12...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 18 with parameter alpha = 18.190000\n",
- "The best performance on the validation set is: 29.476238\n",
- "The corresponding performance on test set is: 36.525571\n",
- "Starting split 13...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 9 with parameter alpha = 9.100000\n",
- "The best performance on the validation set is: 40.272791\n",
- "The corresponding performance on test set is: 37.359205\n",
- "Starting split 14...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 42.973240\n",
- "The corresponding performance on test set is: 53.123785\n",
- "Starting split 15...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 18 with parameter alpha = 18.190000\n",
- "The best performance on the validation set is: 38.216353\n",
- "The corresponding performance on test set is: 37.697069\n",
- "Starting split 16...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 88 with parameter alpha = 88.890000\n",
- "The best performance on the validation set is: 32.988038\n",
- "The corresponding performance on test set is: 37.515000\n",
- "Starting split 17...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 4 with parameter alpha = 4.050000\n",
- "The best performance on the validation set is: 33.530482\n",
- "The corresponding performance on test set is: 43.448861\n",
- "Starting split 18...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 64 with parameter alpha = 64.650000\n",
- "The best performance on the validation set is: 29.671418\n",
- "The corresponding performance on test set is: 29.196786\n",
- "Starting split 19...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 7 with parameter alpha = 7.080000\n",
- "The best performance on the validation set is: 44.854681\n",
- "The corresponding performance on test set is: 36.111594\n",
- "Starting split 20...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 19 with parameter alpha = 19.200000\n",
- "The best performance on the validation set is: 35.660642\n",
- "The corresponding performance on test set is: 38.151790\n",
- "Starting split 21...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 14 with parameter alpha = 14.150000\n",
- "The best performance on the validation set is: 41.607604\n",
- "The corresponding performance on test set is: 32.158764\n",
- "Starting split 22...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 46 with parameter alpha = 46.470000\n",
- "The best performance on the validation set is: 36.461026\n",
- "The corresponding performance on test set is: 36.247837\n",
- "Starting split 23...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 49 with parameter alpha = 49.500000\n",
- "The best performance on the validation set is: 28.540585\n",
- "The corresponding performance on test set is: 37.377972\n",
- "Starting split 24...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 32 with parameter alpha = 32.330000\n",
- "The best performance on the validation set is: 33.397020\n",
- "The corresponding performance on test set is: 35.840386\n",
- "Starting split 25...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 30 with parameter alpha = 30.310000\n",
- "The best performance on the validation set is: 36.391650\n",
- "The corresponding performance on test set is: 36.120189\n",
- "Starting split 26...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 67 with parameter alpha = 67.680000\n",
- "The best performance on the validation set is: 39.558878\n",
- "The corresponding performance on test set is: 35.711036\n",
- "Starting split 27...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 35 with parameter alpha = 35.360000\n",
- "The best performance on the validation set is: 34.563158\n",
- "The corresponding performance on test set is: 51.215777\n",
- "Starting split 28...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 15 with parameter alpha = 15.160000\n",
- "The best performance on the validation set is: 43.309309\n",
- "The corresponding performance on test set is: 40.394297\n",
- "Starting split 29...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 14 with parameter alpha = 14.150000\n",
- "The best performance on the validation set is: 42.797900\n",
- "The corresponding performance on test set is: 34.218103\n",
- "Starting split 30...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 42 with parameter alpha = 42.430000\n",
- "The best performance on the validation set is: 45.422692\n",
- "The corresponding performance on test set is: 27.041917\n",
- "Starting split 31...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 9 with parameter alpha = 9.100000\n",
- "The best performance on the validation set is: 33.447413\n",
- "The corresponding performance on test set is: 38.341333\n",
- "Starting split 32...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 46 with parameter alpha = 46.470000\n",
- "The best performance on the validation set is: 31.638807\n",
- "The corresponding performance on test set is: 43.374635\n",
- "Starting split 33...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 49 with parameter alpha = 49.500000\n",
- "The best performance on the validation set is: 37.702092\n",
- "The corresponding performance on test set is: 31.198701\n",
- "Starting split 34...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 28.013251\n",
- "The corresponding performance on test set is: 30.116903\n",
- "Starting split 35...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 13 with parameter alpha = 13.140000\n",
- "The best performance on the validation set is: 38.520179\n",
- "The corresponding performance on test set is: 37.478691\n",
- "Starting split 36...\n",
- "\n",
- " Normalizing output y...\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 35.836403\n",
- "The corresponding performance on test set is: 37.447219\n",
- "Starting split 37...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 47 with parameter alpha = 47.480000\n",
- "The best performance on the validation set is: 31.172116\n",
- "The corresponding performance on test set is: 39.504962\n",
- "Starting split 38...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 36 with parameter alpha = 36.370000\n",
- "The best performance on the validation set is: 40.025101\n",
- "The corresponding performance on test set is: 41.314650\n",
- "Starting split 39...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 28.474810\n",
- "The corresponding performance on test set is: 38.093995\n",
- "Starting split 40...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 38.056007\n",
- "The corresponding performance on test set is: 33.570513\n",
- "Starting split 41...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 5 with parameter alpha = 5.060000\n",
- "The best performance on the validation set is: 35.329935\n",
- "The corresponding performance on test set is: 40.309342\n",
- "Starting split 42...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 59 with parameter alpha = 59.600000\n",
- "The best performance on the validation set is: 25.235609\n",
- "The corresponding performance on test set is: 36.117043\n",
- "Starting split 43...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 30 with parameter alpha = 30.310000\n",
- "The best performance on the validation set is: 27.596182\n",
- "The corresponding performance on test set is: 39.069843\n",
- "Starting split 44...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 24.493222\n",
- "The corresponding performance on test set is: 34.064025\n",
- "Starting split 45...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 15 with parameter alpha = 15.160000\n",
- "The best performance on the validation set is: 45.540605\n",
- "The corresponding performance on test set is: 33.544310\n",
- "Starting split 46...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 34.661595\n",
- "The corresponding performance on test set is: 26.174480\n",
- "Starting split 47...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 5 with parameter alpha = 5.060000\n",
- "The best performance on the validation set is: 34.837287\n",
- "The corresponding performance on test set is: 45.463855\n",
- "Starting split 48...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 31 with parameter alpha = 31.320000\n",
- "The best performance on the validation set is: 37.528129\n",
- "The corresponding performance on test set is: 51.123083\n",
- "Starting split 49...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 20 with parameter alpha = 20.210000\n",
- "The best performance on the validation set is: 32.480446\n",
- "The corresponding performance on test set is: 31.618253\n",
- "Starting split 50...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 59 with parameter alpha = 59.600000\n",
- "The best performance on the validation set is: 32.584107\n",
- "The corresponding performance on test set is: 31.376594\n",
- "Starting split 51...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 9 with parameter alpha = 9.100000\n",
- "The best performance on the validation set is: 40.867089\n",
- "The corresponding performance on test set is: 32.363239\n",
- "Starting split 52...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 97 with parameter alpha = 97.980000\n",
- "The best performance on the validation set is: 33.801783\n",
- "The corresponding performance on test set is: 41.200644\n",
- "Starting split 53...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 17 with parameter alpha = 17.180000\n",
- "The best performance on the validation set is: 44.010303\n",
- "The corresponding performance on test set is: 34.562120\n",
- "Starting split 54...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 22 with parameter alpha = 22.230000\n",
- "The best performance on the validation set is: 36.759090\n",
- "The corresponding performance on test set is: 38.376060\n",
- "Starting split 55...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 29.830898\n",
- "The corresponding performance on test set is: 24.811584\n",
- "Starting split 56...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 18 with parameter alpha = 18.190000\n",
- "The best performance on the validation set is: 34.627026\n",
- "The corresponding performance on test set is: 46.684129\n",
- "Starting split 57...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 5 with parameter alpha = 5.060000\n",
- "The best performance on the validation set is: 43.343991\n",
- "The corresponding performance on test set is: 41.169814\n",
- "Starting split 58...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 39 with parameter alpha = 39.400000\n",
- "The best performance on the validation set is: 34.908981\n",
- "The corresponding performance on test set is: 48.932907\n",
- "Starting split 59...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 10 with parameter alpha = 10.110000\n",
- "The best performance on the validation set is: 37.767543\n",
- "The corresponding performance on test set is: 28.512235\n",
- "Starting split 60...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 37.890852\n",
- "The corresponding performance on test set is: 28.082837\n",
- "Starting split 61...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 36 with parameter alpha = 36.370000\n",
- "The best performance on the validation set is: 41.402040\n",
- "The corresponding performance on test set is: 31.964262\n",
- "Starting split 62...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 34 with parameter alpha = 34.350000\n",
- "The best performance on the validation set is: 39.216178\n",
- "The corresponding performance on test set is: 48.626836\n",
- "Starting split 63...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 35.222016\n",
- "The corresponding performance on test set is: 50.344625\n",
- "Starting split 64...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 25 with parameter alpha = 25.260000\n",
- "The best performance on the validation set is: 33.803383\n",
- "The corresponding performance on test set is: 40.058257\n",
- "Starting split 65...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 56 with parameter alpha = 56.570000\n",
- "The best performance on the validation set is: 29.170281\n",
- "The corresponding performance on test set is: 36.104372\n",
- "Starting split 66...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 14 with parameter alpha = 14.150000\n",
- "The best performance on the validation set is: 40.405285\n",
- "The corresponding performance on test set is: 39.262782\n",
- "Starting split 67...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 58 with parameter alpha = 58.590000\n",
- "The best performance on the validation set is: 24.638279\n",
- "The corresponding performance on test set is: 35.165763\n",
- "Starting split 68...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 56 with parameter alpha = 56.570000\n",
- "The best performance on the validation set is: 31.681889\n",
- "The corresponding performance on test set is: 44.049377\n",
- "Starting split 69...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 5 with parameter alpha = 5.060000\n",
- "The best performance on the validation set is: 30.735748\n",
- "The corresponding performance on test set is: 50.708019\n",
- "Starting split 70...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 29 with parameter alpha = 29.300000\n",
- "The best performance on the validation set is: 37.273741\n",
- "The corresponding performance on test set is: 39.351135\n",
- "Starting split 71...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 6 with parameter alpha = 6.070000\n",
- "The best performance on the validation set is: 48.033115\n",
- "The corresponding performance on test set is: 55.674648\n",
- "Starting split 72...\n",
- "\n",
- " Normalizing output y...\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 31.903823\n",
- "The corresponding performance on test set is: 32.937886\n",
- "Starting split 73...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 19 with parameter alpha = 19.200000\n",
- "The best performance on the validation set is: 40.825941\n",
- "The corresponding performance on test set is: 38.535950\n",
- "Starting split 74...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 34.181621\n",
- "The corresponding performance on test set is: 34.089714\n",
- "Starting split 75...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 39 with parameter alpha = 39.400000\n",
- "The best performance on the validation set is: 40.264289\n",
- "The corresponding performance on test set is: 47.412526\n",
- "Starting split 76...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 27 with parameter alpha = 27.280000\n",
- "The best performance on the validation set is: 35.842650\n",
- "The corresponding performance on test set is: 34.785447\n",
- "Starting split 77...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 33 with parameter alpha = 33.340000\n",
- "The best performance on the validation set is: 38.896608\n",
- "The corresponding performance on test set is: 39.158479\n",
- "Starting split 78...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 28 with parameter alpha = 28.290000\n",
- "The best performance on the validation set is: 31.053773\n",
- "The corresponding performance on test set is: 33.711541\n",
- "Starting split 79...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 54 with parameter alpha = 54.550000\n",
- "The best performance on the validation set is: 36.129208\n",
- "The corresponding performance on test set is: 34.191692\n",
- "Starting split 80...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 3 with parameter alpha = 3.040000\n",
- "The best performance on the validation set is: 42.796346\n",
- "The corresponding performance on test set is: 40.531343\n",
- "Starting split 81...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 1 with parameter alpha = 1.020000\n",
- "The best performance on the validation set is: 50.420936\n",
- "The corresponding performance on test set is: 43.764477\n",
- "Starting split 82...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 36.720826\n",
- "The corresponding performance on test set is: 40.242670\n",
- "Starting split 83...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 43 with parameter alpha = 43.440000\n",
- "The best performance on the validation set is: 38.601089\n",
- "The corresponding performance on test set is: 46.145483\n",
- "Starting split 84...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 38 with parameter alpha = 38.390000\n",
- "The best performance on the validation set is: 29.823069\n",
- "The corresponding performance on test set is: 27.458317\n",
- "Starting split 85...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 1 with parameter alpha = 1.020000\n",
- "The best performance on the validation set is: 37.295245\n",
- "The corresponding performance on test set is: 41.040827\n",
- "Starting split 86...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 6 with parameter alpha = 6.070000\n",
- "The best performance on the validation set is: 46.779299\n",
- "The corresponding performance on test set is: 36.893783\n",
- "Starting split 87...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 30 with parameter alpha = 30.310000\n",
- "The best performance on the validation set is: 35.570939\n",
- "The corresponding performance on test set is: 31.014527\n",
- "Starting split 88...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 11 with parameter alpha = 11.120000\n",
- "The best performance on the validation set is: 42.125559\n",
- "The corresponding performance on test set is: 34.059925\n",
- "Starting split 89...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 32 with parameter alpha = 32.330000\n",
- "The best performance on the validation set is: 33.276369\n",
- "The corresponding performance on test set is: 34.010431\n",
- "Starting split 90...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 14 with parameter alpha = 14.150000\n",
- "The best performance on the validation set is: 48.542045\n",
- "The corresponding performance on test set is: 51.204813\n",
- "Starting split 91...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 51.240470\n",
- "The corresponding performance on test set is: 25.204665\n",
- "Starting split 92...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 8 with parameter alpha = 8.090000\n",
- "The best performance on the validation set is: 28.042005\n",
- "The corresponding performance on test set is: 40.903728\n",
- "Starting split 93...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 24 with parameter alpha = 24.250000\n",
- "The best performance on the validation set is: 46.378283\n",
- "The corresponding performance on test set is: 38.717776\n",
- "Starting split 94...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 52 with parameter alpha = 52.530000\n",
- "The best performance on the validation set is: 28.152322\n",
- "The corresponding performance on test set is: 30.532819\n",
- "Starting split 95...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 42 with parameter alpha = 42.430000\n",
- "The best performance on the validation set is: 36.952087\n",
- "The corresponding performance on test set is: 33.163953\n",
- "Starting split 96...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 61 with parameter alpha = 61.620000\n",
- "The best performance on the validation set is: 41.047679\n",
- "The corresponding performance on test set is: 27.036643\n",
- "Starting split 97...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 42 with parameter alpha = 42.430000\n",
- "The best performance on the validation set is: 36.858184\n",
- "The corresponding performance on test set is: 40.745694\n",
- "Starting split 98...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 89 with parameter alpha = 89.900000\n",
- "The best performance on the validation set is: 41.871463\n",
- "The corresponding performance on test set is: 37.287950\n",
- "Starting split 99...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 55 with parameter alpha = 55.560000\n",
- "The best performance on the validation set is: 41.040198\n",
- "The corresponding performance on test set is: 34.121258\n",
- "Starting split 100...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 9 with parameter alpha = 9.100000\n",
- "The best performance on the validation set is: 38.421616\n",
- "The corresponding performance on test set is: 43.772366\n",
- "Starting split 101...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 23.640892\n",
- "The corresponding performance on test set is: 41.408082\n",
- "Starting split 102...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 38.527173\n",
- "The corresponding performance on test set is: 52.468792\n",
- "Starting split 103...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 6 with parameter alpha = 6.070000\n",
- "The best performance on the validation set is: 47.154873\n",
- "The corresponding performance on test set is: 39.077319\n",
- "Starting split 104...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 12 with parameter alpha = 12.130000\n",
- "The best performance on the validation set is: 36.653442\n",
- "The corresponding performance on test set is: 47.172066\n",
- "Starting split 105...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 12 with parameter alpha = 12.130000\n",
- "The best performance on the validation set is: 33.976240\n",
- "The corresponding performance on test set is: 40.620368\n",
- "Starting split 106...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 22 with parameter alpha = 22.230000\n",
- "The best performance on the validation set is: 27.799295\n",
- "The corresponding performance on test set is: 38.034978\n",
- "Starting split 107...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 17 with parameter alpha = 17.180000\n",
- "The best performance on the validation set is: 43.202567\n",
- "The corresponding performance on test set is: 36.783012\n",
- "Starting split 108...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 32.312218\n",
- "The corresponding performance on test set is: 38.186940\n",
- "Starting split 109...\n",
- "\n",
- " Normalizing output y...\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "The best performance is for trial 4 with parameter alpha = 4.050000\n",
- "The best performance on the validation set is: 39.485731\n",
- "The corresponding performance on test set is: 50.146953\n"
- ]
- }
- ],
- "source": [
- "# Author: Elisabetta Ghisu\n",
- "\n",
- "\"\"\"\n",
- "- This script takes as input a kernel matrix\n",
- "and returns the classification or regression performance\n",
- "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
- "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
- "- For prediction we divide the data in training, validation and test. For each split, we first train on the train data, \n",
- "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
- "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
- "correspond to the average of the performances on the test sets. \n",
- "\n",
- "@references\n",
- " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
- "\"\"\"\n",
- "\n",
- "print(__doc__)\n",
- "\n",
- "import sys\n",
- "import pathlib\n",
- "sys.path.insert(0, \"../py-graph/\")\n",
- "\n",
- "import random\n",
- "import numpy as np\n",
- "import matplotlib.pyplot as plt\n",
- "\n",
- "from sklearn.kernel_ridge import KernelRidge # 0.17\n",
- "from sklearn.metrics import accuracy_score, mean_squared_error\n",
- "from sklearn import svm\n",
- "\n",
- "from kernels.spkernel import spkernel\n",
- "from utils.graphfiles import loadDataset\n",
- "\n",
- "print('\\n Loading dataset from file...')\n",
- "dataset, y = loadDataset(\"/home/ljia/Documents/research-repo/datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
- "y = np.array(y)\n",
- "print(y)\n",
- "\n",
- "kernel_file_path = 'kernelmatrix.ds'\n",
- "path = pathlib.Path(kernel_file_path)\n",
- "if path.is_file():\n",
- " print('\\n Loading the matrix from file...')\n",
- " Kmatrix = np.loadtxt(kernel_file_path)\n",
- " print(Kmatrix)\n",
- "else:\n",
- " print('\\n Calculating kernel matrix, this could take a while...')\n",
- " Kmatrix = spkernel(dataset)\n",
- " print(Kmatrix)\n",
- " print('Saving kernel matrix to file...')\n",
- " np.savetxt(kernel_file_path, Kmatrix)\n",
- "\n",
- "# setup the parameters\n",
- "model_type = 'regression' # Regression or classification problem\n",
- "print('\\n --- This is a %s problem ---' % model_type)\n",
- "\n",
- "datasize = len(dataset)\n",
- "trials = 100 # Trials for hyperparameters random search\n",
- "splits = 10 # Number of splits of the data\n",
- "alpha_grid = np.linspace(0.01, 100, num = trials) # corresponds to (2*C)^-1 in other linear models such as LogisticRegression\n",
- "# C_grid = np.linspace(0.0001, 10, num = trials)\n",
- "random.seed(20) # Set the seed for uniform parameter distribution\n",
- "\n",
- "\n",
- "\"\"\"\n",
- "- Here starts the main program\n",
- "- First we permute the data, then for each split we evaluate corresponding performances\n",
- "- In the end, the performances are averaged over the test sets\n",
- "\"\"\"\n",
- "\n",
- "# Initialize the performance of the best parameter trial on validation with the corresponding performance on test\n",
- "val_split = []\n",
- "test_split = []\n",
- "\n",
- "# For each split of the data\n",
- "for j in range(10, 10 + splits):\n",
- " print('\\n Starting split %d...' % j)\n",
- "\n",
- " # Set the random set for data permutation\n",
- " random_state = int(j)\n",
- " np.random.seed(random_state)\n",
- " idx_perm = np.random.permutation(datasize)\n",
- "# print(idx_perm)\n",
- " \n",
- " # Permute the data\n",
- " y_perm = y[idx_perm] # targets permutation\n",
- "# print(y_perm)\n",
- " Kmatrix_perm = Kmatrix[:, idx_perm] # inputs permutation\n",
- "# print(Kmatrix_perm)\n",
- " Kmatrix_perm = Kmatrix_perm[idx_perm, :] # inputs permutation\n",
- " \n",
- " # Set the training, validation and test\n",
- " # Note: the percentage can be set up by the user\n",
- " num_train_val = int((datasize * 90) / 100) # 90% (of entire dataset) for training and validation\n",
- " num_test = datasize - num_train_val # 10% (of entire dataset) for test\n",
- " num_train = int((num_train_val * 90) / 100) # 90% (of train + val) for training\n",
- " num_val = num_train_val - num_train # 10% (of train + val) for validation\n",
- " \n",
- " # Split the kernel matrix\n",
- " Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]\n",
- " Kmatrix_val = Kmatrix_perm[num_train:(num_train + num_val), 0:num_train]\n",
- " Kmatrix_test = Kmatrix_perm[(num_train + num_val):datasize, 0:num_train]\n",
- "\n",
- " # Split the targets\n",
- " y_train = y_perm[0:num_train]\n",
- "\n",
- " # Normalization step (for real valued targets only)\n",
- " print('\\n Normalizing output y...')\n",
- " if model_type == 'regression':\n",
- " y_train_mean = np.mean(y_train)\n",
- " y_train_std = np.std(y_train)\n",
- " y_train = (y_train - y_train_mean) / float(y_train_std)\n",
- "# print(y)\n",
- " \n",
- " y_val = y_perm[num_train:(num_train + num_val)]\n",
- " y_test = y_perm[(num_train + num_val):datasize]\n",
- " \n",
- " # Record the performance for each parameter trial respectively on validation and test set\n",
- " perf_all_val = []\n",
- " perf_all_test = []\n",
- " \n",
- " # For each parameter trial\n",
- " for i in range(trials):\n",
- " # For regression use the Kernel Ridge method\n",
- " if model_type == 'regression':\n",
- "# print('\\n Starting experiment for trial %d and parameter alpha = %3f\\n ' % (i, alpha_grid[i]))\n",
- "\n",
- " # Fit the kernel ridge model\n",
- " KR = KernelRidge(kernel = 'precomputed', alpha = alpha_grid[i])\n",
- " KR.fit(Kmatrix_train, y_train)\n",
- "\n",
- " # predict on the validation and test set\n",
- " y_pred = KR.predict(Kmatrix_val)\n",
- " y_pred_test = KR.predict(Kmatrix_test)\n",
- "# print(y_pred)\n",
- "\n",
- "            # adjust prediction: needed because the training targets have been normalized\n",
- " y_pred = y_pred * float(y_train_std) + y_train_mean\n",
- "# print(y_pred)\n",
- " y_pred_test = y_pred_test * float(y_train_std) + y_train_mean\n",
- "# print(y_pred_test)\n",
- "\n",
- " # root mean squared error on validation\n",
- " rmse = np.sqrt(mean_squared_error(y_val, y_pred))\n",
- " perf_all_val.append(rmse)\n",
- "\n",
- " # root mean squared error in test \n",
- " rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n",
- " perf_all_test.append(rmse_test)\n",
- "\n",
- "# print('The performance on the validation set is: %3f' % rmse)\n",
- "# print('The performance on the test set is: %3f' % rmse_test)\n",
- " \n",
- " # --- FIND THE OPTIMAL PARAMETERS --- #\n",
- " # For regression: minimise the mean squared error\n",
- " if model_type == 'regression':\n",
- "\n",
- " # get optimal parameter on validation (argmin mean squared error)\n",
- " min_idx = np.argmin(perf_all_val)\n",
- " alpha_opt = alpha_grid[min_idx]\n",
- "\n",
- " # performance corresponding to optimal parameter on val\n",
- " perf_val_opt = perf_all_val[min_idx]\n",
- "\n",
- " # corresponding performance on test for the same parameter\n",
- " perf_test_opt = perf_all_test[min_idx]\n",
- "\n",
- " print('The best performance is for trial %d with parameter alpha = %3f' % (min_idx, alpha_opt))\n",
- " print('The best performance on the validation set is: %3f' % perf_val_opt)\n",
- " print('The corresponding performance on test set is: %3f' % perf_test_opt)\n",
- "\n",
- "# # we create an instance of SVM and fit out data.\n",
- "# clf = svm.SVC(kernel = 'precomputed')\n",
- "# clf.fit(Kmatrix, )\n",
- "\n",
- "# # predict on validation and test\n",
- "# y_pred = clf.predict(K_val)\n",
- "# y_pred_test = clf.predict(K_test)\n",
- "\n",
- "# # accuracy on validation set\n",
- "# acc = accuracy_score(y_val, y_pred)\n",
- "# perf_all_val.append(acc)\n",
- "\n",
- "# # accuracy on test set\n",
- "# acc_test = accuracy_score(y_test, y_pred_test)\n",
- "# perf_all_test.append(acc_test)\n",
- "\n",
- "# # print \"The performance on the validation set is: %3f\" % acc\n",
- "# # print \"The performance on the test set is: %3f\" % acc_test\n",
- "\n",
- "\n",
- "\n",
- "# # Plot the decision boundary. For that, we will assign a color to each\n",
- "# # point in the mesh [x_min, x_max]x[y_min, y_max].\n",
- "# x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n",
- "# y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n",
- "# xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n",
- "# Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n",
- "\n",
- "# # Put the result into a color plot\n",
- "# Z = Z.reshape(xx.shape)\n",
- "# plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)\n",
- "\n",
- "# # Plot also the training points\n",
- "# plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired, edgecolors='k')\n",
- "# plt.title('3-Class classification using Support Vector Machine with custom'\n",
- "# ' kernel')\n",
- "# plt.axis('tight')\n",
- "# plt.show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "4841564986 / 3"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.5.2"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
|