|
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238 |
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Automatically created module for IPython interactive environment\n",
- "# Tuning hyper-parameters for precision\n",
- "\n",
- "Best parameters set found on development set:\n",
- "\n",
- "{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}\n",
- "\n",
- "Grid scores on development set:\n",
- "\n",
- "0.986 (+/-0.016) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}\n",
- "0.959 (+/-0.029) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}\n",
- "0.988 (+/-0.017) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}\n",
- "0.982 (+/-0.026) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}\n",
- "0.988 (+/-0.017) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}\n",
- "0.982 (+/-0.025) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}\n",
- "0.988 (+/-0.017) for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}\n",
- "0.982 (+/-0.025) for {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}\n",
- "0.975 (+/-0.014) for {'C': 1, 'kernel': 'linear'}\n",
- "0.975 (+/-0.014) for {'C': 10, 'kernel': 'linear'}\n",
- "0.975 (+/-0.014) for {'C': 100, 'kernel': 'linear'}\n",
- "0.975 (+/-0.014) for {'C': 1000, 'kernel': 'linear'}\n",
- "\n",
- "Detailed classification report:\n",
- "\n",
- "The model is trained on the full development set.\n",
- "The scores are computed on the full evaluation set.\n",
- "\n",
- " precision recall f1-score support\n",
- "\n",
- " 0 1.00 1.00 1.00 89\n",
- " 1 0.97 1.00 0.98 90\n",
- " 2 0.99 0.98 0.98 92\n",
- " 3 1.00 0.99 0.99 93\n",
- " 4 1.00 1.00 1.00 76\n",
- " 5 0.99 0.98 0.99 108\n",
- " 6 0.99 1.00 0.99 89\n",
- " 7 0.99 1.00 0.99 78\n",
- " 8 1.00 0.98 0.99 92\n",
- " 9 0.99 0.99 0.99 92\n",
- "\n",
- "avg / total 0.99 0.99 0.99 899\n",
- "\n",
- "\n",
- "# Tuning hyper-parameters for recall\n",
- "\n",
- "Best parameters set found on development set:\n",
- "\n",
- "{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}\n",
- "\n",
- "Grid scores on development set:\n",
- "\n",
- "0.986 (+/-0.019) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}\n",
- "0.957 (+/-0.029) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}\n",
- "0.987 (+/-0.019) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}\n",
- "0.981 (+/-0.028) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}\n",
- "0.987 (+/-0.019) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}\n",
- "0.981 (+/-0.026) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}\n",
- "0.987 (+/-0.019) for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}\n",
- "0.981 (+/-0.026) for {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}\n",
- "0.972 (+/-0.012) for {'C': 1, 'kernel': 'linear'}\n",
- "0.972 (+/-0.012) for {'C': 10, 'kernel': 'linear'}\n",
- "0.972 (+/-0.012) for {'C': 100, 'kernel': 'linear'}\n",
- "0.972 (+/-0.012) for {'C': 1000, 'kernel': 'linear'}\n",
- "\n",
- "Detailed classification report:\n",
- "\n",
- "The model is trained on the full development set.\n",
- "The scores are computed on the full evaluation set.\n",
- "\n",
- " precision recall f1-score support\n",
- "\n",
- " 0 1.00 1.00 1.00 89\n",
- " 1 0.97 1.00 0.98 90\n",
- " 2 0.99 0.98 0.98 92\n",
- " 3 1.00 0.99 0.99 93\n",
- " 4 1.00 1.00 1.00 76\n",
- " 5 0.99 0.98 0.99 108\n",
- " 6 0.99 1.00 0.99 89\n",
- " 7 0.99 1.00 0.99 78\n",
- " 8 1.00 0.98 0.99 92\n",
- " 9 0.99 0.99 0.99 92\n",
- "\n",
- "avg / total 0.99 0.99 0.99 899\n",
- "\n",
- "\n"
- ]
- }
- ],
- "source": [
- "# Parameter estimation using grid search with cross-validation\n",
- "from __future__ import print_function\n",
- "\n",
- "from sklearn import datasets\n",
- "from sklearn.model_selection import train_test_split\n",
- "from sklearn.model_selection import GridSearchCV\n",
- "from sklearn.metrics import classification_report\n",
- "from sklearn.svm import SVC\n",
- "\n",
- "print(__doc__)\n",
- "\n",
- "# Loading the Digits dataset\n",
- "digits = datasets.load_digits()\n",
- "\n",
- "# To apply a classifier on this data, we need to flatten the image, to\n",
- "# turn the data into a (samples, features) matrix:\n",
- "n_samples = len(digits.images)\n",
- "X = digits.images.reshape((n_samples, -1))\n",
- "y = digits.target\n",
- "\n",
- "# Split the dataset in two equal parts\n",
- "X_train, X_test, y_train, y_test = train_test_split(\n",
- " X, y, test_size=0.5, random_state=0)\n",
- "\n",
- "# Set the parameters by cross-validation\n",
- "tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],\n",
- " 'C': [1, 10, 100, 1000]},\n",
- " {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]\n",
- "\n",
- "scores = ['precision', 'recall']\n",
- "\n",
- "for score in scores:\n",
- " print(\"# Tuning hyper-parameters for %s\" % score)\n",
- " print()\n",
- "\n",
- " clf = GridSearchCV(SVC(), tuned_parameters, cv=5,\n",
- " scoring='%s_macro' % score)\n",
- " clf.fit(X_train, y_train)\n",
- "\n",
- " print(\"Best parameters set found on development set:\")\n",
- " print()\n",
- " print(clf.best_params_)\n",
- " print()\n",
- " print(\"Grid scores on development set:\")\n",
- " print()\n",
- " means = clf.cv_results_['mean_test_score']\n",
- " stds = clf.cv_results_['std_test_score']\n",
- " for mean, std, params in zip(means, stds, clf.cv_results_['params']):\n",
- " print(\"%0.3f (+/-%0.03f) for %r\"\n",
- " % (mean, std * 2, params))\n",
- " print()\n",
- "\n",
- " print(\"Detailed classification report:\")\n",
- " print()\n",
- " print(\"The model is trained on the full development set.\")\n",
- " print(\"The scores are computed on the full evaluation set.\")\n",
- " print()\n",
- " y_true, y_pred = y_test, clf.predict(X_test)\n",
- " print(classification_report(y_true, y_pred))\n",
- " print()\n",
- "\n",
- "# Note the problem is too easy: the hyperparameter plateau is too flat and the\n",
- "# output model is the same for precision and recall with ties in quality."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "GridSearchCV(cv=None, error_score='raise',\n",
- " estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
- " decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',\n",
- " max_iter=-1, probability=False, random_state=None, shrinking=True,\n",
- " tol=0.001, verbose=False),\n",
- " fit_params=None, iid=True, n_jobs=1,\n",
- " param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')},\n",
- " pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n",
- " scoring=None, verbose=0)"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from sklearn import svm, datasets\n",
- "from sklearn.model_selection import GridSearchCV\n",
- "iris = datasets.load_iris()\n",
- "parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}\n",
- "svc = svm.SVC()\n",
- "clf = GridSearchCV(svc, parameters)\n",
- "clf.fit(iris.data, iris.target)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['mean_fit_time',\n",
- " 'mean_score_time',\n",
- " 'mean_test_score',\n",
- " 'mean_train_score',\n",
- " 'param_C',\n",
- " 'param_kernel',\n",
- " 'params',\n",
- " 'rank_test_score',\n",
- " 'split0_test_score',\n",
- " 'split0_train_score',\n",
- " 'split1_test_score',\n",
- " 'split1_train_score',\n",
- " 'split2_test_score',\n",
- " 'split2_train_score',\n",
- " 'std_fit_time',\n",
- " 'std_score_time',\n",
- " 'std_test_score',\n",
- " 'std_train_score']"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sorted(clf.cv_results_.keys())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "dict_values([array([0.98 , 0.97333333, 0.97333333, 0.98 ]), array([1. , 0.98039216, 1. , 0.98039216]), array([0.00030899, 0.00021172, 0.00019932, 0.00017134]), array([1, 3, 3, 1], dtype=int32), array([0.01617914, 0.00902067, 0.03715363, 0.01592466]), masked_array(data=['linear', 'rbf', 'linear', 'rbf'],\n",
- " mask=[False, False, False, False],\n",
- " fill_value='?',\n",
- " dtype=object), array([1., 1., 1., 1.]), array([0.98999802, 0.98336304, 0.97999604, 0.97999604]), array([6.43618303e-05, 6.20771049e-05, 7.16528819e-05, 9.16456815e-06]), array([0.97979798, 0.96969697, 0.95959596, 0.95959596]), [{'kernel': 'linear', 'C': 1}, {'kernel': 'rbf', 'C': 1}, {'kernel': 'linear', 'C': 10}, {'kernel': 'rbf', 'C': 10}], array([0.00036526, 0.00039411, 0.0002923 , 0.00032218]), array([0.00824863, 0.01254825, 0.01649726, 0.01649726]), array([0.97916667, 0.97916667, 1. , 1. ]), array([0.99019608, 0.98039216, 0.98039216, 0.98039216]), array([5.54407363e-05, 3.25514857e-05, 7.09833681e-05, 3.70551530e-06]), masked_array(data=[1, 1, 10, 10],\n",
- " mask=[False, False, False, False],\n",
- " fill_value='?',\n",
- " dtype=object), array([0.96078431, 0.96078431, 0.92156863, 0.96078431])])"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "clf.cv_results_.values()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(__doc__)\n",
- "\n",
- "import numpy as np\n",
- "import matplotlib.pyplot as plt\n",
- "from sklearn import svm, datasets\n",
- "\n",
- "# import some data to play with\n",
- "iris = datasets.load_iris()\n",
- "X = iris.data[:, :2] # we only take the first two features. We could\n",
- " # avoid this ugly slicing by using a two-dim dataset\n",
- "Y = iris.target\n",
- "\n",
- "\n",
- "def my_kernel(X, Y):\n",
- " \"\"\"\n",
- " We create a custom kernel:\n",
- "\n",
- " (2 0)\n",
- " k(X, Y) = X ( ) Y.T\n",
- " (0 1)\n",
- " \"\"\"\n",
- " M = np.array([[2, 0], [0, 1.0]])\n",
- " return np.dot(np.dot(X, M), Y.T)\n",
- "\n",
- "\n",
- "h = .02 # step size in the mesh\n",
- "\n",
- "# we create an instance of SVM and fit our data.\n",
- "clf = svm.SVC(kernel=my_kernel)\n",
- "clf.fit(X, Y)\n",
- "\n",
- "# Plot the decision boundary. For that, we will assign a color to each\n",
- "# point in the mesh [x_min, x_max]x[y_min, y_max].\n",
- "x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n",
- "y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n",
- "xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n",
- "Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n",
- "\n",
- "# Put the result into a color plot\n",
- "Z = Z.reshape(xx.shape)\n",
- "plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)\n",
- "\n",
- "# Plot also the training points\n",
- "plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired, edgecolors='k')\n",
- "plt.title('3-Class classification using Support Vector Machine with custom'\n",
- " ' kernel')\n",
- "plt.axis('tight')\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 69,
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "- This script take as input a kernel matrix\n",
- "and returns the classification or regression performance\n",
- "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
- "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
- "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
- "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
- "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
- "correspond to the average of the performances on the test sets. \n",
- "\n",
- "@references\n",
- " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
- "\n",
- "\n",
- " Loading dataset from file...\n",
- "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n",
- " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n",
- " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n",
- " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n",
- " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n",
- " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n",
- " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n",
- " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n",
- " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n",
- " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n",
- " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n",
- " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n",
- " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n",
- " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n",
- " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n",
- " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n",
- " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n",
- " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n",
- "\n",
- " Loading the matrix from file...\n",
- "[[ 3. 1. 3. ..., 1. 1. 1.]\n",
- " [ 1. 6. 1. ..., 0. 0. 3.]\n",
- " [ 3. 1. 3. ..., 1. 1. 1.]\n",
- " ..., \n",
- " [ 1. 0. 1. ..., 55. 21. 7.]\n",
- " [ 1. 0. 1. ..., 21. 55. 7.]\n",
- " [ 1. 3. 1. ..., 7. 7. 55.]]\n",
- "--- This is a regression problem ---\n",
- "Starting split 10...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 12 with parameter alpha = 12.130000\n",
- "The best performance on the validation set is: 36.004721\n",
- "The corresponding performance on test set is: 47.691725\n",
- "Starting split 11...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 33.084913\n",
- "The corresponding performance on test set is: 35.493699\n",
- "Starting split 12...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 18 with parameter alpha = 18.190000\n",
- "The best performance on the validation set is: 29.476238\n",
- "The corresponding performance on test set is: 36.525571\n",
- "Starting split 13...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 9 with parameter alpha = 9.100000\n",
- "The best performance on the validation set is: 40.272791\n",
- "The corresponding performance on test set is: 37.359205\n",
- "Starting split 14...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 42.973240\n",
- "The corresponding performance on test set is: 53.123785\n",
- "Starting split 15...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 18 with parameter alpha = 18.190000\n",
- "The best performance on the validation set is: 38.216353\n",
- "The corresponding performance on test set is: 37.697069\n",
- "Starting split 16...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 88 with parameter alpha = 88.890000\n",
- "The best performance on the validation set is: 32.988038\n",
- "The corresponding performance on test set is: 37.515000\n",
- "Starting split 17...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 4 with parameter alpha = 4.050000\n",
- "The best performance on the validation set is: 33.530482\n",
- "The corresponding performance on test set is: 43.448861\n",
- "Starting split 18...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 64 with parameter alpha = 64.650000\n",
- "The best performance on the validation set is: 29.671418\n",
- "The corresponding performance on test set is: 29.196786\n",
- "Starting split 19...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 7 with parameter alpha = 7.080000\n",
- "The best performance on the validation set is: 44.854681\n",
- "The corresponding performance on test set is: 36.111594\n",
- "Starting split 20...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 19 with parameter alpha = 19.200000\n",
- "The best performance on the validation set is: 35.660642\n",
- "The corresponding performance on test set is: 38.151790\n",
- "Starting split 21...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 14 with parameter alpha = 14.150000\n",
- "The best performance on the validation set is: 41.607604\n",
- "The corresponding performance on test set is: 32.158764\n",
- "Starting split 22...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 46 with parameter alpha = 46.470000\n",
- "The best performance on the validation set is: 36.461026\n",
- "The corresponding performance on test set is: 36.247837\n",
- "Starting split 23...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 49 with parameter alpha = 49.500000\n",
- "The best performance on the validation set is: 28.540585\n",
- "The corresponding performance on test set is: 37.377972\n",
- "Starting split 24...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 32 with parameter alpha = 32.330000\n",
- "The best performance on the validation set is: 33.397020\n",
- "The corresponding performance on test set is: 35.840386\n",
- "Starting split 25...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 30 with parameter alpha = 30.310000\n",
- "The best performance on the validation set is: 36.391650\n",
- "The corresponding performance on test set is: 36.120189\n",
- "Starting split 26...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 67 with parameter alpha = 67.680000\n",
- "The best performance on the validation set is: 39.558878\n",
- "The corresponding performance on test set is: 35.711036\n",
- "Starting split 27...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 35 with parameter alpha = 35.360000\n",
- "The best performance on the validation set is: 34.563158\n",
- "The corresponding performance on test set is: 51.215777\n",
- "Starting split 28...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 15 with parameter alpha = 15.160000\n",
- "The best performance on the validation set is: 43.309309\n",
- "The corresponding performance on test set is: 40.394297\n",
- "Starting split 29...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 14 with parameter alpha = 14.150000\n",
- "The best performance on the validation set is: 42.797900\n",
- "The corresponding performance on test set is: 34.218103\n",
- "Starting split 30...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 42 with parameter alpha = 42.430000\n",
- "The best performance on the validation set is: 45.422692\n",
- "The corresponding performance on test set is: 27.041917\n",
- "Starting split 31...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 9 with parameter alpha = 9.100000\n",
- "The best performance on the validation set is: 33.447413\n",
- "The corresponding performance on test set is: 38.341333\n",
- "Starting split 32...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 46 with parameter alpha = 46.470000\n",
- "The best performance on the validation set is: 31.638807\n",
- "The corresponding performance on test set is: 43.374635\n",
- "Starting split 33...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 49 with parameter alpha = 49.500000\n",
- "The best performance on the validation set is: 37.702092\n",
- "The corresponding performance on test set is: 31.198701\n",
- "Starting split 34...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 28.013251\n",
- "The corresponding performance on test set is: 30.116903\n",
- "Starting split 35...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 13 with parameter alpha = 13.140000\n",
- "The best performance on the validation set is: 38.520179\n",
- "The corresponding performance on test set is: 37.478691\n",
- "Starting split 36...\n",
- "\n",
- " Normalizing output y...\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 35.836403\n",
- "The corresponding performance on test set is: 37.447219\n",
- "Starting split 37...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 47 with parameter alpha = 47.480000\n",
- "The best performance on the validation set is: 31.172116\n",
- "The corresponding performance on test set is: 39.504962\n",
- "Starting split 38...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 36 with parameter alpha = 36.370000\n",
- "The best performance on the validation set is: 40.025101\n",
- "The corresponding performance on test set is: 41.314650\n",
- "Starting split 39...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 28.474810\n",
- "The corresponding performance on test set is: 38.093995\n",
- "Starting split 40...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 38.056007\n",
- "The corresponding performance on test set is: 33.570513\n",
- "Starting split 41...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 5 with parameter alpha = 5.060000\n",
- "The best performance on the validation set is: 35.329935\n",
- "The corresponding performance on test set is: 40.309342\n",
- "Starting split 42...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 59 with parameter alpha = 59.600000\n",
- "The best performance on the validation set is: 25.235609\n",
- "The corresponding performance on test set is: 36.117043\n",
- "Starting split 43...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 30 with parameter alpha = 30.310000\n",
- "The best performance on the validation set is: 27.596182\n",
- "The corresponding performance on test set is: 39.069843\n",
- "Starting split 44...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 24.493222\n",
- "The corresponding performance on test set is: 34.064025\n",
- "Starting split 45...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 15 with parameter alpha = 15.160000\n",
- "The best performance on the validation set is: 45.540605\n",
- "The corresponding performance on test set is: 33.544310\n",
- "Starting split 46...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 34.661595\n",
- "The corresponding performance on test set is: 26.174480\n",
- "Starting split 47...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 5 with parameter alpha = 5.060000\n",
- "The best performance on the validation set is: 34.837287\n",
- "The corresponding performance on test set is: 45.463855\n",
- "Starting split 48...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 31 with parameter alpha = 31.320000\n",
- "The best performance on the validation set is: 37.528129\n",
- "The corresponding performance on test set is: 51.123083\n",
- "Starting split 49...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 20 with parameter alpha = 20.210000\n",
- "The best performance on the validation set is: 32.480446\n",
- "The corresponding performance on test set is: 31.618253\n",
- "Starting split 50...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 59 with parameter alpha = 59.600000\n",
- "The best performance on the validation set is: 32.584107\n",
- "The corresponding performance on test set is: 31.376594\n",
- "Starting split 51...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 9 with parameter alpha = 9.100000\n",
- "The best performance on the validation set is: 40.867089\n",
- "The corresponding performance on test set is: 32.363239\n",
- "Starting split 52...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 97 with parameter alpha = 97.980000\n",
- "The best performance on the validation set is: 33.801783\n",
- "The corresponding performance on test set is: 41.200644\n",
- "Starting split 53...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 17 with parameter alpha = 17.180000\n",
- "The best performance on the validation set is: 44.010303\n",
- "The corresponding performance on test set is: 34.562120\n",
- "Starting split 54...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 22 with parameter alpha = 22.230000\n",
- "The best performance on the validation set is: 36.759090\n",
- "The corresponding performance on test set is: 38.376060\n",
- "Starting split 55...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 29.830898\n",
- "The corresponding performance on test set is: 24.811584\n",
- "Starting split 56...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 18 with parameter alpha = 18.190000\n",
- "The best performance on the validation set is: 34.627026\n",
- "The corresponding performance on test set is: 46.684129\n",
- "Starting split 57...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 5 with parameter alpha = 5.060000\n",
- "The best performance on the validation set is: 43.343991\n",
- "The corresponding performance on test set is: 41.169814\n",
- "Starting split 58...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 39 with parameter alpha = 39.400000\n",
- "The best performance on the validation set is: 34.908981\n",
- "The corresponding performance on test set is: 48.932907\n",
- "Starting split 59...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 10 with parameter alpha = 10.110000\n",
- "The best performance on the validation set is: 37.767543\n",
- "The corresponding performance on test set is: 28.512235\n",
- "Starting split 60...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 37.890852\n",
- "The corresponding performance on test set is: 28.082837\n",
- "Starting split 61...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 36 with parameter alpha = 36.370000\n",
- "The best performance on the validation set is: 41.402040\n",
- "The corresponding performance on test set is: 31.964262\n",
- "Starting split 62...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 34 with parameter alpha = 34.350000\n",
- "The best performance on the validation set is: 39.216178\n",
- "The corresponding performance on test set is: 48.626836\n",
- "Starting split 63...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 35.222016\n",
- "The corresponding performance on test set is: 50.344625\n",
- "Starting split 64...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 25 with parameter alpha = 25.260000\n",
- "The best performance on the validation set is: 33.803383\n",
- "The corresponding performance on test set is: 40.058257\n",
- "Starting split 65...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 56 with parameter alpha = 56.570000\n",
- "The best performance on the validation set is: 29.170281\n",
- "The corresponding performance on test set is: 36.104372\n",
- "Starting split 66...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 14 with parameter alpha = 14.150000\n",
- "The best performance on the validation set is: 40.405285\n",
- "The corresponding performance on test set is: 39.262782\n",
- "Starting split 67...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 58 with parameter alpha = 58.590000\n",
- "The best performance on the validation set is: 24.638279\n",
- "The corresponding performance on test set is: 35.165763\n",
- "Starting split 68...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 56 with parameter alpha = 56.570000\n",
- "The best performance on the validation set is: 31.681889\n",
- "The corresponding performance on test set is: 44.049377\n",
- "Starting split 69...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 5 with parameter alpha = 5.060000\n",
- "The best performance on the validation set is: 30.735748\n",
- "The corresponding performance on test set is: 50.708019\n",
- "Starting split 70...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 29 with parameter alpha = 29.300000\n",
- "The best performance on the validation set is: 37.273741\n",
- "The corresponding performance on test set is: 39.351135\n",
- "Starting split 71...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 6 with parameter alpha = 6.070000\n",
- "The best performance on the validation set is: 48.033115\n",
- "The corresponding performance on test set is: 55.674648\n",
- "Starting split 72...\n",
- "\n",
- " Normalizing output y...\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 31.903823\n",
- "The corresponding performance on test set is: 32.937886\n",
- "Starting split 73...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 19 with parameter alpha = 19.200000\n",
- "The best performance on the validation set is: 40.825941\n",
- "The corresponding performance on test set is: 38.535950\n",
- "Starting split 74...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 34.181621\n",
- "The corresponding performance on test set is: 34.089714\n",
- "Starting split 75...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 39 with parameter alpha = 39.400000\n",
- "The best performance on the validation set is: 40.264289\n",
- "The corresponding performance on test set is: 47.412526\n",
- "Starting split 76...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 27 with parameter alpha = 27.280000\n",
- "The best performance on the validation set is: 35.842650\n",
- "The corresponding performance on test set is: 34.785447\n",
- "Starting split 77...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 33 with parameter alpha = 33.340000\n",
- "The best performance on the validation set is: 38.896608\n",
- "The corresponding performance on test set is: 39.158479\n",
- "Starting split 78...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 28 with parameter alpha = 28.290000\n",
- "The best performance on the validation set is: 31.053773\n",
- "The corresponding performance on test set is: 33.711541\n",
- "Starting split 79...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 54 with parameter alpha = 54.550000\n",
- "The best performance on the validation set is: 36.129208\n",
- "The corresponding performance on test set is: 34.191692\n",
- "Starting split 80...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 3 with parameter alpha = 3.040000\n",
- "The best performance on the validation set is: 42.796346\n",
- "The corresponding performance on test set is: 40.531343\n",
- "Starting split 81...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 1 with parameter alpha = 1.020000\n",
- "The best performance on the validation set is: 50.420936\n",
- "The corresponding performance on test set is: 43.764477\n",
- "Starting split 82...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 36.720826\n",
- "The corresponding performance on test set is: 40.242670\n",
- "Starting split 83...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 43 with parameter alpha = 43.440000\n",
- "The best performance on the validation set is: 38.601089\n",
- "The corresponding performance on test set is: 46.145483\n",
- "Starting split 84...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 38 with parameter alpha = 38.390000\n",
- "The best performance on the validation set is: 29.823069\n",
- "The corresponding performance on test set is: 27.458317\n",
- "Starting split 85...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 1 with parameter alpha = 1.020000\n",
- "The best performance on the validation set is: 37.295245\n",
- "The corresponding performance on test set is: 41.040827\n",
- "Starting split 86...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 6 with parameter alpha = 6.070000\n",
- "The best performance on the validation set is: 46.779299\n",
- "The corresponding performance on test set is: 36.893783\n",
- "Starting split 87...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 30 with parameter alpha = 30.310000\n",
- "The best performance on the validation set is: 35.570939\n",
- "The corresponding performance on test set is: 31.014527\n",
- "Starting split 88...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 11 with parameter alpha = 11.120000\n",
- "The best performance on the validation set is: 42.125559\n",
- "The corresponding performance on test set is: 34.059925\n",
- "Starting split 89...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 32 with parameter alpha = 32.330000\n",
- "The best performance on the validation set is: 33.276369\n",
- "The corresponding performance on test set is: 34.010431\n",
- "Starting split 90...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 14 with parameter alpha = 14.150000\n",
- "The best performance on the validation set is: 48.542045\n",
- "The corresponding performance on test set is: 51.204813\n",
- "Starting split 91...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 51.240470\n",
- "The corresponding performance on test set is: 25.204665\n",
- "Starting split 92...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 8 with parameter alpha = 8.090000\n",
- "The best performance on the validation set is: 28.042005\n",
- "The corresponding performance on test set is: 40.903728\n",
- "Starting split 93...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 24 with parameter alpha = 24.250000\n",
- "The best performance on the validation set is: 46.378283\n",
- "The corresponding performance on test set is: 38.717776\n",
- "Starting split 94...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 52 with parameter alpha = 52.530000\n",
- "The best performance on the validation set is: 28.152322\n",
- "The corresponding performance on test set is: 30.532819\n",
- "Starting split 95...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 42 with parameter alpha = 42.430000\n",
- "The best performance on the validation set is: 36.952087\n",
- "The corresponding performance on test set is: 33.163953\n",
- "Starting split 96...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 61 with parameter alpha = 61.620000\n",
- "The best performance on the validation set is: 41.047679\n",
- "The corresponding performance on test set is: 27.036643\n",
- "Starting split 97...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 42 with parameter alpha = 42.430000\n",
- "The best performance on the validation set is: 36.858184\n",
- "The corresponding performance on test set is: 40.745694\n",
- "Starting split 98...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 89 with parameter alpha = 89.900000\n",
- "The best performance on the validation set is: 41.871463\n",
- "The corresponding performance on test set is: 37.287950\n",
- "Starting split 99...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 55 with parameter alpha = 55.560000\n",
- "The best performance on the validation set is: 41.040198\n",
- "The corresponding performance on test set is: 34.121258\n",
- "Starting split 100...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 9 with parameter alpha = 9.100000\n",
- "The best performance on the validation set is: 38.421616\n",
- "The corresponding performance on test set is: 43.772366\n",
- "Starting split 101...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 23.640892\n",
- "The corresponding performance on test set is: 41.408082\n",
- "Starting split 102...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 38.527173\n",
- "The corresponding performance on test set is: 52.468792\n",
- "Starting split 103...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 6 with parameter alpha = 6.070000\n",
- "The best performance on the validation set is: 47.154873\n",
- "The corresponding performance on test set is: 39.077319\n",
- "Starting split 104...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 12 with parameter alpha = 12.130000\n",
- "The best performance on the validation set is: 36.653442\n",
- "The corresponding performance on test set is: 47.172066\n",
- "Starting split 105...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 12 with parameter alpha = 12.130000\n",
- "The best performance on the validation set is: 33.976240\n",
- "The corresponding performance on test set is: 40.620368\n",
- "Starting split 106...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 22 with parameter alpha = 22.230000\n",
- "The best performance on the validation set is: 27.799295\n",
- "The corresponding performance on test set is: 38.034978\n",
- "Starting split 107...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 17 with parameter alpha = 17.180000\n",
- "The best performance on the validation set is: 43.202567\n",
- "The corresponding performance on test set is: 36.783012\n",
- "Starting split 108...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 99 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 32.312218\n",
- "The corresponding performance on test set is: 38.186940\n",
- "Starting split 109...\n",
- "\n",
- " Normalizing output y...\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "The best performance is for trial 4 with parameter alpha = 4.050000\n",
- "The best performance on the validation set is: 39.485731\n",
- "The corresponding performance on test set is: 50.146953\n"
- ]
- }
- ],
- "source": [
- "# Author: Elisabetta Ghisu\n",
- "\n",
- "\"\"\"\n",
- "- This script takes as input a kernel matrix\n",
- "and returns the classification or regression performance\n",
- "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
- "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
- "- For prediction we divide the data into training, validation and test sets. For each split, we first train on the train data, \n",
- "then evaluate the performance on the validation set. We choose the parameters that perform best on the validation set and finally\n",
- "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
- "correspond to the average of the performances on the test sets. \n",
- "\n",
- "@references\n",
- " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
- "\"\"\"\n",
- "\n",
- "print(__doc__)\n",
- "\n",
- "import sys\n",
- "import pathlib\n",
- "sys.path.insert(0, \"../py-graph/\")\n",
- "\n",
- "import random\n",
- "import numpy as np\n",
- "import matplotlib.pyplot as plt\n",
- "\n",
- "from sklearn.kernel_ridge import KernelRidge # 0.17\n",
- "from sklearn.metrics import accuracy_score, mean_squared_error\n",
- "from sklearn import svm\n",
- "\n",
- "from kernels.spkernel import spkernel\n",
- "from utils.graphfiles import loadDataset\n",
- "\n",
- "print('\\n Loading dataset from file...')\n",
- "dataset, y = loadDataset(\"/home/ljia/Documents/research-repo/datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
- "y = np.array(y)\n",
- "print(y)\n",
- "\n",
- "kernel_file_path = 'kernelmatrix.ds'\n",
- "path = pathlib.Path(kernel_file_path)\n",
- "if path.is_file():\n",
- " print('\\n Loading the matrix from file...')\n",
- " Kmatrix = np.loadtxt(kernel_file_path)\n",
- " print(Kmatrix)\n",
- "else:\n",
- " print('\\n Calculating kernel matrix, this could take a while...')\n",
- " Kmatrix = spkernel(dataset)\n",
- " print(Kmatrix)\n",
- " print('Saving kernel matrix to file...')\n",
- " np.savetxt(kernel_file_path, Kmatrix)\n",
- "\n",
- "# setup the parameters\n",
- "model_type = 'regression' # Regression or classification problem\n",
- "print('\\n --- This is a %s problem ---' % model_type)\n",
- "\n",
- "datasize = len(dataset)\n",
- "trials = 100 # Trials for hyperparameters random search\n",
- "splits = 10 # Number of splits of the data\n",
- "alpha_grid = np.linspace(0.01, 100, num = trials) # corresponds to (2*C)^-1 in other linear models such as LogisticRegression\n",
- "# C_grid = np.linspace(0.0001, 10, num = trials)\n",
- "random.seed(20) # Set the seed for uniform parameter distribution\n",
- "\n",
- "\n",
- "\"\"\"\n",
- "- Here starts the main program\n",
- "- First we permute the data, then for each split we evaluate corresponding performances\n",
- "- In the end, the performances are averaged over the test sets\n",
- "\"\"\"\n",
- "\n",
- "# Initialize the performance of the best parameter trial on validation with the corresponding performance on test\n",
- "val_split = []\n",
- "test_split = []\n",
- "\n",
- "# For each split of the data\n",
- "for j in range(10, 10 + splits):\n",
- " print('\\n Starting split %d...' % j)\n",
- "\n",
- " # Set the random seed for data permutation\n",
- " random_state = int(j)\n",
- " np.random.seed(random_state)\n",
- " idx_perm = np.random.permutation(datasize)\n",
- "# print(idx_perm)\n",
- " \n",
- " # Permute the data\n",
- " y_perm = y[idx_perm] # targets permutation\n",
- "# print(y_perm)\n",
- " Kmatrix_perm = Kmatrix[:, idx_perm] # inputs permutation\n",
- "# print(Kmatrix_perm)\n",
- " Kmatrix_perm = Kmatrix_perm[idx_perm, :] # inputs permutation\n",
- " \n",
- " # Set the training, validation and test\n",
- " # Note: the percentage can be set up by the user\n",
- " num_train_val = int((datasize * 90) / 100) # 90% (of entire dataset) for training and validation\n",
- " num_test = datasize - num_train_val # 10% (of entire dataset) for test\n",
- " num_train = int((num_train_val * 90) / 100) # 90% (of train + val) for training\n",
- " num_val = num_train_val - num_train # 10% (of train + val) for validation\n",
- " \n",
- " # Split the kernel matrix\n",
- " Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]\n",
- " Kmatrix_val = Kmatrix_perm[num_train:(num_train + num_val), 0:num_train]\n",
- " Kmatrix_test = Kmatrix_perm[(num_train + num_val):datasize, 0:num_train]\n",
- "\n",
- " # Split the targets\n",
- " y_train = y_perm[0:num_train]\n",
- "\n",
- " # Normalization step (for real valued targets only)\n",
- " print('\\n Normalizing output y...')\n",
- " if model_type == 'regression':\n",
- " y_train_mean = np.mean(y_train)\n",
- " y_train_std = np.std(y_train)\n",
- " y_train = (y_train - y_train_mean) / float(y_train_std)\n",
- "# print(y)\n",
- " \n",
- " y_val = y_perm[num_train:(num_train + num_val)]\n",
- " y_test = y_perm[(num_train + num_val):datasize]\n",
- " \n",
- " # Record the performance for each parameter trial respectively on validation and test set\n",
- " perf_all_val = []\n",
- " perf_all_test = []\n",
- " \n",
- " # For each parameter trial\n",
- " for i in range(trials):\n",
- " # For regression use the Kernel Ridge method\n",
- " if model_type == 'regression':\n",
- "# print('\\n Starting experiment for trial %d and parameter alpha = %3f\\n ' % (i, alpha_grid[i]))\n",
- "\n",
- " # Fit the kernel ridge model\n",
- " KR = KernelRidge(kernel = 'precomputed', alpha = alpha_grid[i])\n",
- " KR.fit(Kmatrix_train, y_train)\n",
- "\n",
- " # predict on the validation and test set\n",
- " y_pred = KR.predict(Kmatrix_val)\n",
- " y_pred_test = KR.predict(Kmatrix_test)\n",
- "# print(y_pred)\n",
- "\n",
- " # adjust prediction: needed because the training targets have been normalized\n",
- " y_pred = y_pred * float(y_train_std) + y_train_mean\n",
- "# print(y_pred)\n",
- " y_pred_test = y_pred_test * float(y_train_std) + y_train_mean\n",
- "# print(y_pred_test)\n",
- "\n",
- " # root mean squared error on validation\n",
- " rmse = np.sqrt(mean_squared_error(y_val, y_pred))\n",
- " perf_all_val.append(rmse)\n",
- "\n",
- " # root mean squared error in test \n",
- " rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n",
- " perf_all_test.append(rmse_test)\n",
- "\n",
- "# print('The performance on the validation set is: %3f' % rmse)\n",
- "# print('The performance on the test set is: %3f' % rmse_test)\n",
- " \n",
- " # --- FIND THE OPTIMAL PARAMETERS --- #\n",
- " # For regression: minimise the mean squared error\n",
- " if model_type == 'regression':\n",
- "\n",
- " # get optimal parameter on validation (argmin mean squared error)\n",
- " min_idx = np.argmin(perf_all_val)\n",
- " alpha_opt = alpha_grid[min_idx]\n",
- "\n",
- " # performance corresponding to optimal parameter on val\n",
- " perf_val_opt = perf_all_val[min_idx]\n",
- "\n",
- " # corresponding performance on test for the same parameter\n",
- " perf_test_opt = perf_all_test[min_idx]\n",
- "\n",
- " print('The best performance is for trial %d with parameter alpha = %3f' % (min_idx, alpha_opt))\n",
- " print('The best performance on the validation set is: %3f' % perf_val_opt)\n",
- " print('The corresponding performance on test set is: %3f' % perf_test_opt)\n",
- "\n",
- "# # we create an instance of SVM and fit our data.\n",
- "# clf = svm.SVC(kernel = 'precomputed')\n",
- "# clf.fit(Kmatrix, )\n",
- "\n",
- "# # predict on validation and test\n",
- "# y_pred = clf.predict(K_val)\n",
- "# y_pred_test = clf.predict(K_test)\n",
- "\n",
- "# # accuracy on validation set\n",
- "# acc = accuracy_score(y_val, y_pred)\n",
- "# perf_all_val.append(acc)\n",
- "\n",
- "# # accuracy on test set\n",
- "# acc_test = accuracy_score(y_test, y_pred_test)\n",
- "# perf_all_test.append(acc_test)\n",
- "\n",
- "# # print \"The performance on the validation set is: %3f\" % acc\n",
- "# # print \"The performance on the test set is: %3f\" % acc_test\n",
- "\n",
- "\n",
- "\n",
- "# # Plot the decision boundary. For that, we will assign a color to each\n",
- "# # point in the mesh [x_min, x_max]x[y_min, y_max].\n",
- "# x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n",
- "# y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n",
- "# xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n",
- "# Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n",
- "\n",
- "# # Put the result into a color plot\n",
- "# Z = Z.reshape(xx.shape)\n",
- "# plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)\n",
- "\n",
- "# # Plot also the training points\n",
- "# plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired, edgecolors='k')\n",
- "# plt.title('3-Class classification using Support Vector Machine with custom'\n",
- "# ' kernel')\n",
- "# plt.axis('tight')\n",
- "# plt.show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "4841564986 / 3"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.5.2"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
|