- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 26,
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "- This script take as input a kernel matrix\n",
- "and returns the classification or regression performance\n",
- "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
- "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
- "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
- "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
- "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
- "correspond to the average of the performances on the test sets. \n",
- "\n",
- "@references\n",
- " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
- "\n",
- "\n",
- " Loading dataset from file...\n",
- "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n",
- " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n",
- " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n",
- " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n",
- " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n",
- " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n",
- " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n",
- " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n",
- " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n",
- " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n",
- " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n",
- " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n",
- " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n",
- " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n",
- " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n",
- " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n",
- " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n",
- " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n",
- "\n",
- " Loading the matrix from file...\n",
- "[[ 3. 1. 3. ..., 1. 1. 1.]\n",
- " [ 1. 6. 1. ..., 0. 0. 3.]\n",
- " [ 3. 1. 3. ..., 1. 1. 1.]\n",
- " ..., \n",
- " [ 1. 0. 1. ..., 55. 21. 7.]\n",
- " [ 1. 0. 1. ..., 21. 55. 7.]\n",
- " [ 1. 3. 1. ..., 7. 7. 55.]]\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Starting split 10...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 12 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 40.422382\n",
- "The corresponding performance on test set is: 47.424532\n",
- "\n",
- " Starting split 11...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 12 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 33.084913\n",
- "The corresponding performance on test set is: 35.493699\n",
- "\n",
- " Starting split 12...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 12 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 31.306710\n",
- "The corresponding performance on test set is: 33.173366\n",
- "\n",
- " Starting split 13...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 12 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 43.500424\n",
- "The corresponding performance on test set is: 32.633129\n",
- "\n",
- " Starting split 14...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 10 with parameter alpha = 1.000000\n",
- "The best performance on the validation set is: 53.561752\n",
- "The corresponding performance on test set is: 42.883548\n",
- "\n",
- " Starting split 15...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 12 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 40.444773\n",
- "The corresponding performance on test set is: 32.713040\n",
- "\n",
- " Starting split 16...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 11 with parameter alpha = 10.000000\n",
- "The best performance on the validation set is: 37.046818\n",
- "The corresponding performance on test set is: 37.337851\n",
- "\n",
- " Starting split 17...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 12 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 39.907628\n",
- "The corresponding performance on test set is: 38.889064\n",
- "\n",
- " Starting split 18...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 12 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 29.879950\n",
- "The corresponding performance on test set is: 27.652558\n",
- "\n",
- " Starting split 19...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 11 with parameter alpha = 10.000000\n",
- "The best performance on the validation set is: 44.911892\n",
- "The corresponding performance on test set is: 35.804454\n",
- "\n",
- " Mean performance on val set: 39.406724\n",
- "With standard deviation: 6.720820\n",
- "\n",
- " Mean performance on test set: 36.400524\n",
- "With standard deviation: 5.352940\n"
- ]
- }
- ],
- "source": [
- "# Author: Elisabetta Ghisu\n",
- "\n",
- "\"\"\"\n",
- "- This script take as input a kernel matrix\n",
- "and returns the classification or regression performance\n",
- "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
- "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
- "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
- "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
- "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
- "correspond to the average of the performances on the test sets. \n",
- "\n",
- "@references\n",
- " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
- "\"\"\"\n",
- "\n",
- "print(__doc__)\n",
- "\n",
- "import sys\n",
- "import pathlib\n",
- "sys.path.insert(0, \"../py-graph/\")\n",
- "from tabulate import tabulate\n",
- "\n",
- "import random\n",
- "import numpy as np\n",
- "import matplotlib.pyplot as plt\n",
- "\n",
- "from sklearn.kernel_ridge import KernelRidge # 0.17\n",
- "from sklearn.metrics import accuracy_score, mean_squared_error\n",
- "from sklearn import svm\n",
- "\n",
- "from kernels.spkernel import spkernel\n",
- "from utils.graphfiles import loadDataset\n",
- "\n",
- "print('\\n Loading dataset from file...')\n",
- "dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
- "y = np.array(y)\n",
- "print(y)\n",
- "\n",
- "kernel_file_path = 'kernelmatrix.ds'\n",
- "path = pathlib.Path(kernel_file_path)\n",
- "if path.is_file():\n",
- " print('\\n Loading the matrix from file...')\n",
- " Kmatrix = np.loadtxt(kernel_file_path)\n",
- " print(Kmatrix)\n",
- "else:\n",
- " print('\\n Calculating kernel matrix, this could take a while...')\n",
- " Kmatrix = spkernel(dataset)\n",
- " print(Kmatrix)\n",
- " print('Saving kernel matrix to file...')\n",
- " np.savetxt(kernel_file_path, Kmatrix)\n",
- "\n",
- "# setup the parameters\n",
- "model_type = 'regression' # Regression or classification problem\n",
- "print('\\n --- This is a %s problem ---' % model_type)\n",
- "\n",
- "datasize = len(dataset)\n",
- "trials = 21 # Trials for hyperparameters random search\n",
- "splits = 10 # Number of splits of the data\n",
- "alpha_grid = np.logspace(-10, 10, num = trials, base = 10) # corresponds to (2*C)^-1 in other linear models such as LogisticRegression\n",
- "C_grid = np.logspace(-10, 10, num = trials, base = 10)\n",
- "random.seed(20) # Set the seed for uniform parameter distribution\n",
- "\n",
- "\n",
- "\"\"\"\n",
- "- Here starts the main program\n",
- "- First we permute the data, then for each split we evaluate corresponding performances\n",
- "- In the end, the performances are averaged over the test sets\n",
- "\"\"\"\n",
- "\n",
- "# Initialize the performance of the best parameter trial on validation with the corresponding performance on test\n",
- "val_split = []\n",
- "test_split = []\n",
- "\n",
- "# For each split of the data\n",
- "for j in range(10, 10 + splits):\n",
- " print('\\n Starting split %d...' % j)\n",
- "\n",
- " # Set the random set for data permutation\n",
- " random_state = int(j)\n",
- " np.random.seed(random_state)\n",
- " idx_perm = np.random.permutation(datasize)\n",
- "# print(idx_perm)\n",
- " \n",
- " # Permute the data\n",
- " y_perm = y[idx_perm] # targets permutation\n",
- "# print(y_perm)\n",
- " Kmatrix_perm = Kmatrix[:, idx_perm] # inputs permutation\n",
- "# print(Kmatrix_perm)\n",
- " Kmatrix_perm = Kmatrix_perm[idx_perm, :] # inputs permutation\n",
- " \n",
- " # Set the training, validation and test\n",
- " # Note: the percentage can be set up by the user\n",
- " num_train_val = int((datasize * 90) / 100) # 90% (of entire dataset) for training and validation\n",
- " num_test = datasize - num_train_val # 10% (of entire dataset) for test\n",
- " num_train = int((num_train_val * 90) / 100) # 90% (of train + val) for training\n",
- " num_val = num_train_val - num_train # 10% (of train + val) for validation\n",
- " \n",
- " # Split the kernel matrix\n",
- " Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]\n",
- " Kmatrix_val = Kmatrix_perm[num_train:(num_train + num_val), 0:num_train]\n",
- " Kmatrix_test = Kmatrix_perm[(num_train + num_val):datasize, 0:num_train]\n",
- "\n",
- " # Split the targets\n",
- " y_train = y_perm[0:num_train]\n",
- "\n",
- " # Normalization step (for real valued targets only)\n",
- " print('\\n Normalizing output y...')\n",
- " if model_type == 'regression':\n",
- " y_train_mean = np.mean(y_train)\n",
- " y_train_std = np.std(y_train)\n",
- " y_train = (y_train - y_train_mean) / float(y_train_std)\n",
- "# print(y)\n",
- " \n",
- " y_val = y_perm[num_train:(num_train + num_val)]\n",
- " y_test = y_perm[(num_train + num_val):datasize]\n",
- " \n",
- " # Record the performance for each parameter trial respectively on validation and test set\n",
- " perf_all_val = []\n",
- " perf_all_test = []\n",
- " \n",
- " # For each parameter trial\n",
- " for i in range(trials):\n",
- " # For regression use the Kernel Ridge method\n",
- " if model_type == 'regression':\n",
- "# print('\\n Starting experiment for trial %d and parameter alpha = %3f\\n ' % (i, alpha_grid[i]))\n",
- "\n",
- " # Fit the kernel ridge model\n",
- " KR = KernelRidge(kernel = 'precomputed', alpha = alpha_grid[i])\n",
- "# KR = svm.SVR(kernel = 'precomputed', C = C_grid[i])\n",
- " KR.fit(Kmatrix_train, y_train)\n",
- "\n",
- " # predict on the validation and test set\n",
- " y_pred = KR.predict(Kmatrix_val)\n",
- " y_pred_test = KR.predict(Kmatrix_test)\n",
- "# print(y_pred)\n",
- "\n",
- " # adjust prediction: needed because the training targets have been normalizaed\n",
- " y_pred = y_pred * float(y_train_std) + y_train_mean\n",
- "# print(y_pred)\n",
- " y_pred_test = y_pred_test * float(y_train_std) + y_train_mean\n",
- "# print(y_pred_test)\n",
- "\n",
- " # root mean squared error on validation\n",
- " rmse = np.sqrt(mean_squared_error(y_val, y_pred))\n",
- " perf_all_val.append(rmse)\n",
- "\n",
- " # root mean squared error in test \n",
- " rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n",
- " perf_all_test.append(rmse_test)\n",
- "\n",
- "# print('The performance on the validation set is: %3f' % rmse)\n",
- "# print('The performance on the test set is: %3f' % rmse_test)\n",
- " \n",
- " # --- FIND THE OPTIMAL PARAMETERS --- #\n",
- " # For regression: minimise the mean squared error\n",
- " if model_type == 'regression':\n",
- "\n",
- " # get optimal parameter on validation (argmin mean squared error)\n",
- " min_idx = np.argmin(perf_all_test)\n",
- " alpha_opt = alpha_grid[min_idx]\n",
- "\n",
- " # performance corresponding to optimal parameter on val\n",
- " perf_val_opt = perf_all_val[min_idx]\n",
- "\n",
- " # corresponding performance on test for the same parameter\n",
- " perf_test_opt = perf_all_test[min_idx]\n",
- "\n",
- " print('The best performance is for trial %d with parameter alpha = %3f' % (min_idx, alpha_opt))\n",
- " print('The best performance on the validation set is: %3f' % perf_val_opt)\n",
- " print('The corresponding performance on test set is: %3f' % perf_test_opt)\n",
- "\n",
- " # append the best performance on validation\n",
- " # at the current split\n",
- " val_split.append(perf_val_opt)\n",
- "\n",
- " # append the correponding performance on the test set\n",
- " test_split.append(perf_test_opt)\n",
- "\n",
- "# average the results\n",
- "# mean of the validation performances over the splits\n",
- "val_mean = np.mean(np.asarray(val_split))\n",
- "# std deviation of validation over the splits\n",
- "val_std = np.std(np.asarray(val_split))\n",
- "\n",
- "# mean of the test performances over the splits\n",
- "test_mean = np.mean(np.asarray(test_split))\n",
- "# std deviation of the test oer the splits\n",
- "test_std = np.std(np.asarray(test_split))\n",
- "\n",
- "print('\\n Mean performance on val set: %3f' % val_mean)\n",
- "print('With standard deviation: %3f' % val_std)\n",
- "print('\\n Mean performance on test set: %3f' % test_mean)\n",
- "print('With standard deviation: %3f' % test_std)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- " "
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.3"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }