|
@@ -1,665 +0,0 @@ |
|
|
{ |
|
|
|
|
|
"cells": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"cell_type": "code", |
|
|
|
|
|
"execution_count": 2, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"The line_profiler extension is already loaded. To reload it, use:\n", |
|
|
|
|
|
" %reload_ext line_profiler\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" --- This is a regression problem ---\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" Loading dataset from file...\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" Calculating kernel matrix, this could take a while...\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" --- mean average path kernel matrix of size 185 built in 29.430902242660522 seconds ---\n", |
|
|
|
|
|
"[[ 0.55555556 0.22222222 0. ..., 0. 0. 0. ]\n", |
|
|
|
|
|
" [ 0.22222222 0.27777778 0. ..., 0. 0. 0. ]\n", |
|
|
|
|
|
" [ 0. 0. 0.55555556 ..., 0.03030303 0.03030303\n", |
|
|
|
|
|
" 0.03030303]\n", |
|
|
|
|
|
" ..., \n", |
|
|
|
|
|
" [ 0. 0. 0.03030303 ..., 0.08297521 0.05553719\n", |
|
|
|
|
|
" 0.05256198]\n", |
|
|
|
|
|
" [ 0. 0. 0.03030303 ..., 0.05553719 0.07239669\n", |
|
|
|
|
|
" 0.0538843 ]\n", |
|
|
|
|
|
" [ 0. 0. 0.03030303 ..., 0.05256198 0.0538843\n", |
|
|
|
|
|
" 0.07438017]]\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" Saving kernel matrix to file...\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" Mean performance on train set: 3.619948\n", |
|
|
|
|
|
"With standard deviation: 0.512351\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" Mean performance on test set: 18.418852\n", |
|
|
|
|
|
"With standard deviation: 10.781119\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" rmse_test std_test rmse_train std_train k_time\n", |
|
|
|
|
|
"----------- ---------- ------------ ----------- --------\n", |
|
|
|
|
|
" 18.4189 10.7811 3.61995 0.512351 29.4309\n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
|
|
|
"source": [ |
|
|
|
|
|
"%load_ext line_profiler\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"import sys\n", |
|
|
|
|
|
"sys.path.insert(0, \"../\")\n", |
|
|
|
|
|
"from pygraph.utils.utils import kernel_train_test\n", |
|
|
|
|
|
"from pygraph.kernels.pathKernel import pathkernel, _pathkernel_do\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds'\n", |
|
|
|
|
|
"kernel_file_path = 'kernelmatrices_path_acyclic/'\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"kernel_para = dict(node_label = 'atom', edge_label = 'bond_type')\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"kernel_train_test(datafile, kernel_file_path, pathkernel, kernel_para, normalize = False)\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"# %lprun -f _pathkernel_do \\\n", |
|
|
|
|
|
"# kernel_train_test(datafile, kernel_file_path, pathkernel, kernel_para, normalize = False)" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
|
|
|
"cell_type": "code", |
|
|
|
|
|
"execution_count": null, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"outputs": [], |
|
|
|
|
|
"source": [ |
|
|
|
|
|
"# results\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"# with y normalization\n", |
|
|
|
|
|
"# RMSE_test std_test RMSE_train std_train k_time\n", |
|
|
|
|
|
"# ----------- ---------- ------------ ----------- --------\n", |
|
|
|
|
|
"# 14.0015 6.93602 3.76191 0.702594 37.5759\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"# without y normalization\n", |
|
|
|
|
|
"# RMSE_test std_test RMSE_train std_train k_time\n", |
|
|
|
|
|
"# ----------- ---------- ------------ ----------- --------\n", |
|
|
|
|
|
"# 18.4189 10.7811 3.61995 0.512351 29.4309" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
|
|
|
"cell_type": "code", |
|
|
|
|
|
"execution_count": 1, |
|
|
|
|
|
"metadata": { |
|
|
|
|
|
"scrolled": true |
|
|
|
|
|
}, |
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"\n", |
|
|
|
|
|
"- This script take as input a kernel matrix\n", |
|
|
|
|
|
"and returns the classification or regression performance\n", |
|
|
|
|
|
"- The kernel matrix can be calculated using any of the graph kernels approaches\n", |
|
|
|
|
|
"- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n", |
|
|
|
|
|
"- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n", |
|
|
|
|
|
"then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n", |
|
|
|
|
|
"provide the corresponding performance on the test set. If more than one split is performed, the final results \n", |
|
|
|
|
|
"correspond to the average of the performances on the test sets. \n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"@references\n", |
|
|
|
|
|
" https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n", |
|
|
|
|
|
"\n" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
|
|
|
"ename": "IndentationError", |
|
|
|
|
|
"evalue": "unindent does not match any outer indentation level (utils.py, line 106)", |
|
|
|
|
|
"output_type": "error", |
|
|
|
|
|
"traceback": [ |
|
|
|
|
|
"Traceback \u001b[0;36m(most recent call last)\u001b[0m:\n", |
|
|
|
|
|
" File \u001b[1;32m\"/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py\"\u001b[0m, line \u001b[1;32m2910\u001b[0m, in \u001b[1;35mrun_code\u001b[0m\n exec(code_obj, self.user_global_ns, self.user_ns)\n", |
|
|
|
|
|
"\u001b[0;36m File \u001b[0;32m\"<ipython-input-1-0b5b9ebb5cc4>\"\u001b[0;36m, line \u001b[0;32m31\u001b[0;36m, in \u001b[0;35m<module>\u001b[0;36m\u001b[0m\n\u001b[0;31m from pygraph.utils.utils import split_train_test\u001b[0m\n", |
|
|
|
|
|
"\u001b[0;36m File \u001b[0;32m\"../pygraph/utils/utils.py\"\u001b[0;36m, line \u001b[0;32m106\u001b[0m\n\u001b[0;31m train_means_list = []\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m unindent does not match any outer indentation level\n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
|
|
|
"source": [ |
|
|
|
|
|
"# Author: Elisabetta Ghisu\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"\"\"\"\n", |
|
|
|
|
|
"- This script takes as input a kernel matrix\n", |
|
|
|
|
|
"and returns the classification or regression performance\n", |
|
|
|
|
|
"- The kernel matrix can be calculated using any of the graph kernels approaches\n", |
|
|
|
|
|
"- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n", |
|
|
|
|
|
"- For prediction we divide the data into training, validation and test sets. For each split, we first train on the train data, \n", |
|
|
|
|
|
"then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n", |
|
|
|
|
|
"provide the corresponding performance on the test set. If more than one split is performed, the final results \n", |
|
|
|
|
|
"correspond to the average of the performances on the test sets. \n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"@references\n", |
|
|
|
|
|
" https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n", |
|
|
|
|
|
"\"\"\"\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"print(__doc__)\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"import sys\n", |
|
|
|
|
|
"import os\n", |
|
|
|
|
|
"import pathlib\n", |
|
|
|
|
|
"from collections import OrderedDict\n", |
|
|
|
|
|
"sys.path.insert(0, \"../\")\n", |
|
|
|
|
|
"from tabulate import tabulate\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"import numpy as np\n", |
|
|
|
|
|
"import matplotlib.pyplot as plt\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"from pygraph.kernels.pathKernel import pathkernel\n", |
|
|
|
|
|
"from pygraph.utils.graphfiles import loadDataset\n", |
|
|
|
|
|
"from pygraph.utils.utils import split_train_test\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"train_means_list = []\n", |
|
|
|
|
|
"train_stds_list = []\n", |
|
|
|
|
|
"test_means_list = []\n", |
|
|
|
|
|
"test_stds_list = []\n", |
|
|
|
|
|
"kernel_time_list = []\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"print('\\n Loading dataset from file...')\n", |
|
|
|
|
|
"dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n", |
|
|
|
|
|
"y = np.array(y)\n", |
|
|
|
|
|
"print(y)\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"# setup the parameters\n", |
|
|
|
|
|
"model_type = 'regression' # Regression or classification problem\n", |
|
|
|
|
|
"print('\\n --- This is a %s problem ---' % model_type)\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"trials = 100 # Trials for hyperparameters random search\n", |
|
|
|
|
|
"splits = 10 # Number of splits of the data\n", |
|
|
|
|
|
"alpha_grid = np.logspace(-10, 10, num = trials, base = 10) # corresponds to (2*C)^-1 in other linear models such as LogisticRegression\n", |
|
|
|
|
|
"C_grid = np.logspace(-10, 10, num = trials, base = 10)\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"# set the output path\n", |
|
|
|
|
|
"kernel_file_path = 'kernelmatrices_path_acyclic/'\n", |
|
|
|
|
|
"if not os.path.exists(kernel_file_path):\n", |
|
|
|
|
|
" os.makedirs(kernel_file_path)\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"\"\"\"\n", |
|
|
|
|
|
"- Here starts the main program\n", |
|
|
|
|
|
"- First we permute the data, then for each split we evaluate corresponding performances\n", |
|
|
|
|
|
"- In the end, the performances are averaged over the test sets\n", |
|
|
|
|
|
"\"\"\"\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"# save kernel matrices to files / read kernel matrices from files\n", |
|
|
|
|
|
"kernel_file = kernel_file_path + 'km.ds'\n", |
|
|
|
|
|
"path = pathlib.Path(kernel_file)\n", |
|
|
|
|
|
"# get train set kernel matrix\n", |
|
|
|
|
|
"if path.is_file():\n", |
|
|
|
|
|
" print('\\n Loading the kernel matrix from file...')\n", |
|
|
|
|
|
" Kmatrix = np.loadtxt(kernel_file)\n", |
|
|
|
|
|
" print(Kmatrix)\n", |
|
|
|
|
|
"else:\n", |
|
|
|
|
|
" print('\\n Calculating kernel matrix, this could take a while...')\n", |
|
|
|
|
|
" Kmatrix, run_time = pathkernel(dataset, node_label = 'atom', edge_label = 'bond_type')\n", |
|
|
|
|
|
" kernel_time_list.append(run_time)\n", |
|
|
|
|
|
" print(Kmatrix)\n", |
|
|
|
|
|
" print('\\n Saving kernel matrix to file...')\n", |
|
|
|
|
|
"# np.savetxt(kernel_file, Kmatrix)\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
"train_mean, train_std, test_mean, test_std = \\\n", |
|
|
|
|
|
" split_train_test(Kmatrix, y, alpha_grid, C_grid, splits, trials, model_type, normalize = True)\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
"train_means_list.append(train_mean)\n", |
|
|
|
|
|
"train_stds_list.append(train_std)\n", |
|
|
|
|
|
"test_means_list.append(test_mean)\n", |
|
|
|
|
|
"test_stds_list.append(test_std)\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
"print('\\n') \n", |
|
|
|
|
|
"table_dict = {'RMSE_test': test_means_list, 'std_test': test_stds_list, \\\n", |
|
|
|
|
|
" 'RMSE_train': train_means_list, 'std_train': train_stds_list, 'k_time': kernel_time_list}\n", |
|
|
|
|
|
"keyorder = ['RMSE_test', 'std_test', 'RMSE_train', 'std_train', 'k_time']\n", |
|
|
|
|
|
"print(tabulate(OrderedDict(sorted(table_dict.items(), key = lambda i:keyorder.index(i[0]))), headers='keys'))" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
|
|
|
"cell_type": "code", |
|
|
|
|
|
"execution_count": 1, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"ename": "ImportError", |
|
|
|
|
|
"evalue": "cannot import name 'deltaKernel'", |
|
|
|
|
|
"output_type": "error", |
|
|
|
|
|
"traceback": [ |
|
|
|
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", |
|
|
|
|
|
"\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", |
|
|
|
|
|
"\u001b[0;32m<ipython-input-1-51fa7de99690>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minsert\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"../\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpygraph\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgraphfiles\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mloadDataset\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mpygraph\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkernels\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdeltaKernel\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mdeltaKernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mdataset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mloadDataset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", |
|
|
|
|
|
"\u001b[0;31mImportError\u001b[0m: cannot import name 'deltaKernel'" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
|
|
|
"source": [ |
|
|
|
|
|
"import sys\n", |
|
|
|
|
|
"import networkx as nx\n", |
|
|
|
|
|
"sys.path.insert(0, \"../\")\n", |
|
|
|
|
|
"from pygraph.utils.graphfiles import loadDataset\n", |
|
|
|
|
|
"from pygraph.kernels.deltaKernel import deltaKernel\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n", |
|
|
|
|
|
"G1 = dataset[12]\n", |
|
|
|
|
|
"G2 = dataset[55]\n", |
|
|
|
|
|
"sp1 = []\n", |
|
|
|
|
|
"num_nodes = G1.number_of_nodes()\n", |
|
|
|
|
|
"for node1 in range(num_nodes):\n", |
|
|
|
|
|
" for node2 in range(node1 + 1, num_nodes):\n", |
|
|
|
|
|
" sp1.append(nx.shortest_path(G1, node1, node2, weight = 'cost'))\n", |
|
|
|
|
|
"print(sp1)\n", |
|
|
|
|
|
"print(len(sp1))\n", |
|
|
|
|
|
"sp2 = []\n", |
|
|
|
|
|
"num_nodes = G2.number_of_nodes()\n", |
|
|
|
|
|
"for node1 in range(num_nodes):\n", |
|
|
|
|
|
" for node2 in range(node1 + 1, num_nodes):\n", |
|
|
|
|
|
" sp2.append(nx.shortest_path(G2, node1, node2, weight = 'cost'))\n", |
|
|
|
|
|
"print(sp2)\n", |
|
|
|
|
|
"print(len(sp2))\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"kernel = 0\n", |
|
|
|
|
|
"for path1 in sp1:\n", |
|
|
|
|
|
" for path2 in sp2:\n", |
|
|
|
|
|
" if len(path1) == len(path2):\n", |
|
|
|
|
|
" kernel_path = deltaKernel(G1.node[path1[0]]['label'] == G2.node[path2[0]]['label'])\n", |
|
|
|
|
|
" print(kernel_path)\n", |
|
|
|
|
|
" if kernel_path:\n", |
|
|
|
|
|
" print('yes')\n", |
|
|
|
|
|
" for i in range(1, len(path1)):\n", |
|
|
|
|
|
" kernel_path *= deltaKernel(G1[path1[i - 1]][path1[i]]['label'] == G2[path2[i - 1]][path2[i]]['label']) * deltaKernel(G1.node[path1[i]]['label'] == G2.node[path2[i]]['label'])\n", |
|
|
|
|
|
" kernel += kernel_path\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
"kernel = kernel / (len(sp1) * len(sp2))\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"print(kernel)" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
|
|
|
"cell_type": "code", |
|
|
|
|
|
"execution_count": 30, |
|
|
|
|
|
"metadata": { |
|
|
|
|
|
"scrolled": false |
|
|
|
|
|
}, |
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"\n", |
|
|
|
|
|
"- This script take as input a kernel matrix\n", |
|
|
|
|
|
"and returns the classification or regression performance\n", |
|
|
|
|
|
"- The kernel matrix can be calculated using any of the graph kernels approaches\n", |
|
|
|
|
|
"- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n", |
|
|
|
|
|
"- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n", |
|
|
|
|
|
"then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n", |
|
|
|
|
|
"provide the corresponding performance on the test set. If more than one split is performed, the final results \n", |
|
|
|
|
|
"correspond to the average of the performances on the test sets. \n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"@references\n", |
|
|
|
|
|
" https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" --- This is a regression problem ---\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" Normalizing output y...\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" Loading the train set kernel matrix from file...\n", |
|
|
|
|
|
"[[ 0.15254237 0.08333333 0.0625 ..., 0.11363636 0.11363636\n", |
|
|
|
|
|
" 0.11363636]\n", |
|
|
|
|
|
" [ 0.08333333 0.18518519 0.15591398 ..., 0.16617791 0.16617791\n", |
|
|
|
|
|
" 0.16890214]\n", |
|
|
|
|
|
" [ 0.0625 0.15591398 0.15254237 ..., 0.12987013 0.12987013\n", |
|
|
|
|
|
" 0.13163636]\n", |
|
|
|
|
|
" ..., \n", |
|
|
|
|
|
" [ 0.11363636 0.16617791 0.12987013 ..., 0.26383753 0.2639004\n", |
|
|
|
|
|
" 0.26156557]\n", |
|
|
|
|
|
" [ 0.11363636 0.16617791 0.12987013 ..., 0.2639004 0.26396688\n", |
|
|
|
|
|
" 0.26162729]\n", |
|
|
|
|
|
" [ 0.11363636 0.16890214 0.13163636 ..., 0.26156557 0.26162729\n", |
|
|
|
|
|
" 0.25964592]]\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" Loading the test set kernel matrix from file...\n", |
|
|
|
|
|
"[[ 0.18518519 0.1715847 0.11111111 0.16588603 0.11904762 0.16450216\n", |
|
|
|
|
|
" 0.17281421 0.14285714 0.125 0.16477273 0.16880154 0.14583333\n", |
|
|
|
|
|
" 0.1660693 0.16906445 0.13333333 0.16612903 0.16420966 0.16441006\n", |
|
|
|
|
|
" 0.15151515]\n", |
|
|
|
|
|
" [ 0.1715847 0.19988118 0.15173333 0.18435596 0.16465263 0.21184723\n", |
|
|
|
|
|
" 0.18985964 0.19960191 0.16819723 0.21540115 0.19575264 0.2041482\n", |
|
|
|
|
|
" 0.21842419 0.20001664 0.18754969 0.2205599 0.20506165 0.22256445\n", |
|
|
|
|
|
" 0.2141792 ]\n", |
|
|
|
|
|
" [ 0.11111111 0.15173333 0.16303156 0.13416478 0.16903494 0.16960573\n", |
|
|
|
|
|
" 0.13862936 0.18511129 0.16989276 0.17395417 0.14762351 0.18709221\n", |
|
|
|
|
|
" 0.17706477 0.15293506 0.17970939 0.17975775 0.16082785 0.18295252\n", |
|
|
|
|
|
" 0.19186573]\n", |
|
|
|
|
|
" [ 0.16588603 0.18435596 0.13416478 0.17413923 0.14529511 0.19230449\n", |
|
|
|
|
|
" 0.17775828 0.17598858 0.14892223 0.19462663 0.18166555 0.17986029\n", |
|
|
|
|
|
" 0.1964604 0.18450695 0.16510376 0.19788853 0.1876399 0.19921541\n", |
|
|
|
|
|
" 0.18843419]\n", |
|
|
|
|
|
" [ 0.11904762 0.16465263 0.16903494 0.14529511 0.17703225 0.18464872\n", |
|
|
|
|
|
" 0.15002895 0.19785455 0.17779663 0.18950917 0.16010081 0.2005743\n", |
|
|
|
|
|
" 0.19306131 0.16599977 0.19113529 0.1960531 0.175064 0.19963794\n", |
|
|
|
|
|
" 0.20696464]\n", |
|
|
|
|
|
" [ 0.16450216 0.21184723 0.16960573 0.19230449 0.18464872 0.23269314\n", |
|
|
|
|
|
" 0.19681552 0.22450276 0.1871932 0.23765844 0.20733248 0.22967925\n", |
|
|
|
|
|
" 0.241199 0.21337314 0.21125341 0.24426963 0.22285333 0.24802555\n", |
|
|
|
|
|
" 0.24156669]\n", |
|
|
|
|
|
" [ 0.17281421 0.18985964 0.13862936 0.17775828 0.15002895 0.19681552\n", |
|
|
|
|
|
" 0.18309269 0.18152273 0.15411585 0.19935309 0.18641218 0.18556038\n", |
|
|
|
|
|
" 0.20169527 0.18946029 0.17030032 0.20320694 0.19192382 0.2042596\n", |
|
|
|
|
|
" 0.19428999]\n", |
|
|
|
|
|
" [ 0.14285714 0.19960191 0.18511129 0.17598858 0.19785455 0.22450276\n", |
|
|
|
|
|
" 0.18152273 0.23269314 0.20168735 0.23049584 0.19407926 0.23694176\n", |
|
|
|
|
|
" 0.23486084 0.20134404 0.22042984 0.23854906 0.21275711 0.24302959\n", |
|
|
|
|
|
" 0.24678197]\n", |
|
|
|
|
|
" [ 0.125 0.16819723 0.16989276 0.14892223 0.17779663 0.1871932\n", |
|
|
|
|
|
" 0.15411585 0.20168735 0.18391356 0.19188588 0.16365606 0.20428161\n", |
|
|
|
|
|
" 0.1952436 0.16940489 0.1919249 0.19815511 0.17760881 0.20152837\n", |
|
|
|
|
|
" 0.20988805]\n", |
|
|
|
|
|
" [ 0.16477273 0.21540115 0.17395417 0.19462663 0.18950917 0.23765844\n", |
|
|
|
|
|
" 0.19935309 0.23049584 0.19188588 0.24296859 0.21058278 0.23586086\n", |
|
|
|
|
|
" 0.24679036 0.21702635 0.21699483 0.25006701 0.22724646 0.25407837\n", |
|
|
|
|
|
" 0.24818625]\n", |
|
|
|
|
|
" [ 0.16880154 0.19575264 0.14762351 0.18166555 0.16010081 0.20733248\n", |
|
|
|
|
|
" 0.18641218 0.19407926 0.16365606 0.21058278 0.19214629 0.19842989\n", |
|
|
|
|
|
" 0.21317298 0.19609213 0.18225175 0.2151567 0.20088139 0.2171273\n", |
|
|
|
|
|
" 0.20810339]\n", |
|
|
|
|
|
" [ 0.14583333 0.2041482 0.18709221 0.17986029 0.2005743 0.22967925\n", |
|
|
|
|
|
" 0.18556038 0.23694176 0.20428161 0.23586086 0.19842989 0.24154885\n", |
|
|
|
|
|
" 0.24042054 0.20590264 0.22439219 0.24421452 0.21769149 0.24880304\n", |
|
|
|
|
|
" 0.25200246]\n", |
|
|
|
|
|
" [ 0.1660693 0.21842419 0.17706477 0.1964604 0.19306131 0.241199\n", |
|
|
|
|
|
" 0.20169527 0.23486084 0.1952436 0.24679036 0.21317298 0.24042054\n", |
|
|
|
|
|
" 0.25107069 0.21988195 0.22126548 0.25446921 0.23058896 0.25855949\n", |
|
|
|
|
|
" 0.25312182]\n", |
|
|
|
|
|
" [ 0.16906445 0.20001664 0.15293506 0.18450695 0.16599977 0.21337314\n", |
|
|
|
|
|
" 0.18946029 0.20134404 0.16940489 0.21702635 0.19609213 0.20590264\n", |
|
|
|
|
|
" 0.21988195 0.20052959 0.18917551 0.22212027 0.2061696 0.22441239\n", |
|
|
|
|
|
" 0.21607563]\n", |
|
|
|
|
|
" [ 0.13333333 0.18754969 0.17970939 0.16510376 0.19113529 0.21125341\n", |
|
|
|
|
|
" 0.17030032 0.22042984 0.1919249 0.21699483 0.18225175 0.22439219\n", |
|
|
|
|
|
" 0.22126548 0.18917551 0.2112185 0.224781 0.20021961 0.22904467\n", |
|
|
|
|
|
" 0.23356012]\n", |
|
|
|
|
|
" [ 0.16612903 0.2205599 0.17975775 0.19788853 0.1960531 0.24426963\n", |
|
|
|
|
|
" 0.20320694 0.23854906 0.19815511 0.25006701 0.2151567 0.24421452\n", |
|
|
|
|
|
" 0.25446921 0.22212027 0.224781 0.25800115 0.23326559 0.26226067\n", |
|
|
|
|
|
" 0.25717144]\n", |
|
|
|
|
|
" [ 0.16420966 0.20506165 0.16082785 0.1876399 0.175064 0.22285333\n", |
|
|
|
|
|
" 0.19192382 0.21275711 0.17760881 0.22724646 0.20088139 0.21769149\n", |
|
|
|
|
|
" 0.23058896 0.2061696 0.20021961 0.23326559 0.21442192 0.2364528\n", |
|
|
|
|
|
" 0.22891788]\n", |
|
|
|
|
|
" [ 0.16441006 0.22256445 0.18295252 0.19921541 0.19963794 0.24802555\n", |
|
|
|
|
|
" 0.2042596 0.24302959 0.20152837 0.25407837 0.2171273 0.24880304\n", |
|
|
|
|
|
" 0.25855949 0.22441239 0.22904467 0.26226067 0.2364528 0.26687384\n", |
|
|
|
|
|
" 0.26210305]\n", |
|
|
|
|
|
" [ 0.15151515 0.2141792 0.19186573 0.18843419 0.20696464 0.24156669\n", |
|
|
|
|
|
" 0.19428999 0.24678197 0.20988805 0.24818625 0.20810339 0.25200246\n", |
|
|
|
|
|
" 0.25312182 0.21607563 0.23356012 0.25717144 0.22891788 0.26210305\n", |
|
|
|
|
|
" 0.26386999]]\n" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
|
|
|
"ename": "ValueError", |
|
|
|
|
|
"evalue": "Precomputed metric requires shape (n_queries, n_indexed). Got (19, 19) for 164 indexed.", |
|
|
|
|
|
"output_type": "error", |
|
|
|
|
|
"traceback": [ |
|
|
|
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", |
|
|
|
|
|
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", |
|
|
|
|
|
"\u001b[0;32m<ipython-input-30-d4c5f46d5abf>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 134\u001b[0m \u001b[0;31m# predict on the test set\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 135\u001b[0;31m \u001b[0my_pred_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mKR\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mKmatrix_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 136\u001b[0m \u001b[0;31m# print(y_pred)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 137\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", |
|
|
|
|
|
"\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/kernel_ridge.py\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 182\u001b[0m \"\"\"\n\u001b[1;32m 183\u001b[0m \u001b[0mcheck_is_fitted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"X_fit_\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"dual_coef_\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 184\u001b[0;31m \u001b[0mK\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_kernel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mX_fit_\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 185\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mK\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdual_coef_\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", |
|
|
|
|
|
"\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/kernel_ridge.py\u001b[0m in \u001b[0;36m_get_kernel\u001b[0;34m(self, X, Y)\u001b[0m\n\u001b[1;32m 119\u001b[0m \"coef0\": self.coef0}\n\u001b[1;32m 120\u001b[0m return pairwise_kernels(X, Y, metric=self.kernel,\n\u001b[0;32m--> 121\u001b[0;31m filter_params=True, **params)\n\u001b[0m\u001b[1;32m 122\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", |
|
|
|
|
|
"\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/metrics/pairwise.py\u001b[0m in \u001b[0;36mpairwise_kernels\u001b[0;34m(X, Y, metric, filter_params, n_jobs, **kwds)\u001b[0m\n\u001b[1;32m 1389\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1390\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmetric\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"precomputed\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1391\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_pairwise_arrays\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprecomputed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1392\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1393\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmetric\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mGPKernel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", |
|
|
|
|
|
"\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/metrics/pairwise.py\u001b[0m in \u001b[0;36mcheck_pairwise_arrays\u001b[0;34m(X, Y, precomputed, dtype)\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[0;34m\"(n_queries, n_indexed). Got (%d, %d) \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[0;34m\"for %d indexed.\"\u001b[0m \u001b[0;34m%\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 119\u001b[0;31m (X.shape[0], X.shape[1], Y.shape[0]))\n\u001b[0m\u001b[1;32m 120\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 121\u001b[0m raise ValueError(\"Incompatible dimension for X and Y matrices: \"\n", |
|
|
|
|
|
"\u001b[0;31mValueError\u001b[0m: Precomputed metric requires shape (n_queries, n_indexed). Got (19, 19) for 164 indexed." |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
|
|
|
"source": [ |
|
|
|
|
|
"# Author: Elisabetta Ghisu\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"\"\"\"\n", |
|
|
|
|
|
"- This script takes as input a kernel matrix\n", |
|
|
|
|
|
"and returns the classification or regression performance\n", |
|
|
|
|
|
"- The kernel matrix can be calculated using any of the graph kernels approaches\n", |
|
|
|
|
|
"- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n", |
|
|
|
|
|
"- For prediction we divide the data into training, validation and test sets. For each split, we first train on the train data, \n", |
|
|
|
|
|
"then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n", |
|
|
|
|
|
"provide the corresponding performance on the test set. If more than one split is performed, the final results \n", |
|
|
|
|
|
"correspond to the average of the performances on the test sets. \n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"@references\n", |
|
|
|
|
|
" https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n", |
|
|
|
|
|
"\"\"\"\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"print(__doc__)\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"import sys\n", |
|
|
|
|
|
"import pathlib\n", |
|
|
|
|
|
"import os\n", |
|
|
|
|
|
"sys.path.insert(0, \"../\")\n", |
|
|
|
|
|
"from tabulate import tabulate\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"import random\n", |
|
|
|
|
|
"import numpy as np\n", |
|
|
|
|
|
"import matplotlib.pyplot as plt\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"from sklearn.kernel_ridge import KernelRidge # 0.17\n", |
|
|
|
|
|
"from sklearn.metrics import accuracy_score, mean_squared_error\n", |
|
|
|
|
|
"from sklearn import svm\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"from pygraph.kernels.pathKernel import pathKernel\n", |
|
|
|
|
|
"from pygraph.utils.graphfiles import loadDataset\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"# print('\\n Loading dataset from file...')\n", |
|
|
|
|
|
"# dataset, y = loadDataset(\"/home/ljia/Documents/research-repo/datasets/acyclic/Acyclic/dataset_bps.ds\")\n", |
|
|
|
|
|
"# y = np.array(y)\n", |
|
|
|
|
|
"# print(y)\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"# kernel_file_path = 'marginalizedkernelmatrix.ds'\n", |
|
|
|
|
|
"# path = pathlib.Path(kernel_file_path)\n", |
|
|
|
|
|
"# if path.is_file():\n", |
|
|
|
|
|
"# print('\\n Loading the matrix from file...')\n", |
|
|
|
|
|
"# Kmatrix = np.loadtxt(kernel_file_path)\n", |
|
|
|
|
|
"# print(Kmatrix)\n", |
|
|
|
|
|
"# else:\n", |
|
|
|
|
|
"# print('\\n Calculating kernel matrix, this could take a while...')\n", |
|
|
|
|
|
"# Kmatrix = marginalizeKernel(dataset)\n", |
|
|
|
|
|
"# print(Kmatrix)\n", |
|
|
|
|
|
"# print('Saving kernel matrix to file...')\n", |
|
|
|
|
|
"# np.savetxt(kernel_file_path, Kmatrix)\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"# setup the parameters\n", |
|
|
|
|
|
"model_type = 'regression' # Regression or classification problem\n", |
|
|
|
|
|
"print('\\n --- This is a %s problem ---' % model_type)\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"# datasize = len(dataset)\n", |
|
|
|
|
|
"trials = 100 # Trials for hyperparameters random search\n", |
|
|
|
|
|
"splits = 100 # Number of splits of the data\n", |
|
|
|
|
|
"alpha_grid = np.linspace(0.01, 100, num = trials) # corresponds to (2*C)^-1 in other linear models such as LogisticRegression\n", |
|
|
|
|
|
"# C_grid = np.linspace(0.0001, 10, num = trials)\n", |
|
|
|
|
|
"random.seed(20) # Set the seed for uniform parameter distribution\n", |
|
|
|
|
|
"data_dir = '/home/ljia/Documents/research-repo/datasets/acyclic/Acyclic/'\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"# set the output path\n", |
|
|
|
|
|
"kernel_file_path = 'kernelmatrices_marginalized_acyclic/'\n", |
|
|
|
|
|
"if not os.path.exists(kernel_file_path):\n", |
|
|
|
|
|
" os.makedirs(kernel_file_path)\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"\"\"\"\n", |
|
|
|
|
|
"- Here starts the main program\n", |
|
|
|
|
|
"- First we permute the data, then for each split we evaluate corresponding performances\n", |
|
|
|
|
|
"- In the end, the performances are averaged over the test sets\n", |
|
|
|
|
|
"\"\"\"\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"# Initialize the performance of the best parameter trial on validation with the corresponding performance on test\n", |
|
|
|
|
|
"val_split = []\n", |
|
|
|
|
|
"test_split = []\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"p_quit = 0.5\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"# for each split of the data\n", |
|
|
|
|
|
"for j in range(10):\n", |
|
|
|
|
|
" dataset_train, y_train = loadDataset(data_dir + 'trainset_' + str(j) + '.ds')\n", |
|
|
|
|
|
" dataset_test, y_test = loadDataset(data_dir + 'testset_' + str(j) + '.ds')\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" # Normalization step (for real valued targets only)\n", |
|
|
|
|
|
" if model_type == 'regression':\n", |
|
|
|
|
|
" print('\\n Normalizing output y...')\n", |
|
|
|
|
|
" y_train_mean = np.mean(y_train)\n", |
|
|
|
|
|
" y_train_std = np.std(y_train)\n", |
|
|
|
|
|
" y_train = (y_train - y_train_mean) / float(y_train_std)\n", |
|
|
|
|
|
"# print(y)\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" # save kernel matrices to files / read kernel matrices from files\n", |
|
|
|
|
|
" kernel_file_train = kernel_file_path + 'train' + str(j) + '_pquit_' + str(p_quit)\n", |
|
|
|
|
|
" kernel_file_test = kernel_file_path + 'test' + str(j) + '_pquit_' + str(p_quit)\n", |
|
|
|
|
|
" path_train = pathlib.Path(kernel_file_train)\n", |
|
|
|
|
|
" path_test = pathlib.Path(kernel_file_test)\n", |
|
|
|
|
|
" # get train set kernel matrix\n", |
|
|
|
|
|
" if path_train.is_file():\n", |
|
|
|
|
|
" print('\\n Loading the train set kernel matrix from file...')\n", |
|
|
|
|
|
" Kmatrix_train = np.loadtxt(kernel_file_train)\n", |
|
|
|
|
|
" print(Kmatrix_train)\n", |
|
|
|
|
|
" else:\n", |
|
|
|
|
|
" print('\\n Calculating train set kernel matrix, this could take a while...')\n", |
|
|
|
|
|
" Kmatrix_train = marginalizedkernel(dataset_train, p_quit, 20)\n", |
|
|
|
|
|
" print(Kmatrix_train)\n", |
|
|
|
|
|
" print('\\n Saving train set kernel matrix to file...')\n", |
|
|
|
|
|
" np.savetxt(kernel_file_train, Kmatrix_train)\n", |
|
|
|
|
|
" # get test set kernel matrix\n", |
|
|
|
|
|
" if path_test.is_file():\n", |
|
|
|
|
|
" print('\\n Loading the test set kernel matrix from file...')\n", |
|
|
|
|
|
" Kmatrix_test = np.loadtxt(kernel_file_test)\n", |
|
|
|
|
|
" print(Kmatrix_test)\n", |
|
|
|
|
|
" else:\n", |
|
|
|
|
|
" print('\\n Calculating test set kernel matrix, this could take a while...')\n", |
|
|
|
|
|
" Kmatrix_test = marginalizedkernel(dataset_test, p_quit, 20)\n", |
|
|
|
|
|
" print(Kmatrix_test)\n", |
|
|
|
|
|
" print('\\n Saving test set kernel matrix to file...')\n", |
|
|
|
|
|
" np.savetxt(kernel_file_test, Kmatrix_test)\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" # For each parameter trial\n", |
|
|
|
|
|
" for i in range(trials):\n", |
|
|
|
|
|
" # For regression use the Kernel Ridge method\n", |
|
|
|
|
|
" if model_type == 'regression':\n", |
|
|
|
|
|
" # print('\\n Starting experiment for trial %d and parameter alpha = %3f\\n ' % (i, alpha_grid[i]))\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" # Fit the kernel ridge model\n", |
|
|
|
|
|
" KR = KernelRidge(kernel = 'precomputed', alpha = alpha_grid[i])\n", |
|
|
|
|
|
" KR.fit(Kmatrix_train, y_train)\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" # predict on the test set\n", |
|
|
|
|
|
" y_pred_test = KR.predict(Kmatrix_test)\n", |
|
|
|
|
|
" # print(y_pred)\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" # adjust prediction: needed because the training targets have been normalized\n", |
|
|
|
|
|
" y_pred_test = y_pred_test * float(y_train_std) + y_train_mean\n", |
|
|
|
|
|
" # print(y_pred_test)\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" # root mean squared error in test \n", |
|
|
|
|
|
" rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n", |
|
|
|
|
|
" perf_all_test.append(rmse_test)\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" # print('The performance on the validation set is: %3f' % rmse)\n", |
|
|
|
|
|
" # print('The performance on the test set is: %3f' % rmse_test)\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" # --- FIND THE OPTIMAL PARAMETERS --- #\n", |
|
|
|
|
|
" # For regression: minimise the mean squared error\n", |
|
|
|
|
|
" if model_type == 'regression':\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" # get optimal parameter on test (argmin mean squared error)\n", |
|
|
|
|
|
" min_idx = np.argmin(perf_all_test)\n", |
|
|
|
|
|
" alpha_opt = alpha_grid[min_idx]\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" # corresponding performance on test for the same parameter\n", |
|
|
|
|
|
" perf_test_opt = perf_all_test[min_idx]\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" print('The best performance is for trial %d with parameter alpha = %3f' % (min_idx, alpha_opt))\n", |
|
|
|
|
|
" print('The corresponding performance on test set is: %3f' % perf_test_opt)\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"# For each split of the data\n", |
|
|
|
|
|
"for j in range(10, 10 + splits):\n", |
|
|
|
|
|
" print('Starting split %d...' % j)\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"    # Set the random seed for data permutation\n", |
|
|
|
|
|
" random_state = int(j)\n", |
|
|
|
|
|
" np.random.seed(random_state)\n", |
|
|
|
|
|
" idx_perm = np.random.permutation(datasize)\n", |
|
|
|
|
|
"# print(idx_perm)\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" # Permute the data\n", |
|
|
|
|
|
" y_perm = y[idx_perm] # targets permutation\n", |
|
|
|
|
|
"# print(y_perm)\n", |
|
|
|
|
|
" Kmatrix_perm = Kmatrix[:, idx_perm] # inputs permutation\n", |
|
|
|
|
|
"# print(Kmatrix_perm)\n", |
|
|
|
|
|
" Kmatrix_perm = Kmatrix_perm[idx_perm, :] # inputs permutation\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
"    # Set the training, validation and test set sizes\n", |
|
|
|
|
|
" # Note: the percentage can be set up by the user\n", |
|
|
|
|
|
" num_train_val = int((datasize * 90) / 100) # 90% (of entire dataset) for training and validation\n", |
|
|
|
|
|
" num_test = datasize - num_train_val # 10% (of entire dataset) for test\n", |
|
|
|
|
|
" num_train = int((num_train_val * 90) / 100) # 90% (of train + val) for training\n", |
|
|
|
|
|
" num_val = num_train_val - num_train # 10% (of train + val) for validation\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" # Split the kernel matrix\n", |
|
|
|
|
|
" Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]\n", |
|
|
|
|
|
" Kmatrix_val = Kmatrix_perm[num_train:(num_train + num_val), 0:num_train]\n", |
|
|
|
|
|
" Kmatrix_test = Kmatrix_perm[(num_train + num_val):datasize, 0:num_train]\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" # Split the targets\n", |
|
|
|
|
|
" y_train = y_perm[0:num_train]\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" # Normalization step (for real valued targets only)\n", |
|
|
|
|
|
" print('\\n Normalizing output y...')\n", |
|
|
|
|
|
" if model_type == 'regression':\n", |
|
|
|
|
|
" y_train_mean = np.mean(y_train)\n", |
|
|
|
|
|
" y_train_std = np.std(y_train)\n", |
|
|
|
|
|
" y_train = (y_train - y_train_mean) / float(y_train_std)\n", |
|
|
|
|
|
"# print(y)\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" y_val = y_perm[num_train:(num_train + num_val)]\n", |
|
|
|
|
|
" y_test = y_perm[(num_train + num_val):datasize]\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" # Record the performance for each parameter trial respectively on validation and test set\n", |
|
|
|
|
|
" perf_all_val = []\n", |
|
|
|
|
|
" perf_all_test = []\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" " |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
|
|
|
"cell_type": "code", |
|
|
|
|
|
"execution_count": null, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"outputs": [], |
|
|
|
|
|
"source": [] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
|
|
|
"metadata": { |
|
|
|
|
|
"kernelspec": { |
|
|
|
|
|
"display_name": "Python 3", |
|
|
|
|
|
"language": "python", |
|
|
|
|
|
"name": "python3" |
|
|
|
|
|
}, |
|
|
|
|
|
"language_info": { |
|
|
|
|
|
"codemirror_mode": { |
|
|
|
|
|
"name": "ipython", |
|
|
|
|
|
"version": 3 |
|
|
|
|
|
}, |
|
|
|
|
|
"file_extension": ".py", |
|
|
|
|
|
"mimetype": "text/x-python", |
|
|
|
|
|
"name": "python", |
|
|
|
|
|
"nbconvert_exporter": "python", |
|
|
|
|
|
"pygments_lexer": "ipython3", |
|
|
|
|
|
"version": "3.5.2" |
|
|
|
|
|
} |
|
|
|
|
|
}, |
|
|
|
|
|
"nbformat": 4, |
|
|
|
|
|
"nbformat_minor": 2 |
|
|
|
|
|
} |
|
|
|