Browse Source

remove unnecessary files.

v0.1
jajupmochi 7 years ago
parent
commit
b683eb9e5e
38 changed files with 18581 additions and 37269 deletions
  1. +4
    -0
      .gitignore
  2. +0
    -2015
      .ipynb_checkpoints/run_weisfeilerLehmankernel_acyclic-checkpoint.ipynb
  3. +0
    -3388
      notebooks/.ipynb_checkpoints/plot_all_graphs-checkpoint.ipynb
  4. +0
    -170
      notebooks/.ipynb_checkpoints/py-graph_test-checkpoint.ipynb
  5. +0
    -2305
      notebooks/.ipynb_checkpoints/run_cyclicpatternkernel-checkpoint.ipynb
  6. +0
    -1075
      notebooks/.ipynb_checkpoints/run_marginalizedkernel_acyclic-checkpoint.ipynb
  7. +0
    -665
      notebooks/.ipynb_checkpoints/run_pathkernel_acyclic-checkpoint.ipynb
  8. +0
    -559
      notebooks/.ipynb_checkpoints/run_spkernel_acyclic-checkpoint.ipynb
  9. +0
    -641
      notebooks/.ipynb_checkpoints/run_treeletkernel_acyclic-checkpoint.ipynb
  10. +0
    -3191
      notebooks/.ipynb_checkpoints/run_treepatternkernel-checkpoint.ipynb
  11. +0
    -3183
      notebooks/.ipynb_checkpoints/run_untildpathkernel_acyclic-checkpoint.ipynb
  12. +0
    -12297
      notebooks/.ipynb_checkpoints/run_untilnwalkkernel-checkpoint.ipynb
  13. +0
    -1755
      notebooks/.ipynb_checkpoints/run_weisfeilerLehmankernel_acyclic-checkpoint.ipynb
  14. +0
    -175
      notebooks/.ipynb_checkpoints/test_lib-checkpoint.ipynb
  15. +0
    -2236
      notebooks/.ipynb_checkpoints/test_modelselection-checkpoint.ipynb
  16. +0
    -1271
      notebooks/.ipynb_checkpoints/test_scikit_ksvm-checkpoint.ipynb
  17. +0
    -1136
      notebooks/.ipynb_checkpoints/test_spkernel-checkpoint.ipynb
  18. +56
    -36
      notebooks/run_treeletkernel_acyclic.ipynb
  19. +17880
    -1
      notebooks/run_treepatternkernel.ipynb
  20. +43
    -618
      notebooks/run_untilnwalkkernel.ipynb
  21. +500
    -497
      notebooks/run_weisfeilerLehmankernel_acyclic.ipynb
  22. BIN
      pygraph/__pycache__/__init__.cpython-35.pyc
  23. BIN
      pygraph/kernels/__pycache__/cyclicPatternKernel.cpython-35.pyc
  24. BIN
      pygraph/kernels/__pycache__/deltaKernel.cpython-35.pyc
  25. BIN
      pygraph/kernels/__pycache__/marginalizedKernel.cpython-35.pyc
  26. BIN
      pygraph/kernels/__pycache__/pathKernel.cpython-35.pyc
  27. BIN
      pygraph/kernels/__pycache__/spKernel.cpython-35.pyc
  28. BIN
      pygraph/kernels/__pycache__/spkernel.cpython-35.pyc
  29. BIN
      pygraph/kernels/__pycache__/treePatternKernel.cpython-35.pyc
  30. BIN
      pygraph/kernels/__pycache__/treeletKernel.cpython-35.pyc
  31. BIN
      pygraph/kernels/__pycache__/untildPathKernel.cpython-35.pyc
  32. BIN
      pygraph/kernels/__pycache__/untilnWalkKernel.cpython-35.pyc
  33. BIN
      pygraph/kernels/__pycache__/weisfeilerLehmanKernel.cpython-35.pyc
  34. BIN
      pygraph/utils/__pycache__/__init__.cpython-35.pyc
  35. BIN
      pygraph/utils/__pycache__/graphfiles.cpython-35.pyc
  36. BIN
      pygraph/utils/__pycache__/model_selection_precomputed.cpython-35.pyc
  37. BIN
      pygraph/utils/__pycache__/utils.cpython-35.pyc
  38. +98
    -55
      pygraph/utils/model_selection_precomputed.py

+ 4
- 0
.gitignore View File

@@ -0,0 +1,4 @@
# Jupyter Notebook
.ipynb_checkpoints

__pycache__

+ 0
- 2015
.ipynb_checkpoints/run_weisfeilerLehmankernel_acyclic-checkpoint.ipynb
File diff suppressed because it is too large
View File


+ 0
- 3388
notebooks/.ipynb_checkpoints/plot_all_graphs-checkpoint.ipynb
File diff suppressed because it is too large
View File


+ 0
- 170
notebooks/.ipynb_checkpoints/py-graph_test-checkpoint.ipynb View File

@@ -1,170 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"autoscroll": false,
"ein.tags": "worksheet-0",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"import numpy as np\n",
"import paths\n",
"\n",
"import pygraph\n",
"\n",
"from pygraph.utils.graphfiles import loadDataset\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"autoscroll": false,
"ein.tags": "worksheet-0",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"import networkx as nx\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# We load a ds dataset\n",
"# load it from https://brunl01.users.greyc.fr/CHEMISTRY/Acyclic.tar.gz\n",
"dataset, y = loadDataset(\"/home/bgauzere/work/Datasets/Acyclic/dataset_bps.ds\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"autoscroll": false,
"ein.tags": "worksheet-0",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 183/183 [07:41<00:00, 2.52s/it]\n",
"100%|██████████| 183/183 [08:39<00:00, 2.84s/it]\n",
"100%|██████████| 183/183 [05:19<00:00, 1.75s/it]\n",
"100%|██████████| 183/183 [05:50<00:00, 1.91s/it]\n"
]
}
],
"source": [
"#Compute graph edit distances\n",
"\n",
"from tqdm import tqdm\n",
"from pygraph.c_ext.lsape_binders import lsap_solverHG\n",
"from pygraph.ged.costfunctions import ConstantCostFunction\n",
"from pygraph.ged.GED import ged\n",
"import time\n",
"\n",
"cf = ConstantCostFunction(1,3,1,3)\n",
"N=len(dataset)\n",
"\n",
"methods=['Riesen + LSAP', 'Neigh + LSAP', 'Riesen + LSAPE', 'Neigh + LSAPE']\n",
"ged_distances = [ np.zeros((N,N)), np.zeros((N,N)), np.zeros((N,N)), np.zeros((N,N))]\n",
"\n",
"times = list()\n",
"start = time.clock()\n",
"for i in tqdm(range(0,N)):\n",
" for j in range(0,N):\n",
" ged_distances[0][i,j] = ged(dataset[i],dataset[j],cf=cf, method='Riesen')[0]\n",
"times.append(time.clock() - start)\n",
"\n",
"\n",
"start = time.clock()\n",
"for i in tqdm(range(0,N)):\n",
" for j in range(0,N):\n",
" ged_distances[1][i,j] = ged(dataset[i],dataset[j],cf=cf, method='Neighboorhood')[0]\n",
"\n",
"times.append(time.clock() - start)\n",
"\n",
"start = time.clock()\n",
"for i in tqdm(range(0,N)):\n",
" for j in range(0,N):\n",
" ged_distances[2][i,j] = ged(dataset[i],dataset[j],cf=cf, method='Riesen',solver=lsap_solverHG)[0]\n",
"times.append(time.clock() - start)\n",
"\n",
"start = time.clock()\n",
"for i in tqdm(range(0,N)):\n",
" for j in range(0,N):\n",
" ged_distances[3][i,j] = ged(dataset[i],dataset[j],cf=cf, method='Neighboorhood',solver=lsap_solverHG)[0]\n",
"times.append(time.clock() - start)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"autoscroll": false,
"ein.tags": "worksheet-0",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" method \t mean \t mean \t time\n",
" Riesen + LSAP \t 37.79903849025053 \t 35.31207262086058 \t 463.300405 \n",
" Neigh + LSAP \t 36.2281047508137 \t 33.85869987159963 \t 521.7821730000001 \n",
" Riesen + LSAPE \t 35.95508973095643 \t 34.10092866314312 \t 319.83455500000014 \n",
" Neigh + LSAPE \t 34.5005822807489 \t 32.5735614679447 \t 350.48029599999995 \n"
]
}
],
"source": [
"print(\" method \\t mean \\t mean \\t time\")\n",
"data = list()\n",
"for i in range(0,len(ged_distances)):\n",
" ged_ = np.minimum(ged_distances[i],ged_distances[i].transpose())\n",
" print(\" {} \\t {} \\t {} \\t {} \".format(methods[i], np.mean(ged_distances[i]),np.mean(ged_), times[i]))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
},
"name": "py-graph_test.ipynb"
},
"nbformat": 4,
"nbformat_minor": 2
}

+ 0
- 2305
notebooks/.ipynb_checkpoints/run_cyclicpatternkernel-checkpoint.ipynb
File diff suppressed because it is too large
View File


+ 0
- 1075
notebooks/.ipynb_checkpoints/run_marginalizedkernel_acyclic-checkpoint.ipynb
File diff suppressed because it is too large
View File


+ 0
- 665
notebooks/.ipynb_checkpoints/run_pathkernel_acyclic-checkpoint.ipynb View File

@@ -1,665 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The line_profiler extension is already loaded. To reload it, use:\n",
" %reload_ext line_profiler\n",
"\n",
" --- This is a regression problem ---\n",
"\n",
"\n",
" Loading dataset from file...\n",
"\n",
" Calculating kernel matrix, this could take a while...\n",
"\n",
" --- mean average path kernel matrix of size 185 built in 29.430902242660522 seconds ---\n",
"[[ 0.55555556 0.22222222 0. ..., 0. 0. 0. ]\n",
" [ 0.22222222 0.27777778 0. ..., 0. 0. 0. ]\n",
" [ 0. 0. 0.55555556 ..., 0.03030303 0.03030303\n",
" 0.03030303]\n",
" ..., \n",
" [ 0. 0. 0.03030303 ..., 0.08297521 0.05553719\n",
" 0.05256198]\n",
" [ 0. 0. 0.03030303 ..., 0.05553719 0.07239669\n",
" 0.0538843 ]\n",
" [ 0. 0. 0.03030303 ..., 0.05256198 0.0538843\n",
" 0.07438017]]\n",
"\n",
" Saving kernel matrix to file...\n",
"\n",
" Mean performance on train set: 3.619948\n",
"With standard deviation: 0.512351\n",
"\n",
" Mean performance on test set: 18.418852\n",
"With standard deviation: 10.781119\n",
"\n",
"\n",
" rmse_test std_test rmse_train std_train k_time\n",
"----------- ---------- ------------ ----------- --------\n",
" 18.4189 10.7811 3.61995 0.512351 29.4309\n"
]
}
],
"source": [
"%load_ext line_profiler\n",
"\n",
"import sys\n",
"sys.path.insert(0, \"../\")\n",
"from pygraph.utils.utils import kernel_train_test\n",
"from pygraph.kernels.pathKernel import pathkernel, _pathkernel_do\n",
"\n",
"datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds'\n",
"kernel_file_path = 'kernelmatrices_path_acyclic/'\n",
"\n",
"kernel_para = dict(node_label = 'atom', edge_label = 'bond_type')\n",
"\n",
"kernel_train_test(datafile, kernel_file_path, pathkernel, kernel_para, normalize = False)\n",
"\n",
"# %lprun -f _pathkernel_do \\\n",
"# kernel_train_test(datafile, kernel_file_path, pathkernel, kernel_para, normalize = False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# results\n",
"\n",
"# with y normalization\n",
" RMSE_test std_test RMSE_train std_train k_time\n",
"----------- ---------- ------------ ----------- --------\n",
" 14.0015 6.93602 3.76191 0.702594 37.5759\n",
"\n",
"# without y normalization\n",
" RMSE_test std_test RMSE_train std_train k_time\n",
"----------- ---------- ------------ ----------- --------\n",
" 18.4189 10.7811 3.61995 0.512351 29.4309"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"- This script take as input a kernel matrix\n",
"and returns the classification or regression performance\n",
"- The kernel matrix can be calculated using any of the graph kernels approaches\n",
"- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
"- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
"then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
"provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
"correspond to the average of the performances on the test sets. \n",
"\n",
"@references\n",
" https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
"\n"
]
},
{
"ename": "IndentationError",
"evalue": "unindent does not match any outer indentation level (utils.py, line 106)",
"output_type": "error",
"traceback": [
"Traceback \u001b[0;36m(most recent call last)\u001b[0m:\n",
" File \u001b[1;32m\"/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py\"\u001b[0m, line \u001b[1;32m2910\u001b[0m, in \u001b[1;35mrun_code\u001b[0m\n exec(code_obj, self.user_global_ns, self.user_ns)\n",
"\u001b[0;36m File \u001b[0;32m\"<ipython-input-1-0b5b9ebb5cc4>\"\u001b[0;36m, line \u001b[0;32m31\u001b[0;36m, in \u001b[0;35m<module>\u001b[0;36m\u001b[0m\n\u001b[0;31m from pygraph.utils.utils import split_train_test\u001b[0m\n",
"\u001b[0;36m File \u001b[0;32m\"../pygraph/utils/utils.py\"\u001b[0;36m, line \u001b[0;32m106\u001b[0m\n\u001b[0;31m train_means_list = []\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m unindent does not match any outer indentation level\n"
]
}
],
"source": [
"# Author: Elisabetta Ghisu\n",
"\n",
"\"\"\"\n",
"- This script take as input a kernel matrix\n",
"and returns the classification or regression performance\n",
"- The kernel matrix can be calculated using any of the graph kernels approaches\n",
"- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
"- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
"then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
"provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
"correspond to the average of the performances on the test sets. \n",
"\n",
"@references\n",
" https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
"\"\"\"\n",
"\n",
"print(__doc__)\n",
"\n",
"import sys\n",
"import os\n",
"import pathlib\n",
"from collections import OrderedDict\n",
"sys.path.insert(0, \"../\")\n",
"from tabulate import tabulate\n",
"\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from pygraph.kernels.pathKernel import pathkernel\n",
"from pygraph.utils.graphfiles import loadDataset\n",
"from pygraph.utils.utils import split_train_test\n",
"\n",
"train_means_list = []\n",
"train_stds_list = []\n",
"test_means_list = []\n",
"test_stds_list = []\n",
"kernel_time_list = []\n",
"\n",
"print('\\n Loading dataset from file...')\n",
"dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
"y = np.array(y)\n",
"print(y)\n",
"\n",
"# setup the parameters\n",
"model_type = 'regression' # Regression or classification problem\n",
"print('\\n --- This is a %s problem ---' % model_type)\n",
"\n",
"trials = 100 # Trials for hyperparameters random search\n",
"splits = 10 # Number of splits of the data\n",
"alpha_grid = np.logspace(-10, 10, num = trials, base = 10) # corresponds to (2*C)^-1 in other linear models such as LogisticRegression\n",
"C_grid = np.logspace(-10, 10, num = trials, base = 10)\n",
"\n",
"# set the output path\n",
"kernel_file_path = 'kernelmatrices_path_acyclic/'\n",
"if not os.path.exists(kernel_file_path):\n",
" os.makedirs(kernel_file_path)\n",
"\n",
"\"\"\"\n",
"- Here starts the main program\n",
"- First we permute the data, then for each split we evaluate corresponding performances\n",
"- In the end, the performances are averaged over the test sets\n",
"\"\"\"\n",
"\n",
"# save kernel matrices to files / read kernel matrices from files\n",
"kernel_file = kernel_file_path + 'km.ds'\n",
"path = pathlib.Path(kernel_file)\n",
"# get train set kernel matrix\n",
"if path.is_file():\n",
" print('\\n Loading the kernel matrix from file...')\n",
" Kmatrix = np.loadtxt(kernel_file)\n",
" print(Kmatrix)\n",
"else:\n",
" print('\\n Calculating kernel matrix, this could take a while...')\n",
" Kmatrix, run_time = pathkernel(dataset, node_label = 'atom', edge_label = 'bond_type')\n",
" kernel_time_list.append(run_time)\n",
" print(Kmatrix)\n",
" print('\\n Saving kernel matrix to file...')\n",
"# np.savetxt(kernel_file, Kmatrix)\n",
" \n",
"train_mean, train_std, test_mean, test_std = \\\n",
" split_train_test(Kmatrix, y, alpha_grid, C_grid, splits, trials, model_type, normalize = True)\n",
" \n",
"train_means_list.append(train_mean)\n",
"train_stds_list.append(train_std)\n",
"test_means_list.append(test_mean)\n",
"test_stds_list.append(test_std)\n",
" \n",
"print('\\n') \n",
"table_dict = {'RMSE_test': test_means_list, 'std_test': test_stds_list, \\\n",
" 'RMSE_train': train_means_list, 'std_train': train_stds_list, 'k_time': kernel_time_list}\n",
"keyorder = ['RMSE_test', 'std_test', 'RMSE_train', 'std_train', 'k_time']\n",
"print(tabulate(OrderedDict(sorted(table_dict.items(), key = lambda i:keyorder.index(i[0]))), headers='keys'))"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "ImportError",
"evalue": "cannot import name 'deltaKernel'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-1-51fa7de99690>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minsert\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"../\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpygraph\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgraphfiles\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mloadDataset\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mpygraph\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkernels\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdeltaKernel\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mdeltaKernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mdataset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mloadDataset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mImportError\u001b[0m: cannot import name 'deltaKernel'"
]
}
],
"source": [
"import sys\n",
"import networkx as nx\n",
"sys.path.insert(0, \"../\")\n",
"from pygraph.utils.graphfiles import loadDataset\n",
"from pygraph.kernels.deltaKernel import deltaKernel\n",
"\n",
"dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
"G1 = dataset[12]\n",
"G2 = dataset[55]\n",
"sp1 = []\n",
"num_nodes = G1.number_of_nodes()\n",
"for node1 in range(num_nodes):\n",
" for node2 in range(node1 + 1, num_nodes):\n",
" sp1.append(nx.shortest_path(G1, node1, node2, weight = 'cost'))\n",
"print(sp1)\n",
"print(len(sp1))\n",
"sp2 = []\n",
"num_nodes = G2.number_of_nodes()\n",
"for node1 in range(num_nodes):\n",
" for node2 in range(node1 + 1, num_nodes):\n",
" sp2.append(nx.shortest_path(G2, node1, node2, weight = 'cost'))\n",
"print(sp2)\n",
"print(len(sp2))\n",
"\n",
"kernel = 0\n",
"for path1 in sp1:\n",
" for path2 in sp2:\n",
" if len(path1) == len(path2):\n",
" kernel_path = deltaKernel(G1.node[path1[0]]['label'] == G2.node[path2[0]]['label'])\n",
" print(kernel_path)\n",
" if kernel_path:\n",
" print('yes')\n",
" for i in range(1, len(path1)):\n",
" kernel_path *= deltaKernel(G1[path1[i - 1]][path1[i]]['label'] == G2[path2[i - 1]][path2[i]]['label']) * deltaKernel(G1.node[path1[i]]['label'] == G2.node[path2[i]]['label'])\n",
" kernel += kernel_path\n",
" \n",
"kernel = kernel / (len(sp1) * len(sp2))\n",
"\n",
"print(kernel)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"- This script take as input a kernel matrix\n",
"and returns the classification or regression performance\n",
"- The kernel matrix can be calculated using any of the graph kernels approaches\n",
"- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
"- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
"then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
"provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
"correspond to the average of the performances on the test sets. \n",
"\n",
"@references\n",
" https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
"\n",
"\n",
" --- This is a regression problem ---\n",
"\n",
" Normalizing output y...\n",
"\n",
" Loading the train set kernel matrix from file...\n",
"[[ 0.15254237 0.08333333 0.0625 ..., 0.11363636 0.11363636\n",
" 0.11363636]\n",
" [ 0.08333333 0.18518519 0.15591398 ..., 0.16617791 0.16617791\n",
" 0.16890214]\n",
" [ 0.0625 0.15591398 0.15254237 ..., 0.12987013 0.12987013\n",
" 0.13163636]\n",
" ..., \n",
" [ 0.11363636 0.16617791 0.12987013 ..., 0.26383753 0.2639004\n",
" 0.26156557]\n",
" [ 0.11363636 0.16617791 0.12987013 ..., 0.2639004 0.26396688\n",
" 0.26162729]\n",
" [ 0.11363636 0.16890214 0.13163636 ..., 0.26156557 0.26162729\n",
" 0.25964592]]\n",
"\n",
" Loading the test set kernel matrix from file...\n",
"[[ 0.18518519 0.1715847 0.11111111 0.16588603 0.11904762 0.16450216\n",
" 0.17281421 0.14285714 0.125 0.16477273 0.16880154 0.14583333\n",
" 0.1660693 0.16906445 0.13333333 0.16612903 0.16420966 0.16441006\n",
" 0.15151515]\n",
" [ 0.1715847 0.19988118 0.15173333 0.18435596 0.16465263 0.21184723\n",
" 0.18985964 0.19960191 0.16819723 0.21540115 0.19575264 0.2041482\n",
" 0.21842419 0.20001664 0.18754969 0.2205599 0.20506165 0.22256445\n",
" 0.2141792 ]\n",
" [ 0.11111111 0.15173333 0.16303156 0.13416478 0.16903494 0.16960573\n",
" 0.13862936 0.18511129 0.16989276 0.17395417 0.14762351 0.18709221\n",
" 0.17706477 0.15293506 0.17970939 0.17975775 0.16082785 0.18295252\n",
" 0.19186573]\n",
" [ 0.16588603 0.18435596 0.13416478 0.17413923 0.14529511 0.19230449\n",
" 0.17775828 0.17598858 0.14892223 0.19462663 0.18166555 0.17986029\n",
" 0.1964604 0.18450695 0.16510376 0.19788853 0.1876399 0.19921541\n",
" 0.18843419]\n",
" [ 0.11904762 0.16465263 0.16903494 0.14529511 0.17703225 0.18464872\n",
" 0.15002895 0.19785455 0.17779663 0.18950917 0.16010081 0.2005743\n",
" 0.19306131 0.16599977 0.19113529 0.1960531 0.175064 0.19963794\n",
" 0.20696464]\n",
" [ 0.16450216 0.21184723 0.16960573 0.19230449 0.18464872 0.23269314\n",
" 0.19681552 0.22450276 0.1871932 0.23765844 0.20733248 0.22967925\n",
" 0.241199 0.21337314 0.21125341 0.24426963 0.22285333 0.24802555\n",
" 0.24156669]\n",
" [ 0.17281421 0.18985964 0.13862936 0.17775828 0.15002895 0.19681552\n",
" 0.18309269 0.18152273 0.15411585 0.19935309 0.18641218 0.18556038\n",
" 0.20169527 0.18946029 0.17030032 0.20320694 0.19192382 0.2042596\n",
" 0.19428999]\n",
" [ 0.14285714 0.19960191 0.18511129 0.17598858 0.19785455 0.22450276\n",
" 0.18152273 0.23269314 0.20168735 0.23049584 0.19407926 0.23694176\n",
" 0.23486084 0.20134404 0.22042984 0.23854906 0.21275711 0.24302959\n",
" 0.24678197]\n",
" [ 0.125 0.16819723 0.16989276 0.14892223 0.17779663 0.1871932\n",
" 0.15411585 0.20168735 0.18391356 0.19188588 0.16365606 0.20428161\n",
" 0.1952436 0.16940489 0.1919249 0.19815511 0.17760881 0.20152837\n",
" 0.20988805]\n",
" [ 0.16477273 0.21540115 0.17395417 0.19462663 0.18950917 0.23765844\n",
" 0.19935309 0.23049584 0.19188588 0.24296859 0.21058278 0.23586086\n",
" 0.24679036 0.21702635 0.21699483 0.25006701 0.22724646 0.25407837\n",
" 0.24818625]\n",
" [ 0.16880154 0.19575264 0.14762351 0.18166555 0.16010081 0.20733248\n",
" 0.18641218 0.19407926 0.16365606 0.21058278 0.19214629 0.19842989\n",
" 0.21317298 0.19609213 0.18225175 0.2151567 0.20088139 0.2171273\n",
" 0.20810339]\n",
" [ 0.14583333 0.2041482 0.18709221 0.17986029 0.2005743 0.22967925\n",
" 0.18556038 0.23694176 0.20428161 0.23586086 0.19842989 0.24154885\n",
" 0.24042054 0.20590264 0.22439219 0.24421452 0.21769149 0.24880304\n",
" 0.25200246]\n",
" [ 0.1660693 0.21842419 0.17706477 0.1964604 0.19306131 0.241199\n",
" 0.20169527 0.23486084 0.1952436 0.24679036 0.21317298 0.24042054\n",
" 0.25107069 0.21988195 0.22126548 0.25446921 0.23058896 0.25855949\n",
" 0.25312182]\n",
" [ 0.16906445 0.20001664 0.15293506 0.18450695 0.16599977 0.21337314\n",
" 0.18946029 0.20134404 0.16940489 0.21702635 0.19609213 0.20590264\n",
" 0.21988195 0.20052959 0.18917551 0.22212027 0.2061696 0.22441239\n",
" 0.21607563]\n",
" [ 0.13333333 0.18754969 0.17970939 0.16510376 0.19113529 0.21125341\n",
" 0.17030032 0.22042984 0.1919249 0.21699483 0.18225175 0.22439219\n",
" 0.22126548 0.18917551 0.2112185 0.224781 0.20021961 0.22904467\n",
" 0.23356012]\n",
" [ 0.16612903 0.2205599 0.17975775 0.19788853 0.1960531 0.24426963\n",
" 0.20320694 0.23854906 0.19815511 0.25006701 0.2151567 0.24421452\n",
" 0.25446921 0.22212027 0.224781 0.25800115 0.23326559 0.26226067\n",
" 0.25717144]\n",
" [ 0.16420966 0.20506165 0.16082785 0.1876399 0.175064 0.22285333\n",
" 0.19192382 0.21275711 0.17760881 0.22724646 0.20088139 0.21769149\n",
" 0.23058896 0.2061696 0.20021961 0.23326559 0.21442192 0.2364528\n",
" 0.22891788]\n",
" [ 0.16441006 0.22256445 0.18295252 0.19921541 0.19963794 0.24802555\n",
" 0.2042596 0.24302959 0.20152837 0.25407837 0.2171273 0.24880304\n",
" 0.25855949 0.22441239 0.22904467 0.26226067 0.2364528 0.26687384\n",
" 0.26210305]\n",
" [ 0.15151515 0.2141792 0.19186573 0.18843419 0.20696464 0.24156669\n",
" 0.19428999 0.24678197 0.20988805 0.24818625 0.20810339 0.25200246\n",
" 0.25312182 0.21607563 0.23356012 0.25717144 0.22891788 0.26210305\n",
" 0.26386999]]\n"
]
},
{
"ename": "ValueError",
"evalue": "Precomputed metric requires shape (n_queries, n_indexed). Got (19, 19) for 164 indexed.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-30-d4c5f46d5abf>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 134\u001b[0m \u001b[0;31m# predict on the test set\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 135\u001b[0;31m \u001b[0my_pred_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mKR\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mKmatrix_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 136\u001b[0m \u001b[0;31m# print(y_pred)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 137\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/kernel_ridge.py\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 182\u001b[0m \"\"\"\n\u001b[1;32m 183\u001b[0m \u001b[0mcheck_is_fitted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"X_fit_\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"dual_coef_\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 184\u001b[0;31m \u001b[0mK\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_kernel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mX_fit_\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 185\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mK\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdual_coef_\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/kernel_ridge.py\u001b[0m in \u001b[0;36m_get_kernel\u001b[0;34m(self, X, Y)\u001b[0m\n\u001b[1;32m 119\u001b[0m \"coef0\": self.coef0}\n\u001b[1;32m 120\u001b[0m return pairwise_kernels(X, Y, metric=self.kernel,\n\u001b[0;32m--> 121\u001b[0;31m filter_params=True, **params)\n\u001b[0m\u001b[1;32m 122\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/metrics/pairwise.py\u001b[0m in \u001b[0;36mpairwise_kernels\u001b[0;34m(X, Y, metric, filter_params, n_jobs, **kwds)\u001b[0m\n\u001b[1;32m 1389\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1390\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmetric\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"precomputed\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1391\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_pairwise_arrays\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprecomputed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1392\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1393\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmetric\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mGPKernel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/metrics/pairwise.py\u001b[0m in \u001b[0;36mcheck_pairwise_arrays\u001b[0;34m(X, Y, precomputed, dtype)\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[0;34m\"(n_queries, n_indexed). Got (%d, %d) \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[0;34m\"for %d indexed.\"\u001b[0m \u001b[0;34m%\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 119\u001b[0;31m (X.shape[0], X.shape[1], Y.shape[0]))\n\u001b[0m\u001b[1;32m 120\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 121\u001b[0m raise ValueError(\"Incompatible dimension for X and Y matrices: \"\n",
"\u001b[0;31mValueError\u001b[0m: Precomputed metric requires shape (n_queries, n_indexed). Got (19, 19) for 164 indexed."
]
}
],
"source": [
"# Author: Elisabetta Ghisu\n",
"\n",
"\"\"\"\n",
"- This script take as input a kernel matrix\n",
"and returns the classification or regression performance\n",
"- The kernel matrix can be calculated using any of the graph kernels approaches\n",
"- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
"- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
"then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
"provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
"correspond to the average of the performances on the test sets. \n",
"\n",
"@references\n",
" https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
"\"\"\"\n",
"\n",
"print(__doc__)\n",
"\n",
"import sys\n",
"import pathlib\n",
"import os\n",
"sys.path.insert(0, \"../\")\n",
"from tabulate import tabulate\n",
"\n",
"import random\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from sklearn.kernel_ridge import KernelRidge # 0.17\n",
"from sklearn.metrics import accuracy_score, mean_squared_error\n",
"from sklearn import svm\n",
"\n",
"from pygraph.kernels.pathKernel import pathKernel\n",
"from pygraph.utils.graphfiles import loadDataset\n",
"\n",
"# print('\\n Loading dataset from file...')\n",
"# dataset, y = loadDataset(\"/home/ljia/Documents/research-repo/datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
"# y = np.array(y)\n",
"# print(y)\n",
"\n",
"# kernel_file_path = 'marginalizedkernelmatrix.ds'\n",
"# path = pathlib.Path(kernel_file_path)\n",
"# if path.is_file():\n",
"# print('\\n Loading the matrix from file...')\n",
"# Kmatrix = np.loadtxt(kernel_file_path)\n",
"# print(Kmatrix)\n",
"# else:\n",
"# print('\\n Calculating kernel matrix, this could take a while...')\n",
"# Kmatrix = marginalizeKernel(dataset)\n",
"# print(Kmatrix)\n",
"# print('Saving kernel matrix to file...')\n",
"# np.savetxt(kernel_file_path, Kmatrix)\n",
"\n",
"# setup the parameters\n",
"model_type = 'regression' # Regression or classification problem\n",
"print('\\n --- This is a %s problem ---' % model_type)\n",
"\n",
"# datasize = len(dataset)\n",
"trials = 100 # Trials for hyperparameters random search\n",
"splits = 100 # Number of splits of the data\n",
"alpha_grid = np.linspace(0.01, 100, num = trials) # corresponds to (2*C)^-1 in other linear models such as LogisticRegression\n",
"# C_grid = np.linspace(0.0001, 10, num = trials)\n",
"random.seed(20) # Set the seed for uniform parameter distribution\n",
"data_dir = '/home/ljia/Documents/research-repo/datasets/acyclic/Acyclic/'\n",
"\n",
"# set the output path\n",
"kernel_file_path = 'kernelmatrices_marginalized_acyclic/'\n",
"if not os.path.exists(kernel_file_path):\n",
" os.makedirs(kernel_file_path)\n",
"\n",
"\n",
"\"\"\"\n",
"- Here starts the main program\n",
"- First we permute the data, then for each split we evaluate corresponding performances\n",
"- In the end, the performances are averaged over the test sets\n",
"\"\"\"\n",
"\n",
"# Initialize the performance of the best parameter trial on validation with the corresponding performance on test\n",
"val_split = []\n",
"test_split = []\n",
"\n",
"p_quit = 0.5\n",
"\n",
"# for each split of the data\n",
"for j in range(10):\n",
" dataset_train, y_train = loadDataset(data_dir + 'trainset_' + str(j) + '.ds')\n",
" dataset_test, y_test = loadDataset(data_dir + 'testset_' + str(j) + '.ds')\n",
" \n",
" # Normalization step (for real valued targets only)\n",
" if model_type == 'regression':\n",
" print('\\n Normalizing output y...')\n",
" y_train_mean = np.mean(y_train)\n",
" y_train_std = np.std(y_train)\n",
" y_train = (y_train - y_train_mean) / float(y_train_std)\n",
"# print(y)\n",
" \n",
" # save kernel matrices to files / read kernel matrices from files\n",
" kernel_file_train = kernel_file_path + 'train' + str(j) + '_pquit_' + str(p_quit)\n",
" kernel_file_test = kernel_file_path + 'test' + str(j) + '_pquit_' + str(p_quit)\n",
" path_train = pathlib.Path(kernel_file_train)\n",
" path_test = pathlib.Path(kernel_file_test)\n",
" # get train set kernel matrix\n",
" if path_train.is_file():\n",
" print('\\n Loading the train set kernel matrix from file...')\n",
" Kmatrix_train = np.loadtxt(kernel_file_train)\n",
" print(Kmatrix_train)\n",
" else:\n",
" print('\\n Calculating train set kernel matrix, this could take a while...')\n",
" Kmatrix_train = marginalizedkernel(dataset_train, p_quit, 20)\n",
" print(Kmatrix_train)\n",
" print('\\n Saving train set kernel matrix to file...')\n",
" np.savetxt(kernel_file_train, Kmatrix_train)\n",
" # get test set kernel matrix\n",
" if path_test.is_file():\n",
" print('\\n Loading the test set kernel matrix from file...')\n",
" Kmatrix_test = np.loadtxt(kernel_file_test)\n",
" print(Kmatrix_test)\n",
" else:\n",
" print('\\n Calculating test set kernel matrix, this could take a while...')\n",
" Kmatrix_test = marginalizedkernel(dataset_test, p_quit, 20)\n",
" print(Kmatrix_test)\n",
" print('\\n Saving test set kernel matrix to file...')\n",
" np.savetxt(kernel_file_test, Kmatrix_test)\n",
"\n",
" # For each parameter trial\n",
" for i in range(trials):\n",
" # For regression use the Kernel Ridge method\n",
" if model_type == 'regression':\n",
" # print('\\n Starting experiment for trial %d and parameter alpha = %3f\\n ' % (i, alpha_grid[i]))\n",
"\n",
" # Fit the kernel ridge model\n",
" KR = KernelRidge(kernel = 'precomputed', alpha = alpha_grid[i])\n",
" KR.fit(Kmatrix_train, y_train)\n",
"\n",
" # predict on the test set\n",
" y_pred_test = KR.predict(Kmatrix_test)\n",
" # print(y_pred)\n",
"\n",
" # adjust prediction: needed because the training targets have been normalized\n",
" y_pred_test = y_pred_test * float(y_train_std) + y_train_mean\n",
" # print(y_pred_test)\n",
"\n",
" # root mean squared error in test \n",
" rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n",
" perf_all_test.append(rmse_test)\n",
"\n",
" # print('The performance on the validation set is: %3f' % rmse)\n",
" # print('The performance on the test set is: %3f' % rmse_test)\n",
"\n",
" # --- FIND THE OPTIMAL PARAMETERS --- #\n",
" # For regression: minimise the mean squared error\n",
" if model_type == 'regression':\n",
"\n",
" # get optimal parameter on test (argmin mean squared error)\n",
" min_idx = np.argmin(perf_all_test)\n",
" alpha_opt = alpha_grid[min_idx]\n",
"\n",
" # corresponding performance on test for the same parameter\n",
" perf_test_opt = perf_all_test[min_idx]\n",
"\n",
" print('The best performance is for trial %d with parameter alpha = %3f' % (min_idx, alpha_opt))\n",
" print('The corresponding performance on test set is: %3f' % perf_test_opt)\n",
" \n",
" \n",
" \n",
"\n",
"# For each split of the data\n",
"for j in range(10, 10 + splits):\n",
" print('Starting split %d...' % j)\n",
"\n",
" # Set the random set for data permutation\n",
" random_state = int(j)\n",
" np.random.seed(random_state)\n",
" idx_perm = np.random.permutation(datasize)\n",
"# print(idx_perm)\n",
" \n",
" # Permute the data\n",
" y_perm = y[idx_perm] # targets permutation\n",
"# print(y_perm)\n",
" Kmatrix_perm = Kmatrix[:, idx_perm] # inputs permutation\n",
"# print(Kmatrix_perm)\n",
" Kmatrix_perm = Kmatrix_perm[idx_perm, :] # inputs permutation\n",
" \n",
" # Set the training, validation and test\n",
" # Note: the percentage can be set up by the user\n",
" num_train_val = int((datasize * 90) / 100) # 90% (of entire dataset) for training and validation\n",
" num_test = datasize - num_train_val # 10% (of entire dataset) for test\n",
" num_train = int((num_train_val * 90) / 100) # 90% (of train + val) for training\n",
" num_val = num_train_val - num_train # 10% (of train + val) for validation\n",
" \n",
" # Split the kernel matrix\n",
" Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]\n",
" Kmatrix_val = Kmatrix_perm[num_train:(num_train + num_val), 0:num_train]\n",
" Kmatrix_test = Kmatrix_perm[(num_train + num_val):datasize, 0:num_train]\n",
"\n",
" # Split the targets\n",
" y_train = y_perm[0:num_train]\n",
"\n",
" # Normalization step (for real valued targets only)\n",
" print('\\n Normalizing output y...')\n",
" if model_type == 'regression':\n",
" y_train_mean = np.mean(y_train)\n",
" y_train_std = np.std(y_train)\n",
" y_train = (y_train - y_train_mean) / float(y_train_std)\n",
"# print(y)\n",
" \n",
" y_val = y_perm[num_train:(num_train + num_val)]\n",
" y_test = y_perm[(num_train + num_val):datasize]\n",
" \n",
" # Record the performance for each parameter trial respectively on validation and test set\n",
" perf_all_val = []\n",
" perf_all_test = []\n",
" \n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

+ 0
- 559
notebooks/.ipynb_checkpoints/run_spkernel_acyclic-checkpoint.ipynb
File diff suppressed because it is too large
View File


+ 0
- 641
notebooks/.ipynb_checkpoints/run_treeletkernel_acyclic-checkpoint.ipynb
File diff suppressed because it is too large
View File


+ 0
- 3191
notebooks/.ipynb_checkpoints/run_treepatternkernel-checkpoint.ipynb
File diff suppressed because it is too large
View File


+ 0
- 3183
notebooks/.ipynb_checkpoints/run_untildpathkernel_acyclic-checkpoint.ipynb
File diff suppressed because it is too large
View File


+ 0
- 12297
notebooks/.ipynb_checkpoints/run_untilnwalkkernel-checkpoint.ipynb
File diff suppressed because it is too large
View File


+ 0
- 1755
notebooks/.ipynb_checkpoints/run_weisfeilerLehmankernel_acyclic-checkpoint.ipynb
File diff suppressed because it is too large
View File


+ 0
- 175
notebooks/.ipynb_checkpoints/test_lib-checkpoint.ipynb
File diff suppressed because it is too large
View File


+ 0
- 2236
notebooks/.ipynb_checkpoints/test_modelselection-checkpoint.ipynb
File diff suppressed because it is too large
View File


+ 0
- 1271
notebooks/.ipynb_checkpoints/test_scikit_ksvm-checkpoint.ipynb
File diff suppressed because it is too large
View File


+ 0
- 1136
notebooks/.ipynb_checkpoints/test_spkernel-checkpoint.ipynb
File diff suppressed because it is too large
View File


+ 56
- 36
notebooks/run_treeletkernel_acyclic.ipynb
File diff suppressed because it is too large
View File


+ 17880
- 1
notebooks/run_treepatternkernel.ipynb
File diff suppressed because it is too large
View File


+ 43
- 618
notebooks/run_untilnwalkkernel.ipynb
File diff suppressed because it is too large
View File


+ 500
- 497
notebooks/run_weisfeilerLehmankernel_acyclic.ipynb
File diff suppressed because it is too large
View File


BIN
pygraph/__pycache__/__init__.cpython-35.pyc View File


BIN
pygraph/kernels/__pycache__/cyclicPatternKernel.cpython-35.pyc View File


BIN
pygraph/kernels/__pycache__/deltaKernel.cpython-35.pyc View File


BIN
pygraph/kernels/__pycache__/marginalizedKernel.cpython-35.pyc View File


BIN
pygraph/kernels/__pycache__/pathKernel.cpython-35.pyc View File


BIN
pygraph/kernels/__pycache__/spKernel.cpython-35.pyc View File


BIN
pygraph/kernels/__pycache__/spkernel.cpython-35.pyc View File


BIN
pygraph/kernels/__pycache__/treePatternKernel.cpython-35.pyc View File


BIN
pygraph/kernels/__pycache__/treeletKernel.cpython-35.pyc View File


BIN
pygraph/kernels/__pycache__/untildPathKernel.cpython-35.pyc View File


BIN
pygraph/kernels/__pycache__/untilnWalkKernel.cpython-35.pyc View File


BIN
pygraph/kernels/__pycache__/weisfeilerLehmanKernel.cpython-35.pyc View File


BIN
pygraph/utils/__pycache__/__init__.cpython-35.pyc View File


BIN
pygraph/utils/__pycache__/graphfiles.cpython-35.pyc View File


BIN
pygraph/utils/__pycache__/model_selection_precomputed.cpython-35.pyc View File


BIN
pygraph/utils/__pycache__/utils.cpython-35.pyc View File


+ 98
- 55
pygraph/utils/model_selection_precomputed.py View File

@@ -1,4 +1,9 @@
def model_selection_for_precomputed_kernel(datafile, estimator, param_grid_precomputed, param_grid, model_type, NUM_TRIALS = 30, datafile_y = ''):


def model_selection_for_precomputed_kernel(datafile, estimator,
param_grid_precomputed, param_grid,
model_type, NUM_TRIALS=30,
datafile_y=''):
"""Perform model selection, fitting and testing for precomputed kernels using nested cv. Print out necessary data during the process then finally the results.

Parameters
@@ -50,26 +55,30 @@ def model_selection_for_precomputed_kernel(datafile, estimator, param_grid_preco
# setup the model type
model_type = model_type.lower()
if model_type != 'regression' and model_type != 'classification':
raise Exception('The model type is incorrect! Please choose from regression or classification.')
raise Exception(
'The model type is incorrect! Please choose from regression or classification.')
print()
print('--- This is a %s problem ---' % model_type)

# Load the dataset
print()
print('1. Loading dataset from file...')
dataset, y = loadDataset(datafile, filename_y = datafile_y)
dataset, y = loadDataset(datafile, filename_y=datafile_y)

# Grid of parameters with a discrete number of values for each.
param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
param_list = list(ParameterGrid(param_grid))

# Arrays to store scores
train_pref = np.zeros((NUM_TRIALS, len(param_list_precomputed), len(param_list)))
val_pref = np.zeros((NUM_TRIALS, len(param_list_precomputed), len(param_list)))
test_pref = np.zeros((NUM_TRIALS, len(param_list_precomputed), len(param_list)))
train_pref = np.zeros(
(NUM_TRIALS, len(param_list_precomputed), len(param_list)))
val_pref = np.zeros(
(NUM_TRIALS, len(param_list_precomputed), len(param_list)))
test_pref = np.zeros(
(NUM_TRIALS, len(param_list_precomputed), len(param_list)))

gram_matrices = [] # a list to store gram matrices for all param_grid_precomputed
gram_matrix_time = [] # a list to store time to calculate gram matrices
gram_matrices = [] # a list to store gram matrices for all param_grid_precomputed
gram_matrix_time = [] # a list to store time to calculate gram matrices

# calculate all gram matrices
print()
@@ -80,6 +89,7 @@ def model_selection_for_precomputed_kernel(datafile, estimator, param_grid_preco
print('gram matrix with parameters', params_out, 'is: ')
print(Kmatrix)
plt.matshow(Kmatrix)
plt.colorbar()
plt.show()
# plt.savefig('../../notebooks/gram_matrix_figs/{}_{}'.format(estimator.__name__, params_out))
gram_matrices.append(Kmatrix)
@@ -88,17 +98,18 @@ def model_selection_for_precomputed_kernel(datafile, estimator, param_grid_preco
print()
print('3. Fitting and predicting using nested cross validation. This could really take a while...')
# Loop for each trial
pbar = tqdm(total = NUM_TRIALS * len(param_list_precomputed) * len(param_list),
desc = 'calculate performance', file=sys.stdout)
for trial in range(NUM_TRIALS): # Test set level
pbar = tqdm(total=NUM_TRIALS * len(param_list_precomputed) * len(param_list),
desc='calculate performance', file=sys.stdout)
for trial in range(NUM_TRIALS): # Test set level
# loop for each outer param tuple
for index_out, params_out in enumerate(param_list_precomputed):
# split gram matrix and y to app and test sets.
X_app, X_test, y_app, y_test = train_test_split(gram_matrices[index_out], y, test_size=0.1)
X_app, X_test, y_app, y_test = train_test_split(
gram_matrices[index_out], y, test_size=0.1)
split_index_app = [y.index(y_i) for y_i in y_app if y_i in y]
split_index_test = [y.index(y_i) for y_i in y_test if y_i in y]
X_app = X_app[:,split_index_app]
X_test = X_test[:,split_index_app]
X_app = X_app[:, split_index_app]
X_test = X_test[:, split_index_app]
y_app = np.array(y_app)
y_test = np.array(y_test)

@@ -110,43 +121,60 @@ def model_selection_for_precomputed_kernel(datafile, estimator, param_grid_preco
current_test_perf = []

# For regression use the Kernel Ridge method
if model_type == 'regression':
KR = KernelRidge(kernel = 'precomputed', **params_in)
if model_type == 'regression':
KR = KernelRidge(kernel='precomputed', **params_in)
# loop for each split on validation set level
for train_index, valid_index in inner_cv.split(X_app): # validation set level
KR.fit(X_app[train_index,:][:,train_index], y_app[train_index])
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
KR.fit(X_app[train_index, :]
[:, train_index], y_app[train_index])

# predict on the train, validation and test set
y_pred_train = KR.predict(X_app[train_index,:][:,train_index])
y_pred_valid = KR.predict(X_app[valid_index,:][:,train_index])
y_pred_test = KR.predict(X_test[:,train_index])
y_pred_train = KR.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = KR.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = KR.predict(X_test[:, train_index])

# root mean squared errors
current_train_perf.append(np.sqrt(mean_squared_error(y_app[train_index], y_pred_train)))
current_valid_perf.append(np.sqrt(mean_squared_error(y_app[valid_index], y_pred_valid)))
current_test_perf.append(np.sqrt(mean_squared_error(y_test, y_pred_test)))
current_train_perf.append(
np.sqrt(mean_squared_error(y_app[train_index], y_pred_train)))
current_valid_perf.append(
np.sqrt(mean_squared_error(y_app[valid_index], y_pred_valid)))
current_test_perf.append(
np.sqrt(mean_squared_error(y_test, y_pred_test)))
# For classification use SVM
else:
KR = SVC(kernel = 'precomputed', **params_in)
KR = SVC(kernel='precomputed', **params_in)
# loop for each split on validation set level
for train_index, valid_index in inner_cv.split(X_app): # validation set level
KR.fit(X_app[train_index,:][:,train_index], y_app[train_index])
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
KR.fit(X_app[train_index, :]
[:, train_index], y_app[train_index])

# predict on the train, validation and test set
y_pred_train = KR.predict(X_app[train_index,:][:,train_index])
y_pred_valid = KR.predict(X_app[valid_index,:][:,train_index])
y_pred_test = KR.predict(X_test[:,train_index])
y_pred_train = KR.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = KR.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = KR.predict(X_test[:, train_index])

# accuracy scores
current_train_perf.append(accuracy_score(y_app[train_index], y_pred_train))
current_valid_perf.append(accuracy_score(y_app[valid_index], y_pred_valid))
current_test_perf.append(accuracy_score(y_test, y_pred_test))
current_train_perf.append(accuracy_score(
y_app[train_index], y_pred_train))
current_valid_perf.append(accuracy_score(
y_app[valid_index], y_pred_valid))
current_test_perf.append(
accuracy_score(y_test, y_pred_test))

# average performance on inner splits
train_pref[trial][index_out][index_in] = np.mean(current_train_perf)
val_pref[trial][index_out][index_in] = np.mean(current_valid_perf)
test_pref[trial][index_out][index_in] = np.mean(current_test_perf)
train_pref[trial][index_out][index_in] = np.mean(
current_train_perf)
val_pref[trial][index_out][index_in] = np.mean(
current_valid_perf)
test_pref[trial][index_out][index_in] = np.mean(
current_test_perf)

pbar.update(1)
pbar.clear()

@@ -156,7 +184,8 @@ def model_selection_for_precomputed_kernel(datafile, estimator, param_grid_preco
average_train_scores = np.mean(train_pref, axis=0)
average_val_scores = np.mean(val_pref, axis=0)
average_perf_scores = np.mean(test_pref, axis=0)
std_train_scores = np.std(train_pref, axis=0, ddof=1) # sample std is used here
# sample std is used here
std_train_scores = np.std(train_pref, axis=0, ddof=1)
std_val_scores = np.std(val_pref, axis=0, ddof=1)
std_perf_scores = np.std(test_pref, axis=0, ddof=1)

@@ -171,23 +200,34 @@ def model_selection_for_precomputed_kernel(datafile, estimator, param_grid_preco
# print('best_params_index: ', best_params_index)
print('best_params_out: ', best_params_out)
print('best_params_in: ', best_params_in)
print()
print('best_val_perf: ', best_val_perf)

# below: only the first best performance is used; multiple best performances might exist
best_val_std = std_val_scores[best_params_index[0][0]][best_params_index[1][0]]
best_val_std = std_val_scores[best_params_index[0]
[0]][best_params_index[1][0]]
print('best_val_std: ', best_val_std)

final_performance = average_perf_scores[best_params_index[0][0]][best_params_index[1][0]]
final_confidence = std_perf_scores[best_params_index[0][0]][best_params_index[1][0]]
final_performance = average_perf_scores[best_params_index[0]
[0]][best_params_index[1][0]]
final_confidence = std_perf_scores[best_params_index[0]
[0]][best_params_index[1][0]]
print('final_performance: ', final_performance)
print('final_confidence: ', final_confidence)
train_performance = average_train_scores[best_params_index[0][0]][best_params_index[1][0]]
train_std = std_train_scores[best_params_index[0][0]][best_params_index[1][0]]
train_performance = average_train_scores[best_params_index[0]
[0]][best_params_index[1][0]]
train_std = std_train_scores[best_params_index[0]
[0]][best_params_index[1][0]]
print('train_performance: ', train_performance)
print('train_std: ', train_std)

print()
average_gram_matrix_time = np.mean(gram_matrix_time)
std_gram_matrix_time = np.std(gram_matrix_time, ddof=1)
best_gram_matrix_time = gram_matrix_time[best_params_index[0][0]]
print('time to calculate gram matrix: ', best_gram_matrix_time, 's')
print('time to calculate gram matrix with different hyperpapams: {:.2f}±{:.2f}'
.format(average_gram_matrix_time, std_gram_matrix_time))
print('time to calculate best gram matrix: ', best_gram_matrix_time, 's')

# print out as table.
from collections import OrderedDict
@@ -199,15 +239,18 @@ def model_selection_for_precomputed_kernel(datafile, estimator, param_grid_preco
else:
for param_in in param_list:
param_in['C'] = '{:.2e}'.format(param_in['C'])
table_dict['params'] = [ {**param_out, **param_in} for param_in in param_list for param_out in param_list_precomputed ]
table_dict['gram_matrix_time'] = [ '{:.2f}'.format(gram_matrix_time[index_out])
for param_in in param_list for index_out, _ in enumerate(param_list_precomputed) ]
table_dict['valid_perf'] = [ '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], std_val_scores[index_out][index_in])
for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_precomputed) ]
table_dict['test_perf'] = [ '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in], std_perf_scores[index_out][index_in])
for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_precomputed) ]
table_dict['train_perf'] = [ '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in], std_train_scores[index_out][index_in])
for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_precomputed) ]
keyorder = ['params', 'train_perf', 'valid_perf', 'test_perf', 'gram_matrix_time']
table_dict['params'] = [{**param_out, **param_in}
for param_in in param_list for param_out in param_list_precomputed]
table_dict['gram_matrix_time'] = ['{:.2f}'.format(gram_matrix_time[index_out])
for param_in in param_list for index_out, _ in enumerate(param_list_precomputed)]
table_dict['valid_perf'] = ['{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], std_val_scores[index_out][index_in])
for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_precomputed)]
table_dict['test_perf'] = ['{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in], std_perf_scores[index_out][index_in])
for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_precomputed)]
table_dict['train_perf'] = ['{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in], std_train_scores[index_out][index_in])
for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_precomputed)]
keyorder = ['params', 'train_perf', 'valid_perf',
'test_perf', 'gram_matrix_time']
print()
print(tabulate(OrderedDict(sorted(table_dict.items(), key = lambda i:keyorder.index(i[0]))), headers='keys'))
print(tabulate(OrderedDict(sorted(table_dict.items(),
key=lambda i: keyorder.index(i[0]))), headers='keys'))

Loading…
Cancel
Save