@@ -12,7 +12,7 @@ import matplotlib.pyplot as plt | |||
from numpy.linalg import eig | |||
# read gram matrices from file. | |||
results_dir = 'results/structuralspkernel/' | |||
results_dir = 'results/untilhpathkernel/myria' | |||
ds_name = 'Letter-med' | |||
gmfile = np.load(results_dir + '/' + ds_name + '.gm.npz') | |||
#print('gm time: ', gmfile['gmtime']) | |||
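# A quick sanity check on the loaded gram matrices is to look at their eigenvalue
# spectrum with the eig import above. This is only a sketch: the archive key 'gms'
# is an assumption, so list the real keys with gmfile.files first.
print(gmfile.files)  # inspect which arrays the .npz archive actually contains
gms = gmfile['gms']  # hypothetical key holding the list of gram matrices
for idx, gm in enumerate(gms):
    w, _ = eig(gm)  # eigen-decomposition of one gram matrix
    print(idx, 'min eigenvalue:', np.min(np.real(w)))  # a negative value means the matrix is not PSD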
@@ -6,94 +6,116 @@ | |||
"metadata": { | |||
"scrolled": false | |||
}, | |||
"outputs": [], | |||
"outputs": [ | |||
{ | |||
"name": "stdout", | |||
"output_type": "stream", | |||
"text": [ | |||
"\n", | |||
"MAO\n", | |||
"\n", | |||
"--- This is a classification problem ---\n", | |||
"\n", | |||
"\n", | |||
"1. Loading dataset from file...\n", | |||
"\n", | |||
"2. Calculating gram matrices. This could take a while...\n", | |||
"\n", | |||
" None edge weight specified. Set all weight to 1.\n", | |||
"\n", | |||
"getting sp graphs: 68it [00:00, 692.11it/s]\n", | |||
"calculating kernels: 2346it [00:05, 399.28it/s]\n", | |||
"\n", | |||
" --- shortest path kernel matrix of size 68 built in 6.345669507980347 seconds ---\n", | |||
"\n", | |||
"the gram matrix with parameters {'node_kernels': {'symb': <function deltakernel at 0x7fe240afd620>, 'nsymb': <function gaussiankernel at 0x7fe240afd9d8>, 'mix': functools.partial(<function kernelproduct at 0x7fe240aaf0d0>, <function deltakernel at 0x7fe240afd620>, <function gaussiankernel at 0x7fe240afd9d8>)}, 'n_jobs': 8} is: \n", | |||
"\n", | |||
"1 gram matrices are calculated, 0 of which are ignored.\n", | |||
"\n", | |||
"3. Fitting and predicting using nested cross validation. This could really take a while...\n", | |||
"cross validation: 7it [00:09, 4.67s/it]" | |||
] | |||
} | |||
], | |||
"source": [ | |||
"%load_ext line_profiler\n", | |||
"%matplotlib inline\n", | |||
"import functools\n", | |||
"from libs import *\n", | |||
"import multiprocessing\n", | |||
"from sklearn.metrics.pairwise import rbf_kernel\n", | |||
"\n", | |||
"from pygraph.kernels.spKernel import spkernel, spkernel_do\n", | |||
"from pygraph.utils.kernels import deltakernel, kernelsum\n", | |||
"from pygraph.utils.model_selection_precomputed import trial_do\n", | |||
"\n", | |||
"dslist = [ \n", | |||
" {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node symb\n", | |||
"# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', \n", | |||
"# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',}, # contains single node graph, node symb\n", | |||
"# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',}, # node/edge symb\n", | |||
"# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',}, # unlabeled\n", | |||
"# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n", | |||
"# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n", | |||
" {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n", | |||
" {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, # node symb/nsymb\n", | |||
" {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # node/edge symb\n", | |||
" {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n", | |||
" 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n", | |||
"\n", | |||
"# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n", | |||
"# # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n", | |||
"# # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb\n", | |||
"# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},\n", | |||
"# \n", | |||
"# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb\n", | |||
"# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb\n", | |||
"# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb\n", | |||
"# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb\n", | |||
"# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb\n", | |||
"\n", | |||
"# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb\n", | |||
"# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb\n", | |||
"# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb\n", | |||
"# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',\n", | |||
"# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n", | |||
"# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',\n", | |||
"# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n", | |||
"# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',\n", | |||
"# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb\n", | |||
" \n", | |||
"# # not working below\n", | |||
"# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},\n", | |||
"# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},\n", | |||
"# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},\n", | |||
"# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},\n", | |||
"from pygraph.kernels.spKernel import spkernel\n", | |||
"from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct\n", | |||
"#from pygraph.utils.model_selection_precomputed import trial_do\n", | |||
"\n", | |||
"dslist = [\n", | |||
"# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',\n", | |||
"# 'task': 'regression'}, # node symb\n", | |||
"# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',\n", | |||
"# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, \n", | |||
"# # contains single node graph, node symb\n", | |||
" {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb\n", | |||
"# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled\n", | |||
"# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n", | |||
"# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n", | |||
"# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n", | |||
"# # node nsymb\n", | |||
"# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n", | |||
"# # node symb/nsymb\n", | |||
"# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n", | |||
" # node/edge symb\n", | |||
"# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n", | |||
"# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n", | |||
"\n", | |||
" # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n", | |||
" # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n", | |||
" # # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb\n", | |||
" # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},\n", | |||
" #\n", | |||
" # # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb\n", | |||
" # # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb\n", | |||
" # # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb\n", | |||
" # # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb\n", | |||
" # # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb\n", | |||
"\n", | |||
" # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb\n", | |||
" # # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb\n", | |||
" # # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb\n", | |||
" # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',\n", | |||
" # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n", | |||
" # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',\n", | |||
" # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n", | |||
" # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',\n", | |||
" # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb\n", | |||
"\n", | |||
" # # not working below\n", | |||
" # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},\n", | |||
" # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},\n", | |||
" # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},\n", | |||
" # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},\n", | |||
"]\n", | |||
"estimator = spkernel\n", | |||
"mixkernel = functools.partial(kernelsum, deltakernel, rbf_kernel)\n", | |||
"param_grid_precomputed = {'node_kernels': [{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]}\n", | |||
"param_grid = [{'C': np.logspace(-10, 10, num = 41, base = 10)}, \n", | |||
" {'alpha': np.logspace(-10, 10, num = 41, base = 10)}]\n", | |||
"mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)\n", | |||
"param_grid_precomputed = {'node_kernels': [\n", | |||
" {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}\n", | |||
"param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},\n", | |||
" {'alpha': np.logspace(-10, 10, num=41, base=10)}]\n", | |||
"\n", | |||
"for ds in dslist:\n", | |||
" print()\n", | |||
" print(ds['name'])\n", | |||
" model_selection_for_precomputed_kernel(\n", | |||
" ds['dataset'], \n", | |||
" estimator, \n", | |||
" param_grid_precomputed, \n", | |||
" (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \n", | |||
" (ds['task'] if 'task' in ds else 'classification'), \n", | |||
" ds['dataset'],\n", | |||
" estimator,\n", | |||
" param_grid_precomputed,\n", | |||
" (param_grid[1] if ('task' in ds and ds['task']\n", | |||
" == 'regression') else param_grid[0]),\n", | |||
" (ds['task'] if 'task' in ds else 'classification'),\n", | |||
" NUM_TRIALS=30,\n", | |||
" datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),\n", | |||
" extra_params=(ds['extra_params'] if 'extra_params' in ds else None),\n", | |||
" ds_name=ds['name'],\n", | |||
" n_jobs=multiprocessing.cpu_count())\n", | |||
" \n", | |||
"# %lprun -f trial_do -f spkernel -f spkernel_do -f model_selection_for_precomputed_kernel \\\n", | |||
"# model_selection_for_precomputed_kernel( \\\n", | |||
"# ds['dataset'], \\\n", | |||
"# estimator, \\\n", | |||
"# param_grid_precomputed, \\\n", | |||
"# (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \\\n", | |||
"# (ds['task'] if 'task' in ds else 'classification'), \\\n", | |||
"# NUM_TRIALS=30, \\\n", | |||
"# datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), \\\n", | |||
"# extra_params=(ds['extra_params'] if 'extra_params' in ds else None), \\\n", | |||
"# ds_name=ds['name'], \\\n", | |||
"# n_jobs=multiprocessing.cpu_count()) \n", | |||
" print()" | |||
" n_jobs=multiprocessing.cpu_count(),\n", | |||
" read_gm_from_file=False)\n", | |||
" print()\n" | |||
] | |||
}, | |||
{ | |||
@@ -713,8 +735,8 @@ | |||
], | |||
"metadata": { | |||
"kernelspec": { | |||
"display_name": "Python 3 (Spyder)", | |||
"language": "python3", | |||
"display_name": "Python 3", | |||
"language": "python", | |||
"name": "python3" | |||
}, | |||
"language_info": { | |||
@@ -727,7 +749,7 @@ | |||
"name": "python", | |||
"nbconvert_exporter": "python", | |||
"pygments_lexer": "ipython3", | |||
"version": "3.5.2" | |||
"version": "3.6.6" | |||
} | |||
}, | |||
"nbformat": 4, | |||
@@ -7,21 +7,21 @@ from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct | |||
#from pygraph.utils.model_selection_precomputed import trial_do | |||
dslist = [ | |||
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||
# 'task': 'regression'}, # node symb | |||
# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', | |||
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, | |||
# # contains single node graph, node symb | |||
# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb | |||
# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled | |||
# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | |||
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb | |||
{'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||
'task': 'regression'}, # node symb | |||
{'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', | |||
'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, | |||
# contains single node graph, node symb | |||
{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb | |||
{'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled | |||
{'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | |||
'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb | |||
{'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | |||
# node nsymb | |||
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | |||
# node symb/nsymb | |||
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | |||
# # node/edge symb | |||
# node/edge symb | |||
# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', | |||
# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb | |||
@@ -56,7 +56,7 @@ estimator = spkernel | |||
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | |||
param_grid_precomputed = {'node_kernels': [ | |||
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]} | |||
param_grid = [{'C': np.logspace(-10, 3, num=27, base=10)}, | |||
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | |||
{'alpha': np.logspace(-10, 10, num=41, base=10)}] | |||
for ds in dslist: | |||
@@ -23,10 +23,10 @@ dslist = [ | |||
# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled | |||
# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | |||
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb | |||
{'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | |||
# node nsymb | |||
# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | |||
# # node symb/nsymb | |||
# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | |||
# # node nsymb | |||
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | |||
# node symb/nsymb | |||
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | |||
# # node/edge symb | |||
# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', | |||
@@ -39,8 +39,8 @@ dslist = [ | |||
# | |||
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb | |||
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb | |||
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb, missing values | |||
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb, missing values | |||
# {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb, missing values | |||
# {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb, missing values | |||
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb | |||
# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb | |||
@@ -53,8 +53,8 @@ dslist = [ | |||
# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf', | |||
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb | |||
# # not working below | |||
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',}, | |||
# # not working below | |||
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',}, | |||
# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',}, | |||
# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',}, | |||
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, | |||
@@ -62,7 +62,7 @@ dslist = [ | |||
] | |||
estimator = untilhpathkernel | |||
mixkernel = functools.partial(kernelproduct, deltakernel, rbf_kernel) | |||
param_grid_precomputed = {'depth': np.linspace(7, 10, 10), | |||
param_grid_precomputed = {'depth': np.linspace(1, 10, 10), | |||
'k_func': ['tanimoto', 'MinMax']} | |||
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | |||
{'alpha': np.logspace(-10, 10, num=41, base=10)}] | |||
@@ -1,77 +0,0 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Fri Sep 28 16:37:29 2018 | |||
@author: ljia | |||
""" | |||
import functools | |||
from libs import * | |||
import multiprocessing | |||
from sklearn.metrics.pairwise import rbf_kernel | |||
from pygraph.kernels.structuralspKernel import structuralspkernel | |||
from pygraph.utils.kernels import deltakernel, kernelproduct | |||
dslist = [ | |||
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||
# 'task': 'regression'}, # node symb | |||
# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', | |||
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, # contains single node graph, node symb | |||
{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb | |||
# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb | |||
# # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb | |||
# # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb | |||
# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'}, | |||
# | |||
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb | |||
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb | |||
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb | |||
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb | |||
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb | |||
# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb | |||
# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb | |||
# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb | |||
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat', | |||
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb | |||
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat', | |||
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb | |||
# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf', | |||
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb | |||
# # not working below | |||
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',}, | |||
# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',}, | |||
# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',}, | |||
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, | |||
] | |||
estimator = structuralspkernel | |||
mixkernel = functools.partial(kernelproduct, deltakernel, rbf_kernel) | |||
param_grid_precomputed = {'node_kernels': | |||
[{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}], | |||
'edge_kernels': | |||
[{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]} | |||
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | |||
{'alpha': np.logspace(-10, 10, num=41, base=10)}] | |||
for ds in dslist: | |||
print() | |||
print(ds['name']) | |||
model_selection_for_precomputed_kernel( | |||
ds['dataset'], | |||
estimator, | |||
param_grid_precomputed, | |||
(param_grid[1] if ('task' in ds and ds['task'] | |||
== 'regression') else param_grid[0]), | |||
(ds['task'] if 'task' in ds else 'classification'), | |||
NUM_TRIALS=30, | |||
datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), | |||
extra_params=(ds['extra_params'] if 'extra_params' in ds else None), | |||
ds_name=ds['name'], | |||
n_jobs=multiprocessing.cpu_count(), | |||
read_gm_from_file=False) | |||
print() |
@@ -85,21 +85,20 @@ def commonwalkkernel(*args, | |||
# ---- use pool.imap_unordered to parallel and track progress. ---- | |||
pool = Pool(n_jobs) | |||
itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||
itr = zip(combinations_with_replacement(Gn, 2), | |||
combinations_with_replacement(range(0, len(Gn)), 2)) | |||
len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | |||
if len_itr < 1000 * n_jobs: | |||
chunksize = int(len_itr / n_jobs) + 1 | |||
else: | |||
chunksize = 100 | |||
chunksize = 1000 | |||
# direct product graph method - exponential | |||
if compute_method == 'exp': | |||
do_partial = partial(_commonwalkkernel_exp, Gn, node_label, edge_label, | |||
weight) | |||
do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight) | |||
# direct product graph method - geometric | |||
elif compute_method == 'geo': | |||
do_partial = partial(_commonwalkkernel_geo, Gn, node_label, edge_label, | |||
weight) | |||
do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight) | |||
for i, j, kernel in tqdm( | |||
pool.imap_unordered(do_partial, itr, chunksize), | |||
@@ -153,7 +152,7 @@ def commonwalkkernel(*args, | |||
return Kmatrix, run_time | |||
def _commonwalkkernel_exp(Gn, node_label, edge_label, beta, ij): | |||
def _commonwalkkernel_exp(g1, g2, node_label, edge_label, beta): | |||
"""Calculate walk graph kernels up to n between 2 graphs using exponential | |||
series. | |||
@@ -175,10 +174,6 @@ def _commonwalkkernel_exp(Gn, node_label, edge_label, beta, ij): | |||
kernel : float | |||
The common walk Kernel between 2 graphs. | |||
""" | |||
iglobal = ij[0] | |||
jglobal = ij[1] | |||
g1 = Gn[iglobal] | |||
g2 = Gn[jglobal] | |||
# get tensor product / direct product | |||
gp = direct_product(g1, g2, node_label, edge_label) | |||
@@ -219,10 +214,18 @@ def _commonwalkkernel_exp(Gn, node_label, edge_label, beta, ij): | |||
# print(np.exp(weight * A)) | |||
# print('-------') | |||
return iglobal, jglobal, exp_D.sum() | |||
return exp_D.sum() | |||
def _commonwalkkernel_geo(Gn, node_label, edge_label, gamma, ij): | |||
def wrapper_cw_exp(node_label, edge_label, beta, itr_item): | |||
g1 = itr_item[0][0] | |||
g2 = itr_item[0][1] | |||
i = itr_item[1][0] | |||
j = itr_item[1][1] | |||
return i, j, _commonwalkkernel_exp(g1, g2, node_label, edge_label, beta) | |||
def _commonwalkkernel_geo(g1, g2, node_label, edge_label, gamma): | |||
"""Calculate common walk graph kernels up to n between 2 graphs using | |||
geometric series. | |||
@@ -244,19 +247,22 @@ def _commonwalkkernel_geo(Gn, node_label, edge_label, gamma, ij): | |||
kernel : float | |||
The common walk Kernel between 2 graphs. | |||
""" | |||
iglobal = ij[0] | |||
jglobal = ij[1] | |||
g1 = Gn[iglobal] | |||
g2 = Gn[jglobal] | |||
# get tensor product / direct product | |||
gp = direct_product(g1, g2, node_label, edge_label) | |||
A = nx.adjacency_matrix(gp).todense() | |||
mat = np.identity(len(A)) - gamma * A | |||
try: | |||
return iglobal, jglobal, mat.I.sum() | |||
return mat.I.sum() | |||
except np.linalg.LinAlgError: | |||
return iglobal, jglobal, np.nan | |||
return np.nan | |||
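As a self-contained illustration of the geometric-series computation above, the kernel value for two small graphs can be computed directly. This is a sketch under the assumption of unlabeled graphs, where the direct product coincides with networkx's tensor product (used here in place of the library's direct_product helper); gamma must be small enough that I - gamma*A stays invertible.

import networkx as nx
import numpy as np

g1 = nx.path_graph(3)
g2 = nx.cycle_graph(4)
gp = nx.tensor_product(g1, g2)  # direct product of two unlabeled graphs
A = nx.adjacency_matrix(gp).todense()
gamma = 0.01
kernel = np.linalg.inv(np.identity(len(A)) - gamma * A).sum()  # sum of all entries of (I - gamma*A)^-1
print(kernel)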
def wrapper_cw_geo(node_label, edge_label, gamma, itr_item): | |||
g1 = itr_item[0][0] | |||
g2 = itr_item[0][1] | |||
i = itr_item[1][0] | |||
j = itr_item[1][1] | |||
return i, j, _commonwalkkernel_geo(g1, g2, node_label, edge_label, gamma) | |||
def _commonwalkkernel_brute(walks1, | |||
@@ -8,7 +8,6 @@ import sys | |||
import time | |||
from itertools import combinations_with_replacement, product | |||
from functools import partial | |||
from joblib import Parallel, delayed | |||
from multiprocessing import Pool | |||
from tqdm import tqdm | |||
@@ -89,7 +88,8 @@ def spkernel(*args, | |||
pool = Pool(n_jobs) | |||
# get shortest path graphs of Gn | |||
getsp_partial = partial(wrap_getSPGraph, Gn, weight) | |||
getsp_partial = partial(wrapper_getSPGraph, weight) | |||
itr = zip(Gn, range(0, len(Gn))) | |||
if len(Gn) < 1000 * n_jobs: | |||
# # use default chunksize as pool.map when iterable is less than 100 | |||
# chunksize, extra = divmod(len(Gn), n_jobs * 4) | |||
@@ -98,9 +98,8 @@ def spkernel(*args, | |||
chunksize = int(len(Gn) / n_jobs) + 1 | |||
else: | |||
chunksize = 1000 | |||
# chunksize = 300 # int(len(list(itr)) / n_jobs) | |||
for i, g in tqdm( | |||
pool.imap_unordered(getsp_partial, range(0, len(Gn)), chunksize), | |||
pool.imap_unordered(getsp_partial, itr, chunksize), | |||
desc='getting sp graphs', file=sys.stdout): | |||
Gn[i] = g | |||
pool.close() | |||
@@ -144,8 +143,9 @@ def spkernel(*args, | |||
# ---- use pool.imap_unordered to parallel and track progress. ---- | |||
pool = Pool(n_jobs) | |||
do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels) | |||
itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||
do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels) | |||
itr = zip(combinations_with_replacement(Gn, 2), | |||
combinations_with_replacement(range(0, len(Gn)), 2)) | |||
len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | |||
if len_itr < 1000 * n_jobs: | |||
chunksize = int(len_itr / n_jobs) + 1 | |||
@@ -200,15 +200,10 @@ def spkernel(*args, | |||
return Kmatrix, run_time, idx | |||
def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij): | |||
i = ij[0] | |||
j = ij[1] | |||
g1 = Gn[i] | |||
g2 = Gn[j] | |||
def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels): | |||
kernel = 0 | |||
# try: | |||
# compute shortest path matrices first, method borrowed from FCSP. | |||
if ds_attrs['node_labeled']: | |||
# node symb and non-symb labeled | |||
@@ -243,7 +238,7 @@ def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij): | |||
g1.edges(data=True), g2.edges(data=True)): | |||
if e1[2]['cost'] == e2[2]['cost']: | |||
kernel += 1 | |||
return i, j, kernel | |||
return kernel | |||
# compute graph kernels | |||
if ds_attrs['is_directed']: | |||
@@ -293,12 +288,20 @@ def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij): | |||
# kn1 = vk_mat[x1][x2] * vk_mat[y1][y2] | |||
# kn2 = vk_mat[x1][y2] * vk_mat[y1][x2] | |||
# kernel += kn1 + kn2 | |||
# except KeyError: # missing labels or attributes | |||
# pass | |||
return i, j, kernel | |||
return kernel | |||
def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr_item): | |||
g1 = itr_item[0][0] | |||
g2 = itr_item[0][1] | |||
i = itr_item[1][0] | |||
j = itr_item[1][1] | |||
return i, j, spkernel_do(g1, g2, ds_attrs, node_label, node_kernels) | |||
def wrap_getSPGraph(Gn, weight, i): | |||
return i, getSPGraph(Gn[i], edge_weight=weight) | |||
# return i, nx.floyd_warshall_numpy(Gn[i], weight=weight) | |||
def wrapper_getSPGraph(weight, itr_item): | |||
g = itr_item[0] | |||
i = itr_item[1] | |||
return i, getSPGraph(g, edge_weight=weight) | |||
# return i, nx.floyd_warshall_numpy(g, weight=weight) |
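The refactoring above follows one pattern throughout: instead of binding the whole graph list into the partial and indexing into it inside each worker, the iterable now zips each item with its index, so a task carries exactly the data it needs and imap_unordered can still report which slot the result belongs to. A minimal, generic sketch of that pattern (illustrative names only, not part of the library):

from functools import partial
from multiprocessing import Pool

def work(param, itr_item):
    data, i = itr_item      # each task carries its own data and its index
    return i, data * param  # placeholder computation

if __name__ == '__main__':
    items = [1, 2, 3, 4]
    results = [None] * len(items)
    with Pool(2) as pool:
        for i, res in pool.imap_unordered(partial(work, 10), zip(items, range(len(items)))):
            results[i] = res  # results arrive unordered; the index restores order
    print(results)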
@@ -12,7 +12,6 @@ import sys | |||
import time | |||
from itertools import combinations, combinations_with_replacement, product | |||
from functools import partial | |||
from joblib import Parallel, delayed | |||
from multiprocessing import Pool | |||
from tqdm import tqdm | |||
@@ -71,7 +70,6 @@ def structuralspkernel(*args, | |||
""" | |||
# pre-process | |||
Gn = args[0] if len(args) == 1 else [args[0], args[1]] | |||
weight = None | |||
if edge_weight is None: | |||
print('\n No edge weight specified. Setting all weights to 1.\n') | |||
@@ -98,34 +96,61 @@ def structuralspkernel(*args, | |||
start_time = time.time() | |||
# get shortest paths of each graph in Gn | |||
splist = [[] for _ in range(len(Gn))] | |||
splist = [None] * len(Gn) | |||
pool = Pool(n_jobs) | |||
# get shortest path graphs of Gn | |||
getsp_partial = partial(wrap_getSP, Gn, weight, ds_attrs['is_directed']) | |||
getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed']) | |||
itr = zip(Gn, range(0, len(Gn))) | |||
if len(Gn) < 1000 * n_jobs: | |||
chunksize = int(len(Gn) / n_jobs) + 1 | |||
else: | |||
chunksize = 1000 | |||
# chunksize = 300 # int(len(list(itr)) / n_jobs) | |||
for i, sp in tqdm( | |||
pool.imap_unordered(getsp_partial, range(0, len(Gn)), chunksize), | |||
pool.imap_unordered(getsp_partial, itr, chunksize), | |||
desc='getting shortest paths', | |||
file=sys.stdout): | |||
splist[i] = sp | |||
# time.sleep(10) | |||
pool.close() | |||
pool.join() | |||
# # ---- use pool.map to parallel ---- | |||
# result_sp = pool.map(getsp_partial, range(0, len(Gn))) | |||
# for i in result_sp: | |||
# Gn[i[0]] = i[1] | |||
# or | |||
# getsp_partial = partial(wrap_getSP, Gn, weight) | |||
# for i, g in tqdm( | |||
# pool.map(getsp_partial, range(0, len(Gn))), | |||
# desc='getting sp graphs', | |||
# file=sys.stdout): | |||
# Gn[i] = g | |||
# # get shortest paths of each graph in Gn | |||
# splist = [[] for _ in range(len(Gn))] | |||
# # get shortest path graphs of Gn | |||
# getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed']) | |||
# itr = zip(Gn, range(0, len(Gn))) | |||
# if len(Gn) < 1000 * n_jobs: | |||
# chunksize = int(len(Gn) / n_jobs) + 1 | |||
# else: | |||
# chunksize = 1000 | |||
# # chunksize = 300 # int(len(list(itr)) / n_jobs) | |||
# from contextlib import closing | |||
# with closing(Pool(n_jobs)) as pool: | |||
## for i, sp in tqdm( | |||
# res = pool.imap_unordered(getsp_partial, itr, 10) | |||
## desc='getting shortest paths', | |||
## file=sys.stdout): | |||
## splist[i] = sp | |||
## time.sleep(10) | |||
# pool.close() | |||
# pool.join() | |||
# ss = 0 | |||
# ss += sys.getsizeof(splist) | |||
# for spss in splist: | |||
# ss += sys.getsizeof(spss) | |||
# for spp in spss: | |||
# ss += sys.getsizeof(spp) | |||
# time.sleep(20) | |||
# # ---- direct running, normally use single CPU core. ---- | |||
# splist = [] | |||
# for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout): | |||
# splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed'])) | |||
# # ---- only for the Fast Computation of Shortest Path Kernel (FCSP) | |||
# sp_ml = [0] * len(Gn) # shortest path matrices | |||
@@ -149,9 +174,11 @@ def structuralspkernel(*args, | |||
# ---- use pool.imap_unordered to parallel and track progress. ---- | |||
pool = Pool(n_jobs) | |||
do_partial = partial(structuralspkernel_do, Gn, splist, ds_attrs, | |||
node_label, edge_label, node_kernels, edge_kernels) | |||
itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||
do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, | |||
node_kernels, edge_kernels) | |||
itr = zip(combinations_with_replacement(Gn, 2), | |||
combinations_with_replacement(splist, 2), | |||
combinations_with_replacement(range(0, len(Gn)), 2)) | |||
len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | |||
if len_itr < 1000 * n_jobs: | |||
chunksize = int(len_itr / n_jobs) + 1 | |||
@@ -166,36 +193,36 @@ def structuralspkernel(*args, | |||
pool.close() | |||
pool.join() | |||
# # ---- use pool.map to parallel. ---- | |||
# # result_perf = pool.map(do_partial, itr) | |||
# do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels) | |||
# itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||
# for i, j, kernel in tqdm( | |||
# pool.map(do_partial, itr), desc='calculating kernels', | |||
# file=sys.stdout): | |||
# Kmatrix[i][j] = kernel | |||
# Kmatrix[j][i] = kernel | |||
# pool.close() | |||
# pool.join() | |||
# # ---- use joblib.Parallel to parallel and track progress. ---- | |||
# result_perf = Parallel( | |||
# n_jobs=n_jobs, verbose=10)( | |||
# delayed(do_partial)(ij) | |||
# for ij in combinations_with_replacement(range(0, len(Gn)), 2)) | |||
# result_perf = [ | |||
# do_partial(ij) | |||
# for ij in combinations_with_replacement(range(0, len(Gn)), 2) | |||
# ] | |||
# for i in result_perf: | |||
# Kmatrix[i[0]][i[1]] = i[2] | |||
# Kmatrix[i[1]][i[0]] = i[2] | |||
# # ---- use pool.imap_unordered to parallel and track progress. ---- | |||
# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, | |||
# node_kernels, edge_kernels) | |||
# itr = zip(combinations_with_replacement(Gn, 2), | |||
# combinations_with_replacement(splist, 2), | |||
# combinations_with_replacement(range(0, len(Gn)), 2)) | |||
# len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | |||
# if len_itr < 1000 * n_jobs: | |||
# chunksize = int(len_itr / n_jobs) + 1 | |||
# else: | |||
# chunksize = 1000 | |||
# from contextlib import closing | |||
# with closing(Pool(n_jobs)) as pool: | |||
# for i, j, kernel in tqdm( | |||
# pool.imap_unordered(do_partial, itr, 1000), | |||
# desc='calculating kernels', | |||
# file=sys.stdout): | |||
# Kmatrix[i][j] = kernel | |||
# Kmatrix[j][i] = kernel | |||
# pool.close() | |||
# pool.join() | |||
# # ---- direct running, normally use single CPU core. ---- | |||
# itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||
# itr = zip(combinations_with_replacement(Gn, 2), | |||
# combinations_with_replacement(splist, 2), | |||
# combinations_with_replacement(range(0, len(Gn)), 2)) | |||
# for gs in tqdm(itr, desc='calculating kernels', file=sys.stdout): | |||
# i, j, kernel = structuralspkernel_do(Gn, splist, ds_attrs, | |||
# node_label, edge_label, node_kernels, edge_kernels, gs) | |||
# i, j, kernel = wrapper_ssp_do(ds_attrs, node_label, edge_label, | |||
# node_kernels, edge_kernels, gs) | |||
# if(kernel > 1): | |||
# print("error here ") | |||
# Kmatrix[i][j] = kernel | |||
@@ -209,18 +236,11 @@ def structuralspkernel(*args, | |||
return Kmatrix, run_time | |||
def structuralspkernel_do(Gn, splist, ds_attrs, node_label, edge_label, | |||
node_kernels, edge_kernels, ij): | |||
iglobal = ij[0] | |||
jglobal = ij[1] | |||
g1 = Gn[iglobal] | |||
g2 = Gn[jglobal] | |||
spl1 = splist[iglobal] | |||
spl2 = splist[jglobal] | |||
def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label, | |||
node_kernels, edge_kernels): | |||
kernel = 0 | |||
#try: | |||
# First, compute shortest path matrices, method borrowed from FCSP. | |||
if ds_attrs['node_labeled']: | |||
# node symb and non-symb labeled | |||
@@ -369,11 +389,19 @@ def structuralspkernel_do(Gn, splist, ds_attrs, node_label, edge_label, | |||
# kn1 = vk_mat[x1][x2] * vk_mat[y1][y2] | |||
# kn2 = vk_mat[x1][y2] * vk_mat[y1][x2] | |||
# Kmatrix += kn1 + kn2 | |||
#except KeyError: # missing labels or attributes | |||
# print("toto") | |||
# pass | |||
return kernel | |||
return iglobal, jglobal, kernel | |||
def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels, | |||
edge_kernels, itr_item): | |||
g1 = itr_item[0][0] | |||
g2 = itr_item[0][1] | |||
spl1 = itr_item[1][0] | |||
spl2 = itr_item[1][1] | |||
i = itr_item[2][0] | |||
j = itr_item[2][1] | |||
return i, j, structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, | |||
node_label, edge_label, node_kernels, edge_kernels) | |||
def get_shortest_paths(G, weight, directed): | |||
@@ -397,17 +425,21 @@ def get_shortest_paths(G, weight, directed): | |||
for n1, n2 in combinations(G.nodes(), 2): | |||
try: | |||
spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight)) | |||
except nx.NetworkXNoPath: # nodes not connected | |||
# sp.append([]) | |||
pass | |||
else: | |||
sp += spltemp | |||
# in undirected graphs each path is counted twice, once from each of its end nodes. | |||
if not directed: | |||
sp += [sptemp[::-1] for sptemp in spltemp] | |||
except nx.NetworkXNoPath: # nodes not connected | |||
# sp.append([]) | |||
pass | |||
# add single nodes as length 0 paths. | |||
sp += [[n] for n in G.nodes()] | |||
return sp | |||
def wrap_getSP(Gn, weight, directed, i): | |||
return i, get_shortest_paths(Gn[i], weight, directed) | |||
def wrapper_getSP(weight, directed, itr_item): | |||
g = itr_item[0] | |||
i = itr_item[1] | |||
return i, get_shortest_paths(g, weight, directed) |
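To see what get_shortest_paths produces, its logic can be replayed on a tiny graph; this re-runs the code above on a 3-node path graph in the undirected case, so the only assumption is the toy input.

import networkx as nx
from itertools import combinations

G = nx.path_graph(3)  # nodes 0 - 1 - 2
sp = []
for n1, n2 in combinations(G.nodes(), 2):
    try:
        spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=None))
    except nx.NetworkXNoPath:  # nodes not connected
        pass
    else:
        sp += spltemp
        sp += [p[::-1] for p in spltemp]  # undirected: count each path from both ends
sp += [[n] for n in G.nodes()]  # single nodes as length-0 paths
print(sp)  # [[0, 1], [1, 0], [0, 1, 2], [2, 1, 0], [1, 2], [2, 1], [0], [1], [2]]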
@@ -13,7 +13,6 @@ from itertools import chain, combinations_with_replacement | |||
from functools import partial | |||
from multiprocessing import Pool | |||
from tqdm import tqdm | |||
import traceback | |||
import networkx as nx | |||
import numpy as np | |||
@@ -77,15 +76,15 @@ def untilhpathkernel(*args, | |||
# but this may cost a lot of memory for large datasets. | |||
pool = Pool(n_jobs) | |||
all_paths = [[] for _ in range(len(Gn))] | |||
getps_partial = partial(wrap_find_all_paths_until_length, Gn, depth, | |||
getps_partial = partial(wrapper_find_all_paths_until_length, depth, | |||
ds_attrs, node_label, edge_label) | |||
itr = zip(Gn, range(0, len(Gn))) | |||
if len(Gn) < 1000 * n_jobs: | |||
chunksize = int(len(Gn) / n_jobs) + 1 | |||
else: | |||
chunksize = 1000 | |||
# chunksize = 300 # int(len(list(itr)) / n_jobs) | |||
for i, ps in tqdm( | |||
pool.imap_unordered(getps_partial, range(0, len(Gn)), chunksize), | |||
pool.imap_unordered(getps_partial, itr, chunksize), | |||
desc='getting paths', file=sys.stdout): | |||
all_paths[i] = ps | |||
pool.close() | |||
@@ -110,8 +109,9 @@ def untilhpathkernel(*args, | |||
pass | |||
else: | |||
pool = Pool(n_jobs) | |||
do_partial = partial(_untilhpathkernel_do_naive, all_paths, k_func) | |||
itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||
do_partial = partial(wrapper_uhpath_do_naive, k_func) | |||
itr = zip(combinations_with_replacement(all_paths, 2), | |||
combinations_with_replacement(range(0, len(Gn)), 2)) | |||
len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | |||
if len_itr < 1000 * n_jobs: | |||
chunksize = int(len_itr / n_jobs) + 1 | |||
@@ -216,7 +216,7 @@ def _untilhpathkernel_do_gst(gst1, gst2, paths1, paths2, k_func): | |||
return kernel | |||
def _untilhpathkernel_do_naive(paths_list, k_func, ij): | |||
def _untilhpathkernel_do_naive(paths1, paths2, k_func): | |||
"""Calculate path graph kernels up to depth d between 2 graphs naively. | |||
Parameters | |||
@@ -235,10 +235,6 @@ def _untilhpathkernel_do_naive(paths_list, k_func, ij): | |||
kernel : float | |||
Path kernel up to h between 2 graphs. | |||
""" | |||
iglobal = ij[0] | |||
jglobal = ij[1] | |||
paths1 = paths_list[iglobal] | |||
paths2 = paths_list[jglobal] | |||
all_paths = list(set(paths1 + paths2)) | |||
if k_func == 'tanimoto': | |||
@@ -260,12 +256,18 @@ def _untilhpathkernel_do_naive(paths_list, k_func, ij): | |||
kernel = np.sum(np.minimum(vector1, vector2)) / \ | |||
np.sum(np.maximum(vector1, vector2)) | |||
return iglobal, jglobal, kernel | |||
return kernel | |||
# @todo: (maybe removable) this method finds paths repetitively; it could be made faster. | |||
def wrapper_uhpath_do_naive(k_func, itr_item): | |||
plist1 = itr_item[0][0] | |||
plist2 = itr_item[0][1] | |||
i = itr_item[1][0] | |||
j = itr_item[1][1] | |||
return i, j, _untilhpathkernel_do_naive(plist1, plist2, k_func) | |||
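For intuition, the two k_func options boil down to simple vector operations over the union of paths found in the two graphs. The MinMax form below matches the formula above (sum of element-wise minima over sum of maxima of the occurrence-count vectors); the Tanimoto form shown is the standard set version, included as an assumption about the elided branch. A toy sketch with hand-made path lists:

import numpy as np

paths1 = [('C',), ('C', 'O'), ('C', 'O'), ('O',)]  # paths as label tuples, with repetition
paths2 = [('C',), ('C', 'N'), ('O',), ('O',)]
all_paths = list(set(paths1 + paths2))

# MinMax kernel: occurrence-count vectors, sum(min) / sum(max)
v1 = np.array([paths1.count(p) for p in all_paths])
v2 = np.array([paths2.count(p) for p in all_paths])
minmax = np.sum(np.minimum(v1, v2)) / np.sum(np.maximum(v1, v2))

# Tanimoto kernel (standard set form): |shared paths| / |distinct paths|
b1, b2 = (v1 > 0).astype(int), (v2 > 0).astype(int)
tanimoto = np.sum(b1 * b2) / np.sum((b1 + b2) > 0)

print(minmax, tanimoto)  # 0.333... and 0.5 for these toy lists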
# @todo: (maybe removable) this method finds paths repetitively; it could be made faster. | |||
def find_all_paths_until_length(G, | |||
length, | |||
ds_attrs, | |||
@@ -368,15 +370,12 @@ def find_all_paths_until_length(G, | |||
return [tuple([len(path)]) for path in all_paths] | |||
def wrap_find_all_paths_until_length(Gn, length, ds_attrs, node_label, | |||
edge_label, i): | |||
try: | |||
return i, find_all_paths_until_length(Gn[i], length, ds_attrs, | |||
def wrapper_find_all_paths_until_length(length, ds_attrs, node_label, | |||
edge_label, itr_item): | |||
g = itr_item[0] | |||
i = itr_item[1] | |||
return i, find_all_paths_until_length(g, length, ds_attrs, | |||
node_label=node_label, edge_label=edge_label) | |||
except Exception as e: | |||
traceback.print_exc() | |||
print('') | |||
raise e | |||
def paths2GSuffixTree(paths): | |||
@@ -206,54 +206,50 @@ def model_selection_for_precomputed_kernel(datafile, | |||
'3. Fitting and predicting using nested cross validation. This could really take a while...' | |||
) | |||
# pool = Pool(n_jobs) | |||
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type) | |||
# train_pref = [] | |||
# val_pref = [] | |||
# test_pref = [] | |||
## if NUM_TRIALS < 1000 * n_jobs: | |||
## chunksize = int(NUM_TRIALS / n_jobs) + 1 | |||
## else: | |||
## chunksize = 1000 | |||
# chunksize = 1 | |||
# for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout): | |||
# train_pref.append(o1) | |||
# val_pref.append(o2) | |||
# test_pref.append(o3) | |||
# pool.close() | |||
# pool.join() | |||
# ---- use pool.map to parallel. ---- | |||
pool = Pool(n_jobs) | |||
trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type) | |||
train_pref = [] | |||
val_pref = [] | |||
test_pref = [] | |||
# if NUM_TRIALS < 100: | |||
# chunksize, extra = divmod(NUM_TRIALS, n_jobs * 4) | |||
# if extra: | |||
# chunksize += 1 | |||
# else: | |||
# chunksize = 100 | |||
chunksize = 1 | |||
for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout): | |||
train_pref.append(o1) | |||
val_pref.append(o2) | |||
test_pref.append(o3) | |||
pool.close() | |||
pool.join() | |||
# # ---- use pool.map to parallel. ---- | |||
# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS)) | |||
# train_pref = [item[0] for item in result_perf] | |||
# val_pref = [item[1] for item in result_perf] | |||
# test_pref = [item[2] for item in result_perf] | |||
result_perf = pool.map(trial_do_partial, range(NUM_TRIALS)) | |||
train_pref = [item[0] for item in result_perf] | |||
val_pref = [item[1] for item in result_perf] | |||
test_pref = [item[2] for item in result_perf] | |||
# # ---- use joblib.Parallel to parallel and track progress. ---- | |||
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type) | |||
# result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS)) | |||
# train_pref = [item[0] for item in result_perf] | |||
# val_pref = [item[1] for item in result_perf] | |||
# test_pref = [item[2] for item in result_perf] | |||
# # ---- direct running, normally use a single CPU core. ---- | |||
# train_pref = [] | |||
# val_pref = [] | |||
# test_pref = [] | |||
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout): | |||
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i) | |||
# train_pref.append(o1) | |||
# val_pref.append(o2) | |||
# test_pref.append(o3) | |||
# # ---- direct running, normally use a single CPU core. ---- | |||
# train_pref = [] | |||
# val_pref = [] | |||
# test_pref = [] | |||
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout): | |||
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i) | |||
# train_pref.append(o1) | |||
# val_pref.append(o2) | |||
# test_pref.append(o3) | |||
# print() | |||
print() | |||
print('4. Getting final performance...') | |||
str_fw += '\nIII. Performance.\n\n' | |||
# averages and confidence measures of performance over outer trials for each parameter combination | |||
average_train_scores = np.mean(train_pref, axis=0) | |||
# print('val_pref: ', val_pref[0][0]) | |||
average_val_scores = np.mean(val_pref, axis=0) | |||
# print('test_pref: ', test_pref[0][0]) | |||
average_perf_scores = np.mean(test_pref, axis=0) | |||
# sample std is used here | |||
std_train_scores = np.std(train_pref, axis=0, ddof=1) | |||
@@ -264,6 +260,9 @@ def model_selection_for_precomputed_kernel(datafile, | |||
best_val_perf = np.amin(average_val_scores) | |||
else: | |||
best_val_perf = np.amax(average_val_scores) | |||
# print('average_val_scores: ', average_val_scores) | |||
# print('best_val_perf: ', best_val_perf) | |||
# print() | |||
best_params_index = np.where(average_val_scores == best_val_perf) | |||
# find smallest val std with best val perf. | |||
best_val_stds = [ | |||
@@ -286,6 +285,9 @@ def model_selection_for_precomputed_kernel(datafile, | |||
str_fw += 'best_val_perf: %s\n' % best_val_perf | |||
str_fw += 'best_val_std: %s\n' % min_val_std | |||
# print(best_params_index) | |||
# print(best_params_index[0]) | |||
# print(average_perf_scores) | |||
final_performance = [ | |||
average_perf_scores[value][best_params_index[1][idx]] | |||
for idx, value in enumerate(best_params_index[0]) | |||
@@ -429,23 +431,23 @@ def model_selection_for_precomputed_kernel(datafile, | |||
'3. Fitting and predicting using nested cross validation. This could really take a while...' | |||
) | |||
# pool = Pool(n_jobs) | |||
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type) | |||
# train_pref = [] | |||
# val_pref = [] | |||
# test_pref = [] | |||
# if NUM_TRIALS < 100: | |||
# chunksize, extra = divmod(NUM_TRIALS, n_jobs * 4) | |||
# if extra: | |||
# chunksize += 1 | |||
# else: | |||
# chunksize = 100 | |||
# for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout): | |||
# train_pref.append(o1) | |||
# val_pref.append(o2) | |||
# test_pref.append(o3) | |||
# pool.close() | |||
# pool.join() | |||
pool = Pool(n_jobs) | |||
trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type) | |||
train_pref = [] | |||
val_pref = [] | |||
test_pref = [] | |||
if NUM_TRIALS < 100: | |||
chunksize, extra = divmod(NUM_TRIALS, n_jobs * 4) | |||
if extra: | |||
chunksize += 1 | |||
else: | |||
chunksize = 100 | |||
for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout): | |||
train_pref.append(o1) | |||
val_pref.append(o2) | |||
test_pref.append(o3) | |||
pool.close() | |||
pool.join() | |||
# # ---- use pool.map to parallel. ---- | |||
# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS)) | |||
@@ -460,15 +462,15 @@ def model_selection_for_precomputed_kernel(datafile, | |||
# val_pref = [item[1] for item in result_perf] | |||
# test_pref = [item[2] for item in result_perf] | |||
# ---- direct running, normally use a single CPU core. ---- | |||
train_pref = [] | |||
val_pref = [] | |||
test_pref = [] | |||
for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout): | |||
o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i) | |||
train_pref.append(o1) | |||
val_pref.append(o2) | |||
test_pref.append(o3) | |||
# # ---- direct running, normally use a single CPU core. ---- | |||
# train_pref = [] | |||
# val_pref = [] | |||
# test_pref = [] | |||
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout): | |||
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i) | |||
# train_pref.append(o1) | |||
# val_pref.append(o2) | |||
# test_pref.append(o3) | |||
print() | |||
print('4. Getting final performance...') | |||
@@ -623,89 +625,142 @@ def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, t | |||
val_pref = np.zeros((len(param_list_pre_revised), len(param_list))) | |||
test_pref = np.zeros((len(param_list_pre_revised), len(param_list))) | |||
# randomness added to the seeds of the split function below. "high" is "size" times | |||
# 10 so that at least 10 different random outputs can be yielded. Remove | |||
# these lines if identical outputs are required. | |||
rdm_out = np.random.RandomState(seed=None) | |||
rdm_seed_out_l = rdm_out.uniform(high=len(param_list_pre_revised) * 10, | |||
size=len(param_list_pre_revised)) | |||
# print(trial, rdm_seed_out_l) | |||
# print() | |||
# loop for each outer param tuple | |||
for index_out, params_out in enumerate(param_list_pre_revised): | |||
# split gram matrix and y to app and test sets. | |||
indices = range(len(y)) | |||
# The argument "random_state" in function "train_test_split" can not be | |||
# set to None, because it will use RandomState instance used by | |||
# np.random, which is possible for multiple subprocesses to inherit the | |||
# same seed if they forked at the same time, leading to identical | |||
# random variates for different subprocesses. Instead, we use "trial" | |||
# and "index_out" parameters to generate different seeds for different | |||
# trials/subprocesses and outer loops. "rdm_seed_out_l" is used to add | |||
# randomness into seeds, so that it yields a different output every | |||
# time the program is run. To yield identical outputs every time, | |||
# remove the second line below. Same method is used to the "KFold" | |||
# function in the inner loop. | |||
rdm_seed_out = (trial + 1) * (index_out + 1) | |||
rdm_seed_out = (rdm_seed_out + int(rdm_seed_out_l[index_out])) % (2 ** 32 - 1) | |||
# print(trial, rdm_seed_out) | |||
X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split( | |||
gram_matrices[index_out], y, indices, test_size=0.1, | |||
random_state=None, shuffle=True) | |||
random_state=rdm_seed_out, shuffle=True) | |||
# print(trial, idx_app, idx_test) | |||
# print() | |||
X_app = X_app[:, idx_app] | |||
X_test = X_test[:, idx_app] | |||
y_app = np.array(y_app) | |||
y_test = np.array(y_test) | |||
rdm_seed_in_l = rdm_out.uniform(high=len(param_list) * 10, | |||
size=len(param_list)) | |||
# loop for each inner param tuple | |||
for index_in, params_in in enumerate(param_list): | |||
# print(index_in, params_in) | |||
# if trial == 0: | |||
# print(index_out, index_in) | |||
# print('params_in: ', params_in) | |||
# st = time.time() | |||
inner_cv = KFold(n_splits=10, shuffle=True, random_state=trial) | |||
rdm_seed_in = (trial + 1) * (index_out + 1) * (index_in + 1) | |||
# print("rdm_seed_in1: ", trial, index_in, rdm_seed_in) | |||
rdm_seed_in = (rdm_seed_in + int(rdm_seed_in_l[index_in])) % (2 ** 32 - 1) | |||
# print("rdm_seed_in2: ", trial, index_in, rdm_seed_in) | |||
inner_cv = KFold(n_splits=10, shuffle=True, random_state=rdm_seed_in) | |||
current_train_perf = [] | |||
current_valid_perf = [] | |||
current_test_perf = [] | |||
# For regression use the Kernel Ridge method | |||
try: | |||
if model_type == 'regression': | |||
kr = KernelRidge(kernel='precomputed', **params_in) | |||
# loop for each split on validation set level | |||
# validation set level | |||
for train_index, valid_index in inner_cv.split(X_app): | |||
kr.fit(X_app[train_index, :][:, train_index], | |||
y_app[train_index]) | |||
# try: | |||
if model_type == 'regression': | |||
kr = KernelRidge(kernel='precomputed', **params_in) | |||
# loop for each split on validation set level | |||
# validation set level | |||
for train_index, valid_index in inner_cv.split(X_app): | |||
# print("train_index, valid_index: ", trial, index_in, train_index, valid_index) | |||
# if trial == 0: | |||
# print('train_index: ', train_index) | |||
# print('valid_index: ', valid_index) | |||
# print('idx_test: ', idx_test) | |||
# print('y_app[train_index]: ', y_app[train_index]) | |||
# print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index]) | |||
# print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index]) | |||
kr.fit(X_app[train_index, :][:, train_index], | |||
y_app[train_index]) | |||
# predict on the train, validation and test set | |||
y_pred_train = kr.predict( | |||
X_app[train_index, :][:, train_index]) | |||
y_pred_valid = kr.predict( | |||
X_app[valid_index, :][:, train_index]) | |||
y_pred_test = kr.predict( | |||
X_test[:, train_index]) | |||
# predict on the train, validation and test set | |||
y_pred_train = kr.predict( | |||
X_app[train_index, :][:, train_index]) | |||
y_pred_valid = kr.predict( | |||
X_app[valid_index, :][:, train_index]) | |||
# if trial == 0: | |||
# print('y_pred_valid: ', y_pred_valid) | |||
# print() | |||
y_pred_test = kr.predict( | |||
X_test[:, train_index]) | |||
# root mean squared errors | |||
current_train_perf.append( | |||
np.sqrt( | |||
mean_squared_error( | |||
y_app[train_index], y_pred_train))) | |||
current_valid_perf.append( | |||
np.sqrt( | |||
mean_squared_error( | |||
y_app[valid_index], y_pred_valid))) | |||
current_test_perf.append( | |||
np.sqrt( | |||
mean_squared_error( | |||
y_test, y_pred_test))) | |||
# For classification use SVM | |||
else: | |||
svc = SVC(kernel='precomputed', cache_size=200, | |||
verbose=False, **params_in) | |||
# loop for each split on validation set level | |||
# validation set level | |||
for train_index, valid_index in inner_cv.split(X_app): | |||
# root mean squared errors | |||
current_train_perf.append( | |||
np.sqrt( | |||
mean_squared_error( | |||
y_app[train_index], y_pred_train))) | |||
current_valid_perf.append( | |||
np.sqrt( | |||
mean_squared_error( | |||
y_app[valid_index], y_pred_valid))) | |||
# if trial == 0: | |||
# print(mean_squared_error( | |||
# y_app[valid_index], y_pred_valid)) | |||
current_test_perf.append( | |||
np.sqrt( | |||
mean_squared_error( | |||
y_test, y_pred_test))) | |||
# For classification use SVM | |||
else: | |||
svc = SVC(kernel='precomputed', cache_size=200, | |||
verbose=False, **params_in) | |||
# loop for each split on validation set level | |||
# validation set level | |||
for train_index, valid_index in inner_cv.split(X_app): | |||
# np.savez("bug.npy",X_app[train_index, :][:, train_index],y_app[train_index]) | |||
svc.fit(X_app[train_index, :][:, train_index], | |||
y_app[train_index]) | |||
# predict on the train, validation and test set | |||
y_pred_train = svc.predict( | |||
X_app[train_index, :][:, train_index]) | |||
y_pred_valid = svc.predict( | |||
X_app[valid_index, :][:, train_index]) | |||
y_pred_test = svc.predict( | |||
X_test[:, train_index]) | |||
# if trial == 0: | |||
# print('train_index: ', train_index) | |||
# print('valid_index: ', valid_index) | |||
# print('idx_test: ', idx_test) | |||
# print('y_app[train_index]: ', y_app[train_index]) | |||
# print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index]) | |||
# print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index]) | |||
svc.fit(X_app[train_index, :][:, train_index], | |||
y_app[train_index]) | |||
# predict on the train, validation and test set | |||
y_pred_train = svc.predict( | |||
X_app[train_index, :][:, train_index]) | |||
y_pred_valid = svc.predict( | |||
X_app[valid_index, :][:, train_index]) | |||
y_pred_test = svc.predict( | |||
X_test[:, train_index]) | |||
# classification accuracies | |||
current_train_perf.append( | |||
accuracy_score(y_app[train_index], | |||
y_pred_train)) | |||
current_valid_perf.append( | |||
accuracy_score(y_app[valid_index], | |||
y_pred_valid)) | |||
current_test_perf.append( | |||
accuracy_score(y_test, y_pred_test)) | |||
except ValueError: | |||
print(sys.exc_info()[0]) | |||
print(params_out, params_in) | |||
# classification accuracies | |||
current_train_perf.append( | |||
accuracy_score(y_app[train_index], | |||
y_pred_train)) | |||
current_valid_perf.append( | |||
accuracy_score(y_app[valid_index], | |||
y_pred_valid)) | |||
current_test_perf.append( | |||
accuracy_score(y_test, y_pred_test)) | |||
# except ValueError: | |||
# print(sys.exc_info()[0]) | |||
# print(params_out, params_in) | |||
# average performance on inner splits | |||
train_pref[index_out][index_in] = np.mean( | |||
@@ -715,5 +770,8 @@ def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, t | |||
test_pref[index_out][index_in] = np.mean( | |||
current_test_perf) | |||
# print(time.time() - st) | |||
# if trial == 0: | |||
# print('val_pref: ', val_pref) | |||
# print('test_pref: ', test_pref) | |||
return train_pref, val_pref, test_pref |
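A standalone sketch of the seeding scheme used in trial_do, as described in the comments inside the outer loop: the deterministic part keeps trials and outer loops apart even when subprocesses fork with the same NumPy state, and the random offset (drawn once per call) makes each program run differ. Names here are illustrative only.

import numpy as np

def derive_seed(trial, index_out, offset):
    # deterministic part distinguishes trials/outer loops; the offset randomizes across runs
    return ((trial + 1) * (index_out + 1) + int(offset)) % (2 ** 32 - 1)

rdm_out = np.random.RandomState(seed=None)
offsets = rdm_out.uniform(high=5 * 10, size=5)  # "high" is "size" times 10
seeds = [derive_seed(trial=3, index_out=k, offset=offsets[k]) for k in range(5)]
print(seeds)  # seeds for five outer loops of one trial; drop the offset term for reproducible runs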