2. Modify model_selection_precomputed so that all results are kept in memory and written to a file in the last section of the code, in case I/O takes too much time on systems where CPU and disk are separated. 3. Correct the utils.floyd_warshall_numpy function. DO NOT use the last version.

v0.1
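The corrected `utils.floyd_warshall_numpy` itself is not shown in this patch. For reference, a minimal NumPy Floyd-Warshall over a dense adjacency matrix might look like the sketch below; the function name, signature, and the convention that 0 off-diagonal means "no edge" are assumptions here, not the patched code.

```python
import numpy as np

def floyd_warshall_numpy(A):
    """All-pairs shortest-path distances for a dense adjacency matrix A.

    Reference sketch only, not the patched pygraph code: nonzero entries
    of A are edge weights, zero off-diagonal entries mean "no edge".
    """
    n = A.shape[0]
    D = np.full((n, n), np.inf)
    D[A != 0] = A[A != 0]  # existing edges keep their weights
    np.fill_diagonal(D, 0)  # a node is at distance 0 from itself
    for k in range(n):
        # relax all pairs through intermediate node k in one vectorized step
        D = np.minimum(D, D[:, [k]] + D[[k], :])
    return D
```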
@@ -3,12 +3,15 @@ A python package for graph kernels.

## Requirements

numpy==1.14.5
scipy==1.1.0
matplotlib==2.2.2
networkx==2.1
scikit-learn==0.19.1
tabulate==0.8.2
tqdm==4.23.4
control==0.7.0 (for generalized random walk kernels only)
slycot===0.3.2.dev-5263ada (for generalized random walk kernels only; requires a Fortran compiler such as gfortran)
## Results with minimal test RMSE for each kernel on dataset Acyclic

@@ -28,7 +31,7 @@ For prediction we randomly divide the data in train and test subset, where 90\%

| WL shortest path | 28.74±0.60 | 38.20±0.62 | 39.02±6.09 | 'height': 10.0, 'alpha': '1.00' | 146.83"/80.63"±45.04" |
| WL edge | 30.21±0.64 | 36.53±1.02 | 38.42±6.42 | 'height': 5.0, 'alpha': '6.31e-01' | 5.24"/5.15"±2.83" |
| Treelet | 7.33±0.64 | 13.86±0.80 | 15.38±3.56 | 'alpha': '1.12e+01' | 0.48" |
| Path up to d | 5.76±0.27 | 9.89±0.87 | 10.21±4.16 | 'depth': 2.0, 'k_func': 'MinMax', 'alpha': '0.1' | 0.56"/1.16"±0.75" |
| Cyclic pattern | | | | | |
| Walk up to n | 20.88±0.74 | 23.34±1.11 | 24.46±6.57 | 'n': 2.0, 'alpha': '1.00e-03' | 0.56"/331.70"±753.44" |
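The evaluation protocol behind these numbers (90% of the data for training/validation, 10% held out for testing, repeated over trials) can be sketched as below. This is a schematic of the protocol for one kernel-ridge trial on a precomputed gram matrix, not the library's exact code; `K`, `y`, `alpha`, and `seed` are assumed inputs.

```python
import numpy as np
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split

def one_trial(K, y, alpha, seed):
    """One outer trial: 90/10 split, then 10-fold inner CV on the 90%."""
    idx = np.arange(len(y))
    app, test = train_test_split(idx, test_size=0.1, random_state=seed)
    valid_rmse, test_rmse = [], []
    for tr, va in KFold(n_splits=10, shuffle=True, random_state=seed).split(app):
        tr, va = app[tr], app[va]
        model = KernelRidge(kernel='precomputed', alpha=alpha)
        # rows/columns of the gram matrix are selected with np.ix_
        model.fit(K[np.ix_(tr, tr)], y[tr])
        valid_rmse.append(np.sqrt(mean_squared_error(
            y[va], model.predict(K[np.ix_(va, tr)]))))
        test_rmse.append(np.sqrt(mean_squared_error(
            y[test], model.predict(K[np.ix_(test, tr)]))))
    return np.mean(valid_rmse), np.mean(test_rmse)
```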
@@ -3,106 +3,66 @@ dslist = [
        'name': 'Acyclic',
        'dataset': '../datasets/acyclic/dataset_bps.ds',
        'task': 'regression'
    },  # node symb
    # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
    {
        'name': 'PAH',
        'dataset': '../datasets/PAH/dataset.ds',
    },  # unlabeled
    {
        'name': 'MAO',
        'dataset': '../datasets/MAO/dataset.ds',
    },  # node/edge symb
    {
        'name': 'MUTAG',
        'dataset': '../datasets/MUTAG/MUTAG.mat',
        'extra_params': {
            'am_sp_al_nl_el': [0, 0, 3, 1, 2]
        }
    },  # node/edge symb
    {
        'name': 'Alkane',
        'dataset': '../datasets/Alkane/dataset.ds',
        'task': 'regression',
        'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',
    },  # contains single node graph, node symb
    # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
    # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
    {
        'name': 'Mutagenicity',
        'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'
    },  # node/edge symb
    {
        'name': 'ENZYMES',
        'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'
    },  # node symb/nsymb
    # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
    {
        'name': 'Letter-med',
        'dataset': '../datasets/Letter-med/Letter-med_A.txt'
    },
    # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
    # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
    # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
    # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
    # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb, edge nsymb
    # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
    # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
    {
        'name': 'D&D',
        'dataset': '../datasets/D&D/DD.mat',
        'extra_params': {
            'am_sp_al_nl_el': [0, 1, 2, 1, -1]
        }
    },  # node symb
    # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
    # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
    #  'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
    # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
    #  'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
    # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
    #  'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb
    # # not working below
    # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
    # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
    # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
    # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
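Each `dslist` entry is a dataset descriptor; `task`, `dataset_y`, and `extra_params` are optional keys. A sketch of how one entry is consumed, matching the `loadDataset` call that appears later in this patch (the `.get` defaults are mine):

```python
from pygraph.utils.graphfiles import loadDataset

for ds in dslist:
    # graphs come from 'dataset'; targets either come with it or from 'dataset_y'
    dataset, y = loadDataset(
        ds['dataset'],
        filename_y=ds.get('dataset_y'),
        extra_params=ds.get('extra_params'))
    task = ds.get('task', 'classification')
    print(ds['name'], len(dataset), task)
```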
@@ -1,56 +1,157 @@
import functools
from libs import *
from pygraph.kernels.spKernel import spkernel
from pygraph.utils.kernels import deltakernel, kernelsum
from sklearn.metrics.pairwise import rbf_kernel
import ast

ds = ast.literal_eval(sys.argv[1])
estimator = spkernel
mixkernel = functools.partial(kernelsum, deltakernel, rbf_kernel)
param_grid_precomputed = {
    'node_kernels': [{
        'symb': deltakernel,
        'nsymb': rbf_kernel,
        'mix': mixkernel
    }]
}
param_grid = [{
    'C': np.logspace(-10, 10, num=41, base=10)
}, {
    'alpha': np.logspace(-10, 10, num=41, base=10)
}]

print()
print(ds['name'])
model_selection_for_precomputed_kernel(
    ds['dataset'],
    estimator,
    param_grid_precomputed,
    (param_grid[1]
     if ('task' in ds and ds['task'] == 'regression') else param_grid[0]),
    (ds['task'] if 'task' in ds else 'classification'),
    NUM_TRIALS=30,
    datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
    extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
    ds_name=ds['name'])

# %lprun -f spkernel \
#     model_selection_for_precomputed_kernel( \
#         ds['dataset'], estimator, param_grid_precomputed, \
#         (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \
#         (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30, \
#         datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), \
#         extra_params=(ds['extra_params'] if 'extra_params' in ds else None))
print()
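The script now receives its dataset descriptor on the command line as one quoted Python literal, which `ast.literal_eval` parses safely (unlike `eval`, it accepts literals only and cannot execute code). A self-contained illustration of the round trip; the invocation shown in the comment is an assumed example, not a command from the patch:

```python
import ast

# e.g.  python3 run_script.py "{'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}"
arg = "{'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}"
ds = ast.literal_eval(arg)  # parses the dict literal; no code execution
assert ds['task'] == 'regression'
```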
@@ -1 +0,0 @@
ljia@ljia-Precision-7520.4716:1530265749
@@ -9,6 +9,9 @@ sys.path.insert(0, "../")
from tqdm import tqdm
import time
from itertools import combinations_with_replacement, product
from functools import partial
from joblib import Parallel, delayed
from multiprocessing import Pool

import networkx as nx
import numpy as np

@@ -17,7 +20,11 @@ from pygraph.utils.utils import getSPGraph
from pygraph.utils.graphdataset import get_dataset_attributes


def spkernel(*args,
             node_label='atom',
             edge_weight=None,
             node_kernels=None,
             n_jobs=None):
    """Calculate shortest-path kernels between graphs.

    Parameters
@@ -70,180 +77,344 @@ def spkernel(*args, node_label='atom', edge_weight=None, node_kernels=None):
    if len(Gn) != len_gn:
        print('\n %d graphs are removed as they don\'t contain edges.\n' %
              (len_gn - len(Gn)))

    start_time = time.time()

    pool = Pool(n_jobs)
    # get shortest path graphs of Gn
    getsp_partial = partial(wrap_getSPGraph, Gn, edge_weight)
    result_sp = pool.map(getsp_partial, range(0, len(Gn)))
    for i in result_sp:
        Gn[i[0]] = i[1]
    # Gn = [
    #     getSPGraph(G, edge_weight=edge_weight)
    #     for G in tqdm(Gn, desc='getting sp graphs', file=sys.stdout)
    # ]

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
    itr = combinations_with_replacement(range(0, len(Gn)), 2)
    # chunksize = 2000  # int(len(list(itr)) / n_jobs)
    # for i, j, kernel in tqdm(pool.imap_unordered(do_partial, itr, chunksize)):
    #     Kmatrix[i][j] = kernel
    #     Kmatrix[j][i] = kernel
    result_perf = pool.map(do_partial, itr)
    pool.close()
    pool.join()
    # result_perf = Parallel(
    #     n_jobs=n_jobs, verbose=10)(
    #         delayed(do_partial)(ij)
    #         for ij in combinations_with_replacement(range(0, len(Gn)), 2))
    # result_perf = [
    #     do_partial(ij)
    #     for ij in combinations_with_replacement(range(0, len(Gn)), 2)
    # ]
    for i in result_perf:
        Kmatrix[i[0]][i[1]] = i[2]
        Kmatrix[i[1]][i[0]] = i[2]
    run_time = time.time() - start_time
    print(
        "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
        % (len(Gn), run_time))

    return Kmatrix, run_time, idx
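For reference, a minimal usage sketch of the parallelized `spkernel`, put together from the test script in this patch; the dataset path and `n_jobs` value are illustrative:

```python
import functools
from sklearn.metrics.pairwise import rbf_kernel
from pygraph.kernels.spKernel import spkernel
from pygraph.utils.kernels import deltakernel, kernelsum
from pygraph.utils.graphfiles import loadDataset

# one node kernel per label type: symbolic, non-symbolic, and mixed
mixkernel = functools.partial(kernelsum, deltakernel, rbf_kernel)
node_kernels = {'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}

Gn, y = loadDataset('../datasets/acyclic/dataset_bps.ds')
Kmatrix, run_time, idx = spkernel(
    Gn, node_label='atom', node_kernels=node_kernels, n_jobs=4)
print(Kmatrix.shape, run_time)
```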
def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij):

    i = ij[0]
    j = ij[1]
    Kmatrix = 0

    if ds_attrs['node_labeled']:
        # node symb and non-symb labeled
        if ds_attrs['node_attr_dim'] > 0:
            if ds_attrs['is_directed']:
                for e1, e2 in product(
                        Gn[i].edges(data=True), Gn[j].edges(data=True)):
                    if e1[2]['cost'] == e2[2]['cost']:
                        kn = node_kernels['mix']
                        try:
                            n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
                                i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
                                    j].nodes[e2[1]]
                            kn1 = kn(
                                n11[node_label], n21[node_label],
                                [n11['attributes']], [n21['attributes']]) * kn(
                                    n12[node_label], n22[node_label],
                                    [n12['attributes']], [n22['attributes']])
                            Kmatrix += kn1
                        except KeyError:  # missing labels or attributes
                            pass
            else:
                for e1, e2 in product(
                        Gn[i].edges(data=True), Gn[j].edges(data=True)):
                    if e1[2]['cost'] == e2[2]['cost']:
                        kn = node_kernels['mix']
                        try:
                            # each edge walk is counted twice, starting from both its extreme nodes.
                            n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
                                i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
                                    j].nodes[e2[1]]
                            kn1 = kn(
                                n11[node_label], n21[node_label],
                                [n11['attributes']], [n21['attributes']]) * kn(
                                    n12[node_label], n22[node_label],
                                    [n12['attributes']], [n22['attributes']])
                            kn2 = kn(
                                n11[node_label], n22[node_label],
                                [n11['attributes']], [n22['attributes']]) * kn(
                                    n12[node_label], n21[node_label],
                                    [n12['attributes']], [n21['attributes']])
                            Kmatrix += kn1 + kn2
                        except KeyError:  # missing labels or attributes
                            pass
        # node symb labeled
        else:
            if ds_attrs['is_directed']:
                for e1, e2 in product(
                        Gn[i].edges(data=True), Gn[j].edges(data=True)):
                    if e1[2]['cost'] == e2[2]['cost']:
                        kn = node_kernels['symb']
                        try:
                            n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
                                i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
                                    j].nodes[e2[1]]
                            kn1 = kn(n11[node_label], n21[node_label]) * kn(
                                n12[node_label], n22[node_label])
                            Kmatrix += kn1
                        except KeyError:  # missing labels
                            pass
            else:
                for e1, e2 in product(
                        Gn[i].edges(data=True), Gn[j].edges(data=True)):
                    if e1[2]['cost'] == e2[2]['cost']:
                        kn = node_kernels['symb']
                        try:
                            # each edge walk is counted twice, starting from both its extreme nodes.
                            n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
                                i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
                                    j].nodes[e2[1]]
                            kn1 = kn(n11[node_label], n21[node_label]) * kn(
                                n12[node_label], n22[node_label])
                            kn2 = kn(n11[node_label], n22[node_label]) * kn(
                                n12[node_label], n21[node_label])
                            Kmatrix += kn1 + kn2
                        except KeyError:  # missing labels
                            pass
    else:
        # node non-symb labeled
        if ds_attrs['node_attr_dim'] > 0:
            if ds_attrs['is_directed']:
                for e1, e2 in product(
                        Gn[i].edges(data=True), Gn[j].edges(data=True)):
                    if e1[2]['cost'] == e2[2]['cost']:
                        kn = node_kernels['nsymb']
                        try:
                            n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
                                i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
                                    j].nodes[e2[1]]
                            kn1 = kn(
                                [n11['attributes']], [n21['attributes']]) * kn(
                                    [n12['attributes']], [n22['attributes']])
                            Kmatrix += kn1
                        except KeyError:  # missing attributes
                            pass
            else:
                for e1, e2 in product(
                        Gn[i].edges(data=True), Gn[j].edges(data=True)):
                    if e1[2]['cost'] == e2[2]['cost']:
                        kn = node_kernels['nsymb']
                        try:
                            # each edge walk is counted twice, starting from both its extreme nodes.
                            n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
                                i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
                                    j].nodes[e2[1]]
                            kn1 = kn(
                                [n11['attributes']], [n21['attributes']]) * kn(
                                    [n12['attributes']], [n22['attributes']])
                            kn2 = kn(
                                [n11['attributes']], [n22['attributes']]) * kn(
                                    [n12['attributes']], [n21['attributes']])
                            Kmatrix += kn1 + kn2
                        except KeyError:  # missing attributes
                            pass
        # node unlabeled
        else:
            for e1, e2 in product(
                    Gn[i].edges(data=True), Gn[j].edges(data=True)):
                if e1[2]['cost'] == e2[2]['cost']:
                    Kmatrix += 1

    return i, j, Kmatrix


def wrap_getSPGraph(Gn, weight, i):
    return i, getSPGraph(Gn[i], edge_weight=weight)
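A note on the design: `wrap_getSPGraph` and `spkernel_do` are module-level functions because `multiprocessing.Pool` pickles the callable it maps, and `functools.partial` over a top-level function stays picklable where a lambda or nested function would not. A generic illustration of the same pattern, with names that are illustrative rather than from the patch:

```python
from functools import partial
from multiprocessing import Pool

def work(shared, i):  # top-level, hence picklable
    return i, shared[i] * 2

if __name__ == '__main__':
    data = [1, 2, 3]
    with Pool(2) as pool:
        # partial binds the shared argument; only i varies per task
        for i, res in pool.map(partial(work, data), range(len(data))):
            print(i, res)
```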
@@ -1,11 +1,32 @@
import numpy as np
from matplotlib import pyplot as plt
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import KFold, train_test_split, ParameterGrid
from joblib import Parallel, delayed
from multiprocessing import Pool
from functools import partial
import sys
sys.path.insert(0, "../")
import os
import time
from os.path import basename, splitext
from pygraph.utils.graphfiles import loadDataset
from tqdm import tqdm
def model_selection_for_precomputed_kernel(datafile,
                                           estimator,
                                           param_grid_precomputed,
                                           param_grid,
                                           model_type,
                                           NUM_TRIALS=30,
                                           datafile_y=None,
                                           extra_params=None,
                                           ds_name='ds-unknown',
                                           n_jobs=1):
    """Perform model selection, fitting and testing for precomputed kernels using nested CV. Print out necessary data during the process, then finally the results.

    Parameters

@@ -40,94 +61,101 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
    >>>
    >>> model_selection_for_precomputed_kernel(datafile, estimator, param_grid_precomputed, param_grid, 'regression')
    """
    tqdm.monitor_interval = 0

    results_dir = '../notebooks/results/' + estimator.__name__
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    # a string to save all the results.
    str_fw = '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n'

    # setup the model type
    model_type = model_type.lower()
    if model_type != 'regression' and model_type != 'classification':
        raise Exception(
            'The model type is incorrect! Please choose from regression or classification.'
        )
    print()
    print('--- This is a %s problem ---' % model_type)
    str_fw += 'This is a %s problem.\n\n' % model_type

    # Load the dataset
    print()
    print('\nI. Loading dataset from file...')
    dataset, y = loadDataset(
        datafile, filename_y=datafile_y, extra_params=extra_params)

    # import matplotlib.pyplot as plt
    # import networkx as nx
    # nx.draw_networkx(dataset[30])
    # plt.show()

    # Grid of parameters with a discrete number of values for each.
    param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
    param_list = list(ParameterGrid(param_grid))
    # np.savetxt(results_name_pre + 'param_grid_precomputed.dt',
    #            [[key, value] for key, value in sorted(param_grid_precomputed)])
    # np.savetxt(results_name_pre + 'param_grid.dt',
    #            [[key, value] for key, value in sorted(param_grid)])

    gram_matrices = [
    ]  # a list to store gram matrices for all param_grid_precomputed
    gram_matrix_time = [
    ]  # a list to store time to calculate gram matrices
    param_list_pre_revised = [
    ]  # list to store param grids precomputed ignoring the useless ones

    # calculate all gram matrices
    print()
    print('2. Calculating gram matrices. This could take a while...')
    str_fw += '\nI. Gram matrices.\n\n'
    tts = time.time()  # start training time
    nb_gm_ignore = 0  # the number of gram matrices that should not be considered, as they may contain elements that are not numbers (NaN)
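This is the buffering change the commit message describes: every section appends to `str_fw` instead of writing to disk immediately, so a CPU/disk-separated system pays for I/O only once. The closing write falls outside this excerpt, so the last step in the sketch below is an assumption about how `str_fw` is flushed:

```python
# a minimal sketch of the accumulate-then-write-once pattern used here;
# the final write is assumed, it is not part of this excerpt
results_dir, ds_name = '../notebooks/results/spkernel', 'Acyclic'
str_fw = '# This file contains results...\n'
str_fw += 'This is a regression problem.\n\n'
# ...every later section appends more text to str_fw...
with open(results_dir + '/' + ds_name + '.txt', 'w') as f:
    f.write(str_fw)  # one disk write at the very end
```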
    for idx, params_out in enumerate(param_list_precomputed):
        params_out['n_jobs'] = n_jobs
        rtn_data = estimator(dataset, **params_out)
        Kmatrix = rtn_data[0]
        current_run_time = rtn_data[1]
        if len(rtn_data) == 3:
            idx_trim = rtn_data[2]  # the index of trimmed graph list
            y = [y[idx] for idx in idx_trim]

        Kmatrix_diag = Kmatrix.diagonal().copy()

        # remove graphs whose kernels with themselves are zeros
        nb_g_ignore = 0
        for idx, diag in enumerate(Kmatrix_diag):
            if diag == 0:
                Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=0)
                Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=1)
                nb_g_ignore += 1

        # normalization
        for i in range(len(Kmatrix)):
            for j in range(i, len(Kmatrix)):
                Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
                Kmatrix[j][i] = Kmatrix[i][j]

        print()
        if params_out == {}:
            print('the gram matrix is: ')
            str_fw += 'the gram matrix is:\n\n'
        else:
            print('the gram matrix with parameters', params_out, 'is: ')
            str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out
        if len(Kmatrix) < 2:
            nb_gm_ignore += 1
            print('ignored, as at most only one of all its diagonal value is non-zero.')
            str_fw += 'ignored, as at most only one of all its diagonal value is non-zero.\n\n'
        else:
            if np.isnan(Kmatrix).any(
            ):  # if the matrix contains elements that are not numbers
                nb_gm_ignore += 1
                print('ignored, as it contains elements that are not numbers.')
                str_fw += 'ignored, as it contains elements that are not numbers.\n\n'
            else:
                print(Kmatrix)
                str_fw += np.array2string(
                    Kmatrix,
                    separator=',',
                    threshold=np.inf,
                    floatmode='unique') + '\n\n'
                plt.matshow(Kmatrix)
                plt.colorbar()
                fig_file_name = results_dir + '/GM[ds]' + ds_name
@@ -138,115 +166,52 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
                gram_matrices.append(Kmatrix)
                gram_matrix_time.append(current_run_time)
                param_list_pre_revised.append(params_out)
                if nb_g_ignore > 0:
                    print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore)
                    str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore
    print()
    print(
        '{} gram matrices are calculated, {} of which are ignored.'.format(
            len(param_list_precomputed), nb_gm_ignore))
    str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore)
    str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n'
    str_fw += ''.join([
        '{}: {}\n'.format(idx, params_out)
        for idx, params_out in enumerate(param_list_precomputed)
    ])
    print()
    if len(gram_matrices) == 0:
        print('all gram matrices are ignored, no results obtained.')
        str_fw += '\nall gram matrices are ignored, no results obtained.\n\n'
    else:
        print(
            '3. Fitting and predicting using nested cross validation. This could really take a while...'
        )
        pool = Pool(n_jobs)
        trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
        result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
        train_pref = [item[0] for item in result_perf]
        val_pref = [item[1] for item in result_perf]
        test_pref = [item[2] for item in result_perf]
        pool.close()
        pool.join()
        # trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
        # result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS))
        # train_pref = [item[0] for item in result_perf]
        # val_pref = [item[1] for item in result_perf]
        # test_pref = [item[2] for item in result_perf]

        # np.save(results_name_pre + 'train_pref.dt', train_pref)
        # np.save(results_name_pre + 'val_pref.dt', val_pref)
        # np.save(results_name_pre + 'test_pref.dt', test_pref)
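The `trial_do` function mapped over the pool is defined outside this excerpt. A hypothetical sketch of its shape, inferred from how its results are consumed above (the body comment marks what is omitted; everything here is an assumption, not the patched code):

```python
import numpy as np

def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, trial):
    """One outer trial: returns one score matrix per performance kind,
    indexed by (gram-matrix params, estimator params)."""
    shape = (len(param_list_pre_revised), len(param_list))
    train_pref = np.zeros(shape)
    val_pref = np.zeros(shape)
    test_pref = np.zeros(shape)
    # ...fit and predict for every (outer, inner) parameter pair on a fresh
    # 90/10 split and 10-fold inner CV, filling the three matrices...
    return train_pref, val_pref, test_pref
```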
        print()
        print('4. Getting final performance...')
        str_fw += '\nII. Performance.\n\n'

        # averages and confidences of performances on outer trials for each combination of parameters
        average_train_scores = np.mean(train_pref, axis=0)
        average_val_scores = np.mean(val_pref, axis=0)
@@ -255,53 +220,78 @@ def model_selection_for_precomputed_kernel(datafile, estimator, | |||
std_train_scores = np.std(train_pref, axis=0, ddof=1) | |||
std_val_scores = np.std(val_pref, axis=0, ddof=1) | |||
std_perf_scores = np.std(test_pref, axis=0, ddof=1) | |||
if model_type == 'regression': | |||
best_val_perf = np.amin(average_val_scores) | |||
else: | |||
best_val_perf = np.amax(average_val_scores) | |||
best_params_index = np.where(average_val_scores == best_val_perf) | |||
# find smallest val std with best val perf. | |||
best_val_stds = [ | |||
std_val_scores[value][best_params_index[1][idx]] | |||
for idx, value in enumerate(best_params_index[0]) | |||
] | |||
min_val_std = np.amin(best_val_stds) | |||
best_params_index = np.where(std_val_scores == min_val_std) | |||
best_params_out = [ | |||
param_list_pre_revised[i] for i in best_params_index[0] | |||
] | |||
best_params_in = [param_list[i] for i in best_params_index[1]] | |||
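# How the selection above works, illustrated on a toy case: np.where on the
# 2-D score grid returns (outer-param indices, inner-param indices). E.g. with
#     average_val_scores = [[3., 1.],
#                           [1., 2.]]   # regression: lower is better
# np.amin gives 1.0 and np.where yields (array([0, 1]), array([1, 0])); the
# tie is then broken by the smallest validation std. Note that the second
# np.where scans the whole std grid, so in principle it can pick a cell
# outside the original ties.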
print('best_params_out: ', best_params_out) | |||
print('best_params_in: ', best_params_in) | |||
print() | |||
print('best_val_perf: ', best_val_perf) | |||
print('best_val_std: ', min_val_std) | |||
str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out | |||
str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in | |||
str_fw += 'best_val_perf: %s\n' % best_val_perf | |||
str_fw += 'best_val_std: %s\n' % min_val_std | |||
final_performance = [ | |||
average_perf_scores[value][best_params_index[1][idx]] | |||
for idx, value in enumerate(best_params_index[0]) | |||
] | |||
final_confidence = [ | |||
std_perf_scores[value][best_params_index[1][idx]] | |||
for idx, value in enumerate(best_params_index[0]) | |||
] | |||
print('final_performance: ', final_performance) | |||
print('final_confidence: ', final_confidence) | |||
str_fw += 'final_performance: %s\n' % final_performance | |||
str_fw += 'final_confidence: %s\n' % final_confidence | |||
train_performance = [ | |||
average_train_scores[value][best_params_index[1][idx]] | |||
for idx, value in enumerate(best_params_index[0]) | |||
] | |||
train_std = [ | |||
std_train_scores[value][best_params_index[1][idx]] | |||
for idx, value in enumerate(best_params_index[0]) | |||
] | |||
print('train_performance: %s' % train_performance) | |||
print('train_std: ', train_std) | |||
str_fw += 'train_performance: %s\n' % train_performance | |||
str_fw += 'train_std: %s\n\n' % train_std | |||
print() | |||
tt_total = time.time() - tts # training time for all hyper-parameters | |||
average_gram_matrix_time = np.mean(gram_matrix_time) | |||
std_gram_matrix_time = np.std(gram_matrix_time, ddof=1) | |||
best_gram_matrix_time = [ | |||
gram_matrix_time[i] for i in best_params_index[0] | |||
] | |||
ave_bgmt = np.mean(best_gram_matrix_time) | |||
std_bgmt = np.std(best_gram_matrix_time, ddof=1) | |||
print( | |||
'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s' | |||
.format(average_gram_matrix_time, std_gram_matrix_time)) | |||
print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format( | |||
ave_bgmt, std_bgmt)) | |||
print( | |||
'total training time with all hyper-param choices: {:.2f}s'.format( | |||
tt_total)) | |||
str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time) | |||
str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt) | |||
str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total) | |||
# # save results to file | |||
# np.savetxt(results_name_pre + 'average_train_scores.dt', | |||
#            average_train_scores)
# np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores) | |||
# np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores) | |||
# np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores) | |||
# np.save(results_name_pre + 'best_params_index', best_params_index) | |||
# np.save(results_name_pre + 'best_params_pre.dt', best_params_out) | |||
# np.save(results_name_pre + 'best_params_in.dt', best_params_in) | |||
# np.save(results_name_pre + 'final_confidence.dt', final_confidence) | |||
# np.save(results_name_pre + 'train_performance.dt', train_performance) | |||
# np.save(results_name_pre + 'train_std.dt', train_std) | |||
# np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time) | |||
# np.save(results_name_pre + 'average_gram_matrix_time.dt', | |||
# average_gram_matrix_time) | |||
# np.save(results_name_pre + 'std_gram_matrix_time.dt',
# std_gram_matrix_time) | |||
# np.save(results_name_pre + 'best_gram_matrix_time.dt', | |||
# best_gram_matrix_time) | |||
# print out as table. | |||
from collections import OrderedDict | |||
from tabulate import tabulate | |||
table_dict = {}
if model_type == 'regression':
for param_in in param_list:
param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
else:
for param_in in param_list:
param_in['C'] = '{:.2e}'.format(param_in['C']) | |||
table_dict['params'] = [{**param_out, **param_in} | |||
for param_in in param_list for param_out in param_list_pre_revised] | |||
table_dict['gram_matrix_time'] = [ | |||
'{:.2f}'.format(gram_matrix_time[index_out]) | |||
for param_in in param_list | |||
for index_out, _ in enumerate(param_list_pre_revised) | |||
] | |||
table_dict['valid_perf'] = [ | |||
'{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], | |||
std_val_scores[index_out][index_in]) | |||
for index_in, _ in enumerate(param_list) | |||
for index_out, _ in enumerate(param_list_pre_revised) | |||
] | |||
table_dict['test_perf'] = [ | |||
'{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in], | |||
std_perf_scores[index_out][index_in]) | |||
for index_in, _ in enumerate(param_list) | |||
for index_out, _ in enumerate(param_list_pre_revised) | |||
] | |||
table_dict['train_perf'] = [ | |||
'{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in], | |||
std_train_scores[index_out][index_in]) | |||
for index_in, _ in enumerate(param_list) | |||
for index_out, _ in enumerate(param_list_pre_revised) | |||
] | |||
keyorder = [ | |||
'params', 'train_perf', 'valid_perf', 'test_perf', | |||
'gram_matrix_time' | |||
] | |||
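# all columns above iterate param_out in the inner loop, matching the order in
# which 'params' was built, so each table row stays aligned.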
print() | |||
tb_print = tabulate( | |||
OrderedDict( | |||
sorted(table_dict.items(), | |||
key=lambda i: keyorder.index(i[0]))), | |||
headers='keys') | |||
print(tb_print) | |||
str_fw += 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print | |||
# open file to save all results for this dataset. | |||
if not os.path.exists(results_dir): | |||
os.makedirs(results_dir) | |||
with open(results_dir + '/' + ds_name + '.txt', 'w') as fresults: | |||
fresults.write(str_fw) | |||
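# Example invocation (a sketch, not part of this module; the grid keyword
# names are assumptions based on how param_list_pre_revised/param_list are
# used above):
#
#     from pygraph.kernels.spKernel import spkernel  # hypothetical import path
#     model_selection_for_precomputed_kernel(
#         '../datasets/acyclic/dataset_bps.ds', spkernel,
#         param_grid_precomputed={'edge_weight': [None]},
#         param_grid={'alpha': np.logspace(-10, 10, num=41, base=10)},
#         model_type='regression', ds_name='Acyclic', n_jobs=8)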
def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, trial): # Test set level | |||
# Arrays to store scores | |||
train_pref = np.zeros((len(param_list_pre_revised), | |||
len(param_list))) | |||
val_pref = np.zeros((len(param_list_pre_revised), | |||
len(param_list))) | |||
test_pref = np.zeros((len(param_list_pre_revised), | |||
len(param_list))) | |||
# loop for each outer param tuple | |||
for index_out, params_out in enumerate(param_list_pre_revised): | |||
# split gram matrix and y to app and test sets. | |||
X_app, X_test, y_app, y_test = train_test_split( | |||
gram_matrices[index_out], y, test_size=0.1) | |||
# recover the positions of the app samples in the original gram matrix by
# matching y values (note: this assumes target values are unique; duplicated
# targets would all map to the first occurrence).
split_index_app = [y.index(y_i) for y_i in y_app if y_i in y]
# split_index_test = [y.index(y_i) for y_i in y_test if y_i in y]
X_app = X_app[:, split_index_app] | |||
X_test = X_test[:, split_index_app] | |||
y_app = np.array(y_app) | |||
y_test = np.array(y_test) | |||
# loop for each inner param tuple | |||
for index_in, params_in in enumerate(param_list): | |||
inner_cv = KFold( | |||
n_splits=10, shuffle=True, random_state=trial) | |||
current_train_perf = [] | |||
current_valid_perf = [] | |||
current_test_perf = [] | |||
# For regression use the Kernel Ridge method | |||
try: | |||
if model_type == 'regression': | |||
KR = KernelRidge(kernel='precomputed', **params_in) | |||
# loop for each split on validation set level | |||
# validation set level | |||
for train_index, valid_index in inner_cv.split( | |||
X_app): | |||
KR.fit(X_app[train_index, :][:, train_index], | |||
y_app[train_index]) | |||
# predict on the train, validation and test set | |||
y_pred_train = KR.predict( | |||
X_app[train_index, :][:, train_index]) | |||
y_pred_valid = KR.predict( | |||
X_app[valid_index, :][:, train_index]) | |||
y_pred_test = KR.predict( | |||
X_test[:, train_index]) | |||
# root mean squared errors | |||
current_train_perf.append( | |||
np.sqrt( | |||
mean_squared_error( | |||
y_app[train_index], y_pred_train))) | |||
current_valid_perf.append( | |||
np.sqrt( | |||
mean_squared_error( | |||
y_app[valid_index], y_pred_valid))) | |||
current_test_perf.append( | |||
np.sqrt( | |||
mean_squared_error( | |||
y_test, y_pred_test))) | |||
# For classification use SVM
else: | |||
KR = SVC(kernel='precomputed', **params_in) | |||
# loop for each split on validation set level | |||
# validation set level | |||
for train_index, valid_index in inner_cv.split( | |||
X_app): | |||
KR.fit(X_app[train_index, :][:, train_index], | |||
y_app[train_index]) | |||
# predict on the train, validation and test set | |||
y_pred_train = KR.predict( | |||
X_app[train_index, :][:, train_index]) | |||
y_pred_valid = KR.predict( | |||
X_app[valid_index, :][:, train_index]) | |||
y_pred_test = KR.predict( | |||
X_test[:, train_index]) | |||
# classification accuracies
current_train_perf.append( | |||
accuracy_score(y_app[train_index], | |||
y_pred_train)) | |||
current_valid_perf.append( | |||
accuracy_score(y_app[valid_index], | |||
y_pred_valid)) | |||
current_test_perf.append( | |||
accuracy_score(y_test, y_pred_test)) | |||
except ValueError: | |||
print(sys.exc_info()[0]) | |||
print(params_out, params_in) | |||
# average performance on inner splits | |||
train_pref[index_out][index_in] = np.mean( | |||
current_train_perf) | |||
val_pref[index_out][index_in] = np.mean( | |||
current_valid_perf) | |||
test_pref[index_out][index_in] = np.mean( | |||
current_test_perf) | |||
return train_pref, val_pref, test_pref |
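# The X_app[train_index, :][:, train_index] pattern above slices the square
# train-vs-train block out of a precomputed gram matrix; prediction uses the
# rectangular rows-vs-train block. A standalone sketch:
#
#     import numpy as np
#     K = np.arange(16.).reshape(4, 4)   # toy gram matrix for 4 samples
#     train, test = [0, 2], [1, 3]
#     K_fit = K[train, :][:, train]      # (2, 2) block passed to fit()
#     K_pred = K[test, :][:, train]      # (2, 2) block passed to predict()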
# In utils, floydTransformation (which wraps nx.floyd_warshall_numpy) is fixed
# to add edges between original node ids rather than positional indices:
def floydTransformation(G, edge_weight=None):
spMatrix = nx.floyd_warshall_numpy(G, weight=edge_weight) | |||
S = nx.Graph() | |||
S.add_nodes_from(G.nodes(data=True)) | |||
ns = list(G.nodes())  # original node ids; spMatrix rows/cols are positional
for i in range(0, G.number_of_nodes()): | |||
for j in range(i + 1, G.number_of_nodes()): | |||
if spMatrix[i, j] != np.inf: | |||
S.add_edge(ns[i], ns[j], cost=spMatrix[i, j]) | |||
return S | |||
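# Quick check of the fix (a sketch): with string node ids, the shortest-path
# graph must connect the original ids, not positions 0..n-1.
#
#     import networkx as nx
#     G = nx.Graph()
#     G.add_edge('a', 'b')
#     G.add_edge('b', 'c')
#     S = floydTransformation(G)
#     # S now has edges ('a','b') and ('b','c') with cost 1.0 and
#     # ('a','c') with cost 2.0, keyed by the original node ids.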