2. Modify model_selection_precomputed so that all results are written into memory and then to a file in the last section of the code, in case the I/O takes too much time on systems with separated CPU and disk. 3. Correct the utils.floyd_warshall_numpy function. DO NOT use the last version. v0.1
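Both changes can be illustrated briefly. For point 2, the idea is to accumulate the whole report in an in-memory string during the run and flush it to disk once at the end, instead of issuing a write inside every loop iteration. A minimal sketch of the pattern (not the repository code; the file name is illustrative):

```python
str_fw = ''  # in-memory buffer for the whole report

for trial in range(30):
    # cheap string append inside the loop; no disk I/O here
    str_fw += 'results of trial %d ...\n' % trial

# a single write in the last section of the code
with open('results.txt', 'w') as f:
    f.write(str_fw)
```

For point 3, the corrected function itself is not part of this diff, so the sketch below is only an assumption of what a dense numpy Floyd-Warshall typically looks like; the signature and the use of nx.to_numpy_array are illustrative.

```python
import networkx as nx
import numpy as np

def floyd_warshall_numpy(G, weight='weight'):
    # assumed interface: all-pairs shortest-path distances of G as a dense matrix
    A = nx.to_numpy_array(G, weight=weight, nonedge=np.inf)
    np.fill_diagonal(A, 0)  # a node is at distance 0 from itself
    for k in range(A.shape[0]):
        # relax every pair (i, j) through intermediate node k at once
        A = np.minimum(A, A[:, k, None] + A[None, k, :])
    return A
```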
@@ -3,12 +3,15 @@ A python package for graph kernels.
## Requirements
* numpy - 1.13.3
* scipy - 1.0.0
* matplotlib - 2.1.0
* networkx - 2.0
* sklearn - 0.19.1
* tabulate - 0.8.2
numpy==1.14.5
scipy==1.1.0
matplotlib==2.2.2
networkx==2.1
scikit-learn==0.19.1
tabulate==0.8.2
tqdm==4.23.4
control==0.7.0 (for generalized random walk kernels only)
slycot===0.3.2.dev-5263ada (for generalized random walk kernels only; requires a Fortran compiler, e.g. gfortran)
## Results with minimal test RMSE for each kernel on dataset Acyclic
@@ -28,7 +31,7 @@ For prediction we randomly divide the data in train and test subset, where 90\%
| WL shortest path | 28.74±0.60 | 38.20±0.62 | 39.02±6.09 | 'height': 10.0, 'alpha': '1.00' | 146.83"/80.63"±45.04" |
| WL edge | 30.21±0.64 | 36.53±1.02 | 38.42±6.42 | 'height': 5.0, 'alpha': '6.31e-01' | 5.24"/5.15"±2.83" |
| Treelet | 7.33±0.64 | 13.86±0.80 | 15.38±3.56 | 'alpha': '1.12e+01' | 0.48" |
| Path up to d | 5.76±0.27 | 9.89±0.87 | 10.21±4.16 | 'depth': 2.0, 'k_func': 'MinMax', 'alpha': '0.1' | 0.56"/1.16"±0.75" |
| Cyclic pattern | | | | | |
| Walk up to n | 20.88±0.74 | 23.34±1.11 | 24.46±6.57 | 'n': 2.0, 'alpha': '1.00e-03' | 0.56"/331.70"±753.44" |
@@ -3,106 +3,66 @@ dslist = [
        'name': 'Acyclic',
        'dataset': '../datasets/acyclic/dataset_bps.ds',
        'task': 'regression'
    }, # node_labeled
    {
        'name': 'COIL-DEL',
        'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'
    }, # edge_labeled
    }, # node symb
    # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
    {
        'name': 'PAH',
        'dataset': '../datasets/PAH/dataset.ds',
    }, # unlabeled
    {
        'name': 'Mutagenicity',
        'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'
    }, # fully_labeled
    {
        'name': 'MAO',
        'dataset': '../datasets/MAO/dataset.ds',
    },
    }, # node/edge symb
    {
        'name': 'MUTAG',
        'dataset': '../datasets/MUTAG/MUTAG.mat',
        'extra_params': {
            'am_sp_al_nl_el': [0, 0, 3, 1, 2]
        }
    },
    }, # node/edge symb
    {
        'name': 'Alkane',
        'dataset': '../datasets/Alkane/dataset.ds',
        'task': 'regression',
        'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',
    },
    {
        'name': 'BZR',
        'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'
    },
    }, # contains single node graph, node symb
    # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
    # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
    {
        'name': 'COX2',
        'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'
    },
        'name': 'Mutagenicity',
        'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'
    }, # node/edge symb
    {
        'name': 'ENZYMES',
        'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'
    },
    {
        'name': 'DHFR',
        'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'
    },
    {
        'name': 'SYNTHETIC',
        'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'
    },
    {
        'name': 'MSRC9',
        'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'
    },
    {
        'name': 'MSRC21',
        'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'
    },
    }, # node symb/nsymb
    # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
    {
        'name': 'FIRSTMM_DB',
        'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'
    },
    {
        'name': 'PROTEINS',
        'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'
    },
    {
        'name': 'PROTEINS_full',
        'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'
        'name': 'Letter-med',
        'dataset': '../datasets/Letter-med/Letter-med_A.txt'
    },
    # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
    # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
    # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
    # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
    # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb, edge nsymb
    # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
    # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
    {
        'name': 'D&D',
        'dataset': '../datasets/D&D/DD.mat',
        'extra_params': {
            'am_sp_al_nl_el': [0, 1, 2, 1, -1]
        }
    },
    {
        'name': 'AIDS',
        'dataset': '../datasets/AIDS/AIDS_A.txt'
    },
    {
        'name': 'NCI1',
        'dataset': '../datasets/NCI1/NCI1.mat',
        'extra_params': {
            'am_sp_al_nl_el': [1, 1, 2, 0, -1]
        }
    },
    {
        'name': 'NCI109',
        'dataset': '../datasets/NCI109/NCI109.mat',
        'extra_params': {
            'am_sp_al_nl_el': [1, 1, 2, 0, -1]
        }
    },
    {
        'name': 'NCI-HIV',
        'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
        'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',
    },
    }, # node symb
    # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
    # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
    # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
    # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
    # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
    # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
    # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb
    # # not working below
    # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
@@ -110,3 +70,116 @@ dslist = [
    # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
    # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
# dslist = [
#     {
#         'name': 'Acyclic',
#         'dataset': '../datasets/acyclic/dataset_bps.ds',
#         'task': 'regression'
#     }, # node_labeled
#     {
#         'name': 'COIL-DEL',
#         'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'
#     }, # edge_labeled
#     {
#         'name': 'PAH',
#         'dataset': '../datasets/PAH/dataset.ds',
#     }, # unlabeled
#     {
#         'name': 'Mutagenicity',
#         'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'
#     }, # fully_labeled
#     {
#         'name': 'MAO',
#         'dataset': '../datasets/MAO/dataset.ds',
#     },
#     {
#         'name': 'MUTAG',
#         'dataset': '../datasets/MUTAG/MUTAG.mat',
#         'extra_params': {
#             'am_sp_al_nl_el': [0, 0, 3, 1, 2]
#         }
#     },
#     {
#         'name': 'Alkane',
#         'dataset': '../datasets/Alkane/dataset.ds',
#         'task': 'regression',
#         'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',
#     },
#     {
#         'name': 'BZR',
#         'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'
#     },
#     {
#         'name': 'COX2',
#         'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'
#     },
#     {
#         'name': 'ENZYMES',
#         'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'
#     },
#     {
#         'name': 'DHFR',
#         'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'
#     },
#     {
#         'name': 'SYNTHETIC',
#         'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'
#     },
#     {
#         'name': 'MSRC9',
#         'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'
#     },
#     {
#         'name': 'MSRC21',
#         'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'
#     },
#     {
#         'name': 'FIRSTMM_DB',
#         'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'
#     },
#     {
#         'name': 'PROTEINS',
#         'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'
#     },
#     {
#         'name': 'PROTEINS_full',
#         'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'
#     },
#     {
#         'name': 'D&D',
#         'dataset': '../datasets/D&D/DD.mat',
#         'extra_params': {
#             'am_sp_al_nl_el': [0, 1, 2, 1, -1]
#         }
#     },
#     {
#         'name': 'AIDS',
#         'dataset': '../datasets/AIDS/AIDS_A.txt'
#     },
#     {
#         'name': 'NCI1',
#         'dataset': '../datasets/NCI1/NCI1.mat',
#         'extra_params': {
#             'am_sp_al_nl_el': [1, 1, 2, 0, -1]
#         }
#     },
#     {
#         'name': 'NCI109',
#         'dataset': '../datasets/NCI109/NCI109.mat',
#         'extra_params': {
#             'am_sp_al_nl_el': [1, 1, 2, 0, -1]
#         }
#     },
#     {
#         'name': 'NCI-HIV',
#         'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
#         'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',
#     },
#     # # not working below
#     # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
#     # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
#     # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
#     # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
# ]
@@ -1,56 +1,157 @@
import functools
from libs import *
from pygraph.kernels.spKernel import spkernel
from pygraph.utils.kernels import deltakernel, kernelsum
from sklearn.metrics.pairwise import rbf_kernel
dslist = [
    # {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node_labeled
    # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge_labeled
# dslist = [
#     {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node symb
#     # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
#     {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',}, # unlabeled
    {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # fully_labeled
    # {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',},
#     {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',}, # node/edge symb
#     {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
    # 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}},
    # {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
    # 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',},
    # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'},
    # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'},
    {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
    # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'},
    # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'},
    # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'},
    # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'},
    # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'},
    # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'},
    # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'},
#     'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
#     {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
#     'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',}, # contains single node graph, node symb
#     # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
#     # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
#     {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # node/edge symb
#     {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, # node symb/nsymb
#     # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
#     {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
#     # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
#     # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
#     # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
#     # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
#     # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb, edge nsymb
#     # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
#     # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
#     {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
    # 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}},
    # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'},
    # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
    # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}},
    # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
    # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}},
    # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
    # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',},
    # # not working below
    # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
    # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
    # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
    # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
#     'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb
#     # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
#     # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
#     # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
#     # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
#     # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
#     # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
#     # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb
#     # # not working below
#     # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
#     # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
#     # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
#     # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
# ]
import ast
ds = ast.literal_eval(sys.argv[1])
estimator = spkernel
param_grid_precomputed = {}
param_grid = [{'C': np.logspace(-10, 10, num = 41, base = 10)},
              {'alpha': np.logspace(-10, 10, num = 41, base = 10)}]
for ds in dslist:
    print()
    print(ds['name'])
    model_selection_for_precomputed_kernel(
        ds['dataset'], estimator, param_grid_precomputed,
        (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]),
        (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30,
        datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
        extra_params=(ds['extra_params'] if 'extra_params' in ds else None))
    print()
mixkernel = functools.partial(kernelsum, deltakernel, rbf_kernel)
param_grid_precomputed = {
    'node_kernels': [{
        'symb': deltakernel,
        'nsymb': rbf_kernel,
        'mix': mixkernel
    }]
}
param_grid = [{
    'C': np.logspace(-10, 10, num=41, base=10)
}, {
    'alpha': np.logspace(-10, 10, num=41, base=10)
}]
print()
print(ds['name'])
model_selection_for_precomputed_kernel(
    ds['dataset'],
    estimator,
    param_grid_precomputed,
    (param_grid[1]
     if ('task' in ds and ds['task'] == 'regression') else param_grid[0]),
    (ds['task'] if 'task' in ds else 'classification'),
    NUM_TRIALS=30,
    datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
    extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
    ds_name=ds['name'])
# %lprun -f spkernel \
# model_selection_for_precomputed_kernel( \
#     ds['dataset'], estimator, param_grid_precomputed, \
#     (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \
#     (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30, \
#     datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), \
#     extra_params=(ds['extra_params'] if 'extra_params' in ds else None))
print()
# import functools
# from libs import *
# from pygraph.kernels.spKernel import spkernel
# from pygraph.utils.kernels import deltakernel, kernelsum
# from sklearn.metrics.pairwise import rbf_kernel
# dslist = [
#     {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node symb
#     # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
#     # {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',}, # unlabeled
#     # {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',}, # node/edge symb
#     # {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
#     # 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
#     # {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
#     # 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',}, # contains single node graph, node symb
#     # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
#     # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
#     # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # node/edge symb
#     # {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, # node symb/nsymb
#     # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
#     # {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
#     # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
#     # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
#     # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
#     # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
#     # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb, edge nsymb
#     # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
#     # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
#     # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
#     # 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb
#     # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
#     # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
#     # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
#     # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
#     # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
#     # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
#     # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb
#     # # not working below
#     # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
#     # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
#     # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
#     # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
# ]
# estimator = spkernel
# mixkernel = functools.partial(kernelsum, deltakernel, rbf_kernel)
# param_grid_precomputed = {'node_kernels': [{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]}
# param_grid = [{'C': np.logspace(-10, 10, num = 41, base = 10)},
#               {'alpha': np.logspace(-10, 10, num = 41, base = 10)}]
# for ds in dslist:
#     print()
#     print(ds['name'])
#     model_selection_for_precomputed_kernel(
#         ds['dataset'], estimator, param_grid_precomputed,
#         (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]),
#         (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30,
#         datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
#         extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
#         ds_name=ds['name'])
#     # %lprun -f spkernel \
#     # model_selection_for_precomputed_kernel( \
#     #     ds['dataset'], estimator, param_grid_precomputed, \
#     #     (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \
#     #     (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30, \
#     #     datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), \
#     #     extra_params=(ds['extra_params'] if 'extra_params' in ds else None))
#     print()
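Two details of the rewritten script are worth spelling out. First, it no longer loops over dslist itself: it evaluates one dataset dict passed on the command line through ast.literal_eval(sys.argv[1]), so each dataset can be launched as its own process. A hypothetical invocation (the launcher, the script name and the path are illustrative):

```python
# hypothetical launcher; 'run_spkernel.py' and the dataset path are illustrative
import subprocess

ds = {'name': 'Acyclic',
      'dataset': '../datasets/acyclic/dataset_bps.ds',
      'task': 'regression'}
# repr(ds) yields a dict literal that ast.literal_eval can parse back
subprocess.run(['python3', 'run_spkernel.py', repr(ds)], check=True)
```

Second, the node_kernels dict bundles one kernel per node-label type: 'symb' compares symbolic labels, 'nsymb' compares vector attributes, and 'mix' handles nodes carrying both. The real deltakernel and kernelsum live in pygraph.utils.kernels and are not shown in this diff; the stand-ins below are assumptions inferred from how the kernels are called in spKernel.py.

```python
def deltakernel(x, y):
    # assumed: Dirac kernel on symbolic labels
    return 1.0 if x == y else 0.0

def kernelsum(k1, k2, l1, l2, a1, a2):
    # assumed: combine a label kernel and an attribute kernel by summing them,
    # matching functools.partial(kernelsum, deltakernel, rbf_kernel) above
    return k1(l1, l2) + k2(a1, a2)
```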
@@ -1 +0,0 @@
ljia@ljia-Precision-7520.4716:1530265749
@@ -9,6 +9,9 @@ sys.path.insert(0, "../")
from tqdm import tqdm
import time
from itertools import combinations_with_replacement, product
from functools import partial
from joblib import Parallel, delayed
from multiprocessing import Pool
import networkx as nx
import numpy as np
@@ -17,7 +20,11 @@ from pygraph.utils.utils import getSPGraph
from pygraph.utils.graphdataset import get_dataset_attributes
def spkernel(*args, node_label='atom', edge_weight=None, node_kernels=None):
def spkernel(*args,
             node_label='atom',
             edge_weight=None,
             node_kernels=None,
             n_jobs=None):
    """Calculate shortest-path kernels between graphs.
    Parameters
@@ -70,180 +77,344 @@ def spkernel(*args, node_label='atom', edge_weight=None, node_kernels=None):
    if len(Gn) != len_gn:
        print('\n %d graphs are removed as they don\'t contain edges.\n' %
              (len_gn - len(Gn)))
    start_time = time.time()
    pool = Pool(n_jobs)
    # get shortest path graphs of Gn
    Gn = [
        getSPGraph(G, edge_weight=edge_weight)
        for G in tqdm(Gn, desc='getting sp graphs', file=sys.stdout)
    ]
    getsp_partial = partial(wrap_getSPGraph, Gn, edge_weight)
    result_sp = pool.map(getsp_partial, range(0, len(Gn)))
    for i in result_sp:
        Gn[i[0]] = i[1]
    # Gn = [
    #     getSPGraph(G, edge_weight=edge_weight)
    #     for G in tqdm(Gn, desc='getting sp graphs', file=sys.stdout)
    # ]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    pbar = tqdm(
        total=((len(Gn) + 1) * len(Gn) / 2),
        desc='calculating kernels',
        file=sys.stdout)
    do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
    itr = combinations_with_replacement(range(0, len(Gn)), 2)
    # chunksize = 2000 # int(len(list(itr)) / n_jobs)
    # for i, j, kernel in tqdm(pool.imap_unordered(do_partial, itr, chunksize)):
    #     Kmatrix[i][j] = kernel
    #     Kmatrix[j][i] = kernel
    result_perf = pool.map(do_partial, itr)
    pool.close()
    pool.join()
    # result_perf = Parallel(
    #     n_jobs=n_jobs, verbose=10)(
    #         delayed(do_partial)(ij)
    #         for ij in combinations_with_replacement(range(0, len(Gn)), 2))
    # result_perf = [
    #     do_partial(ij)
    #     for ij in combinations_with_replacement(range(0, len(Gn)), 2)
    # ]
    for i in result_perf:
        Kmatrix[i[0]][i[1]] = i[2]
        Kmatrix[i[1]][i[0]] = i[2]
    # pbar = tqdm(
    #     total=((len(Gn) + 1) * len(Gn) / 2),
    #     desc='calculating kernels',
    #     file=sys.stdout)
    # if ds_attrs['node_labeled']:
    #     # node symb and non-synb labeled
    #     if ds_attrs['node_attr_dim'] > 0:
    #         if ds_attrs['is_directed']:
    #             for i, j in combinations_with_replacement(
    #                     range(0, len(Gn)), 2):
    #                 for e1, e2 in product(
    #                         Gn[i].edges(data=True), Gn[j].edges(data=True)):
    #                     if e1[2]['cost'] == e2[2]['cost']:
    #                         kn = node_kernels['mix']
    #                         try:
    #                             n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
    #                                 i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
    #                                     j].nodes[e2[1]]
    #                             kn1 = kn(n11[node_label], n21[node_label], [
    #                                 n11['attributes']
    #                             ], [n21['attributes']]) * kn(
    #                                 n12[node_label], n22[node_label],
    #                                 [n12['attributes']], [n22['attributes']])
    #                             Kmatrix[i][j] += kn1
    #                         except KeyError:  # missing labels or attributes
    #                             pass
    #                     Kmatrix[j][i] = Kmatrix[i][j]
    #                     pbar.update(1)
    #         else:
    #             for i, j in combinations_with_replacement(
    #                     range(0, len(Gn)), 2):
    #                 for e1, e2 in product(
    #                         Gn[i].edges(data=True), Gn[j].edges(data=True)):
    #                     if e1[2]['cost'] == e2[2]['cost']:
    #                         kn = node_kernels['mix']
    #                         try:
    #                             # each edge walk is counted twice, starting from both its extreme nodes.
    #                             n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
    #                                 i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
    #                                     j].nodes[e2[1]]
    #                             kn1 = kn(n11[node_label], n21[node_label], [
    #                                 n11['attributes']
    #                             ], [n21['attributes']]) * kn(
    #                                 n12[node_label], n22[node_label],
    #                                 [n12['attributes']], [n22['attributes']])
    #                             kn2 = kn(n11[node_label], n22[node_label], [
    #                                 n11['attributes']
    #                             ], [n22['attributes']]) * kn(
    #                                 n12[node_label], n21[node_label],
    #                                 [n12['attributes']], [n21['attributes']])
    #                             Kmatrix[i][j] += kn1 + kn2
    #                         except KeyError:  # missing labels or attributes
    #                             pass
    #                     Kmatrix[j][i] = Kmatrix[i][j]
    #                     pbar.update(1)
    #     # node symb labeled
    #     else:
    #         if ds_attrs['is_directed']:
    #             for i, j in combinations_with_replacement(
    #                     range(0, len(Gn)), 2):
    #                 for e1, e2 in product(
    #                         Gn[i].edges(data=True), Gn[j].edges(data=True)):
    #                     if e1[2]['cost'] == e2[2]['cost']:
    #                         kn = node_kernels['symb']
    #                         try:
    #                             n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
    #                                 i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
    #                                     j].nodes[e2[1]]
    #                             kn1 = kn(n11[node_label],
    #                                      n21[node_label]) * kn(
    #                                 n12[node_label], n22[node_label])
    #                             Kmatrix[i][j] += kn1
    #                         except KeyError:  # missing labels
    #                             pass
    #                     Kmatrix[j][i] = Kmatrix[i][j]
    #                     pbar.update(1)
    #         else:
    #             for i, j in combinations_with_replacement(
    #                     range(0, len(Gn)), 2):
    #                 for e1, e2 in product(
    #                         Gn[i].edges(data=True), Gn[j].edges(data=True)):
    #                     if e1[2]['cost'] == e2[2]['cost']:
    #                         kn = node_kernels['symb']
    #                         try:
    #                             # each edge walk is counted twice, starting from both its extreme nodes.
    #                             n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
    #                                 i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
    #                                     j].nodes[e2[1]]
    #                             kn1 = kn(n11[node_label],
    #                                      n21[node_label]) * kn(
    #                                 n12[node_label], n22[node_label])
    #                             kn2 = kn(n11[node_label],
    #                                      n22[node_label]) * kn(
    #                                 n12[node_label], n21[node_label])
    #                             Kmatrix[i][j] += kn1 + kn2
    #                         except KeyError:  # missing labels
    #                             pass
    #                     Kmatrix[j][i] = Kmatrix[i][j]
    #                     pbar.update(1)
    # else:
    #     # node non-synb labeled
    #     if ds_attrs['node_attr_dim'] > 0:
    #         if ds_attrs['is_directed']:
    #             for i, j in combinations_with_replacement(
    #                     range(0, len(Gn)), 2):
    #                 for e1, e2 in product(
    #                         Gn[i].edges(data=True), Gn[j].edges(data=True)):
    #                     if e1[2]['cost'] == e2[2]['cost']:
    #                         kn = node_kernels['nsymb']
    #                         try:
    #                             # each edge walk is counted twice, starting from both its extreme nodes.
    #                             n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
    #                                 i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
    #                                     j].nodes[e2[1]]
    #                             kn1 = kn([n11['attributes']],
    #                                      [n21['attributes']]) * kn(
    #                                 [n12['attributes']],
    #                                 [n22['attributes']])
    #                             Kmatrix[i][j] += kn1
    #                         except KeyError:  # missing attributes
    #                             pass
    #                     Kmatrix[j][i] = Kmatrix[i][j]
    #                     pbar.update(1)
    #         else:
    #             for i, j in combinations_with_replacement(
    #                     range(0, len(Gn)), 2):
    #                 for e1, e2 in product(
    #                         Gn[i].edges(data=True), Gn[j].edges(data=True)):
    #                     if e1[2]['cost'] == e2[2]['cost']:
    #                         kn = node_kernels['nsymb']
    #                         try:
    #                             # each edge walk is counted twice, starting from both its extreme nodes.
    #                             n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
    #                                 i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
    #                                     j].nodes[e2[1]]
    #                             kn1 = kn([n11['attributes']],
    #                                      [n21['attributes']]) * kn(
    #                                 [n12['attributes']],
    #                                 [n22['attributes']])
    #                             kn2 = kn([n11['attributes']],
    #                                      [n22['attributes']]) * kn(
    #                                 [n12['attributes']],
    #                                 [n21['attributes']])
    #                             Kmatrix[i][j] += kn1 + kn2
    #                         except KeyError:  # missing attributes
    #                             pass
    #                     Kmatrix[j][i] = Kmatrix[i][j]
    #                     pbar.update(1)
    #     # node unlabeled
    #     else:
    #         for i, j in combinations_with_replacement(range(0, len(Gn)), 2):
    #             for e1, e2 in product(
    #                     Gn[i].edges(data=True), Gn[j].edges(data=True)):
    #                 if e1[2]['cost'] == e2[2]['cost']:
    #                     Kmatrix[i][j] += 1
    #             Kmatrix[j][i] = Kmatrix[i][j]
    #             pbar.update(1)
    run_time = time.time() - start_time
    print(
        "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
        % (len(Gn), run_time))
    return Kmatrix, run_time, idx


def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij):
    i = ij[0]
    j = ij[1]
    Kmatrix = 0
    if ds_attrs['node_labeled']:
        # node symb and non-symb labeled
        if ds_attrs['node_attr_dim'] > 0:
            if ds_attrs['is_directed']:
                for i, j in combinations_with_replacement(
                        range(0, len(Gn)), 2):
                    for e1, e2 in product(
                            Gn[i].edges(data=True), Gn[j].edges(data=True)):
                        if e1[2]['cost'] == e2[2]['cost']:
                            kn = node_kernels['mix']
                            try:
                                n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
                                    i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
                                        j].nodes[e2[1]]
                                kn1 = kn(n11[node_label], n21[node_label], [
                                    n11['attributes']
                                ], [n21['attributes']]) * kn(
                for e1, e2 in product(
                        Gn[i].edges(data=True), Gn[j].edges(data=True)):
                    if e1[2]['cost'] == e2[2]['cost']:
                        kn = node_kernels['mix']
                        try:
                            n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
                                i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
                                    j].nodes[e2[1]]
                            kn1 = kn(
                                n11[node_label], n21[node_label],
                                [n11['attributes']], [n21['attributes']]) * kn(
                                    n12[node_label], n22[node_label],
                                    [n12['attributes']], [n22['attributes']])
                                Kmatrix[i][j] += kn1
                            except KeyError:  # missing labels or attributes
                                pass
                        Kmatrix[j][i] = Kmatrix[i][j]
                        pbar.update(1)
                            Kmatrix += kn1
                        except KeyError:  # missing labels or attributes
                            pass
            else:
                for i, j in combinations_with_replacement(
                        range(0, len(Gn)), 2):
                    for e1, e2 in product(
                            Gn[i].edges(data=True), Gn[j].edges(data=True)):
                        if e1[2]['cost'] == e2[2]['cost']:
                            kn = node_kernels['mix']
                            try:
                                # each edge walk is counted twice, starting from both its extreme nodes.
                                n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
                                    i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
                                        j].nodes[e2[1]]
                                kn1 = kn(n11[node_label], n21[node_label], [
                                    n11['attributes']
                                ], [n21['attributes']]) * kn(
                for e1, e2 in product(
                        Gn[i].edges(data=True), Gn[j].edges(data=True)):
                    if e1[2]['cost'] == e2[2]['cost']:
                        kn = node_kernels['mix']
                        try:
                            # each edge walk is counted twice, starting from both its extreme nodes.
                            n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
                                i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
                                    j].nodes[e2[1]]
                            kn1 = kn(
                                n11[node_label], n21[node_label],
                                [n11['attributes']], [n21['attributes']]) * kn(
                                    n12[node_label], n22[node_label],
                                    [n12['attributes']], [n22['attributes']])
                                kn2 = kn(n11[node_label], n22[node_label], [
                                    n11['attributes']
                                ], [n22['attributes']]) * kn(
                            kn2 = kn(
                                n11[node_label], n22[node_label],
                                [n11['attributes']], [n22['attributes']]) * kn(
                                    n12[node_label], n21[node_label],
                                    [n12['attributes']], [n21['attributes']])
                                Kmatrix[i][j] += kn1 + kn2
                            except KeyError:  # missing labels or attributes
                                pass
                        Kmatrix[j][i] = Kmatrix[i][j]
                        pbar.update(1)
                            Kmatrix += kn1 + kn2
                        except KeyError:  # missing labels or attributes
                            pass
        # node symb labeled
        else:
            if ds_attrs['is_directed']:
                for i, j in combinations_with_replacement(
                        range(0, len(Gn)), 2):
                    for e1, e2 in product(
                            Gn[i].edges(data=True), Gn[j].edges(data=True)):
                        if e1[2]['cost'] == e2[2]['cost']:
                            kn = node_kernels['symb']
                            try:
                                n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
                                    i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
                                        j].nodes[e2[1]]
                                kn1 = kn(n11[node_label],
                                         n21[node_label]) * kn(
                                             n12[node_label], n22[node_label])
                                Kmatrix[i][j] += kn1
                            except KeyError:  # missing labels
                                pass
                        Kmatrix[j][i] = Kmatrix[i][j]
                        pbar.update(1)
                for e1, e2 in product(
                        Gn[i].edges(data=True), Gn[j].edges(data=True)):
                    if e1[2]['cost'] == e2[2]['cost']:
                        kn = node_kernels['symb']
                        try:
                            n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
                                i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
                                    j].nodes[e2[1]]
                            kn1 = kn(n11[node_label], n21[node_label]) * kn(
                                n12[node_label], n22[node_label])
                            Kmatrix += kn1
                        except KeyError:  # missing labels
                            pass
            else:
                for i, j in combinations_with_replacement(
                        range(0, len(Gn)), 2):
                    for e1, e2 in product(
                            Gn[i].edges(data=True), Gn[j].edges(data=True)):
                        if e1[2]['cost'] == e2[2]['cost']:
                            kn = node_kernels['symb']
                            try:
                                # each edge walk is counted twice, starting from both its extreme nodes.
                                n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
                                    i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
                                        j].nodes[e2[1]]
                                kn1 = kn(n11[node_label],
                                         n21[node_label]) * kn(
                                             n12[node_label], n22[node_label])
                                kn2 = kn(n11[node_label],
                                         n22[node_label]) * kn(
                                             n12[node_label], n21[node_label])
                                Kmatrix[i][j] += kn1 + kn2
                            except KeyError:  # missing labels
                                pass
                        Kmatrix[j][i] = Kmatrix[i][j]
                        pbar.update(1)
                for e1, e2 in product(
                        Gn[i].edges(data=True), Gn[j].edges(data=True)):
                    if e1[2]['cost'] == e2[2]['cost']:
                        kn = node_kernels['symb']
                        try:
                            # each edge walk is counted twice, starting from both its extreme nodes.
                            n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
                                i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
                                    j].nodes[e2[1]]
                            kn1 = kn(n11[node_label], n21[node_label]) * kn(
                                n12[node_label], n22[node_label])
                            kn2 = kn(n11[node_label], n22[node_label]) * kn(
                                n12[node_label], n21[node_label])
                            Kmatrix += kn1 + kn2
                        except KeyError:  # missing labels
                            pass
    else:
        # node non-symb labeled
        if ds_attrs['node_attr_dim'] > 0:
            if ds_attrs['is_directed']:
                for i, j in combinations_with_replacement(
                        range(0, len(Gn)), 2):
                    for e1, e2 in product(
                            Gn[i].edges(data=True), Gn[j].edges(data=True)):
                        if e1[2]['cost'] == e2[2]['cost']:
                            kn = node_kernels['nsymb']
                            try:
                                # each edge walk is counted twice, starting from both its extreme nodes.
                                n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
                                    i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
                                        j].nodes[e2[1]]
                                kn1 = kn([n11['attributes']],
                                         [n21['attributes']]) * kn(
                                             [n12['attributes']],
                                             [n22['attributes']])
                                Kmatrix[i][j] += kn1
                            except KeyError:  # missing attributes
                                pass
                        Kmatrix[j][i] = Kmatrix[i][j]
                        pbar.update(1)
                for e1, e2 in product(
                        Gn[i].edges(data=True), Gn[j].edges(data=True)):
                    if e1[2]['cost'] == e2[2]['cost']:
                        kn = node_kernels['nsymb']
                        try:
                            # each edge walk is counted twice, starting from both its extreme nodes.
                            n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
                                i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
                                    j].nodes[e2[1]]
                            kn1 = kn(
                                [n11['attributes']], [n21['attributes']]) * kn(
                                    [n12['attributes']], [n22['attributes']])
                            Kmatrix += kn1
                        except KeyError:  # missing attributes
                            pass
            else:
                for i, j in combinations_with_replacement(
                        range(0, len(Gn)), 2):
                    for e1, e2 in product(
                            Gn[i].edges(data=True), Gn[j].edges(data=True)):
                        if e1[2]['cost'] == e2[2]['cost']:
                            kn = node_kernels['nsymb']
                            try:
                                # each edge walk is counted twice, starting from both its extreme nodes.
                                n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
                                    i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
                                        j].nodes[e2[1]]
                                kn1 = kn([n11['attributes']],
                                         [n21['attributes']]) * kn(
                                             [n12['attributes']],
                                             [n22['attributes']])
                                kn2 = kn([n11['attributes']],
                                         [n22['attributes']]) * kn(
                                             [n12['attributes']],
                                             [n21['attributes']])
                                Kmatrix[i][j] += kn1 + kn2
                            except KeyError:  # missing attributes
                                pass
                        Kmatrix[j][i] = Kmatrix[i][j]
                        pbar.update(1)
        # node unlabeled
        else:
            for i, j in combinations_with_replacement(range(0, len(Gn)), 2):
                for e1, e2 in product(
                        Gn[i].edges(data=True), Gn[j].edges(data=True)):
                    if e1[2]['cost'] == e2[2]['cost']:
                        Kmatrix[i][j] += 1
                Kmatrix[j][i] = Kmatrix[i][j]
                pbar.update(1)
                        kn = node_kernels['nsymb']
                        try:
                            # each edge walk is counted twice, starting from both its extreme nodes.
                            n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
                                i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
                                    j].nodes[e2[1]]
                            kn1 = kn(
                                [n11['attributes']], [n21['attributes']]) * kn(
                                    [n12['attributes']], [n22['attributes']])
                            kn2 = kn(
                                [n11['attributes']], [n22['attributes']]) * kn(
                                    [n12['attributes']], [n21['attributes']])
                            Kmatrix += kn1 + kn2
                        except KeyError:  # missing attributes
                            pass
        # node unlabeled
        else:
            for e1, e2 in product(
                    Gn[i].edges(data=True), Gn[j].edges(data=True)):
                if e1[2]['cost'] == e2[2]['cost']:
                    Kmatrix += 1
    run_time = time.time() - start_time
    print(
        "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
        % (len(Gn), run_time))
    return i, j, Kmatrix
    return Kmatrix, run_time, idx


def wrap_getSPGraph(Gn, weight, i):
    return i, getSPGraph(Gn[i], edge_weight=weight)
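The parallelization above follows a standard multiprocessing pattern: bind the shared, read-only arguments with functools.partial, map a top-level worker over the pair indices with multiprocessing.Pool, and scatter the returned (i, j, value) triples into the symmetric kernel matrix. A self-contained sketch of the same pattern with a toy dot-product kernel (not the repository's shortest-path kernel):

```python
import numpy as np
from functools import partial
from itertools import combinations_with_replacement
from multiprocessing import Pool

def pair_kernel(data, ij):
    # worker must be a top-level function so it can be pickled
    i, j = ij
    return i, j, float(np.dot(data[i], data[j]))

if __name__ == '__main__':
    data = np.random.rand(5, 3)
    K = np.zeros((len(data), len(data)))
    with Pool(2) as pool:
        do_partial = partial(pair_kernel, data)  # bind the shared data
        itr = combinations_with_replacement(range(len(data)), 2)
        for i, j, k in pool.map(do_partial, itr):
            K[i][j] = k
            K[j][i] = k  # fill the symmetric half
```

The same pickling constraint explains wrap_getSPGraph above: Pool workers must be module-level functions, so the call to getSPGraph is wrapped rather than passed as a lambda.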
@@ -1,11 +1,32 @@
import numpy as np
from matplotlib import pyplot as plt
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import KFold, train_test_split, ParameterGrid
from joblib import Parallel, delayed
from multiprocessing import Pool
from functools import partial
import sys
sys.path.insert(0, "../")
import os
import time
from os.path import basename, splitext
from pygraph.utils.graphfiles import loadDataset
from tqdm import tqdm


def model_selection_for_precomputed_kernel(datafile, estimator,
                                           param_grid_precomputed, param_grid,
                                           model_type, NUM_TRIALS=30,
def model_selection_for_precomputed_kernel(datafile,
                                           estimator,
                                           param_grid_precomputed,
                                           param_grid,
                                           model_type,
                                           NUM_TRIALS=30,
                                           datafile_y=None,
                                           extra_params=None,
                                           ds_name='ds-unknown'):
                                           ds_name='ds-unknown',
                                           n_jobs=1):
    """Perform model selection, fitting and testing for precomputed kernels using nested cv. Print out necessary data during the process, then finally the results.
    Parameters
@@ -40,94 +61,101 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
    >>>
    >>> model_selection_for_precomputed_kernel(datafile, estimator, param_grid_precomputed, param_grid, 'regression')
    """
    import numpy as np
    from matplotlib import pyplot as plt
    from sklearn.kernel_ridge import KernelRidge
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score, mean_squared_error
    from sklearn.model_selection import KFold, train_test_split, ParameterGrid
    import sys
    sys.path.insert(0, "../")
    import os
    from os.path import basename, splitext
    from pygraph.utils.graphfiles import loadDataset
    from tqdm import tqdm
    tqdm.monitor_interval = 0
    results_dir = '../notebooks/results/' + estimator.__name__
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    # open file to save all results for this dataset.
    with open(results_dir + '/' + ds_name + '.txt', 'w') as fresults:
        fresults.write('# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n')
        # setup the model type
        model_type = model_type.lower()
        if model_type != 'regression' and model_type != 'classification':
            raise Exception(
                'The model type is incorrect! Please choose from regression or classification.')
        print()
        print('--- This is a %s problem ---' % model_type)
        fresults.write('This is a %s problem.\n\n' % model_type)
    # a string to save all the results.
    str_fw = '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n'
        # Load the dataset
        print()
        print('\nI. Loading dataset from file...')
        dataset, y = loadDataset(datafile, filename_y=datafile_y, extra_params=extra_params)
    # setup the model type
    model_type = model_type.lower()
    if model_type != 'regression' and model_type != 'classification':
        raise Exception(
            'The model type is incorrect! Please choose from regression or classification.'
        )
    print()
    print('--- This is a %s problem ---' % model_type)
    str_fw += 'This is a %s problem.\n\n' % model_type
    # Load the dataset
    print()
    print('\nI. Loading dataset from file...')
    dataset, y = loadDataset(
        datafile, filename_y=datafile_y, extra_params=extra_params)
        # import matplotlib.pyplot as plt
    # import matplotlib.pyplot as plt
    # import networkx as nx
    # nx.draw_networkx(dataset[30])
    # plt.show()
        # Grid of parameters with a discrete number of values for each.
        param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
        param_list = list(ParameterGrid(param_grid))
        # np.savetxt(results_name_pre + 'param_grid_precomputed.dt',
        #            [[key, value] for key, value in sorted(param_grid_precomputed)])
        # np.savetxt(results_name_pre + 'param_grid.dt',
        #            [[key, value] for key, value in sorted(param_grid)])
    # Grid of parameters with a discrete number of values for each.
    param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
    param_list = list(ParameterGrid(param_grid))
    # np.savetxt(results_name_pre + 'param_grid_precomputed.dt',
    #            [[key, value] for key, value in sorted(param_grid_precomputed)])
    # np.savetxt(results_name_pre + 'param_grid.dt',
    #            [[key, value] for key, value in sorted(param_grid)])
        gram_matrices = []  # a list to store gram matrices for all param_grid_precomputed
        gram_matrix_time = []  # a list to store time to calculate gram matrices
        param_list_pre_revised = []  # list to store param grids precomputed ignoring the useless ones
    gram_matrices = [
    ]  # a list to store gram matrices for all param_grid_precomputed
    gram_matrix_time = [
    ]  # a list to store time to calculate gram matrices
    param_list_pre_revised = [
    ]  # list to store param grids precomputed ignoring the useless ones
    # calculate all gram matrices
    print()
    print('2. Calculating gram matrices. This could take a while...')
    str_fw += '\nI. Gram matrices.\n\n'
    tts = time.time()  # start training time
    nb_gm_ignore = 0  # the number of gram matrices that should not be considered, as they may contain elements that are not numbers (NaN)
    for idx, params_out in enumerate(param_list_precomputed):
        params_out['n_jobs'] = n_jobs
        rtn_data = estimator(dataset, **params_out)
        Kmatrix = rtn_data[0]
        current_run_time = rtn_data[1]
        if len(rtn_data) == 3:
            idx_trim = rtn_data[2]  # the index of trimmed graph list
            y = [y[idx] for idx in idx_trim]
        Kmatrix_diag = Kmatrix.diagonal().copy()
        # remove graphs whose kernels with themselves are zeros
        nb_g_ignore = 0
        for idx, diag in enumerate(Kmatrix_diag):
            if diag == 0:
                Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=0)
                Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=1)
                nb_g_ignore += 1
        # normalization
        for i in range(len(Kmatrix)):
            for j in range(i, len(Kmatrix)):
                Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
                Kmatrix[j][i] = Kmatrix[i][j]
        # calculate all gram matrices
        print()
        print('2. Calculating gram matrices. This could take a while...')
        fresults.write('\nI. Gram matrices.\n\n')
        nb_gm_ignore = 0  # the number of gram matrices that should not be considered, as they may contain elements that are not numbers (NaN)
        for idx, params_out in enumerate(param_list_precomputed):
            rtn_data = estimator(dataset, **params_out)
            Kmatrix = rtn_data[0]
            current_run_time = rtn_data[1]
            if len(rtn_data) == 3:
                idx_trim = rtn_data[2]  # the index of trimmed graph list
                y = [y[idx] for idx in idx_trim]
            Kmatrix_diag = Kmatrix.diagonal().copy()
            for i in range(len(Kmatrix)):
                for j in range(i, len(Kmatrix)):
                    # if Kmatrix_diag[i] != 0 and Kmatrix_diag[j] != 0:
                    Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
                    Kmatrix[j][i] = Kmatrix[i][j]
            print()
            if params_out == {}:
                print('the gram matrix is: ')
                fresults.write('the gram matrix is:\n\n')
            else:
                print('the gram matrix with parameters', params_out, 'is: ')
                fresults.write('the gram matrix with parameters %s is:\n\n' % params_out)
            if np.isnan(Kmatrix).any():  # if the matrix contains elements that are not numbers
        if params_out == {}:
            print('the gram matrix is: ')
            str_fw += 'the gram matrix is:\n\n'
        else:
            print('the gram matrix with parameters', params_out, 'is: ')
            str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out
        if len(Kmatrix) < 2:
            nb_gm_ignore += 1
            print('ignored, as at most only one of all its diagonal value is non-zero.')
            str_fw += 'ignored, as at most only one of all its diagonal value is non-zero.\n\n'
        else:
            if np.isnan(Kmatrix).any(
            ):  # if the matrix contains elements that are not numbers
                nb_gm_ignore += 1
                print('ignored, as it contains elements that are not numbers.')
                fresults.write('ignored, as it contains elements that are not numbers.\n\n')
                str_fw += 'ignored, as it contains elements that are not numbers.\n\n'
            else:
                print(Kmatrix)
                fresults.write(np.array2string(Kmatrix, separator=',', threshold=np.inf, floatmode='unique') + '\n\n')
                str_fw += np.array2string(
                    Kmatrix,
                    separator=',',
                    threshold=np.inf,
                    floatmode='unique') + '\n\n'
                plt.matshow(Kmatrix)
                plt.colorbar()
                fig_file_name = results_dir + '/GM[ds]' + ds_name
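The normalization loop in the hunk above is the usual cosine normalization of a gram matrix, k'(x, y) = k(x, y) / sqrt(k(x, x) * k(y, y)), applied after dropping graphs whose self-kernel is zero. A vectorized sketch equivalent in effect to the double loop:

```python
import numpy as np

def normalize_gram(K):
    # cosine-normalize: K'[i, j] = K[i, j] / sqrt(K[i, i] * K[j, j])
    d = np.sqrt(K.diagonal())
    return K / np.outer(d, d)
```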
@@ -138,115 +166,52 @@ def model_selection_for_precomputed_kernel(datafile, estimator, | |||||
gram_matrices.append(Kmatrix) | gram_matrices.append(Kmatrix) | ||||
gram_matrix_time.append(current_run_time) | gram_matrix_time.append(current_run_time) | ||||
param_list_pre_revised.append(params_out) | param_list_pre_revised.append(params_out) | ||||
-print()
-print('{} gram matrices are calculated, {} of which are ignored.'.format(len(param_list_precomputed), nb_gm_ignore))
-fresults.write('{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore))
-fresults.write('serial numbers of gram matrix figure and their corresponding parameters settings:\n\n')
-fresults.write(''.join(['{}: {}\n'.format(idx, params_out)
-                        for idx, params_out in enumerate(param_list_precomputed)]))
+if nb_g_ignore > 0:
+    print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore)
+    str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore
+print()
+print('{} gram matrices are calculated, {} of which are ignored.'.format(
+    len(param_list_precomputed), nb_gm_ignore))
+str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore)
+str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n'
+str_fw += ''.join([
+    '{}: {}\n'.format(idx, params_out)
+    for idx, params_out in enumerate(param_list_precomputed)
+])
-print()
-print('3. Fitting and predicting using nested cross validation. This could really take a while...')
-# Arrays to store scores
-train_pref = np.zeros((NUM_TRIALS, len(param_list_pre_revised), len(param_list)))
-val_pref = np.zeros((NUM_TRIALS, len(param_list_pre_revised), len(param_list)))
-test_pref = np.zeros((NUM_TRIALS, len(param_list_pre_revised), len(param_list)))
-# Loop for each trial
-pbar = tqdm(total=NUM_TRIALS * len(param_list_pre_revised) * len(param_list),
-            desc='calculate performance', file=sys.stdout)
-for trial in range(NUM_TRIALS):  # Test set level
-    # loop for each outer param tuple
-    for index_out, params_out in enumerate(param_list_pre_revised):
-        # split gram matrix and y to app and test sets.
-        X_app, X_test, y_app, y_test = train_test_split(
-            gram_matrices[index_out], y, test_size=0.1)
-        split_index_app = [y.index(y_i) for y_i in y_app if y_i in y]
-        # split_index_test = [y.index(y_i) for y_i in y_test if y_i in y]
-        X_app = X_app[:, split_index_app]
-        X_test = X_test[:, split_index_app]
-        y_app = np.array(y_app)
-        y_test = np.array(y_test)
-        # loop for each inner param tuple
-        for index_in, params_in in enumerate(param_list):
-            inner_cv = KFold(n_splits=10, shuffle=True, random_state=trial)
-            current_train_perf = []
-            current_valid_perf = []
-            current_test_perf = []
-            # For regression use the Kernel Ridge method
-            try:
-                if model_type == 'regression':
-                    KR = KernelRidge(kernel='precomputed', **params_in)
-                    # loop for each split on validation set level
-                    for train_index, valid_index in inner_cv.split(X_app):
-                        KR.fit(X_app[train_index, :][:, train_index],
-                               y_app[train_index])
-                        # predict on the train, validation and test set
-                        y_pred_train = KR.predict(X_app[train_index, :][:, train_index])
-                        y_pred_valid = KR.predict(X_app[valid_index, :][:, train_index])
-                        y_pred_test = KR.predict(X_test[:, train_index])
-                        # root mean squared errors
-                        current_train_perf.append(
-                            np.sqrt(mean_squared_error(y_app[train_index], y_pred_train)))
-                        current_valid_perf.append(
-                            np.sqrt(mean_squared_error(y_app[valid_index], y_pred_valid)))
-                        current_test_perf.append(
-                            np.sqrt(mean_squared_error(y_test, y_pred_test)))
-                # For classification use SVM
-                else:
-                    KR = SVC(kernel='precomputed', **params_in)
-                    # loop for each split on validation set level
-                    for train_index, valid_index in inner_cv.split(X_app):
-                        KR.fit(X_app[train_index, :][:, train_index],
-                               y_app[train_index])
-                        # predict on the train, validation and test set
-                        y_pred_train = KR.predict(X_app[train_index, :][:, train_index])
-                        y_pred_valid = KR.predict(X_app[valid_index, :][:, train_index])
-                        y_pred_test = KR.predict(X_test[:, train_index])
-                        # accuracies
-                        current_train_perf.append(
-                            accuracy_score(y_app[train_index], y_pred_train))
-                        current_valid_perf.append(
-                            accuracy_score(y_app[valid_index], y_pred_valid))
-                        current_test_perf.append(
-                            accuracy_score(y_test, y_pred_test))
-            except ValueError:
-                print(sys.exc_info()[0])
-                print(params_out, params_in)
-            # average performance on inner splits
-            train_pref[trial][index_out][index_in] = np.mean(current_train_perf)
-            val_pref[trial][index_out][index_in] = np.mean(current_valid_perf)
-            test_pref[trial][index_out][index_in] = np.mean(current_test_perf)
-            pbar.update(1)
-pbar.clear()
-print()
+if len(gram_matrices) == 0:
+    print('all gram matrices are ignored, no results obtained.')
+    str_fw += '\nall gram matrices are ignored, no results obtained.\n\n'
+else:
+    print('3. Fitting and predicting using nested cross validation. This could really take a while...')
+    pool = Pool(n_jobs)
+    trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
+    result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
+    train_pref = [item[0] for item in result_perf]
+    val_pref = [item[1] for item in result_perf]
+    test_pref = [item[2] for item in result_perf]
+    pool.close()
+    pool.join()
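# functools.partial binds the five fixed arguments; pool.map then only
# supplies the varying trial index, which must be the last parameter of
# trial_do for this pairing to line up.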
+    # trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
+    # result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS))
+    # train_pref = [item[0] for item in result_perf]
+    # val_pref = [item[1] for item in result_perf]
+    # test_pref = [item[2] for item in result_perf]
+    # pbar.clear()
# np.save(results_name_pre + 'train_pref.dt', train_pref)
# np.save(results_name_pre + 'val_pref.dt', val_pref)
# np.save(results_name_pre + 'test_pref.dt', test_pref)
print()
print('4. Getting final performance...')
-fresults.write('\nII. Performance.\n\n')
+str_fw += '\nII. Performance.\n\n'
# averages and confidences of performances on outer trials for each combination of parameters
average_train_scores = np.mean(train_pref, axis=0)
average_val_scores = np.mean(val_pref, axis=0)
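# stacked over trials, train_pref/val_pref/test_pref have shape
# (NUM_TRIALS, len(param_list_pre_revised), len(param_list)); averaging over
# axis 0 leaves one mean score per (outer, inner) parameter pair.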
@@ -255,53 +220,78 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
std_train_scores = np.std(train_pref, axis=0, ddof=1)
std_val_scores = np.std(val_pref, axis=0, ddof=1)
std_perf_scores = np.std(test_pref, axis=0, ddof=1)
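# ddof=1 applies Bessel's correction, i.e. these are sample standard
# deviations across the NUM_TRIALS outer trials.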
if model_type == 'regression':
    best_val_perf = np.amin(average_val_scores)
else:
    best_val_perf = np.amax(average_val_scores)
best_params_index = np.where(average_val_scores == best_val_perf)
# find smallest val std with best val perf.
-best_val_stds = [std_val_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
+best_val_stds = [
+    std_val_scores[value][best_params_index[1][idx]]
+    for idx, value in enumerate(best_params_index[0])
+]
min_val_std = np.amin(best_val_stds)
best_params_index = np.where(std_val_scores == min_val_std)
-best_params_out = [param_list_pre_revised[i] for i in best_params_index[0]]
+best_params_out = [
+    param_list_pre_revised[i] for i in best_params_index[0]
+]
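# note: the second np.where matches on the std alone, so ties are broken by
# the smallest validation std; a cell with that same std but a worse mean
# score could in principle be selected as well.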
best_params_in = [param_list[i] for i in best_params_index[1]]
print('best_params_out: ', best_params_out)
print('best_params_in: ', best_params_in)
print()
print('best_val_perf: ', best_val_perf)
print('best_val_std: ', min_val_std)
-fresults.write('best settings of hyper-params to build gram matrix: %s\n' % best_params_out)
-fresults.write('best settings of other hyper-params: %s\n\n' % best_params_in)
-fresults.write('best_val_perf: %s\n' % best_val_perf)
-fresults.write('best_val_std: %s\n' % min_val_std)
+str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out
+str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in
+str_fw += 'best_val_perf: %s\n' % best_val_perf
+str_fw += 'best_val_std: %s\n' % min_val_std
-final_performance = [average_perf_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
-final_confidence = [std_perf_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
+final_performance = [
+    average_perf_scores[value][best_params_index[1][idx]]
+    for idx, value in enumerate(best_params_index[0])
+]
+final_confidence = [
+    std_perf_scores[value][best_params_index[1][idx]]
+    for idx, value in enumerate(best_params_index[0])
+]
print('final_performance: ', final_performance)
print('final_confidence: ', final_confidence)
-fresults.write('final_performance: %s\n' % final_performance)
-fresults.write('final_confidence: %s\n' % final_confidence)
-train_performance = [average_train_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
-train_std = [std_train_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
+str_fw += 'final_performance: %s\n' % final_performance
+str_fw += 'final_confidence: %s\n' % final_confidence
+train_performance = [
+    average_train_scores[value][best_params_index[1][idx]]
+    for idx, value in enumerate(best_params_index[0])
+]
+train_std = [
+    std_train_scores[value][best_params_index[1][idx]]
+    for idx, value in enumerate(best_params_index[0])
+]
print('train_performance: %s' % train_performance)
print('train_std: ', train_std)
-fresults.write('train_performance: %s\n' % train_performance)
-fresults.write('train_std: %s\n\n' % train_std)
+str_fw += 'train_performance: %s\n' % train_performance
+str_fw += 'train_std: %s\n\n' % train_std
print()
+tt_total = time.time() - tts  # training time for all hyper-parameters
average_gram_matrix_time = np.mean(gram_matrix_time)
std_gram_matrix_time = np.std(gram_matrix_time, ddof=1)
-best_gram_matrix_time = [gram_matrix_time[i] for i in best_params_index[0]]
+best_gram_matrix_time = [
+    gram_matrix_time[i] for i in best_params_index[0]
+]
ave_bgmt = np.mean(best_gram_matrix_time)
std_bgmt = np.std(best_gram_matrix_time, ddof=1)
-print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
-      .format(average_gram_matrix_time, std_gram_matrix_time))
-print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(ave_bgmt, std_bgmt))
-fresults.write('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'
-               .format(average_gram_matrix_time, std_gram_matrix_time))
-fresults.write('time to calculate best gram matrix: {:.2f}±{:.2f}s\n\n'.format(ave_bgmt, std_bgmt))
+print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
+      .format(average_gram_matrix_time, std_gram_matrix_time))
+print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(ave_bgmt, std_bgmt))
+print('total training time with all hyper-param choices: {:.2f}s'.format(tt_total))
+str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time)
+str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
+str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total)
# # save results to file
# np.savetxt(results_name_pre + 'average_train_scores.dt',
@@ -312,7 +302,7 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
# np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores)
# np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores)
# np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores)
# np.save(results_name_pre + 'best_params_index', best_params_index)
# np.save(results_name_pre + 'best_params_pre.dt', best_params_out)
# np.save(results_name_pre + 'best_params_in.dt', best_params_in)
@@ -322,7 +312,7 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
# np.save(results_name_pre + 'final_confidence.dt', final_confidence)
# np.save(results_name_pre + 'train_performance.dt', train_performance)
# np.save(results_name_pre + 'train_std.dt', train_std)
# np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time)
# np.save(results_name_pre + 'average_gram_matrix_time.dt',
#         average_gram_matrix_time)
@@ -330,7 +320,7 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
#         std_gram_matrix_time)
# np.save(results_name_pre + 'best_gram_matrix_time.dt',
#         best_gram_matrix_time)
# print out as table.
from collections import OrderedDict
from tabulate import tabulate
@@ -343,20 +333,150 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
    param_in['C'] = '{:.2e}'.format(param_in['C'])
table_dict['params'] = [{**param_out, **param_in}
                        for param_in in param_list for param_out in param_list_pre_revised]
-table_dict['gram_matrix_time'] = ['{:.2f}'.format(gram_matrix_time[index_out])
-                                  for param_in in param_list for index_out, _ in enumerate(param_list_pre_revised)]
-table_dict['valid_perf'] = ['{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], std_val_scores[index_out][index_in])
-                            for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)]
-table_dict['test_perf'] = ['{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in], std_perf_scores[index_out][index_in])
-                           for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)]
-table_dict['train_perf'] = ['{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in], std_train_scores[index_out][index_in])
-                            for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)]
-keyorder = ['params', 'train_perf', 'valid_perf',
-            'test_perf', 'gram_matrix_time']
+table_dict['gram_matrix_time'] = [
+    '{:.2f}'.format(gram_matrix_time[index_out])
+    for param_in in param_list
+    for index_out, _ in enumerate(param_list_pre_revised)
+]
+table_dict['valid_perf'] = [
+    '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
+                           std_val_scores[index_out][index_in])
+    for index_in, _ in enumerate(param_list)
+    for index_out, _ in enumerate(param_list_pre_revised)
+]
+table_dict['test_perf'] = [
+    '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
+                           std_perf_scores[index_out][index_in])
+    for index_in, _ in enumerate(param_list)
+    for index_out, _ in enumerate(param_list_pre_revised)
+]
+table_dict['train_perf'] = [
+    '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
+                           std_train_scores[index_out][index_in])
+    for index_in, _ in enumerate(param_list)
+    for index_out, _ in enumerate(param_list_pre_revised)
+]
+keyorder = [
+    'params', 'train_perf', 'valid_perf', 'test_perf',
+    'gram_matrix_time'
+]
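# every column list is flattened in the same order (param_out varies fastest,
# param_in slowest), so row k of each column refers to the same parameter pair.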
print()
-tb_print = tabulate(OrderedDict(sorted(table_dict.items(),
-                                       key=lambda i: keyorder.index(i[0]))), headers='keys')
+tb_print = tabulate(
+    OrderedDict(
+        sorted(table_dict.items(), key=lambda i: keyorder.index(i[0]))),
+    headers='keys')
print(tb_print)
-fresults.write('table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print)
+str_fw += 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print
+# open file to save all results for this dataset.
+if not os.path.exists(results_dir):
+    os.makedirs(results_dir)
+with open(results_dir + '/' + ds_name + '.txt', 'w') as fresults:
+    fresults.write(str_fw)
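# all results have been accumulated in str_fw, so the file is opened and
# written exactly once, at the very end of the procedure; the with-statement
# closes it automatically.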
+def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, trial):  # Test set level
+    # Arrays to store scores
+    train_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
+    val_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
+    test_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
+    # loop for each outer param tuple
+    for index_out, params_out in enumerate(param_list_pre_revised):
+        # split gram matrix and y to app and test sets.
+        X_app, X_test, y_app, y_test = train_test_split(
+            gram_matrices[index_out], y, test_size=0.1)
+        split_index_app = [y.index(y_i) for y_i in y_app if y_i in y]
+        # split_index_test = [y.index(y_i) for y_i in y_test if y_i in y]
+        X_app = X_app[:, split_index_app]
+        X_test = X_test[:, split_index_app]
+        y_app = np.array(y_app)
+        y_test = np.array(y_test)
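# with a precomputed kernel the columns act as features: both the app and
# test rows are restricted to the app-sample columns via split_index_app.
# (y.index maps each target back to a position, so duplicate y values would
# all resolve to the first occurrence.)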
+        # loop for each inner param tuple
+        for index_in, params_in in enumerate(param_list):
+            inner_cv = KFold(n_splits=10, shuffle=True, random_state=trial)
+            current_train_perf = []
+            current_valid_perf = []
+            current_test_perf = []
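# random_state=trial makes the 10 folds reproducible and identical across
# all inner parameter settings within one trial, so their scores are comparable.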
+            # For regression use the Kernel Ridge method
+            try:
+                if model_type == 'regression':
+                    KR = KernelRidge(kernel='precomputed', **params_in)
+                    # loop for each split on validation set level
+                    for train_index, valid_index in inner_cv.split(X_app):
+                        KR.fit(X_app[train_index, :][:, train_index],
+                               y_app[train_index])
+                        # predict on the train, validation and test set
+                        y_pred_train = KR.predict(
+                            X_app[train_index, :][:, train_index])
+                        y_pred_valid = KR.predict(
+                            X_app[valid_index, :][:, train_index])
+                        y_pred_test = KR.predict(X_test[:, train_index])
+                        # root mean squared errors
+                        current_train_perf.append(
+                            np.sqrt(mean_squared_error(y_app[train_index], y_pred_train)))
+                        current_valid_perf.append(
+                            np.sqrt(mean_squared_error(y_app[valid_index], y_pred_valid)))
+                        current_test_perf.append(
+                            np.sqrt(mean_squared_error(y_test, y_pred_test)))
+                # For classification use SVM
+                else:
+                    KR = SVC(kernel='precomputed', **params_in)
+                    # loop for each split on validation set level
+                    for train_index, valid_index in inner_cv.split(X_app):
+                        KR.fit(X_app[train_index, :][:, train_index],
+                               y_app[train_index])
+                        # predict on the train, validation and test set
+                        y_pred_train = KR.predict(
+                            X_app[train_index, :][:, train_index])
+                        y_pred_valid = KR.predict(
+                            X_app[valid_index, :][:, train_index])
+                        y_pred_test = KR.predict(X_test[:, train_index])
+                        # accuracies
+                        current_train_perf.append(
+                            accuracy_score(y_app[train_index], y_pred_train))
+                        current_valid_perf.append(
+                            accuracy_score(y_app[valid_index], y_pred_valid))
+                        current_test_perf.append(
+                            accuracy_score(y_test, y_pred_test))
+            except ValueError:
+                print(sys.exc_info()[0])
+                print(params_out, params_in)
+            # average performance on inner splits
+            train_pref[index_out][index_in] = np.mean(current_train_perf)
+            val_pref[index_out][index_in] = np.mean(current_valid_perf)
+            test_pref[index_out][index_in] = np.mean(current_test_perf)
+    return train_pref, val_pref, test_pref
@@ -61,10 +61,11 @@ def floydTransformation(G, edge_weight=None):
    spMatrix = nx.floyd_warshall_numpy(G, weight=edge_weight)
    S = nx.Graph()
    S.add_nodes_from(G.nodes(data=True))
+    ns = list(G.nodes())
    for i in range(0, G.number_of_nodes()):
        for j in range(i + 1, G.number_of_nodes()):
            if spMatrix[i, j] != np.inf:
-                S.add_edge(i, j, cost=spMatrix[i, j])
+                S.add_edge(ns[i], ns[j], cost=spMatrix[i, j])
    return S
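A quick usage sketch of the fixed function (hypothetical toy graph; only floydTransformation and networkx from the code above are assumed):

import networkx as nx

G = nx.Graph()
G.add_edge('a', 'b', cost=1.0)
G.add_edge('b', 'c', cost=2.0)
S = floydTransformation(G, edge_weight='cost')
# every connected pair is now linked by its shortest-path cost; with string
# node labels the ns lookup keeps the original identifiers, so S contains
# the edge ('a', 'c') with cost 3.0 instead of a spurious edge (0, 2).
print(sorted(S.edges(data=True)))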