Browse Source

1. add parallel computing scheme to spkernel and model_selection_precomputed.

2. modify model_selection_precomputed so that all results are written into memory and then to a file in the last section of the code, in case the IO takes too much time on systems with separated CPU/disk.
3. correct utils.floyd_warshall_numpy function. DO NOT use the last version.
v0.1
jajupmochi 7 years ago
parent
commit
22a1f1e8d8
9 changed files with 2635 additions and 1596 deletions
  1. +10
    -7
      README.md
  2. +143
    -70
      datasets/ds.py
  3. +911
    -895
      notebooks/run_randomwalkkernel.ipynb
  4. +763
    -208
      notebooks/run_spkernel.ipynb
  5. +150
    -49
      notebooks/run_spkernel.py
  6. +0
    -1
      pygraph/kernels/.#commonWalkKernel.py
  7. +319
    -148
      pygraph/kernels/spKernel.py
  8. +337
    -217
      pygraph/utils/model_selection_precomputed.py
  9. +2
    -1
      pygraph/utils/utils.py

+ 10
- 7
README.md View File

@@ -3,12 +3,15 @@ A python package for graph kernels.


## Requirements ## Requirements


* numpy - 1.13.3
* scipy - 1.0.0
* matplotlib - 2.1.0
* networkx - 2.0
* sklearn - 0.19.1
* tabulate - 0.8.2
numpy==1.14.5
scipy==1.1.0
matplotlib==2.2.2
networkx==2.1
scikit-learn==0.19.1
tabulate==0.8.2
tqdm==4.23.4
control==0.7.0 (for generalized random walk kernels only)
slycot===0.3.2.dev-5263ada (for generalized random walk kernels only, requires fortran compiler, gfortran for example)


## Results with minimal test RMSE for each kernel on dataset Asyclic ## Results with minimal test RMSE for each kernel on dataset Asyclic


@@ -28,7 +31,7 @@ For prediction we randomly divide the data in train and test subset, where 90\%
| WL shortest path | 28.74±0.60 | 38.20±0.62 | 39.02±6.09 | 'height': 10.0, 'alpha': '1.00' | 146.83"/80.63"±45.04" | | WL shortest path | 28.74±0.60 | 38.20±0.62 | 39.02±6.09 | 'height': 10.0, 'alpha': '1.00' | 146.83"/80.63"±45.04" |
| WL edge | 30.21±0.64 | 36.53±1.02 | 38.42±6.42 | 'height': 5.0, 'alpha': '6.31e-01' | 5.24"/5.15"±2.83" | | WL edge | 30.21±0.64 | 36.53±1.02 | 38.42±6.42 | 'height': 5.0, 'alpha': '6.31e-01' | 5.24"/5.15"±2.83" |
| Treelet | 7.33±0.64 | 13.86±0.80 | 15.38±3.56 | 'alpha': '1.12e+01' | 0.48" | | Treelet | 7.33±0.64 | 13.86±0.80 | 15.38±3.56 | 'alpha': '1.12e+01' | 0.48" |
| Path up to d | 5.76±0.27 | 9.89±0.87 | 10.21±4.16 | 'depth': 2.0, 'k_func': 'MinMax', 'alpha': '0.1' | 0.56"/1.16"±0.75" |
| Path up to d | 5.76±0.27 | 9.89±0.87 | 10.21±4.16 | 'depth': 2.0, 'k_func': 'MinMax', 'alpha ': '0.1' | 0.56"/1.16"±0.75" |
| Cyclic pattern | | | | | | | Cyclic pattern | | | | | |
| Walk up to n | 20.88±0.74 | 23.34±1.11 | 24.46±6.57 | 'n': 2.0, 'alpha': '1.00e-03' | 0.56"/331.70"±753.44" | | Walk up to n | 20.88±0.74 | 23.34±1.11 | 24.46±6.57 | 'n': 2.0, 'alpha': '1.00e-03' | 0.56"/331.70"±753.44" |




+ 143
- 70
datasets/ds.py View File

@@ -3,106 +3,66 @@ dslist = [
'name': 'Acyclic', 'name': 'Acyclic',
'dataset': '../datasets/acyclic/dataset_bps.ds', 'dataset': '../datasets/acyclic/dataset_bps.ds',
'task': 'regression' 'task': 'regression'
}, # node_labeled
{
'name': 'COIL-DEL',
'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'
}, # edge_labeled
}, # node symb
# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
{ {
'name': 'PAH', 'name': 'PAH',
'dataset': '../datasets/PAH/dataset.ds', 'dataset': '../datasets/PAH/dataset.ds',
}, # unlabeled }, # unlabeled
{ {
'name': 'Mutagenicity',
'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'
}, # fully_labeled
{
'name': 'MAO', 'name': 'MAO',
'dataset': '../datasets/MAO/dataset.ds', 'dataset': '../datasets/MAO/dataset.ds',
},
}, # node/edge symb
{ {
'name': 'MUTAG', 'name': 'MUTAG',
'dataset': '../datasets/MUTAG/MUTAG.mat', 'dataset': '../datasets/MUTAG/MUTAG.mat',
'extra_params': { 'extra_params': {
'am_sp_al_nl_el': [0, 0, 3, 1, 2] 'am_sp_al_nl_el': [0, 0, 3, 1, 2]
} }
},
}, # node/edge symb
{ {
'name': 'Alkane', 'name': 'Alkane',
'dataset': '../datasets/Alkane/dataset.ds', 'dataset': '../datasets/Alkane/dataset.ds',
'task': 'regression', 'task': 'regression',
'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',
},
{
'name': 'BZR',
'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'
},
}, # contains single node graph, node symb
# {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
# {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
{ {
'name': 'COX2',
'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'
},
'name': 'Mutagenicity',
'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'
}, # node/edge symb
{ {
'name': 'ENZYMES', 'name': 'ENZYMES',
'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt' 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'
},
{
'name': 'DHFR',
'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'
},
{
'name': 'SYNTHETIC',
'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'
},
{
'name': 'MSRC9',
'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'
},
{
'name': 'MSRC21',
'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'
},
}, # node symb/nsymb
# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
{ {
'name': 'FIRSTMM_DB',
'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'
},
{
'name': 'PROTEINS',
'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'
},
{
'name': 'PROTEINS_full',
'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'
'name': 'Letter-med',
'dataset': '../datasets/Letter-med/Letter-med_A.txt'
}, },
# {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
# {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
# {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
# {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
# {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb

# {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
# {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
{ {
'name': 'D&D', 'name': 'D&D',
'dataset': '../datasets/D&D/DD.mat', 'dataset': '../datasets/D&D/DD.mat',
'extra_params': { 'extra_params': {
'am_sp_al_nl_el': [0, 1, 2, 1, -1] 'am_sp_al_nl_el': [0, 1, 2, 1, -1]
} }
},
{
'name': 'AIDS',
'dataset': '../datasets/AIDS/AIDS_A.txt'
},
{
'name': 'NCI1',
'dataset': '../datasets/NCI1/NCI1.mat',
'extra_params': {
'am_sp_al_nl_el': [1, 1, 2, 0, -1]
}
},
{
'name': 'NCI109',
'dataset': '../datasets/NCI109/NCI109.mat',
'extra_params': {
'am_sp_al_nl_el': [1, 1, 2, 0, -1]
}
},
{
'name': 'NCI-HIV',
'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',
},
}, # node symb
# {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb


# # not working below # # not working below
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',}, # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
@@ -110,3 +70,116 @@ dslist = [
# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',}, # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
] ]

# dslist = [
# {
# 'name': 'Acyclic',
# 'dataset': '../datasets/acyclic/dataset_bps.ds',
# 'task': 'regression'
# }, # node_labeled
# {
# 'name': 'COIL-DEL',
# 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'
# }, # edge_labeled
# {
# 'name': 'PAH',
# 'dataset': '../datasets/PAH/dataset.ds',
# }, # unlabeled
# {
# 'name': 'Mutagenicity',
# 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'
# }, # fully_labeled
# {
# 'name': 'MAO',
# 'dataset': '../datasets/MAO/dataset.ds',
# },
# {
# 'name': 'MUTAG',
# 'dataset': '../datasets/MUTAG/MUTAG.mat',
# 'extra_params': {
# 'am_sp_al_nl_el': [0, 0, 3, 1, 2]
# }
# },
# {
# 'name': 'Alkane',
# 'dataset': '../datasets/Alkane/dataset.ds',
# 'task': 'regression',
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',
# },
# {
# 'name': 'BZR',
# 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'
# },
# {
# 'name': 'COX2',
# 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'
# },
# {
# 'name': 'ENZYMES',
# 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'
# },
# {
# 'name': 'DHFR',
# 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'
# },
# {
# 'name': 'SYNTHETIC',
# 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'
# },
# {
# 'name': 'MSRC9',
# 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'
# },
# {
# 'name': 'MSRC21',
# 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'
# },
# {
# 'name': 'FIRSTMM_DB',
# 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'
# },
# {
# 'name': 'PROTEINS',
# 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'
# },
# {
# 'name': 'PROTEINS_full',
# 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'
# },
# {
# 'name': 'D&D',
# 'dataset': '../datasets/D&D/DD.mat',
# 'extra_params': {
# 'am_sp_al_nl_el': [0, 1, 2, 1, -1]
# }
# },
# {
# 'name': 'AIDS',
# 'dataset': '../datasets/AIDS/AIDS_A.txt'
# },
# {
# 'name': 'NCI1',
# 'dataset': '../datasets/NCI1/NCI1.mat',
# 'extra_params': {
# 'am_sp_al_nl_el': [1, 1, 2, 0, -1]
# }
# },
# {
# 'name': 'NCI109',
# 'dataset': '../datasets/NCI109/NCI109.mat',
# 'extra_params': {
# 'am_sp_al_nl_el': [1, 1, 2, 0, -1]
# }
# },
# {
# 'name': 'NCI-HIV',
# 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',
# },

# # # not working below
# # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
# # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
# ]

+ 911
- 895
notebooks/run_randomwalkkernel.ipynb
File diff suppressed because it is too large
View File


+ 763
- 208
notebooks/run_spkernel.ipynb
File diff suppressed because it is too large
View File


+ 150
- 49
notebooks/run_spkernel.py View File

@@ -1,56 +1,157 @@
import functools
from libs import * from libs import *
from pygraph.kernels.spKernel import spkernel from pygraph.kernels.spKernel import spkernel
from pygraph.utils.kernels import deltakernel, kernelsum
from sklearn.metrics.pairwise import rbf_kernel


dslist = [
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node_labeled
# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge_labeled
# dslist = [
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node symb
# # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',}, # unlabeled # {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',}, # unlabeled
{'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # fully_labeled
# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',},

# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',}, # node/edge symb
# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', # {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}},
# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',},
# {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'},
# {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'},
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'},
# {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'},
# {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'},
# {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'},
# {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'},

# {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'},
# {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'},
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',}, # contains single node graph, node symb
# # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # node/edge symb
# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb

# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}},
# {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'},
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}},
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}},
# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',},
# # not working below
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb
# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
# # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
# # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
# # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb

# # # not working below
# # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
# # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
# ]

import ast
ds = ast.literal_eval(sys.argv[1])

estimator = spkernel estimator = spkernel
param_grid_precomputed = {}
param_grid = [{'C': np.logspace(-10, 10, num = 41, base = 10)},
{'alpha': np.logspace(-10, 10, num = 41, base = 10)}]

for ds in dslist:
print()
print(ds['name'])
model_selection_for_precomputed_kernel(
ds['dataset'], estimator, param_grid_precomputed,
(param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]),
(ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30,
datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
extra_params=(ds['extra_params'] if 'extra_params' in ds else None))
print()
mixkernel = functools.partial(kernelsum, deltakernel, rbf_kernel)
param_grid_precomputed = {
'node_kernels': [{
'symb': deltakernel,
'nsymb': rbf_kernel,
'mix': mixkernel
}]
}
param_grid = [{
'C': np.logspace(-10, 10, num=41, base=10)
}, {
'alpha': np.logspace(-10, 10, num=41, base=10)
}]

print()
print(ds['name'])
model_selection_for_precomputed_kernel(
ds['dataset'],
estimator,
param_grid_precomputed,
(param_grid[1]
if ('task' in ds and ds['task'] == 'regression') else param_grid[0]),
(ds['task'] if 'task' in ds else 'classification'),
NUM_TRIALS=30,
datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
ds_name=ds['name'])

# %lprun -f spkernel \
# model_selection_for_precomputed_kernel( \
# ds['dataset'], estimator, param_grid_precomputed, \
# (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \
# (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30, \
# datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), \
# extra_params=(ds['extra_params'] if 'extra_params' in ds else None))
print()

# import functools
# from libs import *
# from pygraph.kernels.spKernel import spkernel
# from pygraph.utils.kernels import deltakernel, kernelsum
# from sklearn.metrics.pairwise import rbf_kernel

# dslist = [
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node symb
# # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
# # {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',}, # unlabeled
# # {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',}, # node/edge symb
# # {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
# # 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
# # {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
# # 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',}, # contains single node graph, node symb
# # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # node/edge symb
# # {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
# # {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb

# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
# # 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb
# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
# # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
# # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
# # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb

# # # not working below
# # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
# # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
# ]
# estimator = spkernel
# mixkernel = functools.partial(kernelsum, deltakernel, rbf_kernel)
# param_grid_precomputed = {'node_kernels': [{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]}
# param_grid = [{'C': np.logspace(-10, 10, num = 41, base = 10)},
# {'alpha': np.logspace(-10, 10, num = 41, base = 10)}]

# for ds in dslist:
# print()
# print(ds['name'])
# model_selection_for_precomputed_kernel(
# ds['dataset'], estimator, param_grid_precomputed,
# (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]),
# (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30,
# datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
# extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
# ds_name=ds['name'])

# # %lprun -f spkernel \
# # model_selection_for_precomputed_kernel( \
# # ds['dataset'], estimator, param_grid_precomputed, \
# # (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \
# # (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30, \
# # datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), \
# # extra_params=(ds['extra_params'] if 'extra_params' in ds else None))
# print()

+ 0
- 1
pygraph/kernels/.#commonWalkKernel.py View File

@@ -1 +0,0 @@
ljia@ljia-Precision-7520.4716:1530265749

+ 319
- 148
pygraph/kernels/spKernel.py View File

@@ -9,6 +9,9 @@ sys.path.insert(0, "../")
from tqdm import tqdm from tqdm import tqdm
import time import time
from itertools import combinations_with_replacement, product from itertools import combinations_with_replacement, product
from functools import partial
from joblib import Parallel, delayed
from multiprocessing import Pool


import networkx as nx import networkx as nx
import numpy as np import numpy as np
@@ -17,7 +20,11 @@ from pygraph.utils.utils import getSPGraph
from pygraph.utils.graphdataset import get_dataset_attributes from pygraph.utils.graphdataset import get_dataset_attributes




def spkernel(*args, node_label='atom', edge_weight=None, node_kernels=None):
def spkernel(*args,
node_label='atom',
edge_weight=None,
node_kernels=None,
n_jobs=None):
"""Calculate shortest-path kernels between graphs. """Calculate shortest-path kernels between graphs.


Parameters Parameters
@@ -70,180 +77,344 @@ def spkernel(*args, node_label='atom', edge_weight=None, node_kernels=None):
if len(Gn) != len_gn: if len(Gn) != len_gn:
print('\n %d graphs are removed as they don\'t contain edges.\n' % print('\n %d graphs are removed as they don\'t contain edges.\n' %
(len_gn - len(Gn))) (len_gn - len(Gn)))

start_time = time.time() start_time = time.time()
pool = Pool(n_jobs)


# get shortest path graphs of Gn # get shortest path graphs of Gn
Gn = [
getSPGraph(G, edge_weight=edge_weight)
for G in tqdm(Gn, desc='getting sp graphs', file=sys.stdout)
]
getsp_partial = partial(wrap_getSPGraph, Gn, edge_weight)
result_sp = pool.map(getsp_partial, range(0, len(Gn)))
for i in result_sp:
Gn[i[0]] = i[1]

# Gn = [
# getSPGraph(G, edge_weight=edge_weight)
# for G in tqdm(Gn, desc='getting sp graphs', file=sys.stdout)
# ]


Kmatrix = np.zeros((len(Gn), len(Gn))) Kmatrix = np.zeros((len(Gn), len(Gn)))
pbar = tqdm(
total=((len(Gn) + 1) * len(Gn) / 2),
desc='calculating kernels',
file=sys.stdout)

do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
itr = combinations_with_replacement(range(0, len(Gn)), 2)
# chunksize = 2000 # int(len(list(itr)) / n_jobs)
# for i, j, kernel in tqdm(pool.imap_unordered(do_partial, itr, chunksize)):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel

result_perf = pool.map(do_partial, itr)
pool.close()
pool.join()

# result_perf = Parallel(
# n_jobs=n_jobs, verbose=10)(
# delayed(do_partial)(ij)
# for ij in combinations_with_replacement(range(0, len(Gn)), 2))

# result_perf = [
# do_partial(ij)
# for ij in combinations_with_replacement(range(0, len(Gn)), 2)
# ]

for i in result_perf:
Kmatrix[i[0]][i[1]] = i[2]
Kmatrix[i[1]][i[0]] = i[2]

# pbar = tqdm(
# total=((len(Gn) + 1) * len(Gn) / 2),
# desc='calculating kernels',
# file=sys.stdout)
# if ds_attrs['node_labeled']:
# # node symb and non-synb labeled
# if ds_attrs['node_attr_dim'] > 0:
# if ds_attrs['is_directed']:
# for i, j in combinations_with_replacement(
# range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['mix']
# try:
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn(n11[node_label], n21[node_label], [
# n11['attributes']
# ], [n21['attributes']]) * kn(
# n12[node_label], n22[node_label],
# [n12['attributes']], [n22['attributes']])
# Kmatrix[i][j] += kn1
# except KeyError: # missing labels or attributes
# pass
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)

# else:
# for i, j in combinations_with_replacement(
# range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['mix']
# try:
# # each edge walk is counted twice, starting from both its extreme nodes.
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn(n11[node_label], n21[node_label], [
# n11['attributes']
# ], [n21['attributes']]) * kn(
# n12[node_label], n22[node_label],
# [n12['attributes']], [n22['attributes']])
# kn2 = kn(n11[node_label], n22[node_label], [
# n11['attributes']
# ], [n22['attributes']]) * kn(
# n12[node_label], n21[node_label],
# [n12['attributes']], [n21['attributes']])
# Kmatrix[i][j] += kn1 + kn2
# except KeyError: # missing labels or attributes
# pass
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)
# # node symb labeled
# else:
# if ds_attrs['is_directed']:
# for i, j in combinations_with_replacement(
# range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['symb']
# try:
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn(n11[node_label],
# n21[node_label]) * kn(
# n12[node_label], n22[node_label])
# Kmatrix[i][j] += kn1
# except KeyError: # missing labels
# pass
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)

# else:
# for i, j in combinations_with_replacement(
# range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['symb']
# try:
# # each edge walk is counted twice, starting from both its extreme nodes.
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn(n11[node_label],
# n21[node_label]) * kn(
# n12[node_label], n22[node_label])
# kn2 = kn(n11[node_label],
# n22[node_label]) * kn(
# n12[node_label], n21[node_label])
# Kmatrix[i][j] += kn1 + kn2
# except KeyError: # missing labels
# pass
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)
# else:
# # node non-synb labeled
# if ds_attrs['node_attr_dim'] > 0:
# if ds_attrs['is_directed']:
# for i, j in combinations_with_replacement(
# range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['nsymb']
# try:
# # each edge walk is counted twice, starting from both its extreme nodes.
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn([n11['attributes']],
# [n21['attributes']]) * kn(
# [n12['attributes']],
# [n22['attributes']])
# Kmatrix[i][j] += kn1
# except KeyError: # missing attributes
# pass
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)
# else:
# for i, j in combinations_with_replacement(
# range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['nsymb']
# try:
# # each edge walk is counted twice, starting from both its extreme nodes.
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn([n11['attributes']],
# [n21['attributes']]) * kn(
# [n12['attributes']],
# [n22['attributes']])
# kn2 = kn([n11['attributes']],
# [n22['attributes']]) * kn(
# [n12['attributes']],
# [n21['attributes']])
# Kmatrix[i][j] += kn1 + kn2
# except KeyError: # missing attributes
# pass
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)

# # node unlabeled
# else:
# for i, j in combinations_with_replacement(range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# Kmatrix[i][j] += 1
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)

run_time = time.time() - start_time
print(
"\n --- shortest path kernel matrix of size %d built in %s seconds ---"
% (len(Gn), run_time))

return Kmatrix, run_time, idx


def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij):

i = ij[0]
j = ij[1]
Kmatrix = 0
if ds_attrs['node_labeled']: if ds_attrs['node_labeled']:
# node symb and non-synb labeled # node symb and non-synb labeled
if ds_attrs['node_attr_dim'] > 0: if ds_attrs['node_attr_dim'] > 0:
if ds_attrs['is_directed']: if ds_attrs['is_directed']:
for i, j in combinations_with_replacement(
range(0, len(Gn)), 2):
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['mix']
try:
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(n11[node_label], n21[node_label], [
n11['attributes']
], [n21['attributes']]) * kn(
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['mix']
try:
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(
n11[node_label], n21[node_label],
[n11['attributes']], [n21['attributes']]) * kn(
n12[node_label], n22[node_label], n12[node_label], n22[node_label],
[n12['attributes']], [n22['attributes']]) [n12['attributes']], [n22['attributes']])
Kmatrix[i][j] += kn1
except KeyError: # missing labels or attributes
pass
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)

Kmatrix += kn1
except KeyError: # missing labels or attributes
pass
else: else:
for i, j in combinations_with_replacement(
range(0, len(Gn)), 2):
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['mix']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(n11[node_label], n21[node_label], [
n11['attributes']
], [n21['attributes']]) * kn(
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['mix']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(
n11[node_label], n21[node_label],
[n11['attributes']], [n21['attributes']]) * kn(
n12[node_label], n22[node_label], n12[node_label], n22[node_label],
[n12['attributes']], [n22['attributes']]) [n12['attributes']], [n22['attributes']])
kn2 = kn(n11[node_label], n22[node_label], [
n11['attributes']
], [n22['attributes']]) * kn(
kn2 = kn(
n11[node_label], n22[node_label],
[n11['attributes']], [n22['attributes']]) * kn(
n12[node_label], n21[node_label], n12[node_label], n21[node_label],
[n12['attributes']], [n21['attributes']]) [n12['attributes']], [n21['attributes']])
Kmatrix[i][j] += kn1 + kn2
except KeyError: # missing labels or attributes
pass
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)
Kmatrix += kn1 + kn2
except KeyError: # missing labels or attributes
pass
# node symb labeled # node symb labeled
else: else:
if ds_attrs['is_directed']: if ds_attrs['is_directed']:
for i, j in combinations_with_replacement(
range(0, len(Gn)), 2):
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['symb']
try:
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(n11[node_label],
n21[node_label]) * kn(
n12[node_label], n22[node_label])
Kmatrix[i][j] += kn1
except KeyError: # missing labels
pass
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)

for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['symb']
try:
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(n11[node_label], n21[node_label]) * kn(
n12[node_label], n22[node_label])
Kmatrix += kn1
except KeyError: # missing labels
pass
else: else:
for i, j in combinations_with_replacement(
range(0, len(Gn)), 2):
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['symb']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(n11[node_label],
n21[node_label]) * kn(
n12[node_label], n22[node_label])
kn2 = kn(n11[node_label],
n22[node_label]) * kn(
n12[node_label], n21[node_label])
Kmatrix[i][j] += kn1 + kn2
except KeyError: # missing labels
pass
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['symb']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(n11[node_label], n21[node_label]) * kn(
n12[node_label], n22[node_label])
kn2 = kn(n11[node_label], n22[node_label]) * kn(
n12[node_label], n21[node_label])
Kmatrix += kn1 + kn2
except KeyError: # missing labels
pass
else: else:
# node non-synb labeled # node non-synb labeled
if ds_attrs['node_attr_dim'] > 0: if ds_attrs['node_attr_dim'] > 0:
if ds_attrs['is_directed']: if ds_attrs['is_directed']:
for i, j in combinations_with_replacement(
range(0, len(Gn)), 2):
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['nsymb']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn([n11['attributes']],
[n21['attributes']]) * kn(
[n12['attributes']],
[n22['attributes']])
Kmatrix[i][j] += kn1
except KeyError: # missing attributes
pass
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['nsymb']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(
[n11['attributes']], [n21['attributes']]) * kn(
[n12['attributes']], [n22['attributes']])
Kmatrix += kn1
except KeyError: # missing attributes
pass
else: else:
for i, j in combinations_with_replacement(
range(0, len(Gn)), 2):
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['nsymb']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn([n11['attributes']],
[n21['attributes']]) * kn(
[n12['attributes']],
[n22['attributes']])
kn2 = kn([n11['attributes']],
[n22['attributes']]) * kn(
[n12['attributes']],
[n21['attributes']])
Kmatrix[i][j] += kn1 + kn2
except KeyError: # missing attributes
pass
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)

# node unlabeled
else:
for i, j in combinations_with_replacement(range(0, len(Gn)), 2):
for e1, e2 in product( for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)): Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']: if e1[2]['cost'] == e2[2]['cost']:
Kmatrix[i][j] += 1
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)
kn = node_kernels['nsymb']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(
[n11['attributes']], [n21['attributes']]) * kn(
[n12['attributes']], [n22['attributes']])
kn2 = kn(
[n11['attributes']], [n22['attributes']]) * kn(
[n12['attributes']], [n21['attributes']])
Kmatrix += kn1 + kn2
except KeyError: # missing attributes
pass
# node unlabeled
else:
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
Kmatrix += 1


run_time = time.time() - start_time
print(
"\n --- shortest path kernel matrix of size %d built in %s seconds ---"
% (len(Gn), run_time))
return i, j, Kmatrix


return Kmatrix, run_time, idx

def wrap_getSPGraph(Gn, weight, i):
return i, getSPGraph(Gn[i], edge_weight=weight)

+ 337
- 217
pygraph/utils/model_selection_precomputed.py View File

@@ -1,11 +1,32 @@
import numpy as np
from matplotlib import pyplot as plt
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import KFold, train_test_split, ParameterGrid


from joblib import Parallel, delayed
from multiprocessing import Pool
from functools import partial
import sys
sys.path.insert(0, "../")
import os
import time
from os.path import basename, splitext
from pygraph.utils.graphfiles import loadDataset
from tqdm import tqdm


def model_selection_for_precomputed_kernel(datafile, estimator,
param_grid_precomputed, param_grid,
model_type, NUM_TRIALS=30,

def model_selection_for_precomputed_kernel(datafile,
estimator,
param_grid_precomputed,
param_grid,
model_type,
NUM_TRIALS=30,
datafile_y=None, datafile_y=None,
extra_params=None, extra_params=None,
ds_name='ds-unknown'):
ds_name='ds-unknown',
n_jobs=1):
"""Perform model selection, fitting and testing for precomputed kernels using nested cv. Print out neccessary data during the process then finally the results. """Perform model selection, fitting and testing for precomputed kernels using nested cv. Print out neccessary data during the process then finally the results.


Parameters Parameters
@@ -40,94 +61,101 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
>>> >>>
>>> model_selection_for_precomputed_kernel(datafile, estimator, param_grid_precomputed, param_grid, 'regression') >>> model_selection_for_precomputed_kernel(datafile, estimator, param_grid_precomputed, param_grid, 'regression')
""" """
import numpy as np
from matplotlib import pyplot as plt
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import KFold, train_test_split, ParameterGrid

import sys
sys.path.insert(0, "../")
import os
from os.path import basename, splitext
from pygraph.utils.graphfiles import loadDataset
from tqdm import tqdm
tqdm.monitor_interval = 0 tqdm.monitor_interval = 0


results_dir = '../notebooks/results/' + estimator.__name__ results_dir = '../notebooks/results/' + estimator.__name__
if not os.path.exists(results_dir):
os.makedirs(results_dir)

# open file to save all results for this dataset.
with open(results_dir + '/' + ds_name + '.txt', 'w') as fresults:
fresults.write('# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n')

# setup the model type
model_type = model_type.lower()
if model_type != 'regression' and model_type != 'classification':
raise Exception(
'The model type is incorrect! Please choose from regression or classification.')
print()
print('--- This is a %s problem ---' % model_type)
fresults.write('This is a %s problem.\n\n' % model_type)
# a string to save all the results.
str_fw = '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n'


# Load the dataset
print()
print('\nI. Loading dataset from file...')
dataset, y = loadDataset(datafile, filename_y=datafile_y, extra_params=extra_params)
# setup the model type
model_type = model_type.lower()
if model_type != 'regression' and model_type != 'classification':
raise Exception(
'The model type is incorrect! Please choose from regression or classification.'
)
print()
print('--- This is a %s problem ---' % model_type)
str_fw += 'This is a %s problem.\n\n' % model_type

# Load the dataset
print()
print('\nI. Loading dataset from file...')
dataset, y = loadDataset(
datafile, filename_y=datafile_y, extra_params=extra_params)


# import matplotlib.pyplot as plt
# import matplotlib.pyplot as plt
# import networkx as nx # import networkx as nx
# nx.draw_networkx(dataset[30]) # nx.draw_networkx(dataset[30])
# plt.show() # plt.show()


# Grid of parameters with a discrete number of values for each.
param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
param_list = list(ParameterGrid(param_grid))
# np.savetxt(results_name_pre + 'param_grid_precomputed.dt',
# [[key, value] for key, value in sorted(param_grid_precomputed)])
# np.savetxt(results_name_pre + 'param_grid.dt',
# [[key, value] for key, value in sorted(param_grid)])
# Grid of parameters with a discrete number of values for each.
param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
param_list = list(ParameterGrid(param_grid))
# np.savetxt(results_name_pre + 'param_grid_precomputed.dt',
# [[key, value] for key, value in sorted(param_grid_precomputed)])
# np.savetxt(results_name_pre + 'param_grid.dt',
# [[key, value] for key, value in sorted(param_grid)])


gram_matrices = [] # a list to store gram matrices for all param_grid_precomputed
gram_matrix_time = [] # a list to store time to calculate gram matrices
param_list_pre_revised = [] # list to store param grids precomputed ignoring the useless ones
gram_matrices = [
] # a list to store gram matrices for all param_grid_precomputed
gram_matrix_time = [
] # a list to store time to calculate gram matrices
param_list_pre_revised = [
] # list to store param grids precomputed ignoring the useless ones

# calculate all gram matrices
print()
print('2. Calculating gram matrices. This could take a while...')
str_fw += '\nI. Gram matrices.\n\n'
tts = time.time() # start training time
nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN)
for idx, params_out in enumerate(param_list_precomputed):
params_out['n_jobs'] = n_jobs
rtn_data = estimator(dataset, **params_out)
Kmatrix = rtn_data[0]
current_run_time = rtn_data[1]
if len(rtn_data) == 3:
idx_trim = rtn_data[2] # the index of trimmed graph list
y = [y[idx] for idx in idx_trim]

Kmatrix_diag = Kmatrix.diagonal().copy()
# remove graphs whose kernels with themselves are zeros
nb_g_ignore = 0
for idx, diag in enumerate(Kmatrix_diag):
if diag == 0:
Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=0)
Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=1)
nb_g_ignore += 1
# normalization
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
Kmatrix[j][i] = Kmatrix[i][j]


# calculate all gram matrices
print() print()
print('2. Calculating gram matrices. This could take a while...')
fresults.write('\nI. Gram matrices.\n\n')
nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN)
for idx, params_out in enumerate(param_list_precomputed):
rtn_data = estimator(dataset, **params_out)
Kmatrix = rtn_data[0]
current_run_time = rtn_data[1]
if len(rtn_data) == 3:
idx_trim = rtn_data[2] # the index of trimmed graph list
y = [y[idx] for idx in idx_trim]
Kmatrix_diag = Kmatrix.diagonal().copy()
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):
# if Kmatrix_diag[i] != 0 and Kmatrix_diag[j] != 0:
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
Kmatrix[j][i] = Kmatrix[i][j]

print()
if params_out == {}:
print('the gram matrix is: ')
fresults.write('the gram matrix is:\n\n')
else:
print('the gram matrix with parameters', params_out, 'is: ')
fresults.write('the gram matrix with parameters %s is:\n\n' % params_out)
if np.isnan(Kmatrix).any(): # if the matrix contains elements that are not numbers
if params_out == {}:
print('the gram matrix is: ')
str_fw += 'the gram matrix is:\n\n'
else:
print('the gram matrix with parameters', params_out, 'is: ')
str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out
if len(Kmatrix) < 2:
nb_gm_ignore += 1
print('ignored, as at most only one of all its diagonal value is non-zero.')
str_fw += 'ignored, as at most only one of all its diagonal value is non-zero.\n\n'
else:
if np.isnan(Kmatrix).any(
): # if the matrix contains elements that are not numbers
nb_gm_ignore += 1 nb_gm_ignore += 1
print('ignored, as it contains elements that are not numbers.') print('ignored, as it contains elements that are not numbers.')
fresults.write('ignored, as it contains elements that are not numbers.\n\n')
str_fw += 'ignored, as it contains elements that are not numbers.\n\n'
else: else:
print(Kmatrix) print(Kmatrix)
fresults.write(np.array2string(Kmatrix, separator=',', threshold=np.inf, floatmode='unique') + '\n\n')
str_fw += np.array2string(
Kmatrix,
separator=',',
threshold=np.inf,
floatmode='unique') + '\n\n'
plt.matshow(Kmatrix) plt.matshow(Kmatrix)
plt.colorbar() plt.colorbar()
fig_file_name = results_dir + '/GM[ds]' + ds_name fig_file_name = results_dir + '/GM[ds]' + ds_name
@@ -138,115 +166,52 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
gram_matrices.append(Kmatrix) gram_matrices.append(Kmatrix)
gram_matrix_time.append(current_run_time) gram_matrix_time.append(current_run_time)
param_list_pre_revised.append(params_out) param_list_pre_revised.append(params_out)
print()
print('{} gram matrices are calculated, {} of which are ignored.'.format(len(param_list_precomputed), nb_gm_ignore))
fresults.write('{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore))
fresults.write('serial numbers of gram matrix figure and their corresponding parameters settings:\n\n')
fresults.write(''.join(['{}: {}\n'.format(idx, params_out)
for idx, params_out in enumerate(param_list_precomputed)]))
if nb_g_ignore > 0:
print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore)
str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore
print()
print(
'{} gram matrices are calculated, {} of which are ignored.'.format(
len(param_list_precomputed), nb_gm_ignore))
str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore)
str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n'
str_fw += ''.join([
'{}: {}\n'.format(idx, params_out)
for idx, params_out in enumerate(param_list_precomputed)
])


print()
print('3. Fitting and predicting using nested cross validation. This could really take a while...')
# Arrays to store scores
train_pref = np.zeros(
(NUM_TRIALS, len(param_list_pre_revised), len(param_list)))
val_pref = np.zeros(
(NUM_TRIALS, len(param_list_pre_revised), len(param_list)))
test_pref = np.zeros(
(NUM_TRIALS, len(param_list_pre_revised), len(param_list)))

# Loop for each trial
pbar = tqdm(total=NUM_TRIALS * len(param_list_pre_revised) * len(param_list),
desc='calculate performance', file=sys.stdout)
for trial in range(NUM_TRIALS): # Test set level
# loop for each outer param tuple
for index_out, params_out in enumerate(param_list_pre_revised):
# split gram matrix and y to app and test sets.
X_app, X_test, y_app, y_test = train_test_split(
gram_matrices[index_out], y, test_size=0.1)
split_index_app = [y.index(y_i) for y_i in y_app if y_i in y]
# split_index_test = [y.index(y_i) for y_i in y_test if y_i in y]
X_app = X_app[:, split_index_app]
X_test = X_test[:, split_index_app]
y_app = np.array(y_app)
y_test = np.array(y_test)

# loop for each inner param tuple
for index_in, params_in in enumerate(param_list):
inner_cv = KFold(n_splits=10, shuffle=True, random_state=trial)
current_train_perf = []
current_valid_perf = []
current_test_perf = []

# For regression use the Kernel Ridge method
try:
if model_type == 'regression':
KR = KernelRidge(kernel='precomputed', **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
KR.fit(X_app[train_index, :]
[:, train_index], y_app[train_index])
# predict on the train, validation and test set
y_pred_train = KR.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = KR.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = KR.predict(X_test[:, train_index])
# root mean squared errors
current_train_perf.append(
np.sqrt(mean_squared_error(y_app[train_index], y_pred_train)))
current_valid_perf.append(
np.sqrt(mean_squared_error(y_app[valid_index], y_pred_valid)))
current_test_perf.append(
np.sqrt(mean_squared_error(y_test, y_pred_test)))
# For clcassification use SVM
else:
KR = SVC(kernel='precomputed', **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
KR.fit(X_app[train_index, :]
[:, train_index], y_app[train_index])
# predict on the train, validation and test set
y_pred_train = KR.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = KR.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = KR.predict(
X_test[:, train_index])
# root mean squared errors
current_train_perf.append(accuracy_score(
y_app[train_index], y_pred_train))
current_valid_perf.append(accuracy_score(
y_app[valid_index], y_pred_valid))
current_test_perf.append(
accuracy_score(y_test, y_pred_test))
except ValueError:
print(sys.exc_info()[0])
print(params_out, params_in)
# average performance on inner splits
train_pref[trial][index_out][index_in] = np.mean(
current_train_perf)
val_pref[trial][index_out][index_in] = np.mean(
current_valid_perf)
test_pref[trial][index_out][index_in] = np.mean(
current_test_perf)
pbar.update(1)
pbar.clear()
print()
if len(gram_matrices) == 0:
print('all gram matrices are ignored, no results obtained.')
str_fw += '\nall gram matrices are ignored, no results obtained.\n\n'
else:
print(
'3. Fitting and predicting using nested cross validation. This could really take a while...'
)
pool = Pool(n_jobs)
trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
train_pref = [item[0] for item in result_perf]
val_pref = [item[1] for item in result_perf]
test_pref = [item[2] for item in result_perf]
pool.close()
pool.join()

# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
# result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS))
# train_pref = [item[0] for item in result_perf]
# val_pref = [item[1] for item in result_perf]
# test_pref = [item[2] for item in result_perf]


# pbar.clear()
# np.save(results_name_pre + 'train_pref.dt', train_pref) # np.save(results_name_pre + 'train_pref.dt', train_pref)
# np.save(results_name_pre + 'val_pref.dt', val_pref) # np.save(results_name_pre + 'val_pref.dt', val_pref)
# np.save(results_name_pre + 'test_pref.dt', test_pref) # np.save(results_name_pre + 'test_pref.dt', test_pref)


print() print()
print('4. Getting final performance...') print('4. Getting final performance...')
fresults.write('\nII. Performance.\n\n')
str_fw += '\nII. Performance.\n\n'
# averages and confidences of performances on outer trials for each combination of parameters # averages and confidences of performances on outer trials for each combination of parameters
average_train_scores = np.mean(train_pref, axis=0) average_train_scores = np.mean(train_pref, axis=0)
average_val_scores = np.mean(val_pref, axis=0) average_val_scores = np.mean(val_pref, axis=0)
@@ -255,53 +220,78 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
std_train_scores = np.std(train_pref, axis=0, ddof=1) std_train_scores = np.std(train_pref, axis=0, ddof=1)
std_val_scores = np.std(val_pref, axis=0, ddof=1) std_val_scores = np.std(val_pref, axis=0, ddof=1)
std_perf_scores = np.std(test_pref, axis=0, ddof=1) std_perf_scores = np.std(test_pref, axis=0, ddof=1)
if model_type == 'regression': if model_type == 'regression':
best_val_perf = np.amin(average_val_scores) best_val_perf = np.amin(average_val_scores)
else: else:
best_val_perf = np.amax(average_val_scores) best_val_perf = np.amax(average_val_scores)
best_params_index = np.where(average_val_scores == best_val_perf) best_params_index = np.where(average_val_scores == best_val_perf)
# find smallest val std with best val perf. # find smallest val std with best val perf.
best_val_stds = [std_val_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
best_val_stds = [
std_val_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
min_val_std = np.amin(best_val_stds) min_val_std = np.amin(best_val_stds)
best_params_index = np.where(std_val_scores == min_val_std) best_params_index = np.where(std_val_scores == min_val_std)
best_params_out = [param_list_pre_revised[i] for i in best_params_index[0]]
best_params_out = [
param_list_pre_revised[i] for i in best_params_index[0]
]
best_params_in = [param_list[i] for i in best_params_index[1]] best_params_in = [param_list[i] for i in best_params_index[1]]
print('best_params_out: ', best_params_out) print('best_params_out: ', best_params_out)
print('best_params_in: ', best_params_in) print('best_params_in: ', best_params_in)
print() print()
print('best_val_perf: ', best_val_perf) print('best_val_perf: ', best_val_perf)
print('best_val_std: ', min_val_std) print('best_val_std: ', min_val_std)
fresults.write('best settings of hyper-params to build gram matrix: %s\n' % best_params_out)
fresults.write('best settings of other hyper-params: %s\n\n' % best_params_in)
fresults.write('best_val_perf: %s\n' % best_val_perf)
fresults.write('best_val_std: %s\n' % min_val_std)
str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out
str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in
str_fw += 'best_val_perf: %s\n' % best_val_perf
str_fw += 'best_val_std: %s\n' % min_val_std


final_performance = [average_perf_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
final_confidence = [std_perf_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
final_performance = [
average_perf_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
final_confidence = [
std_perf_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
print('final_performance: ', final_performance) print('final_performance: ', final_performance)
print('final_confidence: ', final_confidence) print('final_confidence: ', final_confidence)
fresults.write('final_performance: %s\n' % final_performance)
fresults.write('final_confidence: %s\n' % final_confidence)
train_performance = [average_train_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
train_std = [std_train_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
str_fw += 'final_performance: %s\n' % final_performance
str_fw += 'final_confidence: %s\n' % final_confidence
train_performance = [
average_train_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
train_std = [
std_train_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
print('train_performance: %s' % train_performance) print('train_performance: %s' % train_performance)
print('train_std: ', train_std) print('train_std: ', train_std)
fresults.write('train_performance: %s\n' % train_performance)
fresults.write('train_std: %s\n\n' % train_std)
str_fw += 'train_performance: %s\n' % train_performance
str_fw += 'train_std: %s\n\n' % train_std


print() print()
tt_total = time.time() - tts # training time for all hyper-parameters
average_gram_matrix_time = np.mean(gram_matrix_time) average_gram_matrix_time = np.mean(gram_matrix_time)
std_gram_matrix_time = np.std(gram_matrix_time, ddof=1) std_gram_matrix_time = np.std(gram_matrix_time, ddof=1)
best_gram_matrix_time = [gram_matrix_time[i] for i in best_params_index[0]]
best_gram_matrix_time = [
gram_matrix_time[i] for i in best_params_index[0]
]
ave_bgmt = np.mean(best_gram_matrix_time) ave_bgmt = np.mean(best_gram_matrix_time)
std_bgmt = np.std(best_gram_matrix_time, ddof=1) std_bgmt = np.std(best_gram_matrix_time, ddof=1)
print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
.format(average_gram_matrix_time, std_gram_matrix_time))
print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(ave_bgmt, std_bgmt))
fresults.write('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'
.format(average_gram_matrix_time, std_gram_matrix_time))
fresults.write('time to calculate best gram matrix: {:.2f}±{:.2f}s\n\n'.format(ave_bgmt, std_bgmt))
print(
'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
.format(average_gram_matrix_time, std_gram_matrix_time))
print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(
ave_bgmt, std_bgmt))
print(
'total training time with all hyper-param choices: {:.2f}s'.format(
tt_total))
str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time)
str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total)


# # save results to file # # save results to file
# np.savetxt(results_name_pre + 'average_train_scores.dt', # np.savetxt(results_name_pre + 'average_train_scores.dt',
@@ -312,7 +302,7 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
# np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores) # np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores)
# np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores) # np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores)
# np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores) # np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores)
# np.save(results_name_pre + 'best_params_index', best_params_index) # np.save(results_name_pre + 'best_params_index', best_params_index)
# np.save(results_name_pre + 'best_params_pre.dt', best_params_out) # np.save(results_name_pre + 'best_params_pre.dt', best_params_out)
# np.save(results_name_pre + 'best_params_in.dt', best_params_in) # np.save(results_name_pre + 'best_params_in.dt', best_params_in)
@@ -322,7 +312,7 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
# np.save(results_name_pre + 'final_confidence.dt', final_confidence) # np.save(results_name_pre + 'final_confidence.dt', final_confidence)
# np.save(results_name_pre + 'train_performance.dt', train_performance) # np.save(results_name_pre + 'train_performance.dt', train_performance)
# np.save(results_name_pre + 'train_std.dt', train_std) # np.save(results_name_pre + 'train_std.dt', train_std)
# np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time) # np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time)
# np.save(results_name_pre + 'average_gram_matrix_time.dt', # np.save(results_name_pre + 'average_gram_matrix_time.dt',
# average_gram_matrix_time) # average_gram_matrix_time)
@@ -330,7 +320,7 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
#         std_gram_matrix_time)
# np.save(results_name_pre + 'best_gram_matrix_time.dt',
#         best_gram_matrix_time)
# print out as table.
from collections import OrderedDict
from tabulate import tabulate
@@ -343,20 +333,150 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
param_in['C'] = '{:.2e}'.format(param_in['C'])
table_dict['params'] = [{**param_out, **param_in}
                        for param_in in param_list for param_out in param_list_pre_revised]
-        table_dict['gram_matrix_time'] = ['{:.2f}'.format(gram_matrix_time[index_out])
-                                          for param_in in param_list for index_out, _ in enumerate(param_list_pre_revised)]
-        table_dict['valid_perf'] = ['{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], std_val_scores[index_out][index_in])
-                                    for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)]
-        table_dict['test_perf'] = ['{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in], std_perf_scores[index_out][index_in])
-                                   for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)]
-        table_dict['train_perf'] = ['{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in], std_train_scores[index_out][index_in])
-                                    for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)]
-        keyorder = ['params', 'train_perf', 'valid_perf',
-                    'test_perf', 'gram_matrix_time']
+        table_dict['gram_matrix_time'] = [
+            '{:.2f}'.format(gram_matrix_time[index_out])
+            for param_in in param_list
+            for index_out, _ in enumerate(param_list_pre_revised)
+        ]
+        table_dict['valid_perf'] = [
+            '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
+                                   std_val_scores[index_out][index_in])
+            for index_in, _ in enumerate(param_list)
+            for index_out, _ in enumerate(param_list_pre_revised)
+        ]
+        table_dict['test_perf'] = [
+            '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
+                                   std_perf_scores[index_out][index_in])
+            for index_in, _ in enumerate(param_list)
+            for index_out, _ in enumerate(param_list_pre_revised)
+        ]
+        table_dict['train_perf'] = [
+            '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
+                                   std_train_scores[index_out][index_in])
+            for index_in, _ in enumerate(param_list)
+            for index_out, _ in enumerate(param_list_pre_revised)
+        ]
+        keyorder = [
+            'params', 'train_perf', 'valid_perf', 'test_perf',
+            'gram_matrix_time'
+        ]
        print()
-        tb_print = tabulate(OrderedDict(sorted(table_dict.items(),
-            key=lambda i: keyorder.index(i[0]))), headers='keys')
+        tb_print = tabulate(
+            OrderedDict(
+                sorted(table_dict.items(),
+                       key=lambda i: keyorder.index(i[0]))),
+            headers='keys')
        print(tb_print)
-        fresults.write('table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print)
+        str_fw += 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print

# open file to save all results for this dataset.
if not os.path.exists(results_dir):
os.makedirs(results_dir)

with open(results_dir + '/' + ds_name + '.txt', 'w') as fresults:
fresults.write(str_fw)
fresults.close()


def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type,
             trial):  # Test set level
    """Run one outer trial of nested cross-validation on precomputed kernels.

    For each outer (kernel) parameter tuple — one precomputed Gram matrix
    each — the samples are split 90/10 into an app(lication) set and a test
    set. For each inner (estimator) parameter tuple, a 10-fold CV on the app
    set trains a precomputed-kernel model (KernelRidge for regression, SVC
    otherwise) and records train / validation / test performance.

    Parameters
    ----------
    param_list_pre_revised : list of dict
        Kernel (outer) hyper-parameter tuples, aligned with gram_matrices.
    param_list : list of dict
        Estimator (inner) hyper-parameter tuples, forwarded to the model.
    gram_matrices : list of ndarray
        Precomputed Gram matrices, one per outer parameter tuple.
    y : list
        Targets, aligned with the rows/columns of each Gram matrix.
    model_type : str
        'regression' -> KernelRidge with RMSE; anything else -> SVC with
        accuracy.
    trial : int
        Trial number; seeds the inner KFold shuffling so every inner
        parameter tuple sees the same folds within a trial.

    Returns
    -------
    train_pref, val_pref, test_pref : ndarray
        Score matrices of shape (len(param_list_pre_revised), len(param_list)).
    """
    # Arrays to store scores.
    train_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
    val_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
    test_pref = np.zeros((len(param_list_pre_revised), len(param_list)))

    # loop for each outer param tuple
    for index_out, params_out in enumerate(param_list_pre_revised):
        # Split the Gram matrix and y into app and test sets. Split sample
        # *indices* rather than target values: the previous
        # `[y.index(y_i) for y_i in y_app]` lookup always returned the first
        # occurrence of a duplicated target (the normal case in
        # classification), selecting wrong/duplicated kernel columns.
        X_app, X_test, idx_app, idx_test = train_test_split(
            gram_matrices[index_out], list(range(len(y))), test_size=0.1)
        # Keep only the columns of the app samples, so the app/test kernels
        # have shapes (n_app, n_app) and (n_test, n_app).
        X_app = X_app[:, idx_app]
        X_test = X_test[:, idx_app]
        y_app = np.array([y[i] for i in idx_app])
        y_test = np.array([y[i] for i in idx_test])

        # loop for each inner param tuple
        for index_in, params_in in enumerate(param_list):
            inner_cv = KFold(n_splits=10, shuffle=True, random_state=trial)
            current_train_perf = []
            current_valid_perf = []
            current_test_perf = []

            try:
                # For regression use the Kernel Ridge method.
                if model_type == 'regression':
                    KR = KernelRidge(kernel='precomputed', **params_in)
                    # loop for each split on validation set level
                    for train_index, valid_index in inner_cv.split(X_app):
                        KR.fit(X_app[train_index, :][:, train_index],
                               y_app[train_index])

                        # predict on the train, validation and test set
                        y_pred_train = KR.predict(
                            X_app[train_index, :][:, train_index])
                        y_pred_valid = KR.predict(
                            X_app[valid_index, :][:, train_index])
                        y_pred_test = KR.predict(X_test[:, train_index])

                        # root mean squared errors
                        current_train_perf.append(
                            np.sqrt(mean_squared_error(
                                y_app[train_index], y_pred_train)))
                        current_valid_perf.append(
                            np.sqrt(mean_squared_error(
                                y_app[valid_index], y_pred_valid)))
                        current_test_perf.append(
                            np.sqrt(mean_squared_error(y_test, y_pred_test)))
                # For classification use SVM.
                else:
                    KR = SVC(kernel='precomputed', **params_in)
                    # loop for each split on validation set level
                    for train_index, valid_index in inner_cv.split(X_app):
                        KR.fit(X_app[train_index, :][:, train_index],
                               y_app[train_index])

                        # predict on the train, validation and test set
                        y_pred_train = KR.predict(
                            X_app[train_index, :][:, train_index])
                        y_pred_valid = KR.predict(
                            X_app[valid_index, :][:, train_index])
                        y_pred_test = KR.predict(X_test[:, train_index])

                        # accuracies
                        current_train_perf.append(
                            accuracy_score(y_app[train_index], y_pred_train))
                        current_valid_perf.append(
                            accuracy_score(y_app[valid_index], y_pred_valid))
                        current_test_perf.append(
                            accuracy_score(y_test, y_pred_test))
            except ValueError:
                # Ill-conditioned parameter combination: report it; the
                # scores for this cell are the mean of whatever folds
                # completed (NaN if none did).
                print(sys.exc_info()[0])
                print(params_out, params_in)

            # average performance over the inner splits
            train_pref[index_out][index_in] = np.mean(current_train_perf)
            val_pref[index_out][index_in] = np.mean(current_valid_perf)
            test_pref[index_out][index_in] = np.mean(current_test_perf)

    # NOTE(review): a stray `fresults.close()` used to sit here; `fresults`
    # is never defined in this function (NameError on every call), so it has
    # been removed.
    return train_pref, val_pref, test_pref

+ 2
- 1
pygraph/utils/utils.py View File

@@ -61,10 +61,11 @@ def floydTransformation(G, edge_weight=None):
     spMatrix = nx.floyd_warshall_numpy(G, weight=edge_weight)
     S = nx.Graph()
     S.add_nodes_from(G.nodes(data=True))
+    ns = list(G.nodes())
     for i in range(0, G.number_of_nodes()):
         for j in range(i + 1, G.number_of_nodes()):
             if spMatrix[i, j] != np.inf:
-                S.add_edge(i, j, cost=spMatrix[i, j])
+                S.add_edge(ns[i], ns[j], cost=spMatrix[i, j])
     return S






Loading…
Cancel
Save