Browse Source

1. add parallel computing scheme to spkernel and model_selection_precomputed.

2. modify model_selection_precomputed so that all results are written into memory and then to a file at the last section of code, in case that on CPU/disk separated systems the IO takes too much time.
3. correct utils.floyd_warshall_numpy function. DO NOT use the last version.
v0.1
jajupmochi 7 years ago
parent
commit
22a1f1e8d8
9 changed files with 2635 additions and 1596 deletions
  1. +10
    -7
      README.md
  2. +143
    -70
      datasets/ds.py
  3. +911
    -895
      notebooks/run_randomwalkkernel.ipynb
  4. +763
    -208
      notebooks/run_spkernel.ipynb
  5. +150
    -49
      notebooks/run_spkernel.py
  6. +0
    -1
      pygraph/kernels/.#commonWalkKernel.py
  7. +319
    -148
      pygraph/kernels/spKernel.py
  8. +337
    -217
      pygraph/utils/model_selection_precomputed.py
  9. +2
    -1
      pygraph/utils/utils.py

+ 10
- 7
README.md View File

@@ -3,12 +3,15 @@ A python package for graph kernels.

## Requirements

* numpy - 1.13.3
* scipy - 1.0.0
* matplotlib - 2.1.0
* networkx - 2.0
* sklearn - 0.19.1
* tabulate - 0.8.2
numpy==1.14.5
scipy==1.1.0
matplotlib==2.2.2
networkx==2.1
scikit-learn==0.19.1
tabulate==0.8.2
tqdm==4.23.4
control==0.7.0 (for generalized random walk kernels only)
slycot===0.3.2.dev-5263ada (for generalized random walk kernels only, requires fortran compiler, gfortran for example)

## Results with minimal test RMSE for each kernel on dataset Asyclic

@@ -28,7 +31,7 @@ For prediction we randomly divide the data in train and test subset, where 90\%
| WL shortest path | 28.74±0.60 | 38.20±0.62 | 39.02±6.09 | 'height': 10.0, 'alpha': '1.00' | 146.83"/80.63"±45.04" |
| WL edge | 30.21±0.64 | 36.53±1.02 | 38.42±6.42 | 'height': 5.0, 'alpha': '6.31e-01' | 5.24"/5.15"±2.83" |
| Treelet | 7.33±0.64 | 13.86±0.80 | 15.38±3.56 | 'alpha': '1.12e+01' | 0.48" |
| Path up to d | 5.76±0.27 | 9.89±0.87 | 10.21±4.16 | 'depth': 2.0, 'k_func': 'MinMax', 'alpha': '0.1' | 0.56"/1.16"±0.75" |
| Path up to d | 5.76±0.27 | 9.89±0.87 | 10.21±4.16 | 'depth': 2.0, 'k_func': 'MinMax', 'alpha ': '0.1' | 0.56"/1.16"±0.75" |
| Cyclic pattern | | | | | |
| Walk up to n | 20.88±0.74 | 23.34±1.11 | 24.46±6.57 | 'n': 2.0, 'alpha': '1.00e-03' | 0.56"/331.70"±753.44" |



+ 143
- 70
datasets/ds.py View File

@@ -3,106 +3,66 @@ dslist = [
'name': 'Acyclic',
'dataset': '../datasets/acyclic/dataset_bps.ds',
'task': 'regression'
}, # node_labeled
{
'name': 'COIL-DEL',
'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'
}, # edge_labeled
}, # node symb
# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
{
'name': 'PAH',
'dataset': '../datasets/PAH/dataset.ds',
}, # unlabeled
{
'name': 'Mutagenicity',
'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'
}, # fully_labeled
{
'name': 'MAO',
'dataset': '../datasets/MAO/dataset.ds',
},
}, # node/edge symb
{
'name': 'MUTAG',
'dataset': '../datasets/MUTAG/MUTAG.mat',
'extra_params': {
'am_sp_al_nl_el': [0, 0, 3, 1, 2]
}
},
}, # node/edge symb
{
'name': 'Alkane',
'dataset': '../datasets/Alkane/dataset.ds',
'task': 'regression',
'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',
},
{
'name': 'BZR',
'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'
},
}, # contains single node graph, node symb
# {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
# {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
{
'name': 'COX2',
'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'
},
'name': 'Mutagenicity',
'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'
}, # node/edge symb
{
'name': 'ENZYMES',
'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'
},
{
'name': 'DHFR',
'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'
},
{
'name': 'SYNTHETIC',
'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'
},
{
'name': 'MSRC9',
'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'
},
{
'name': 'MSRC21',
'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'
},
}, # node symb/nsymb
# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
{
'name': 'FIRSTMM_DB',
'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'
},
{
'name': 'PROTEINS',
'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'
},
{
'name': 'PROTEINS_full',
'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'
'name': 'Letter-med',
'dataset': '../datasets/Letter-med/Letter-med_A.txt'
},
# {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
# {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
# {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
# {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
# {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb

# {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
# {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
{
'name': 'D&D',
'dataset': '../datasets/D&D/DD.mat',
'extra_params': {
'am_sp_al_nl_el': [0, 1, 2, 1, -1]
}
},
{
'name': 'AIDS',
'dataset': '../datasets/AIDS/AIDS_A.txt'
},
{
'name': 'NCI1',
'dataset': '../datasets/NCI1/NCI1.mat',
'extra_params': {
'am_sp_al_nl_el': [1, 1, 2, 0, -1]
}
},
{
'name': 'NCI109',
'dataset': '../datasets/NCI109/NCI109.mat',
'extra_params': {
'am_sp_al_nl_el': [1, 1, 2, 0, -1]
}
},
{
'name': 'NCI-HIV',
'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',
},
}, # node symb
# {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb

# # not working below
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
@@ -110,3 +70,116 @@ dslist = [
# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]

# dslist = [
# {
# 'name': 'Acyclic',
# 'dataset': '../datasets/acyclic/dataset_bps.ds',
# 'task': 'regression'
# }, # node_labeled
# {
# 'name': 'COIL-DEL',
# 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'
# }, # edge_labeled
# {
# 'name': 'PAH',
# 'dataset': '../datasets/PAH/dataset.ds',
# }, # unlabeled
# {
# 'name': 'Mutagenicity',
# 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'
# }, # fully_labeled
# {
# 'name': 'MAO',
# 'dataset': '../datasets/MAO/dataset.ds',
# },
# {
# 'name': 'MUTAG',
# 'dataset': '../datasets/MUTAG/MUTAG.mat',
# 'extra_params': {
# 'am_sp_al_nl_el': [0, 0, 3, 1, 2]
# }
# },
# {
# 'name': 'Alkane',
# 'dataset': '../datasets/Alkane/dataset.ds',
# 'task': 'regression',
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',
# },
# {
# 'name': 'BZR',
# 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'
# },
# {
# 'name': 'COX2',
# 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'
# },
# {
# 'name': 'ENZYMES',
# 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'
# },
# {
# 'name': 'DHFR',
# 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'
# },
# {
# 'name': 'SYNTHETIC',
# 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'
# },
# {
# 'name': 'MSRC9',
# 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'
# },
# {
# 'name': 'MSRC21',
# 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'
# },
# {
# 'name': 'FIRSTMM_DB',
# 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'
# },
# {
# 'name': 'PROTEINS',
# 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'
# },
# {
# 'name': 'PROTEINS_full',
# 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'
# },
# {
# 'name': 'D&D',
# 'dataset': '../datasets/D&D/DD.mat',
# 'extra_params': {
# 'am_sp_al_nl_el': [0, 1, 2, 1, -1]
# }
# },
# {
# 'name': 'AIDS',
# 'dataset': '../datasets/AIDS/AIDS_A.txt'
# },
# {
# 'name': 'NCI1',
# 'dataset': '../datasets/NCI1/NCI1.mat',
# 'extra_params': {
# 'am_sp_al_nl_el': [1, 1, 2, 0, -1]
# }
# },
# {
# 'name': 'NCI109',
# 'dataset': '../datasets/NCI109/NCI109.mat',
# 'extra_params': {
# 'am_sp_al_nl_el': [1, 1, 2, 0, -1]
# }
# },
# {
# 'name': 'NCI-HIV',
# 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',
# },

# # # not working below
# # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
# # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
# ]

+ 911
- 895
notebooks/run_randomwalkkernel.ipynb
File diff suppressed because it is too large
View File


+ 763
- 208
notebooks/run_spkernel.ipynb
File diff suppressed because it is too large
View File


+ 150
- 49
notebooks/run_spkernel.py View File

@@ -1,56 +1,157 @@
import functools
from libs import *
from pygraph.kernels.spKernel import spkernel
from pygraph.utils.kernels import deltakernel, kernelsum
from sklearn.metrics.pairwise import rbf_kernel

dslist = [
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node_labeled
# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge_labeled
# dslist = [
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node symb
# # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',}, # unlabeled
{'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # fully_labeled
# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',},

# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',}, # node/edge symb
# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}},
# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',},
# {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'},
# {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'},
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'},
# {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'},
# {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'},
# {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'},
# {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'},

# {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'},
# {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'},
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',}, # contains single node graph, node symb
# # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # node/edge symb
# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb

# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}},
# {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'},
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}},
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}},
# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',},
# # not working below
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb
# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
# # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
# # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
# # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb

# # # not working below
# # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
# # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
# ]

import ast
ds = ast.literal_eval(sys.argv[1])

estimator = spkernel
param_grid_precomputed = {}
param_grid = [{'C': np.logspace(-10, 10, num = 41, base = 10)},
{'alpha': np.logspace(-10, 10, num = 41, base = 10)}]

for ds in dslist:
print()
print(ds['name'])
model_selection_for_precomputed_kernel(
ds['dataset'], estimator, param_grid_precomputed,
(param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]),
(ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30,
datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
extra_params=(ds['extra_params'] if 'extra_params' in ds else None))
print()
mixkernel = functools.partial(kernelsum, deltakernel, rbf_kernel)
param_grid_precomputed = {
'node_kernels': [{
'symb': deltakernel,
'nsymb': rbf_kernel,
'mix': mixkernel
}]
}
param_grid = [{
'C': np.logspace(-10, 10, num=41, base=10)
}, {
'alpha': np.logspace(-10, 10, num=41, base=10)
}]

print()
print(ds['name'])
model_selection_for_precomputed_kernel(
ds['dataset'],
estimator,
param_grid_precomputed,
(param_grid[1]
if ('task' in ds and ds['task'] == 'regression') else param_grid[0]),
(ds['task'] if 'task' in ds else 'classification'),
NUM_TRIALS=30,
datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
ds_name=ds['name'])

# %lprun -f spkernel \
# model_selection_for_precomputed_kernel( \
# ds['dataset'], estimator, param_grid_precomputed, \
# (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \
# (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30, \
# datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), \
# extra_params=(ds['extra_params'] if 'extra_params' in ds else None))
print()

# import functools
# from libs import *
# from pygraph.kernels.spKernel import spkernel
# from pygraph.utils.kernels import deltakernel, kernelsum
# from sklearn.metrics.pairwise import rbf_kernel

# dslist = [
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node symb
# # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
# # {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',}, # unlabeled
# # {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',}, # node/edge symb
# # {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
# # 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
# # {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
# # 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',}, # contains single node graph, node symb
# # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # node/edge symb
# # {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
# # {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb

# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
# # 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb
# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
# # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
# # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
# # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb

# # # not working below
# # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
# # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
# ]
# estimator = spkernel
# mixkernel = functools.partial(kernelsum, deltakernel, rbf_kernel)
# param_grid_precomputed = {'node_kernels': [{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]}
# param_grid = [{'C': np.logspace(-10, 10, num = 41, base = 10)},
# {'alpha': np.logspace(-10, 10, num = 41, base = 10)}]

# for ds in dslist:
# print()
# print(ds['name'])
# model_selection_for_precomputed_kernel(
# ds['dataset'], estimator, param_grid_precomputed,
# (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]),
# (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30,
# datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
# extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
# ds_name=ds['name'])

# # %lprun -f spkernel \
# # model_selection_for_precomputed_kernel( \
# # ds['dataset'], estimator, param_grid_precomputed, \
# # (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \
# # (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30, \
# # datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), \
# # extra_params=(ds['extra_params'] if 'extra_params' in ds else None))
# print()

+ 0
- 1
pygraph/kernels/.#commonWalkKernel.py View File

@@ -1 +0,0 @@
ljia@ljia-Precision-7520.4716:1530265749

+ 319
- 148
pygraph/kernels/spKernel.py View File

@@ -9,6 +9,9 @@ sys.path.insert(0, "../")
from tqdm import tqdm
import time
from itertools import combinations_with_replacement, product
from functools import partial
from joblib import Parallel, delayed
from multiprocessing import Pool

import networkx as nx
import numpy as np
@@ -17,7 +20,11 @@ from pygraph.utils.utils import getSPGraph
from pygraph.utils.graphdataset import get_dataset_attributes


def spkernel(*args, node_label='atom', edge_weight=None, node_kernels=None):
def spkernel(*args,
node_label='atom',
edge_weight=None,
node_kernels=None,
n_jobs=None):
"""Calculate shortest-path kernels between graphs.

Parameters
@@ -70,180 +77,344 @@ def spkernel(*args, node_label='atom', edge_weight=None, node_kernels=None):
if len(Gn) != len_gn:
print('\n %d graphs are removed as they don\'t contain edges.\n' %
(len_gn - len(Gn)))

start_time = time.time()
pool = Pool(n_jobs)

# get shortest path graphs of Gn
Gn = [
getSPGraph(G, edge_weight=edge_weight)
for G in tqdm(Gn, desc='getting sp graphs', file=sys.stdout)
]
getsp_partial = partial(wrap_getSPGraph, Gn, edge_weight)
result_sp = pool.map(getsp_partial, range(0, len(Gn)))
for i in result_sp:
Gn[i[0]] = i[1]

# Gn = [
# getSPGraph(G, edge_weight=edge_weight)
# for G in tqdm(Gn, desc='getting sp graphs', file=sys.stdout)
# ]

Kmatrix = np.zeros((len(Gn), len(Gn)))
pbar = tqdm(
total=((len(Gn) + 1) * len(Gn) / 2),
desc='calculating kernels',
file=sys.stdout)

do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
itr = combinations_with_replacement(range(0, len(Gn)), 2)
# chunksize = 2000 # int(len(list(itr)) / n_jobs)
# for i, j, kernel in tqdm(pool.imap_unordered(do_partial, itr, chunksize)):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel

result_perf = pool.map(do_partial, itr)
pool.close()
pool.join()

# result_perf = Parallel(
# n_jobs=n_jobs, verbose=10)(
# delayed(do_partial)(ij)
# for ij in combinations_with_replacement(range(0, len(Gn)), 2))

# result_perf = [
# do_partial(ij)
# for ij in combinations_with_replacement(range(0, len(Gn)), 2)
# ]

for i in result_perf:
Kmatrix[i[0]][i[1]] = i[2]
Kmatrix[i[1]][i[0]] = i[2]

# pbar = tqdm(
# total=((len(Gn) + 1) * len(Gn) / 2),
# desc='calculating kernels',
# file=sys.stdout)
# if ds_attrs['node_labeled']:
# # node symb and non-synb labeled
# if ds_attrs['node_attr_dim'] > 0:
# if ds_attrs['is_directed']:
# for i, j in combinations_with_replacement(
# range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['mix']
# try:
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn(n11[node_label], n21[node_label], [
# n11['attributes']
# ], [n21['attributes']]) * kn(
# n12[node_label], n22[node_label],
# [n12['attributes']], [n22['attributes']])
# Kmatrix[i][j] += kn1
# except KeyError: # missing labels or attributes
# pass
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)

# else:
# for i, j in combinations_with_replacement(
# range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['mix']
# try:
# # each edge walk is counted twice, starting from both its extreme nodes.
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn(n11[node_label], n21[node_label], [
# n11['attributes']
# ], [n21['attributes']]) * kn(
# n12[node_label], n22[node_label],
# [n12['attributes']], [n22['attributes']])
# kn2 = kn(n11[node_label], n22[node_label], [
# n11['attributes']
# ], [n22['attributes']]) * kn(
# n12[node_label], n21[node_label],
# [n12['attributes']], [n21['attributes']])
# Kmatrix[i][j] += kn1 + kn2
# except KeyError: # missing labels or attributes
# pass
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)
# # node symb labeled
# else:
# if ds_attrs['is_directed']:
# for i, j in combinations_with_replacement(
# range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['symb']
# try:
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn(n11[node_label],
# n21[node_label]) * kn(
# n12[node_label], n22[node_label])
# Kmatrix[i][j] += kn1
# except KeyError: # missing labels
# pass
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)

# else:
# for i, j in combinations_with_replacement(
# range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['symb']
# try:
# # each edge walk is counted twice, starting from both its extreme nodes.
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn(n11[node_label],
# n21[node_label]) * kn(
# n12[node_label], n22[node_label])
# kn2 = kn(n11[node_label],
# n22[node_label]) * kn(
# n12[node_label], n21[node_label])
# Kmatrix[i][j] += kn1 + kn2
# except KeyError: # missing labels
# pass
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)
# else:
# # node non-synb labeled
# if ds_attrs['node_attr_dim'] > 0:
# if ds_attrs['is_directed']:
# for i, j in combinations_with_replacement(
# range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['nsymb']
# try:
# # each edge walk is counted twice, starting from both its extreme nodes.
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn([n11['attributes']],
# [n21['attributes']]) * kn(
# [n12['attributes']],
# [n22['attributes']])
# Kmatrix[i][j] += kn1
# except KeyError: # missing attributes
# pass
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)
# else:
# for i, j in combinations_with_replacement(
# range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['nsymb']
# try:
# # each edge walk is counted twice, starting from both its extreme nodes.
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn([n11['attributes']],
# [n21['attributes']]) * kn(
# [n12['attributes']],
# [n22['attributes']])
# kn2 = kn([n11['attributes']],
# [n22['attributes']]) * kn(
# [n12['attributes']],
# [n21['attributes']])
# Kmatrix[i][j] += kn1 + kn2
# except KeyError: # missing attributes
# pass
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)

# # node unlabeled
# else:
# for i, j in combinations_with_replacement(range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# Kmatrix[i][j] += 1
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)

run_time = time.time() - start_time
print(
"\n --- shortest path kernel matrix of size %d built in %s seconds ---"
% (len(Gn), run_time))

return Kmatrix, run_time, idx


def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij):

i = ij[0]
j = ij[1]
Kmatrix = 0
if ds_attrs['node_labeled']:
# node symb and non-synb labeled
if ds_attrs['node_attr_dim'] > 0:
if ds_attrs['is_directed']:
for i, j in combinations_with_replacement(
range(0, len(Gn)), 2):
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['mix']
try:
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(n11[node_label], n21[node_label], [
n11['attributes']
], [n21['attributes']]) * kn(
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['mix']
try:
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(
n11[node_label], n21[node_label],
[n11['attributes']], [n21['attributes']]) * kn(
n12[node_label], n22[node_label],
[n12['attributes']], [n22['attributes']])
Kmatrix[i][j] += kn1
except KeyError: # missing labels or attributes
pass
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)

Kmatrix += kn1
except KeyError: # missing labels or attributes
pass
else:
for i, j in combinations_with_replacement(
range(0, len(Gn)), 2):
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['mix']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(n11[node_label], n21[node_label], [
n11['attributes']
], [n21['attributes']]) * kn(
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['mix']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(
n11[node_label], n21[node_label],
[n11['attributes']], [n21['attributes']]) * kn(
n12[node_label], n22[node_label],
[n12['attributes']], [n22['attributes']])
kn2 = kn(n11[node_label], n22[node_label], [
n11['attributes']
], [n22['attributes']]) * kn(
kn2 = kn(
n11[node_label], n22[node_label],
[n11['attributes']], [n22['attributes']]) * kn(
n12[node_label], n21[node_label],
[n12['attributes']], [n21['attributes']])
Kmatrix[i][j] += kn1 + kn2
except KeyError: # missing labels or attributes
pass
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)
Kmatrix += kn1 + kn2
except KeyError: # missing labels or attributes
pass
# node symb labeled
else:
if ds_attrs['is_directed']:
for i, j in combinations_with_replacement(
range(0, len(Gn)), 2):
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['symb']
try:
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(n11[node_label],
n21[node_label]) * kn(
n12[node_label], n22[node_label])
Kmatrix[i][j] += kn1
except KeyError: # missing labels
pass
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)

for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['symb']
try:
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(n11[node_label], n21[node_label]) * kn(
n12[node_label], n22[node_label])
Kmatrix += kn1
except KeyError: # missing labels
pass
else:
for i, j in combinations_with_replacement(
range(0, len(Gn)), 2):
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['symb']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(n11[node_label],
n21[node_label]) * kn(
n12[node_label], n22[node_label])
kn2 = kn(n11[node_label],
n22[node_label]) * kn(
n12[node_label], n21[node_label])
Kmatrix[i][j] += kn1 + kn2
except KeyError: # missing labels
pass
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['symb']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(n11[node_label], n21[node_label]) * kn(
n12[node_label], n22[node_label])
kn2 = kn(n11[node_label], n22[node_label]) * kn(
n12[node_label], n21[node_label])
Kmatrix += kn1 + kn2
except KeyError: # missing labels
pass
else:
# node non-synb labeled
if ds_attrs['node_attr_dim'] > 0:
if ds_attrs['is_directed']:
for i, j in combinations_with_replacement(
range(0, len(Gn)), 2):
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['nsymb']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn([n11['attributes']],
[n21['attributes']]) * kn(
[n12['attributes']],
[n22['attributes']])
Kmatrix[i][j] += kn1
except KeyError: # missing attributes
pass
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['nsymb']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(
[n11['attributes']], [n21['attributes']]) * kn(
[n12['attributes']], [n22['attributes']])
Kmatrix += kn1
except KeyError: # missing attributes
pass
else:
for i, j in combinations_with_replacement(
range(0, len(Gn)), 2):
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['nsymb']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn([n11['attributes']],
[n21['attributes']]) * kn(
[n12['attributes']],
[n22['attributes']])
kn2 = kn([n11['attributes']],
[n22['attributes']]) * kn(
[n12['attributes']],
[n21['attributes']])
Kmatrix[i][j] += kn1 + kn2
except KeyError: # missing attributes
pass
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)

# node unlabeled
else:
for i, j in combinations_with_replacement(range(0, len(Gn)), 2):
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
Kmatrix[i][j] += 1
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)
kn = node_kernels['nsymb']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(
[n11['attributes']], [n21['attributes']]) * kn(
[n12['attributes']], [n22['attributes']])
kn2 = kn(
[n11['attributes']], [n22['attributes']]) * kn(
[n12['attributes']], [n21['attributes']])
Kmatrix += kn1 + kn2
except KeyError: # missing attributes
pass
# node unlabeled
else:
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
Kmatrix += 1

run_time = time.time() - start_time
print(
"\n --- shortest path kernel matrix of size %d built in %s seconds ---"
% (len(Gn), run_time))
return i, j, Kmatrix

return Kmatrix, run_time, idx

def wrap_getSPGraph(Gn, weight, i):
    """Helper for parallel maps: build the shortest-path graph of ``Gn[i]``.

    Returns the pair ``(i, sp_graph)`` so that results arriving out of
    order from a worker pool can be matched back to their input index.
    """
    sp_graph = getSPGraph(Gn[i], edge_weight=weight)
    return i, sp_graph

+ 337
- 217
pygraph/utils/model_selection_precomputed.py View File

@@ -1,11 +1,32 @@
import numpy as np
from matplotlib import pyplot as plt
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import KFold, train_test_split, ParameterGrid

from joblib import Parallel, delayed
from multiprocessing import Pool
from functools import partial
import sys
sys.path.insert(0, "../")
import os
import time
from os.path import basename, splitext
from pygraph.utils.graphfiles import loadDataset
from tqdm import tqdm

def model_selection_for_precomputed_kernel(datafile, estimator,
param_grid_precomputed, param_grid,
model_type, NUM_TRIALS=30,

def model_selection_for_precomputed_kernel(datafile,
estimator,
param_grid_precomputed,
param_grid,
model_type,
NUM_TRIALS=30,
datafile_y=None,
extra_params=None,
ds_name='ds-unknown'):
ds_name='ds-unknown',
n_jobs=1):
"""Perform model selection, fitting and testing for precomputed kernels using nested cv. Print out neccessary data during the process then finally the results.

Parameters
@@ -40,94 +61,101 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
>>>
>>> model_selection_for_precomputed_kernel(datafile, estimator, param_grid_precomputed, param_grid, 'regression')
"""
import numpy as np
from matplotlib import pyplot as plt
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import KFold, train_test_split, ParameterGrid

import sys
sys.path.insert(0, "../")
import os
from os.path import basename, splitext
from pygraph.utils.graphfiles import loadDataset
from tqdm import tqdm
tqdm.monitor_interval = 0

results_dir = '../notebooks/results/' + estimator.__name__
if not os.path.exists(results_dir):
os.makedirs(results_dir)

# open file to save all results for this dataset.
with open(results_dir + '/' + ds_name + '.txt', 'w') as fresults:
fresults.write('# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n')

# setup the model type
model_type = model_type.lower()
if model_type != 'regression' and model_type != 'classification':
raise Exception(
'The model type is incorrect! Please choose from regression or classification.')
print()
print('--- This is a %s problem ---' % model_type)
fresults.write('This is a %s problem.\n\n' % model_type)
# a string to save all the results.
str_fw = '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n'

# Load the dataset
print()
print('\nI. Loading dataset from file...')
dataset, y = loadDataset(datafile, filename_y=datafile_y, extra_params=extra_params)
# setup the model type
model_type = model_type.lower()
if model_type != 'regression' and model_type != 'classification':
raise Exception(
'The model type is incorrect! Please choose from regression or classification.'
)
print()
print('--- This is a %s problem ---' % model_type)
str_fw += 'This is a %s problem.\n\n' % model_type

# Load the dataset
print()
print('\nI. Loading dataset from file...')
dataset, y = loadDataset(
datafile, filename_y=datafile_y, extra_params=extra_params)

# import matplotlib.pyplot as plt
# import matplotlib.pyplot as plt
# import networkx as nx
# nx.draw_networkx(dataset[30])
# plt.show()

# Grid of parameters with a discrete number of values for each.
param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
param_list = list(ParameterGrid(param_grid))
# np.savetxt(results_name_pre + 'param_grid_precomputed.dt',
# [[key, value] for key, value in sorted(param_grid_precomputed)])
# np.savetxt(results_name_pre + 'param_grid.dt',
# [[key, value] for key, value in sorted(param_grid)])
# Grid of parameters with a discrete number of values for each.
param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
param_list = list(ParameterGrid(param_grid))
# np.savetxt(results_name_pre + 'param_grid_precomputed.dt',
# [[key, value] for key, value in sorted(param_grid_precomputed)])
# np.savetxt(results_name_pre + 'param_grid.dt',
# [[key, value] for key, value in sorted(param_grid)])

gram_matrices = [] # a list to store gram matrices for all param_grid_precomputed
gram_matrix_time = [] # a list to store time to calculate gram matrices
param_list_pre_revised = [] # list to store param grids precomputed ignoring the useless ones
gram_matrices = [
] # a list to store gram matrices for all param_grid_precomputed
gram_matrix_time = [
] # a list to store time to calculate gram matrices
param_list_pre_revised = [
] # list to store param grids precomputed ignoring the useless ones

# calculate all gram matrices
print()
print('2. Calculating gram matrices. This could take a while...')
str_fw += '\nI. Gram matrices.\n\n'
tts = time.time() # start training time
nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN)
for idx, params_out in enumerate(param_list_precomputed):
params_out['n_jobs'] = n_jobs
rtn_data = estimator(dataset, **params_out)
Kmatrix = rtn_data[0]
current_run_time = rtn_data[1]
if len(rtn_data) == 3:
idx_trim = rtn_data[2] # the index of trimmed graph list
y = [y[idx] for idx in idx_trim]

Kmatrix_diag = Kmatrix.diagonal().copy()
# remove graphs whose kernels with themselves are zeros
nb_g_ignore = 0
for idx, diag in enumerate(Kmatrix_diag):
if diag == 0:
Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=0)
Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=1)
nb_g_ignore += 1
# normalization
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
Kmatrix[j][i] = Kmatrix[i][j]

# calculate all gram matrices
print()
print('2. Calculating gram matrices. This could take a while...')
fresults.write('\nI. Gram matrices.\n\n')
nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN)
for idx, params_out in enumerate(param_list_precomputed):
rtn_data = estimator(dataset, **params_out)
Kmatrix = rtn_data[0]
current_run_time = rtn_data[1]
if len(rtn_data) == 3:
idx_trim = rtn_data[2] # the index of trimmed graph list
y = [y[idx] for idx in idx_trim]
Kmatrix_diag = Kmatrix.diagonal().copy()
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):
# if Kmatrix_diag[i] != 0 and Kmatrix_diag[j] != 0:
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
Kmatrix[j][i] = Kmatrix[i][j]

print()
if params_out == {}:
print('the gram matrix is: ')
fresults.write('the gram matrix is:\n\n')
else:
print('the gram matrix with parameters', params_out, 'is: ')
fresults.write('the gram matrix with parameters %s is:\n\n' % params_out)
if np.isnan(Kmatrix).any(): # if the matrix contains elements that are not numbers
if params_out == {}:
print('the gram matrix is: ')
str_fw += 'the gram matrix is:\n\n'
else:
print('the gram matrix with parameters', params_out, 'is: ')
str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out
if len(Kmatrix) < 2:
nb_gm_ignore += 1
print('ignored, as at most only one of all its diagonal value is non-zero.')
str_fw += 'ignored, as at most only one of all its diagonal value is non-zero.\n\n'
else:
if np.isnan(Kmatrix).any(
): # if the matrix contains elements that are not numbers
nb_gm_ignore += 1
print('ignored, as it contains elements that are not numbers.')
fresults.write('ignored, as it contains elements that are not numbers.\n\n')
str_fw += 'ignored, as it contains elements that are not numbers.\n\n'
else:
print(Kmatrix)
fresults.write(np.array2string(Kmatrix, separator=',', threshold=np.inf, floatmode='unique') + '\n\n')
str_fw += np.array2string(
Kmatrix,
separator=',',
threshold=np.inf,
floatmode='unique') + '\n\n'
plt.matshow(Kmatrix)
plt.colorbar()
fig_file_name = results_dir + '/GM[ds]' + ds_name
@@ -138,115 +166,52 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
gram_matrices.append(Kmatrix)
gram_matrix_time.append(current_run_time)
param_list_pre_revised.append(params_out)
print()
print('{} gram matrices are calculated, {} of which are ignored.'.format(len(param_list_precomputed), nb_gm_ignore))
fresults.write('{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore))
fresults.write('serial numbers of gram matrix figure and their corresponding parameters settings:\n\n')
fresults.write(''.join(['{}: {}\n'.format(idx, params_out)
for idx, params_out in enumerate(param_list_precomputed)]))
if nb_g_ignore > 0:
print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore)
str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore
print()
print(
'{} gram matrices are calculated, {} of which are ignored.'.format(
len(param_list_precomputed), nb_gm_ignore))
str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore)
str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n'
str_fw += ''.join([
'{}: {}\n'.format(idx, params_out)
for idx, params_out in enumerate(param_list_precomputed)
])

print()
print('3. Fitting and predicting using nested cross validation. This could really take a while...')
# Arrays to store scores
train_pref = np.zeros(
(NUM_TRIALS, len(param_list_pre_revised), len(param_list)))
val_pref = np.zeros(
(NUM_TRIALS, len(param_list_pre_revised), len(param_list)))
test_pref = np.zeros(
(NUM_TRIALS, len(param_list_pre_revised), len(param_list)))

# Loop for each trial
pbar = tqdm(total=NUM_TRIALS * len(param_list_pre_revised) * len(param_list),
desc='calculate performance', file=sys.stdout)
for trial in range(NUM_TRIALS): # Test set level
# loop for each outer param tuple
for index_out, params_out in enumerate(param_list_pre_revised):
# split gram matrix and y to app and test sets.
X_app, X_test, y_app, y_test = train_test_split(
gram_matrices[index_out], y, test_size=0.1)
split_index_app = [y.index(y_i) for y_i in y_app if y_i in y]
# split_index_test = [y.index(y_i) for y_i in y_test if y_i in y]
X_app = X_app[:, split_index_app]
X_test = X_test[:, split_index_app]
y_app = np.array(y_app)
y_test = np.array(y_test)

# loop for each inner param tuple
for index_in, params_in in enumerate(param_list):
inner_cv = KFold(n_splits=10, shuffle=True, random_state=trial)
current_train_perf = []
current_valid_perf = []
current_test_perf = []

# For regression use the Kernel Ridge method
try:
if model_type == 'regression':
KR = KernelRidge(kernel='precomputed', **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
KR.fit(X_app[train_index, :]
[:, train_index], y_app[train_index])
# predict on the train, validation and test set
y_pred_train = KR.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = KR.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = KR.predict(X_test[:, train_index])
# root mean squared errors
current_train_perf.append(
np.sqrt(mean_squared_error(y_app[train_index], y_pred_train)))
current_valid_perf.append(
np.sqrt(mean_squared_error(y_app[valid_index], y_pred_valid)))
current_test_perf.append(
np.sqrt(mean_squared_error(y_test, y_pred_test)))
# For classification use SVM
else:
KR = SVC(kernel='precomputed', **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
KR.fit(X_app[train_index, :]
[:, train_index], y_app[train_index])
# predict on the train, validation and test set
y_pred_train = KR.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = KR.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = KR.predict(
X_test[:, train_index])
# root mean squared errors
current_train_perf.append(accuracy_score(
y_app[train_index], y_pred_train))
current_valid_perf.append(accuracy_score(
y_app[valid_index], y_pred_valid))
current_test_perf.append(
accuracy_score(y_test, y_pred_test))
except ValueError:
print(sys.exc_info()[0])
print(params_out, params_in)
# average performance on inner splits
train_pref[trial][index_out][index_in] = np.mean(
current_train_perf)
val_pref[trial][index_out][index_in] = np.mean(
current_valid_perf)
test_pref[trial][index_out][index_in] = np.mean(
current_test_perf)
pbar.update(1)
pbar.clear()
print()
if len(gram_matrices) == 0:
print('all gram matrices are ignored, no results obtained.')
str_fw += '\nall gram matrices are ignored, no results obtained.\n\n'
else:
print(
'3. Fitting and predicting using nested cross validation. This could really take a while...'
)
pool = Pool(n_jobs)
trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
train_pref = [item[0] for item in result_perf]
val_pref = [item[1] for item in result_perf]
test_pref = [item[2] for item in result_perf]
pool.close()
pool.join()

# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
# result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS))
# train_pref = [item[0] for item in result_perf]
# val_pref = [item[1] for item in result_perf]
# test_pref = [item[2] for item in result_perf]


# pbar.clear()
# np.save(results_name_pre + 'train_pref.dt', train_pref)
# np.save(results_name_pre + 'val_pref.dt', val_pref)
# np.save(results_name_pre + 'test_pref.dt', test_pref)

print()
print('4. Getting final performance...')
fresults.write('\nII. Performance.\n\n')
str_fw += '\nII. Performance.\n\n'
# averages and confidences of performances on outer trials for each combination of parameters
average_train_scores = np.mean(train_pref, axis=0)
average_val_scores = np.mean(val_pref, axis=0)
@@ -255,53 +220,78 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
std_train_scores = np.std(train_pref, axis=0, ddof=1)
std_val_scores = np.std(val_pref, axis=0, ddof=1)
std_perf_scores = np.std(test_pref, axis=0, ddof=1)
if model_type == 'regression':
best_val_perf = np.amin(average_val_scores)
else:
best_val_perf = np.amax(average_val_scores)
best_params_index = np.where(average_val_scores == best_val_perf)
# find smallest val std with best val perf.
best_val_stds = [std_val_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
best_val_stds = [
std_val_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
min_val_std = np.amin(best_val_stds)
best_params_index = np.where(std_val_scores == min_val_std)
best_params_out = [param_list_pre_revised[i] for i in best_params_index[0]]
best_params_out = [
param_list_pre_revised[i] for i in best_params_index[0]
]
best_params_in = [param_list[i] for i in best_params_index[1]]
print('best_params_out: ', best_params_out)
print('best_params_in: ', best_params_in)
print()
print('best_val_perf: ', best_val_perf)
print('best_val_std: ', min_val_std)
fresults.write('best settings of hyper-params to build gram matrix: %s\n' % best_params_out)
fresults.write('best settings of other hyper-params: %s\n\n' % best_params_in)
fresults.write('best_val_perf: %s\n' % best_val_perf)
fresults.write('best_val_std: %s\n' % min_val_std)
str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out
str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in
str_fw += 'best_val_perf: %s\n' % best_val_perf
str_fw += 'best_val_std: %s\n' % min_val_std

final_performance = [average_perf_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
final_confidence = [std_perf_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
final_performance = [
average_perf_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
final_confidence = [
std_perf_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
print('final_performance: ', final_performance)
print('final_confidence: ', final_confidence)
fresults.write('final_performance: %s\n' % final_performance)
fresults.write('final_confidence: %s\n' % final_confidence)
train_performance = [average_train_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
train_std = [std_train_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
str_fw += 'final_performance: %s\n' % final_performance
str_fw += 'final_confidence: %s\n' % final_confidence
train_performance = [
average_train_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
train_std = [
std_train_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
print('train_performance: %s' % train_performance)
print('train_std: ', train_std)
fresults.write('train_performance: %s\n' % train_performance)
fresults.write('train_std: %s\n\n' % train_std)
str_fw += 'train_performance: %s\n' % train_performance
str_fw += 'train_std: %s\n\n' % train_std

print()
tt_total = time.time() - tts # training time for all hyper-parameters
average_gram_matrix_time = np.mean(gram_matrix_time)
std_gram_matrix_time = np.std(gram_matrix_time, ddof=1)
best_gram_matrix_time = [gram_matrix_time[i] for i in best_params_index[0]]
best_gram_matrix_time = [
gram_matrix_time[i] for i in best_params_index[0]
]
ave_bgmt = np.mean(best_gram_matrix_time)
std_bgmt = np.std(best_gram_matrix_time, ddof=1)
print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
.format(average_gram_matrix_time, std_gram_matrix_time))
print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(ave_bgmt, std_bgmt))
fresults.write('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'
.format(average_gram_matrix_time, std_gram_matrix_time))
fresults.write('time to calculate best gram matrix: {:.2f}±{:.2f}s\n\n'.format(ave_bgmt, std_bgmt))
print(
'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
.format(average_gram_matrix_time, std_gram_matrix_time))
print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(
ave_bgmt, std_bgmt))
print(
'total training time with all hyper-param choices: {:.2f}s'.format(
tt_total))
str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time)
str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total)

# # save results to file
# np.savetxt(results_name_pre + 'average_train_scores.dt',
@@ -312,7 +302,7 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
# np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores)
# np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores)
# np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores)
# np.save(results_name_pre + 'best_params_index', best_params_index)
# np.save(results_name_pre + 'best_params_pre.dt', best_params_out)
# np.save(results_name_pre + 'best_params_in.dt', best_params_in)
@@ -322,7 +312,7 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
# np.save(results_name_pre + 'final_confidence.dt', final_confidence)
# np.save(results_name_pre + 'train_performance.dt', train_performance)
# np.save(results_name_pre + 'train_std.dt', train_std)
# np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time)
# np.save(results_name_pre + 'average_gram_matrix_time.dt',
# average_gram_matrix_time)
@@ -330,7 +320,7 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
# std_gram_matrix_time)
# np.save(results_name_pre + 'best_gram_matrix_time.dt',
# best_gram_matrix_time)
# print out as table.
from collections import OrderedDict
from tabulate import tabulate
@@ -343,20 +333,150 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
param_in['C'] = '{:.2e}'.format(param_in['C'])
table_dict['params'] = [{**param_out, **param_in}
for param_in in param_list for param_out in param_list_pre_revised]
table_dict['gram_matrix_time'] = ['{:.2f}'.format(gram_matrix_time[index_out])
for param_in in param_list for index_out, _ in enumerate(param_list_pre_revised)]
table_dict['valid_perf'] = ['{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], std_val_scores[index_out][index_in])
for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)]
table_dict['test_perf'] = ['{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in], std_perf_scores[index_out][index_in])
for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)]
table_dict['train_perf'] = ['{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in], std_train_scores[index_out][index_in])
for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)]
keyorder = ['params', 'train_perf', 'valid_perf',
'test_perf', 'gram_matrix_time']
table_dict['gram_matrix_time'] = [
'{:.2f}'.format(gram_matrix_time[index_out])
for param_in in param_list
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['valid_perf'] = [
'{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
std_val_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['test_perf'] = [
'{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
std_perf_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['train_perf'] = [
'{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
std_train_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
keyorder = [
'params', 'train_perf', 'valid_perf', 'test_perf',
'gram_matrix_time'
]
print()
tb_print = tabulate(OrderedDict(sorted(table_dict.items(),
key=lambda i: keyorder.index(i[0]))), headers='keys')
tb_print = tabulate(
OrderedDict(
sorted(table_dict.items(),
key=lambda i: keyorder.index(i[0]))),
headers='keys')
print(tb_print)
fresults.write('table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print)
str_fw += 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print

# open file to save all results for this dataset.
if not os.path.exists(results_dir):
os.makedirs(results_dir)

with open(results_dir + '/' + ds_name + '.txt', 'w') as fresults:
fresults.write(str_fw)
fresults.close()


def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type,
             trial):  # Test set level
    """Run one outer trial of the nested cross validation.

    Parameters
    ----------
    param_list_pre_revised : list of dict
        Hyper-parameter settings used to build each gram matrix.
    param_list : list of dict
        Hyper-parameter settings of the learning estimator.
    gram_matrices : list of numpy.ndarray
        One precomputed gram matrix per entry of ``param_list_pre_revised``.
    y : list
        Target values aligned with the rows/columns of the gram matrices.
    model_type : str
        'regression' selects Kernel Ridge; any other value selects SVC.
        Assumed already validated/lowercased by the caller.
    trial : int
        Index of this trial; used to seed the inner KFold split so each
        trial uses a different but reproducible partition.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_pref, val_pref, test_pref)``, each of shape
        ``(len(param_list_pre_revised), len(param_list))``, holding the
        mean performance over the 10 inner splits.
    """
    # Arrays to store scores for every (outer, inner) parameter pair.
    shape = (len(param_list_pre_revised), len(param_list))
    train_pref = np.zeros(shape)
    val_pref = np.zeros(shape)
    test_pref = np.zeros(shape)

    # loop for each outer param tuple
    for index_out, params_out in enumerate(param_list_pre_revised):
        # Split gram matrix and y into app(lication) and test sets.
        X_app, X_test, y_app, y_test = train_test_split(
            gram_matrices[index_out], y, test_size=0.1)
        # NOTE(review): index-based lookup assumes the target values are
        # unique; duplicate y values would map several rows onto the same
        # column index — confirm this holds for the datasets used.
        split_index_app = [y.index(y_i) for y_i in y_app if y_i in y]
        # Keep only the kernel columns of the app samples.
        X_app = X_app[:, split_index_app]
        X_test = X_test[:, split_index_app]
        y_app = np.array(y_app)
        y_test = np.array(y_test)

        # loop for each inner param tuple
        for index_in, params_in in enumerate(param_list):
            inner_cv = KFold(n_splits=10, shuffle=True, random_state=trial)
            current_train_perf = []
            current_valid_perf = []
            current_test_perf = []

            # A ValueError (e.g. an infeasible hyper-parameter combination)
            # is reported but must not abort the whole trial.
            try:
                # For regression use the Kernel Ridge method.
                if model_type == 'regression':
                    KR = KernelRidge(kernel='precomputed', **params_in)
                    # loop for each split on validation set level
                    for train_index, valid_index in inner_cv.split(X_app):
                        KR.fit(X_app[train_index, :][:, train_index],
                               y_app[train_index])

                        # predict on the train, validation and test set
                        y_pred_train = KR.predict(
                            X_app[train_index, :][:, train_index])
                        y_pred_valid = KR.predict(
                            X_app[valid_index, :][:, train_index])
                        y_pred_test = KR.predict(X_test[:, train_index])

                        # root mean squared errors
                        current_train_perf.append(
                            np.sqrt(mean_squared_error(y_app[train_index],
                                                       y_pred_train)))
                        current_valid_perf.append(
                            np.sqrt(mean_squared_error(y_app[valid_index],
                                                       y_pred_valid)))
                        current_test_perf.append(
                            np.sqrt(mean_squared_error(y_test, y_pred_test)))
                # For classification use SVM.
                else:
                    KR = SVC(kernel='precomputed', **params_in)
                    # loop for each split on validation set level
                    for train_index, valid_index in inner_cv.split(X_app):
                        KR.fit(X_app[train_index, :][:, train_index],
                               y_app[train_index])

                        # predict on the train, validation and test set
                        y_pred_train = KR.predict(
                            X_app[train_index, :][:, train_index])
                        y_pred_valid = KR.predict(
                            X_app[valid_index, :][:, train_index])
                        y_pred_test = KR.predict(X_test[:, train_index])

                        # classification accuracies
                        current_train_perf.append(
                            accuracy_score(y_app[train_index], y_pred_train))
                        current_valid_perf.append(
                            accuracy_score(y_app[valid_index], y_pred_valid))
                        current_test_perf.append(
                            accuracy_score(y_test, y_pred_test))
            except ValueError:
                print(sys.exc_info()[0])
                print(params_out, params_in)

            # average performance over the inner splits
            train_pref[index_out][index_in] = np.mean(current_train_perf)
            val_pref[index_out][index_in] = np.mean(current_valid_perf)
            test_pref[index_out][index_in] = np.mean(current_test_perf)

    # Removed stray `fresults.close()`: no file handle named `fresults`
    # exists in this function's scope (leftover from the pre-refactor
    # version that wrote results to a file here) — it raised NameError.
    return train_pref, val_pref, test_pref

+ 2
- 1
pygraph/utils/utils.py View File

@@ -61,10 +61,11 @@ def floydTransformation(G, edge_weight=None):
spMatrix = nx.floyd_warshall_numpy(G, weight=edge_weight)
S = nx.Graph()
S.add_nodes_from(G.nodes(data=True))
ns = list(G.nodes())
for i in range(0, G.number_of_nodes()):
for j in range(i + 1, G.number_of_nodes()):
if spMatrix[i, j] != np.inf:
S.add_edge(i, j, cost=spMatrix[i, j])
S.add_edge(ns[i], ns[j], cost=spMatrix[i, j])
return S




Loading…
Cancel
Save