
correct randomness of data split for parallelization.

v0.1
jajupmochi 6 years ago
commit 8baa21cb67
14 changed files with 39707 additions and 77953 deletions
  1. notebooks/check_gm.py (+1 / -1)
  2. notebooks/check_gm.zip (BIN)
  3. notebooks/check_gm/Acyclic.gm.eps (+19554 / -38693)
  4. notebooks/check_gm/Letter-med.gm.eps (+19686 / -38836)
  5. notebooks/run_spkernel.ipynb (+97 / -75)
  6. notebooks/run_spkernel.py (+11 / -11)
  7. notebooks/run_structuralspkernel.py (+8 / -8)
  8. notebooks/run_untilhpathkernel.py (+1 / -1)
  9. notebooks/test.py (+0 / -77)
  10. pygraph/kernels/commonWalkKernel.py (+26 / -20)
  11. pygraph/kernels/spKernel.py (+23 / -20)
  12. pygraph/kernels/structuralspKernel.py (+98 / -66)
  13. pygraph/kernels/untilHPathKernel.py (+20 / -21)
  14. pygraph/utils/model_selection_precomputed.py (+182 / -124)

notebooks/check_gm.py (+1 / -1)

@@ -12,7 +12,7 @@ import matplotlib.pyplot as plt
from numpy.linalg import eig

# read gram matrices from file.
results_dir = 'results/structuralspkernel/'
results_dir = 'results/untilhpathkernel/myria'
ds_name = 'Letter-med'
gmfile = np.load(results_dir + '/' + ds_name + '.gm.npz')
#print('gm time: ', gmfile['gmtime'])


notebooks/check_gm.zip (BIN)


notebooks/check_gm/Acyclic.gm.eps (+19554 / -38693): file diff suppressed because it is too large


notebooks/check_gm/Letter-med.gm.eps (+19686 / -38836): file diff suppressed because it is too large


notebooks/run_spkernel.ipynb (+97 / -75)

@@ -6,94 +6,116 @@
"metadata": {
"scrolled": false
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"MAO\n",
"\n",
"--- This is a classification problem ---\n",
"\n",
"\n",
"1. Loading dataset from file...\n",
"\n",
"2. Calculating gram matrices. This could take a while...\n",
"\n",
" None edge weight specified. Set all weight to 1.\n",
"\n",
"getting sp graphs: 68it [00:00, 692.11it/s]\n",
"calculating kernels: 2346it [00:05, 399.28it/s]\n",
"\n",
" --- shortest path kernel matrix of size 68 built in 6.345669507980347 seconds ---\n",
"\n",
"the gram matrix with parameters {'node_kernels': {'symb': <function deltakernel at 0x7fe240afd620>, 'nsymb': <function gaussiankernel at 0x7fe240afd9d8>, 'mix': functools.partial(<function kernelproduct at 0x7fe240aaf0d0>, <function deltakernel at 0x7fe240afd620>, <function gaussiankernel at 0x7fe240afd9d8>)}, 'n_jobs': 8} is: \n",
"\n",
"1 gram matrices are calculated, 0 of which are ignored.\n",
"\n",
"3. Fitting and predicting using nested cross validation. This could really take a while...\n",
"cross validation: 7it [00:09, 4.67s/it]"
]
}
],
"source": [
"%load_ext line_profiler\n",
"%matplotlib inline\n",
"import functools\n",
"from libs import *\n",
"import multiprocessing\n",
"from sklearn.metrics.pairwise import rbf_kernel\n",
"\n",
"from pygraph.kernels.spKernel import spkernel, spkernel_do\n",
"from pygraph.utils.kernels import deltakernel, kernelsum\n",
"from pygraph.utils.model_selection_precomputed import trial_do\n",
"\n",
"dslist = [ \n",
" {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node symb\n",
"# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', \n",
"# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',}, # contains single node graph, node symb\n",
"# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',}, # node/edge symb\n",
"# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',}, # unlabeled\n",
"# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n",
"# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n",
" {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n",
" {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, # node symb/nsymb\n",
" {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # node/edge symb\n",
" {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n",
" 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n",
"\n",
"# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n",
"# # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n",
"# # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb\n",
"# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},\n",
"# \n",
"# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb\n",
"# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb\n",
"# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb\n",
"# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb\n",
"# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb\n",
"\n",
"# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb\n",
"# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb\n",
"# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb\n",
"# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',\n",
"# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n",
"# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',\n",
"# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n",
"# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',\n",
"# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb\n",
" \n",
"# # not working below\n",
"# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},\n",
"# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},\n",
"# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},\n",
"# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},\n",
"from pygraph.kernels.spKernel import spkernel\n",
"from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct\n",
"#from pygraph.utils.model_selection_precomputed import trial_do\n",
"\n",
"dslist = [\n",
"# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',\n",
"# 'task': 'regression'}, # node symb\n",
"# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',\n",
"# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, \n",
"# # contains single node graph, node symb\n",
" {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb\n",
"# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled\n",
"# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n",
"# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n",
"# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n",
"# # node nsymb\n",
"# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n",
"# # node symb/nsymb\n",
"# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n",
" # node/edge symb\n",
"# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n",
"# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n",
"\n",
" # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n",
" # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n",
" # # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb\n",
" # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},\n",
" #\n",
" # # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb\n",
" # # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb\n",
" # # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb\n",
" # # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb\n",
" # # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb\n",
"\n",
" # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb\n",
" # # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb\n",
" # # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb\n",
" # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',\n",
" # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n",
" # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',\n",
" # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n",
" # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',\n",
" # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb\n",
"\n",
" # # not working below\n",
" # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},\n",
" # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},\n",
" # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},\n",
" # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},\n",
"]\n",
"estimator = spkernel\n",
"mixkernel = functools.partial(kernelsum, deltakernel, rbf_kernel)\n",
"param_grid_precomputed = {'node_kernels': [{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]}\n",
"param_grid = [{'C': np.logspace(-10, 10, num = 41, base = 10)}, \n",
" {'alpha': np.logspace(-10, 10, num = 41, base = 10)}]\n",
"mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)\n",
"param_grid_precomputed = {'node_kernels': [\n",
" {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}\n",
"param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},\n",
" {'alpha': np.logspace(-10, 10, num=41, base=10)}]\n",
"\n",
"for ds in dslist:\n",
" print()\n",
" print(ds['name'])\n",
" model_selection_for_precomputed_kernel(\n",
" ds['dataset'], \n",
" estimator, \n",
" param_grid_precomputed, \n",
" (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \n",
" (ds['task'] if 'task' in ds else 'classification'), \n",
" ds['dataset'],\n",
" estimator,\n",
" param_grid_precomputed,\n",
" (param_grid[1] if ('task' in ds and ds['task']\n",
" == 'regression') else param_grid[0]),\n",
" (ds['task'] if 'task' in ds else 'classification'),\n",
" NUM_TRIALS=30,\n",
" datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),\n",
" extra_params=(ds['extra_params'] if 'extra_params' in ds else None),\n",
" ds_name=ds['name'],\n",
" n_jobs=multiprocessing.cpu_count())\n",
" \n",
"# %lprun -f trial_do -f spkernel -f spkernel_do -f model_selection_for_precomputed_kernel \\\n",
"# model_selection_for_precomputed_kernel( \\\n",
"# ds['dataset'], \\\n",
"# estimator, \\\n",
"# param_grid_precomputed, \\\n",
"# (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \\\n",
"# (ds['task'] if 'task' in ds else 'classification'), \\\n",
"# NUM_TRIALS=30, \\\n",
"# datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), \\\n",
"# extra_params=(ds['extra_params'] if 'extra_params' in ds else None), \\\n",
"# ds_name=ds['name'], \\\n",
"# n_jobs=multiprocessing.cpu_count()) \n",
" print()"
" n_jobs=multiprocessing.cpu_count(),\n",
" read_gm_from_file=False)\n",
" print()\n"
]
},
{
@@ -713,8 +735,8 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (Spyder)",
"language": "python3",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
@@ -727,7 +749,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
"version": "3.6.6"
}
},
"nbformat": 4,


notebooks/run_spkernel.py (+11 / -11)

@@ -7,21 +7,21 @@ from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct
#from pygraph.utils.model_selection_precomputed import trial_do

dslist = [
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
# 'task': 'regression'}, # node symb
# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', },
# # contains single node graph, node symb
# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb
# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled
# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
{'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
'task': 'regression'}, # node symb
{'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', },
# contains single node graph, node symb
{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb
{'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled
{'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
{'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
# node nsymb
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# node symb/nsymb
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb
# node/edge symb
# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb

@@ -56,7 +56,7 @@ estimator = spkernel
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
param_grid_precomputed = {'node_kernels': [
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}
param_grid = [{'C': np.logspace(-10, 3, num=27, base=10)},
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]

for ds in dslist:
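
Besides enabling more datasets, the script widens the SVM C grid to match the kernel-ridge alpha grid. A quick check of the old and new grids:

import numpy as np

C_old = np.logspace(-10, 3, num=27, base=10)    # 27 values, 1e-10 ... 1e3
C_new = np.logspace(-10, 10, num=41, base=10)   # 41 values, 1e-10 ... 1e10
print(len(C_old), C_old[0], C_old[-1])
print(len(C_new), C_new[0], C_new[-1])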


notebooks/run_structuralspkernel.py (+8 / -8)

@@ -23,10 +23,10 @@ dslist = [
# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled
# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
{'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
# node nsymb
# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# # node symb/nsymb
# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
# # node nsymb
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# node symb/nsymb
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb
# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
@@ -39,8 +39,8 @@ dslist = [
#
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb, missing values
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb, missing values
# {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb, missing values
# {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb, missing values
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb

# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
@@ -53,8 +53,8 @@ dslist = [
# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb

# # not working below
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# # not working below
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},


notebooks/run_untilhpathkernel.py (+1 / -1)

@@ -62,7 +62,7 @@ dslist = [
]
estimator = untilhpathkernel
mixkernel = functools.partial(kernelproduct, deltakernel, rbf_kernel)
param_grid_precomputed = {'depth': np.linspace(7, 10, 10),
param_grid_precomputed = {'depth': np.linspace(1, 10, 10),
'k_func': ['tanimoto', 'MinMax']}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]
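
The depth grid change is easy to overlook: np.linspace(7, 10, 10) produced mostly fractional depths, whereas np.linspace(1, 10, 10) covers every integer depth from 1 to 10. A quick check:

import numpy as np

print(np.linspace(7, 10, 10))  # [ 7.  7.33  7.67  8.  8.33  8.67  9.  9.33  9.67 10.] (approx.)
print(np.linspace(1, 10, 10))  # [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]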


notebooks/test.py (+0 / -77)

@@ -1,77 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 28 16:37:29 2018

@author: ljia
"""

import functools
from libs import *
import multiprocessing
from sklearn.metrics.pairwise import rbf_kernel

from pygraph.kernels.structuralspKernel import structuralspkernel
from pygraph.utils.kernels import deltakernel, kernelproduct

dslist = [
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
# 'task': 'regression'}, # node symb
# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, # contains single node graph, node symb
{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb

# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
# # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
# # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
#
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb

# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb

# # not working below
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
estimator = structuralspkernel
mixkernel = functools.partial(kernelproduct, deltakernel, rbf_kernel)
param_grid_precomputed = {'node_kernels':
[{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}],
'edge_kernels':
[{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]

for ds in dslist:
print()
print(ds['name'])
model_selection_for_precomputed_kernel(
ds['dataset'],
estimator,
param_grid_precomputed,
(param_grid[1] if ('task' in ds and ds['task']
== 'regression') else param_grid[0]),
(ds['task'] if 'task' in ds else 'classification'),
NUM_TRIALS=30,
datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
ds_name=ds['name'],
n_jobs=multiprocessing.cpu_count(),
read_gm_from_file=False)
print()

pygraph/kernels/commonWalkKernel.py (+26 / -20)

@@ -85,21 +85,20 @@ def commonwalkkernel(*args,

# ---- use pool.imap_unordered to parallel and track progress. ----
pool = Pool(n_jobs)
itr = combinations_with_replacement(range(0, len(Gn)), 2)
itr = zip(combinations_with_replacement(Gn, 2),
combinations_with_replacement(range(0, len(Gn)), 2))
len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
if len_itr < 1000 * n_jobs:
chunksize = int(len_itr / n_jobs) + 1
else:
chunksize = 100
chunksize = 1000

# direct product graph method - exponential
if compute_method == 'exp':
do_partial = partial(_commonwalkkernel_exp, Gn, node_label, edge_label,
weight)
do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight)
# direct product graph method - geometric
elif compute_method == 'geo':
do_partial = partial(_commonwalkkernel_geo, Gn, node_label, edge_label,
weight)
do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight)

for i, j, kernel in tqdm(
pool.imap_unordered(do_partial, itr, chunksize),
@@ -153,7 +152,7 @@ def commonwalkkernel(*args,
return Kmatrix, run_time


def _commonwalkkernel_exp(Gn, node_label, edge_label, beta, ij):
def _commonwalkkernel_exp(g1, g2, node_label, edge_label, beta):
"""Calculate walk graph kernels up to n between 2 graphs using exponential
series.

@@ -175,10 +174,6 @@ def _commonwalkkernel_exp(Gn, node_label, edge_label, beta, ij):
kernel : float
The common walk Kernel between 2 graphs.
"""
iglobal = ij[0]
jglobal = ij[1]
g1 = Gn[iglobal]
g2 = Gn[jglobal]

# get tensor product / direct product
gp = direct_product(g1, g2, node_label, edge_label)
@@ -219,10 +214,18 @@ def _commonwalkkernel_exp(Gn, node_label, edge_label, beta, ij):
# print(np.exp(weight * A))
# print('-------')

return iglobal, jglobal, exp_D.sum()
return exp_D.sum()


def _commonwalkkernel_geo(Gn, node_label, edge_label, gamma, ij):
def wrapper_cw_exp(node_label, edge_label, beta, itr_item):
g1 = itr_item[0][0]
g2 = itr_item[0][1]
i = itr_item[1][0]
j = itr_item[1][1]
return i, j, _commonwalkkernel_exp(g1, g2, node_label, edge_label, beta)


def _commonwalkkernel_geo(g1, g2, node_label, edge_label, gamma):
"""Calculate common walk graph kernels up to n between 2 graphs using
geometric series.

@@ -244,19 +247,22 @@ def _commonwalkkernel_geo(Gn, node_label, edge_label, gamma, ij):
kernel : float
The common walk Kernel between 2 graphs.
"""
iglobal = ij[0]
jglobal = ij[1]
g1 = Gn[iglobal]
g2 = Gn[jglobal]

# get tensor product / direct product
gp = direct_product(g1, g2, node_label, edge_label)
A = nx.adjacency_matrix(gp).todense()
mat = np.identity(len(A)) - gamma * A
try:
return iglobal, jglobal, mat.I.sum()
return mat.I.sum()
except np.linalg.LinAlgError:
return iglobal, jglobal, np.nan
return np.nan
def wrapper_cw_geo(node_label, edge_label, gama, itr_item):
g1 = itr_item[0][0]
g2 = itr_item[0][1]
i = itr_item[1][0]
j = itr_item[1][1]
return i, j, _commonwalkkernel_geo(g1, g2, node_label, edge_label, gama)


def _commonwalkkernel_brute(walks1,
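
For context, the geometric variant refactored above sums the entries of (I - gamma*A)^-1, where A is the adjacency matrix of the direct product of the two graphs. Below is a self-contained sketch of that computation, using NetworkX's label-agnostic tensor_product as a stand-in for the module's label-aware direct_product helper.

import networkx as nx
import numpy as np

def common_walk_geo(g1, g2, gamma=0.01):
    # Direct (tensor) product of the two graphs; labels are ignored here.
    gp = nx.tensor_product(g1, g2)
    if gp.number_of_nodes() == 0:
        return 0.0
    A = nx.to_numpy_array(gp)
    # Geometric series over common walks: sum all entries of (I - gamma*A)^-1.
    mat = np.identity(A.shape[0]) - gamma * A
    try:
        return float(np.linalg.inv(mat).sum())
    except np.linalg.LinAlgError:  # gamma too large, the series does not converge
        return float('nan')

print(common_walk_geo(nx.path_graph(3), nx.cycle_graph(4)))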


pygraph/kernels/spKernel.py (+23 / -20)

@@ -8,7 +8,6 @@ import sys
import time
from itertools import combinations_with_replacement, product
from functools import partial
from joblib import Parallel, delayed
from multiprocessing import Pool
from tqdm import tqdm

@@ -89,7 +88,8 @@ def spkernel(*args,

pool = Pool(n_jobs)
# get shortest path graphs of Gn
getsp_partial = partial(wrap_getSPGraph, Gn, weight)
getsp_partial = partial(wrapper_getSPGraph, weight)
itr = zip(Gn, range(0, len(Gn)))
if len(Gn) < 1000 * n_jobs:
# # use default chunksize as pool.map when iterable is less than 100
# chunksize, extra = divmod(len(Gn), n_jobs * 4)
@@ -98,9 +98,8 @@ def spkernel(*args,
chunksize = int(len(Gn) / n_jobs) + 1
else:
chunksize = 1000
# chunksize = 300 # int(len(list(itr)) / n_jobs)
for i, g in tqdm(
pool.imap_unordered(getsp_partial, range(0, len(Gn)), chunksize),
pool.imap_unordered(getsp_partial, itr, chunksize),
desc='getting sp graphs', file=sys.stdout):
Gn[i] = g
pool.close()
@@ -144,8 +143,9 @@ def spkernel(*args,

# ---- use pool.imap_unordered to parallel and track progress. ----
pool = Pool(n_jobs)
do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
itr = combinations_with_replacement(range(0, len(Gn)), 2)
do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
itr = zip(combinations_with_replacement(Gn, 2),
combinations_with_replacement(range(0, len(Gn)), 2))
len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
if len_itr < 1000 * n_jobs:
chunksize = int(len_itr / n_jobs) + 1
@@ -200,15 +200,10 @@ def spkernel(*args,
return Kmatrix, run_time, idx


def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij):

i = ij[0]
j = ij[1]
g1 = Gn[i]
g2 = Gn[j]
def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels):
kernel = 0

# try:
# compute shortest path matrices first, method borrowed from FCSP.
if ds_attrs['node_labeled']:
# node symb and non-symb labeled
@@ -243,7 +238,7 @@ def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij):
g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kernel += 1
return i, j, kernel
return kernel

# compute graph kernels
if ds_attrs['is_directed']:
@@ -293,12 +288,20 @@ def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij):
# kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
# kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
# kernel += kn1 + kn2
# except KeyError: # missing labels or attributes
# pass

return i, j, kernel
return kernel


def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr_item):
g1 = itr_item[0][0]
g2 = itr_item[0][1]
i = itr_item[1][0]
j = itr_item[1][1]
return i, j, spkernel_do(g1, g2, ds_attrs, node_label, node_kernels)


def wrap_getSPGraph(Gn, weight, i):
return i, getSPGraph(Gn[i], edge_weight=weight)
# return i, nx.floyd_warshall_numpy(Gn[i], weight=weight)
def wrapper_getSPGraph(weight, itr_item):
g = itr_item[0]
i = itr_item[1]
return i, getSPGraph(g, edge_weight=weight)
# return i, nx.floyd_warshall_numpy(g, weight=weight)
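
The recurring refactor in this commit is visible here: instead of giving each worker an index pair and letting it look up graphs in the shared list Gn, the iterable now zips graph pairs with their index pairs and a small wrapper unpacks them, so only the data a task needs is pickled. Below is a stripped-down sketch of the pattern; the pairwise function is a placeholder, not the real shortest-path kernel.

import sys
from functools import partial
from itertools import combinations_with_replacement
from multiprocessing import Pool

import numpy as np
from tqdm import tqdm

def pairwise_kernel(g1, g2, scale):
    # Placeholder pairwise computation; the real code calls spkernel_do here.
    return scale * (g1 + g2)

def wrapper(scale, itr_item):
    # itr_item = ((g1, g2), (i, j)): the data pair plus its position in the matrix.
    (g1, g2), (i, j) = itr_item
    return i, j, pairwise_kernel(g1, g2, scale)

if __name__ == '__main__':
    Gn = list(range(6))            # stand-ins for the graphs
    n_jobs = 2
    Kmatrix = np.zeros((len(Gn), len(Gn)))

    do_partial = partial(wrapper, 0.5)
    itr = zip(combinations_with_replacement(Gn, 2),
              combinations_with_replacement(range(len(Gn)), 2))
    len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
    # Same chunksize heuristic as the kernels above.
    chunksize = int(len_itr / n_jobs) + 1 if len_itr < 1000 * n_jobs else 1000

    with Pool(n_jobs) as pool:
        for i, j, kernel in tqdm(pool.imap_unordered(do_partial, itr, chunksize),
                                 desc='calculating kernels', file=sys.stdout):
            Kmatrix[i][j] = kernel
            Kmatrix[j][i] = kernel
    print(Kmatrix)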

pygraph/kernels/structuralspKernel.py (+98 / -66)

@@ -12,7 +12,6 @@ import sys
import time
from itertools import combinations, combinations_with_replacement, product
from functools import partial
from joblib import Parallel, delayed
from multiprocessing import Pool
from tqdm import tqdm

@@ -71,7 +70,6 @@ def structuralspkernel(*args,
"""
# pre-process
Gn = args[0] if len(args) == 1 else [args[0], args[1]]

weight = None
if edge_weight is None:
print('\n None edge weight specified. Set all weight to 1.\n')
@@ -98,34 +96,61 @@ def structuralspkernel(*args,
start_time = time.time()

# get shortest paths of each graph in Gn
splist = [[] for _ in range(len(Gn))]
splist = [None] * len(Gn)
pool = Pool(n_jobs)
# get shortest path graphs of Gn
getsp_partial = partial(wrap_getSP, Gn, weight, ds_attrs['is_directed'])
getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed'])
itr = zip(Gn, range(0, len(Gn)))
if len(Gn) < 1000 * n_jobs:
chunksize = int(len(Gn) / n_jobs) + 1
else:
chunksize = 1000
# chunksize = 300 # int(len(list(itr)) / n_jobs)
for i, sp in tqdm(
pool.imap_unordered(getsp_partial, range(0, len(Gn)), chunksize),
pool.imap_unordered(getsp_partial, itr, chunksize),
desc='getting shortest paths',
file=sys.stdout):
splist[i] = sp
# time.sleep(10)
pool.close()
pool.join()

# # ---- use pool.map to parallel ----
# result_sp = pool.map(getsp_partial, range(0, len(Gn)))
# for i in result_sp:
# Gn[i[0]] = i[1]
# or
# getsp_partial = partial(wrap_getSP, Gn, weight)
# for i, g in tqdm(
# pool.map(getsp_partial, range(0, len(Gn))),
# desc='getting sp graphs',
# file=sys.stdout):
# Gn[i] = g
# # get shortest paths of each graph in Gn
# splist = [[] for _ in range(len(Gn))]
# # get shortest path graphs of Gn
# getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed'])
# itr = zip(Gn, range(0, len(Gn)))
# if len(Gn) < 1000 * n_jobs:
# chunksize = int(len(Gn) / n_jobs) + 1
# else:
# chunksize = 1000
# # chunksize = 300 # int(len(list(itr)) / n_jobs)
# from contextlib import closing
# with closing(Pool(n_jobs)) as pool:
## for i, sp in tqdm(
# res = pool.imap_unordered(getsp_partial, itr, 10)
## desc='getting shortest paths',
## file=sys.stdout):
## splist[i] = sp
## time.sleep(10)
# pool.close()
# pool.join()
# ss = 0
# ss += sys.getsizeof(splist)
# for spss in splist:
# ss += sys.getsizeof(spss)
# for spp in spss:
# ss += sys.getsizeof(spp)
# time.sleep(20)
# # ---- direct running, normally use single CPU core. ----
# splist = []
# for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout):
# splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed']))

# # ---- only for the Fast Computation of Shortest Path Kernel (FCSP)
# sp_ml = [0] * len(Gn) # shortest path matrices
@@ -149,9 +174,11 @@ def structuralspkernel(*args,

# ---- use pool.imap_unordered to parallel and track progress. ----
pool = Pool(n_jobs)
do_partial = partial(structuralspkernel_do, Gn, splist, ds_attrs,
node_label, edge_label, node_kernels, edge_kernels)
itr = combinations_with_replacement(range(0, len(Gn)), 2)
do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
node_kernels, edge_kernels)
itr = zip(combinations_with_replacement(Gn, 2),
combinations_with_replacement(splist, 2),
combinations_with_replacement(range(0, len(Gn)), 2))
len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
if len_itr < 1000 * n_jobs:
chunksize = int(len_itr / n_jobs) + 1
@@ -166,36 +193,36 @@ def structuralspkernel(*args,
pool.close()
pool.join()

# # ---- use pool.map to parallel. ----
# # result_perf = pool.map(do_partial, itr)
# do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
# itr = combinations_with_replacement(range(0, len(Gn)), 2)
# for i, j, kernel in tqdm(
# pool.map(do_partial, itr), desc='calculating kernels',
# file=sys.stdout):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel
# pool.close()
# pool.join()

# # ---- use joblib.Parallel to parallel and track progress. ----
# result_perf = Parallel(
# n_jobs=n_jobs, verbose=10)(
# delayed(do_partial)(ij)
# for ij in combinations_with_replacement(range(0, len(Gn)), 2))
# result_perf = [
# do_partial(ij)
# for ij in combinations_with_replacement(range(0, len(Gn)), 2)
# ]
# for i in result_perf:
# Kmatrix[i[0]][i[1]] = i[2]
# Kmatrix[i[1]][i[0]] = i[2]
# # ---- use pool.imap_unordered to parallel and track progress. ----
# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
# node_kernels, edge_kernels)
# itr = zip(combinations_with_replacement(Gn, 2),
# combinations_with_replacement(splist, 2),
# combinations_with_replacement(range(0, len(Gn)), 2))
# len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
# if len_itr < 1000 * n_jobs:
# chunksize = int(len_itr / n_jobs) + 1
# else:
# chunksize = 1000
# from contextlib import closing
# with closing(Pool(n_jobs)) as pool:
# for i, j, kernel in tqdm(
# pool.imap_unordered(do_partial, itr, 1000),
# desc='calculating kernels',
# file=sys.stdout):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel
# pool.close()
# pool.join()


# # ---- direct running, normally use single CPU core. ----
# itr = combinations_with_replacement(range(0, len(Gn)), 2)
# itr = zip(combinations_with_replacement(Gn, 2),
# combinations_with_replacement(splist, 2),
# combinations_with_replacement(range(0, len(Gn)), 2))
# for gs in tqdm(itr, desc='calculating kernels', file=sys.stdout):
# i, j, kernel = structuralspkernel_do(Gn, splist, ds_attrs,
# node_label, edge_label, node_kernels, edge_kernels, gs)
# i, j, kernel = wrapper_ssp_do(ds_attrs, node_label, edge_label,
# node_kernels, edge_kernels, gs)
# if(kernel > 1):
# print("error here ")
# Kmatrix[i][j] = kernel
@@ -209,18 +236,11 @@ def structuralspkernel(*args,
return Kmatrix, run_time


def structuralspkernel_do(Gn, splist, ds_attrs, node_label, edge_label,
node_kernels, edge_kernels, ij):

iglobal = ij[0]
jglobal = ij[1]
g1 = Gn[iglobal]
g2 = Gn[jglobal]
spl1 = splist[iglobal]
spl2 = splist[jglobal]
def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label,
node_kernels, edge_kernels):
kernel = 0

#try:
# First, compute shortest path matrices, method borrowed from FCSP.
if ds_attrs['node_labeled']:
# node symb and non-synb labeled
@@ -369,11 +389,19 @@ def structuralspkernel_do(Gn, splist, ds_attrs, node_label, edge_label,
# kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
# kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
# Kmatrix += kn1 + kn2
#except KeyError: # missing labels or attributes
# print("toto")
# pass
return kernel


return iglobal, jglobal, kernel
def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels,
edge_kernels, itr_item):
g1 = itr_item[0][0]
g2 = itr_item[0][1]
spl1 = itr_item[1][0]
spl2 = itr_item[1][1]
i = itr_item[2][0]
j = itr_item[2][1]
return i, j, structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs,
node_label, edge_label, node_kernels, edge_kernels)


def get_shortest_paths(G, weight, directed):
@@ -397,17 +425,21 @@ def get_shortest_paths(G, weight, directed):
for n1, n2 in combinations(G.nodes(), 2):
try:
spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
except nx.NetworkXNoPath: # nodes not connected
# sp.append([])
pass
else:
sp += spltemp
# each edge walk is counted twice, starting from both its extreme nodes.
if not directed:
sp += [sptemp[::-1] for sptemp in spltemp]
except nx.NetworkXNoPath: # nodes not connected
# sp.append([])
pass
# add single nodes as length 0 paths.
sp += [[n] for n in G.nodes()]
return sp


def wrap_getSP(Gn, weight, directed, i):
return i, get_shortest_paths(Gn[i], weight, directed)
def wrapper_getSP(weight, directed, itr_item):
g = itr_item[0]
i = itr_item[1]
return i, get_shortest_paths(g, weight, directed)
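
Putting the pieces of the hunk above together: the updated get_shortest_paths only records reversed copies when a shortest path actually exists, and now also appends every node as a length-0 path. A consolidated sketch (the docstring and the example graph are added here for illustration):

from itertools import combinations
import networkx as nx

def get_shortest_paths(G, weight=None, directed=False):
    """Collect all shortest paths between every node pair of G."""
    sp = []
    for n1, n2 in combinations(G.nodes(), 2):
        try:
            spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
        except nx.NetworkXNoPath:   # nodes not connected, nothing to add
            pass
        else:
            sp += spltemp
            # each path is counted from both end nodes in the undirected case
            if not directed:
                sp += [p[::-1] for p in spltemp]
    # single nodes count as length-0 paths
    sp += [[n] for n in G.nodes()]
    return sp

G = nx.path_graph(3)
G.add_node(3)   # isolated node: only its length-0 path appears
print(get_shortest_paths(G))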

pygraph/kernels/untilHPathKernel.py (+20 / -21)

@@ -13,7 +13,6 @@ from itertools import chain, combinations_with_replacement
from functools import partial
from multiprocessing import Pool
from tqdm import tqdm
import traceback

import networkx as nx
import numpy as np
@@ -77,15 +76,15 @@ def untilhpathkernel(*args,
# but this may cost a lot of memory for large datasets.
pool = Pool(n_jobs)
all_paths = [[] for _ in range(len(Gn))]
getps_partial = partial(wrap_find_all_paths_until_length, Gn, depth,
getps_partial = partial(wrapper_find_all_paths_until_length, depth,
ds_attrs, node_label, edge_label)
itr = zip(Gn, range(0, len(Gn)))
if len(Gn) < 1000 * n_jobs:
chunksize = int(len(Gn) / n_jobs) + 1
else:
chunksize = 1000
# chunksize = 300 # int(len(list(itr)) / n_jobs)
for i, ps in tqdm(
pool.imap_unordered(getps_partial, range(0, len(Gn)), chunksize),
pool.imap_unordered(getps_partial, itr, chunksize),
desc='getting paths', file=sys.stdout):
all_paths[i] = ps
pool.close()
@@ -110,8 +109,9 @@ def untilhpathkernel(*args,
pass
else:
pool = Pool(n_jobs)
do_partial = partial(_untilhpathkernel_do_naive, all_paths, k_func)
itr = combinations_with_replacement(range(0, len(Gn)), 2)
do_partial = partial(wrapper_uhpath_do_naive, k_func)
itr = zip(combinations_with_replacement(all_paths, 2),
combinations_with_replacement(range(0, len(Gn)), 2))
len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
if len_itr < 1000 * n_jobs:
chunksize = int(len_itr / n_jobs) + 1
@@ -216,7 +216,7 @@ def _untilhpathkernel_do_gst(gst1, gst2, paths1, paths2, k_func):
return kernel


def _untilhpathkernel_do_naive(paths_list, k_func, ij):
def _untilhpathkernel_do_naive(paths1, paths2, k_func):
"""Calculate path graph kernels up to depth d between 2 graphs naively.

Parameters
@@ -235,10 +235,6 @@ def _untilhpathkernel_do_naive(paths_list, k_func, ij):
kernel : float
Path kernel up to h between 2 graphs.
"""
iglobal = ij[0]
jglobal = ij[1]
paths1 = paths_list[iglobal]
paths2 = paths_list[jglobal]
all_paths = list(set(paths1 + paths2))

if k_func == 'tanimoto':
@@ -260,12 +256,18 @@ def _untilhpathkernel_do_naive(paths_list, k_func, ij):
kernel = np.sum(np.minimum(vector1, vector2)) / \
np.sum(np.maximum(vector1, vector2))

return iglobal, jglobal, kernel
return kernel


# @todo: (can be removed maybe) this method finds paths repetitively, it could be faster.
def wrapper_uhpath_do_naive(k_func, itr_item):
plist1 = itr_item[0][0]
plist2 = itr_item[0][1]
i = itr_item[1][0]
j = itr_item[1][1]
return i, j, _untilhpathkernel_do_naive(plist1, plist2, k_func)


# @todo: (can be removed maybe) this method finds paths repetitively, it could be faster.
def find_all_paths_until_length(G,
length,
ds_attrs,
@@ -368,15 +370,12 @@ def find_all_paths_until_length(G,
return [tuple([len(path)]) for path in all_paths]
def wrap_find_all_paths_until_length(Gn, length, ds_attrs, node_label,
edge_label, i):
try:
return i, find_all_paths_until_length(Gn[i], length, ds_attrs,
def wrapper_find_all_paths_until_length(length, ds_attrs, node_label,
edge_label, itr_item):
g = itr_item[0]
i = itr_item[1]
return i, find_all_paths_until_length(g, length, ds_attrs,
node_label=node_label, edge_label=edge_label)
except Exception as e:
traceback.print_exc()
print('')
raise e


def paths2GSuffixTree(paths):
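
_untilhpathkernel_do_naive now receives the two path lists directly. For reference, its MinMax variant builds a path-count vector per graph over the union of observed paths and returns sum(min)/sum(max), while the Tanimoto variant works on binary indicator vectors. A small illustrative sketch, assuming paths are stored as tuples; the helper below is a simplified stand-in, not the module's code.

import numpy as np

def path_kernel_naive(paths1, paths2, k_func='MinMax'):
    # Union of all distinct paths seen in either graph.
    all_paths = list(set(paths1) | set(paths2))
    if k_func == 'tanimoto':
        # Binary indicator vectors: Tanimoto = |intersection| / |union|.
        v1 = np.array([int(p in paths1) for p in all_paths])
        v2 = np.array([int(p in paths2) for p in all_paths])
        inter = np.dot(v1, v2)
        return inter / (v1.sum() + v2.sum() - inter)
    else:  # 'MinMax'
        # Count vectors: sum of element-wise minima over sum of maxima.
        v1 = np.array([paths1.count(p) for p in all_paths])
        v2 = np.array([paths2.count(p) for p in all_paths])
        return np.sum(np.minimum(v1, v2)) / np.sum(np.maximum(v1, v2))

p1 = [('A',), ('A', 'B'), ('A', 'B'), ('B',)]
p2 = [('A',), ('A', 'B'), ('B', 'C')]
print(path_kernel_naive(p1, p2, 'MinMax'))
print(path_kernel_naive(p1, p2, 'tanimoto'))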


pygraph/utils/model_selection_precomputed.py (+182 / -124)

@@ -206,54 +206,50 @@ def model_selection_for_precomputed_kernel(datafile,
'3. Fitting and predicting using nested cross validation. This could really take a while...'
)
# pool = Pool(n_jobs)
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
# train_pref = []
# val_pref = []
# test_pref = []
## if NUM_TRIALS < 1000 * n_jobs:
## chunksize = int(NUM_TRIALS / n_jobs) + 1
## else:
## chunksize = 1000
# chunksize = 1
# for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout):
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)
# pool.close()
# pool.join()
# ---- use pool.map to parallel. ----
pool = Pool(n_jobs)
trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
train_pref = []
val_pref = []
test_pref = []
# if NUM_TRIALS < 100:
# chunksize, extra = divmod(NUM_TRIALS, n_jobs * 4)
# if extra:
# chunksize += 1
# else:
# chunksize = 100
chunksize = 1
for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout):
train_pref.append(o1)
val_pref.append(o2)
test_pref.append(o3)
pool.close()
pool.join()
# # ---- use pool.map to parallel. ----
# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
# train_pref = [item[0] for item in result_perf]
# val_pref = [item[1] for item in result_perf]
# test_pref = [item[2] for item in result_perf]
result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
train_pref = [item[0] for item in result_perf]
val_pref = [item[1] for item in result_perf]
test_pref = [item[2] for item in result_perf]
# # ---- use joblib.Parallel to parallel and track progress. ----
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
# result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS))
# train_pref = [item[0] for item in result_perf]
# val_pref = [item[1] for item in result_perf]
# test_pref = [item[2] for item in result_perf]
# # ---- direct running, normally use a single CPU core. ----
# train_pref = []
# val_pref = []
# test_pref = []
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)
# # ---- direct running, normally use a single CPU core. ----
# train_pref = []
# val_pref = []
# test_pref = []
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)
# print()
print()
print('4. Getting final performance...')
str_fw += '\nIII. Performance.\n\n'
# averages and confidences of performances on outer trials for each combination of parameters
average_train_scores = np.mean(train_pref, axis=0)
# print('val_pref: ', val_pref[0][0])
average_val_scores = np.mean(val_pref, axis=0)
# print('test_pref: ', test_pref[0][0])
average_perf_scores = np.mean(test_pref, axis=0)
# sample std is used here
std_train_scores = np.std(train_pref, axis=0, ddof=1)
@@ -264,6 +260,9 @@ def model_selection_for_precomputed_kernel(datafile,
best_val_perf = np.amin(average_val_scores)
else:
best_val_perf = np.amax(average_val_scores)
# print('average_val_scores: ', average_val_scores)
# print('best_val_perf: ', best_val_perf)
# print()
best_params_index = np.where(average_val_scores == best_val_perf)
# find smallest val std with best val perf.
best_val_stds = [
@@ -286,6 +285,9 @@ def model_selection_for_precomputed_kernel(datafile,
str_fw += 'best_val_perf: %s\n' % best_val_perf
str_fw += 'best_val_std: %s\n' % min_val_std
# print(best_params_index)
# print(best_params_index[0])
# print(average_perf_scores)
final_performance = [
average_perf_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
@@ -429,23 +431,23 @@ def model_selection_for_precomputed_kernel(datafile,
'3. Fitting and predicting using nested cross validation. This could really take a while...'
)
# pool = Pool(n_jobs)
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
# train_pref = []
# val_pref = []
# test_pref = []
# if NUM_TRIALS < 100:
# chunksize, extra = divmod(NUM_TRIALS, n_jobs * 4)
# if extra:
# chunksize += 1
# else:
# chunksize = 100
# for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout):
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)
# pool.close()
# pool.join()
pool = Pool(n_jobs)
trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
train_pref = []
val_pref = []
test_pref = []
if NUM_TRIALS < 100:
chunksize, extra = divmod(NUM_TRIALS, n_jobs * 4)
if extra:
chunksize += 1
else:
chunksize = 100
for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout):
train_pref.append(o1)
val_pref.append(o2)
test_pref.append(o3)
pool.close()
pool.join()
# # ---- use pool.map to parallel. ----
# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
@@ -460,15 +462,15 @@ def model_selection_for_precomputed_kernel(datafile,
# val_pref = [item[1] for item in result_perf]
# test_pref = [item[2] for item in result_perf]

# ---- direct running, normally use a single CPU core. ----
train_pref = []
val_pref = []
test_pref = []
for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
train_pref.append(o1)
val_pref.append(o2)
test_pref.append(o3)
# # ---- direct running, normally use a single CPU core. ----
# train_pref = []
# val_pref = []
# test_pref = []
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)

print()
print('4. Getting final performance...')
@@ -623,89 +625,142 @@ def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, t
val_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
test_pref = np.zeros((len(param_list_pre_revised), len(param_list)))

# randomness added to seeds of the split function below. "high" is "size"
# times 10 so that at least 10 different random outputs will be yielded.
# Remove these lines if identical outputs are required.
rdm_out = np.random.RandomState(seed=None)
rdm_seed_out_l = rdm_out.uniform(high=len(param_list_pre_revised) * 10,
size=len(param_list_pre_revised))
# print(trial, rdm_seed_out_l)
# print()
# loop for each outer param tuple
for index_out, params_out in enumerate(param_list_pre_revised):
# split gram matrix and y to app and test sets.
indices = range(len(y))
# The argument "random_state" of "train_test_split" cannot be set to
# None, because that falls back to the RandomState instance shared by
# np.random; subprocesses forked at the same time may then inherit the
# same seed, producing identical random variates in different
# subprocesses. Instead, the "trial" and "index_out" parameters are
# used to generate different seeds for different trials/subprocesses
# and outer loops. "rdm_seed_out_l" adds randomness to these seeds, so
# that every run of the program yields a different output. To get
# identical outputs every time, remove the second line below. The same
# method is applied to the "KFold" function in the inner loop.
rdm_seed_out = (trial + 1) * (index_out + 1)
rdm_seed_out = (rdm_seed_out + int(rdm_seed_out_l[index_out])) % (2 ** 32 - 1)
# print(trial, rdm_seed_out)
X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split(
gram_matrices[index_out], y, indices, test_size=0.1,
random_state=None, shuffle=True)
random_state=rdm_seed_out, shuffle=True)
# print(trial, idx_app, idx_test)
# print()
X_app = X_app[:, idx_app]
X_test = X_test[:, idx_app]
y_app = np.array(y_app)
y_test = np.array(y_test)

rdm_seed_in_l = rdm_out.uniform(high=len(param_list) * 10,
size=len(param_list))
# loop for each inner param tuple
for index_in, params_in in enumerate(param_list):
# print(index_in, params_in)
# if trial == 0:
# print(index_out, index_in)
# print('params_in: ', params_in)
# st = time.time()
inner_cv = KFold(n_splits=10, shuffle=True, random_state=trial)
rdm_seed_in = (trial + 1) * (index_out + 1) * (index_in + 1)
# print("rdm_seed_in1: ", trial, index_in, rdm_seed_in)
rdm_seed_in = (rdm_seed_in + int(rdm_seed_in_l[index_in])) % (2 ** 32 - 1)
# print("rdm_seed_in2: ", trial, index_in, rdm_seed_in)
inner_cv = KFold(n_splits=10, shuffle=True, random_state=rdm_seed_in)
current_train_perf = []
current_valid_perf = []
current_test_perf = []

# For regression use the Kernel Ridge method
try:
if model_type == 'regression':
kr = KernelRidge(kernel='precomputed', **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
kr.fit(X_app[train_index, :][:, train_index],
y_app[train_index])
# try:
if model_type == 'regression':
kr = KernelRidge(kernel='precomputed', **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
# print("train_index, valid_index: ", trial, index_in, train_index, valid_index)
# if trial == 0:
# print('train_index: ', train_index)
# print('valid_index: ', valid_index)
# print('idx_test: ', idx_test)
# print('y_app[train_index]: ', y_app[train_index])
# print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index])
# print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index])
kr.fit(X_app[train_index, :][:, train_index],
y_app[train_index])

# predict on the train, validation and test set
y_pred_train = kr.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = kr.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = kr.predict(
X_test[:, train_index])
# predict on the train, validation and test set
y_pred_train = kr.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = kr.predict(
X_app[valid_index, :][:, train_index])
# if trial == 0:
# print('y_pred_valid: ', y_pred_valid)
# print()
y_pred_test = kr.predict(
X_test[:, train_index])

# root mean squared errors
current_train_perf.append(
np.sqrt(
mean_squared_error(
y_app[train_index], y_pred_train)))
current_valid_perf.append(
np.sqrt(
mean_squared_error(
y_app[valid_index], y_pred_valid)))
current_test_perf.append(
np.sqrt(
mean_squared_error(
y_test, y_pred_test)))
# For classification use SVM
else:
svc = SVC(kernel='precomputed', cache_size=200,
verbose=False, **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
# root mean squared errors
current_train_perf.append(
np.sqrt(
mean_squared_error(
y_app[train_index], y_pred_train)))
current_valid_perf.append(
np.sqrt(
mean_squared_error(
y_app[valid_index], y_pred_valid)))
# if trial == 0:
# print(mean_squared_error(
# y_app[valid_index], y_pred_valid))
current_test_perf.append(
np.sqrt(
mean_squared_error(
y_test, y_pred_test)))
# For classification use SVM
else:
svc = SVC(kernel='precomputed', cache_size=200,
verbose=False, **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
# np.savez("bug.npy",X_app[train_index, :][:, train_index],y_app[train_index])
svc.fit(X_app[train_index, :][:, train_index],
y_app[train_index])
# predict on the train, validation and test set
y_pred_train = svc.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = svc.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = svc.predict(
X_test[:, train_index])
# if trial == 0:
# print('train_index: ', train_index)
# print('valid_index: ', valid_index)
# print('idx_test: ', idx_test)
# print('y_app[train_index]: ', y_app[train_index])
# print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index])
# print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index])
svc.fit(X_app[train_index, :][:, train_index],
y_app[train_index])
# predict on the train, validation and test set
y_pred_train = svc.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = svc.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = svc.predict(
X_test[:, train_index])

# classification accuracies
current_train_perf.append(
accuracy_score(y_app[train_index],
y_pred_train))
current_valid_perf.append(
accuracy_score(y_app[valid_index],
y_pred_valid))
current_test_perf.append(
accuracy_score(y_test, y_pred_test))
except ValueError:
print(sys.exc_info()[0])
print(params_out, params_in)
# classification accuracies
current_train_perf.append(
accuracy_score(y_app[train_index],
y_pred_train))
current_valid_perf.append(
accuracy_score(y_app[valid_index],
y_pred_valid))
current_test_perf.append(
accuracy_score(y_test, y_pred_test))
# except ValueError:
# print(sys.exc_info()[0])
# print(params_out, params_in)

# average performance on inner splits
train_pref[index_out][index_in] = np.mean(
@@ -715,5 +770,8 @@ def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, t
test_pref[index_out][index_in] = np.mean(
current_test_perf)
# print(time.time() - st)
# if trial == 0:
# print('val_pref: ', val_pref)
# print('test_pref: ', test_pref)

return train_pref, val_pref, test_pref
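
This is the heart of the commit: train_test_split and KFold no longer run with random_state=None or a constant seed, because subprocesses forked at the same time can inherit the same global NumPy state and produce identical splits. Each trial and parameter index instead derives its own seed, offset by a random draw so that repeated runs of the program still differ. A condensed sketch of the scheme, using toy data and a hypothetical helper name split_seeds:

import numpy as np
from sklearn.model_selection import train_test_split, KFold

def split_seeds(trial, n_params_out, n_params_in):
    """Per-trial, per-parameter seeds for the outer split and the inner KFold."""
    rdm = np.random.RandomState(seed=None)
    # random offsets so that different program runs give different splits
    out_offsets = rdm.uniform(high=n_params_out * 10, size=n_params_out)
    in_offsets = rdm.uniform(high=n_params_in * 10, size=n_params_in)
    out_seeds = [((trial + 1) * (i + 1) + int(out_offsets[i])) % (2**32 - 1)
                 for i in range(n_params_out)]
    in_seeds = [[((trial + 1) * (i + 1) * (j + 1) + int(in_offsets[j])) % (2**32 - 1)
                 for j in range(n_params_in)] for i in range(n_params_out)]
    return out_seeds, in_seeds

# toy data standing in for a precomputed gram matrix and targets
K = np.random.rand(20, 20)
y = np.random.randint(0, 2, 20)
out_seeds, in_seeds = split_seeds(trial=0, n_params_out=1, n_params_in=3)

X_app, X_test, y_app, y_test = train_test_split(
    K, y, test_size=0.1, random_state=out_seeds[0], shuffle=True)
inner_cv = KFold(n_splits=10, shuffle=True, random_state=in_seeds[0][0])
for train_index, valid_index in inner_cv.split(X_app):
    pass  # fit/predict on K[train][:, train] etc., as in trial_do above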
