
Correct randomness of data split for parallelization.

v0.1
jajupmochi · 6 years ago
commit 8baa21cb67
14 changed files with 39707 additions and 77953 deletions
  1. +1     -1      notebooks/check_gm.py
  2. BIN            notebooks/check_gm.zip
  3. +19554 -38693  notebooks/check_gm/Acyclic.gm.eps
  4. +19686 -38836  notebooks/check_gm/Letter-med.gm.eps
  5. +97    -75     notebooks/run_spkernel.ipynb
  6. +11    -11     notebooks/run_spkernel.py
  7. +8     -8      notebooks/run_structuralspkernel.py
  8. +1     -1      notebooks/run_untilhpathkernel.py
  9. +0     -77     notebooks/test.py
  10. +26   -20     pygraph/kernels/commonWalkKernel.py
  11. +23   -20     pygraph/kernels/spKernel.py
  12. +98   -66     pygraph/kernels/structuralspKernel.py
  13. +20   -21     pygraph/kernels/untilHPathKernel.py
  14. +182  -124    pygraph/utils/model_selection_precomputed.py

+1 -1  notebooks/check_gm.py

@@ -12,7 +12,7 @@ import matplotlib.pyplot as plt
 from numpy.linalg import eig
 
 # read gram matrices from file.
-results_dir = 'results/structuralspkernel/'
+results_dir = 'results/untilhpathkernel/myria'
 ds_name = 'Letter-med'
 gmfile = np.load(results_dir + '/' + ds_name + '.gm.npz')
 #print('gm time: ', gmfile['gmtime'])


BIN  notebooks/check_gm.zip


+19554 -38693  notebooks/check_gm/Acyclic.gm.eps (file diff suppressed because it is too large)


+19686 -38836  notebooks/check_gm/Letter-med.gm.eps (file diff suppressed because it is too large)


+97 -75  notebooks/run_spkernel.ipynb

@@ -6,94 +6,116 @@
"metadata": { "metadata": {
"scrolled": false "scrolled": false
}, },
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"MAO\n",
"\n",
"--- This is a classification problem ---\n",
"\n",
"\n",
"1. Loading dataset from file...\n",
"\n",
"2. Calculating gram matrices. This could take a while...\n",
"\n",
" None edge weight specified. Set all weight to 1.\n",
"\n",
"getting sp graphs: 68it [00:00, 692.11it/s]\n",
"calculating kernels: 2346it [00:05, 399.28it/s]\n",
"\n",
" --- shortest path kernel matrix of size 68 built in 6.345669507980347 seconds ---\n",
"\n",
"the gram matrix with parameters {'node_kernels': {'symb': <function deltakernel at 0x7fe240afd620>, 'nsymb': <function gaussiankernel at 0x7fe240afd9d8>, 'mix': functools.partial(<function kernelproduct at 0x7fe240aaf0d0>, <function deltakernel at 0x7fe240afd620>, <function gaussiankernel at 0x7fe240afd9d8>)}, 'n_jobs': 8} is: \n",
"\n",
"1 gram matrices are calculated, 0 of which are ignored.\n",
"\n",
"3. Fitting and predicting using nested cross validation. This could really take a while...\n",
"cross validation: 7it [00:09, 4.67s/it]"
]
}
],
"source": [ "source": [
"%load_ext line_profiler\n",
"%matplotlib inline\n",
"import functools\n", "import functools\n",
"from libs import *\n", "from libs import *\n",
"import multiprocessing\n", "import multiprocessing\n",
"from sklearn.metrics.pairwise import rbf_kernel\n",
"\n",
"from pygraph.kernels.spKernel import spkernel, spkernel_do\n",
"from pygraph.utils.kernels import deltakernel, kernelsum\n",
"from pygraph.utils.model_selection_precomputed import trial_do\n",
"\n",
"dslist = [ \n",
" {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node symb\n",
"# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', \n",
"# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',}, # contains single node graph, node symb\n",
"# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',}, # node/edge symb\n",
"# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',}, # unlabeled\n",
"# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n",
"# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n",
" {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n",
" {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, # node symb/nsymb\n",
" {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # node/edge symb\n",
" {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n",
" 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n",
"\n", "\n",
"# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n",
"# # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n",
"# # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb\n",
"# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},\n",
"# \n",
"# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb\n",
"# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb\n",
"# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb\n",
"# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb\n",
"# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb\n",
"\n",
"# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb\n",
"# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb\n",
"# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb\n",
"# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',\n",
"# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n",
"# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',\n",
"# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n",
"# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',\n",
"# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb\n",
" \n",
"# # not working below\n",
"# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},\n",
"# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},\n",
"# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},\n",
"# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},\n",
"from pygraph.kernels.spKernel import spkernel\n",
"from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct\n",
"#from pygraph.utils.model_selection_precomputed import trial_do\n",
"\n",
"dslist = [\n",
"# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',\n",
"# 'task': 'regression'}, # node symb\n",
"# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',\n",
"# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, \n",
"# # contains single node graph, node symb\n",
" {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb\n",
"# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled\n",
"# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n",
"# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n",
"# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n",
"# # node nsymb\n",
"# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n",
"# # node symb/nsymb\n",
"# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n",
" # node/edge symb\n",
"# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n",
"# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n",
"\n",
" # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n",
" # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n",
" # # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb\n",
" # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},\n",
" #\n",
" # # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb\n",
" # # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb\n",
" # # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb\n",
" # # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb\n",
" # # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb\n",
"\n",
" # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb\n",
" # # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb\n",
" # # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb\n",
" # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',\n",
" # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n",
" # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',\n",
" # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n",
" # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',\n",
" # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb\n",
"\n",
" # # not working below\n",
" # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},\n",
" # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},\n",
" # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},\n",
" # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},\n",
"]\n", "]\n",
"estimator = spkernel\n", "estimator = spkernel\n",
"mixkernel = functools.partial(kernelsum, deltakernel, rbf_kernel)\n",
"param_grid_precomputed = {'node_kernels': [{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]}\n",
"param_grid = [{'C': np.logspace(-10, 10, num = 41, base = 10)}, \n",
" {'alpha': np.logspace(-10, 10, num = 41, base = 10)}]\n",
"mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)\n",
"param_grid_precomputed = {'node_kernels': [\n",
" {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}\n",
"param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},\n",
" {'alpha': np.logspace(-10, 10, num=41, base=10)}]\n",
"\n", "\n",
"for ds in dslist:\n", "for ds in dslist:\n",
" print()\n", " print()\n",
" print(ds['name'])\n", " print(ds['name'])\n",
" model_selection_for_precomputed_kernel(\n", " model_selection_for_precomputed_kernel(\n",
" ds['dataset'], \n",
" estimator, \n",
" param_grid_precomputed, \n",
" (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \n",
" (ds['task'] if 'task' in ds else 'classification'), \n",
" ds['dataset'],\n",
" estimator,\n",
" param_grid_precomputed,\n",
" (param_grid[1] if ('task' in ds and ds['task']\n",
" == 'regression') else param_grid[0]),\n",
" (ds['task'] if 'task' in ds else 'classification'),\n",
" NUM_TRIALS=30,\n", " NUM_TRIALS=30,\n",
" datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),\n", " datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),\n",
" extra_params=(ds['extra_params'] if 'extra_params' in ds else None),\n", " extra_params=(ds['extra_params'] if 'extra_params' in ds else None),\n",
" ds_name=ds['name'],\n", " ds_name=ds['name'],\n",
" n_jobs=multiprocessing.cpu_count())\n",
" \n",
"# %lprun -f trial_do -f spkernel -f spkernel_do -f model_selection_for_precomputed_kernel \\\n",
"# model_selection_for_precomputed_kernel( \\\n",
"# ds['dataset'], \\\n",
"# estimator, \\\n",
"# param_grid_precomputed, \\\n",
"# (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \\\n",
"# (ds['task'] if 'task' in ds else 'classification'), \\\n",
"# NUM_TRIALS=30, \\\n",
"# datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), \\\n",
"# extra_params=(ds['extra_params'] if 'extra_params' in ds else None), \\\n",
"# ds_name=ds['name'], \\\n",
"# n_jobs=multiprocessing.cpu_count()) \n",
" print()"
" n_jobs=multiprocessing.cpu_count(),\n",
" read_gm_from_file=False)\n",
" print()\n"
] ]
}, },
{ {
@@ -713,8 +735,8 @@
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "Python 3 (Spyder)",
-"language": "python3",
+"display_name": "Python 3",
+"language": "python",
 "name": "python3"
 },
"language_info": { "language_info": {
@@ -727,7 +749,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.5.2"
+"version": "3.6.6"
 }
 },
 "nbformat": 4,


+11 -11  notebooks/run_spkernel.py

@@ -7,21 +7,21 @@ from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct
 #from pygraph.utils.model_selection_precomputed import trial_do
 
 dslist = [
-# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
-# 'task': 'regression'}, # node symb
-# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
-# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', },
-# # contains single node graph, node symb
-# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb
-# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled
-# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
-# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
+{'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
+'task': 'regression'}, # node symb
+{'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
+'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', },
+# contains single node graph, node symb
+{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb
+{'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled
+{'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
+'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
 {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
 # node nsymb
 {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
 # node symb/nsymb
 # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
-# # node/edge symb
+# node/edge symb
 # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
 # 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb
 
@@ -56,7 +56,7 @@ estimator = spkernel
 mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
 param_grid_precomputed = {'node_kernels': [
 {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}
-param_grid = [{'C': np.logspace(-10, 3, num=27, base=10)},
+param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
 {'alpha': np.logspace(-10, 10, num=41, base=10)}]
 
 for ds in dslist:


+8 -8  notebooks/run_structuralspkernel.py

@@ -23,10 +23,10 @@ dslist = [
 # {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled
 # {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
 # 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
-{'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
-# node nsymb
-# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
-# # node symb/nsymb
+# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
+# # node nsymb
+{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
+# node symb/nsymb
 # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
 # # node/edge symb
 # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
@@ -39,8 +39,8 @@ dslist = [
 #
 # # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
 # # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
-# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb, missing values
-# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb, missing values
+# {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb, missing values
+# {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb, missing values
 # # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb
 
 # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
@@ -53,8 +53,8 @@ dslist = [
 # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
 # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb
 
-# # not working below
-# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
+# # not working below
+# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
 # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
 # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
 # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},


+1 -1  notebooks/run_untilhpathkernel.py

@@ -62,7 +62,7 @@ dslist = [
 ]
 estimator = untilhpathkernel
 mixkernel = functools.partial(kernelproduct, deltakernel, rbf_kernel)
-param_grid_precomputed = {'depth': np.linspace(7, 10, 10),
+param_grid_precomputed = {'depth': np.linspace(1, 10, 10),
 'k_func': ['tanimoto', 'MinMax']}
 param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
 {'alpha': np.logspace(-10, 10, num=41, base=10)}]


+0 -77  notebooks/test.py

@@ -1,77 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 28 16:37:29 2018

@author: ljia
"""

import functools
from libs import *
import multiprocessing
from sklearn.metrics.pairwise import rbf_kernel

from pygraph.kernels.structuralspKernel import structuralspkernel
from pygraph.utils.kernels import deltakernel, kernelproduct

dslist = [
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
# 'task': 'regression'}, # node symb
# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, # contains single node graph, node symb
{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb

# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
# # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
# # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
#
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb

# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb

# # not working below
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
estimator = structuralspkernel
mixkernel = functools.partial(kernelproduct, deltakernel, rbf_kernel)
param_grid_precomputed = {'node_kernels':
[{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}],
'edge_kernels':
[{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]

for ds in dslist:
print()
print(ds['name'])
model_selection_for_precomputed_kernel(
ds['dataset'],
estimator,
param_grid_precomputed,
(param_grid[1] if ('task' in ds and ds['task']
== 'regression') else param_grid[0]),
(ds['task'] if 'task' in ds else 'classification'),
NUM_TRIALS=30,
datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
ds_name=ds['name'],
n_jobs=multiprocessing.cpu_count(),
read_gm_from_file=False)
print()

+26 -20  pygraph/kernels/commonWalkKernel.py

@@ -85,21 +85,20 @@ def commonwalkkernel(*args,


# ---- use pool.imap_unordered to parallel and track progress. ---- # ---- use pool.imap_unordered to parallel and track progress. ----
pool = Pool(n_jobs) pool = Pool(n_jobs)
itr = combinations_with_replacement(range(0, len(Gn)), 2)
itr = zip(combinations_with_replacement(Gn, 2),
combinations_with_replacement(range(0, len(Gn)), 2))
len_itr = int(len(Gn) * (len(Gn) + 1) / 2) len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
if len_itr < 1000 * n_jobs: if len_itr < 1000 * n_jobs:
chunksize = int(len_itr / n_jobs) + 1 chunksize = int(len_itr / n_jobs) + 1
else: else:
chunksize = 100
chunksize = 1000


# direct product graph method - exponential # direct product graph method - exponential
if compute_method == 'exp': if compute_method == 'exp':
do_partial = partial(_commonwalkkernel_exp, Gn, node_label, edge_label,
weight)
do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight)
# direct product graph method - geometric # direct product graph method - geometric
elif compute_method == 'geo': elif compute_method == 'geo':
do_partial = partial(_commonwalkkernel_geo, Gn, node_label, edge_label,
weight)
do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight)


for i, j, kernel in tqdm( for i, j, kernel in tqdm(
pool.imap_unordered(do_partial, itr, chunksize), pool.imap_unordered(do_partial, itr, chunksize),
@@ -153,7 +152,7 @@ def commonwalkkernel(*args,
return Kmatrix, run_time return Kmatrix, run_time




def _commonwalkkernel_exp(Gn, node_label, edge_label, beta, ij):
def _commonwalkkernel_exp(g1, g2, node_label, edge_label, beta):
"""Calculate walk graph kernels up to n between 2 graphs using exponential """Calculate walk graph kernels up to n between 2 graphs using exponential
series. series.


@@ -175,10 +174,6 @@ def _commonwalkkernel_exp(Gn, node_label, edge_label, beta, ij):
kernel : float kernel : float
The common walk Kernel between 2 graphs. The common walk Kernel between 2 graphs.
""" """
iglobal = ij[0]
jglobal = ij[1]
g1 = Gn[iglobal]
g2 = Gn[jglobal]


# get tensor product / direct product # get tensor product / direct product
gp = direct_product(g1, g2, node_label, edge_label) gp = direct_product(g1, g2, node_label, edge_label)
@@ -219,10 +214,18 @@ def _commonwalkkernel_exp(Gn, node_label, edge_label, beta, ij):
# print(np.exp(weight * A)) # print(np.exp(weight * A))
# print('-------') # print('-------')


return iglobal, jglobal, exp_D.sum()
return exp_D.sum()




def _commonwalkkernel_geo(Gn, node_label, edge_label, gamma, ij):
def wrapper_cw_exp(node_label, edge_label, beta, itr_item):
g1 = itr_item[0][0]
g2 = itr_item[0][1]
i = itr_item[1][0]
j = itr_item[1][1]
return i, j, _commonwalkkernel_exp(g1, g2, node_label, edge_label, beta)


def _commonwalkkernel_geo(g1, g2, node_label, edge_label, gamma):
"""Calculate common walk graph kernels up to n between 2 graphs using """Calculate common walk graph kernels up to n between 2 graphs using
geometric series. geometric series.


@@ -244,19 +247,22 @@ def _commonwalkkernel_geo(Gn, node_label, edge_label, gamma, ij):
kernel : float kernel : float
The common walk Kernel between 2 graphs. The common walk Kernel between 2 graphs.
""" """
iglobal = ij[0]
jglobal = ij[1]
g1 = Gn[iglobal]
g2 = Gn[jglobal]

# get tensor product / direct product # get tensor product / direct product
gp = direct_product(g1, g2, node_label, edge_label) gp = direct_product(g1, g2, node_label, edge_label)
A = nx.adjacency_matrix(gp).todense() A = nx.adjacency_matrix(gp).todense()
mat = np.identity(len(A)) - gamma * A mat = np.identity(len(A)) - gamma * A
try: try:
return iglobal, jglobal, mat.I.sum()
return mat.I.sum()
except np.linalg.LinAlgError: except np.linalg.LinAlgError:
return iglobal, jglobal, np.nan
return np.nan
def wrapper_cw_geo(node_label, edge_label, gama, itr_item):
g1 = itr_item[0][0]
g2 = itr_item[0][1]
i = itr_item[1][0]
j = itr_item[1][1]
return i, j, _commonwalkkernel_geo(g1, g2, node_label, edge_label, gama)




def _commonwalkkernel_brute(walks1,
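The refactored _commonwalkkernel_geo above evaluates the geometric common-walk kernel as the sum of the entries of (I - gamma*A)^-1, where A is the adjacency matrix of the direct product graph. A minimal sketch of that computation, using networkx's tensor_product as a label-free stand-in for the project's direct_product helper (the stand-in, the toy graphs, and the gamma value are assumptions, not the project's code):

import networkx as nx
import numpy as np

g1 = nx.path_graph(3)
g2 = nx.cycle_graph(4)
gamma = 0.01  # must be small enough for the geometric series to converge

gp = nx.tensor_product(g1, g2)           # unlabeled direct product graph
A = nx.adjacency_matrix(gp).todense()    # dense adjacency matrix (np.matrix)
mat = np.identity(len(A)) - gamma * A
try:
    kernel = mat.I.sum()                 # sum over all entries of the inverse
except np.linalg.LinAlgError:
    kernel = np.nan                      # singular matrix, handled as above
print(kernel)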


+23 -20  pygraph/kernels/spKernel.py

@@ -8,7 +8,6 @@ import sys
import time import time
from itertools import combinations_with_replacement, product from itertools import combinations_with_replacement, product
from functools import partial from functools import partial
from joblib import Parallel, delayed
from multiprocessing import Pool from multiprocessing import Pool
from tqdm import tqdm from tqdm import tqdm


@@ -89,7 +88,8 @@ def spkernel(*args,


pool = Pool(n_jobs) pool = Pool(n_jobs)
# get shortest path graphs of Gn # get shortest path graphs of Gn
getsp_partial = partial(wrap_getSPGraph, Gn, weight)
getsp_partial = partial(wrapper_getSPGraph, weight)
itr = zip(Gn, range(0, len(Gn)))
if len(Gn) < 1000 * n_jobs: if len(Gn) < 1000 * n_jobs:
# # use default chunksize as pool.map when iterable is less than 100 # # use default chunksize as pool.map when iterable is less than 100
# chunksize, extra = divmod(len(Gn), n_jobs * 4) # chunksize, extra = divmod(len(Gn), n_jobs * 4)
@@ -98,9 +98,8 @@ def spkernel(*args,
chunksize = int(len(Gn) / n_jobs) + 1 chunksize = int(len(Gn) / n_jobs) + 1
else: else:
chunksize = 1000 chunksize = 1000
# chunksize = 300 # int(len(list(itr)) / n_jobs)
for i, g in tqdm( for i, g in tqdm(
pool.imap_unordered(getsp_partial, range(0, len(Gn)), chunksize),
pool.imap_unordered(getsp_partial, itr, chunksize),
desc='getting sp graphs', file=sys.stdout): desc='getting sp graphs', file=sys.stdout):
Gn[i] = g Gn[i] = g
pool.close() pool.close()
@@ -144,8 +143,9 @@ def spkernel(*args,


# ---- use pool.imap_unordered to parallel and track progress. ---- # ---- use pool.imap_unordered to parallel and track progress. ----
pool = Pool(n_jobs) pool = Pool(n_jobs)
do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
itr = combinations_with_replacement(range(0, len(Gn)), 2)
do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
itr = zip(combinations_with_replacement(Gn, 2),
combinations_with_replacement(range(0, len(Gn)), 2))
len_itr = int(len(Gn) * (len(Gn) + 1) / 2) len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
if len_itr < 1000 * n_jobs: if len_itr < 1000 * n_jobs:
chunksize = int(len_itr / n_jobs) + 1 chunksize = int(len_itr / n_jobs) + 1
@@ -200,15 +200,10 @@ def spkernel(*args,
return Kmatrix, run_time, idx return Kmatrix, run_time, idx




def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij):

i = ij[0]
j = ij[1]
g1 = Gn[i]
g2 = Gn[j]
def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels):
kernel = 0 kernel = 0


# try:
# compute shortest path matrices first, method borrowed from FCSP. # compute shortest path matrices first, method borrowed from FCSP.
if ds_attrs['node_labeled']: if ds_attrs['node_labeled']:
# node symb and non-synb labeled # node symb and non-synb labeled
@@ -243,7 +238,7 @@ def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij):
g1.edges(data=True), g2.edges(data=True)): g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']: if e1[2]['cost'] == e2[2]['cost']:
kernel += 1 kernel += 1
return i, j, kernel
return kernel


# compute graph kernels # compute graph kernels
if ds_attrs['is_directed']: if ds_attrs['is_directed']:
@@ -293,12 +288,20 @@ def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij):
# kn1 = vk_mat[x1][x2] * vk_mat[y1][y2] # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
# kn2 = vk_mat[x1][y2] * vk_mat[y1][x2] # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
# kernel += kn1 + kn2 # kernel += kn1 + kn2
# except KeyError: # missing labels or attributes
# pass


return i, j, kernel
return kernel


def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr_item):
g1 = itr_item[0][0]
g2 = itr_item[0][1]
i = itr_item[1][0]
j = itr_item[1][1]
return i, j, spkernel_do(g1, g2, ds_attrs, node_label, node_kernels)




def wrap_getSPGraph(Gn, weight, i):
return i, getSPGraph(Gn[i], edge_weight=weight)
# return i, nx.floyd_warshall_numpy(Gn[i], weight=weight)
def wrapper_getSPGraph(weight, itr_item):
g = itr_item[0]
i = itr_item[1]
return i, getSPGraph(g, edge_weight=weight)
# return i, nx.floyd_warshall_numpy(g, weight=weight)
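Both spkernel and commonwalkkernel now build the parallel iterator as a zip of graph pairs and index pairs and pass it to pool.imap_unordered through a module-level wrapper, so each worker receives the two graphs directly instead of indexing into a shared list captured by the partial. A minimal, self-contained sketch of the pattern; toy_kernel, wrapper_toy, and the path graphs are illustrative assumptions, not library code:

from functools import partial
from itertools import combinations_with_replacement
from multiprocessing import Pool

import networkx as nx
import numpy as np


def toy_kernel(g1, g2, weight):
    # placeholder pairwise kernel: 1 if the graphs have the same size, else 0
    return weight * float(nx.number_of_nodes(g1) == nx.number_of_nodes(g2))


def wrapper_toy(weight, itr_item):
    # unpack ((g1, g2), (i, j)) and return the indices with the kernel value,
    # mirroring wrapper_sp_do / wrapper_cw_exp / wrapper_cw_geo above
    (g1, g2), (i, j) = itr_item
    return i, j, toy_kernel(g1, g2, weight)


if __name__ == '__main__':
    Gn = [nx.path_graph(n) for n in range(2, 6)]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    do_partial = partial(wrapper_toy, 1.0)
    # both iterators enumerate the upper triangle in the same order,
    # so graph pairs and index pairs stay aligned after the zip
    itr = zip(combinations_with_replacement(Gn, 2),
              combinations_with_replacement(range(0, len(Gn)), 2))
    with Pool(2) as pool:
        for i, j, kernel in pool.imap_unordered(do_partial, itr, chunksize=10):
            Kmatrix[i][j] = kernel
            Kmatrix[j][i] = kernel
    print(Kmatrix)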

+98 -66  pygraph/kernels/structuralspKernel.py

@@ -12,7 +12,6 @@ import sys
import time import time
from itertools import combinations, combinations_with_replacement, product from itertools import combinations, combinations_with_replacement, product
from functools import partial from functools import partial
from joblib import Parallel, delayed
from multiprocessing import Pool from multiprocessing import Pool
from tqdm import tqdm from tqdm import tqdm


@@ -71,7 +70,6 @@ def structuralspkernel(*args,
""" """
# pre-process # pre-process
Gn = args[0] if len(args) == 1 else [args[0], args[1]] Gn = args[0] if len(args) == 1 else [args[0], args[1]]

weight = None weight = None
if edge_weight is None: if edge_weight is None:
print('\n None edge weight specified. Set all weight to 1.\n') print('\n None edge weight specified. Set all weight to 1.\n')
@@ -98,34 +96,61 @@ def structuralspkernel(*args,
start_time = time.time() start_time = time.time()


# get shortest paths of each graph in Gn # get shortest paths of each graph in Gn
splist = [[] for _ in range(len(Gn))]
splist = [None] * len(Gn)
pool = Pool(n_jobs) pool = Pool(n_jobs)
# get shortest path graphs of Gn # get shortest path graphs of Gn
getsp_partial = partial(wrap_getSP, Gn, weight, ds_attrs['is_directed'])
getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed'])
itr = zip(Gn, range(0, len(Gn)))
if len(Gn) < 1000 * n_jobs: if len(Gn) < 1000 * n_jobs:
chunksize = int(len(Gn) / n_jobs) + 1 chunksize = int(len(Gn) / n_jobs) + 1
else: else:
chunksize = 1000 chunksize = 1000
# chunksize = 300 # int(len(list(itr)) / n_jobs) # chunksize = 300 # int(len(list(itr)) / n_jobs)
for i, sp in tqdm( for i, sp in tqdm(
pool.imap_unordered(getsp_partial, range(0, len(Gn)), chunksize),
pool.imap_unordered(getsp_partial, itr, chunksize),
desc='getting shortest paths', desc='getting shortest paths',
file=sys.stdout): file=sys.stdout):
splist[i] = sp splist[i] = sp
# time.sleep(10)
pool.close() pool.close()
pool.join() pool.join()

# # ---- use pool.map to parallel ----
# result_sp = pool.map(getsp_partial, range(0, len(Gn)))
# for i in result_sp:
# Gn[i[0]] = i[1]
# or
# getsp_partial = partial(wrap_getSP, Gn, weight)
# for i, g in tqdm(
# pool.map(getsp_partial, range(0, len(Gn))),
# desc='getting sp graphs',
# file=sys.stdout):
# Gn[i] = g
# # get shortest paths of each graph in Gn
# splist = [[] for _ in range(len(Gn))]
# # get shortest path graphs of Gn
# getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed'])
# itr = zip(Gn, range(0, len(Gn)))
# if len(Gn) < 1000 * n_jobs:
# chunksize = int(len(Gn) / n_jobs) + 1
# else:
# chunksize = 1000
# # chunksize = 300 # int(len(list(itr)) / n_jobs)
# from contextlib import closing
# with closing(Pool(n_jobs)) as pool:
## for i, sp in tqdm(
# res = pool.imap_unordered(getsp_partial, itr, 10)
## desc='getting shortest paths',
## file=sys.stdout):
## splist[i] = sp
## time.sleep(10)
# pool.close()
# pool.join()
# ss = 0
# ss += sys.getsizeof(splist)
# for spss in splist:
# ss += sys.getsizeof(spss)
# for spp in spss:
# ss += sys.getsizeof(spp)
# time.sleep(20)
# # ---- direct running, normally use single CPU core. ----
# splist = []
# for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout):
# splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed']))


# # ---- only for the Fast Computation of Shortest Path Kernel (FCSP) # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP)
# sp_ml = [0] * len(Gn) # shortest path matrices # sp_ml = [0] * len(Gn) # shortest path matrices
@@ -149,9 +174,11 @@ def structuralspkernel(*args,


# ---- use pool.imap_unordered to parallel and track progress. ---- # ---- use pool.imap_unordered to parallel and track progress. ----
pool = Pool(n_jobs) pool = Pool(n_jobs)
do_partial = partial(structuralspkernel_do, Gn, splist, ds_attrs,
node_label, edge_label, node_kernels, edge_kernels)
itr = combinations_with_replacement(range(0, len(Gn)), 2)
do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
node_kernels, edge_kernels)
itr = zip(combinations_with_replacement(Gn, 2),
combinations_with_replacement(splist, 2),
combinations_with_replacement(range(0, len(Gn)), 2))
len_itr = int(len(Gn) * (len(Gn) + 1) / 2) len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
if len_itr < 1000 * n_jobs: if len_itr < 1000 * n_jobs:
chunksize = int(len_itr / n_jobs) + 1 chunksize = int(len_itr / n_jobs) + 1
@@ -166,36 +193,36 @@ def structuralspkernel(*args,
pool.close() pool.close()
pool.join() pool.join()


# # ---- use pool.map to parallel. ----
# # result_perf = pool.map(do_partial, itr)
# do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
# itr = combinations_with_replacement(range(0, len(Gn)), 2)
# for i, j, kernel in tqdm(
# pool.map(do_partial, itr), desc='calculating kernels',
# file=sys.stdout):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel
# pool.close()
# pool.join()

# # ---- use joblib.Parallel to parallel and track progress. ----
# result_perf = Parallel(
# n_jobs=n_jobs, verbose=10)(
# delayed(do_partial)(ij)
# for ij in combinations_with_replacement(range(0, len(Gn)), 2))
# result_perf = [
# do_partial(ij)
# for ij in combinations_with_replacement(range(0, len(Gn)), 2)
# ]
# for i in result_perf:
# Kmatrix[i[0]][i[1]] = i[2]
# Kmatrix[i[1]][i[0]] = i[2]
# # ---- use pool.imap_unordered to parallel and track progress. ----
# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
# node_kernels, edge_kernels)
# itr = zip(combinations_with_replacement(Gn, 2),
# combinations_with_replacement(splist, 2),
# combinations_with_replacement(range(0, len(Gn)), 2))
# len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
# if len_itr < 1000 * n_jobs:
# chunksize = int(len_itr / n_jobs) + 1
# else:
# chunksize = 1000
# from contextlib import closing
# with closing(Pool(n_jobs)) as pool:
# for i, j, kernel in tqdm(
# pool.imap_unordered(do_partial, itr, 1000),
# desc='calculating kernels',
# file=sys.stdout):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel
# pool.close()
# pool.join()



# # ---- direct running, normally use single CPU core. ---- # # ---- direct running, normally use single CPU core. ----
# itr = combinations_with_replacement(range(0, len(Gn)), 2)
# itr = zip(combinations_with_replacement(Gn, 2),
# combinations_with_replacement(splist, 2),
# combinations_with_replacement(range(0, len(Gn)), 2))
# for gs in tqdm(itr, desc='calculating kernels', file=sys.stdout): # for gs in tqdm(itr, desc='calculating kernels', file=sys.stdout):
# i, j, kernel = structuralspkernel_do(Gn, splist, ds_attrs,
# node_label, edge_label, node_kernels, edge_kernels, gs)
# i, j, kernel = wrapper_ssp_do(ds_attrs, node_label, edge_label,
# node_kernels, edge_kernels, gs)
# if(kernel > 1): # if(kernel > 1):
# print("error here ") # print("error here ")
# Kmatrix[i][j] = kernel # Kmatrix[i][j] = kernel
@@ -209,18 +236,11 @@ def structuralspkernel(*args,
return Kmatrix, run_time return Kmatrix, run_time




def structuralspkernel_do(Gn, splist, ds_attrs, node_label, edge_label,
node_kernels, edge_kernels, ij):

iglobal = ij[0]
jglobal = ij[1]
g1 = Gn[iglobal]
g2 = Gn[jglobal]
spl1 = splist[iglobal]
spl2 = splist[jglobal]
def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label,
node_kernels, edge_kernels):
kernel = 0 kernel = 0


#try:
# First, compute shortest path matrices, method borrowed from FCSP. # First, compute shortest path matrices, method borrowed from FCSP.
if ds_attrs['node_labeled']: if ds_attrs['node_labeled']:
# node symb and non-synb labeled # node symb and non-synb labeled
@@ -369,11 +389,19 @@ def structuralspkernel_do(Gn, splist, ds_attrs, node_label, edge_label,
# kn1 = vk_mat[x1][x2] * vk_mat[y1][y2] # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
# kn2 = vk_mat[x1][y2] * vk_mat[y1][x2] # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
# Kmatrix += kn1 + kn2 # Kmatrix += kn1 + kn2
#except KeyError: # missing labels or attributes
# print("toto")
# pass
return kernel



return iglobal, jglobal, kernel
def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels,
edge_kernels, itr_item):
g1 = itr_item[0][0]
g2 = itr_item[0][1]
spl1 = itr_item[1][0]
spl2 = itr_item[1][1]
i = itr_item[2][0]
j = itr_item[2][1]
return i, j, structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs,
node_label, edge_label, node_kernels, edge_kernels)




def get_shortest_paths(G, weight, directed): def get_shortest_paths(G, weight, directed):
@@ -397,17 +425,21 @@ def get_shortest_paths(G, weight, directed):
for n1, n2 in combinations(G.nodes(), 2): for n1, n2 in combinations(G.nodes(), 2):
try: try:
spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight)) spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
except nx.NetworkXNoPath: # nodes not connected
# sp.append([])
pass
else:
sp += spltemp sp += spltemp
# each edge walk is counted twice, starting from both its extreme nodes. # each edge walk is counted twice, starting from both its extreme nodes.
if not directed: if not directed:
sp += [sptemp[::-1] for sptemp in spltemp] sp += [sptemp[::-1] for sptemp in spltemp]
except nx.NetworkXNoPath: # nodes not connected
# sp.append([])
pass
# add single nodes as length 0 paths. # add single nodes as length 0 paths.
sp += [[n] for n in G.nodes()] sp += [[n] for n in G.nodes()]
return sp return sp




def wrap_getSP(Gn, weight, directed, i):
return i, get_shortest_paths(Gn[i], weight, directed)
def wrapper_getSP(weight, directed, itr_item):
g = itr_item[0]
i = itr_item[1]
return i, get_shortest_paths(g, weight, directed)

+20 -21  pygraph/kernels/untilHPathKernel.py

@@ -13,7 +13,6 @@ from itertools import chain, combinations_with_replacement
from functools import partial from functools import partial
from multiprocessing import Pool from multiprocessing import Pool
from tqdm import tqdm from tqdm import tqdm
import traceback


import networkx as nx import networkx as nx
import numpy as np import numpy as np
@@ -77,15 +76,15 @@ def untilhpathkernel(*args,
# but this may cost a lot of memory for large datasets. # but this may cost a lot of memory for large datasets.
pool = Pool(n_jobs) pool = Pool(n_jobs)
all_paths = [[] for _ in range(len(Gn))] all_paths = [[] for _ in range(len(Gn))]
getps_partial = partial(wrap_find_all_paths_until_length, Gn, depth,
getps_partial = partial(wrapper_find_all_paths_until_length, depth,
ds_attrs, node_label, edge_label) ds_attrs, node_label, edge_label)
itr = zip(Gn, range(0, len(Gn)))
if len(Gn) < 1000 * n_jobs: if len(Gn) < 1000 * n_jobs:
chunksize = int(len(Gn) / n_jobs) + 1 chunksize = int(len(Gn) / n_jobs) + 1
else: else:
chunksize = 1000 chunksize = 1000
# chunksize = 300 # int(len(list(itr)) / n_jobs)
for i, ps in tqdm( for i, ps in tqdm(
pool.imap_unordered(getps_partial, range(0, len(Gn)), chunksize),
pool.imap_unordered(getps_partial, itr, chunksize),
desc='getting paths', file=sys.stdout): desc='getting paths', file=sys.stdout):
all_paths[i] = ps all_paths[i] = ps
pool.close() pool.close()
@@ -110,8 +109,9 @@ def untilhpathkernel(*args,
pass pass
else: else:
pool = Pool(n_jobs) pool = Pool(n_jobs)
do_partial = partial(_untilhpathkernel_do_naive, all_paths, k_func)
itr = combinations_with_replacement(range(0, len(Gn)), 2)
do_partial = partial(wrapper_uhpath_do_naive, k_func)
itr = zip(combinations_with_replacement(all_paths, 2),
combinations_with_replacement(range(0, len(Gn)), 2))
len_itr = int(len(Gn) * (len(Gn) + 1) / 2) len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
if len_itr < 1000 * n_jobs: if len_itr < 1000 * n_jobs:
chunksize = int(len_itr / n_jobs) + 1 chunksize = int(len_itr / n_jobs) + 1
@@ -216,7 +216,7 @@ def _untilhpathkernel_do_gst(gst1, gst2, paths1, paths2, k_func):
return kernel return kernel




def _untilhpathkernel_do_naive(paths_list, k_func, ij):
def _untilhpathkernel_do_naive(paths1, paths2, k_func):
"""Calculate path graph kernels up to depth d between 2 graphs naively. """Calculate path graph kernels up to depth d between 2 graphs naively.


Parameters Parameters
@@ -235,10 +235,6 @@ def _untilhpathkernel_do_naive(paths_list, k_func, ij):
kernel : float kernel : float
Path kernel up to h between 2 graphs. Path kernel up to h between 2 graphs.
""" """
iglobal = ij[0]
jglobal = ij[1]
paths1 = paths_list[iglobal]
paths2 = paths_list[jglobal]
all_paths = list(set(paths1 + paths2)) all_paths = list(set(paths1 + paths2))


if k_func == 'tanimoto': if k_func == 'tanimoto':
@@ -260,12 +256,18 @@ def _untilhpathkernel_do_naive(paths_list, k_func, ij):
kernel = np.sum(np.minimum(vector1, vector2)) / \ kernel = np.sum(np.minimum(vector1, vector2)) / \
np.sum(np.maximum(vector1, vector2)) np.sum(np.maximum(vector1, vector2))


return iglobal, jglobal, kernel
return kernel




# @todo: (can be removed maybe) this method find paths repetively, it could be faster.
def wrapper_uhpath_do_naive(k_func, itr_item):
plist1 = itr_item[0][0]
plist2 = itr_item[0][1]
i = itr_item[1][0]
j = itr_item[1][1]
return i, j, _untilhpathkernel_do_naive(plist1, plist2, k_func)




# @todo: (can be removed maybe) this method find paths repetively, it could be faster.
def find_all_paths_until_length(G, def find_all_paths_until_length(G,
length, length,
ds_attrs, ds_attrs,
@@ -368,15 +370,12 @@ def find_all_paths_until_length(G,
return [tuple([len(path)]) for path in all_paths] return [tuple([len(path)]) for path in all_paths]
def wrap_find_all_paths_until_length(Gn, length, ds_attrs, node_label,
edge_label, i):
try:
return i, find_all_paths_until_length(Gn[i], length, ds_attrs,
def wrapper_find_all_paths_until_length(length, ds_attrs, node_label,
edge_label, itr_item):
g = itr_item[0]
i = itr_item[1]
return i, find_all_paths_until_length(g, length, ds_attrs,
node_label=node_label, edge_label=edge_label) node_label=node_label, edge_label=edge_label)
except Exception as e:
traceback.print_exc()
print('')
raise e




def paths2GSuffixTree(paths):
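_untilhpathkernel_do_naive above compares two path lists through feature vectors over the union of their paths: the 'MinMax' branch shown in the hunk uses count vectors, and the 'tanimoto' branch (not shown in the hunk) is, going by its name, the standard Tanimoto coefficient on binary occurrence vectors. A rough, self-contained sketch under those assumptions; the helper name path_kernel and the toy paths are illustrative only:

import numpy as np


def path_kernel(paths1, paths2, k_func):
    # union of all distinct paths occurring in either graph
    all_paths = list(set(paths1 + paths2))
    if k_func == 'tanimoto':
        # binary occurrence vectors; Tanimoto = |A & B| / |A | B|
        v1 = np.array([int(p in paths1) for p in all_paths])
        v2 = np.array([int(p in paths2) for p in all_paths])
        dot = np.dot(v1, v2)
        return dot / (np.sum(v1) + np.sum(v2) - dot)
    else:  # 'MinMax': count vectors, as in the hunk above
        v1 = np.array([paths1.count(p) for p in all_paths])
        v2 = np.array([paths2.count(p) for p in all_paths])
        return np.sum(np.minimum(v1, v2)) / np.sum(np.maximum(v1, v2))


# toy usage: paths represented as tuples of node labels
p1 = [('C',), ('C', 'O'), ('C', 'O'), ('O',)]
p2 = [('C',), ('C', 'N'), ('O',)]
print(path_kernel(p1, p2, 'tanimoto'), path_kernel(p1, p2, 'MinMax'))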


+182 -124  pygraph/utils/model_selection_precomputed.py

@@ -206,54 +206,50 @@ def model_selection_for_precomputed_kernel(datafile,
'3. Fitting and predicting using nested cross validation. This could really take a while...' '3. Fitting and predicting using nested cross validation. This could really take a while...'
) )
# pool = Pool(n_jobs)
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
# train_pref = []
# val_pref = []
# test_pref = []
## if NUM_TRIALS < 1000 * n_jobs:
## chunksize = int(NUM_TRIALS / n_jobs) + 1
## else:
## chunksize = 1000
# chunksize = 1
# for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout):
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)
# pool.close()
# pool.join()
# ---- use pool.map to parallel. ----
pool = Pool(n_jobs) pool = Pool(n_jobs)
trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type) trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
train_pref = []
val_pref = []
test_pref = []
# if NUM_TRIALS < 100:
# chunksize, extra = divmod(NUM_TRIALS, n_jobs * 4)
# if extra:
# chunksize += 1
# else:
# chunksize = 100
chunksize = 1
for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout):
train_pref.append(o1)
val_pref.append(o2)
test_pref.append(o3)
pool.close()
pool.join()
# # ---- use pool.map to parallel. ----
# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
# train_pref = [item[0] for item in result_perf]
# val_pref = [item[1] for item in result_perf]
# test_pref = [item[2] for item in result_perf]
result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
train_pref = [item[0] for item in result_perf]
val_pref = [item[1] for item in result_perf]
test_pref = [item[2] for item in result_perf]
# # ---- use joblib.Parallel to parallel and track progress. ----
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
# result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS))
# train_pref = [item[0] for item in result_perf]
# val_pref = [item[1] for item in result_perf]
# test_pref = [item[2] for item in result_perf]
# # ---- direct running, normally use a single CPU core. ----
# train_pref = []
# val_pref = []
# test_pref = []
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)
# # ---- direct running, normally use a single CPU core. ----
# train_pref = []
# val_pref = []
# test_pref = []
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)
# print()
print() print()
print('4. Getting final performance...') print('4. Getting final performance...')
str_fw += '\nIII. Performance.\n\n' str_fw += '\nIII. Performance.\n\n'
# averages and confidences of performances on outer trials for each combination of parameters # averages and confidences of performances on outer trials for each combination of parameters
average_train_scores = np.mean(train_pref, axis=0) average_train_scores = np.mean(train_pref, axis=0)
# print('val_pref: ', val_pref[0][0])
average_val_scores = np.mean(val_pref, axis=0) average_val_scores = np.mean(val_pref, axis=0)
# print('test_pref: ', test_pref[0][0])
average_perf_scores = np.mean(test_pref, axis=0) average_perf_scores = np.mean(test_pref, axis=0)
# sample std is used here # sample std is used here
std_train_scores = np.std(train_pref, axis=0, ddof=1) std_train_scores = np.std(train_pref, axis=0, ddof=1)
@@ -264,6 +260,9 @@ def model_selection_for_precomputed_kernel(datafile,
best_val_perf = np.amin(average_val_scores) best_val_perf = np.amin(average_val_scores)
else: else:
best_val_perf = np.amax(average_val_scores) best_val_perf = np.amax(average_val_scores)
# print('average_val_scores: ', average_val_scores)
# print('best_val_perf: ', best_val_perf)
# print()
best_params_index = np.where(average_val_scores == best_val_perf) best_params_index = np.where(average_val_scores == best_val_perf)
# find smallest val std with best val perf. # find smallest val std with best val perf.
best_val_stds = [ best_val_stds = [
@@ -286,6 +285,9 @@ def model_selection_for_precomputed_kernel(datafile,
str_fw += 'best_val_perf: %s\n' % best_val_perf str_fw += 'best_val_perf: %s\n' % best_val_perf
str_fw += 'best_val_std: %s\n' % min_val_std str_fw += 'best_val_std: %s\n' % min_val_std
# print(best_params_index)
# print(best_params_index[0])
# print(average_perf_scores)
final_performance = [ final_performance = [
average_perf_scores[value][best_params_index[1][idx]] average_perf_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0]) for idx, value in enumerate(best_params_index[0])
@@ -429,23 +431,23 @@ def model_selection_for_precomputed_kernel(datafile,
'3. Fitting and predicting using nested cross validation. This could really take a while...' '3. Fitting and predicting using nested cross validation. This could really take a while...'
) )
# pool = Pool(n_jobs)
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
# train_pref = []
# val_pref = []
# test_pref = []
# if NUM_TRIALS < 100:
# chunksize, extra = divmod(NUM_TRIALS, n_jobs * 4)
# if extra:
# chunksize += 1
# else:
# chunksize = 100
# for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout):
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)
# pool.close()
# pool.join()
pool = Pool(n_jobs)
trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
train_pref = []
val_pref = []
test_pref = []
if NUM_TRIALS < 100:
chunksize, extra = divmod(NUM_TRIALS, n_jobs * 4)
if extra:
chunksize += 1
else:
chunksize = 100
for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout):
train_pref.append(o1)
val_pref.append(o2)
test_pref.append(o3)
pool.close()
pool.join()
# # ---- use pool.map to parallel. ---- # # ---- use pool.map to parallel. ----
# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS)) # result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
@@ -460,15 +462,15 @@ def model_selection_for_precomputed_kernel(datafile,
# val_pref = [item[1] for item in result_perf] # val_pref = [item[1] for item in result_perf]
# test_pref = [item[2] for item in result_perf] # test_pref = [item[2] for item in result_perf]


# ---- direct running, normally use a single CPU core. ----
train_pref = []
val_pref = []
test_pref = []
for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
train_pref.append(o1)
val_pref.append(o2)
test_pref.append(o3)
# # ---- direct running, normally use a single CPU core. ----
# train_pref = []
# val_pref = []
# test_pref = []
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)


print() print()
print('4. Getting final performance...') print('4. Getting final performance...')
@@ -623,89 +625,142 @@ def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, t
val_pref = np.zeros((len(param_list_pre_revised), len(param_list))) val_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
test_pref = np.zeros((len(param_list_pre_revised), len(param_list))) test_pref = np.zeros((len(param_list_pre_revised), len(param_list)))


# randomness added to seeds of split function below. "high" is "size" times
# 10 so that at least 10 different random output will be yielded. Remove
# these lines if identical outputs is required.
rdm_out = np.random.RandomState(seed=None)
rdm_seed_out_l = rdm_out.uniform(high=len(param_list_pre_revised) * 10,
size=len(param_list_pre_revised))
# print(trial, rdm_seed_out_l)
# print()
# loop for each outer param tuple # loop for each outer param tuple
for index_out, params_out in enumerate(param_list_pre_revised): for index_out, params_out in enumerate(param_list_pre_revised):
# split gram matrix and y to app and test sets. # split gram matrix and y to app and test sets.
indices = range(len(y)) indices = range(len(y))
# The argument "random_state" in function "train_test_split" can not be
# set to None, because it will use RandomState instance used by
# np.random, which is possible for multiple subprocesses to inherit the
# same seed if they forked at the same time, leading to identical
# random variates for different subprocesses. Instead, we use "trial"
# and "index_out" parameters to generate different seeds for different
# trials/subprocesses and outer loops. "rdm_seed_out_l" is used to add
# randomness into seeds, so that it yields a different output every
# time the program is run. To yield identical outputs every time,
# remove the second line below. Same method is used to the "KFold"
# function in the inner loop.
rdm_seed_out = (trial + 1) * (index_out + 1)
rdm_seed_out = (rdm_seed_out + int(rdm_seed_out_l[index_out])) % (2 ** 32 - 1)
# print(trial, rdm_seed_out)
X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split( X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split(
gram_matrices[index_out], y, indices, test_size=0.1, gram_matrices[index_out], y, indices, test_size=0.1,
random_state=None, shuffle=True)
random_state=rdm_seed_out, shuffle=True)
# print(trial, idx_app, idx_test)
# print()
X_app = X_app[:, idx_app] X_app = X_app[:, idx_app]
X_test = X_test[:, idx_app] X_test = X_test[:, idx_app]
y_app = np.array(y_app) y_app = np.array(y_app)
y_test = np.array(y_test) y_test = np.array(y_test)


rdm_seed_in_l = rdm_out.uniform(high=len(param_list) * 10,
size=len(param_list))
# loop for each inner param tuple # loop for each inner param tuple
for index_in, params_in in enumerate(param_list): for index_in, params_in in enumerate(param_list):
# print(index_in, params_in)
# if trial == 0:
# print(index_out, index_in)
# print('params_in: ', params_in)
# st = time.time() # st = time.time()
inner_cv = KFold(n_splits=10, shuffle=True, random_state=trial)
rdm_seed_in = (trial + 1) * (index_out + 1) * (index_in + 1)
# print("rdm_seed_in1: ", trial, index_in, rdm_seed_in)
rdm_seed_in = (rdm_seed_in + int(rdm_seed_in_l[index_in])) % (2 ** 32 - 1)
# print("rdm_seed_in2: ", trial, index_in, rdm_seed_in)
inner_cv = KFold(n_splits=10, shuffle=True, random_state=rdm_seed_in)
current_train_perf = [] current_train_perf = []
current_valid_perf = [] current_valid_perf = []
current_test_perf = [] current_test_perf = []


# For regression use the Kernel Ridge method # For regression use the Kernel Ridge method
try:
if model_type == 'regression':
kr = KernelRidge(kernel='precomputed', **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
kr.fit(X_app[train_index, :][:, train_index],
y_app[train_index])
# try:
if model_type == 'regression':
kr = KernelRidge(kernel='precomputed', **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
# print("train_index, valid_index: ", trial, index_in, train_index, valid_index)
# if trial == 0:
# print('train_index: ', train_index)
# print('valid_index: ', valid_index)
# print('idx_test: ', idx_test)
# print('y_app[train_index]: ', y_app[train_index])
# print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index])
# print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index])
kr.fit(X_app[train_index, :][:, train_index],
y_app[train_index])


# predict on the train, validation and test set
y_pred_train = kr.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = kr.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = kr.predict(
X_test[:, train_index])
# predict on the train, validation and test set
y_pred_train = kr.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = kr.predict(
X_app[valid_index, :][:, train_index])
# if trial == 0:
# print('y_pred_valid: ', y_pred_valid)
# print()
y_pred_test = kr.predict(
X_test[:, train_index])


# root mean squared errors
current_train_perf.append(
np.sqrt(
mean_squared_error(
y_app[train_index], y_pred_train)))
current_valid_perf.append(
np.sqrt(
mean_squared_error(
y_app[valid_index], y_pred_valid)))
current_test_perf.append(
np.sqrt(
mean_squared_error(
y_test, y_pred_test)))
# For clcassification use SVM
else:
svc = SVC(kernel='precomputed', cache_size=200,
verbose=False, **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
# root mean squared errors
current_train_perf.append(
np.sqrt(
mean_squared_error(
y_app[train_index], y_pred_train)))
current_valid_perf.append(
np.sqrt(
mean_squared_error(
y_app[valid_index], y_pred_valid)))
# if trial == 0:
# print(mean_squared_error(
# y_app[valid_index], y_pred_valid))
current_test_perf.append(
np.sqrt(
mean_squared_error(
y_test, y_pred_test)))
# For clcassification use SVM
else:
svc = SVC(kernel='precomputed', cache_size=200,
verbose=False, **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
# np.savez("bug.npy",X_app[train_index, :][:, train_index],y_app[train_index]) # np.savez("bug.npy",X_app[train_index, :][:, train_index],y_app[train_index])
svc.fit(X_app[train_index, :][:, train_index],
y_app[train_index])
# predict on the train, validation and test set
y_pred_train = svc.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = svc.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = svc.predict(
X_test[:, train_index])
# if trial == 0:
# print('train_index: ', train_index)
# print('valid_index: ', valid_index)
# print('idx_test: ', idx_test)
# print('y_app[train_index]: ', y_app[train_index])
# print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index])
# print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index])
svc.fit(X_app[train_index, :][:, train_index],
y_app[train_index])
# predict on the train, validation and test set
y_pred_train = svc.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = svc.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = svc.predict(
X_test[:, train_index])


# root mean squared errors
current_train_perf.append(
accuracy_score(y_app[train_index],
y_pred_train))
current_valid_perf.append(
accuracy_score(y_app[valid_index],
y_pred_valid))
current_test_perf.append(
accuracy_score(y_test, y_pred_test))
except ValueError:
print(sys.exc_info()[0])
print(params_out, params_in)
# root mean squared errors
current_train_perf.append(
accuracy_score(y_app[train_index],
y_pred_train))
current_valid_perf.append(
accuracy_score(y_app[valid_index],
y_pred_valid))
current_test_perf.append(
accuracy_score(y_test, y_pred_test))
# except ValueError:
# print(sys.exc_info()[0])
# print(params_out, params_in)


# average performance on inner splits # average performance on inner splits
train_pref[index_out][index_in] = np.mean( train_pref[index_out][index_in] = np.mean(
@@ -715,5 +770,8 @@ def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, t
test_pref[index_out][index_in] = np.mean( test_pref[index_out][index_in] = np.mean(
current_test_perf) current_test_perf)
# print(time.time() - st) # print(time.time() - st)
# if trial == 0:
# print('val_pref: ', val_pref)
# print('test_pref: ', test_pref)


return train_pref, val_pref, test_pref
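The comments added in the hunk above explain the fix this commit is named for: with random_state=None, subprocesses forked at the same time can inherit the same global NumPy state and draw identical splits, so each trial and outer parameter index now gets its own seed, shifted by a per-run random offset and reduced modulo 2**32 - 1 to stay in the valid seed range (the inner KFold seed is derived the same way with the inner index). A minimal sketch of the scheme; derive_seed and the toy sizes are illustrative names, not part of the module:

import numpy as np
from sklearn.model_selection import KFold, train_test_split


def derive_seed(trial, index, offsets):
    # combine trial number and parameter index, add a per-run random offset,
    # and keep the result inside the valid 32-bit seed range
    seed = (trial + 1) * (index + 1)
    return (seed + int(offsets[index])) % (2 ** 32 - 1)


n_params = 5                                 # stand-in for len(param_list_pre_revised)
y = np.arange(100)
rdm_out = np.random.RandomState(seed=None)   # fresh entropy on every run
# "high" is "size" times 10 so that at least 10 different offsets can occur
rdm_seed_out_l = rdm_out.uniform(high=n_params * 10, size=n_params)

for trial in range(3):
    for index_out in range(n_params):
        rdm_seed_out = derive_seed(trial, index_out, rdm_seed_out_l)
        y_app, y_test = train_test_split(
            y, test_size=0.1, random_state=rdm_seed_out, shuffle=True)
        inner_cv = KFold(n_splits=10, shuffle=True, random_state=rdm_seed_out)

Dropping the random offset (seeding with (trial + 1) * (index_out + 1) alone) makes every run reproducible, as the original comment notes.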
