@@ -12,7 +12,7 @@ import matplotlib.pyplot as plt | |||||
from numpy.linalg import eig | from numpy.linalg import eig | ||||
# read gram matrices from file. | # read gram matrices from file. | ||||
results_dir = 'results/structuralspkernel/' | |||||
results_dir = 'results/untilhpathkernel/myria' | |||||
ds_name = 'Letter-med' | ds_name = 'Letter-med' | ||||
gmfile = np.load(results_dir + '/' + ds_name + '.gm.npz') | gmfile = np.load(results_dir + '/' + ds_name + '.gm.npz') | ||||
#print('gm time: ', gmfile['gmtime']) | #print('gm time: ', gmfile['gmtime']) | ||||
@@ -6,94 +6,116 @@ | |||||
"metadata": { | "metadata": { | ||||
"scrolled": false | "scrolled": false | ||||
}, | }, | ||||
"outputs": [], | |||||
"outputs": [ | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"\n", | |||||
"MAO\n", | |||||
"\n", | |||||
"--- This is a classification problem ---\n", | |||||
"\n", | |||||
"\n", | |||||
"1. Loading dataset from file...\n", | |||||
"\n", | |||||
"2. Calculating gram matrices. This could take a while...\n", | |||||
"\n", | |||||
" None edge weight specified. Set all weight to 1.\n", | |||||
"\n", | |||||
"getting sp graphs: 68it [00:00, 692.11it/s]\n", | |||||
"calculating kernels: 2346it [00:05, 399.28it/s]\n", | |||||
"\n", | |||||
" --- shortest path kernel matrix of size 68 built in 6.345669507980347 seconds ---\n", | |||||
"\n", | |||||
"the gram matrix with parameters {'node_kernels': {'symb': <function deltakernel at 0x7fe240afd620>, 'nsymb': <function gaussiankernel at 0x7fe240afd9d8>, 'mix': functools.partial(<function kernelproduct at 0x7fe240aaf0d0>, <function deltakernel at 0x7fe240afd620>, <function gaussiankernel at 0x7fe240afd9d8>)}, 'n_jobs': 8} is: \n", | |||||
"\n", | |||||
"1 gram matrices are calculated, 0 of which are ignored.\n", | |||||
"\n", | |||||
"3. Fitting and predicting using nested cross validation. This could really take a while...\n", | |||||
"cross validation: 7it [00:09, 4.67s/it]" | |||||
] | |||||
} | |||||
], | |||||
"source": [ | "source": [ | ||||
"%load_ext line_profiler\n", | |||||
"%matplotlib inline\n", | |||||
"import functools\n", | "import functools\n", | ||||
"from libs import *\n", | "from libs import *\n", | ||||
"import multiprocessing\n", | "import multiprocessing\n", | ||||
"from sklearn.metrics.pairwise import rbf_kernel\n", | |||||
"\n", | |||||
"from pygraph.kernels.spKernel import spkernel, spkernel_do\n", | |||||
"from pygraph.utils.kernels import deltakernel, kernelsum\n", | |||||
"from pygraph.utils.model_selection_precomputed import trial_do\n", | |||||
"\n", | |||||
"dslist = [ \n", | |||||
" {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node symb\n", | |||||
"# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', \n", | |||||
"# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',}, # contains single node graph, node symb\n", | |||||
"# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',}, # node/edge symb\n", | |||||
"# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',}, # unlabeled\n", | |||||
"# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n", | |||||
"# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n", | |||||
" {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n", | |||||
" {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, # node symb/nsymb\n", | |||||
" {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # node/edge symb\n", | |||||
" {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n", | |||||
" 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n", | |||||
"\n", | "\n", | ||||
"# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n", | |||||
"# # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n", | |||||
"# # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb\n", | |||||
"# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},\n", | |||||
"# \n", | |||||
"# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb\n", | |||||
"# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb\n", | |||||
"# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb\n", | |||||
"# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb\n", | |||||
"# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb\n", | |||||
"\n", | |||||
"# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb\n", | |||||
"# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb\n", | |||||
"# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb\n", | |||||
"# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',\n", | |||||
"# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n", | |||||
"# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',\n", | |||||
"# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n", | |||||
"# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',\n", | |||||
"# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb\n", | |||||
" \n", | |||||
"# # not working below\n", | |||||
"# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},\n", | |||||
"# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},\n", | |||||
"# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},\n", | |||||
"# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},\n", | |||||
"from pygraph.kernels.spKernel import spkernel\n", | |||||
"from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct\n", | |||||
"#from pygraph.utils.model_selection_precomputed import trial_do\n", | |||||
"\n", | |||||
"dslist = [\n", | |||||
"# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',\n", | |||||
"# 'task': 'regression'}, # node symb\n", | |||||
"# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',\n", | |||||
"# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, \n", | |||||
"# # contains single node graph, node symb\n", | |||||
" {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb\n", | |||||
"# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled\n", | |||||
"# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n", | |||||
"# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n", | |||||
"# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n", | |||||
"# # node nsymb\n", | |||||
"# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n", | |||||
"# # node symb/nsymb\n", | |||||
"# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n", | |||||
" # node/edge symb\n", | |||||
"# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n", | |||||
"# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n", | |||||
"\n", | |||||
" # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n", | |||||
" # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n", | |||||
" # # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb\n", | |||||
" # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},\n", | |||||
" #\n", | |||||
" # # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb\n", | |||||
" # # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb\n", | |||||
" # # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb\n", | |||||
" # # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb\n", | |||||
" # # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb\n", | |||||
"\n", | |||||
" # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb\n", | |||||
" # # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb\n", | |||||
" # # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb\n", | |||||
" # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',\n", | |||||
" # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n", | |||||
" # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',\n", | |||||
" # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n", | |||||
" # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',\n", | |||||
" # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb\n", | |||||
"\n", | |||||
" # # not working below\n", | |||||
" # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},\n", | |||||
" # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},\n", | |||||
" # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},\n", | |||||
" # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},\n", | |||||
"]\n", | "]\n", | ||||
"estimator = spkernel\n", | "estimator = spkernel\n", | ||||
"mixkernel = functools.partial(kernelsum, deltakernel, rbf_kernel)\n", | |||||
"param_grid_precomputed = {'node_kernels': [{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]}\n", | |||||
"param_grid = [{'C': np.logspace(-10, 10, num = 41, base = 10)}, \n", | |||||
" {'alpha': np.logspace(-10, 10, num = 41, base = 10)}]\n", | |||||
"mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)\n", | |||||
"param_grid_precomputed = {'node_kernels': [\n", | |||||
" {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}\n", | |||||
"param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},\n", | |||||
" {'alpha': np.logspace(-10, 10, num=41, base=10)}]\n", | |||||
"\n", | "\n", | ||||
"for ds in dslist:\n", | "for ds in dslist:\n", | ||||
" print()\n", | " print()\n", | ||||
" print(ds['name'])\n", | " print(ds['name'])\n", | ||||
" model_selection_for_precomputed_kernel(\n", | " model_selection_for_precomputed_kernel(\n", | ||||
" ds['dataset'], \n", | |||||
" estimator, \n", | |||||
" param_grid_precomputed, \n", | |||||
" (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \n", | |||||
" (ds['task'] if 'task' in ds else 'classification'), \n", | |||||
" ds['dataset'],\n", | |||||
" estimator,\n", | |||||
" param_grid_precomputed,\n", | |||||
" (param_grid[1] if ('task' in ds and ds['task']\n", | |||||
" == 'regression') else param_grid[0]),\n", | |||||
" (ds['task'] if 'task' in ds else 'classification'),\n", | |||||
" NUM_TRIALS=30,\n", | " NUM_TRIALS=30,\n", | ||||
" datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),\n", | " datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),\n", | ||||
" extra_params=(ds['extra_params'] if 'extra_params' in ds else None),\n", | " extra_params=(ds['extra_params'] if 'extra_params' in ds else None),\n", | ||||
" ds_name=ds['name'],\n", | " ds_name=ds['name'],\n", | ||||
" n_jobs=multiprocessing.cpu_count())\n", | |||||
" \n", | |||||
"# %lprun -f trial_do -f spkernel -f spkernel_do -f model_selection_for_precomputed_kernel \\\n", | |||||
"# model_selection_for_precomputed_kernel( \\\n", | |||||
"# ds['dataset'], \\\n", | |||||
"# estimator, \\\n", | |||||
"# param_grid_precomputed, \\\n", | |||||
"# (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \\\n", | |||||
"# (ds['task'] if 'task' in ds else 'classification'), \\\n", | |||||
"# NUM_TRIALS=30, \\\n", | |||||
"# datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), \\\n", | |||||
"# extra_params=(ds['extra_params'] if 'extra_params' in ds else None), \\\n", | |||||
"# ds_name=ds['name'], \\\n", | |||||
"# n_jobs=multiprocessing.cpu_count()) \n", | |||||
" print()" | |||||
" n_jobs=multiprocessing.cpu_count(),\n", | |||||
" read_gm_from_file=False)\n", | |||||
" print()\n" | |||||
] | ] | ||||
}, | }, | ||||
{ | { | ||||
@@ -713,8 +735,8 @@ | |||||
], | ], | ||||
"metadata": { | "metadata": { | ||||
"kernelspec": { | "kernelspec": { | ||||
"display_name": "Python 3 (Spyder)", | |||||
"language": "python3", | |||||
"display_name": "Python 3", | |||||
"language": "python", | |||||
"name": "python3" | "name": "python3" | ||||
}, | }, | ||||
"language_info": { | "language_info": { | ||||
@@ -727,7 +749,7 @@ | |||||
"name": "python", | "name": "python", | ||||
"nbconvert_exporter": "python", | "nbconvert_exporter": "python", | ||||
"pygments_lexer": "ipython3", | "pygments_lexer": "ipython3", | ||||
"version": "3.5.2" | |||||
"version": "3.6.6" | |||||
} | } | ||||
}, | }, | ||||
"nbformat": 4, | "nbformat": 4, | ||||
@@ -7,21 +7,21 @@ from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct | |||||
#from pygraph.utils.model_selection_precomputed import trial_do | #from pygraph.utils.model_selection_precomputed import trial_do | ||||
dslist = [ | dslist = [ | ||||
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||||
# 'task': 'regression'}, # node symb | |||||
# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', | |||||
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, | |||||
# # contains single node graph, node symb | |||||
# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb | |||||
# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled | |||||
# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | |||||
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb | |||||
{'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||||
'task': 'regression'}, # node symb | |||||
{'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', | |||||
'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, | |||||
# contains single node graph, node symb | |||||
{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb | |||||
{'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled | |||||
{'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | |||||
'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb | |||||
{'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | ||||
# node nsymb | # node nsymb | ||||
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | ||||
# node symb/nsymb | # node symb/nsymb | ||||
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | ||||
# # node/edge symb | |||||
# node/edge symb | |||||
# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', | # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', | ||||
# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb | # 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb | ||||
@@ -56,7 +56,7 @@ estimator = spkernel | |||||
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | ||||
param_grid_precomputed = {'node_kernels': [ | param_grid_precomputed = {'node_kernels': [ | ||||
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]} | {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]} | ||||
param_grid = [{'C': np.logspace(-10, 3, num=27, base=10)}, | |||||
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | |||||
{'alpha': np.logspace(-10, 10, num=41, base=10)}] | {'alpha': np.logspace(-10, 10, num=41, base=10)}] | ||||
for ds in dslist: | for ds in dslist: | ||||
@@ -23,10 +23,10 @@ dslist = [ | |||||
# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled | # {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled | ||||
# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | # {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | ||||
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb | # 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb | ||||
{'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | |||||
# node nsymb | |||||
# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | |||||
# # node symb/nsymb | |||||
# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | |||||
# # node nsymb | |||||
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | |||||
# node symb/nsymb | |||||
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | ||||
# # node/edge symb | # # node/edge symb | ||||
# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', | # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', | ||||
@@ -39,8 +39,8 @@ dslist = [ | |||||
# | # | ||||
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb | # # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb | ||||
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb | # # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb | ||||
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb, missing values | |||||
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb, missing values | |||||
# {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb, missing values | |||||
# {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb, missing values | |||||
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb | # # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb | ||||
# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb | # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb | ||||
@@ -53,8 +53,8 @@ dslist = [ | |||||
# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf', | # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf', | ||||
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb | # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb | ||||
# # not working below | |||||
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',}, | |||||
# # not working below | |||||
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',}, | |||||
# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',}, | # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',}, | ||||
# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',}, | # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',}, | ||||
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, | # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, | ||||
@@ -62,7 +62,7 @@ dslist = [ | |||||
] | ] | ||||
estimator = untilhpathkernel | estimator = untilhpathkernel | ||||
mixkernel = functools.partial(kernelproduct, deltakernel, rbf_kernel) | mixkernel = functools.partial(kernelproduct, deltakernel, rbf_kernel) | ||||
param_grid_precomputed = {'depth': np.linspace(7, 10, 10), | |||||
param_grid_precomputed = {'depth': np.linspace(1, 10, 10), | |||||
'k_func': ['tanimoto', 'MinMax']} | 'k_func': ['tanimoto', 'MinMax']} | ||||
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | ||||
{'alpha': np.logspace(-10, 10, num=41, base=10)}] | {'alpha': np.logspace(-10, 10, num=41, base=10)}] | ||||
@@ -1,77 +0,0 @@ | |||||
#!/usr/bin/env python3 | |||||
# -*- coding: utf-8 -*- | |||||
""" | |||||
Created on Fri Sep 28 16:37:29 2018 | |||||
@author: ljia | |||||
""" | |||||
import functools | |||||
from libs import * | |||||
import multiprocessing | |||||
from sklearn.metrics.pairwise import rbf_kernel | |||||
from pygraph.kernels.structuralspKernel import structuralspkernel | |||||
from pygraph.utils.kernels import deltakernel, kernelproduct | |||||
dslist = [ | |||||
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||||
# 'task': 'regression'}, # node symb | |||||
# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', | |||||
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, # contains single node graph, node symb | |||||
{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb | |||||
# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb | |||||
# # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb | |||||
# # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb | |||||
# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'}, | |||||
# | |||||
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb | |||||
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb | |||||
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb | |||||
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb | |||||
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb | |||||
# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb | |||||
# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb | |||||
# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb | |||||
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat', | |||||
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb | |||||
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat', | |||||
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb | |||||
# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf', | |||||
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb | |||||
# # not working below | |||||
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',}, | |||||
# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',}, | |||||
# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',}, | |||||
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, | |||||
] | |||||
estimator = structuralspkernel | |||||
mixkernel = functools.partial(kernelproduct, deltakernel, rbf_kernel) | |||||
param_grid_precomputed = {'node_kernels': | |||||
[{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}], | |||||
'edge_kernels': | |||||
[{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]} | |||||
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | |||||
{'alpha': np.logspace(-10, 10, num=41, base=10)}] | |||||
for ds in dslist: | |||||
print() | |||||
print(ds['name']) | |||||
model_selection_for_precomputed_kernel( | |||||
ds['dataset'], | |||||
estimator, | |||||
param_grid_precomputed, | |||||
(param_grid[1] if ('task' in ds and ds['task'] | |||||
== 'regression') else param_grid[0]), | |||||
(ds['task'] if 'task' in ds else 'classification'), | |||||
NUM_TRIALS=30, | |||||
datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), | |||||
extra_params=(ds['extra_params'] if 'extra_params' in ds else None), | |||||
ds_name=ds['name'], | |||||
n_jobs=multiprocessing.cpu_count(), | |||||
read_gm_from_file=False) | |||||
print() |
@@ -85,21 +85,20 @@ def commonwalkkernel(*args, | |||||
# ---- use pool.imap_unordered to parallel and track progress. ---- | # ---- use pool.imap_unordered to parallel and track progress. ---- | ||||
pool = Pool(n_jobs) | pool = Pool(n_jobs) | ||||
itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||||
itr = zip(combinations_with_replacement(Gn, 2), | |||||
combinations_with_replacement(range(0, len(Gn)), 2)) | |||||
len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | ||||
if len_itr < 1000 * n_jobs: | if len_itr < 1000 * n_jobs: | ||||
chunksize = int(len_itr / n_jobs) + 1 | chunksize = int(len_itr / n_jobs) + 1 | ||||
else: | else: | ||||
chunksize = 100 | |||||
chunksize = 1000 | |||||
# direct product graph method - exponential | # direct product graph method - exponential | ||||
if compute_method == 'exp': | if compute_method == 'exp': | ||||
do_partial = partial(_commonwalkkernel_exp, Gn, node_label, edge_label, | |||||
weight) | |||||
do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight) | |||||
# direct product graph method - geometric | # direct product graph method - geometric | ||||
elif compute_method == 'geo': | elif compute_method == 'geo': | ||||
do_partial = partial(_commonwalkkernel_geo, Gn, node_label, edge_label, | |||||
weight) | |||||
do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight) | |||||
for i, j, kernel in tqdm( | for i, j, kernel in tqdm( | ||||
pool.imap_unordered(do_partial, itr, chunksize), | pool.imap_unordered(do_partial, itr, chunksize), | ||||
@@ -153,7 +152,7 @@ def commonwalkkernel(*args, | |||||
return Kmatrix, run_time | return Kmatrix, run_time | ||||
def _commonwalkkernel_exp(Gn, node_label, edge_label, beta, ij): | |||||
def _commonwalkkernel_exp(g1, g2, node_label, edge_label, beta): | |||||
"""Calculate walk graph kernels up to n between 2 graphs using exponential | """Calculate walk graph kernels up to n between 2 graphs using exponential | ||||
series. | series. | ||||
@@ -175,10 +174,6 @@ def _commonwalkkernel_exp(Gn, node_label, edge_label, beta, ij): | |||||
kernel : float | kernel : float | ||||
The common walk Kernel between 2 graphs. | The common walk Kernel between 2 graphs. | ||||
""" | """ | ||||
iglobal = ij[0] | |||||
jglobal = ij[1] | |||||
g1 = Gn[iglobal] | |||||
g2 = Gn[jglobal] | |||||
# get tensor product / direct product | # get tensor product / direct product | ||||
gp = direct_product(g1, g2, node_label, edge_label) | gp = direct_product(g1, g2, node_label, edge_label) | ||||
@@ -219,10 +214,18 @@ def _commonwalkkernel_exp(Gn, node_label, edge_label, beta, ij): | |||||
# print(np.exp(weight * A)) | # print(np.exp(weight * A)) | ||||
# print('-------') | # print('-------') | ||||
return iglobal, jglobal, exp_D.sum() | |||||
return exp_D.sum() | |||||
def _commonwalkkernel_geo(Gn, node_label, edge_label, gamma, ij): | |||||
def wrapper_cw_exp(node_label, edge_label, beta, itr_item): | |||||
g1 = itr_item[0][0] | |||||
g2 = itr_item[0][1] | |||||
i = itr_item[1][0] | |||||
j = itr_item[1][1] | |||||
return i, j, _commonwalkkernel_exp(g1, g2, node_label, edge_label, beta) | |||||
def _commonwalkkernel_geo(g1, g2, node_label, edge_label, gamma): | |||||
"""Calculate common walk graph kernels up to n between 2 graphs using | """Calculate common walk graph kernels up to n between 2 graphs using | ||||
geometric series. | geometric series. | ||||
@@ -244,19 +247,22 @@ def _commonwalkkernel_geo(Gn, node_label, edge_label, gamma, ij): | |||||
kernel : float | kernel : float | ||||
The common walk Kernel between 2 graphs. | The common walk Kernel between 2 graphs. | ||||
""" | """ | ||||
iglobal = ij[0] | |||||
jglobal = ij[1] | |||||
g1 = Gn[iglobal] | |||||
g2 = Gn[jglobal] | |||||
# get tensor product / direct product | # get tensor product / direct product | ||||
gp = direct_product(g1, g2, node_label, edge_label) | gp = direct_product(g1, g2, node_label, edge_label) | ||||
A = nx.adjacency_matrix(gp).todense() | A = nx.adjacency_matrix(gp).todense() | ||||
mat = np.identity(len(A)) - gamma * A | mat = np.identity(len(A)) - gamma * A | ||||
try: | try: | ||||
return iglobal, jglobal, mat.I.sum() | |||||
return mat.I.sum() | |||||
except np.linalg.LinAlgError: | except np.linalg.LinAlgError: | ||||
return iglobal, jglobal, np.nan | |||||
return np.nan | |||||
def wrapper_cw_geo(node_label, edge_label, gama, itr_item): | |||||
g1 = itr_item[0][0] | |||||
g2 = itr_item[0][1] | |||||
i = itr_item[1][0] | |||||
j = itr_item[1][1] | |||||
return i, j, _commonwalkkernel_geo(g1, g2, node_label, edge_label, gama) | |||||
def _commonwalkkernel_brute(walks1, | def _commonwalkkernel_brute(walks1, | ||||
@@ -8,7 +8,6 @@ import sys | |||||
import time | import time | ||||
from itertools import combinations_with_replacement, product | from itertools import combinations_with_replacement, product | ||||
from functools import partial | from functools import partial | ||||
from joblib import Parallel, delayed | |||||
from multiprocessing import Pool | from multiprocessing import Pool | ||||
from tqdm import tqdm | from tqdm import tqdm | ||||
@@ -89,7 +88,8 @@ def spkernel(*args, | |||||
pool = Pool(n_jobs) | pool = Pool(n_jobs) | ||||
# get shortest path graphs of Gn | # get shortest path graphs of Gn | ||||
getsp_partial = partial(wrap_getSPGraph, Gn, weight) | |||||
getsp_partial = partial(wrapper_getSPGraph, weight) | |||||
itr = zip(Gn, range(0, len(Gn))) | |||||
if len(Gn) < 1000 * n_jobs: | if len(Gn) < 1000 * n_jobs: | ||||
# # use default chunksize as pool.map when iterable is less than 100 | # # use default chunksize as pool.map when iterable is less than 100 | ||||
# chunksize, extra = divmod(len(Gn), n_jobs * 4) | # chunksize, extra = divmod(len(Gn), n_jobs * 4) | ||||
@@ -98,9 +98,8 @@ def spkernel(*args, | |||||
chunksize = int(len(Gn) / n_jobs) + 1 | chunksize = int(len(Gn) / n_jobs) + 1 | ||||
else: | else: | ||||
chunksize = 1000 | chunksize = 1000 | ||||
# chunksize = 300 # int(len(list(itr)) / n_jobs) | |||||
for i, g in tqdm( | for i, g in tqdm( | ||||
pool.imap_unordered(getsp_partial, range(0, len(Gn)), chunksize), | |||||
pool.imap_unordered(getsp_partial, itr, chunksize), | |||||
desc='getting sp graphs', file=sys.stdout): | desc='getting sp graphs', file=sys.stdout): | ||||
Gn[i] = g | Gn[i] = g | ||||
pool.close() | pool.close() | ||||
@@ -144,8 +143,9 @@ def spkernel(*args, | |||||
# ---- use pool.imap_unordered to parallel and track progress. ---- | # ---- use pool.imap_unordered to parallel and track progress. ---- | ||||
pool = Pool(n_jobs) | pool = Pool(n_jobs) | ||||
do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels) | |||||
itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||||
do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels) | |||||
itr = zip(combinations_with_replacement(Gn, 2), | |||||
combinations_with_replacement(range(0, len(Gn)), 2)) | |||||
len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | ||||
if len_itr < 1000 * n_jobs: | if len_itr < 1000 * n_jobs: | ||||
chunksize = int(len_itr / n_jobs) + 1 | chunksize = int(len_itr / n_jobs) + 1 | ||||
@@ -200,15 +200,10 @@ def spkernel(*args, | |||||
return Kmatrix, run_time, idx | return Kmatrix, run_time, idx | ||||
def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij): | |||||
i = ij[0] | |||||
j = ij[1] | |||||
g1 = Gn[i] | |||||
g2 = Gn[j] | |||||
def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels): | |||||
kernel = 0 | kernel = 0 | ||||
# try: | |||||
# compute shortest path matrices first, method borrowed from FCSP. | # compute shortest path matrices first, method borrowed from FCSP. | ||||
if ds_attrs['node_labeled']: | if ds_attrs['node_labeled']: | ||||
# node symb and non-synb labeled | # node symb and non-synb labeled | ||||
@@ -243,7 +238,7 @@ def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij): | |||||
g1.edges(data=True), g2.edges(data=True)): | g1.edges(data=True), g2.edges(data=True)): | ||||
if e1[2]['cost'] == e2[2]['cost']: | if e1[2]['cost'] == e2[2]['cost']: | ||||
kernel += 1 | kernel += 1 | ||||
return i, j, kernel | |||||
return kernel | |||||
# compute graph kernels | # compute graph kernels | ||||
if ds_attrs['is_directed']: | if ds_attrs['is_directed']: | ||||
@@ -293,12 +288,20 @@ def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij): | |||||
# kn1 = vk_mat[x1][x2] * vk_mat[y1][y2] | # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2] | ||||
# kn2 = vk_mat[x1][y2] * vk_mat[y1][x2] | # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2] | ||||
# kernel += kn1 + kn2 | # kernel += kn1 + kn2 | ||||
# except KeyError: # missing labels or attributes | |||||
# pass | |||||
return i, j, kernel | |||||
return kernel | |||||
def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr_item): | |||||
g1 = itr_item[0][0] | |||||
g2 = itr_item[0][1] | |||||
i = itr_item[1][0] | |||||
j = itr_item[1][1] | |||||
return i, j, spkernel_do(g1, g2, ds_attrs, node_label, node_kernels) | |||||
def wrap_getSPGraph(Gn, weight, i): | |||||
return i, getSPGraph(Gn[i], edge_weight=weight) | |||||
# return i, nx.floyd_warshall_numpy(Gn[i], weight=weight) | |||||
def wrapper_getSPGraph(weight, itr_item): | |||||
g = itr_item[0] | |||||
i = itr_item[1] | |||||
return i, getSPGraph(g, edge_weight=weight) | |||||
# return i, nx.floyd_warshall_numpy(g, weight=weight) |
@@ -12,7 +12,6 @@ import sys | |||||
import time | import time | ||||
from itertools import combinations, combinations_with_replacement, product | from itertools import combinations, combinations_with_replacement, product | ||||
from functools import partial | from functools import partial | ||||
from joblib import Parallel, delayed | |||||
from multiprocessing import Pool | from multiprocessing import Pool | ||||
from tqdm import tqdm | from tqdm import tqdm | ||||
@@ -71,7 +70,6 @@ def structuralspkernel(*args, | |||||
""" | """ | ||||
# pre-process | # pre-process | ||||
Gn = args[0] if len(args) == 1 else [args[0], args[1]] | Gn = args[0] if len(args) == 1 else [args[0], args[1]] | ||||
weight = None | weight = None | ||||
if edge_weight is None: | if edge_weight is None: | ||||
print('\n None edge weight specified. Set all weight to 1.\n') | print('\n None edge weight specified. Set all weight to 1.\n') | ||||
@@ -98,34 +96,61 @@ def structuralspkernel(*args, | |||||
start_time = time.time() | start_time = time.time() | ||||
# get shortest paths of each graph in Gn | # get shortest paths of each graph in Gn | ||||
splist = [[] for _ in range(len(Gn))] | |||||
splist = [None] * len(Gn) | |||||
pool = Pool(n_jobs) | pool = Pool(n_jobs) | ||||
# get shortest path graphs of Gn | # get shortest path graphs of Gn | ||||
getsp_partial = partial(wrap_getSP, Gn, weight, ds_attrs['is_directed']) | |||||
getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed']) | |||||
itr = zip(Gn, range(0, len(Gn))) | |||||
if len(Gn) < 1000 * n_jobs: | if len(Gn) < 1000 * n_jobs: | ||||
chunksize = int(len(Gn) / n_jobs) + 1 | chunksize = int(len(Gn) / n_jobs) + 1 | ||||
else: | else: | ||||
chunksize = 1000 | chunksize = 1000 | ||||
# chunksize = 300 # int(len(list(itr)) / n_jobs) | # chunksize = 300 # int(len(list(itr)) / n_jobs) | ||||
for i, sp in tqdm( | for i, sp in tqdm( | ||||
pool.imap_unordered(getsp_partial, range(0, len(Gn)), chunksize), | |||||
pool.imap_unordered(getsp_partial, itr, chunksize), | |||||
desc='getting shortest paths', | desc='getting shortest paths', | ||||
file=sys.stdout): | file=sys.stdout): | ||||
splist[i] = sp | splist[i] = sp | ||||
# time.sleep(10) | |||||
pool.close() | pool.close() | ||||
pool.join() | pool.join() | ||||
# # ---- use pool.map to parallel ---- | |||||
# result_sp = pool.map(getsp_partial, range(0, len(Gn))) | |||||
# for i in result_sp: | |||||
# Gn[i[0]] = i[1] | |||||
# or | |||||
# getsp_partial = partial(wrap_getSP, Gn, weight) | |||||
# for i, g in tqdm( | |||||
# pool.map(getsp_partial, range(0, len(Gn))), | |||||
# desc='getting sp graphs', | |||||
# file=sys.stdout): | |||||
# Gn[i] = g | |||||
# # get shortest paths of each graph in Gn | |||||
# splist = [[] for _ in range(len(Gn))] | |||||
# # get shortest path graphs of Gn | |||||
# getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed']) | |||||
# itr = zip(Gn, range(0, len(Gn))) | |||||
# if len(Gn) < 1000 * n_jobs: | |||||
# chunksize = int(len(Gn) / n_jobs) + 1 | |||||
# else: | |||||
# chunksize = 1000 | |||||
# # chunksize = 300 # int(len(list(itr)) / n_jobs) | |||||
# from contextlib import closing | |||||
# with closing(Pool(n_jobs)) as pool: | |||||
## for i, sp in tqdm( | |||||
# res = pool.imap_unordered(getsp_partial, itr, 10) | |||||
## desc='getting shortest paths', | |||||
## file=sys.stdout): | |||||
## splist[i] = sp | |||||
## time.sleep(10) | |||||
# pool.close() | |||||
# pool.join() | |||||
# ss = 0 | |||||
# ss += sys.getsizeof(splist) | |||||
# for spss in splist: | |||||
# ss += sys.getsizeof(spss) | |||||
# for spp in spss: | |||||
# ss += sys.getsizeof(spp) | |||||
# time.sleep(20) | |||||
# # ---- direct running, normally use single CPU core. ---- | |||||
# splist = [] | |||||
# for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout): | |||||
# splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed'])) | |||||
# # ---- only for the Fast Computation of Shortest Path Kernel (FCSP) | # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP) | ||||
# sp_ml = [0] * len(Gn) # shortest path matrices | # sp_ml = [0] * len(Gn) # shortest path matrices | ||||
@@ -149,9 +174,11 @@ def structuralspkernel(*args, | |||||
# ---- use pool.imap_unordered to parallel and track progress. ---- | # ---- use pool.imap_unordered to parallel and track progress. ---- | ||||
pool = Pool(n_jobs) | pool = Pool(n_jobs) | ||||
do_partial = partial(structuralspkernel_do, Gn, splist, ds_attrs, | |||||
node_label, edge_label, node_kernels, edge_kernels) | |||||
itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||||
do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, | |||||
node_kernels, edge_kernels) | |||||
itr = zip(combinations_with_replacement(Gn, 2), | |||||
combinations_with_replacement(splist, 2), | |||||
combinations_with_replacement(range(0, len(Gn)), 2)) | |||||
len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | ||||
if len_itr < 1000 * n_jobs: | if len_itr < 1000 * n_jobs: | ||||
chunksize = int(len_itr / n_jobs) + 1 | chunksize = int(len_itr / n_jobs) + 1 | ||||
@@ -166,36 +193,36 @@ def structuralspkernel(*args, | |||||
pool.close() | pool.close() | ||||
pool.join() | pool.join() | ||||
# # ---- use pool.map to parallel. ---- | |||||
# # result_perf = pool.map(do_partial, itr) | |||||
# do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels) | |||||
# itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||||
# for i, j, kernel in tqdm( | |||||
# pool.map(do_partial, itr), desc='calculating kernels', | |||||
# file=sys.stdout): | |||||
# Kmatrix[i][j] = kernel | |||||
# Kmatrix[j][i] = kernel | |||||
# pool.close() | |||||
# pool.join() | |||||
# # ---- use joblib.Parallel to parallel and track progress. ---- | |||||
# result_perf = Parallel( | |||||
# n_jobs=n_jobs, verbose=10)( | |||||
# delayed(do_partial)(ij) | |||||
# for ij in combinations_with_replacement(range(0, len(Gn)), 2)) | |||||
# result_perf = [ | |||||
# do_partial(ij) | |||||
# for ij in combinations_with_replacement(range(0, len(Gn)), 2) | |||||
# ] | |||||
# for i in result_perf: | |||||
# Kmatrix[i[0]][i[1]] = i[2] | |||||
# Kmatrix[i[1]][i[0]] = i[2] | |||||
# # ---- use pool.imap_unordered to parallel and track progress. ---- | |||||
# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, | |||||
# node_kernels, edge_kernels) | |||||
# itr = zip(combinations_with_replacement(Gn, 2), | |||||
# combinations_with_replacement(splist, 2), | |||||
# combinations_with_replacement(range(0, len(Gn)), 2)) | |||||
# len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | |||||
# if len_itr < 1000 * n_jobs: | |||||
# chunksize = int(len_itr / n_jobs) + 1 | |||||
# else: | |||||
# chunksize = 1000 | |||||
# from contextlib import closing | |||||
# with closing(Pool(n_jobs)) as pool: | |||||
# for i, j, kernel in tqdm( | |||||
# pool.imap_unordered(do_partial, itr, 1000), | |||||
# desc='calculating kernels', | |||||
# file=sys.stdout): | |||||
# Kmatrix[i][j] = kernel | |||||
# Kmatrix[j][i] = kernel | |||||
# pool.close() | |||||
# pool.join() | |||||
# # ---- direct running, normally use single CPU core. ---- | # # ---- direct running, normally use single CPU core. ---- | ||||
# itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||||
# itr = zip(combinations_with_replacement(Gn, 2), | |||||
# combinations_with_replacement(splist, 2), | |||||
# combinations_with_replacement(range(0, len(Gn)), 2)) | |||||
# for gs in tqdm(itr, desc='calculating kernels', file=sys.stdout): | # for gs in tqdm(itr, desc='calculating kernels', file=sys.stdout): | ||||
# i, j, kernel = structuralspkernel_do(Gn, splist, ds_attrs, | |||||
# node_label, edge_label, node_kernels, edge_kernels, gs) | |||||
# i, j, kernel = wrapper_ssp_do(ds_attrs, node_label, edge_label, | |||||
# node_kernels, edge_kernels, gs) | |||||
# if(kernel > 1): | # if(kernel > 1): | ||||
# print("error here ") | # print("error here ") | ||||
# Kmatrix[i][j] = kernel | # Kmatrix[i][j] = kernel | ||||
@@ -209,18 +236,11 @@ def structuralspkernel(*args, | |||||
return Kmatrix, run_time | return Kmatrix, run_time | ||||
def structuralspkernel_do(Gn, splist, ds_attrs, node_label, edge_label, | |||||
node_kernels, edge_kernels, ij): | |||||
iglobal = ij[0] | |||||
jglobal = ij[1] | |||||
g1 = Gn[iglobal] | |||||
g2 = Gn[jglobal] | |||||
spl1 = splist[iglobal] | |||||
spl2 = splist[jglobal] | |||||
def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label, | |||||
node_kernels, edge_kernels): | |||||
kernel = 0 | kernel = 0 | ||||
#try: | |||||
# First, compute shortest path matrices, method borrowed from FCSP. | # First, compute shortest path matrices, method borrowed from FCSP. | ||||
if ds_attrs['node_labeled']: | if ds_attrs['node_labeled']: | ||||
# node symb and non-synb labeled | # node symb and non-synb labeled | ||||
@@ -369,11 +389,19 @@ def structuralspkernel_do(Gn, splist, ds_attrs, node_label, edge_label, | |||||
# kn1 = vk_mat[x1][x2] * vk_mat[y1][y2] | # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2] | ||||
# kn2 = vk_mat[x1][y2] * vk_mat[y1][x2] | # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2] | ||||
# Kmatrix += kn1 + kn2 | # Kmatrix += kn1 + kn2 | ||||
#except KeyError: # missing labels or attributes | |||||
# print("toto") | |||||
# pass | |||||
return kernel | |||||
return iglobal, jglobal, kernel | |||||
def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels, | |||||
edge_kernels, itr_item): | |||||
g1 = itr_item[0][0] | |||||
g2 = itr_item[0][1] | |||||
spl1 = itr_item[1][0] | |||||
spl2 = itr_item[1][1] | |||||
i = itr_item[2][0] | |||||
j = itr_item[2][1] | |||||
return i, j, structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, | |||||
node_label, edge_label, node_kernels, edge_kernels) | |||||
def get_shortest_paths(G, weight, directed): | def get_shortest_paths(G, weight, directed): | ||||
@@ -397,17 +425,21 @@ def get_shortest_paths(G, weight, directed): | |||||
for n1, n2 in combinations(G.nodes(), 2): | for n1, n2 in combinations(G.nodes(), 2): | ||||
try: | try: | ||||
spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight)) | spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight)) | ||||
except nx.NetworkXNoPath: # nodes not connected | |||||
# sp.append([]) | |||||
pass | |||||
else: | |||||
sp += spltemp | sp += spltemp | ||||
# each edge walk is counted twice, starting from both its extreme nodes. | # each edge walk is counted twice, starting from both its extreme nodes. | ||||
if not directed: | if not directed: | ||||
sp += [sptemp[::-1] for sptemp in spltemp] | sp += [sptemp[::-1] for sptemp in spltemp] | ||||
except nx.NetworkXNoPath: # nodes not connected | |||||
# sp.append([]) | |||||
pass | |||||
# add single nodes as length 0 paths. | # add single nodes as length 0 paths. | ||||
sp += [[n] for n in G.nodes()] | sp += [[n] for n in G.nodes()] | ||||
return sp | return sp | ||||
def wrap_getSP(Gn, weight, directed, i): | |||||
return i, get_shortest_paths(Gn[i], weight, directed) | |||||
def wrapper_getSP(weight, directed, itr_item): | |||||
g = itr_item[0] | |||||
i = itr_item[1] | |||||
return i, get_shortest_paths(g, weight, directed) |
@@ -13,7 +13,6 @@ from itertools import chain, combinations_with_replacement | |||||
from functools import partial | from functools import partial | ||||
from multiprocessing import Pool | from multiprocessing import Pool | ||||
from tqdm import tqdm | from tqdm import tqdm | ||||
import traceback | |||||
import networkx as nx | import networkx as nx | ||||
import numpy as np | import numpy as np | ||||
@@ -77,15 +76,15 @@ def untilhpathkernel(*args, | |||||
# but this may cost a lot of memory for large datasets. | # but this may cost a lot of memory for large datasets. | ||||
pool = Pool(n_jobs) | pool = Pool(n_jobs) | ||||
all_paths = [[] for _ in range(len(Gn))] | all_paths = [[] for _ in range(len(Gn))] | ||||
getps_partial = partial(wrap_find_all_paths_until_length, Gn, depth, | |||||
getps_partial = partial(wrapper_find_all_paths_until_length, depth, | |||||
ds_attrs, node_label, edge_label) | ds_attrs, node_label, edge_label) | ||||
itr = zip(Gn, range(0, len(Gn))) | |||||
if len(Gn) < 1000 * n_jobs: | if len(Gn) < 1000 * n_jobs: | ||||
chunksize = int(len(Gn) / n_jobs) + 1 | chunksize = int(len(Gn) / n_jobs) + 1 | ||||
else: | else: | ||||
chunksize = 1000 | chunksize = 1000 | ||||
# chunksize = 300 # int(len(list(itr)) / n_jobs) | |||||
for i, ps in tqdm( | for i, ps in tqdm( | ||||
pool.imap_unordered(getps_partial, range(0, len(Gn)), chunksize), | |||||
pool.imap_unordered(getps_partial, itr, chunksize), | |||||
desc='getting paths', file=sys.stdout): | desc='getting paths', file=sys.stdout): | ||||
all_paths[i] = ps | all_paths[i] = ps | ||||
pool.close() | pool.close() | ||||
@@ -110,8 +109,9 @@ def untilhpathkernel(*args, | |||||
pass | pass | ||||
else: | else: | ||||
pool = Pool(n_jobs) | pool = Pool(n_jobs) | ||||
do_partial = partial(_untilhpathkernel_do_naive, all_paths, k_func) | |||||
itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||||
do_partial = partial(wrapper_uhpath_do_naive, k_func) | |||||
itr = zip(combinations_with_replacement(all_paths, 2), | |||||
combinations_with_replacement(range(0, len(Gn)), 2)) | |||||
len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | ||||
if len_itr < 1000 * n_jobs: | if len_itr < 1000 * n_jobs: | ||||
chunksize = int(len_itr / n_jobs) + 1 | chunksize = int(len_itr / n_jobs) + 1 | ||||
@@ -216,7 +216,7 @@ def _untilhpathkernel_do_gst(gst1, gst2, paths1, paths2, k_func): | |||||
return kernel | return kernel | ||||
def _untilhpathkernel_do_naive(paths_list, k_func, ij): | |||||
def _untilhpathkernel_do_naive(paths1, paths2, k_func): | |||||
"""Calculate path graph kernels up to depth d between 2 graphs naively. | """Calculate path graph kernels up to depth d between 2 graphs naively. | ||||
Parameters | Parameters | ||||
@@ -235,10 +235,6 @@ def _untilhpathkernel_do_naive(paths_list, k_func, ij): | |||||
kernel : float | kernel : float | ||||
Path kernel up to h between 2 graphs. | Path kernel up to h between 2 graphs. | ||||
""" | """ | ||||
iglobal = ij[0] | |||||
jglobal = ij[1] | |||||
paths1 = paths_list[iglobal] | |||||
paths2 = paths_list[jglobal] | |||||
all_paths = list(set(paths1 + paths2)) | all_paths = list(set(paths1 + paths2)) | ||||
if k_func == 'tanimoto': | if k_func == 'tanimoto': | ||||
@@ -260,12 +256,18 @@ def _untilhpathkernel_do_naive(paths_list, k_func, ij): | |||||
kernel = np.sum(np.minimum(vector1, vector2)) / \ | kernel = np.sum(np.minimum(vector1, vector2)) / \ | ||||
np.sum(np.maximum(vector1, vector2)) | np.sum(np.maximum(vector1, vector2)) | ||||
return iglobal, jglobal, kernel | |||||
return kernel | |||||
# @todo: (can be removed maybe) this method find paths repetively, it could be faster. | |||||
def wrapper_uhpath_do_naive(k_func, itr_item): | |||||
plist1 = itr_item[0][0] | |||||
plist2 = itr_item[0][1] | |||||
i = itr_item[1][0] | |||||
j = itr_item[1][1] | |||||
return i, j, _untilhpathkernel_do_naive(plist1, plist2, k_func) | |||||
# @todo: (can be removed maybe) this method find paths repetively, it could be faster. | |||||
def find_all_paths_until_length(G, | def find_all_paths_until_length(G, | ||||
length, | length, | ||||
ds_attrs, | ds_attrs, | ||||
@@ -368,15 +370,12 @@ def find_all_paths_until_length(G, | |||||
return [tuple([len(path)]) for path in all_paths] | return [tuple([len(path)]) for path in all_paths] | ||||
def wrap_find_all_paths_until_length(Gn, length, ds_attrs, node_label, | |||||
edge_label, i): | |||||
try: | |||||
return i, find_all_paths_until_length(Gn[i], length, ds_attrs, | |||||
def wrapper_find_all_paths_until_length(length, ds_attrs, node_label, | |||||
edge_label, itr_item): | |||||
g = itr_item[0] | |||||
i = itr_item[1] | |||||
return i, find_all_paths_until_length(g, length, ds_attrs, | |||||
node_label=node_label, edge_label=edge_label) | node_label=node_label, edge_label=edge_label) | ||||
except Exception as e: | |||||
traceback.print_exc() | |||||
print('') | |||||
raise e | |||||
def paths2GSuffixTree(paths): | def paths2GSuffixTree(paths): | ||||
@@ -206,54 +206,50 @@ def model_selection_for_precomputed_kernel(datafile, | |||||
'3. Fitting and predicting using nested cross validation. This could really take a while...' | '3. Fitting and predicting using nested cross validation. This could really take a while...' | ||||
) | ) | ||||
# pool = Pool(n_jobs) | |||||
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type) | |||||
# train_pref = [] | |||||
# val_pref = [] | |||||
# test_pref = [] | |||||
## if NUM_TRIALS < 1000 * n_jobs: | |||||
## chunksize = int(NUM_TRIALS / n_jobs) + 1 | |||||
## else: | |||||
## chunksize = 1000 | |||||
# chunksize = 1 | |||||
# for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout): | |||||
# train_pref.append(o1) | |||||
# val_pref.append(o2) | |||||
# test_pref.append(o3) | |||||
# pool.close() | |||||
# pool.join() | |||||
# ---- use pool.map to parallel. ---- | |||||
pool = Pool(n_jobs) | pool = Pool(n_jobs) | ||||
trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type) | trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type) | ||||
train_pref = [] | |||||
val_pref = [] | |||||
test_pref = [] | |||||
# if NUM_TRIALS < 100: | |||||
# chunksize, extra = divmod(NUM_TRIALS, n_jobs * 4) | |||||
# if extra: | |||||
# chunksize += 1 | |||||
# else: | |||||
# chunksize = 100 | |||||
chunksize = 1 | |||||
for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout): | |||||
train_pref.append(o1) | |||||
val_pref.append(o2) | |||||
test_pref.append(o3) | |||||
pool.close() | |||||
pool.join() | |||||
# # ---- use pool.map to parallel. ---- | |||||
# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS)) | |||||
# train_pref = [item[0] for item in result_perf] | |||||
# val_pref = [item[1] for item in result_perf] | |||||
# test_pref = [item[2] for item in result_perf] | |||||
result_perf = pool.map(trial_do_partial, range(NUM_TRIALS)) | |||||
train_pref = [item[0] for item in result_perf] | |||||
val_pref = [item[1] for item in result_perf] | |||||
test_pref = [item[2] for item in result_perf] | |||||
# # ---- use joblib.Parallel to parallel and track progress. ---- | |||||
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type) | |||||
# result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS)) | |||||
# train_pref = [item[0] for item in result_perf] | |||||
# val_pref = [item[1] for item in result_perf] | |||||
# test_pref = [item[2] for item in result_perf] | |||||
# # ---- direct running, normally use a single CPU core. ---- | |||||
# train_pref = [] | |||||
# val_pref = [] | |||||
# test_pref = [] | |||||
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout): | |||||
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i) | |||||
# train_pref.append(o1) | |||||
# val_pref.append(o2) | |||||
# test_pref.append(o3) | |||||
# # ---- direct running, normally use a single CPU core. ---- | |||||
# train_pref = [] | |||||
# val_pref = [] | |||||
# test_pref = [] | |||||
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout): | |||||
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i) | |||||
# train_pref.append(o1) | |||||
# val_pref.append(o2) | |||||
# test_pref.append(o3) | |||||
# print() | |||||
print() | print() | ||||
print('4. Getting final performance...') | print('4. Getting final performance...') | ||||
str_fw += '\nIII. Performance.\n\n' | str_fw += '\nIII. Performance.\n\n' | ||||
# averages and confidences of performances on outer trials for each combination of parameters | # averages and confidences of performances on outer trials for each combination of parameters | ||||
average_train_scores = np.mean(train_pref, axis=0) | average_train_scores = np.mean(train_pref, axis=0) | ||||
# print('val_pref: ', val_pref[0][0]) | |||||
average_val_scores = np.mean(val_pref, axis=0) | average_val_scores = np.mean(val_pref, axis=0) | ||||
# print('test_pref: ', test_pref[0][0]) | |||||
average_perf_scores = np.mean(test_pref, axis=0) | average_perf_scores = np.mean(test_pref, axis=0) | ||||
# sample std is used here | # sample std is used here | ||||
std_train_scores = np.std(train_pref, axis=0, ddof=1) | std_train_scores = np.std(train_pref, axis=0, ddof=1) | ||||
@@ -264,6 +260,9 @@ def model_selection_for_precomputed_kernel(datafile, | |||||
best_val_perf = np.amin(average_val_scores) | best_val_perf = np.amin(average_val_scores) | ||||
else: | else: | ||||
best_val_perf = np.amax(average_val_scores) | best_val_perf = np.amax(average_val_scores) | ||||
# print('average_val_scores: ', average_val_scores) | |||||
# print('best_val_perf: ', best_val_perf) | |||||
# print() | |||||
best_params_index = np.where(average_val_scores == best_val_perf) | best_params_index = np.where(average_val_scores == best_val_perf) | ||||
# find smallest val std with best val perf. | # find smallest val std with best val perf. | ||||
best_val_stds = [ | best_val_stds = [ | ||||
@@ -286,6 +285,9 @@ def model_selection_for_precomputed_kernel(datafile, | |||||
str_fw += 'best_val_perf: %s\n' % best_val_perf | str_fw += 'best_val_perf: %s\n' % best_val_perf | ||||
str_fw += 'best_val_std: %s\n' % min_val_std | str_fw += 'best_val_std: %s\n' % min_val_std | ||||
# print(best_params_index) | |||||
# print(best_params_index[0]) | |||||
# print(average_perf_scores) | |||||
final_performance = [ | final_performance = [ | ||||
average_perf_scores[value][best_params_index[1][idx]] | average_perf_scores[value][best_params_index[1][idx]] | ||||
for idx, value in enumerate(best_params_index[0]) | for idx, value in enumerate(best_params_index[0]) | ||||
@@ -429,23 +431,23 @@ def model_selection_for_precomputed_kernel(datafile,
'3. Fitting and predicting using nested cross validation. This could really take a while...'
)
# pool = Pool(n_jobs)
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
# train_pref = []
# val_pref = []
# test_pref = []
# if NUM_TRIALS < 100:
# chunksize, extra = divmod(NUM_TRIALS, n_jobs * 4)
# if extra:
# chunksize += 1
# else:
# chunksize = 100
# for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout):
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)
# pool.close()
# pool.join()
pool = Pool(n_jobs)
trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
train_pref = []
val_pref = []
test_pref = []
if NUM_TRIALS < 100:
chunksize, extra = divmod(NUM_TRIALS, n_jobs * 4)
if extra:
chunksize += 1
else:
chunksize = 100
for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout):
train_pref.append(o1)
val_pref.append(o2)
test_pref.append(o3)
pool.close()
pool.join()
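# --- Illustrative sketch (not part of the diff): the chunksize heuristic used
# with Pool.imap_unordered above. For fewer than 100 trials the work is split
# into roughly n_jobs * 4 chunks; otherwise a flat chunksize of 100 is used.
# The names fake_trial and run_trials are made up for this example only.
from multiprocessing import Pool

def fake_trial(i):
    # stand-in for trial_do(...): returns dummy train/val/test results
    return i, i + 0.5, i + 1.0

def run_trials(n_trials, n_jobs):
    if n_trials < 100:
        chunksize, extra = divmod(n_trials, n_jobs * 4)
        if extra:
            chunksize += 1
    else:
        chunksize = 100
    train_pref, val_pref, test_pref = [], [], []
    with Pool(n_jobs) as pool:
        for o1, o2, o3 in pool.imap_unordered(fake_trial, range(n_trials), chunksize):
            train_pref.append(o1)
            val_pref.append(o2)
            test_pref.append(o3)
    return train_pref, val_pref, test_pref

if __name__ == '__main__':
    print(len(run_trials(30, 8)[0]))   # 30 results; chunksize = ceil(30 / 32) = 1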
# # ---- use pool.map to parallel. ----
# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
@@ -460,15 +462,15 @@ def model_selection_for_precomputed_kernel(datafile,
# val_pref = [item[1] for item in result_perf]
# test_pref = [item[2] for item in result_perf]
# ---- direct running, normally use a single CPU core. ----
train_pref = []
val_pref = []
test_pref = []
for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
train_pref.append(o1)
val_pref.append(o2)
test_pref.append(o3)
# # ---- direct running, normally use a single CPU core. ----
# train_pref = []
# val_pref = []
# test_pref = []
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)
print()
print('4. Getting final performance...')
@@ -623,89 +625,142 @@ def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, t
val_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
test_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
# randomness added to seeds of split function below. "high" is "size" times
# 10 so that at least 10 different random outputs will be yielded. Remove
# these lines if identical outputs are required.
rdm_out = np.random.RandomState(seed=None)
rdm_seed_out_l = rdm_out.uniform(high=len(param_list_pre_revised) * 10,
size=len(param_list_pre_revised))
# print(trial, rdm_seed_out_l)
# print()
# loop for each outer param tuple
for index_out, params_out in enumerate(param_list_pre_revised):
# split gram matrix and y to app and test sets.
indices = range(len(y))
# The argument "random_state" in function "train_test_split" can not be | |||||
# set to None, because it will use RandomState instance used by | |||||
# np.random, which is possible for multiple subprocesses to inherit the | |||||
# same seed if they forked at the same time, leading to identical | |||||
# random variates for different subprocesses. Instead, we use "trial" | |||||
# and "index_out" parameters to generate different seeds for different | |||||
# trials/subprocesses and outer loops. "rdm_seed_out_l" is used to add | |||||
# randomness into seeds, so that it yields a different output every | |||||
# time the program is run. To yield identical outputs every time, | |||||
# remove the second line below. Same method is used to the "KFold" | |||||
# function in the inner loop. | |||||
rdm_seed_out = (trial + 1) * (index_out + 1) | |||||
rdm_seed_out = (rdm_seed_out + int(rdm_seed_out_l[index_out])) % (2 ** 32 - 1) | |||||
# print(trial, rdm_seed_out) | |||||
X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split(
gram_matrices[index_out], y, indices, test_size=0.1,
random_state=None, shuffle=True)
random_state=rdm_seed_out, shuffle=True)
# print(trial, idx_app, idx_test)
# print()
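# --- Illustrative sketch (not part of the diff): the seeding scheme used
# above. random_state must not be None in forked workers, so a seed is
# derived from (trial, index_out) plus a per-run random offset and folded
# into the valid 32-bit range. All names below are local to this example.
import numpy as np
from sklearn.model_selection import train_test_split

def outer_seed(trial, index_out, offsets):
    seed = (trial + 1) * (index_out + 1)
    return (seed + int(offsets[index_out])) % (2 ** 32 - 1)

rng = np.random.RandomState(seed=None)     # fresh entropy on every program run
offsets = rng.uniform(high=50, size=5)     # one offset per outer param tuple
y_toy = list(range(20))
for trial_toy in range(2):
    for index_out_toy in range(5):
        s = outer_seed(trial_toy, index_out_toy, offsets)
        y_app_toy, y_test_toy = train_test_split(y_toy, test_size=0.1,
                                                 random_state=s, shuffle=True)
        # distinct (trial, index_out) pairs -> distinct splits, reproducible within one run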
X_app = X_app[:, idx_app]
X_test = X_test[:, idx_app]
y_app = np.array(y_app)
y_test = np.array(y_test)
rdm_seed_in_l = rdm_out.uniform(high=len(param_list) * 10,
size=len(param_list))
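# --- Illustrative sketch (not part of the diff): the slicing convention for
# a precomputed Gram matrix used throughout trial_do. Rows are indexed by the
# samples we predict for, columns by the samples the model was fitted on.
# Toy sizes and names only.
import numpy as np

n = 10
K = np.random.rand(n, n)
K = (K + K.T) / 2                       # symmetric, like a Gram matrix

idx_app_toy = list(range(8))            # "app" = train + validation part
idx_test_toy = [8, 9]

K_app = K[np.ix_(idx_app_toy, idx_app_toy)]    # app x app, fed to the inner CV
K_test = K[np.ix_(idx_test_toy, idx_app_toy)]  # test rows vs. app columns

train_index = np.arange(6)              # one inner-CV split of the app part
valid_index = np.arange(6, 8)
K_fit = K_app[train_index, :][:, train_index]  # train x train, for .fit()
K_val = K_app[valid_index, :][:, train_index]  # valid x train, for .predict()
K_tst = K_test[:, train_index]                 # test x train, for .predict()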
# loop for each inner param tuple
for index_in, params_in in enumerate(param_list):
# print(index_in, params_in)
# if trial == 0:
# print(index_out, index_in)
# print('params_in: ', params_in)
# st = time.time()
inner_cv = KFold(n_splits=10, shuffle=True, random_state=trial)
rdm_seed_in = (trial + 1) * (index_out + 1) * (index_in + 1)
# print("rdm_seed_in1: ", trial, index_in, rdm_seed_in)
rdm_seed_in = (rdm_seed_in + int(rdm_seed_in_l[index_in])) % (2 ** 32 - 1)
# print("rdm_seed_in2: ", trial, index_in, rdm_seed_in)
inner_cv = KFold(n_splits=10, shuffle=True, random_state=rdm_seed_in)
current_train_perf = []
current_valid_perf = []
current_test_perf = []
# For regression use the Kernel Ridge method
try:
if model_type == 'regression':
kr = KernelRidge(kernel='precomputed', **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
kr.fit(X_app[train_index, :][:, train_index],
y_app[train_index])
# try:
if model_type == 'regression':
kr = KernelRidge(kernel='precomputed', **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
# print("train_index, valid_index: ", trial, index_in, train_index, valid_index)
# if trial == 0:
# print('train_index: ', train_index)
# print('valid_index: ', valid_index)
# print('idx_test: ', idx_test)
# print('y_app[train_index]: ', y_app[train_index])
# print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index])
# print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index])
kr.fit(X_app[train_index, :][:, train_index],
y_app[train_index])
# predict on the train, validation and test set
y_pred_train = kr.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = kr.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = kr.predict(
X_test[:, train_index])
# predict on the train, validation and test set
y_pred_train = kr.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = kr.predict(
X_app[valid_index, :][:, train_index])
# if trial == 0:
# print('y_pred_valid: ', y_pred_valid)
# print()
y_pred_test = kr.predict(
X_test[:, train_index])
# root mean squared errors
current_train_perf.append(
np.sqrt(
mean_squared_error(
y_app[train_index], y_pred_train)))
current_valid_perf.append(
np.sqrt(
mean_squared_error(
y_app[valid_index], y_pred_valid)))
current_test_perf.append(
np.sqrt(
mean_squared_error(
y_test, y_pred_test)))
# For classification use SVM
else:
svc = SVC(kernel='precomputed', cache_size=200,
verbose=False, **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
# root mean squared errors
current_train_perf.append(
np.sqrt(
mean_squared_error(
y_app[train_index], y_pred_train)))
current_valid_perf.append(
np.sqrt(
mean_squared_error(
y_app[valid_index], y_pred_valid)))
# if trial == 0:
# print(mean_squared_error(
# y_app[valid_index], y_pred_valid))
current_test_perf.append(
np.sqrt(
mean_squared_error(
y_test, y_pred_test)))
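# --- Illustrative sketch (not part of the diff): what one inner-loop cell of
# the regression branch computes, i.e. KernelRidge on a precomputed kernel
# scored by RMSE. Toy data; params_in_toy stands for one entry of param_list.
import numpy as np
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error

rng = np.random.RandomState(0)
X_toy = rng.rand(20, 3)
y_toy = rng.rand(20)
K_toy = X_toy @ X_toy.T                       # a toy linear Gram matrix
train_index = np.arange(15)
valid_index = np.arange(15, 20)

params_in_toy = {'alpha': 1e-2}
kr = KernelRidge(kernel='precomputed', **params_in_toy)
kr.fit(K_toy[train_index, :][:, train_index], y_toy[train_index])
y_pred_valid = kr.predict(K_toy[valid_index, :][:, train_index])
rmse = np.sqrt(mean_squared_error(y_toy[valid_index], y_pred_valid))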
# For classification use SVM
else:
svc = SVC(kernel='precomputed', cache_size=200,
verbose=False, **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
# np.savez("bug.npy",X_app[train_index, :][:, train_index],y_app[train_index]) | # np.savez("bug.npy",X_app[train_index, :][:, train_index],y_app[train_index]) | ||||
svc.fit(X_app[train_index, :][:, train_index],
y_app[train_index])
# predict on the train, validation and test set
y_pred_train = svc.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = svc.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = svc.predict(
X_test[:, train_index])
# if trial == 0:
# print('train_index: ', train_index)
# print('valid_index: ', valid_index)
# print('idx_test: ', idx_test)
# print('y_app[train_index]: ', y_app[train_index])
# print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index])
# print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index])
svc.fit(X_app[train_index, :][:, train_index],
y_app[train_index])
# predict on the train, validation and test set
y_pred_train = svc.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = svc.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = svc.predict(
X_test[:, train_index])
# accuracy scores
current_train_perf.append(
accuracy_score(y_app[train_index],
y_pred_train))
current_valid_perf.append(
accuracy_score(y_app[valid_index],
y_pred_valid))
current_test_perf.append(
accuracy_score(y_test, y_pred_test))
except ValueError:
print(sys.exc_info()[0])
print(params_out, params_in)
# accuracy scores
current_train_perf.append(
accuracy_score(y_app[train_index],
y_pred_train))
current_valid_perf.append(
accuracy_score(y_app[valid_index],
y_pred_valid))
current_test_perf.append(
accuracy_score(y_test, y_pred_test))
# except ValueError:
# print(sys.exc_info()[0])
# print(params_out, params_in)
# average performance on inner splits
train_pref[index_out][index_in] = np.mean(
@@ -715,5 +770,8 @@ def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, t
test_pref[index_out][index_in] = np.mean(
current_test_perf)
# print(time.time() - st)
# if trial == 0:
# print('val_pref: ', val_pref)
# print('test_pref: ', test_pref)
return train_pref, val_pref, test_pref
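# --- Illustrative sketch (not part of the diff): the classification
# counterpart of one inner-loop cell of trial_do, i.e. SVC on a precomputed
# kernel inside a 10-fold inner CV, scored with accuracy. Toy data and toy
# parameters only; the printed mean is what would land in
# val_pref[index_out][index_in].
import numpy as np
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

rng = np.random.RandomState(0)
X_toy = rng.rand(40, 3)
y_toy = rng.randint(0, 2, size=40)
K_app_toy = X_toy @ X_toy.T                   # toy app x app Gram matrix

inner_cv = KFold(n_splits=10, shuffle=True, random_state=0)
params_in_toy = {'C': 1.0}
current_valid_perf = []
for train_index, valid_index in inner_cv.split(K_app_toy):
    svc = SVC(kernel='precomputed', cache_size=200, verbose=False, **params_in_toy)
    svc.fit(K_app_toy[train_index, :][:, train_index], y_toy[train_index])
    y_pred_valid = svc.predict(K_app_toy[valid_index, :][:, train_index])
    current_valid_perf.append(accuracy_score(y_toy[valid_index], y_pred_valid))
print(np.mean(current_valid_perf))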