Browse Source

1. add parallel computing scheme to spkernel and model_selection_precomputed.

2. modify model_selection_precomputed so that all results are written into memory and then to a file in the last section of the code, in case the IO takes too much time on systems with separated CPU/disk.
3. correct utils.floyd_warshall_numpy function. DO NOT use the last version.
v0.1
jajupmochi 7 years ago
parent
commit
22a1f1e8d8
9 changed files with 2635 additions and 1596 deletions
  1. +10
    -7
      README.md
  2. +143
    -70
      datasets/ds.py
  3. +911
    -895
      notebooks/run_randomwalkkernel.ipynb
  4. +763
    -208
      notebooks/run_spkernel.ipynb
  5. +150
    -49
      notebooks/run_spkernel.py
  6. +0
    -1
      pygraph/kernels/.#commonWalkKernel.py
  7. +319
    -148
      pygraph/kernels/spKernel.py
  8. +337
    -217
      pygraph/utils/model_selection_precomputed.py
  9. +2
    -1
      pygraph/utils/utils.py

+ 10
- 7
README.md View File

@@ -3,12 +3,15 @@ A python package for graph kernels.


## Requirements ## Requirements


* numpy - 1.13.3
* scipy - 1.0.0
* matplotlib - 2.1.0
* networkx - 2.0
* sklearn - 0.19.1
* tabulate - 0.8.2
numpy==1.14.5
scipy==1.1.0
matplotlib==2.2.2
networkx==2.1
scikit-learn==0.19.1
tabulate==0.8.2
tqdm==4.23.4
control==0.7.0 (for generalized random walk kernels only)
slycot===0.3.2.dev-5263ada (for generalized random walk kernels only, requires fortran compiler, gfortran for example)


## Results with minimal test RMSE for each kernel on dataset Asyclic ## Results with minimal test RMSE for each kernel on dataset Asyclic


@@ -28,7 +31,7 @@ For prediction we randomly divide the data in train and test subset, where 90\%
| WL shortest path | 28.74±0.60 | 38.20±0.62 | 39.02±6.09 | 'height': 10.0, 'alpha': '1.00' | 146.83"/80.63"±45.04" | | WL shortest path | 28.74±0.60 | 38.20±0.62 | 39.02±6.09 | 'height': 10.0, 'alpha': '1.00' | 146.83"/80.63"±45.04" |
| WL edge | 30.21±0.64 | 36.53±1.02 | 38.42±6.42 | 'height': 5.0, 'alpha': '6.31e-01' | 5.24"/5.15"±2.83" | | WL edge | 30.21±0.64 | 36.53±1.02 | 38.42±6.42 | 'height': 5.0, 'alpha': '6.31e-01' | 5.24"/5.15"±2.83" |
| Treelet | 7.33±0.64 | 13.86±0.80 | 15.38±3.56 | 'alpha': '1.12e+01' | 0.48" | | Treelet | 7.33±0.64 | 13.86±0.80 | 15.38±3.56 | 'alpha': '1.12e+01' | 0.48" |
| Path up to d | 5.76±0.27 | 9.89±0.87 | 10.21±4.16 | 'depth': 2.0, 'k_func': 'MinMax', 'alpha': '0.1' | 0.56"/1.16"±0.75" |
| Path up to d | 5.76±0.27 | 9.89±0.87 | 10.21±4.16 | 'depth': 2.0, 'k_func': 'MinMax', 'alpha ': '0.1' | 0.56"/1.16"±0.75" |
| Cyclic pattern | | | | | | | Cyclic pattern | | | | | |
| Walk up to n | 20.88±0.74 | 23.34±1.11 | 24.46±6.57 | 'n': 2.0, 'alpha': '1.00e-03' | 0.56"/331.70"±753.44" | | Walk up to n | 20.88±0.74 | 23.34±1.11 | 24.46±6.57 | 'n': 2.0, 'alpha': '1.00e-03' | 0.56"/331.70"±753.44" |




+ 143
- 70
datasets/ds.py View File

@@ -3,106 +3,66 @@ dslist = [
'name': 'Acyclic', 'name': 'Acyclic',
'dataset': '../datasets/acyclic/dataset_bps.ds', 'dataset': '../datasets/acyclic/dataset_bps.ds',
'task': 'regression' 'task': 'regression'
}, # node_labeled
{
'name': 'COIL-DEL',
'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'
}, # edge_labeled
}, # node symb
# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
{ {
'name': 'PAH', 'name': 'PAH',
'dataset': '../datasets/PAH/dataset.ds', 'dataset': '../datasets/PAH/dataset.ds',
}, # unlabeled }, # unlabeled
{ {
'name': 'Mutagenicity',
'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'
}, # fully_labeled
{
'name': 'MAO', 'name': 'MAO',
'dataset': '../datasets/MAO/dataset.ds', 'dataset': '../datasets/MAO/dataset.ds',
},
}, # node/edge symb
{ {
'name': 'MUTAG', 'name': 'MUTAG',
'dataset': '../datasets/MUTAG/MUTAG.mat', 'dataset': '../datasets/MUTAG/MUTAG.mat',
'extra_params': { 'extra_params': {
'am_sp_al_nl_el': [0, 0, 3, 1, 2] 'am_sp_al_nl_el': [0, 0, 3, 1, 2]
} }
},
}, # node/edge symb
{ {
'name': 'Alkane', 'name': 'Alkane',
'dataset': '../datasets/Alkane/dataset.ds', 'dataset': '../datasets/Alkane/dataset.ds',
'task': 'regression', 'task': 'regression',
'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',
},
{
'name': 'BZR',
'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'
},
}, # contains single node graph, node symb
# {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
# {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
{ {
'name': 'COX2',
'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'
},
'name': 'Mutagenicity',
'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'
}, # node/edge symb
{ {
'name': 'ENZYMES', 'name': 'ENZYMES',
'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt' 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'
},
{
'name': 'DHFR',
'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'
},
{
'name': 'SYNTHETIC',
'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'
},
{
'name': 'MSRC9',
'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'
},
{
'name': 'MSRC21',
'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'
},
}, # node symb/nsymb
# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
{ {
'name': 'FIRSTMM_DB',
'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'
},
{
'name': 'PROTEINS',
'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'
},
{
'name': 'PROTEINS_full',
'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'
'name': 'Letter-med',
'dataset': '../datasets/Letter-med/Letter-med_A.txt'
}, },
# {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
# {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
# {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
# {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
# {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb

# {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
# {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
{ {
'name': 'D&D', 'name': 'D&D',
'dataset': '../datasets/D&D/DD.mat', 'dataset': '../datasets/D&D/DD.mat',
'extra_params': { 'extra_params': {
'am_sp_al_nl_el': [0, 1, 2, 1, -1] 'am_sp_al_nl_el': [0, 1, 2, 1, -1]
} }
},
{
'name': 'AIDS',
'dataset': '../datasets/AIDS/AIDS_A.txt'
},
{
'name': 'NCI1',
'dataset': '../datasets/NCI1/NCI1.mat',
'extra_params': {
'am_sp_al_nl_el': [1, 1, 2, 0, -1]
}
},
{
'name': 'NCI109',
'dataset': '../datasets/NCI109/NCI109.mat',
'extra_params': {
'am_sp_al_nl_el': [1, 1, 2, 0, -1]
}
},
{
'name': 'NCI-HIV',
'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',
},
}, # node symb
# {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb


# # not working below # # not working below
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',}, # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
@@ -110,3 +70,116 @@ dslist = [
# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',}, # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
] ]

# dslist = [
# {
# 'name': 'Acyclic',
# 'dataset': '../datasets/acyclic/dataset_bps.ds',
# 'task': 'regression'
# }, # node_labeled
# {
# 'name': 'COIL-DEL',
# 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'
# }, # edge_labeled
# {
# 'name': 'PAH',
# 'dataset': '../datasets/PAH/dataset.ds',
# }, # unlabeled
# {
# 'name': 'Mutagenicity',
# 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'
# }, # fully_labeled
# {
# 'name': 'MAO',
# 'dataset': '../datasets/MAO/dataset.ds',
# },
# {
# 'name': 'MUTAG',
# 'dataset': '../datasets/MUTAG/MUTAG.mat',
# 'extra_params': {
# 'am_sp_al_nl_el': [0, 0, 3, 1, 2]
# }
# },
# {
# 'name': 'Alkane',
# 'dataset': '../datasets/Alkane/dataset.ds',
# 'task': 'regression',
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',
# },
# {
# 'name': 'BZR',
# 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'
# },
# {
# 'name': 'COX2',
# 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'
# },
# {
# 'name': 'ENZYMES',
# 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'
# },
# {
# 'name': 'DHFR',
# 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'
# },
# {
# 'name': 'SYNTHETIC',
# 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'
# },
# {
# 'name': 'MSRC9',
# 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'
# },
# {
# 'name': 'MSRC21',
# 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'
# },
# {
# 'name': 'FIRSTMM_DB',
# 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'
# },
# {
# 'name': 'PROTEINS',
# 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'
# },
# {
# 'name': 'PROTEINS_full',
# 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'
# },
# {
# 'name': 'D&D',
# 'dataset': '../datasets/D&D/DD.mat',
# 'extra_params': {
# 'am_sp_al_nl_el': [0, 1, 2, 1, -1]
# }
# },
# {
# 'name': 'AIDS',
# 'dataset': '../datasets/AIDS/AIDS_A.txt'
# },
# {
# 'name': 'NCI1',
# 'dataset': '../datasets/NCI1/NCI1.mat',
# 'extra_params': {
# 'am_sp_al_nl_el': [1, 1, 2, 0, -1]
# }
# },
# {
# 'name': 'NCI109',
# 'dataset': '../datasets/NCI109/NCI109.mat',
# 'extra_params': {
# 'am_sp_al_nl_el': [1, 1, 2, 0, -1]
# }
# },
# {
# 'name': 'NCI-HIV',
# 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',
# },

# # # not working below
# # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
# # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
# ]

+ 911
- 895
notebooks/run_randomwalkkernel.ipynb
File diff suppressed because it is too large
View File


+ 763
- 208
notebooks/run_spkernel.ipynb
File diff suppressed because it is too large
View File


+ 150
- 49
notebooks/run_spkernel.py View File

@@ -1,56 +1,157 @@
import functools
from libs import * from libs import *
from pygraph.kernels.spKernel import spkernel from pygraph.kernels.spKernel import spkernel
from pygraph.utils.kernels import deltakernel, kernelsum
from sklearn.metrics.pairwise import rbf_kernel


dslist = [
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node_labeled
# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge_labeled
# dslist = [
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node symb
# # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',}, # unlabeled # {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',}, # unlabeled
{'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # fully_labeled
# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',},

# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',}, # node/edge symb
# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', # {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}},
# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',},
# {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'},
# {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'},
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'},
# {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'},
# {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'},
# {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'},
# {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'},

# {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'},
# {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'},
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',}, # contains single node graph, node symb
# # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # node/edge symb
# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb

# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}},
# {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'},
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}},
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}},
# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',},
# # not working below
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb
# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
# # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
# # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
# # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb

# # # not working below
# # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
# # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
# ]

import ast
ds = ast.literal_eval(sys.argv[1])

estimator = spkernel estimator = spkernel
param_grid_precomputed = {}
param_grid = [{'C': np.logspace(-10, 10, num = 41, base = 10)},
{'alpha': np.logspace(-10, 10, num = 41, base = 10)}]

for ds in dslist:
print()
print(ds['name'])
model_selection_for_precomputed_kernel(
ds['dataset'], estimator, param_grid_precomputed,
(param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]),
(ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30,
datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
extra_params=(ds['extra_params'] if 'extra_params' in ds else None))
print()
mixkernel = functools.partial(kernelsum, deltakernel, rbf_kernel)
param_grid_precomputed = {
'node_kernels': [{
'symb': deltakernel,
'nsymb': rbf_kernel,
'mix': mixkernel
}]
}
param_grid = [{
'C': np.logspace(-10, 10, num=41, base=10)
}, {
'alpha': np.logspace(-10, 10, num=41, base=10)
}]

print()
print(ds['name'])
model_selection_for_precomputed_kernel(
ds['dataset'],
estimator,
param_grid_precomputed,
(param_grid[1]
if ('task' in ds and ds['task'] == 'regression') else param_grid[0]),
(ds['task'] if 'task' in ds else 'classification'),
NUM_TRIALS=30,
datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
ds_name=ds['name'])

# %lprun -f spkernel \
# model_selection_for_precomputed_kernel( \
# ds['dataset'], estimator, param_grid_precomputed, \
# (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \
# (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30, \
# datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), \
# extra_params=(ds['extra_params'] if 'extra_params' in ds else None))
print()

# import functools
# from libs import *
# from pygraph.kernels.spKernel import spkernel
# from pygraph.utils.kernels import deltakernel, kernelsum
# from sklearn.metrics.pairwise import rbf_kernel

# dslist = [
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node symb
# # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
# # {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',}, # unlabeled
# # {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',}, # node/edge symb
# # {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
# # 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
# # {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
# # 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',}, # contains single node graph, node symb
# # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # node/edge symb
# # {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
# # {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb

# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
# # 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb
# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
# # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
# # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
# # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb

# # # not working below
# # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
# # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
# ]
# estimator = spkernel
# mixkernel = functools.partial(kernelsum, deltakernel, rbf_kernel)
# param_grid_precomputed = {'node_kernels': [{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]}
# param_grid = [{'C': np.logspace(-10, 10, num = 41, base = 10)},
# {'alpha': np.logspace(-10, 10, num = 41, base = 10)}]

# for ds in dslist:
# print()
# print(ds['name'])
# model_selection_for_precomputed_kernel(
# ds['dataset'], estimator, param_grid_precomputed,
# (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]),
# (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30,
# datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
# extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
# ds_name=ds['name'])

# # %lprun -f spkernel \
# # model_selection_for_precomputed_kernel( \
# # ds['dataset'], estimator, param_grid_precomputed, \
# # (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \
# # (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30, \
# # datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), \
# # extra_params=(ds['extra_params'] if 'extra_params' in ds else None))
# print()

+ 0
- 1
pygraph/kernels/.#commonWalkKernel.py View File

@@ -1 +0,0 @@
ljia@ljia-Precision-7520.4716:1530265749

+ 319
- 148
pygraph/kernels/spKernel.py View File

@@ -9,6 +9,9 @@ sys.path.insert(0, "../")
from tqdm import tqdm from tqdm import tqdm
import time import time
from itertools import combinations_with_replacement, product from itertools import combinations_with_replacement, product
from functools import partial
from joblib import Parallel, delayed
from multiprocessing import Pool


import networkx as nx import networkx as nx
import numpy as np import numpy as np
@@ -17,7 +20,11 @@ from pygraph.utils.utils import getSPGraph
from pygraph.utils.graphdataset import get_dataset_attributes from pygraph.utils.graphdataset import get_dataset_attributes




def spkernel(*args, node_label='atom', edge_weight=None, node_kernels=None):
def spkernel(*args,
node_label='atom',
edge_weight=None,
node_kernels=None,
n_jobs=None):
"""Calculate shortest-path kernels between graphs. """Calculate shortest-path kernels between graphs.


Parameters Parameters
@@ -70,180 +77,344 @@ def spkernel(*args, node_label='atom', edge_weight=None, node_kernels=None):
if len(Gn) != len_gn: if len(Gn) != len_gn:
print('\n %d graphs are removed as they don\'t contain edges.\n' % print('\n %d graphs are removed as they don\'t contain edges.\n' %
(len_gn - len(Gn))) (len_gn - len(Gn)))

start_time = time.time() start_time = time.time()
pool = Pool(n_jobs)


# get shortest path graphs of Gn # get shortest path graphs of Gn
Gn = [
getSPGraph(G, edge_weight=edge_weight)
for G in tqdm(Gn, desc='getting sp graphs', file=sys.stdout)
]
getsp_partial = partial(wrap_getSPGraph, Gn, edge_weight)
result_sp = pool.map(getsp_partial, range(0, len(Gn)))
for i in result_sp:
Gn[i[0]] = i[1]

# Gn = [
# getSPGraph(G, edge_weight=edge_weight)
# for G in tqdm(Gn, desc='getting sp graphs', file=sys.stdout)
# ]


Kmatrix = np.zeros((len(Gn), len(Gn))) Kmatrix = np.zeros((len(Gn), len(Gn)))
pbar = tqdm(
total=((len(Gn) + 1) * len(Gn) / 2),
desc='calculating kernels',
file=sys.stdout)

do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
itr = combinations_with_replacement(range(0, len(Gn)), 2)
# chunksize = 2000 # int(len(list(itr)) / n_jobs)
# for i, j, kernel in tqdm(pool.imap_unordered(do_partial, itr, chunksize)):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel

result_perf = pool.map(do_partial, itr)
pool.close()
pool.join()

# result_perf = Parallel(
# n_jobs=n_jobs, verbose=10)(
# delayed(do_partial)(ij)
# for ij in combinations_with_replacement(range(0, len(Gn)), 2))

# result_perf = [
# do_partial(ij)
# for ij in combinations_with_replacement(range(0, len(Gn)), 2)
# ]

for i in result_perf:
Kmatrix[i[0]][i[1]] = i[2]
Kmatrix[i[1]][i[0]] = i[2]

# pbar = tqdm(
# total=((len(Gn) + 1) * len(Gn) / 2),
# desc='calculating kernels',
# file=sys.stdout)
# if ds_attrs['node_labeled']:
# # node symb and non-synb labeled
# if ds_attrs['node_attr_dim'] > 0:
# if ds_attrs['is_directed']:
# for i, j in combinations_with_replacement(
# range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['mix']
# try:
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn(n11[node_label], n21[node_label], [
# n11['attributes']
# ], [n21['attributes']]) * kn(
# n12[node_label], n22[node_label],
# [n12['attributes']], [n22['attributes']])
# Kmatrix[i][j] += kn1
# except KeyError: # missing labels or attributes
# pass
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)

# else:
# for i, j in combinations_with_replacement(
# range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['mix']
# try:
# # each edge walk is counted twice, starting from both its extreme nodes.
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn(n11[node_label], n21[node_label], [
# n11['attributes']
# ], [n21['attributes']]) * kn(
# n12[node_label], n22[node_label],
# [n12['attributes']], [n22['attributes']])
# kn2 = kn(n11[node_label], n22[node_label], [
# n11['attributes']
# ], [n22['attributes']]) * kn(
# n12[node_label], n21[node_label],
# [n12['attributes']], [n21['attributes']])
# Kmatrix[i][j] += kn1 + kn2
# except KeyError: # missing labels or attributes
# pass
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)
# # node symb labeled
# else:
# if ds_attrs['is_directed']:
# for i, j in combinations_with_replacement(
# range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['symb']
# try:
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn(n11[node_label],
# n21[node_label]) * kn(
# n12[node_label], n22[node_label])
# Kmatrix[i][j] += kn1
# except KeyError: # missing labels
# pass
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)

# else:
# for i, j in combinations_with_replacement(
# range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['symb']
# try:
# # each edge walk is counted twice, starting from both its extreme nodes.
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn(n11[node_label],
# n21[node_label]) * kn(
# n12[node_label], n22[node_label])
# kn2 = kn(n11[node_label],
# n22[node_label]) * kn(
# n12[node_label], n21[node_label])
# Kmatrix[i][j] += kn1 + kn2
# except KeyError: # missing labels
# pass
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)
# else:
# # node non-synb labeled
# if ds_attrs['node_attr_dim'] > 0:
# if ds_attrs['is_directed']:
# for i, j in combinations_with_replacement(
# range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['nsymb']
# try:
# # each edge walk is counted twice, starting from both its extreme nodes.
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn([n11['attributes']],
# [n21['attributes']]) * kn(
# [n12['attributes']],
# [n22['attributes']])
# Kmatrix[i][j] += kn1
# except KeyError: # missing attributes
# pass
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)
# else:
# for i, j in combinations_with_replacement(
# range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['nsymb']
# try:
# # each edge walk is counted twice, starting from both its extreme nodes.
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn([n11['attributes']],
# [n21['attributes']]) * kn(
# [n12['attributes']],
# [n22['attributes']])
# kn2 = kn([n11['attributes']],
# [n22['attributes']]) * kn(
# [n12['attributes']],
# [n21['attributes']])
# Kmatrix[i][j] += kn1 + kn2
# except KeyError: # missing attributes
# pass
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)

# # node unlabeled
# else:
# for i, j in combinations_with_replacement(range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# Kmatrix[i][j] += 1
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)

run_time = time.time() - start_time
print(
"\n --- shortest path kernel matrix of size %d built in %s seconds ---"
% (len(Gn), run_time))

return Kmatrix, run_time, idx


def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij):

i = ij[0]
j = ij[1]
Kmatrix = 0
if ds_attrs['node_labeled']: if ds_attrs['node_labeled']:
# node symb and non-synb labeled # node symb and non-synb labeled
if ds_attrs['node_attr_dim'] > 0: if ds_attrs['node_attr_dim'] > 0:
if ds_attrs['is_directed']: if ds_attrs['is_directed']:
for i, j in combinations_with_replacement(
range(0, len(Gn)), 2):
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['mix']
try:
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(n11[node_label], n21[node_label], [
n11['attributes']
], [n21['attributes']]) * kn(
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['mix']
try:
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(
n11[node_label], n21[node_label],
[n11['attributes']], [n21['attributes']]) * kn(
n12[node_label], n22[node_label], n12[node_label], n22[node_label],
[n12['attributes']], [n22['attributes']]) [n12['attributes']], [n22['attributes']])
Kmatrix[i][j] += kn1
except KeyError: # missing labels or attributes
pass
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)

Kmatrix += kn1
except KeyError: # missing labels or attributes
pass
else: else:
for i, j in combinations_with_replacement(
range(0, len(Gn)), 2):
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['mix']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(n11[node_label], n21[node_label], [
n11['attributes']
], [n21['attributes']]) * kn(
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['mix']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(
n11[node_label], n21[node_label],
[n11['attributes']], [n21['attributes']]) * kn(
n12[node_label], n22[node_label], n12[node_label], n22[node_label],
[n12['attributes']], [n22['attributes']]) [n12['attributes']], [n22['attributes']])
kn2 = kn(n11[node_label], n22[node_label], [
n11['attributes']
], [n22['attributes']]) * kn(
kn2 = kn(
n11[node_label], n22[node_label],
[n11['attributes']], [n22['attributes']]) * kn(
n12[node_label], n21[node_label], n12[node_label], n21[node_label],
[n12['attributes']], [n21['attributes']]) [n12['attributes']], [n21['attributes']])
Kmatrix[i][j] += kn1 + kn2
except KeyError: # missing labels or attributes
pass
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)
Kmatrix += kn1 + kn2
except KeyError: # missing labels or attributes
pass
# node symb labeled # node symb labeled
else: else:
if ds_attrs['is_directed']: if ds_attrs['is_directed']:
for i, j in combinations_with_replacement(
range(0, len(Gn)), 2):
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['symb']
try:
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(n11[node_label],
n21[node_label]) * kn(
n12[node_label], n22[node_label])
Kmatrix[i][j] += kn1
except KeyError: # missing labels
pass
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)

for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['symb']
try:
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(n11[node_label], n21[node_label]) * kn(
n12[node_label], n22[node_label])
Kmatrix += kn1
except KeyError: # missing labels
pass
else: else:
for i, j in combinations_with_replacement(
range(0, len(Gn)), 2):
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['symb']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(n11[node_label],
n21[node_label]) * kn(
n12[node_label], n22[node_label])
kn2 = kn(n11[node_label],
n22[node_label]) * kn(
n12[node_label], n21[node_label])
Kmatrix[i][j] += kn1 + kn2
except KeyError: # missing labels
pass
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['symb']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(n11[node_label], n21[node_label]) * kn(
n12[node_label], n22[node_label])
kn2 = kn(n11[node_label], n22[node_label]) * kn(
n12[node_label], n21[node_label])
Kmatrix += kn1 + kn2
except KeyError: # missing labels
pass
else: else:
# node non-synb labeled # node non-synb labeled
if ds_attrs['node_attr_dim'] > 0: if ds_attrs['node_attr_dim'] > 0:
if ds_attrs['is_directed']: if ds_attrs['is_directed']:
for i, j in combinations_with_replacement(
range(0, len(Gn)), 2):
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['nsymb']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn([n11['attributes']],
[n21['attributes']]) * kn(
[n12['attributes']],
[n22['attributes']])
Kmatrix[i][j] += kn1
except KeyError: # missing attributes
pass
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['nsymb']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(
[n11['attributes']], [n21['attributes']]) * kn(
[n12['attributes']], [n22['attributes']])
Kmatrix += kn1
except KeyError: # missing attributes
pass
else: else:
for i, j in combinations_with_replacement(
range(0, len(Gn)), 2):
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['nsymb']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn([n11['attributes']],
[n21['attributes']]) * kn(
[n12['attributes']],
[n22['attributes']])
kn2 = kn([n11['attributes']],
[n22['attributes']]) * kn(
[n12['attributes']],
[n21['attributes']])
Kmatrix[i][j] += kn1 + kn2
except KeyError: # missing attributes
pass
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)

# node unlabeled
else:
for i, j in combinations_with_replacement(range(0, len(Gn)), 2):
for e1, e2 in product( for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)): Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']: if e1[2]['cost'] == e2[2]['cost']:
Kmatrix[i][j] += 1
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)
kn = node_kernels['nsymb']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(
[n11['attributes']], [n21['attributes']]) * kn(
[n12['attributes']], [n22['attributes']])
kn2 = kn(
[n11['attributes']], [n22['attributes']]) * kn(
[n12['attributes']], [n21['attributes']])
Kmatrix += kn1 + kn2
except KeyError: # missing attributes
pass
# node unlabeled
else:
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
Kmatrix += 1


run_time = time.time() - start_time
print(
"\n --- shortest path kernel matrix of size %d built in %s seconds ---"
% (len(Gn), run_time))
return i, j, Kmatrix


return Kmatrix, run_time, idx

def wrap_getSPGraph(Gn, weight, i):
return i, getSPGraph(Gn[i], edge_weight=weight)

+ 337
- 217
pygraph/utils/model_selection_precomputed.py View File

@@ -1,11 +1,32 @@
import numpy as np
from matplotlib import pyplot as plt
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import KFold, train_test_split, ParameterGrid


from joblib import Parallel, delayed
from multiprocessing import Pool
from functools import partial
import sys
sys.path.insert(0, "../")
import os
import time
from os.path import basename, splitext
from pygraph.utils.graphfiles import loadDataset
from tqdm import tqdm


def model_selection_for_precomputed_kernel(datafile, estimator,
param_grid_precomputed, param_grid,
model_type, NUM_TRIALS=30,

def model_selection_for_precomputed_kernel(datafile,
estimator,
param_grid_precomputed,
param_grid,
model_type,
NUM_TRIALS=30,
datafile_y=None, datafile_y=None,
extra_params=None, extra_params=None,
ds_name='ds-unknown'):
ds_name='ds-unknown',
n_jobs=1):
"""Perform model selection, fitting and testing for precomputed kernels using nested cv. Print out neccessary data during the process then finally the results. """Perform model selection, fitting and testing for precomputed kernels using nested cv. Print out neccessary data during the process then finally the results.


Parameters Parameters
@@ -40,94 +61,101 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
>>> >>>
>>> model_selection_for_precomputed_kernel(datafile, estimator, param_grid_precomputed, param_grid, 'regression') >>> model_selection_for_precomputed_kernel(datafile, estimator, param_grid_precomputed, param_grid, 'regression')
""" """
import numpy as np
from matplotlib import pyplot as plt
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import KFold, train_test_split, ParameterGrid

import sys
sys.path.insert(0, "../")
import os
from os.path import basename, splitext
from pygraph.utils.graphfiles import loadDataset
from tqdm import tqdm
tqdm.monitor_interval = 0 tqdm.monitor_interval = 0


results_dir = '../notebooks/results/' + estimator.__name__ results_dir = '../notebooks/results/' + estimator.__name__
if not os.path.exists(results_dir):
os.makedirs(results_dir)

# open file to save all results for this dataset.
with open(results_dir + '/' + ds_name + '.txt', 'w') as fresults:
fresults.write('# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n')

# setup the model type
model_type = model_type.lower()
if model_type != 'regression' and model_type != 'classification':
raise Exception(
'The model type is incorrect! Please choose from regression or classification.')
print()
print('--- This is a %s problem ---' % model_type)
fresults.write('This is a %s problem.\n\n' % model_type)
# a string to save all the results.
str_fw = '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n'


# Load the dataset
print()
print('\nI. Loading dataset from file...')
dataset, y = loadDataset(datafile, filename_y=datafile_y, extra_params=extra_params)
# setup the model type
model_type = model_type.lower()
if model_type != 'regression' and model_type != 'classification':
raise Exception(
'The model type is incorrect! Please choose from regression or classification.'
)
print()
print('--- This is a %s problem ---' % model_type)
str_fw += 'This is a %s problem.\n\n' % model_type

# Load the dataset
print()
print('\nI. Loading dataset from file...')
dataset, y = loadDataset(
datafile, filename_y=datafile_y, extra_params=extra_params)


# import matplotlib.pyplot as plt
# import matplotlib.pyplot as plt
# import networkx as nx # import networkx as nx
# nx.draw_networkx(dataset[30]) # nx.draw_networkx(dataset[30])
# plt.show() # plt.show()


# Grid of parameters with a discrete number of values for each.
param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
param_list = list(ParameterGrid(param_grid))
# np.savetxt(results_name_pre + 'param_grid_precomputed.dt',
# [[key, value] for key, value in sorted(param_grid_precomputed)])
# np.savetxt(results_name_pre + 'param_grid.dt',
# [[key, value] for key, value in sorted(param_grid)])
# Grid of parameters with a discrete number of values for each.
param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
param_list = list(ParameterGrid(param_grid))
# np.savetxt(results_name_pre + 'param_grid_precomputed.dt',
# [[key, value] for key, value in sorted(param_grid_precomputed)])
# np.savetxt(results_name_pre + 'param_grid.dt',
# [[key, value] for key, value in sorted(param_grid)])


gram_matrices = [] # a list to store gram matrices for all param_grid_precomputed
gram_matrix_time = [] # a list to store time to calculate gram matrices
param_list_pre_revised = [] # list to store param grids precomputed ignoring the useless ones
gram_matrices = [
] # a list to store gram matrices for all param_grid_precomputed
gram_matrix_time = [
] # a list to store time to calculate gram matrices
param_list_pre_revised = [
] # list to store param grids precomputed ignoring the useless ones

# calculate all gram matrices
print()
print('2. Calculating gram matrices. This could take a while...')
str_fw += '\nI. Gram matrices.\n\n'
tts = time.time() # start training time
nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN)
for idx, params_out in enumerate(param_list_precomputed):
params_out['n_jobs'] = n_jobs
rtn_data = estimator(dataset, **params_out)
Kmatrix = rtn_data[0]
current_run_time = rtn_data[1]
if len(rtn_data) == 3:
idx_trim = rtn_data[2] # the index of trimmed graph list
y = [y[idx] for idx in idx_trim]

Kmatrix_diag = Kmatrix.diagonal().copy()
# remove graphs whose kernels with themselves are zeros
nb_g_ignore = 0
for idx, diag in enumerate(Kmatrix_diag):
if diag == 0:
Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=0)
Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=1)
nb_g_ignore += 1
# normalization
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
Kmatrix[j][i] = Kmatrix[i][j]


# calculate all gram matrices
print() print()
print('2. Calculating gram matrices. This could take a while...')
fresults.write('\nI. Gram matrices.\n\n')
nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN)
for idx, params_out in enumerate(param_list_precomputed):
rtn_data = estimator(dataset, **params_out)
Kmatrix = rtn_data[0]
current_run_time = rtn_data[1]
if len(rtn_data) == 3:
idx_trim = rtn_data[2] # the index of trimmed graph list
y = [y[idx] for idx in idx_trim]
Kmatrix_diag = Kmatrix.diagonal().copy()
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):
# if Kmatrix_diag[i] != 0 and Kmatrix_diag[j] != 0:
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
Kmatrix[j][i] = Kmatrix[i][j]

print()
if params_out == {}:
print('the gram matrix is: ')
fresults.write('the gram matrix is:\n\n')
else:
print('the gram matrix with parameters', params_out, 'is: ')
fresults.write('the gram matrix with parameters %s is:\n\n' % params_out)
if np.isnan(Kmatrix).any(): # if the matrix contains elements that are not numbers
if params_out == {}:
print('the gram matrix is: ')
str_fw += 'the gram matrix is:\n\n'
else:
print('the gram matrix with parameters', params_out, 'is: ')
str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out
if len(Kmatrix) < 2:
nb_gm_ignore += 1
print('ignored, as at most only one of all its diagonal value is non-zero.')
str_fw += 'ignored, as at most only one of all its diagonal value is non-zero.\n\n'
else:
if np.isnan(Kmatrix).any(
): # if the matrix contains elements that are not numbers
nb_gm_ignore += 1 nb_gm_ignore += 1
print('ignored, as it contains elements that are not numbers.') print('ignored, as it contains elements that are not numbers.')
fresults.write('ignored, as it contains elements that are not numbers.\n\n')
str_fw += 'ignored, as it contains elements that are not numbers.\n\n'
else: else:
print(Kmatrix) print(Kmatrix)
fresults.write(np.array2string(Kmatrix, separator=',', threshold=np.inf, floatmode='unique') + '\n\n')
str_fw += np.array2string(
Kmatrix,
separator=',',
threshold=np.inf,
floatmode='unique') + '\n\n'
plt.matshow(Kmatrix) plt.matshow(Kmatrix)
plt.colorbar() plt.colorbar()
fig_file_name = results_dir + '/GM[ds]' + ds_name fig_file_name = results_dir + '/GM[ds]' + ds_name
@@ -138,115 +166,52 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
gram_matrices.append(Kmatrix) gram_matrices.append(Kmatrix)
gram_matrix_time.append(current_run_time) gram_matrix_time.append(current_run_time)
param_list_pre_revised.append(params_out) param_list_pre_revised.append(params_out)
print()
print('{} gram matrices are calculated, {} of which are ignored.'.format(len(param_list_precomputed), nb_gm_ignore))
fresults.write('{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore))
fresults.write('serial numbers of gram matrix figure and their corresponding parameters settings:\n\n')
fresults.write(''.join(['{}: {}\n'.format(idx, params_out)
for idx, params_out in enumerate(param_list_precomputed)]))
if nb_g_ignore > 0:
print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore)
str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore
print()
print(
'{} gram matrices are calculated, {} of which are ignored.'.format(
len(param_list_precomputed), nb_gm_ignore))
str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore)
str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n'
str_fw += ''.join([
'{}: {}\n'.format(idx, params_out)
for idx, params_out in enumerate(param_list_precomputed)
])


print()
print('3. Fitting and predicting using nested cross validation. This could really take a while...')
# Arrays to store scores
train_pref = np.zeros(
(NUM_TRIALS, len(param_list_pre_revised), len(param_list)))
val_pref = np.zeros(
(NUM_TRIALS, len(param_list_pre_revised), len(param_list)))
test_pref = np.zeros(
(NUM_TRIALS, len(param_list_pre_revised), len(param_list)))

# Loop for each trial
pbar = tqdm(total=NUM_TRIALS * len(param_list_pre_revised) * len(param_list),
desc='calculate performance', file=sys.stdout)
for trial in range(NUM_TRIALS): # Test set level
# loop for each outer param tuple
for index_out, params_out in enumerate(param_list_pre_revised):
# split gram matrix and y to app and test sets.
X_app, X_test, y_app, y_test = train_test_split(
gram_matrices[index_out], y, test_size=0.1)
split_index_app = [y.index(y_i) for y_i in y_app if y_i in y]
# split_index_test = [y.index(y_i) for y_i in y_test if y_i in y]
X_app = X_app[:, split_index_app]
X_test = X_test[:, split_index_app]
y_app = np.array(y_app)
y_test = np.array(y_test)

# loop for each inner param tuple
for index_in, params_in in enumerate(param_list):
inner_cv = KFold(n_splits=10, shuffle=True, random_state=trial)
current_train_perf = []
current_valid_perf = []
current_test_perf = []

# For regression use the Kernel Ridge method
try:
if model_type == 'regression':
KR = KernelRidge(kernel='precomputed', **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
KR.fit(X_app[train_index, :]
[:, train_index], y_app[train_index])
# predict on the train, validation and test set
y_pred_train = KR.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = KR.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = KR.predict(X_test[:, train_index])
# root mean squared errors
current_train_perf.append(
np.sqrt(mean_squared_error(y_app[train_index], y_pred_train)))
current_valid_perf.append(
np.sqrt(mean_squared_error(y_app[valid_index], y_pred_valid)))
current_test_perf.append(
np.sqrt(mean_squared_error(y_test, y_pred_test)))
# For clcassification use SVM
else:
KR = SVC(kernel='precomputed', **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
KR.fit(X_app[train_index, :]
[:, train_index], y_app[train_index])
# predict on the train, validation and test set
y_pred_train = KR.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = KR.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = KR.predict(
X_test[:, train_index])
# root mean squared errors
current_train_perf.append(accuracy_score(
y_app[train_index], y_pred_train))
current_valid_perf.append(accuracy_score(
y_app[valid_index], y_pred_valid))
current_test_perf.append(
accuracy_score(y_test, y_pred_test))
except ValueError:
print(sys.exc_info()[0])
print(params_out, params_in)
# average performance on inner splits
train_pref[trial][index_out][index_in] = np.mean(
current_train_perf)
val_pref[trial][index_out][index_in] = np.mean(
current_valid_perf)
test_pref[trial][index_out][index_in] = np.mean(
current_test_perf)
pbar.update(1)
pbar.clear()
print()
if len(gram_matrices) == 0:
print('all gram matrices are ignored, no results obtained.')
str_fw += '\nall gram matrices are ignored, no results obtained.\n\n'
else:
print(
'3. Fitting and predicting using nested cross validation. This could really take a while...'
)
pool = Pool(n_jobs)
trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
train_pref = [item[0] for item in result_perf]
val_pref = [item[1] for item in result_perf]
test_pref = [item[2] for item in result_perf]
pool.close()
pool.join()

# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
# result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS))
# train_pref = [item[0] for item in result_perf]
# val_pref = [item[1] for item in result_perf]
# test_pref = [item[2] for item in result_perf]


# pbar.clear()
# np.save(results_name_pre + 'train_pref.dt', train_pref) # np.save(results_name_pre + 'train_pref.dt', train_pref)
# np.save(results_name_pre + 'val_pref.dt', val_pref) # np.save(results_name_pre + 'val_pref.dt', val_pref)
# np.save(results_name_pre + 'test_pref.dt', test_pref) # np.save(results_name_pre + 'test_pref.dt', test_pref)


print() print()
print('4. Getting final performance...') print('4. Getting final performance...')
fresults.write('\nII. Performance.\n\n')
str_fw += '\nII. Performance.\n\n'
# averages and confidences of performances on outer trials for each combination of parameters # averages and confidences of performances on outer trials for each combination of parameters
average_train_scores = np.mean(train_pref, axis=0) average_train_scores = np.mean(train_pref, axis=0)
average_val_scores = np.mean(val_pref, axis=0) average_val_scores = np.mean(val_pref, axis=0)
@@ -255,53 +220,78 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
std_train_scores = np.std(train_pref, axis=0, ddof=1) std_train_scores = np.std(train_pref, axis=0, ddof=1)
std_val_scores = np.std(val_pref, axis=0, ddof=1) std_val_scores = np.std(val_pref, axis=0, ddof=1)
std_perf_scores = np.std(test_pref, axis=0, ddof=1) std_perf_scores = np.std(test_pref, axis=0, ddof=1)
if model_type == 'regression': if model_type == 'regression':
best_val_perf = np.amin(average_val_scores) best_val_perf = np.amin(average_val_scores)
else: else:
best_val_perf = np.amax(average_val_scores) best_val_perf = np.amax(average_val_scores)
best_params_index = np.where(average_val_scores == best_val_perf) best_params_index = np.where(average_val_scores == best_val_perf)
# find smallest val std with best val perf. # find smallest val std with best val perf.
best_val_stds = [std_val_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
best_val_stds = [
std_val_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
min_val_std = np.amin(best_val_stds) min_val_std = np.amin(best_val_stds)
best_params_index = np.where(std_val_scores == min_val_std) best_params_index = np.where(std_val_scores == min_val_std)
best_params_out = [param_list_pre_revised[i] for i in best_params_index[0]]
best_params_out = [
param_list_pre_revised[i] for i in best_params_index[0]
]
best_params_in = [param_list[i] for i in best_params_index[1]] best_params_in = [param_list[i] for i in best_params_index[1]]
print('best_params_out: ', best_params_out) print('best_params_out: ', best_params_out)
print('best_params_in: ', best_params_in) print('best_params_in: ', best_params_in)
print() print()
print('best_val_perf: ', best_val_perf) print('best_val_perf: ', best_val_perf)
print('best_val_std: ', min_val_std) print('best_val_std: ', min_val_std)
fresults.write('best settings of hyper-params to build gram matrix: %s\n' % best_params_out)
fresults.write('best settings of other hyper-params: %s\n\n' % best_params_in)
fresults.write('best_val_perf: %s\n' % best_val_perf)
fresults.write('best_val_std: %s\n' % min_val_std)
str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out
str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in
str_fw += 'best_val_perf: %s\n' % best_val_perf
str_fw += 'best_val_std: %s\n' % min_val_std


final_performance = [average_perf_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
final_confidence = [std_perf_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
final_performance = [
average_perf_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
final_confidence = [
std_perf_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
print('final_performance: ', final_performance) print('final_performance: ', final_performance)
print('final_confidence: ', final_confidence) print('final_confidence: ', final_confidence)
fresults.write('final_performance: %s\n' % final_performance)
fresults.write('final_confidence: %s\n' % final_confidence)
train_performance = [average_train_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
train_std = [std_train_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
str_fw += 'final_performance: %s\n' % final_performance
str_fw += 'final_confidence: %s\n' % final_confidence
train_performance = [
average_train_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
train_std = [
std_train_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
print('train_performance: %s' % train_performance) print('train_performance: %s' % train_performance)
print('train_std: ', train_std) print('train_std: ', train_std)
fresults.write('train_performance: %s\n' % train_performance)
fresults.write('train_std: %s\n\n' % train_std)
str_fw += 'train_performance: %s\n' % train_performance
str_fw += 'train_std: %s\n\n' % train_std


print() print()
tt_total = time.time() - tts # training time for all hyper-parameters
average_gram_matrix_time = np.mean(gram_matrix_time) average_gram_matrix_time = np.mean(gram_matrix_time)
std_gram_matrix_time = np.std(gram_matrix_time, ddof=1) std_gram_matrix_time = np.std(gram_matrix_time, ddof=1)
best_gram_matrix_time = [gram_matrix_time[i] for i in best_params_index[0]]
best_gram_matrix_time = [
gram_matrix_time[i] for i in best_params_index[0]
]
ave_bgmt = np.mean(best_gram_matrix_time) ave_bgmt = np.mean(best_gram_matrix_time)
std_bgmt = np.std(best_gram_matrix_time, ddof=1) std_bgmt = np.std(best_gram_matrix_time, ddof=1)
print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
.format(average_gram_matrix_time, std_gram_matrix_time))
print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(ave_bgmt, std_bgmt))
fresults.write('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'
.format(average_gram_matrix_time, std_gram_matrix_time))
fresults.write('time to calculate best gram matrix: {:.2f}±{:.2f}s\n\n'.format(ave_bgmt, std_bgmt))
print(
'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
.format(average_gram_matrix_time, std_gram_matrix_time))
print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(
ave_bgmt, std_bgmt))
print(
'total training time with all hyper-param choices: {:.2f}s'.format(
tt_total))
str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time)
str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total)


# # save results to file # # save results to file
# np.savetxt(results_name_pre + 'average_train_scores.dt', # np.savetxt(results_name_pre + 'average_train_scores.dt',
@@ -312,7 +302,7 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
# np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores) # np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores)
# np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores) # np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores)
# np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores) # np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores)
# np.save(results_name_pre + 'best_params_index', best_params_index) # np.save(results_name_pre + 'best_params_index', best_params_index)
# np.save(results_name_pre + 'best_params_pre.dt', best_params_out) # np.save(results_name_pre + 'best_params_pre.dt', best_params_out)
# np.save(results_name_pre + 'best_params_in.dt', best_params_in) # np.save(results_name_pre + 'best_params_in.dt', best_params_in)
@@ -322,7 +312,7 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
# np.save(results_name_pre + 'final_confidence.dt', final_confidence) # np.save(results_name_pre + 'final_confidence.dt', final_confidence)
# np.save(results_name_pre + 'train_performance.dt', train_performance) # np.save(results_name_pre + 'train_performance.dt', train_performance)
# np.save(results_name_pre + 'train_std.dt', train_std) # np.save(results_name_pre + 'train_std.dt', train_std)
# np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time) # np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time)
# np.save(results_name_pre + 'average_gram_matrix_time.dt', # np.save(results_name_pre + 'average_gram_matrix_time.dt',
# average_gram_matrix_time) # average_gram_matrix_time)
@@ -330,7 +320,7 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
#         std_gram_matrix_time)
# np.save(results_name_pre + 'best_gram_matrix_time.dt',
#         best_gram_matrix_time)
# print out as table.
from collections import OrderedDict
from tabulate import tabulate
@@ -343,20 +333,150 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
param_in['C'] = '{:.2e}'.format(param_in['C'])
table_dict['params'] = [{**param_out, **param_in}
                        for param_in in param_list for param_out in param_list_pre_revised]
-        table_dict['gram_matrix_time'] = ['{:.2f}'.format(gram_matrix_time[index_out])
-                                          for param_in in param_list for index_out, _ in enumerate(param_list_pre_revised)]
-        table_dict['valid_perf'] = ['{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], std_val_scores[index_out][index_in])
-                                    for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)]
-        table_dict['test_perf'] = ['{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in], std_perf_scores[index_out][index_in])
-                                   for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)]
-        table_dict['train_perf'] = ['{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in], std_train_scores[index_out][index_in])
-                                    for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)]
-        keyorder = ['params', 'train_perf', 'valid_perf',
-                    'test_perf', 'gram_matrix_time']
+        table_dict['gram_matrix_time'] = [
+            '{:.2f}'.format(gram_matrix_time[index_out])
+            for param_in in param_list
+            for index_out, _ in enumerate(param_list_pre_revised)
+        ]
+        table_dict['valid_perf'] = [
+            '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
+                                   std_val_scores[index_out][index_in])
+            for index_in, _ in enumerate(param_list)
+            for index_out, _ in enumerate(param_list_pre_revised)
+        ]
+        table_dict['test_perf'] = [
+            '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
+                                   std_perf_scores[index_out][index_in])
+            for index_in, _ in enumerate(param_list)
+            for index_out, _ in enumerate(param_list_pre_revised)
+        ]
+        table_dict['train_perf'] = [
+            '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
+                                   std_train_scores[index_out][index_in])
+            for index_in, _ in enumerate(param_list)
+            for index_out, _ in enumerate(param_list_pre_revised)
+        ]
+        keyorder = [
+            'params', 'train_perf', 'valid_perf', 'test_perf',
+            'gram_matrix_time'
+        ]
        print()
-        tb_print = tabulate(OrderedDict(sorted(table_dict.items(),
-            key=lambda i: keyorder.index(i[0]))), headers='keys')
+        tb_print = tabulate(
+            OrderedDict(
+                sorted(table_dict.items(),
+                       key=lambda i: keyorder.index(i[0]))),
+            headers='keys')
        print(tb_print)
-        fresults.write('table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print)
+        str_fw += 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print

# open file to save all results for this dataset.
if not os.path.exists(results_dir):
os.makedirs(results_dir)

with open(results_dir + '/' + ds_name + '.txt', 'w') as fresults:
fresults.write(str_fw)
fresults.close()


def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type,
             trial):  # Test set level
    """Run one outer trial of nested cross-validation on precomputed kernels.

    For each outer (kernel) parameter tuple — one precomputed Gram matrix
    each — the samples are split 90/10 into an app(lication) set and a test
    set. For each inner (estimator) parameter tuple, a 10-fold CV on the app
    set trains a precomputed-kernel model (KernelRidge for regression, SVC
    otherwise) and records train / validation / test performance.

    Parameters
    ----------
    param_list_pre_revised : list of dict
        Kernel (outer) hyper-parameter tuples, aligned with gram_matrices.
    param_list : list of dict
        Estimator (inner) hyper-parameter tuples, forwarded to the model.
    gram_matrices : list of ndarray
        Precomputed Gram matrices, one per outer parameter tuple.
    y : list
        Targets, aligned with the rows/columns of each Gram matrix.
    model_type : str
        'regression' -> KernelRidge with RMSE; anything else -> SVC with
        accuracy.
    trial : int
        Trial number; seeds the inner KFold shuffling so every inner
        parameter tuple sees the same folds within a trial.

    Returns
    -------
    train_pref, val_pref, test_pref : ndarray
        Score matrices of shape (len(param_list_pre_revised), len(param_list)).
    """
    # Arrays to store scores.
    train_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
    val_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
    test_pref = np.zeros((len(param_list_pre_revised), len(param_list)))

    # loop for each outer param tuple
    for index_out, params_out in enumerate(param_list_pre_revised):
        # Split the Gram matrix and y into app and test sets. Split sample
        # *indices* rather than target values: the previous
        # `[y.index(y_i) for y_i in y_app]` lookup always returned the first
        # occurrence of a duplicated target (the normal case in
        # classification), selecting wrong/duplicated kernel columns.
        X_app, X_test, idx_app, idx_test = train_test_split(
            gram_matrices[index_out], list(range(len(y))), test_size=0.1)
        # Keep only the columns of the app samples, so the app/test kernels
        # have shapes (n_app, n_app) and (n_test, n_app).
        X_app = X_app[:, idx_app]
        X_test = X_test[:, idx_app]
        y_app = np.array([y[i] for i in idx_app])
        y_test = np.array([y[i] for i in idx_test])

        # loop for each inner param tuple
        for index_in, params_in in enumerate(param_list):
            inner_cv = KFold(n_splits=10, shuffle=True, random_state=trial)
            current_train_perf = []
            current_valid_perf = []
            current_test_perf = []

            try:
                # For regression use the Kernel Ridge method.
                if model_type == 'regression':
                    KR = KernelRidge(kernel='precomputed', **params_in)
                    # loop for each split on validation set level
                    for train_index, valid_index in inner_cv.split(X_app):
                        KR.fit(X_app[train_index, :][:, train_index],
                               y_app[train_index])

                        # predict on the train, validation and test set
                        y_pred_train = KR.predict(
                            X_app[train_index, :][:, train_index])
                        y_pred_valid = KR.predict(
                            X_app[valid_index, :][:, train_index])
                        y_pred_test = KR.predict(X_test[:, train_index])

                        # root mean squared errors
                        current_train_perf.append(
                            np.sqrt(mean_squared_error(
                                y_app[train_index], y_pred_train)))
                        current_valid_perf.append(
                            np.sqrt(mean_squared_error(
                                y_app[valid_index], y_pred_valid)))
                        current_test_perf.append(
                            np.sqrt(mean_squared_error(y_test, y_pred_test)))
                # For classification use SVM.
                else:
                    KR = SVC(kernel='precomputed', **params_in)
                    # loop for each split on validation set level
                    for train_index, valid_index in inner_cv.split(X_app):
                        KR.fit(X_app[train_index, :][:, train_index],
                               y_app[train_index])

                        # predict on the train, validation and test set
                        y_pred_train = KR.predict(
                            X_app[train_index, :][:, train_index])
                        y_pred_valid = KR.predict(
                            X_app[valid_index, :][:, train_index])
                        y_pred_test = KR.predict(X_test[:, train_index])

                        # accuracies
                        current_train_perf.append(
                            accuracy_score(y_app[train_index], y_pred_train))
                        current_valid_perf.append(
                            accuracy_score(y_app[valid_index], y_pred_valid))
                        current_test_perf.append(
                            accuracy_score(y_test, y_pred_test))
            except ValueError:
                # Ill-conditioned parameter combination: report it; the
                # scores for this cell are the mean of whatever folds
                # completed (NaN if none did).
                print(sys.exc_info()[0])
                print(params_out, params_in)

            # average performance over the inner splits
            train_pref[index_out][index_in] = np.mean(current_train_perf)
            val_pref[index_out][index_in] = np.mean(current_valid_perf)
            test_pref[index_out][index_in] = np.mean(current_test_perf)

    # NOTE(review): a stray `fresults.close()` used to sit here; `fresults`
    # is never defined in this function (NameError on every call), so it has
    # been removed.
    return train_pref, val_pref, test_pref

+ 2
- 1
pygraph/utils/utils.py View File

@@ -61,10 +61,11 @@ def floydTransformation(G, edge_weight=None):
     spMatrix = nx.floyd_warshall_numpy(G, weight=edge_weight)
     S = nx.Graph()
     S.add_nodes_from(G.nodes(data=True))
+    ns = list(G.nodes())
     for i in range(0, G.number_of_nodes()):
         for j in range(i + 1, G.number_of_nodes()):
             if spMatrix[i, j] != np.inf:
-                S.add_edge(i, j, cost=spMatrix[i, j])
+                S.add_edge(ns[i], ns[j], cost=spMatrix[i, j])
     return S






Loading…
Cancel
Save