
correct randomness of data split for parallelization.

v0.1
jajupmochi 6 years ago
commit 8baa21cb67
14 changed files with 39707 additions and 77953 deletions
  1. notebooks/check_gm.py (+1 / -1)
  2. notebooks/check_gm.zip (BIN)
  3. notebooks/check_gm/Acyclic.gm.eps (+19554 / -38693)
  4. notebooks/check_gm/Letter-med.gm.eps (+19686 / -38836)
  5. notebooks/run_spkernel.ipynb (+97 / -75)
  6. notebooks/run_spkernel.py (+11 / -11)
  7. notebooks/run_structuralspkernel.py (+8 / -8)
  8. notebooks/run_untilhpathkernel.py (+1 / -1)
  9. notebooks/test.py (+0 / -77)
  10. pygraph/kernels/commonWalkKernel.py (+26 / -20)
  11. pygraph/kernels/spKernel.py (+23 / -20)
  12. pygraph/kernels/structuralspKernel.py (+98 / -66)
  13. pygraph/kernels/untilHPathKernel.py (+20 / -21)
  14. pygraph/utils/model_selection_precomputed.py (+182 / -124)

notebooks/check_gm.py (+1 / -1)

@@ -12,7 +12,7 @@ import matplotlib.pyplot as plt
from numpy.linalg import eig

# read gram matrices from file.
results_dir = 'results/structuralspkernel/'
results_dir = 'results/untilhpathkernel/myria'
ds_name = 'Letter-med'
gmfile = np.load(results_dir + '/' + ds_name + '.gm.npz')
#print('gm time: ', gmfile['gmtime'])


notebooks/check_gm.zip (BIN)


notebooks/check_gm/Acyclic.gm.eps (+19554 / -38693): file diff suppressed because it is too large


notebooks/check_gm/Letter-med.gm.eps (+19686 / -38836): file diff suppressed because it is too large


notebooks/run_spkernel.ipynb (+97 / -75)

@@ -6,94 +6,116 @@
"metadata": {
"scrolled": false
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"MAO\n",
"\n",
"--- This is a classification problem ---\n",
"\n",
"\n",
"1. Loading dataset from file...\n",
"\n",
"2. Calculating gram matrices. This could take a while...\n",
"\n",
" None edge weight specified. Set all weight to 1.\n",
"\n",
"getting sp graphs: 68it [00:00, 692.11it/s]\n",
"calculating kernels: 2346it [00:05, 399.28it/s]\n",
"\n",
" --- shortest path kernel matrix of size 68 built in 6.345669507980347 seconds ---\n",
"\n",
"the gram matrix with parameters {'node_kernels': {'symb': <function deltakernel at 0x7fe240afd620>, 'nsymb': <function gaussiankernel at 0x7fe240afd9d8>, 'mix': functools.partial(<function kernelproduct at 0x7fe240aaf0d0>, <function deltakernel at 0x7fe240afd620>, <function gaussiankernel at 0x7fe240afd9d8>)}, 'n_jobs': 8} is: \n",
"\n",
"1 gram matrices are calculated, 0 of which are ignored.\n",
"\n",
"3. Fitting and predicting using nested cross validation. This could really take a while...\n",
"cross validation: 7it [00:09, 4.67s/it]"
]
}
],
"source": [
"%load_ext line_profiler\n",
"%matplotlib inline\n",
"import functools\n",
"from libs import *\n",
"import multiprocessing\n",
"from sklearn.metrics.pairwise import rbf_kernel\n",
"\n",
"from pygraph.kernels.spKernel import spkernel, spkernel_do\n",
"from pygraph.utils.kernels import deltakernel, kernelsum\n",
"from pygraph.utils.model_selection_precomputed import trial_do\n",
"\n",
"dslist = [ \n",
" {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node symb\n",
"# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', \n",
"# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',}, # contains single node graph, node symb\n",
"# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',}, # node/edge symb\n",
"# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',}, # unlabeled\n",
"# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n",
"# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n",
" {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n",
" {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, # node symb/nsymb\n",
" {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # node/edge symb\n",
" {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n",
" 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n",
"\n",
"# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n",
"# # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n",
"# # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb\n",
"# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},\n",
"# \n",
"# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb\n",
"# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb\n",
"# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb\n",
"# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb\n",
"# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb\n",
"\n",
"# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb\n",
"# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb\n",
"# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb\n",
"# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',\n",
"# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n",
"# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',\n",
"# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n",
"# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',\n",
"# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb\n",
" \n",
"# # not working below\n",
"# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},\n",
"# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},\n",
"# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},\n",
"# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},\n",
"from pygraph.kernels.spKernel import spkernel\n",
"from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct\n",
"#from pygraph.utils.model_selection_precomputed import trial_do\n",
"\n",
"dslist = [\n",
"# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',\n",
"# 'task': 'regression'}, # node symb\n",
"# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',\n",
"# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, \n",
"# # contains single node graph, node symb\n",
" {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb\n",
"# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled\n",
"# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n",
"# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n",
"# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n",
"# # node nsymb\n",
"# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n",
"# # node symb/nsymb\n",
"# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n",
" # node/edge symb\n",
"# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n",
"# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n",
"\n",
" # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n",
" # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n",
" # # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb\n",
" # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},\n",
" #\n",
" # # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb\n",
" # # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb\n",
" # # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb\n",
" # # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb\n",
" # # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb\n",
"\n",
" # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb\n",
" # # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb\n",
" # # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb\n",
" # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',\n",
" # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n",
" # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',\n",
" # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n",
" # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',\n",
" # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb\n",
"\n",
" # # not working below\n",
" # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},\n",
" # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},\n",
" # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},\n",
" # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},\n",
"]\n",
"estimator = spkernel\n",
"mixkernel = functools.partial(kernelsum, deltakernel, rbf_kernel)\n",
"param_grid_precomputed = {'node_kernels': [{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]}\n",
"param_grid = [{'C': np.logspace(-10, 10, num = 41, base = 10)}, \n",
" {'alpha': np.logspace(-10, 10, num = 41, base = 10)}]\n",
"mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)\n",
"param_grid_precomputed = {'node_kernels': [\n",
" {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}\n",
"param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},\n",
" {'alpha': np.logspace(-10, 10, num=41, base=10)}]\n",
"\n",
"for ds in dslist:\n",
" print()\n",
" print(ds['name'])\n",
" model_selection_for_precomputed_kernel(\n",
" ds['dataset'], \n",
" estimator, \n",
" param_grid_precomputed, \n",
" (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \n",
" (ds['task'] if 'task' in ds else 'classification'), \n",
" ds['dataset'],\n",
" estimator,\n",
" param_grid_precomputed,\n",
" (param_grid[1] if ('task' in ds and ds['task']\n",
" == 'regression') else param_grid[0]),\n",
" (ds['task'] if 'task' in ds else 'classification'),\n",
" NUM_TRIALS=30,\n",
" datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),\n",
" extra_params=(ds['extra_params'] if 'extra_params' in ds else None),\n",
" ds_name=ds['name'],\n",
" n_jobs=multiprocessing.cpu_count())\n",
" \n",
"# %lprun -f trial_do -f spkernel -f spkernel_do -f model_selection_for_precomputed_kernel \\\n",
"# model_selection_for_precomputed_kernel( \\\n",
"# ds['dataset'], \\\n",
"# estimator, \\\n",
"# param_grid_precomputed, \\\n",
"# (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \\\n",
"# (ds['task'] if 'task' in ds else 'classification'), \\\n",
"# NUM_TRIALS=30, \\\n",
"# datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), \\\n",
"# extra_params=(ds['extra_params'] if 'extra_params' in ds else None), \\\n",
"# ds_name=ds['name'], \\\n",
"# n_jobs=multiprocessing.cpu_count()) \n",
" print()"
" n_jobs=multiprocessing.cpu_count(),\n",
" read_gm_from_file=False)\n",
" print()\n"
]
},
{
@@ -713,8 +735,8 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (Spyder)",
"language": "python3",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
@@ -727,7 +749,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
"version": "3.6.6"
}
},
"nbformat": 4,


notebooks/run_spkernel.py (+11 / -11)

@@ -7,21 +7,21 @@ from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct
#from pygraph.utils.model_selection_precomputed import trial_do

dslist = [
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
# 'task': 'regression'}, # node symb
# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', },
# # contains single node graph, node symb
# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb
# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled
# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
{'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
'task': 'regression'}, # node symb
{'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', },
# contains single node graph, node symb
{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb
{'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled
{'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
{'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
# node nsymb
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# node symb/nsymb
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb
# node/edge symb
# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb

@@ -56,7 +56,7 @@ estimator = spkernel
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
param_grid_precomputed = {'node_kernels': [
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}
param_grid = [{'C': np.logspace(-10, 3, num=27, base=10)},
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]

for ds in dslist:
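
Besides enabling more datasets, the script widens the SVM C grid to match the kernel-ridge alpha grid. A quick check of the old and new grids:

import numpy as np

C_old = np.logspace(-10, 3, num=27, base=10)    # 27 values, 1e-10 ... 1e3
C_new = np.logspace(-10, 10, num=41, base=10)   # 41 values, 1e-10 ... 1e10
print(len(C_old), C_old[0], C_old[-1])
print(len(C_new), C_new[0], C_new[-1])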


notebooks/run_structuralspkernel.py (+8 / -8)

@@ -23,10 +23,10 @@ dslist = [
# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled
# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
{'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
# node nsymb
# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# # node symb/nsymb
# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
# # node nsymb
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# node symb/nsymb
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb
# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
@@ -39,8 +39,8 @@ dslist = [
#
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb, missing values
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb, missing values
# {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb, missing values
# {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb, missing values
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb

# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
@@ -53,8 +53,8 @@ dslist = [
# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb

# # not working below
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# # not working below
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},


notebooks/run_untilhpathkernel.py (+1 / -1)

@@ -62,7 +62,7 @@ dslist = [
]
estimator = untilhpathkernel
mixkernel = functools.partial(kernelproduct, deltakernel, rbf_kernel)
param_grid_precomputed = {'depth': np.linspace(7, 10, 10),
param_grid_precomputed = {'depth': np.linspace(1, 10, 10),
'k_func': ['tanimoto', 'MinMax']}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]
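
The depth grid change is easy to overlook: np.linspace(7, 10, 10) produced mostly fractional depths, whereas np.linspace(1, 10, 10) covers every integer depth from 1 to 10. A quick check:

import numpy as np

print(np.linspace(7, 10, 10))  # [ 7.  7.33  7.67  8.  8.33  8.67  9.  9.33  9.67 10.] (approx.)
print(np.linspace(1, 10, 10))  # [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]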


notebooks/test.py (+0 / -77)

@@ -1,77 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 28 16:37:29 2018

@author: ljia
"""

import functools
from libs import *
import multiprocessing
from sklearn.metrics.pairwise import rbf_kernel

from pygraph.kernels.structuralspKernel import structuralspkernel
from pygraph.utils.kernels import deltakernel, kernelproduct

dslist = [
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
# 'task': 'regression'}, # node symb
# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, # contains single node graph, node symb
{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb

# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
# # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
# # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
#
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb

# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb

# # not working below
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
estimator = structuralspkernel
mixkernel = functools.partial(kernelproduct, deltakernel, rbf_kernel)
param_grid_precomputed = {'node_kernels':
[{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}],
'edge_kernels':
[{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]

for ds in dslist:
print()
print(ds['name'])
model_selection_for_precomputed_kernel(
ds['dataset'],
estimator,
param_grid_precomputed,
(param_grid[1] if ('task' in ds and ds['task']
== 'regression') else param_grid[0]),
(ds['task'] if 'task' in ds else 'classification'),
NUM_TRIALS=30,
datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
ds_name=ds['name'],
n_jobs=multiprocessing.cpu_count(),
read_gm_from_file=False)
print()

pygraph/kernels/commonWalkKernel.py (+26 / -20)

@@ -85,21 +85,20 @@ def commonwalkkernel(*args,

# ---- use pool.imap_unordered to parallel and track progress. ----
pool = Pool(n_jobs)
itr = combinations_with_replacement(range(0, len(Gn)), 2)
itr = zip(combinations_with_replacement(Gn, 2),
combinations_with_replacement(range(0, len(Gn)), 2))
len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
if len_itr < 1000 * n_jobs:
chunksize = int(len_itr / n_jobs) + 1
else:
chunksize = 100
chunksize = 1000

# direct product graph method - exponential
if compute_method == 'exp':
do_partial = partial(_commonwalkkernel_exp, Gn, node_label, edge_label,
weight)
do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight)
# direct product graph method - geometric
elif compute_method == 'geo':
do_partial = partial(_commonwalkkernel_geo, Gn, node_label, edge_label,
weight)
do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight)

for i, j, kernel in tqdm(
pool.imap_unordered(do_partial, itr, chunksize),
@@ -153,7 +152,7 @@ def commonwalkkernel(*args,
return Kmatrix, run_time


def _commonwalkkernel_exp(Gn, node_label, edge_label, beta, ij):
def _commonwalkkernel_exp(g1, g2, node_label, edge_label, beta):
"""Calculate walk graph kernels up to n between 2 graphs using exponential
series.

@@ -175,10 +174,6 @@ def _commonwalkkernel_exp(Gn, node_label, edge_label, beta, ij):
kernel : float
The common walk Kernel between 2 graphs.
"""
iglobal = ij[0]
jglobal = ij[1]
g1 = Gn[iglobal]
g2 = Gn[jglobal]

# get tensor product / direct product
gp = direct_product(g1, g2, node_label, edge_label)
@@ -219,10 +214,18 @@ def _commonwalkkernel_exp(Gn, node_label, edge_label, beta, ij):
# print(np.exp(weight * A))
# print('-------')

return iglobal, jglobal, exp_D.sum()
return exp_D.sum()


def _commonwalkkernel_geo(Gn, node_label, edge_label, gamma, ij):
def wrapper_cw_exp(node_label, edge_label, beta, itr_item):
g1 = itr_item[0][0]
g2 = itr_item[0][1]
i = itr_item[1][0]
j = itr_item[1][1]
return i, j, _commonwalkkernel_exp(g1, g2, node_label, edge_label, beta)


def _commonwalkkernel_geo(g1, g2, node_label, edge_label, gamma):
"""Calculate common walk graph kernels up to n between 2 graphs using
geometric series.

@@ -244,19 +247,22 @@ def _commonwalkkernel_geo(Gn, node_label, edge_label, gamma, ij):
kernel : float
The common walk Kernel between 2 graphs.
"""
iglobal = ij[0]
jglobal = ij[1]
g1 = Gn[iglobal]
g2 = Gn[jglobal]

# get tensor product / direct product
gp = direct_product(g1, g2, node_label, edge_label)
A = nx.adjacency_matrix(gp).todense()
mat = np.identity(len(A)) - gamma * A
try:
return iglobal, jglobal, mat.I.sum()
return mat.I.sum()
except np.linalg.LinAlgError:
return iglobal, jglobal, np.nan
return np.nan
def wrapper_cw_geo(node_label, edge_label, gama, itr_item):
g1 = itr_item[0][0]
g2 = itr_item[0][1]
i = itr_item[1][0]
j = itr_item[1][1]
return i, j, _commonwalkkernel_geo(g1, g2, node_label, edge_label, gama)


def _commonwalkkernel_brute(walks1,
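
For context, the geometric variant refactored above sums the entries of (I - gamma*A)^-1, where A is the adjacency matrix of the direct product of the two graphs. Below is a self-contained sketch of that computation, using NetworkX's label-agnostic tensor_product as a stand-in for the module's label-aware direct_product helper.

import networkx as nx
import numpy as np

def common_walk_geo(g1, g2, gamma=0.01):
    # Direct (tensor) product of the two graphs; labels are ignored here.
    gp = nx.tensor_product(g1, g2)
    if gp.number_of_nodes() == 0:
        return 0.0
    A = nx.to_numpy_array(gp)
    # Geometric series over common walks: sum all entries of (I - gamma*A)^-1.
    mat = np.identity(A.shape[0]) - gamma * A
    try:
        return float(np.linalg.inv(mat).sum())
    except np.linalg.LinAlgError:  # gamma too large, the series does not converge
        return float('nan')

print(common_walk_geo(nx.path_graph(3), nx.cycle_graph(4)))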


pygraph/kernels/spKernel.py (+23 / -20)

@@ -8,7 +8,6 @@ import sys
import time
from itertools import combinations_with_replacement, product
from functools import partial
from joblib import Parallel, delayed
from multiprocessing import Pool
from tqdm import tqdm

@@ -89,7 +88,8 @@ def spkernel(*args,

pool = Pool(n_jobs)
# get shortest path graphs of Gn
getsp_partial = partial(wrap_getSPGraph, Gn, weight)
getsp_partial = partial(wrapper_getSPGraph, weight)
itr = zip(Gn, range(0, len(Gn)))
if len(Gn) < 1000 * n_jobs:
# # use default chunksize as pool.map when iterable is less than 100
# chunksize, extra = divmod(len(Gn), n_jobs * 4)
@@ -98,9 +98,8 @@ def spkernel(*args,
chunksize = int(len(Gn) / n_jobs) + 1
else:
chunksize = 1000
# chunksize = 300 # int(len(list(itr)) / n_jobs)
for i, g in tqdm(
pool.imap_unordered(getsp_partial, range(0, len(Gn)), chunksize),
pool.imap_unordered(getsp_partial, itr, chunksize),
desc='getting sp graphs', file=sys.stdout):
Gn[i] = g
pool.close()
@@ -144,8 +143,9 @@ def spkernel(*args,

# ---- use pool.imap_unordered to parallel and track progress. ----
pool = Pool(n_jobs)
do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
itr = combinations_with_replacement(range(0, len(Gn)), 2)
do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
itr = zip(combinations_with_replacement(Gn, 2),
combinations_with_replacement(range(0, len(Gn)), 2))
len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
if len_itr < 1000 * n_jobs:
chunksize = int(len_itr / n_jobs) + 1
@@ -200,15 +200,10 @@ def spkernel(*args,
return Kmatrix, run_time, idx


def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij):

i = ij[0]
j = ij[1]
g1 = Gn[i]
g2 = Gn[j]
def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels):
kernel = 0

# try:
# compute shortest path matrices first, method borrowed from FCSP.
if ds_attrs['node_labeled']:
# node symb and non-symb labeled
@@ -243,7 +238,7 @@ def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij):
g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kernel += 1
return i, j, kernel
return kernel

# compute graph kernels
if ds_attrs['is_directed']:
@@ -293,12 +288,20 @@ def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij):
# kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
# kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
# kernel += kn1 + kn2
# except KeyError: # missing labels or attributes
# pass

return i, j, kernel
return kernel


def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr_item):
g1 = itr_item[0][0]
g2 = itr_item[0][1]
i = itr_item[1][0]
j = itr_item[1][1]
return i, j, spkernel_do(g1, g2, ds_attrs, node_label, node_kernels)


def wrap_getSPGraph(Gn, weight, i):
return i, getSPGraph(Gn[i], edge_weight=weight)
# return i, nx.floyd_warshall_numpy(Gn[i], weight=weight)
def wrapper_getSPGraph(weight, itr_item):
g = itr_item[0]
i = itr_item[1]
return i, getSPGraph(g, edge_weight=weight)
# return i, nx.floyd_warshall_numpy(g, weight=weight)
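
The recurring refactor in this commit is visible here: instead of giving each worker an index pair and letting it look up graphs in the shared list Gn, the iterable now zips graph pairs with their index pairs and a small wrapper unpacks them, so only the data a task needs is pickled. Below is a stripped-down sketch of the pattern; the pairwise function is a placeholder, not the real shortest-path kernel.

import sys
from functools import partial
from itertools import combinations_with_replacement
from multiprocessing import Pool

import numpy as np
from tqdm import tqdm

def pairwise_kernel(g1, g2, scale):
    # Placeholder pairwise computation; the real code calls spkernel_do here.
    return scale * (g1 + g2)

def wrapper(scale, itr_item):
    # itr_item = ((g1, g2), (i, j)): the data pair plus its position in the matrix.
    (g1, g2), (i, j) = itr_item
    return i, j, pairwise_kernel(g1, g2, scale)

if __name__ == '__main__':
    Gn = list(range(6))            # stand-ins for the graphs
    n_jobs = 2
    Kmatrix = np.zeros((len(Gn), len(Gn)))

    do_partial = partial(wrapper, 0.5)
    itr = zip(combinations_with_replacement(Gn, 2),
              combinations_with_replacement(range(len(Gn)), 2))
    len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
    # Same chunksize heuristic as the kernels above.
    chunksize = int(len_itr / n_jobs) + 1 if len_itr < 1000 * n_jobs else 1000

    with Pool(n_jobs) as pool:
        for i, j, kernel in tqdm(pool.imap_unordered(do_partial, itr, chunksize),
                                 desc='calculating kernels', file=sys.stdout):
            Kmatrix[i][j] = kernel
            Kmatrix[j][i] = kernel
    print(Kmatrix)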

pygraph/kernels/structuralspKernel.py (+98 / -66)

@@ -12,7 +12,6 @@ import sys
import time
from itertools import combinations, combinations_with_replacement, product
from functools import partial
from joblib import Parallel, delayed
from multiprocessing import Pool
from tqdm import tqdm

@@ -71,7 +70,6 @@ def structuralspkernel(*args,
"""
# pre-process
Gn = args[0] if len(args) == 1 else [args[0], args[1]]

weight = None
if edge_weight is None:
print('\n None edge weight specified. Set all weight to 1.\n')
@@ -98,34 +96,61 @@ def structuralspkernel(*args,
start_time = time.time()

# get shortest paths of each graph in Gn
splist = [[] for _ in range(len(Gn))]
splist = [None] * len(Gn)
pool = Pool(n_jobs)
# get shortest path graphs of Gn
getsp_partial = partial(wrap_getSP, Gn, weight, ds_attrs['is_directed'])
getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed'])
itr = zip(Gn, range(0, len(Gn)))
if len(Gn) < 1000 * n_jobs:
chunksize = int(len(Gn) / n_jobs) + 1
else:
chunksize = 1000
# chunksize = 300 # int(len(list(itr)) / n_jobs)
for i, sp in tqdm(
pool.imap_unordered(getsp_partial, range(0, len(Gn)), chunksize),
pool.imap_unordered(getsp_partial, itr, chunksize),
desc='getting shortest paths',
file=sys.stdout):
splist[i] = sp
# time.sleep(10)
pool.close()
pool.join()

# # ---- use pool.map to parallel ----
# result_sp = pool.map(getsp_partial, range(0, len(Gn)))
# for i in result_sp:
# Gn[i[0]] = i[1]
# or
# getsp_partial = partial(wrap_getSP, Gn, weight)
# for i, g in tqdm(
# pool.map(getsp_partial, range(0, len(Gn))),
# desc='getting sp graphs',
# file=sys.stdout):
# Gn[i] = g
# # get shortest paths of each graph in Gn
# splist = [[] for _ in range(len(Gn))]
# # get shortest path graphs of Gn
# getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed'])
# itr = zip(Gn, range(0, len(Gn)))
# if len(Gn) < 1000 * n_jobs:
# chunksize = int(len(Gn) / n_jobs) + 1
# else:
# chunksize = 1000
# # chunksize = 300 # int(len(list(itr)) / n_jobs)
# from contextlib import closing
# with closing(Pool(n_jobs)) as pool:
## for i, sp in tqdm(
# res = pool.imap_unordered(getsp_partial, itr, 10)
## desc='getting shortest paths',
## file=sys.stdout):
## splist[i] = sp
## time.sleep(10)
# pool.close()
# pool.join()
# ss = 0
# ss += sys.getsizeof(splist)
# for spss in splist:
# ss += sys.getsizeof(spss)
# for spp in spss:
# ss += sys.getsizeof(spp)
# time.sleep(20)
# # ---- direct running, normally use single CPU core. ----
# splist = []
# for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout):
# splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed']))

# # ---- only for the Fast Computation of Shortest Path Kernel (FCSP)
# sp_ml = [0] * len(Gn) # shortest path matrices
@@ -149,9 +174,11 @@ def structuralspkernel(*args,

# ---- use pool.imap_unordered to parallel and track progress. ----
pool = Pool(n_jobs)
do_partial = partial(structuralspkernel_do, Gn, splist, ds_attrs,
node_label, edge_label, node_kernels, edge_kernels)
itr = combinations_with_replacement(range(0, len(Gn)), 2)
do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
node_kernels, edge_kernels)
itr = zip(combinations_with_replacement(Gn, 2),
combinations_with_replacement(splist, 2),
combinations_with_replacement(range(0, len(Gn)), 2))
len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
if len_itr < 1000 * n_jobs:
chunksize = int(len_itr / n_jobs) + 1
@@ -166,36 +193,36 @@ def structuralspkernel(*args,
pool.close()
pool.join()

# # ---- use pool.map to parallel. ----
# # result_perf = pool.map(do_partial, itr)
# do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
# itr = combinations_with_replacement(range(0, len(Gn)), 2)
# for i, j, kernel in tqdm(
# pool.map(do_partial, itr), desc='calculating kernels',
# file=sys.stdout):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel
# pool.close()
# pool.join()

# # ---- use joblib.Parallel to parallel and track progress. ----
# result_perf = Parallel(
# n_jobs=n_jobs, verbose=10)(
# delayed(do_partial)(ij)
# for ij in combinations_with_replacement(range(0, len(Gn)), 2))
# result_perf = [
# do_partial(ij)
# for ij in combinations_with_replacement(range(0, len(Gn)), 2)
# ]
# for i in result_perf:
# Kmatrix[i[0]][i[1]] = i[2]
# Kmatrix[i[1]][i[0]] = i[2]
# # ---- use pool.imap_unordered to parallel and track progress. ----
# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
# node_kernels, edge_kernels)
# itr = zip(combinations_with_replacement(Gn, 2),
# combinations_with_replacement(splist, 2),
# combinations_with_replacement(range(0, len(Gn)), 2))
# len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
# if len_itr < 1000 * n_jobs:
# chunksize = int(len_itr / n_jobs) + 1
# else:
# chunksize = 1000
# from contextlib import closing
# with closing(Pool(n_jobs)) as pool:
# for i, j, kernel in tqdm(
# pool.imap_unordered(do_partial, itr, 1000),
# desc='calculating kernels',
# file=sys.stdout):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel
# pool.close()
# pool.join()


# # ---- direct running, normally use single CPU core. ----
# itr = combinations_with_replacement(range(0, len(Gn)), 2)
# itr = zip(combinations_with_replacement(Gn, 2),
# combinations_with_replacement(splist, 2),
# combinations_with_replacement(range(0, len(Gn)), 2))
# for gs in tqdm(itr, desc='calculating kernels', file=sys.stdout):
# i, j, kernel = structuralspkernel_do(Gn, splist, ds_attrs,
# node_label, edge_label, node_kernels, edge_kernels, gs)
# i, j, kernel = wrapper_ssp_do(ds_attrs, node_label, edge_label,
# node_kernels, edge_kernels, gs)
# if(kernel > 1):
# print("error here ")
# Kmatrix[i][j] = kernel
@@ -209,18 +236,11 @@ def structuralspkernel(*args,
return Kmatrix, run_time


def structuralspkernel_do(Gn, splist, ds_attrs, node_label, edge_label,
node_kernels, edge_kernels, ij):

iglobal = ij[0]
jglobal = ij[1]
g1 = Gn[iglobal]
g2 = Gn[jglobal]
spl1 = splist[iglobal]
spl2 = splist[jglobal]
def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label,
node_kernels, edge_kernels):
kernel = 0

#try:
# First, compute shortest path matrices, method borrowed from FCSP.
if ds_attrs['node_labeled']:
# node symb and non-synb labeled
@@ -369,11 +389,19 @@ def structuralspkernel_do(Gn, splist, ds_attrs, node_label, edge_label,
# kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
# kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
# Kmatrix += kn1 + kn2
#except KeyError: # missing labels or attributes
# print("toto")
# pass
return kernel


return iglobal, jglobal, kernel
def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels,
edge_kernels, itr_item):
g1 = itr_item[0][0]
g2 = itr_item[0][1]
spl1 = itr_item[1][0]
spl2 = itr_item[1][1]
i = itr_item[2][0]
j = itr_item[2][1]
return i, j, structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs,
node_label, edge_label, node_kernels, edge_kernels)


def get_shortest_paths(G, weight, directed):
@@ -397,17 +425,21 @@ def get_shortest_paths(G, weight, directed):
for n1, n2 in combinations(G.nodes(), 2):
try:
spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
except nx.NetworkXNoPath: # nodes not connected
# sp.append([])
pass
else:
sp += spltemp
# each edge walk is counted twice, starting from both its extreme nodes.
if not directed:
sp += [sptemp[::-1] for sptemp in spltemp]
except nx.NetworkXNoPath: # nodes not connected
# sp.append([])
pass
# add single nodes as length 0 paths.
sp += [[n] for n in G.nodes()]
return sp


def wrap_getSP(Gn, weight, directed, i):
return i, get_shortest_paths(Gn[i], weight, directed)
def wrapper_getSP(weight, directed, itr_item):
g = itr_item[0]
i = itr_item[1]
return i, get_shortest_paths(g, weight, directed)
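
Putting the pieces of the hunk above together: the updated get_shortest_paths only records reversed copies when a shortest path actually exists, and now also appends every node as a length-0 path. A consolidated sketch (the docstring and the example graph are added here for illustration):

from itertools import combinations
import networkx as nx

def get_shortest_paths(G, weight=None, directed=False):
    """Collect all shortest paths between every node pair of G."""
    sp = []
    for n1, n2 in combinations(G.nodes(), 2):
        try:
            spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
        except nx.NetworkXNoPath:   # nodes not connected, nothing to add
            pass
        else:
            sp += spltemp
            # each path is counted from both end nodes in the undirected case
            if not directed:
                sp += [p[::-1] for p in spltemp]
    # single nodes count as length-0 paths
    sp += [[n] for n in G.nodes()]
    return sp

G = nx.path_graph(3)
G.add_node(3)   # isolated node: only its length-0 path appears
print(get_shortest_paths(G))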

pygraph/kernels/untilHPathKernel.py (+20 / -21)

@@ -13,7 +13,6 @@ from itertools import chain, combinations_with_replacement
from functools import partial
from multiprocessing import Pool
from tqdm import tqdm
import traceback

import networkx as nx
import numpy as np
@@ -77,15 +76,15 @@ def untilhpathkernel(*args,
# but this may cost a lot of memory for large datasets.
pool = Pool(n_jobs)
all_paths = [[] for _ in range(len(Gn))]
getps_partial = partial(wrap_find_all_paths_until_length, Gn, depth,
getps_partial = partial(wrapper_find_all_paths_until_length, depth,
ds_attrs, node_label, edge_label)
itr = zip(Gn, range(0, len(Gn)))
if len(Gn) < 1000 * n_jobs:
chunksize = int(len(Gn) / n_jobs) + 1
else:
chunksize = 1000
# chunksize = 300 # int(len(list(itr)) / n_jobs)
for i, ps in tqdm(
pool.imap_unordered(getps_partial, range(0, len(Gn)), chunksize),
pool.imap_unordered(getps_partial, itr, chunksize),
desc='getting paths', file=sys.stdout):
all_paths[i] = ps
pool.close()
@@ -110,8 +109,9 @@ def untilhpathkernel(*args,
pass
else:
pool = Pool(n_jobs)
do_partial = partial(_untilhpathkernel_do_naive, all_paths, k_func)
itr = combinations_with_replacement(range(0, len(Gn)), 2)
do_partial = partial(wrapper_uhpath_do_naive, k_func)
itr = zip(combinations_with_replacement(all_paths, 2),
combinations_with_replacement(range(0, len(Gn)), 2))
len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
if len_itr < 1000 * n_jobs:
chunksize = int(len_itr / n_jobs) + 1
@@ -216,7 +216,7 @@ def _untilhpathkernel_do_gst(gst1, gst2, paths1, paths2, k_func):
return kernel


def _untilhpathkernel_do_naive(paths_list, k_func, ij):
def _untilhpathkernel_do_naive(paths1, paths2, k_func):
"""Calculate path graph kernels up to depth d between 2 graphs naively.

Parameters
@@ -235,10 +235,6 @@ def _untilhpathkernel_do_naive(paths_list, k_func, ij):
kernel : float
Path kernel up to h between 2 graphs.
"""
iglobal = ij[0]
jglobal = ij[1]
paths1 = paths_list[iglobal]
paths2 = paths_list[jglobal]
all_paths = list(set(paths1 + paths2))

if k_func == 'tanimoto':
@@ -260,12 +256,18 @@ def _untilhpathkernel_do_naive(paths_list, k_func, ij):
kernel = np.sum(np.minimum(vector1, vector2)) / \
np.sum(np.maximum(vector1, vector2))

return iglobal, jglobal, kernel
return kernel


# @todo: (can be removed maybe) this method finds paths repetitively, it could be faster.
def wrapper_uhpath_do_naive(k_func, itr_item):
plist1 = itr_item[0][0]
plist2 = itr_item[0][1]
i = itr_item[1][0]
j = itr_item[1][1]
return i, j, _untilhpathkernel_do_naive(plist1, plist2, k_func)


# @todo: (can be removed maybe) this method finds paths repetitively, it could be faster.
def find_all_paths_until_length(G,
length,
ds_attrs,
@@ -368,15 +370,12 @@ def find_all_paths_until_length(G,
return [tuple([len(path)]) for path in all_paths]
def wrap_find_all_paths_until_length(Gn, length, ds_attrs, node_label,
edge_label, i):
try:
return i, find_all_paths_until_length(Gn[i], length, ds_attrs,
def wrapper_find_all_paths_until_length(length, ds_attrs, node_label,
edge_label, itr_item):
g = itr_item[0]
i = itr_item[1]
return i, find_all_paths_until_length(g, length, ds_attrs,
node_label=node_label, edge_label=edge_label)
except Exception as e:
traceback.print_exc()
print('')
raise e


def paths2GSuffixTree(paths):
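
_untilhpathkernel_do_naive now receives the two path lists directly. For reference, its MinMax variant builds a path-count vector per graph over the union of observed paths and returns sum(min)/sum(max), while the Tanimoto variant works on binary indicator vectors. A small illustrative sketch, assuming paths are stored as tuples; the helper below is a simplified stand-in, not the module's code.

import numpy as np

def path_kernel_naive(paths1, paths2, k_func='MinMax'):
    # Union of all distinct paths seen in either graph.
    all_paths = list(set(paths1) | set(paths2))
    if k_func == 'tanimoto':
        # Binary indicator vectors: Tanimoto = |intersection| / |union|.
        v1 = np.array([int(p in paths1) for p in all_paths])
        v2 = np.array([int(p in paths2) for p in all_paths])
        inter = np.dot(v1, v2)
        return inter / (v1.sum() + v2.sum() - inter)
    else:  # 'MinMax'
        # Count vectors: sum of element-wise minima over sum of maxima.
        v1 = np.array([paths1.count(p) for p in all_paths])
        v2 = np.array([paths2.count(p) for p in all_paths])
        return np.sum(np.minimum(v1, v2)) / np.sum(np.maximum(v1, v2))

p1 = [('A',), ('A', 'B'), ('A', 'B'), ('B',)]
p2 = [('A',), ('A', 'B'), ('B', 'C')]
print(path_kernel_naive(p1, p2, 'MinMax'))
print(path_kernel_naive(p1, p2, 'tanimoto'))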


pygraph/utils/model_selection_precomputed.py (+182 / -124)

@@ -206,54 +206,50 @@ def model_selection_for_precomputed_kernel(datafile,
'3. Fitting and predicting using nested cross validation. This could really take a while...'
)
# pool = Pool(n_jobs)
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
# train_pref = []
# val_pref = []
# test_pref = []
## if NUM_TRIALS < 1000 * n_jobs:
## chunksize = int(NUM_TRIALS / n_jobs) + 1
## else:
## chunksize = 1000
# chunksize = 1
# for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout):
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)
# pool.close()
# pool.join()
# ---- use pool.map to parallel. ----
pool = Pool(n_jobs)
trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
train_pref = []
val_pref = []
test_pref = []
# if NUM_TRIALS < 100:
# chunksize, extra = divmod(NUM_TRIALS, n_jobs * 4)
# if extra:
# chunksize += 1
# else:
# chunksize = 100
chunksize = 1
for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout):
train_pref.append(o1)
val_pref.append(o2)
test_pref.append(o3)
pool.close()
pool.join()
# # ---- use pool.map to parallel. ----
# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
# train_pref = [item[0] for item in result_perf]
# val_pref = [item[1] for item in result_perf]
# test_pref = [item[2] for item in result_perf]
result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
train_pref = [item[0] for item in result_perf]
val_pref = [item[1] for item in result_perf]
test_pref = [item[2] for item in result_perf]
# # ---- use joblib.Parallel to parallel and track progress. ----
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
# result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS))
# train_pref = [item[0] for item in result_perf]
# val_pref = [item[1] for item in result_perf]
# test_pref = [item[2] for item in result_perf]
# # ---- direct running, normally use a single CPU core. ----
# train_pref = []
# val_pref = []
# test_pref = []
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)
# # ---- direct running, normally use a single CPU core. ----
# train_pref = []
# val_pref = []
# test_pref = []
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)
# print()
print()
print('4. Getting final performance...')
str_fw += '\nIII. Performance.\n\n'
# averages and confidences of performances on outer trials for each combination of parameters
average_train_scores = np.mean(train_pref, axis=0)
# print('val_pref: ', val_pref[0][0])
average_val_scores = np.mean(val_pref, axis=0)
# print('test_pref: ', test_pref[0][0])
average_perf_scores = np.mean(test_pref, axis=0)
# sample std is used here
std_train_scores = np.std(train_pref, axis=0, ddof=1)
@@ -264,6 +260,9 @@ def model_selection_for_precomputed_kernel(datafile,
best_val_perf = np.amin(average_val_scores)
else:
best_val_perf = np.amax(average_val_scores)
# print('average_val_scores: ', average_val_scores)
# print('best_val_perf: ', best_val_perf)
# print()
best_params_index = np.where(average_val_scores == best_val_perf)
# find smallest val std with best val perf.
best_val_stds = [
@@ -286,6 +285,9 @@ def model_selection_for_precomputed_kernel(datafile,
str_fw += 'best_val_perf: %s\n' % best_val_perf
str_fw += 'best_val_std: %s\n' % min_val_std
# print(best_params_index)
# print(best_params_index[0])
# print(average_perf_scores)
final_performance = [
average_perf_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
@@ -429,23 +431,23 @@ def model_selection_for_precomputed_kernel(datafile,
'3. Fitting and predicting using nested cross validation. This could really take a while...'
)
# pool = Pool(n_jobs)
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
# train_pref = []
# val_pref = []
# test_pref = []
# if NUM_TRIALS < 100:
# chunksize, extra = divmod(NUM_TRIALS, n_jobs * 4)
# if extra:
# chunksize += 1
# else:
# chunksize = 100
# for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout):
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)
# pool.close()
# pool.join()
pool = Pool(n_jobs)
trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
train_pref = []
val_pref = []
test_pref = []
if NUM_TRIALS < 100:
chunksize, extra = divmod(NUM_TRIALS, n_jobs * 4)
if extra:
chunksize += 1
else:
chunksize = 100
for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout):
train_pref.append(o1)
val_pref.append(o2)
test_pref.append(o3)
pool.close()
pool.join()
# # ---- use pool.map to parallel. ----
# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
@@ -460,15 +462,15 @@ def model_selection_for_precomputed_kernel(datafile,
# val_pref = [item[1] for item in result_perf]
# test_pref = [item[2] for item in result_perf]

# ---- direct running, normally use a single CPU core. ----
train_pref = []
val_pref = []
test_pref = []
for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
train_pref.append(o1)
val_pref.append(o2)
test_pref.append(o3)
# # ---- direct running, normally use a single CPU core. ----
# train_pref = []
# val_pref = []
# test_pref = []
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)

print()
print('4. Getting final performance...')
@@ -623,89 +625,142 @@ def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, t
val_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
test_pref = np.zeros((len(param_list_pre_revised), len(param_list)))

# randomness added to seeds of the split function below. "high" is "size"
# times 10 so that at least 10 different random outputs will be yielded.
# Remove these lines if identical outputs are required.
rdm_out = np.random.RandomState(seed=None)
rdm_seed_out_l = rdm_out.uniform(high=len(param_list_pre_revised) * 10,
size=len(param_list_pre_revised))
# print(trial, rdm_seed_out_l)
# print()
# loop for each outer param tuple
for index_out, params_out in enumerate(param_list_pre_revised):
# split gram matrix and y to app and test sets.
indices = range(len(y))
# The argument "random_state" of "train_test_split" cannot be set to
# None, because that falls back to the RandomState instance shared by
# np.random; subprocesses forked at the same time may then inherit the
# same seed, producing identical random variates in different
# subprocesses. Instead, the "trial" and "index_out" parameters are
# used to generate different seeds for different trials/subprocesses
# and outer loops. "rdm_seed_out_l" adds randomness to these seeds, so
# that every run of the program yields a different output. To get
# identical outputs every time, remove the second line below. The same
# method is applied to the "KFold" function in the inner loop.
rdm_seed_out = (trial + 1) * (index_out + 1)
rdm_seed_out = (rdm_seed_out + int(rdm_seed_out_l[index_out])) % (2 ** 32 - 1)
# print(trial, rdm_seed_out)
X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split(
gram_matrices[index_out], y, indices, test_size=0.1,
random_state=None, shuffle=True)
random_state=rdm_seed_out, shuffle=True)
# print(trial, idx_app, idx_test)
# print()
X_app = X_app[:, idx_app]
X_test = X_test[:, idx_app]
y_app = np.array(y_app)
y_test = np.array(y_test)

rdm_seed_in_l = rdm_out.uniform(high=len(param_list) * 10,
size=len(param_list))
# loop for each inner param tuple
for index_in, params_in in enumerate(param_list):
# print(index_in, params_in)
# if trial == 0:
# print(index_out, index_in)
# print('params_in: ', params_in)
# st = time.time()
inner_cv = KFold(n_splits=10, shuffle=True, random_state=trial)
rdm_seed_in = (trial + 1) * (index_out + 1) * (index_in + 1)
# print("rdm_seed_in1: ", trial, index_in, rdm_seed_in)
rdm_seed_in = (rdm_seed_in + int(rdm_seed_in_l[index_in])) % (2 ** 32 - 1)
# print("rdm_seed_in2: ", trial, index_in, rdm_seed_in)
inner_cv = KFold(n_splits=10, shuffle=True, random_state=rdm_seed_in)
current_train_perf = []
current_valid_perf = []
current_test_perf = []

# For regression use the Kernel Ridge method
try:
if model_type == 'regression':
kr = KernelRidge(kernel='precomputed', **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
kr.fit(X_app[train_index, :][:, train_index],
y_app[train_index])
# try:
if model_type == 'regression':
kr = KernelRidge(kernel='precomputed', **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
# print("train_index, valid_index: ", trial, index_in, train_index, valid_index)
# if trial == 0:
# print('train_index: ', train_index)
# print('valid_index: ', valid_index)
# print('idx_test: ', idx_test)
# print('y_app[train_index]: ', y_app[train_index])
# print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index])
# print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index])
kr.fit(X_app[train_index, :][:, train_index],
y_app[train_index])

# predict on the train, validation and test set
y_pred_train = kr.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = kr.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = kr.predict(
X_test[:, train_index])
# predict on the train, validation and test set
y_pred_train = kr.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = kr.predict(
X_app[valid_index, :][:, train_index])
# if trial == 0:
# print('y_pred_valid: ', y_pred_valid)
# print()
y_pred_test = kr.predict(
X_test[:, train_index])

# root mean squared errors
current_train_perf.append(
np.sqrt(
mean_squared_error(
y_app[train_index], y_pred_train)))
current_valid_perf.append(
np.sqrt(
mean_squared_error(
y_app[valid_index], y_pred_valid)))
current_test_perf.append(
np.sqrt(
mean_squared_error(
y_test, y_pred_test)))
# For classification use SVM
else:
svc = SVC(kernel='precomputed', cache_size=200,
verbose=False, **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
# root mean squared errors
current_train_perf.append(
np.sqrt(
mean_squared_error(
y_app[train_index], y_pred_train)))
current_valid_perf.append(
np.sqrt(
mean_squared_error(
y_app[valid_index], y_pred_valid)))
# if trial == 0:
# print(mean_squared_error(
# y_app[valid_index], y_pred_valid))
current_test_perf.append(
np.sqrt(
mean_squared_error(
y_test, y_pred_test)))
# For classification use SVM
else:
svc = SVC(kernel='precomputed', cache_size=200,
verbose=False, **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
# np.savez("bug.npy",X_app[train_index, :][:, train_index],y_app[train_index])
svc.fit(X_app[train_index, :][:, train_index],
y_app[train_index])
# predict on the train, validation and test set
y_pred_train = svc.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = svc.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = svc.predict(
X_test[:, train_index])
# if trial == 0:
# print('train_index: ', train_index)
# print('valid_index: ', valid_index)
# print('idx_test: ', idx_test)
# print('y_app[train_index]: ', y_app[train_index])
# print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index])
# print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index])
svc.fit(X_app[train_index, :][:, train_index],
y_app[train_index])
# predict on the train, validation and test set
y_pred_train = svc.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = svc.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = svc.predict(
X_test[:, train_index])

# classification accuracies
current_train_perf.append(
accuracy_score(y_app[train_index],
y_pred_train))
current_valid_perf.append(
accuracy_score(y_app[valid_index],
y_pred_valid))
current_test_perf.append(
accuracy_score(y_test, y_pred_test))
except ValueError:
print(sys.exc_info()[0])
print(params_out, params_in)
# classification accuracies
current_train_perf.append(
accuracy_score(y_app[train_index],
y_pred_train))
current_valid_perf.append(
accuracy_score(y_app[valid_index],
y_pred_valid))
current_test_perf.append(
accuracy_score(y_test, y_pred_test))
# except ValueError:
# print(sys.exc_info()[0])
# print(params_out, params_in)

# average performance on inner splits
train_pref[index_out][index_in] = np.mean(
@@ -715,5 +770,8 @@ def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, t
test_pref[index_out][index_in] = np.mean(
current_test_perf)
# print(time.time() - st)
# if trial == 0:
# print('val_pref: ', val_pref)
# print('test_pref: ', test_pref)

return train_pref, val_pref, test_pref
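
This is the heart of the commit: train_test_split and KFold no longer run with random_state=None or a constant seed, because subprocesses forked at the same time can inherit the same global NumPy state and produce identical splits. Each trial and parameter index instead derives its own seed, offset by a random draw so that repeated runs of the program still differ. A condensed sketch of the scheme, using toy data and a hypothetical helper name split_seeds:

import numpy as np
from sklearn.model_selection import train_test_split, KFold

def split_seeds(trial, n_params_out, n_params_in):
    """Per-trial, per-parameter seeds for the outer split and the inner KFold."""
    rdm = np.random.RandomState(seed=None)
    # random offsets so that different program runs give different splits
    out_offsets = rdm.uniform(high=n_params_out * 10, size=n_params_out)
    in_offsets = rdm.uniform(high=n_params_in * 10, size=n_params_in)
    out_seeds = [((trial + 1) * (i + 1) + int(out_offsets[i])) % (2**32 - 1)
                 for i in range(n_params_out)]
    in_seeds = [[((trial + 1) * (i + 1) * (j + 1) + int(in_offsets[j])) % (2**32 - 1)
                 for j in range(n_params_in)] for i in range(n_params_out)]
    return out_seeds, in_seeds

# toy data standing in for a precomputed gram matrix and targets
K = np.random.rand(20, 20)
y = np.random.randint(0, 2, 20)
out_seeds, in_seeds = split_seeds(trial=0, n_params_out=1, n_params_in=3)

X_app, X_test, y_app, y_test = train_test_split(
    K, y, test_size=0.1, random_state=out_seeds[0], shuffle=True)
inner_cv = KFold(n_splits=10, shuffle=True, random_state=in_seeds[0][0])
for train_index, valid_index in inner_cv.split(X_app):
    pass  # fit/predict on K[train][:, train] etc., as in trial_do above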
