
1. [IMPORTANT] Correct an error in the dataset split section of the function cross_validation_pre_computed. DO NOT use previous versions; they are all wrong. (See the note under the corrected split in the trial_do diff below.)

2. Save gram matrices and related data in the function cross_validation_pre_computed before the cross-validation step, in case something goes wrong during CV. The parameter read_gm_from_file can be used to choose whether to read gram matrices back from file (a save/load sketch follows below).
3. Add test code to check whether a gram matrix is symmetric and positive semi-definite (a sketch follows below).
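For point 2, a minimal sketch of the save/load round trip, using the names from pygraph/utils/model_selection_precomputed.py below (results_dir, ds_name, gram_matrices, param_list_pre_revised, y and gram_matrix_time are that module's variables):

import numpy as np

# save gram matrices and related data before the CV step
np.savez(results_dir + '/' + ds_name + '.gm',
         gms=gram_matrices, params=param_list_pre_revised,
         y=y, gmtime=gram_matrix_time)

# with read_gm_from_file=True, load them back; np.savez appends '.npz'
# (newer numpy versions may need allow_pickle=True for the object arrays)
gmfile = np.load(results_dir + '/' + ds_name + '.gm.npz')
gram_matrices = gmfile['gms']
param_list_pre_revised = gmfile['params']
y = gmfile['y'].tolist()

For point 3, a minimal symmetry and positive-semi-definiteness check. This is a sketch, not necessarily the exact test code added in this commit, and the tolerance is an assumption:

import numpy as np

def check_gram_matrix(K, tol=1e-8):
    """Return (is_symmetric, is_psd) for a gram matrix K."""
    is_symmetric = np.allclose(K, K.T, atol=tol)
    # all eigenvalues of a PSD matrix are >= 0 (up to numerical noise);
    # eigvalsh expects a symmetric matrix, so symmetrize first
    eigvals = np.linalg.eigvalsh((K + K.T) / 2)
    is_psd = bool(np.all(eigvals >= -tol))
    return is_symmetric, is_psd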
v0.1
jajupmochi, 6 years ago (commit 68772ba4b4)
6 changed files with 652 additions and 481 deletions
  1. README.md (+1, -1)
  2. notebooks/get_dataset_attributes.ipynb (+143, -132)
  3. notebooks/run_spkernel.ipynb (+4, -61)
  4. notebooks/run_spkernel.py (+78, -72)
  5. pygraph/kernels/spKernel.py (+6, -5)
  6. pygraph/utils/model_selection_precomputed.py (+420, -210)

README.md (+1, -1)

@@ -3,7 +3,7 @@ A python package for graph kernels.

## Requirements

numpy==1.14.5
numpy==1.15.1
scipy==1.1.0
matplotlib==2.2.2
networkx==2.1


notebooks/get_dataset_attributes.ipynb (+143, -132)

@@ -12,30 +12,52 @@
"output_type": "stream",
"text": [
"\n",
"Letter-med:\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : False\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 2250\n",
"ave_graph_size : 4.674666666666667\n",
"min_graph_size : 1\n",
"max_graph_size : 9\n",
"ave_graph_edge_num : 3.2057777777777776\n",
"min_graph_edge_num : 0\n",
"max_graph_edge_num : 7\n",
"ave_graph_degree : 2.012888888888889\n",
"min_graph_degree : 0\n",
"max_graph_degree : 4\n",
"node_label_num : 0\n",
"edge_label_num : 0\n",
"node_attr_dim : 2\n",
"edge_attr_dim : 0\n",
"class_number : 15\n",
"\n",
"\n",
"Mutagenicity:\n",
"substructures : {'non linear', 'linear'}\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : True\n",
"is_directed : False\n",
"dataset_size : 4337\n",
"ave_graph_size : 30.3177311506\n",
"ave_graph_size : 30.317731150564907\n",
"min_graph_size : 4\n",
"max_graph_size : 417\n",
"ave_graph_edge_num : 30.7694258704\n",
"ave_graph_edge_num : 30.76942587041734\n",
"min_graph_edge_num : 3\n",
"max_graph_edge_num : 112\n",
"ave_graph_degree : 3.75651371916\n",
"ave_graph_degree : 3.75651371916071\n",
"min_graph_degree : 3\n",
"max_graph_degree : 4\n",
"node_label_num : 14\n",
"edge_label_num : 3\n",
"node_attr_dim : False\n",
"edge_attr_dim : False\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"AIDS:\n",
"substructures : {'non linear', 'linear'}\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : True\n",
"is_directed : False\n",
@@ -52,7 +74,7 @@
"node_label_num : 38\n",
"edge_label_num : 3\n",
"node_attr_dim : 4\n",
"edge_attr_dim : False\n",
"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
@@ -62,13 +84,13 @@
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 41\n",
"ave_graph_size : 1377.26829268\n",
"ave_graph_size : 1377.2682926829268\n",
"min_graph_size : 134\n",
"max_graph_size : 5037\n",
"ave_graph_edge_num : 3074.09756098\n",
"ave_graph_edge_num : 3074.0975609756097\n",
"min_graph_edge_num : 320\n",
"max_graph_edge_num : 10888\n",
"ave_graph_degree : 7.85365853659\n",
"ave_graph_degree : 7.853658536585366\n",
"min_graph_degree : 6\n",
"max_graph_degree : 10\n",
"node_label_num : 5\n",
@@ -79,51 +101,51 @@
"\n",
"\n",
"MSRC9:\n",
"substructures : {'non linear', 'linear'}\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 221\n",
"ave_graph_size : 40.5791855204\n",
"ave_graph_size : 40.57918552036199\n",
"min_graph_size : 25\n",
"max_graph_size : 55\n",
"ave_graph_edge_num : 97.9366515837\n",
"ave_graph_edge_num : 97.9366515837104\n",
"min_graph_edge_num : 53\n",
"max_graph_edge_num : 145\n",
"ave_graph_degree : 10.1583710407\n",
"ave_graph_degree : 10.158371040723981\n",
"min_graph_degree : 8\n",
"max_graph_degree : 16\n",
"node_label_num : 10\n",
"edge_label_num : 0\n",
"node_attr_dim : False\n",
"edge_attr_dim : False\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 8\n",
"\n",
"\n",
"MSRC21:\n",
"substructures : {'non linear', 'linear'}\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 563\n",
"ave_graph_size : 77.5204262877\n",
"ave_graph_size : 77.52042628774423\n",
"min_graph_size : 51\n",
"max_graph_size : 141\n",
"ave_graph_edge_num : 198.323268206\n",
"ave_graph_edge_num : 198.32326820603907\n",
"min_graph_edge_num : 121\n",
"max_graph_edge_num : 405\n",
"ave_graph_degree : 11.4156305506\n",
"ave_graph_degree : 11.41563055062167\n",
"min_graph_degree : 8\n",
"max_graph_degree : 23\n",
"node_label_num : 22\n",
"edge_label_num : 0\n",
"node_attr_dim : False\n",
"edge_attr_dim : False\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 20\n",
"\n",
"\n",
"SYNTHETIC:\n",
"substructures : {'non linear', 'linear'}\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
@@ -140,42 +162,42 @@
"node_label_num : 8\n",
"edge_label_num : 0\n",
"node_attr_dim : 1\n",
"edge_attr_dim : False\n",
"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"BZR:\n",
"substructures : {'non linear', 'linear'}\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 405\n",
"ave_graph_size : 35.750617284\n",
"ave_graph_size : 35.75061728395062\n",
"min_graph_size : 13\n",
"max_graph_size : 57\n",
"ave_graph_edge_num : 38.3580246914\n",
"ave_graph_edge_num : 38.358024691358025\n",
"min_graph_edge_num : 13\n",
"max_graph_edge_num : 60\n",
"ave_graph_degree : 3.86419753086\n",
"ave_graph_degree : 3.8641975308641974\n",
"min_graph_degree : 3\n",
"max_graph_degree : 4\n",
"node_label_num : 10\n",
"edge_label_num : 0\n",
"node_attr_dim : 3\n",
"edge_attr_dim : False\n",
"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"COX2:\n",
"substructures : {'non linear', 'linear'}\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 467\n",
"ave_graph_size : 41.2248394004\n",
"ave_graph_size : 41.224839400428266\n",
"min_graph_size : 32\n",
"max_graph_size : 56\n",
"ave_graph_edge_num : 43.4453961456\n",
"ave_graph_edge_num : 43.44539614561028\n",
"min_graph_edge_num : 34\n",
"max_graph_edge_num : 59\n",
"ave_graph_degree : 4.0\n",
@@ -184,152 +206,152 @@
"node_label_num : 8\n",
"edge_label_num : 0\n",
"node_attr_dim : 3\n",
"edge_attr_dim : False\n",
"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"DHFR:\n",
"substructures : {'non linear', 'linear'}\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 756\n",
"ave_graph_size : 42.4272486772\n",
"ave_graph_size : 42.42724867724868\n",
"min_graph_size : 20\n",
"max_graph_size : 71\n",
"ave_graph_edge_num : 44.544973545\n",
"ave_graph_edge_num : 44.544973544973544\n",
"min_graph_edge_num : 21\n",
"max_graph_edge_num : 73\n",
"ave_graph_degree : 3.95502645503\n",
"ave_graph_degree : 3.955026455026455\n",
"min_graph_degree : 3\n",
"max_graph_degree : 4\n",
"node_label_num : 9\n",
"edge_label_num : 0\n",
"node_attr_dim : 3\n",
"edge_attr_dim : False\n",
"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"ENZYMES:\n",
"substructures : {'non linear', 'linear'}\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 600\n",
"ave_graph_size : 32.6333333333\n",
"ave_graph_size : 32.63333333333333\n",
"min_graph_size : 2\n",
"max_graph_size : 126\n",
"ave_graph_edge_num : 62.1366666667\n",
"ave_graph_edge_num : 62.13666666666666\n",
"min_graph_edge_num : 1\n",
"max_graph_edge_num : 149\n",
"ave_graph_degree : 6.08666666667\n",
"ave_graph_degree : 6.086666666666667\n",
"min_graph_degree : 1\n",
"max_graph_degree : 9\n",
"node_label_num : 3\n",
"edge_label_num : 0\n",
"node_attr_dim : 18\n",
"edge_attr_dim : False\n",
"edge_attr_dim : 0\n",
"class_number : 6\n",
"\n",
"\n",
"PROTEINS:\n",
"substructures : {'non linear', 'linear'}\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 1113\n",
"ave_graph_size : 39.0575022462\n",
"ave_graph_size : 39.05750224618149\n",
"min_graph_size : 4\n",
"max_graph_size : 620\n",
"ave_graph_edge_num : 72.8158131177\n",
"ave_graph_edge_num : 72.8158131176999\n",
"min_graph_edge_num : 5\n",
"max_graph_edge_num : 1049\n",
"ave_graph_degree : 5.79424977538\n",
"ave_graph_degree : 5.794249775381851\n",
"min_graph_degree : 3\n",
"max_graph_degree : 25\n",
"node_label_num : 3\n",
"edge_label_num : 0\n",
"node_attr_dim : 1\n",
"edge_attr_dim : False\n",
"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"PROTEINS_full:\n",
"substructures : {'non linear', 'linear'}\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 1113\n",
"ave_graph_size : 39.0575022462\n",
"ave_graph_size : 39.05750224618149\n",
"min_graph_size : 4\n",
"max_graph_size : 620\n",
"ave_graph_edge_num : 72.8158131177\n",
"ave_graph_edge_num : 72.8158131176999\n",
"min_graph_edge_num : 5\n",
"max_graph_edge_num : 1049\n",
"ave_graph_degree : 5.79424977538\n",
"ave_graph_degree : 5.794249775381851\n",
"min_graph_degree : 3\n",
"max_graph_degree : 25\n",
"node_label_num : 3\n",
"edge_label_num : 0\n",
"node_attr_dim : 29\n",
"edge_attr_dim : False\n",
"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"D&D:\n",
"substructures : {'non linear', 'linear'}\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 1178\n",
"ave_graph_size : 284.31663837\n",
"ave_graph_size : 284.3166383701188\n",
"min_graph_size : 30\n",
"max_graph_size : 5748\n",
"ave_graph_edge_num : 715.658743633\n",
"ave_graph_edge_num : 715.6587436332767\n",
"min_graph_edge_num : 63\n",
"max_graph_edge_num : 14267\n",
"ave_graph_degree : 9.50933786078\n",
"ave_graph_degree : 9.509337860780985\n",
"min_graph_degree : 6\n",
"max_graph_degree : 19\n",
"node_label_num : 82\n",
"edge_label_num : 0\n",
"node_attr_dim : False\n",
"edge_attr_dim : False\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"MUTAG:\n",
"substructures : {'non linear', 'linear'}\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : True\n",
"is_directed : False\n",
"dataset_size : 188\n",
"ave_graph_size : 17.9308510638\n",
"ave_graph_size : 17.930851063829788\n",
"min_graph_size : 10\n",
"max_graph_size : 28\n",
"ave_graph_edge_num : 19.7925531915\n",
"ave_graph_edge_num : 19.79255319148936\n",
"min_graph_edge_num : 10\n",
"max_graph_edge_num : 33\n",
"ave_graph_degree : 3.00531914894\n",
"ave_graph_degree : 3.00531914893617\n",
"min_graph_degree : 3\n",
"max_graph_degree : 4\n",
"node_label_num : 7\n",
"edge_label_num : 11\n",
"node_attr_dim : False\n",
"edge_attr_dim : False\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"Alkane:\n",
"substructures : {'non linear', 'linear'}\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 150\n",
"ave_graph_size : 8.87333333333\n",
"ave_graph_size : 8.873333333333333\n",
"min_graph_size : 1\n",
"max_graph_size : 10\n",
"ave_graph_edge_num : 7.87333333333\n",
"ave_graph_edge_num : 7.873333333333333\n",
"min_graph_edge_num : 0\n",
"max_graph_edge_num : 9\n",
"ave_graph_degree : 3.36\n",
@@ -337,43 +359,43 @@
"max_graph_degree : 4\n",
"node_label_num : 2\n",
"edge_label_num : 1\n",
"node_attr_dim : False\n",
"edge_attr_dim : False\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 123\n",
"\n",
"\n",
"Acyclic:\n",
"substructures : {'non linear', 'linear'}\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 183\n",
"ave_graph_size : 8.15300546448\n",
"ave_graph_size : 8.153005464480874\n",
"min_graph_size : 3\n",
"max_graph_size : 11\n",
"ave_graph_edge_num : 7.15300546448\n",
"ave_graph_edge_num : 7.1530054644808745\n",
"min_graph_edge_num : 2\n",
"max_graph_edge_num : 10\n",
"ave_graph_degree : 2.80327868852\n",
"ave_graph_degree : 2.80327868852459\n",
"min_graph_degree : 2\n",
"max_graph_degree : 4\n",
"node_label_num : 3\n",
"edge_label_num : 1\n",
"node_attr_dim : False\n",
"edge_attr_dim : False\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 148\n",
"\n",
"\n",
"MAO:\n",
"substructures : {'non linear', 'linear'}\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : True\n",
"is_directed : False\n",
"dataset_size : 68\n",
"ave_graph_size : 18.3823529412\n",
"ave_graph_size : 18.38235294117647\n",
"min_graph_size : 11\n",
"max_graph_size : 27\n",
"ave_graph_edge_num : 19.6323529412\n",
"ave_graph_edge_num : 19.63235294117647\n",
"min_graph_edge_num : 12\n",
"max_graph_edge_num : 29\n",
"ave_graph_degree : 3.0\n",
@@ -381,107 +403,95 @@
"max_graph_degree : 3\n",
"node_label_num : 3\n",
"edge_label_num : 4\n",
"node_attr_dim : False\n",
"edge_attr_dim : False\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"PAH:\n",
"substructures : {'non linear', 'linear'}\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : False\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 94\n",
"ave_graph_size : 20.7021276596\n",
"ave_graph_size : 20.70212765957447\n",
"min_graph_size : 10\n",
"max_graph_size : 28\n",
"ave_graph_edge_num : 24.4255319149\n",
"ave_graph_edge_num : 24.425531914893618\n",
"min_graph_edge_num : 11\n",
"max_graph_edge_num : 34\n",
"ave_graph_degree : 3.01063829787\n",
"ave_graph_degree : 3.0106382978723403\n",
"min_graph_degree : 3\n",
"max_graph_degree : 4\n",
"node_label_num : 1\n",
"edge_label_num : 1\n",
"node_attr_dim : False\n",
"edge_attr_dim : False\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"NCI1:\n",
"substructures : {'non linear', 'linear'}\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 4110\n",
"ave_graph_size : 29.8654501217\n",
"ave_graph_size : 29.8654501216545\n",
"min_graph_size : 3\n",
"max_graph_size : 111\n",
"ave_graph_edge_num : 32.3\n",
"min_graph_edge_num : 2\n",
"max_graph_edge_num : 119\n",
"ave_graph_degree : 3.33600973236\n",
"ave_graph_degree : 3.3360097323600972\n",
"min_graph_degree : 2\n",
"max_graph_degree : 4\n",
"node_label_num : 37\n",
"edge_label_num : 0\n",
"node_attr_dim : False\n",
"edge_attr_dim : False\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"NCI109:\n",
"substructures : {'non linear', 'linear'}\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 4127\n",
"ave_graph_size : 29.6811243034\n",
"ave_graph_size : 29.681124303368065\n",
"min_graph_size : 4\n",
"max_graph_size : 111\n",
"ave_graph_edge_num : 32.1308456506\n",
"ave_graph_edge_num : 32.13084565059365\n",
"min_graph_edge_num : 3\n",
"max_graph_edge_num : 119\n",
"ave_graph_degree : 3.34383329295\n",
"ave_graph_degree : 3.343833292948873\n",
"min_graph_degree : 2\n",
"max_graph_degree : 5\n",
"node_label_num : 38\n",
"edge_label_num : 0\n",
"node_attr_dim : False\n",
"edge_attr_dim : False\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"load SDF: 100%|██████████| 4457424/4457424 [00:11<00:00, 397248.47it/s]\n",
"ajust data: 100%|██████████| 42687/42687 [00:10<00:00, 3939.72it/s] \n",
"\n",
"NCI-HIV:\n",
"substructures : {'non linear', 'linear'}\n",
"node_labeled : True\n",
"edge_labeled : True\n",
"is_directed : False\n",
"dataset_size : 42682\n",
"ave_graph_size : 45.7094559768\n",
"min_graph_size : 2\n",
"max_graph_size : 438\n",
"ave_graph_edge_num : 47.7137903566\n",
"min_graph_edge_num : 1\n",
"max_graph_edge_num : 441\n",
"ave_graph_degree : 3.97605548006\n",
"min_graph_degree : 1\n",
"max_graph_degree : 12\n",
"node_label_num : 63\n",
"edge_label_num : 3\n",
"node_attr_dim : False\n",
"edge_attr_dim : False\n",
"class_number : 3\n",
"\n"
]
},
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'tqdm'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-1-1e4da065c026>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0mds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'dataset'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0mfilename_y\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'dataset_y'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m'dataset_y'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mds\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 47\u001b[0;31m extra_params=(ds['extra_params'] if 'extra_params' in ds else None))\n\u001b[0m\u001b[1;32m 48\u001b[0m attrs = get_dataset_attributes(\n\u001b[1;32m 49\u001b[0m dataset, target=y, node_label='atom', edge_label='bond_type')\n",
"\u001b[0;32m/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/utils/graphfiles.py\u001b[0m in \u001b[0;36mloadDataset\u001b[0;34m(filename, filename_y, extra_params)\u001b[0m\n\u001b[1;32m 377\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mextension\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"sdf\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 378\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 379\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mtqdm\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 380\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 381\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'tqdm'"
]
}
],
"source": [
@@ -491,6 +501,7 @@
"from pygraph.utils.graphdataset import get_dataset_attributes\n",
"\n",
"dslist = [\n",
" {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n",
" {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n",
" {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'},\n",
" {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'},\n",
@@ -557,7 +568,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
"version": "3.6.5"
}
},
"nbformat": 4,


notebooks/run_spkernel.ipynb (+4, -61)
File diff suppressed because it is too large


notebooks/run_spkernel.py (+78, -72)

@@ -1,89 +1,95 @@
# %load_ext line_profiler
# %matplotlib inline
import functools
from libs import *
from pygraph.kernels.spKernel import spkernel
from pygraph.utils.kernels import deltakernel, kernelsum
import multiprocessing
from sklearn.metrics.pairwise import rbf_kernel

# dslist = [
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node symb
# # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',}, # unlabeled
# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',}, # node/edge symb
# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',}, # contains single node graph, node symb
# # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # node/edge symb
# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb
from pygraph.kernels.spKernel import spkernel, spkernel_do
from pygraph.utils.kernels import deltakernel, kernelproduct
from pygraph.utils.model_selection_precomputed import trial_do

# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb
# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
# # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
# # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
# # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb
dslist = [
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
# 'task': 'regression'}, # node symb
# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, # contains single node graph, node symb
# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb
# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled
# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
{'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
# # node symb/nsymb
# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# # node/edge symb
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb

# # # not working below
# # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
# # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
# ]
# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
# # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
# # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
#
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb

import ast
ds = ast.literal_eval(sys.argv[1])
# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb

# # not working below
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
estimator = spkernel
mixkernel = functools.partial(kernelsum, deltakernel, rbf_kernel)
param_grid_precomputed = {
'node_kernels': [{
'symb': deltakernel,
'nsymb': rbf_kernel,
'mix': mixkernel
}]
}
param_grid = [{
'C': np.logspace(-10, 10, num=41, base=10)
}, {
'alpha': np.logspace(-10, 10, num=41, base=10)
}]
mixkernel = functools.partial(kernelproduct, deltakernel, rbf_kernel)
param_grid_precomputed = {'node_kernels': [
{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]

print()
print(ds['name'])
model_selection_for_precomputed_kernel(
ds['dataset'],
estimator,
param_grid_precomputed,
(param_grid[1]
if ('task' in ds and ds['task'] == 'regression') else param_grid[0]),
(ds['task'] if 'task' in ds else 'classification'),
NUM_TRIALS=30,
datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
ds_name=ds['name'])
for ds in dslist:
print()
print(ds['name'])
model_selection_for_precomputed_kernel(
ds['dataset'],
estimator,
param_grid_precomputed,
(param_grid[1] if ('task' in ds and ds['task']
== 'regression') else param_grid[0]),
(ds['task'] if 'task' in ds else 'classification'),
NUM_TRIALS=30,
datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
ds_name=ds['name'],
n_jobs=multiprocessing.cpu_count(),
read_gm_from_file=False)

# %lprun -f spkernel \
# %lprun -f trial_do -f spkernel -f spkernel_do -f model_selection_for_precomputed_kernel \
# model_selection_for_precomputed_kernel( \
# ds['dataset'], estimator, param_grid_precomputed, \
# ds['dataset'], \
# estimator, \
# param_grid_precomputed, \
# (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \
# (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30, \
# (ds['task'] if 'task' in ds else 'classification'), \
# NUM_TRIALS=30, \
# datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), \
# extra_params=(ds['extra_params'] if 'extra_params' in ds else None))
print()
# extra_params=(ds['extra_params'] if 'extra_params' in ds else None), \
# ds_name=ds['name'], \
# n_jobs=multiprocessing.cpu_count())
print()

# import functools
# from libs import *


pygraph/kernels/spKernel.py (+6, -5)

@@ -5,13 +5,12 @@

import sys
import pathlib
sys.path.insert(0, "../")
from tqdm import tqdm
import time
from itertools import combinations, combinations_with_replacement, product
from functools import partial
from joblib import Parallel, delayed
from multiprocessing import Pool
from tqdm import tqdm

import networkx as nx
import numpy as np
@@ -19,6 +18,8 @@ import numpy as np
from pygraph.utils.utils import getSPGraph
from pygraph.utils.graphdataset import get_dataset_attributes

sys.path.insert(0, "../")


def spkernel(*args,
node_label='atom',
@@ -48,13 +49,13 @@ def spkernel(*args,
Gn = args[0] if len(args) == 1 else [args[0], args[1]]

weight = None
if edge_weight == None:
if edge_weight is None:
print('\n No edge weight specified. Set all weights to 1.\n')
else:
try:
some_weight = list(
nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
if isinstance(some_weight, float) or isinstance(some_weight, int):
if isinstance(some_weight, (float, int)):
weight = edge_weight
else:
print(
@@ -241,7 +242,7 @@ def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij):
nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1],
e2[1])]
kn1 = nk11 * nk22
Kmatrix += kn1 + kn2
Kmatrix += kn1
else:
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
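Note on the spkernel_do change above: in that branch only kn1 = nk11 * nk22 is computed, and kn2 is not assigned anywhere in the shown code, so the old line Kmatrix += kn1 + kn2 appears to have accumulated a stale value left over from another case; the fix accumulates kn1 alone.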


pygraph/utils/model_selection_precomputed.py (+420, -210)

@@ -5,14 +5,15 @@ from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import KFold, train_test_split, ParameterGrid

from joblib import Parallel, delayed
#from joblib import Parallel, delayed
from multiprocessing import Pool
from functools import partial
import sys
sys.path.insert(0, "../")
import os
import time
from os.path import basename, splitext
import datetime
#from os.path import basename, splitext
from pygraph.utils.graphfiles import loadDataset
from tqdm import tqdm

@@ -26,7 +27,8 @@ def model_selection_for_precomputed_kernel(datafile,
datafile_y=None,
extra_params=None,
ds_name='ds-unknown',
n_jobs=1):
n_jobs=1,
read_gm_from_file=False):
"""Perform model selection, fitting and testing for precomputed kernels using nested cv. Print out neccessary data during the process then finally the results.

Parameters
@@ -45,6 +47,8 @@ def model_selection_for_precomputed_kernel(datafile,
Number of random trials of outer cv loop. The default is 30.
datafile_y : string
Path of file storing y data. This parameter is optional depending on the given dataset file.
read_gm_from_file : boolean
Whether gram matrices are loaded from file.

Examples
--------
@@ -65,7 +69,8 @@ def model_selection_for_precomputed_kernel(datafile,

results_dir = '../notebooks/results/' + estimator.__name__
# a string to save all the results.
str_fw = '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n'
str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n'
str_fw += '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n'

# setup the model type
model_type = model_type.lower()
@@ -75,119 +80,349 @@ def model_selection_for_precomputed_kernel(datafile,
)
print()
print('--- This is a %s problem ---' % model_type)
str_fw += 'This is a %s problem.\n\n' % model_type

# Load the dataset
print()
print('\nI. Loading dataset from file...')
dataset, y = loadDataset(
datafile, filename_y=datafile_y, extra_params=extra_params)

# import matplotlib.pyplot as plt
# import networkx as nx
# nx.draw_networkx(dataset[30])
# plt.show()

# Grid of parameters with a discrete number of values for each.
param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
param_list = list(ParameterGrid(param_grid))
# np.savetxt(results_name_pre + 'param_grid_precomputed.dt',
# [[key, value] for key, value in sorted(param_grid_precomputed)])
# np.savetxt(results_name_pre + 'param_grid.dt',
# [[key, value] for key, value in sorted(param_grid)])

gram_matrices = [
] # a list to store gram matrices for all param_grid_precomputed
gram_matrix_time = [
] # a list to store time to calculate gram matrices
param_list_pre_revised = [
] # list to store param grids precomputed ignoring the useless ones

# calculate all gram matrices
print()
print('2. Calculating gram matrices. This could take a while...')
str_fw += '\nI. Gram matrices.\n\n'
tts = time.time() # start training time
nb_gm_ignore = 0 # the number of gram matrices that should not be considered, as they may contain elements that are not numbers (NaN)
for idx, params_out in enumerate(param_list_precomputed):
params_out['n_jobs'] = n_jobs
rtn_data = estimator(dataset, **params_out)
Kmatrix = rtn_data[0]
current_run_time = rtn_data[1]
if len(rtn_data) == 3:
idx_trim = rtn_data[2] # the index of trimmed graph list
y = [y[idx] for idx in idx_trim]

Kmatrix_diag = Kmatrix.diagonal().copy()
# remove graphs whose kernels with themselves are zeros
nb_g_ignore = 0
for idx, diag in enumerate(Kmatrix_diag):
if diag == 0:
Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=0)
Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=1)
nb_g_ignore += 1
# normalization
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
Kmatrix[j][i] = Kmatrix[i][j]

str_fw += 'This is a %s problem.\n' % model_type
# calculate gram matrices rather than read them from file.
if read_gm_from_file == False:
# Load the dataset
print()
if params_out == {}:
print('the gram matrix is: ')
str_fw += 'the gram matrix is:\n\n'
else:
print('the gram matrix with parameters', params_out, 'is: ')
str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out
if len(Kmatrix) < 2:
nb_gm_ignore += 1
print('ignored, as at most one of its diagonal values is non-zero.')
str_fw += 'ignored, as at most one of its diagonal values is non-zero.\n\n'
else:
if np.isnan(Kmatrix).any(
): # if the matrix contains elements that are not numbers
print('\n1. Loading dataset from file...')
dataset, y = loadDataset(
datafile, filename_y=datafile_y, extra_params=extra_params)

# import matplotlib.pyplot as plt
# import networkx as nx
# nx.draw_networkx(dataset[30])
# plt.show()
# Grid of parameters with a discrete number of values for each.
param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
param_list = list(ParameterGrid(param_grid))
gram_matrices = [
] # a list to store gram matrices for all param_grid_precomputed
gram_matrix_time = [
] # a list to store time to calculate gram matrices
param_list_pre_revised = [
] # list to store param grids precomputed ignoring the useless ones
# calculate all gram matrices
print()
print('2. Calculating gram matrices. This could take a while...')
str_fw += '\nII. Gram matrices.\n\n'
tts = time.time() # start training time
nb_gm_ignore = 0 # the number of gram matrices that should not be considered, as they may contain elements that are not numbers (NaN)
for idx, params_out in enumerate(param_list_precomputed):
params_out['n_jobs'] = n_jobs
rtn_data = estimator(dataset, **params_out)
Kmatrix = rtn_data[0]
current_run_time = rtn_data[1]
# for some kernels, some graphs in datasets may not meet the
# kernels' requirements for graph structure. These graphs are trimmed.
if len(rtn_data) == 3:
idx_trim = rtn_data[2] # the index of trimmed graph list
y = [y[idx] for idx in idx_trim] # trim y accordingly
Kmatrix_diag = Kmatrix.diagonal().copy()
# remove graphs whose kernels with themselves are zeros
nb_g_ignore = 0
for idx, diag in enumerate(Kmatrix_diag):
if diag == 0:
Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=0)
Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=1)
nb_g_ignore += 1
# normalization
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
Kmatrix[j][i] = Kmatrix[i][j]
print()
if params_out == {}:
print('the gram matrix is: ')
str_fw += 'the gram matrix is:\n\n'
else:
print('the gram matrix with parameters', params_out, 'is: ')
str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out
if len(Kmatrix) < 2:
nb_gm_ignore += 1
print('ignored, as it contains elements that are not numbers.')
str_fw += 'ignored, as it contains elements that are not numbers.\n\n'
print('ignored, as at most one of its diagonal values is non-zero.')
str_fw += 'ignored, as at most one of its diagonal values is non-zero.\n\n'
else:
if np.isnan(Kmatrix).any(
): # if the matrix contains elements that are not numbers
nb_gm_ignore += 1
print('ignored, as it contains elements that are not numbers.')
str_fw += 'ignored, as it contains elements that are not numbers.\n\n'
else:
print(Kmatrix)
str_fw += np.array2string(
Kmatrix,
separator=',') + '\n\n'
# separator=',',
# threshold=np.inf,
# floatmode='unique') + '\n\n'

fig_file_name = results_dir + '/GM[ds]' + ds_name
if params_out != {}:
fig_file_name += '[params]' + str(idx)
plt.imshow(Kmatrix)
plt.colorbar()
plt.savefig(fig_file_name + '.eps', format='eps', dpi=300)
plt.show()
gram_matrices.append(Kmatrix)
gram_matrix_time.append(current_run_time)
param_list_pre_revised.append(params_out)
if nb_g_ignore > 0:
print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore)
str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore
print()
print(
'{} gram matrices are calculated, {} of which are ignored.'.format(
len(param_list_precomputed), nb_gm_ignore))
str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore)
str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n'
str_fw += ''.join([
'{}: {}\n'.format(idx, params_out)
for idx, params_out in enumerate(param_list_precomputed)
])
print()
if len(gram_matrices) == 0:
print('all gram matrices are ignored, no results obtained.')
str_fw += '\nall gram matrices are ignored, no results obtained.\n\n'
else:
# save gram matrices to file.
np.savez(results_dir + '/' + ds_name + '.gm',
gms=gram_matrices, params=param_list_pre_revised, y=y,
gmtime=gram_matrix_time)
print(
'3. Fitting and predicting using nested cross validation. This could really take a while...'
)
pool = Pool(n_jobs)
trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
train_pref = []
val_pref = []
test_pref = []
if NUM_TRIALS < 100:
chunksize, extra = divmod(NUM_TRIALS, n_jobs * 4)
if extra:
chunksize += 1
else:
print(Kmatrix)
str_fw += np.array2string(
Kmatrix,
separator=',',
threshold=np.inf,
floatmode='unique') + '\n\n'
plt.matshow(Kmatrix)
plt.colorbar()
fig_file_name = results_dir + '/GM[ds]' + ds_name
if params_out != {}:
fig_file_name += '[params]' + str(idx)
plt.savefig(fig_file_name + '.eps', format='eps', dpi=300)
plt.show()
gram_matrices.append(Kmatrix)
gram_matrix_time.append(current_run_time)
param_list_pre_revised.append(params_out)
if nb_g_ignore > 0:
print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore)
str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore
print()
print(
'{} gram matrices are calculated, {} of which are ignored.'.format(
len(param_list_precomputed), nb_gm_ignore))
str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore)
str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n'
str_fw += ''.join([
'{}: {}\n'.format(idx, params_out)
for idx, params_out in enumerate(param_list_precomputed)
])

print()
if len(gram_matrices) == 0:
print('all gram matrices are ignored, no results obtained.')
str_fw += '\nall gram matrices are ignored, no results obtained.\n\n'
else:
chunksize = 100
for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout):
train_pref.append(o1)
val_pref.append(o2)
test_pref.append(o3)
pool.close()
pool.join()
# # ---- use pool.map to parallel. ----
# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
# train_pref = [item[0] for item in result_perf]
# val_pref = [item[1] for item in result_perf]
# test_pref = [item[2] for item in result_perf]
# # ---- use joblib.Parallel to parallel and track progress. ----
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
# result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS))
# train_pref = [item[0] for item in result_perf]
# val_pref = [item[1] for item in result_perf]
# test_pref = [item[2] for item in result_perf]
# # ---- direct running, normally use a single CPU core. ----
# train_pref = []
# val_pref = []
# test_pref = []
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)
print()
print('4. Getting final performance...')
str_fw += '\nIII. Performance.\n\n'
# averages and confidences of performances on outer trials for each combination of parameters
average_train_scores = np.mean(train_pref, axis=0)
average_val_scores = np.mean(val_pref, axis=0)
average_perf_scores = np.mean(test_pref, axis=0)
# sample std is used here
std_train_scores = np.std(train_pref, axis=0, ddof=1)
std_val_scores = np.std(val_pref, axis=0, ddof=1)
std_perf_scores = np.std(test_pref, axis=0, ddof=1)
if model_type == 'regression':
best_val_perf = np.amin(average_val_scores)
else:
best_val_perf = np.amax(average_val_scores)
best_params_index = np.where(average_val_scores == best_val_perf)
# find smallest val std with best val perf.
best_val_stds = [
std_val_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
min_val_std = np.amin(best_val_stds)
best_params_index = np.where(std_val_scores == min_val_std)
best_params_out = [
param_list_pre_revised[i] for i in best_params_index[0]
]
best_params_in = [param_list[i] for i in best_params_index[1]]
print('best_params_out: ', best_params_out)
print('best_params_in: ', best_params_in)
print()
print('best_val_perf: ', best_val_perf)
print('best_val_std: ', min_val_std)
str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out
str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in
str_fw += 'best_val_perf: %s\n' % best_val_perf
str_fw += 'best_val_std: %s\n' % min_val_std
final_performance = [
average_perf_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
final_confidence = [
std_perf_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
print('final_performance: ', final_performance)
print('final_confidence: ', final_confidence)
str_fw += 'final_performance: %s\n' % final_performance
str_fw += 'final_confidence: %s\n' % final_confidence
train_performance = [
average_train_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
train_std = [
std_train_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
print('train_performance: %s' % train_performance)
print('train_std: ', train_std)
str_fw += 'train_performance: %s\n' % train_performance
str_fw += 'train_std: %s\n\n' % train_std
print()
tt_total = time.time() - tts # training time for all hyper-parameters
average_gram_matrix_time = np.mean(gram_matrix_time)
std_gram_matrix_time = np.std(gram_matrix_time, ddof=1)
best_gram_matrix_time = [
gram_matrix_time[i] for i in best_params_index[0]
]
ave_bgmt = np.mean(best_gram_matrix_time)
std_bgmt = np.std(best_gram_matrix_time, ddof=1)
print(
'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
.format(average_gram_matrix_time, std_gram_matrix_time))
print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(
ave_bgmt, std_bgmt))
print(
'total training time with all hyper-param choices: {:.2f}s'.format(
tt_total))
str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time)
str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total)
# # save results to file
# np.savetxt(results_name_pre + 'average_train_scores.dt',
# average_train_scores)
# np.savetxt(results_name_pre + 'average_val_scores', average_val_scores)
# np.savetxt(results_name_pre + 'average_perf_scores.dt',
# average_perf_scores)
# np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores)
# np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores)
# np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores)
# np.save(results_name_pre + 'best_params_index', best_params_index)
# np.save(results_name_pre + 'best_params_pre.dt', best_params_out)
# np.save(results_name_pre + 'best_params_in.dt', best_params_in)
# np.save(results_name_pre + 'best_val_perf.dt', best_val_perf)
# np.save(results_name_pre + 'best_val_std.dt', best_val_std)
# np.save(results_name_pre + 'final_performance.dt', final_performance)
# np.save(results_name_pre + 'final_confidence.dt', final_confidence)
# np.save(results_name_pre + 'train_performance.dt', train_performance)
# np.save(results_name_pre + 'train_std.dt', train_std)
# np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time)
# np.save(results_name_pre + 'average_gram_matrix_time.dt',
# average_gram_matrix_time)
# np.save(results_name_pre + 'std_gram_matrix_time.dt',
# std_gram_matrix_time)
# np.save(results_name_pre + 'best_gram_matrix_time.dt',
# best_gram_matrix_time)
# print out as table.
from collections import OrderedDict
from tabulate import tabulate
table_dict = {}
if model_type == 'regression':
for param_in in param_list:
param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
else:
for param_in in param_list:
param_in['C'] = '{:.2e}'.format(param_in['C'])
table_dict['params'] = [{**param_out, **param_in}
for param_in in param_list for param_out in param_list_pre_revised]
table_dict['gram_matrix_time'] = [
'{:.2f}'.format(gram_matrix_time[index_out])
for param_in in param_list
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['valid_perf'] = [
'{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
std_val_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['test_perf'] = [
'{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
std_perf_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['train_perf'] = [
'{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
std_train_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
keyorder = [
'params', 'train_perf', 'valid_perf', 'test_perf',
'gram_matrix_time'
]
print()
tb_print = tabulate(
OrderedDict(
sorted(table_dict.items(),
key=lambda i: keyorder.index(i[0]))),
headers='keys')
print(tb_print)
str_fw += 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print
# read gram matrices from file.
else:
# Grid of parameters with a discrete number of values for each.
# param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
param_list = list(ParameterGrid(param_grid))
# read gram matrices from file.
print()
print('2. Reading gram matrices from file...')
str_fw += '\nII. Gram matrices.\n\nGram matrices are read from file, see last log for detail.\n'
gmfile = np.load(results_dir + '/' + ds_name + '.gm.npz')
gram_matrices = gmfile['gms'] # a list to store gram matrices for all param_grid_precomputed
param_list_pre_revised = gmfile['params'] # list to store param grids precomputed ignoring the useless ones
y = gmfile['y'].tolist()
tts = time.time() # start training time
# nb_gm_ignore = 0 # the number of gram matrices that should not be considered, as they may contain elements that are not numbers (NaN)
print(
'3. Fitting and predicting using nested cross validation. This could really take a while...'
)
pool = Pool(n_jobs)
trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
train_pref = []
@@ -205,7 +440,7 @@ def model_selection_for_precomputed_kernel(datafile,
test_pref.append(o3)
pool.close()
pool.join()
# # ---- use pool.map to parallel. ----
# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
# train_pref = [item[0] for item in result_perf]
@@ -219,19 +454,19 @@ def model_selection_for_precomputed_kernel(datafile,
# val_pref = [item[1] for item in result_perf]
# test_pref = [item[2] for item in result_perf]

# # ---- direct running, normally use single CPU core. ----
# train_pref = []
# val_pref = []
# test_pref = []
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)
# # ---- direct running, normally use a single CPU core. ----
# train_pref = []
# val_pref = []
# test_pref = []
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)

print()
print('4. Getting final performance...')
str_fw += '\nII. Performance.\n\n'
str_fw += '\nIII. Performance.\n\n'
# averages and confidences of performances on outer trials for each combination of parameters
average_train_scores = np.mean(train_pref, axis=0)
average_val_scores = np.mean(val_pref, axis=0)
@@ -293,53 +528,25 @@ def model_selection_for_precomputed_kernel(datafile,
str_fw += 'train_std: %s\n\n' % train_std

print()
tt_total = time.time() - tts # training time for all hyper-parameters
average_gram_matrix_time = np.mean(gram_matrix_time)
std_gram_matrix_time = np.std(gram_matrix_time, ddof=1)
best_gram_matrix_time = [
gram_matrix_time[i] for i in best_params_index[0]
]
ave_bgmt = np.mean(best_gram_matrix_time)
std_bgmt = np.std(best_gram_matrix_time, ddof=1)
print(
'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
.format(average_gram_matrix_time, std_gram_matrix_time))
print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(
ave_bgmt, std_bgmt))
tt_poster = time.time() - tts # training time with hyper-param choices that did not participate in calculation of gram matrices
# average_gram_matrix_time = np.mean(gram_matrix_time)
# std_gram_matrix_time = np.std(gram_matrix_time, ddof=1)
# best_gram_matrix_time = [
# gram_matrix_time[i] for i in best_params_index[0]
# ]
# ave_bgmt = np.mean(best_gram_matrix_time)
# std_bgmt = np.std(best_gram_matrix_time, ddof=1)
# print(
# 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
# .format(average_gram_matrix_time, std_gram_matrix_time))
# print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(
# ave_bgmt, std_bgmt))
print(
'total training time with all hyper-param choices: {:.2f}s'.format(
tt_total))
str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time)
str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total)

# # save results to file
# np.savetxt(results_name_pre + 'average_train_scores.dt',
# average_train_scores)
# np.savetxt(results_name_pre + 'average_val_scores', average_val_scores)
# np.savetxt(results_name_pre + 'average_perf_scores.dt',
# average_perf_scores)
# np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores)
# np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores)
# np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores)

# np.save(results_name_pre + 'best_params_index', best_params_index)
# np.save(results_name_pre + 'best_params_pre.dt', best_params_out)
# np.save(results_name_pre + 'best_params_in.dt', best_params_in)
# np.save(results_name_pre + 'best_val_perf.dt', best_val_perf)
# np.save(results_name_pre + 'best_val_std.dt', best_val_std)
# np.save(results_name_pre + 'final_performance.dt', final_performance)
# np.save(results_name_pre + 'final_confidence.dt', final_confidence)
# np.save(results_name_pre + 'train_performance.dt', train_performance)
# np.save(results_name_pre + 'train_std.dt', train_std)

# np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time)
# np.save(results_name_pre + 'average_gram_matrix_time.dt',
# average_gram_matrix_time)
# np.save(results_name_pre + 'std_gram_matrix_time.dt',
# std_gram_matrix_time)
# np.save(results_name_pre + 'best_gram_matrix_time.dt',
# best_gram_matrix_time)
'training time with hyper-param choices that did not participate in calculation of gram matrices: {:.2f}s'.format(
tt_poster))
# str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time)
# str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
str_fw += 'training time with hyper-param choices that did not participate in calculation of gram matrices: {:.2f}s\n\n'.format(tt_poster)

# print out as table.
from collections import OrderedDict
@@ -353,11 +560,11 @@ def model_selection_for_precomputed_kernel(datafile,
param_in['C'] = '{:.2e}'.format(param_in['C'])
table_dict['params'] = [{**param_out, **param_in}
for param_in in param_list for param_out in param_list_pre_revised]
table_dict['gram_matrix_time'] = [
'{:.2f}'.format(gram_matrix_time[index_out])
for param_in in param_list
for index_out, _ in enumerate(param_list_pre_revised)
]
# table_dict['gram_matrix_time'] = [
# '{:.2f}'.format(gram_matrix_time[index_out])
# for param_in in param_list
# for index_out, _ in enumerate(param_list_pre_revised)
# ]
table_dict['valid_perf'] = [
'{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
std_val_scores[index_out][index_in])
@@ -377,8 +584,7 @@ def model_selection_for_precomputed_kernel(datafile,
for index_out, _ in enumerate(param_list_pre_revised)
]
keyorder = [
'params', 'train_perf', 'valid_perf', 'test_perf',
'gram_matrix_time'
'params', 'train_perf', 'valid_perf', 'test_perf'
]
print()
tb_print = tabulate(
@@ -392,59 +598,62 @@ def model_selection_for_precomputed_kernel(datafile,
# open file to save all results for this dataset.
if not os.path.exists(results_dir):
os.makedirs(results_dir)

with open(results_dir + '/' + ds_name + '.txt', 'w') as fresults:
fresults.write(str_fw)
fresults.close()
# open file to save all results for this dataset.
if not os.path.exists(results_dir):
os.makedirs(results_dir)
if not os.path.exists(results_dir + '/' + ds_name + '.output.txt'):
with open(results_dir + '/' + ds_name + '.output.txt', 'w') as f:
f.write(str_fw)
else:
with open(results_dir + '/' + ds_name + '.output.txt', 'r+') as f:
content = f.read()
f.seek(0, 0)
f.write(str_fw + '\n\n\n' + content)


def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, trial): # Test set level

# Arrays to store scores
train_pref = np.zeros((len(param_list_pre_revised),
len(param_list)))
val_pref = np.zeros((len(param_list_pre_revised),
len(param_list)))
test_pref = np.zeros((len(param_list_pre_revised),
len(param_list)))
train_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
val_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
test_pref = np.zeros((len(param_list_pre_revised), len(param_list)))

# loop for each outer param tuple
for index_out, params_out in enumerate(param_list_pre_revised):
# split gram matrix and y to app and test sets.
X_app, X_test, y_app, y_test = train_test_split(
gram_matrices[index_out], y, test_size=0.1)
split_index_app = [y.index(y_i) for y_i in y_app if y_i in y]
# split_index_test = [y.index(y_i) for y_i in y_test if y_i in y]
X_app = X_app[:, split_index_app]
X_test = X_test[:, split_index_app]
indices = range(len(y))
X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split(
gram_matrices[index_out], y, indices, test_size=0.1,
random_state=None, shuffle=True)
X_app = X_app[:, idx_app]
X_test = X_test[:, idx_app]
y_app = np.array(y_app)
y_test = np.array(y_test)
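Why this split fix matters: the previous versions split the gram matrix with train_test_split and then tried to recover the training columns with y.index(y_i), but list.index always returns the first occurrence of a value, so whenever y contains duplicate targets (always the case in classification) the recovered column indices were wrong and the sub-matrices no longer matched the split. Passing an explicit index array through train_test_split, as above, returns the true permutation. A minimal sketch of the corrected pattern on hypothetical toy data:

import numpy as np
from sklearn.model_selection import train_test_split

K = np.arange(25, dtype=float).reshape(5, 5)  # toy 5x5 gram matrix
y = [0, 1, 0, 1, 0]  # duplicate labels would break y.index()
indices = list(range(len(y)))
# scikit-learn splits any number of aligned arrays consistently
K_app, K_test, y_app, y_test, idx_app, idx_test = train_test_split(
    K, y, indices, test_size=0.4, shuffle=True)
# keep only the app columns so both sub-matrices are kernels
# between their rows and the app graphs
K_app = K_app[:, idx_app]
K_test = K_test[:, idx_app]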

# loop for each inner param tuple
for index_in, params_in in enumerate(param_list):
inner_cv = KFold(
n_splits=10, shuffle=True, random_state=trial)
inner_cv = KFold(n_splits=10, shuffle=True, random_state=trial)
current_train_perf = []
current_valid_perf = []
current_test_perf = []
current_test_perf = []

# For regression use the Kernel Ridge method
try:
if model_type == 'regression':
KR = KernelRidge(kernel='precomputed', **params_in)
kr = KernelRidge(kernel='precomputed', **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(
X_app):
KR.fit(X_app[train_index, :][:, train_index],
for train_index, valid_index in inner_cv.split(X_app):
kr.fit(X_app[train_index, :][:, train_index],
y_app[train_index])

# predict on the train, validation and test set
y_pred_train = KR.predict(
y_pred_train = kr.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = KR.predict(
y_pred_valid = kr.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = KR.predict(
y_pred_test = kr.predict(
X_test[:, train_index])

# root mean squared errors
@@ -460,22 +669,23 @@ def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, t
np.sqrt(
mean_squared_error(
y_test, y_pred_test)))
# For classification use SVM
# For classification use SVM
else:
KR = SVC(kernel='precomputed', **params_in)
svc = SVC(kernel='precomputed', cache_size=200,
verbose=False, **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(
X_app):
KR.fit(X_app[train_index, :][:, train_index],
for train_index, valid_index in inner_cv.split(X_app):
# np.savez("bug.npy",X_app[train_index, :][:, train_index],y_app[train_index])
svc.fit(X_app[train_index, :][:, train_index],
y_app[train_index])
# predict on the train, validation and test set
y_pred_train = KR.predict(
y_pred_train = svc.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = KR.predict(
y_pred_valid = svc.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = KR.predict(
y_pred_test = svc.predict(
X_test[:, train_index])

# root mean squared errors

