2. Save gram matrices and related data before the cross-validation step when using function cross_validation_pre_computed, in case something goes wrong with CV. Parameter read_gm_from_file can be used to choose whether to read gram matrices from file. 3. Add some test code to check if a gram matrix is symmetric and positive semi-definite. v0.1
@@ -3,7 +3,7 @@ A python package for graph kernels. | |||
## Requirements | |||
numpy==1.14.5 | |||
numpy==1.15.1 | |||
scipy==1.1.0 | |||
matplotlib==2.2.2 | |||
networkx==2.1 | |||
@@ -12,30 +12,52 @@ | |||
"output_type": "stream", | |||
"text": [ | |||
"\n", | |||
"Letter-med:\n", | |||
"substructures : {'linear', 'non linear'}\n", | |||
"node_labeled : False\n", | |||
"edge_labeled : False\n", | |||
"is_directed : False\n", | |||
"dataset_size : 2250\n", | |||
"ave_graph_size : 4.674666666666667\n", | |||
"min_graph_size : 1\n", | |||
"max_graph_size : 9\n", | |||
"ave_graph_edge_num : 3.2057777777777776\n", | |||
"min_graph_edge_num : 0\n", | |||
"max_graph_edge_num : 7\n", | |||
"ave_graph_degree : 2.012888888888889\n", | |||
"min_graph_degree : 0\n", | |||
"max_graph_degree : 4\n", | |||
"node_label_num : 0\n", | |||
"edge_label_num : 0\n", | |||
"node_attr_dim : 2\n", | |||
"edge_attr_dim : 0\n", | |||
"class_number : 15\n", | |||
"\n", | |||
"\n", | |||
"Mutagenicity:\n", | |||
"substructures : {'non linear', 'linear'}\n", | |||
"substructures : {'linear', 'non linear'}\n", | |||
"node_labeled : True\n", | |||
"edge_labeled : True\n", | |||
"is_directed : False\n", | |||
"dataset_size : 4337\n", | |||
"ave_graph_size : 30.3177311506\n", | |||
"ave_graph_size : 30.317731150564907\n", | |||
"min_graph_size : 4\n", | |||
"max_graph_size : 417\n", | |||
"ave_graph_edge_num : 30.7694258704\n", | |||
"ave_graph_edge_num : 30.76942587041734\n", | |||
"min_graph_edge_num : 3\n", | |||
"max_graph_edge_num : 112\n", | |||
"ave_graph_degree : 3.75651371916\n", | |||
"ave_graph_degree : 3.75651371916071\n", | |||
"min_graph_degree : 3\n", | |||
"max_graph_degree : 4\n", | |||
"node_label_num : 14\n", | |||
"edge_label_num : 3\n", | |||
"node_attr_dim : False\n", | |||
"edge_attr_dim : False\n", | |||
"node_attr_dim : 0\n", | |||
"edge_attr_dim : 0\n", | |||
"class_number : 2\n", | |||
"\n", | |||
"\n", | |||
"AIDS:\n", | |||
"substructures : {'non linear', 'linear'}\n", | |||
"substructures : {'linear', 'non linear'}\n", | |||
"node_labeled : True\n", | |||
"edge_labeled : True\n", | |||
"is_directed : False\n", | |||
@@ -52,7 +74,7 @@ | |||
"node_label_num : 38\n", | |||
"edge_label_num : 3\n", | |||
"node_attr_dim : 4\n", | |||
"edge_attr_dim : False\n", | |||
"edge_attr_dim : 0\n", | |||
"class_number : 2\n", | |||
"\n", | |||
"\n", | |||
@@ -62,13 +84,13 @@ | |||
"edge_labeled : False\n", | |||
"is_directed : False\n", | |||
"dataset_size : 41\n", | |||
"ave_graph_size : 1377.26829268\n", | |||
"ave_graph_size : 1377.2682926829268\n", | |||
"min_graph_size : 134\n", | |||
"max_graph_size : 5037\n", | |||
"ave_graph_edge_num : 3074.09756098\n", | |||
"ave_graph_edge_num : 3074.0975609756097\n", | |||
"min_graph_edge_num : 320\n", | |||
"max_graph_edge_num : 10888\n", | |||
"ave_graph_degree : 7.85365853659\n", | |||
"ave_graph_degree : 7.853658536585366\n", | |||
"min_graph_degree : 6\n", | |||
"max_graph_degree : 10\n", | |||
"node_label_num : 5\n", | |||
@@ -79,51 +101,51 @@ | |||
"\n", | |||
"\n", | |||
"MSRC9:\n", | |||
"substructures : {'non linear', 'linear'}\n", | |||
"substructures : {'linear', 'non linear'}\n", | |||
"node_labeled : True\n", | |||
"edge_labeled : False\n", | |||
"is_directed : False\n", | |||
"dataset_size : 221\n", | |||
"ave_graph_size : 40.5791855204\n", | |||
"ave_graph_size : 40.57918552036199\n", | |||
"min_graph_size : 25\n", | |||
"max_graph_size : 55\n", | |||
"ave_graph_edge_num : 97.9366515837\n", | |||
"ave_graph_edge_num : 97.9366515837104\n", | |||
"min_graph_edge_num : 53\n", | |||
"max_graph_edge_num : 145\n", | |||
"ave_graph_degree : 10.1583710407\n", | |||
"ave_graph_degree : 10.158371040723981\n", | |||
"min_graph_degree : 8\n", | |||
"max_graph_degree : 16\n", | |||
"node_label_num : 10\n", | |||
"edge_label_num : 0\n", | |||
"node_attr_dim : False\n", | |||
"edge_attr_dim : False\n", | |||
"node_attr_dim : 0\n", | |||
"edge_attr_dim : 0\n", | |||
"class_number : 8\n", | |||
"\n", | |||
"\n", | |||
"MSRC21:\n", | |||
"substructures : {'non linear', 'linear'}\n", | |||
"substructures : {'linear', 'non linear'}\n", | |||
"node_labeled : True\n", | |||
"edge_labeled : False\n", | |||
"is_directed : False\n", | |||
"dataset_size : 563\n", | |||
"ave_graph_size : 77.5204262877\n", | |||
"ave_graph_size : 77.52042628774423\n", | |||
"min_graph_size : 51\n", | |||
"max_graph_size : 141\n", | |||
"ave_graph_edge_num : 198.323268206\n", | |||
"ave_graph_edge_num : 198.32326820603907\n", | |||
"min_graph_edge_num : 121\n", | |||
"max_graph_edge_num : 405\n", | |||
"ave_graph_degree : 11.4156305506\n", | |||
"ave_graph_degree : 11.41563055062167\n", | |||
"min_graph_degree : 8\n", | |||
"max_graph_degree : 23\n", | |||
"node_label_num : 22\n", | |||
"edge_label_num : 0\n", | |||
"node_attr_dim : False\n", | |||
"edge_attr_dim : False\n", | |||
"node_attr_dim : 0\n", | |||
"edge_attr_dim : 0\n", | |||
"class_number : 20\n", | |||
"\n", | |||
"\n", | |||
"SYNTHETIC:\n", | |||
"substructures : {'non linear', 'linear'}\n", | |||
"substructures : {'linear', 'non linear'}\n", | |||
"node_labeled : True\n", | |||
"edge_labeled : False\n", | |||
"is_directed : False\n", | |||
@@ -140,42 +162,42 @@ | |||
"node_label_num : 8\n", | |||
"edge_label_num : 0\n", | |||
"node_attr_dim : 1\n", | |||
"edge_attr_dim : False\n", | |||
"edge_attr_dim : 0\n", | |||
"class_number : 2\n", | |||
"\n", | |||
"\n", | |||
"BZR:\n", | |||
"substructures : {'non linear', 'linear'}\n", | |||
"substructures : {'linear', 'non linear'}\n", | |||
"node_labeled : True\n", | |||
"edge_labeled : False\n", | |||
"is_directed : False\n", | |||
"dataset_size : 405\n", | |||
"ave_graph_size : 35.750617284\n", | |||
"ave_graph_size : 35.75061728395062\n", | |||
"min_graph_size : 13\n", | |||
"max_graph_size : 57\n", | |||
"ave_graph_edge_num : 38.3580246914\n", | |||
"ave_graph_edge_num : 38.358024691358025\n", | |||
"min_graph_edge_num : 13\n", | |||
"max_graph_edge_num : 60\n", | |||
"ave_graph_degree : 3.86419753086\n", | |||
"ave_graph_degree : 3.8641975308641974\n", | |||
"min_graph_degree : 3\n", | |||
"max_graph_degree : 4\n", | |||
"node_label_num : 10\n", | |||
"edge_label_num : 0\n", | |||
"node_attr_dim : 3\n", | |||
"edge_attr_dim : False\n", | |||
"edge_attr_dim : 0\n", | |||
"class_number : 2\n", | |||
"\n", | |||
"\n", | |||
"COX2:\n", | |||
"substructures : {'non linear', 'linear'}\n", | |||
"substructures : {'linear', 'non linear'}\n", | |||
"node_labeled : True\n", | |||
"edge_labeled : False\n", | |||
"is_directed : False\n", | |||
"dataset_size : 467\n", | |||
"ave_graph_size : 41.2248394004\n", | |||
"ave_graph_size : 41.224839400428266\n", | |||
"min_graph_size : 32\n", | |||
"max_graph_size : 56\n", | |||
"ave_graph_edge_num : 43.4453961456\n", | |||
"ave_graph_edge_num : 43.44539614561028\n", | |||
"min_graph_edge_num : 34\n", | |||
"max_graph_edge_num : 59\n", | |||
"ave_graph_degree : 4.0\n", | |||
@@ -184,152 +206,152 @@ | |||
"node_label_num : 8\n", | |||
"edge_label_num : 0\n", | |||
"node_attr_dim : 3\n", | |||
"edge_attr_dim : False\n", | |||
"edge_attr_dim : 0\n", | |||
"class_number : 2\n", | |||
"\n", | |||
"\n", | |||
"DHFR:\n", | |||
"substructures : {'non linear', 'linear'}\n", | |||
"substructures : {'linear', 'non linear'}\n", | |||
"node_labeled : True\n", | |||
"edge_labeled : False\n", | |||
"is_directed : False\n", | |||
"dataset_size : 756\n", | |||
"ave_graph_size : 42.4272486772\n", | |||
"ave_graph_size : 42.42724867724868\n", | |||
"min_graph_size : 20\n", | |||
"max_graph_size : 71\n", | |||
"ave_graph_edge_num : 44.544973545\n", | |||
"ave_graph_edge_num : 44.544973544973544\n", | |||
"min_graph_edge_num : 21\n", | |||
"max_graph_edge_num : 73\n", | |||
"ave_graph_degree : 3.95502645503\n", | |||
"ave_graph_degree : 3.955026455026455\n", | |||
"min_graph_degree : 3\n", | |||
"max_graph_degree : 4\n", | |||
"node_label_num : 9\n", | |||
"edge_label_num : 0\n", | |||
"node_attr_dim : 3\n", | |||
"edge_attr_dim : False\n", | |||
"edge_attr_dim : 0\n", | |||
"class_number : 2\n", | |||
"\n", | |||
"\n", | |||
"ENZYMES:\n", | |||
"substructures : {'non linear', 'linear'}\n", | |||
"substructures : {'linear', 'non linear'}\n", | |||
"node_labeled : True\n", | |||
"edge_labeled : False\n", | |||
"is_directed : False\n", | |||
"dataset_size : 600\n", | |||
"ave_graph_size : 32.6333333333\n", | |||
"ave_graph_size : 32.63333333333333\n", | |||
"min_graph_size : 2\n", | |||
"max_graph_size : 126\n", | |||
"ave_graph_edge_num : 62.1366666667\n", | |||
"ave_graph_edge_num : 62.13666666666666\n", | |||
"min_graph_edge_num : 1\n", | |||
"max_graph_edge_num : 149\n", | |||
"ave_graph_degree : 6.08666666667\n", | |||
"ave_graph_degree : 6.086666666666667\n", | |||
"min_graph_degree : 1\n", | |||
"max_graph_degree : 9\n", | |||
"node_label_num : 3\n", | |||
"edge_label_num : 0\n", | |||
"node_attr_dim : 18\n", | |||
"edge_attr_dim : False\n", | |||
"edge_attr_dim : 0\n", | |||
"class_number : 6\n", | |||
"\n", | |||
"\n", | |||
"PROTEINS:\n", | |||
"substructures : {'non linear', 'linear'}\n", | |||
"substructures : {'linear', 'non linear'}\n", | |||
"node_labeled : True\n", | |||
"edge_labeled : False\n", | |||
"is_directed : False\n", | |||
"dataset_size : 1113\n", | |||
"ave_graph_size : 39.0575022462\n", | |||
"ave_graph_size : 39.05750224618149\n", | |||
"min_graph_size : 4\n", | |||
"max_graph_size : 620\n", | |||
"ave_graph_edge_num : 72.8158131177\n", | |||
"ave_graph_edge_num : 72.8158131176999\n", | |||
"min_graph_edge_num : 5\n", | |||
"max_graph_edge_num : 1049\n", | |||
"ave_graph_degree : 5.79424977538\n", | |||
"ave_graph_degree : 5.794249775381851\n", | |||
"min_graph_degree : 3\n", | |||
"max_graph_degree : 25\n", | |||
"node_label_num : 3\n", | |||
"edge_label_num : 0\n", | |||
"node_attr_dim : 1\n", | |||
"edge_attr_dim : False\n", | |||
"edge_attr_dim : 0\n", | |||
"class_number : 2\n", | |||
"\n", | |||
"\n", | |||
"PROTEINS_full:\n", | |||
"substructures : {'non linear', 'linear'}\n", | |||
"substructures : {'linear', 'non linear'}\n", | |||
"node_labeled : True\n", | |||
"edge_labeled : False\n", | |||
"is_directed : False\n", | |||
"dataset_size : 1113\n", | |||
"ave_graph_size : 39.0575022462\n", | |||
"ave_graph_size : 39.05750224618149\n", | |||
"min_graph_size : 4\n", | |||
"max_graph_size : 620\n", | |||
"ave_graph_edge_num : 72.8158131177\n", | |||
"ave_graph_edge_num : 72.8158131176999\n", | |||
"min_graph_edge_num : 5\n", | |||
"max_graph_edge_num : 1049\n", | |||
"ave_graph_degree : 5.79424977538\n", | |||
"ave_graph_degree : 5.794249775381851\n", | |||
"min_graph_degree : 3\n", | |||
"max_graph_degree : 25\n", | |||
"node_label_num : 3\n", | |||
"edge_label_num : 0\n", | |||
"node_attr_dim : 29\n", | |||
"edge_attr_dim : False\n", | |||
"edge_attr_dim : 0\n", | |||
"class_number : 2\n", | |||
"\n", | |||
"\n", | |||
"D&D:\n", | |||
"substructures : {'non linear', 'linear'}\n", | |||
"substructures : {'linear', 'non linear'}\n", | |||
"node_labeled : True\n", | |||
"edge_labeled : False\n", | |||
"is_directed : False\n", | |||
"dataset_size : 1178\n", | |||
"ave_graph_size : 284.31663837\n", | |||
"ave_graph_size : 284.3166383701188\n", | |||
"min_graph_size : 30\n", | |||
"max_graph_size : 5748\n", | |||
"ave_graph_edge_num : 715.658743633\n", | |||
"ave_graph_edge_num : 715.6587436332767\n", | |||
"min_graph_edge_num : 63\n", | |||
"max_graph_edge_num : 14267\n", | |||
"ave_graph_degree : 9.50933786078\n", | |||
"ave_graph_degree : 9.509337860780985\n", | |||
"min_graph_degree : 6\n", | |||
"max_graph_degree : 19\n", | |||
"node_label_num : 82\n", | |||
"edge_label_num : 0\n", | |||
"node_attr_dim : False\n", | |||
"edge_attr_dim : False\n", | |||
"node_attr_dim : 0\n", | |||
"edge_attr_dim : 0\n", | |||
"class_number : 2\n", | |||
"\n", | |||
"\n", | |||
"MUTAG:\n", | |||
"substructures : {'non linear', 'linear'}\n", | |||
"substructures : {'linear', 'non linear'}\n", | |||
"node_labeled : True\n", | |||
"edge_labeled : True\n", | |||
"is_directed : False\n", | |||
"dataset_size : 188\n", | |||
"ave_graph_size : 17.9308510638\n", | |||
"ave_graph_size : 17.930851063829788\n", | |||
"min_graph_size : 10\n", | |||
"max_graph_size : 28\n", | |||
"ave_graph_edge_num : 19.7925531915\n", | |||
"ave_graph_edge_num : 19.79255319148936\n", | |||
"min_graph_edge_num : 10\n", | |||
"max_graph_edge_num : 33\n", | |||
"ave_graph_degree : 3.00531914894\n", | |||
"ave_graph_degree : 3.00531914893617\n", | |||
"min_graph_degree : 3\n", | |||
"max_graph_degree : 4\n", | |||
"node_label_num : 7\n", | |||
"edge_label_num : 11\n", | |||
"node_attr_dim : False\n", | |||
"edge_attr_dim : False\n", | |||
"node_attr_dim : 0\n", | |||
"edge_attr_dim : 0\n", | |||
"class_number : 2\n", | |||
"\n", | |||
"\n", | |||
"Alkane:\n", | |||
"substructures : {'non linear', 'linear'}\n", | |||
"substructures : {'linear', 'non linear'}\n", | |||
"node_labeled : True\n", | |||
"edge_labeled : False\n", | |||
"is_directed : False\n", | |||
"dataset_size : 150\n", | |||
"ave_graph_size : 8.87333333333\n", | |||
"ave_graph_size : 8.873333333333333\n", | |||
"min_graph_size : 1\n", | |||
"max_graph_size : 10\n", | |||
"ave_graph_edge_num : 7.87333333333\n", | |||
"ave_graph_edge_num : 7.873333333333333\n", | |||
"min_graph_edge_num : 0\n", | |||
"max_graph_edge_num : 9\n", | |||
"ave_graph_degree : 3.36\n", | |||
@@ -337,43 +359,43 @@ | |||
"max_graph_degree : 4\n", | |||
"node_label_num : 2\n", | |||
"edge_label_num : 1\n", | |||
"node_attr_dim : False\n", | |||
"edge_attr_dim : False\n", | |||
"node_attr_dim : 0\n", | |||
"edge_attr_dim : 0\n", | |||
"class_number : 123\n", | |||
"\n", | |||
"\n", | |||
"Acyclic:\n", | |||
"substructures : {'non linear', 'linear'}\n", | |||
"substructures : {'linear', 'non linear'}\n", | |||
"node_labeled : True\n", | |||
"edge_labeled : False\n", | |||
"is_directed : False\n", | |||
"dataset_size : 183\n", | |||
"ave_graph_size : 8.15300546448\n", | |||
"ave_graph_size : 8.153005464480874\n", | |||
"min_graph_size : 3\n", | |||
"max_graph_size : 11\n", | |||
"ave_graph_edge_num : 7.15300546448\n", | |||
"ave_graph_edge_num : 7.1530054644808745\n", | |||
"min_graph_edge_num : 2\n", | |||
"max_graph_edge_num : 10\n", | |||
"ave_graph_degree : 2.80327868852\n", | |||
"ave_graph_degree : 2.80327868852459\n", | |||
"min_graph_degree : 2\n", | |||
"max_graph_degree : 4\n", | |||
"node_label_num : 3\n", | |||
"edge_label_num : 1\n", | |||
"node_attr_dim : False\n", | |||
"edge_attr_dim : False\n", | |||
"node_attr_dim : 0\n", | |||
"edge_attr_dim : 0\n", | |||
"class_number : 148\n", | |||
"\n", | |||
"\n", | |||
"MAO:\n", | |||
"substructures : {'non linear', 'linear'}\n", | |||
"substructures : {'linear', 'non linear'}\n", | |||
"node_labeled : True\n", | |||
"edge_labeled : True\n", | |||
"is_directed : False\n", | |||
"dataset_size : 68\n", | |||
"ave_graph_size : 18.3823529412\n", | |||
"ave_graph_size : 18.38235294117647\n", | |||
"min_graph_size : 11\n", | |||
"max_graph_size : 27\n", | |||
"ave_graph_edge_num : 19.6323529412\n", | |||
"ave_graph_edge_num : 19.63235294117647\n", | |||
"min_graph_edge_num : 12\n", | |||
"max_graph_edge_num : 29\n", | |||
"ave_graph_degree : 3.0\n", | |||
@@ -381,107 +403,95 @@ | |||
"max_graph_degree : 3\n", | |||
"node_label_num : 3\n", | |||
"edge_label_num : 4\n", | |||
"node_attr_dim : False\n", | |||
"edge_attr_dim : False\n", | |||
"node_attr_dim : 0\n", | |||
"edge_attr_dim : 0\n", | |||
"class_number : 2\n", | |||
"\n", | |||
"\n" | |||
] | |||
}, | |||
{ | |||
"name": "stdout", | |||
"output_type": "stream", | |||
"text": [ | |||
"\n", | |||
"PAH:\n", | |||
"substructures : {'non linear', 'linear'}\n", | |||
"substructures : {'linear', 'non linear'}\n", | |||
"node_labeled : False\n", | |||
"edge_labeled : False\n", | |||
"is_directed : False\n", | |||
"dataset_size : 94\n", | |||
"ave_graph_size : 20.7021276596\n", | |||
"ave_graph_size : 20.70212765957447\n", | |||
"min_graph_size : 10\n", | |||
"max_graph_size : 28\n", | |||
"ave_graph_edge_num : 24.4255319149\n", | |||
"ave_graph_edge_num : 24.425531914893618\n", | |||
"min_graph_edge_num : 11\n", | |||
"max_graph_edge_num : 34\n", | |||
"ave_graph_degree : 3.01063829787\n", | |||
"ave_graph_degree : 3.0106382978723403\n", | |||
"min_graph_degree : 3\n", | |||
"max_graph_degree : 4\n", | |||
"node_label_num : 1\n", | |||
"edge_label_num : 1\n", | |||
"node_attr_dim : False\n", | |||
"edge_attr_dim : False\n", | |||
"node_attr_dim : 0\n", | |||
"edge_attr_dim : 0\n", | |||
"class_number : 2\n", | |||
"\n", | |||
"\n", | |||
"NCI1:\n", | |||
"substructures : {'non linear', 'linear'}\n", | |||
"substructures : {'linear', 'non linear'}\n", | |||
"node_labeled : True\n", | |||
"edge_labeled : False\n", | |||
"is_directed : False\n", | |||
"dataset_size : 4110\n", | |||
"ave_graph_size : 29.8654501217\n", | |||
"ave_graph_size : 29.8654501216545\n", | |||
"min_graph_size : 3\n", | |||
"max_graph_size : 111\n", | |||
"ave_graph_edge_num : 32.3\n", | |||
"min_graph_edge_num : 2\n", | |||
"max_graph_edge_num : 119\n", | |||
"ave_graph_degree : 3.33600973236\n", | |||
"ave_graph_degree : 3.3360097323600972\n", | |||
"min_graph_degree : 2\n", | |||
"max_graph_degree : 4\n", | |||
"node_label_num : 37\n", | |||
"edge_label_num : 0\n", | |||
"node_attr_dim : False\n", | |||
"edge_attr_dim : False\n", | |||
"node_attr_dim : 0\n", | |||
"edge_attr_dim : 0\n", | |||
"class_number : 2\n", | |||
"\n" | |||
] | |||
}, | |||
{ | |||
"name": "stdout", | |||
"output_type": "stream", | |||
"text": [ | |||
"\n", | |||
"\n", | |||
"NCI109:\n", | |||
"substructures : {'non linear', 'linear'}\n", | |||
"substructures : {'linear', 'non linear'}\n", | |||
"node_labeled : True\n", | |||
"edge_labeled : False\n", | |||
"is_directed : False\n", | |||
"dataset_size : 4127\n", | |||
"ave_graph_size : 29.6811243034\n", | |||
"ave_graph_size : 29.681124303368065\n", | |||
"min_graph_size : 4\n", | |||
"max_graph_size : 111\n", | |||
"ave_graph_edge_num : 32.1308456506\n", | |||
"ave_graph_edge_num : 32.13084565059365\n", | |||
"min_graph_edge_num : 3\n", | |||
"max_graph_edge_num : 119\n", | |||
"ave_graph_degree : 3.34383329295\n", | |||
"ave_graph_degree : 3.343833292948873\n", | |||
"min_graph_degree : 2\n", | |||
"max_graph_degree : 5\n", | |||
"node_label_num : 38\n", | |||
"edge_label_num : 0\n", | |||
"node_attr_dim : False\n", | |||
"edge_attr_dim : False\n", | |||
"node_attr_dim : 0\n", | |||
"edge_attr_dim : 0\n", | |||
"class_number : 2\n", | |||
"\n", | |||
"load SDF: 100%|██████████| 4457424/4457424 [00:11<00:00, 397248.47it/s]\n", | |||
"ajust data: 100%|██████████| 42687/42687 [00:10<00:00, 3939.72it/s] \n", | |||
"\n", | |||
"NCI-HIV:\n", | |||
"substructures : {'non linear', 'linear'}\n", | |||
"node_labeled : True\n", | |||
"edge_labeled : True\n", | |||
"is_directed : False\n", | |||
"dataset_size : 42682\n", | |||
"ave_graph_size : 45.7094559768\n", | |||
"min_graph_size : 2\n", | |||
"max_graph_size : 438\n", | |||
"ave_graph_edge_num : 47.7137903566\n", | |||
"min_graph_edge_num : 1\n", | |||
"max_graph_edge_num : 441\n", | |||
"ave_graph_degree : 3.97605548006\n", | |||
"min_graph_degree : 1\n", | |||
"max_graph_degree : 12\n", | |||
"node_label_num : 63\n", | |||
"edge_label_num : 3\n", | |||
"node_attr_dim : False\n", | |||
"edge_attr_dim : False\n", | |||
"class_number : 3\n", | |||
"\n" | |||
] | |||
}, | |||
{ | |||
"ename": "ModuleNotFoundError", | |||
"evalue": "No module named 'tqdm'", | |||
"output_type": "error", | |||
"traceback": [ | |||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |||
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", | |||
"\u001b[0;32m<ipython-input-1-1e4da065c026>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0mds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'dataset'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0mfilename_y\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'dataset_y'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m'dataset_y'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mds\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 47\u001b[0;31m extra_params=(ds['extra_params'] if 'extra_params' in ds else None))\n\u001b[0m\u001b[1;32m 48\u001b[0m attrs = get_dataset_attributes(\n\u001b[1;32m 49\u001b[0m dataset, target=y, node_label='atom', edge_label='bond_type')\n", | |||
"\u001b[0;32m/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/utils/graphfiles.py\u001b[0m in \u001b[0;36mloadDataset\u001b[0;34m(filename, filename_y, extra_params)\u001b[0m\n\u001b[1;32m 377\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mextension\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"sdf\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 378\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 379\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mtqdm\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 380\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 381\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |||
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'tqdm'" | |||
] | |||
} | |||
], | |||
"source": [ | |||
@@ -491,6 +501,7 @@ | |||
"from pygraph.utils.graphdataset import get_dataset_attributes\n", | |||
"\n", | |||
"dslist = [\n", | |||
" {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n", | |||
" {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n", | |||
" {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'},\n", | |||
" {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'},\n", | |||
@@ -557,7 +568,7 @@ | |||
"name": "python", | |||
"nbconvert_exporter": "python", | |||
"pygments_lexer": "ipython3", | |||
"version": "3.5.2" | |||
"version": "3.6.5" | |||
} | |||
}, | |||
"nbformat": 4, | |||
@@ -1,89 +1,95 @@ | |||
# %load_ext line_profiler | |||
# %matplotlib inline | |||
import functools | |||
from libs import * | |||
from pygraph.kernels.spKernel import spkernel | |||
from pygraph.utils.kernels import deltakernel, kernelsum | |||
import multiprocessing | |||
from sklearn.metrics.pairwise import rbf_kernel | |||
# dslist = [ | |||
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node symb | |||
# # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb | |||
# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',}, # unlabeled | |||
# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',}, # node/edge symb | |||
# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | |||
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb | |||
# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', | |||
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',}, # contains single node graph, node symb | |||
# # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb | |||
# # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb | |||
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # node/edge symb | |||
# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, # node symb/nsymb | |||
# # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'}, | |||
# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | |||
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb | |||
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb | |||
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb | |||
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb | |||
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb | |||
from pygraph.kernels.spKernel import spkernel, spkernel_do | |||
from pygraph.utils.kernels import deltakernel, kernelproduct | |||
from pygraph.utils.model_selection_precomputed import trial_do | |||
# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb | |||
# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb | |||
# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', | |||
# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb | |||
# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb | |||
# # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat', | |||
# # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb | |||
# # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat', | |||
# # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb | |||
# # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf', | |||
# # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb | |||
dslist = [ | |||
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||
# 'task': 'regression'}, # node symb | |||
# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', | |||
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, # contains single node graph, node symb | |||
# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb | |||
# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled | |||
# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | |||
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb | |||
{'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | |||
# # node symb/nsymb | |||
# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | |||
# # node/edge symb | |||
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | |||
# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', | |||
# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb | |||
# # # not working below | |||
# # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',}, | |||
# # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',}, | |||
# # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',}, | |||
# # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, | |||
# ] | |||
# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb | |||
# # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb | |||
# # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb | |||
# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'}, | |||
# | |||
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb | |||
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb | |||
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb | |||
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb | |||
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb | |||
import ast | |||
ds = ast.literal_eval(sys.argv[1]) | |||
# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb | |||
# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb | |||
# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb | |||
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat', | |||
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb | |||
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat', | |||
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb | |||
# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf', | |||
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb | |||
# # not working below | |||
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',}, | |||
# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',}, | |||
# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',}, | |||
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, | |||
] | |||
estimator = spkernel | |||
mixkernel = functools.partial(kernelsum, deltakernel, rbf_kernel) | |||
param_grid_precomputed = { | |||
'node_kernels': [{ | |||
'symb': deltakernel, | |||
'nsymb': rbf_kernel, | |||
'mix': mixkernel | |||
}] | |||
} | |||
param_grid = [{ | |||
'C': np.logspace(-10, 10, num=41, base=10) | |||
}, { | |||
'alpha': np.logspace(-10, 10, num=41, base=10) | |||
}] | |||
mixkernel = functools.partial(kernelproduct, deltakernel, rbf_kernel) | |||
param_grid_precomputed = {'node_kernels': [ | |||
{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]} | |||
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | |||
{'alpha': np.logspace(-10, 10, num=41, base=10)}] | |||
print() | |||
print(ds['name']) | |||
model_selection_for_precomputed_kernel( | |||
ds['dataset'], | |||
estimator, | |||
param_grid_precomputed, | |||
(param_grid[1] | |||
if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), | |||
(ds['task'] if 'task' in ds else 'classification'), | |||
NUM_TRIALS=30, | |||
datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), | |||
extra_params=(ds['extra_params'] if 'extra_params' in ds else None), | |||
ds_name=ds['name']) | |||
for ds in dslist: | |||
print() | |||
print(ds['name']) | |||
model_selection_for_precomputed_kernel( | |||
ds['dataset'], | |||
estimator, | |||
param_grid_precomputed, | |||
(param_grid[1] if ('task' in ds and ds['task'] | |||
== 'regression') else param_grid[0]), | |||
(ds['task'] if 'task' in ds else 'classification'), | |||
NUM_TRIALS=30, | |||
datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), | |||
extra_params=(ds['extra_params'] if 'extra_params' in ds else None), | |||
ds_name=ds['name'], | |||
n_jobs=multiprocessing.cpu_count(), | |||
read_gm_from_file=False) | |||
# %lprun -f spkernel \ | |||
# %lprun -f trial_do -f spkernel -f spkernel_do -f model_selection_for_precomputed_kernel \ | |||
# model_selection_for_precomputed_kernel( \ | |||
# ds['dataset'], estimator, param_grid_precomputed, \ | |||
# ds['dataset'], \ | |||
# estimator, \ | |||
# param_grid_precomputed, \ | |||
# (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \ | |||
# (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30, \ | |||
# (ds['task'] if 'task' in ds else 'classification'), \ | |||
# NUM_TRIALS=30, \ | |||
# datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), \ | |||
# extra_params=(ds['extra_params'] if 'extra_params' in ds else None)) | |||
print() | |||
# extra_params=(ds['extra_params'] if 'extra_params' in ds else None), \ | |||
# ds_name=ds['name'], \ | |||
# n_jobs=multiprocessing.cpu_count()) | |||
print() | |||
# import functools | |||
# from libs import * | |||
@@ -5,13 +5,12 @@ | |||
import sys | |||
import pathlib | |||
sys.path.insert(0, "../") | |||
from tqdm import tqdm | |||
import time | |||
from itertools import combinations, combinations_with_replacement, product | |||
from functools import partial | |||
from joblib import Parallel, delayed | |||
from multiprocessing import Pool | |||
from tqdm import tqdm | |||
import networkx as nx | |||
import numpy as np | |||
@@ -19,6 +18,8 @@ import numpy as np | |||
from pygraph.utils.utils import getSPGraph | |||
from pygraph.utils.graphdataset import get_dataset_attributes | |||
sys.path.insert(0, "../") | |||
def spkernel(*args, | |||
node_label='atom', | |||
@@ -48,13 +49,13 @@ def spkernel(*args, | |||
Gn = args[0] if len(args) == 1 else [args[0], args[1]] | |||
weight = None | |||
if edge_weight == None: | |||
if edge_weight is None: | |||
print('\n None edge weight specified. Set all weight to 1.\n') | |||
else: | |||
try: | |||
some_weight = list( | |||
nx.get_edge_attributes(Gn[0], edge_weight).values())[0] | |||
if isinstance(some_weight, float) or isinstance(some_weight, int): | |||
if isinstance(some_weight, (float, int)): | |||
weight = edge_weight | |||
else: | |||
print( | |||
@@ -241,7 +242,7 @@ def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij): | |||
nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1], | |||
e2[1])] | |||
kn1 = nk11 * nk22 | |||
Kmatrix += kn1 + kn2 | |||
Kmatrix += kn1 | |||
else: | |||
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): | |||
if e1[2]['cost'] == e2[2]['cost']: | |||
@@ -5,14 +5,15 @@ from sklearn.svm import SVC | |||
from sklearn.metrics import accuracy_score, mean_squared_error | |||
from sklearn.model_selection import KFold, train_test_split, ParameterGrid | |||
from joblib import Parallel, delayed | |||
#from joblib import Parallel, delayed | |||
from multiprocessing import Pool | |||
from functools import partial | |||
import sys | |||
sys.path.insert(0, "../") | |||
import os | |||
import time | |||
from os.path import basename, splitext | |||
import datetime | |||
#from os.path import basename, splitext | |||
from pygraph.utils.graphfiles import loadDataset | |||
from tqdm import tqdm | |||
@@ -26,7 +27,8 @@ def model_selection_for_precomputed_kernel(datafile, | |||
datafile_y=None, | |||
extra_params=None, | |||
ds_name='ds-unknown', | |||
n_jobs=1): | |||
n_jobs=1, | |||
read_gm_from_file=False): | |||
"""Perform model selection, fitting and testing for precomputed kernels using nested cv. Print out neccessary data during the process then finally the results. | |||
Parameters | |||
@@ -45,6 +47,8 @@ def model_selection_for_precomputed_kernel(datafile, | |||
Number of random trials of outer cv loop. The default is 30. | |||
datafile_y : string | |||
Path of file storing y data. This parameter is optional depending on the given dataset file. | |||
read_gm_from_file : boolean | |||
Whether gram matrices are loaded from file. | |||
Examples | |||
-------- | |||
@@ -65,7 +69,8 @@ def model_selection_for_precomputed_kernel(datafile, | |||
results_dir = '../notebooks/results/' + estimator.__name__ | |||
# a string to save all the results. | |||
str_fw = '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n' | |||
str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n' | |||
str_fw += '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n' | |||
# setup the model type | |||
model_type = model_type.lower() | |||
@@ -75,119 +80,349 @@ def model_selection_for_precomputed_kernel(datafile, | |||
) | |||
print() | |||
print('--- This is a %s problem ---' % model_type) | |||
str_fw += 'This is a %s problem.\n\n' % model_type | |||
# Load the dataset | |||
print() | |||
print('\nI. Loading dataset from file...') | |||
dataset, y = loadDataset( | |||
datafile, filename_y=datafile_y, extra_params=extra_params) | |||
# import matplotlib.pyplot as plt | |||
# import networkx as nx | |||
# nx.draw_networkx(dataset[30]) | |||
# plt.show() | |||
# Grid of parameters with a discrete number of values for each. | |||
param_list_precomputed = list(ParameterGrid(param_grid_precomputed)) | |||
param_list = list(ParameterGrid(param_grid)) | |||
# np.savetxt(results_name_pre + 'param_grid_precomputed.dt', | |||
# [[key, value] for key, value in sorted(param_grid_precomputed)]) | |||
# np.savetxt(results_name_pre + 'param_grid.dt', | |||
# [[key, value] for key, value in sorted(param_grid)]) | |||
gram_matrices = [ | |||
] # a list to store gram matrices for all param_grid_precomputed | |||
gram_matrix_time = [ | |||
] # a list to store time to calculate gram matrices | |||
param_list_pre_revised = [ | |||
] # list to store param grids precomputed ignoring the useless ones | |||
# calculate all gram matrices | |||
print() | |||
print('2. Calculating gram matrices. This could take a while...') | |||
str_fw += '\nI. Gram matrices.\n\n' | |||
tts = time.time() # start training time | |||
nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN) | |||
for idx, params_out in enumerate(param_list_precomputed): | |||
params_out['n_jobs'] = n_jobs | |||
rtn_data = estimator(dataset, **params_out) | |||
Kmatrix = rtn_data[0] | |||
current_run_time = rtn_data[1] | |||
if len(rtn_data) == 3: | |||
idx_trim = rtn_data[2] # the index of trimmed graph list | |||
y = [y[idx] for idx in idx_trim] | |||
Kmatrix_diag = Kmatrix.diagonal().copy() | |||
# remove graphs whose kernels with themselves are zeros | |||
nb_g_ignore = 0 | |||
for idx, diag in enumerate(Kmatrix_diag): | |||
if diag == 0: | |||
Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=0) | |||
Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=1) | |||
nb_g_ignore += 1 | |||
# normalization | |||
for i in range(len(Kmatrix)): | |||
for j in range(i, len(Kmatrix)): | |||
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) | |||
Kmatrix[j][i] = Kmatrix[i][j] | |||
str_fw += 'This is a %s problem.\n' % model_type | |||
# calculate gram matrices rather than read them from file. | |||
if read_gm_from_file == False: | |||
# Load the dataset | |||
print() | |||
if params_out == {}: | |||
print('the gram matrix is: ') | |||
str_fw += 'the gram matrix is:\n\n' | |||
else: | |||
print('the gram matrix with parameters', params_out, 'is: ') | |||
str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out | |||
if len(Kmatrix) < 2: | |||
nb_gm_ignore += 1 | |||
print('ignored, as at most only one of all its diagonal value is non-zero.') | |||
str_fw += 'ignored, as at most only one of all its diagonal value is non-zero.\n\n' | |||
else: | |||
if np.isnan(Kmatrix).any( | |||
): # if the matrix contains elements that are not numbers | |||
print('\n1. Loading dataset from file...') | |||
dataset, y = loadDataset( | |||
datafile, filename_y=datafile_y, extra_params=extra_params) | |||
# import matplotlib.pyplot as plt | |||
# import networkx as nx | |||
# nx.draw_networkx(dataset[30]) | |||
# plt.show() | |||
# Grid of parameters with a discrete number of values for each. | |||
param_list_precomputed = list(ParameterGrid(param_grid_precomputed)) | |||
param_list = list(ParameterGrid(param_grid)) | |||
gram_matrices = [ | |||
] # a list to store gram matrices for all param_grid_precomputed | |||
gram_matrix_time = [ | |||
] # a list to store time to calculate gram matrices | |||
param_list_pre_revised = [ | |||
] # list to store param grids precomputed ignoring the useless ones | |||
# calculate all gram matrices | |||
print() | |||
print('2. Calculating gram matrices. This could take a while...') | |||
str_fw += '\nII. Gram matrices.\n\n' | |||
tts = time.time() # start training time | |||
nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN) | |||
for idx, params_out in enumerate(param_list_precomputed): | |||
params_out['n_jobs'] = n_jobs | |||
rtn_data = estimator(dataset, **params_out) | |||
Kmatrix = rtn_data[0] | |||
current_run_time = rtn_data[1] | |||
# for some kernels, some graphs in datasets may not meet the | |||
# kernels' requirements for graph structure. These graphs are trimmed. | |||
if len(rtn_data) == 3: | |||
idx_trim = rtn_data[2] # the index of trimmed graph list | |||
y = [y[idx] for idx in idx_trim] # trim y accordingly | |||
Kmatrix_diag = Kmatrix.diagonal().copy() | |||
# remove graphs whose kernels with themselves are zeros | |||
nb_g_ignore = 0 | |||
for idx, diag in enumerate(Kmatrix_diag): | |||
if diag == 0: | |||
Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=0) | |||
Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=1) | |||
nb_g_ignore += 1 | |||
# normalization | |||
for i in range(len(Kmatrix)): | |||
for j in range(i, len(Kmatrix)): | |||
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) | |||
Kmatrix[j][i] = Kmatrix[i][j] | |||
print() | |||
if params_out == {}: | |||
print('the gram matrix is: ') | |||
str_fw += 'the gram matrix is:\n\n' | |||
else: | |||
print('the gram matrix with parameters', params_out, 'is: ') | |||
str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out | |||
if len(Kmatrix) < 2: | |||
nb_gm_ignore += 1 | |||
print('ignored, as it contains elements that are not numbers.') | |||
str_fw += 'ignored, as it contains elements that are not numbers.\n\n' | |||
print('ignored, as at most only one of all its diagonal value is non-zero.') | |||
str_fw += 'ignored, as at most only one of all its diagonal value is non-zero.\n\n' | |||
else: | |||
if np.isnan(Kmatrix).any( | |||
): # if the matrix contains elements that are not numbers | |||
nb_gm_ignore += 1 | |||
print('ignored, as it contains elements that are not numbers.') | |||
str_fw += 'ignored, as it contains elements that are not numbers.\n\n' | |||
else: | |||
print(Kmatrix) | |||
str_fw += np.array2string( | |||
Kmatrix, | |||
separator=',') + '\n\n' | |||
# separator=',', | |||
# threshold=np.inf, | |||
# floatmode='unique') + '\n\n' | |||
fig_file_name = results_dir + '/GM[ds]' + ds_name | |||
if params_out != {}: | |||
fig_file_name += '[params]' + str(idx) | |||
plt.imshow(Kmatrix) | |||
plt.colorbar() | |||
plt.savefig(fig_file_name + '.eps', format='eps', dpi=300) | |||
plt.show() | |||
gram_matrices.append(Kmatrix) | |||
gram_matrix_time.append(current_run_time) | |||
param_list_pre_revised.append(params_out) | |||
if nb_g_ignore > 0: | |||
print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore) | |||
str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore | |||
print() | |||
print( | |||
'{} gram matrices are calculated, {} of which are ignored.'.format( | |||
len(param_list_precomputed), nb_gm_ignore)) | |||
str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore) | |||
str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n' | |||
str_fw += ''.join([ | |||
'{}: {}\n'.format(idx, params_out) | |||
for idx, params_out in enumerate(param_list_precomputed) | |||
]) | |||
print() | |||
if len(gram_matrices) == 0: | |||
print('all gram matrices are ignored, no results obtained.') | |||
str_fw += '\nall gram matrices are ignored, no results obtained.\n\n' | |||
else: | |||
# save gram matrices to file. | |||
np.savez(results_dir + '/' + ds_name + '.gm', | |||
gms=gram_matrices, params=param_list_pre_revised, y=y, | |||
gmtime=gram_matrix_time) | |||
print( | |||
'3. Fitting and predicting using nested cross validation. This could really take a while...' | |||
) | |||
pool = Pool(n_jobs) | |||
trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type) | |||
train_pref = [] | |||
val_pref = [] | |||
test_pref = [] | |||
if NUM_TRIALS < 100: | |||
chunksize, extra = divmod(NUM_TRIALS, n_jobs * 4) | |||
if extra: | |||
chunksize += 1 | |||
else: | |||
print(Kmatrix) | |||
str_fw += np.array2string( | |||
Kmatrix, | |||
separator=',', | |||
threshold=np.inf, | |||
floatmode='unique') + '\n\n' | |||
plt.matshow(Kmatrix) | |||
plt.colorbar() | |||
fig_file_name = results_dir + '/GM[ds]' + ds_name | |||
if params_out != {}: | |||
fig_file_name += '[params]' + str(idx) | |||
plt.savefig(fig_file_name + '.eps', format='eps', dpi=300) | |||
plt.show() | |||
gram_matrices.append(Kmatrix) | |||
gram_matrix_time.append(current_run_time) | |||
param_list_pre_revised.append(params_out) | |||
if nb_g_ignore > 0: | |||
print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore) | |||
str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore | |||
print() | |||
print( | |||
'{} gram matrices are calculated, {} of which are ignored.'.format( | |||
len(param_list_precomputed), nb_gm_ignore)) | |||
str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore) | |||
str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n' | |||
str_fw += ''.join([ | |||
'{}: {}\n'.format(idx, params_out) | |||
for idx, params_out in enumerate(param_list_precomputed) | |||
]) | |||
print() | |||
if len(gram_matrices) == 0: | |||
print('all gram matrices are ignored, no results obtained.') | |||
str_fw += '\nall gram matrices are ignored, no results obtained.\n\n' | |||
else: | |||
chunksize = 100 | |||
for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout): | |||
train_pref.append(o1) | |||
val_pref.append(o2) | |||
test_pref.append(o3) | |||
pool.close() | |||
pool.join() | |||
# # ---- use pool.map to parallel. ---- | |||
# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS)) | |||
# train_pref = [item[0] for item in result_perf] | |||
# val_pref = [item[1] for item in result_perf] | |||
# test_pref = [item[2] for item in result_perf] | |||
# # ---- use joblib.Parallel to parallel and track progress. ---- | |||
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type) | |||
# result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS)) | |||
# train_pref = [item[0] for item in result_perf] | |||
# val_pref = [item[1] for item in result_perf] | |||
# test_pref = [item[2] for item in result_perf] | |||
# # ---- direct running, normally use a single CPU core. ---- | |||
# train_pref = [] | |||
# val_pref = [] | |||
# test_pref = [] | |||
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout): | |||
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i) | |||
# train_pref.append(o1) | |||
# val_pref.append(o2) | |||
# test_pref.append(o3) | |||
print() | |||
print('4. Getting final performance...') | |||
str_fw += '\nIII. Performance.\n\n' | |||
# averages and confidences of performances on outer trials for each combination of parameters | |||
average_train_scores = np.mean(train_pref, axis=0) | |||
average_val_scores = np.mean(val_pref, axis=0) | |||
average_perf_scores = np.mean(test_pref, axis=0) | |||
# sample std is used here | |||
std_train_scores = np.std(train_pref, axis=0, ddof=1) | |||
std_val_scores = np.std(val_pref, axis=0, ddof=1) | |||
std_perf_scores = np.std(test_pref, axis=0, ddof=1) | |||
if model_type == 'regression': | |||
best_val_perf = np.amin(average_val_scores) | |||
else: | |||
best_val_perf = np.amax(average_val_scores) | |||
best_params_index = np.where(average_val_scores == best_val_perf) | |||
# find smallest val std with best val perf. | |||
best_val_stds = [ | |||
std_val_scores[value][best_params_index[1][idx]] | |||
for idx, value in enumerate(best_params_index[0]) | |||
] | |||
min_val_std = np.amin(best_val_stds) | |||
best_params_index = np.where(std_val_scores == min_val_std) | |||
best_params_out = [ | |||
param_list_pre_revised[i] for i in best_params_index[0] | |||
] | |||
best_params_in = [param_list[i] for i in best_params_index[1]] | |||
print('best_params_out: ', best_params_out) | |||
print('best_params_in: ', best_params_in) | |||
print() | |||
print('best_val_perf: ', best_val_perf) | |||
print('best_val_std: ', min_val_std) | |||
str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out | |||
str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in | |||
str_fw += 'best_val_perf: %s\n' % best_val_perf | |||
str_fw += 'best_val_std: %s\n' % min_val_std | |||
final_performance = [ | |||
average_perf_scores[value][best_params_index[1][idx]] | |||
for idx, value in enumerate(best_params_index[0]) | |||
] | |||
final_confidence = [ | |||
std_perf_scores[value][best_params_index[1][idx]] | |||
for idx, value in enumerate(best_params_index[0]) | |||
] | |||
print('final_performance: ', final_performance) | |||
print('final_confidence: ', final_confidence) | |||
str_fw += 'final_performance: %s\n' % final_performance | |||
str_fw += 'final_confidence: %s\n' % final_confidence | |||
train_performance = [ | |||
average_train_scores[value][best_params_index[1][idx]] | |||
for idx, value in enumerate(best_params_index[0]) | |||
] | |||
train_std = [ | |||
std_train_scores[value][best_params_index[1][idx]] | |||
for idx, value in enumerate(best_params_index[0]) | |||
] | |||
print('train_performance: %s' % train_performance) | |||
print('train_std: ', train_std) | |||
str_fw += 'train_performance: %s\n' % train_performance | |||
str_fw += 'train_std: %s\n\n' % train_std | |||
print() | |||
tt_total = time.time() - tts # training time for all hyper-parameters | |||
average_gram_matrix_time = np.mean(gram_matrix_time) | |||
std_gram_matrix_time = np.std(gram_matrix_time, ddof=1) | |||
best_gram_matrix_time = [ | |||
gram_matrix_time[i] for i in best_params_index[0] | |||
] | |||
ave_bgmt = np.mean(best_gram_matrix_time) | |||
std_bgmt = np.std(best_gram_matrix_time, ddof=1) | |||
print( | |||
'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s' | |||
.format(average_gram_matrix_time, std_gram_matrix_time)) | |||
print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format( | |||
ave_bgmt, std_bgmt)) | |||
print( | |||
'total training time with all hyper-param choices: {:.2f}s'.format( | |||
tt_total)) | |||
str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time) | |||
str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt) | |||
str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total) | |||
# # save results to file | |||
# np.savetxt(results_name_pre + 'average_train_scores.dt', | |||
# average_train_scores) | |||
# np.savetxt(results_name_pre + 'average_val_scores', average_val_scores) | |||
# np.savetxt(results_name_pre + 'average_perf_scores.dt', | |||
# average_perf_scores) | |||
# np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores) | |||
# np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores) | |||
# np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores) | |||
# np.save(results_name_pre + 'best_params_index', best_params_index) | |||
# np.save(results_name_pre + 'best_params_pre.dt', best_params_out) | |||
# np.save(results_name_pre + 'best_params_in.dt', best_params_in) | |||
# np.save(results_name_pre + 'best_val_perf.dt', best_val_perf) | |||
# np.save(results_name_pre + 'best_val_std.dt', best_val_std) | |||
# np.save(results_name_pre + 'final_performance.dt', final_performance) | |||
# np.save(results_name_pre + 'final_confidence.dt', final_confidence) | |||
# np.save(results_name_pre + 'train_performance.dt', train_performance) | |||
# np.save(results_name_pre + 'train_std.dt', train_std) | |||
# np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time) | |||
# np.save(results_name_pre + 'average_gram_matrix_time.dt', | |||
# average_gram_matrix_time) | |||
# np.save(results_name_pre + 'std_gram_matrix_time.dt', | |||
# std_gram_matrix_time) | |||
# np.save(results_name_pre + 'best_gram_matrix_time.dt', | |||
# best_gram_matrix_time) | |||
# print out as table. | |||
from collections import OrderedDict | |||
from tabulate import tabulate | |||
table_dict = {} | |||
if model_type == 'regression': | |||
for param_in in param_list: | |||
param_in['alpha'] = '{:.2e}'.format(param_in['alpha']) | |||
else: | |||
for param_in in param_list: | |||
param_in['C'] = '{:.2e}'.format(param_in['C']) | |||
table_dict['params'] = [{**param_out, **param_in} | |||
for param_in in param_list for param_out in param_list_pre_revised] | |||
table_dict['gram_matrix_time'] = [ | |||
'{:.2f}'.format(gram_matrix_time[index_out]) | |||
for param_in in param_list | |||
for index_out, _ in enumerate(param_list_pre_revised) | |||
] | |||
table_dict['valid_perf'] = [ | |||
'{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], | |||
std_val_scores[index_out][index_in]) | |||
for index_in, _ in enumerate(param_list) | |||
for index_out, _ in enumerate(param_list_pre_revised) | |||
] | |||
table_dict['test_perf'] = [ | |||
'{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in], | |||
std_perf_scores[index_out][index_in]) | |||
for index_in, _ in enumerate(param_list) | |||
for index_out, _ in enumerate(param_list_pre_revised) | |||
] | |||
table_dict['train_perf'] = [ | |||
'{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in], | |||
std_train_scores[index_out][index_in]) | |||
for index_in, _ in enumerate(param_list) | |||
for index_out, _ in enumerate(param_list_pre_revised) | |||
] | |||
keyorder = [ | |||
'params', 'train_perf', 'valid_perf', 'test_perf', | |||
'gram_matrix_time' | |||
] | |||
print() | |||
tb_print = tabulate( | |||
OrderedDict( | |||
sorted(table_dict.items(), | |||
key=lambda i: keyorder.index(i[0]))), | |||
headers='keys') | |||
print(tb_print) | |||
str_fw += 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print | |||
# read gram matrices from file. | |||
else: | |||
# Grid of parameters with a discrete number of values for each. | |||
# param_list_precomputed = list(ParameterGrid(param_grid_precomputed)) | |||
param_list = list(ParameterGrid(param_grid)) | |||
# read gram matrices from file. | |||
print() | |||
print('2. Reading gram matrices from file...') | |||
str_fw += '\nII. Gram matrices.\n\nGram matrices are read from file, see last log for detail.\n' | |||
gmfile = np.load(results_dir + '/' + ds_name + '.gm.npz') | |||
gram_matrices = gmfile['gms'] # a list to store gram matrices for all param_grid_precomputed | |||
param_list_pre_revised = gmfile['params'] # list to store param grids precomputed ignoring the useless ones | |||
y = gmfile['y'].tolist() | |||
tts = time.time() # start training time | |||
# nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN) | |||
print( | |||
'3. Fitting and predicting using nested cross validation. This could really take a while...' | |||
) | |||
pool = Pool(n_jobs) | |||
trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type) | |||
train_pref = [] | |||
@@ -205,7 +440,7 @@ def model_selection_for_precomputed_kernel(datafile, | |||
test_pref.append(o3) | |||
pool.close() | |||
pool.join() | |||
# # ---- use pool.map to parallel. ---- | |||
# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS)) | |||
# train_pref = [item[0] for item in result_perf] | |||
@@ -219,19 +454,19 @@ def model_selection_for_precomputed_kernel(datafile, | |||
# val_pref = [item[1] for item in result_perf] | |||
# test_pref = [item[2] for item in result_perf] | |||
# # ---- direct running, normally use single CPU core. ---- | |||
# train_pref = [] | |||
# val_pref = [] | |||
# test_pref = [] | |||
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout): | |||
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i) | |||
# train_pref.append(o1) | |||
# val_pref.append(o2) | |||
# test_pref.append(o3) | |||
# # ---- direct running, normally use a single CPU core. ---- | |||
# train_pref = [] | |||
# val_pref = [] | |||
# test_pref = [] | |||
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout): | |||
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i) | |||
# train_pref.append(o1) | |||
# val_pref.append(o2) | |||
# test_pref.append(o3) | |||
print() | |||
print('4. Getting final performance...') | |||
str_fw += '\nII. Performance.\n\n' | |||
str_fw += '\nIII. Performance.\n\n' | |||
# averages and confidences of performances on outer trials for each combination of parameters | |||
average_train_scores = np.mean(train_pref, axis=0) | |||
average_val_scores = np.mean(val_pref, axis=0) | |||
@@ -293,53 +528,25 @@ def model_selection_for_precomputed_kernel(datafile, | |||
str_fw += 'train_std: %s\n\n' % train_std | |||
print() | |||
tt_total = time.time() - tts # training time for all hyper-parameters | |||
average_gram_matrix_time = np.mean(gram_matrix_time) | |||
std_gram_matrix_time = np.std(gram_matrix_time, ddof=1) | |||
best_gram_matrix_time = [ | |||
gram_matrix_time[i] for i in best_params_index[0] | |||
] | |||
ave_bgmt = np.mean(best_gram_matrix_time) | |||
std_bgmt = np.std(best_gram_matrix_time, ddof=1) | |||
print( | |||
'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s' | |||
.format(average_gram_matrix_time, std_gram_matrix_time)) | |||
print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format( | |||
ave_bgmt, std_bgmt)) | |||
tt_poster = time.time() - tts # training time with hyper-param choices who did not participate in calculation of gram matrices | |||
# average_gram_matrix_time = np.mean(gram_matrix_time) | |||
# std_gram_matrix_time = np.std(gram_matrix_time, ddof=1) | |||
# best_gram_matrix_time = [ | |||
# gram_matrix_time[i] for i in best_params_index[0] | |||
# ] | |||
# ave_bgmt = np.mean(best_gram_matrix_time) | |||
# std_bgmt = np.std(best_gram_matrix_time, ddof=1) | |||
# print( | |||
# 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s' | |||
# .format(average_gram_matrix_time, std_gram_matrix_time)) | |||
# print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format( | |||
# ave_bgmt, std_bgmt)) | |||
print( | |||
'total training time with all hyper-param choices: {:.2f}s'.format( | |||
tt_total)) | |||
str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time) | |||
str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt) | |||
str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total) | |||
# # save results to file | |||
# np.savetxt(results_name_pre + 'average_train_scores.dt', | |||
# average_train_scores) | |||
# np.savetxt(results_name_pre + 'average_val_scores', average_val_scores) | |||
# np.savetxt(results_name_pre + 'average_perf_scores.dt', | |||
# average_perf_scores) | |||
# np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores) | |||
# np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores) | |||
# np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores) | |||
# np.save(results_name_pre + 'best_params_index', best_params_index) | |||
# np.save(results_name_pre + 'best_params_pre.dt', best_params_out) | |||
# np.save(results_name_pre + 'best_params_in.dt', best_params_in) | |||
# np.save(results_name_pre + 'best_val_perf.dt', best_val_perf) | |||
# np.save(results_name_pre + 'best_val_std.dt', best_val_std) | |||
# np.save(results_name_pre + 'final_performance.dt', final_performance) | |||
# np.save(results_name_pre + 'final_confidence.dt', final_confidence) | |||
# np.save(results_name_pre + 'train_performance.dt', train_performance) | |||
# np.save(results_name_pre + 'train_std.dt', train_std) | |||
# np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time) | |||
# np.save(results_name_pre + 'average_gram_matrix_time.dt', | |||
# average_gram_matrix_time) | |||
# np.save(results_name_pre + 'std_gram_matrix_time.dt', | |||
# std_gram_matrix_time) | |||
# np.save(results_name_pre + 'best_gram_matrix_time.dt', | |||
# best_gram_matrix_time) | |||
'training time with hyper-param choices who did not participate in calculation of gram matrices: {:.2f}s'.format( | |||
tt_poster)) | |||
# str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time) | |||
# str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt) | |||
str_fw += 'training time with hyper-param choices who did not participate in calculation of gram matrices: {:.2f}s\n\n'.format(tt_poster) | |||
# print out as table. | |||
from collections import OrderedDict | |||
@@ -353,11 +560,11 @@ def model_selection_for_precomputed_kernel(datafile, | |||
param_in['C'] = '{:.2e}'.format(param_in['C']) | |||
table_dict['params'] = [{**param_out, **param_in} | |||
for param_in in param_list for param_out in param_list_pre_revised] | |||
table_dict['gram_matrix_time'] = [ | |||
'{:.2f}'.format(gram_matrix_time[index_out]) | |||
for param_in in param_list | |||
for index_out, _ in enumerate(param_list_pre_revised) | |||
] | |||
# table_dict['gram_matrix_time'] = [ | |||
# '{:.2f}'.format(gram_matrix_time[index_out]) | |||
# for param_in in param_list | |||
# for index_out, _ in enumerate(param_list_pre_revised) | |||
# ] | |||
table_dict['valid_perf'] = [ | |||
'{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], | |||
std_val_scores[index_out][index_in]) | |||
@@ -377,8 +584,7 @@ def model_selection_for_precomputed_kernel(datafile, | |||
for index_out, _ in enumerate(param_list_pre_revised) | |||
] | |||
keyorder = [ | |||
'params', 'train_perf', 'valid_perf', 'test_perf', | |||
'gram_matrix_time' | |||
'params', 'train_perf', 'valid_perf', 'test_perf' | |||
] | |||
print() | |||
tb_print = tabulate( | |||
@@ -392,59 +598,62 @@ def model_selection_for_precomputed_kernel(datafile, | |||
# open file to save all results for this dataset. | |||
if not os.path.exists(results_dir): | |||
os.makedirs(results_dir) | |||
with open(results_dir + '/' + ds_name + '.txt', 'w') as fresults: | |||
fresults.write(str_fw) | |||
fresults.close() | |||
# open file to save all results for this dataset. | |||
if not os.path.exists(results_dir): | |||
os.makedirs(results_dir) | |||
if not os.path.exists(results_dir + '/' + ds_name + '.output.txt'): | |||
with open(results_dir + '/' + ds_name + '.output.txt', 'w') as f: | |||
f.write(str_fw) | |||
else: | |||
with open(results_dir + '/' + ds_name + '.output.txt', 'r+') as f: | |||
content = f.read() | |||
f.seek(0, 0) | |||
f.write(str_fw + '\n\n\n' + content) | |||
def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, trial): # Test set level | |||
# Arrays to store scores | |||
train_pref = np.zeros((len(param_list_pre_revised), | |||
len(param_list))) | |||
val_pref = np.zeros((len(param_list_pre_revised), | |||
len(param_list))) | |||
test_pref = np.zeros((len(param_list_pre_revised), | |||
len(param_list))) | |||
train_pref = np.zeros((len(param_list_pre_revised), len(param_list))) | |||
val_pref = np.zeros((len(param_list_pre_revised), len(param_list))) | |||
test_pref = np.zeros((len(param_list_pre_revised), len(param_list))) | |||
# loop for each outer param tuple | |||
for index_out, params_out in enumerate(param_list_pre_revised): | |||
# split gram matrix and y to app and test sets. | |||
X_app, X_test, y_app, y_test = train_test_split( | |||
gram_matrices[index_out], y, test_size=0.1) | |||
split_index_app = [y.index(y_i) for y_i in y_app if y_i in y] | |||
# split_index_test = [y.index(y_i) for y_i in y_test if y_i in y] | |||
X_app = X_app[:, split_index_app] | |||
X_test = X_test[:, split_index_app] | |||
indices = range(len(y)) | |||
X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split( | |||
gram_matrices[index_out], y, indices, test_size=0.1, | |||
random_state=None, shuffle=True) | |||
X_app = X_app[:, idx_app] | |||
X_test = X_test[:, idx_app] | |||
y_app = np.array(y_app) | |||
y_test = np.array(y_test) | |||
# loop for each inner param tuple | |||
for index_in, params_in in enumerate(param_list): | |||
inner_cv = KFold( | |||
n_splits=10, shuffle=True, random_state=trial) | |||
inner_cv = KFold(n_splits=10, shuffle=True, random_state=trial) | |||
current_train_perf = [] | |||
current_valid_perf = [] | |||
current_test_perf = [] | |||
current_test_perf = [] | |||
# For regression use the Kernel Ridge method | |||
try: | |||
if model_type == 'regression': | |||
KR = KernelRidge(kernel='precomputed', **params_in) | |||
kr = KernelRidge(kernel='precomputed', **params_in) | |||
# loop for each split on validation set level | |||
# validation set level | |||
for train_index, valid_index in inner_cv.split( | |||
X_app): | |||
KR.fit(X_app[train_index, :][:, train_index], | |||
for train_index, valid_index in inner_cv.split(X_app): | |||
kr.fit(X_app[train_index, :][:, train_index], | |||
y_app[train_index]) | |||
# predict on the train, validation and test set | |||
y_pred_train = KR.predict( | |||
y_pred_train = kr.predict( | |||
X_app[train_index, :][:, train_index]) | |||
y_pred_valid = KR.predict( | |||
y_pred_valid = kr.predict( | |||
X_app[valid_index, :][:, train_index]) | |||
y_pred_test = KR.predict( | |||
y_pred_test = kr.predict( | |||
X_test[:, train_index]) | |||
# root mean squared errors | |||
@@ -460,22 +669,23 @@ def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, t | |||
np.sqrt( | |||
mean_squared_error( | |||
y_test, y_pred_test))) | |||
# For classification use SVM | |||
# For classification use SVM | |||
else: | |||
KR = SVC(kernel='precomputed', **params_in) | |||
svc = SVC(kernel='precomputed', cache_size=200, | |||
verbose=False, **params_in) | |||
# loop for each split on validation set level | |||
# validation set level | |||
for train_index, valid_index in inner_cv.split( | |||
X_app): | |||
KR.fit(X_app[train_index, :][:, train_index], | |||
for train_index, valid_index in inner_cv.split(X_app): | |||
# np.savez("bug.npy",X_app[train_index, :][:, train_index],y_app[train_index]) | |||
svc.fit(X_app[train_index, :][:, train_index], | |||
y_app[train_index]) | |||
# predict on the train, validation and test set | |||
y_pred_train = KR.predict( | |||
y_pred_train = svc.predict( | |||
X_app[train_index, :][:, train_index]) | |||
y_pred_valid = KR.predict( | |||
y_pred_valid = svc.predict( | |||
X_app[valid_index, :][:, train_index]) | |||
y_pred_test = KR.predict( | |||
y_pred_test = svc.predict( | |||
X_test[:, train_index]) | |||
# root mean squared errors | |||