2. Save gram matrices and related data to file before the cross-validation step when using the function cross_validation_pre_computed, in case something goes wrong during CV. The parameter read_gm_from_file can be used to choose whether to read gram matrices from file. 3. Add test code to check whether a gram matrix is symmetric and positive semi-definite.

v0.1
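For item 3, a minimal sketch of such a check (an illustration using only numpy, not the package's actual test code; the tolerance value is an assumption):

import numpy as np

def check_gram_matrix(gm, tol=1e-8):
    # A valid gram matrix must be symmetric...
    if not np.allclose(gm, gm.T, atol=tol):
        return False
    # ...and positive semi-definite, i.e. all eigenvalues non-negative up
    # to numerical tolerance; eigvalsh is the routine for symmetric matrices.
    return bool(np.all(np.linalg.eigvalsh(gm) >= -tol))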
@@ -3,7 +3,7 @@ A python package for graph kernels.
## Requirements
-numpy==1.14.5
+numpy==1.15.1
scipy==1.1.0
matplotlib==2.2.2
networkx==2.1
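Only the numpy pin changes here (1.14.5 to 1.15.1). Assuming the other pins stay as listed, the requirements can be installed directly, for example:

pip install numpy==1.15.1 scipy==1.1.0 matplotlib==2.2.2 networkx==2.1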
@@ -12,30 +12,52 @@
"output_type": "stream",
"text": [
"\n",
+"Letter-med:\n",
+"substructures : {'linear', 'non linear'}\n",
+"node_labeled : False\n",
+"edge_labeled : False\n",
+"is_directed : False\n",
+"dataset_size : 2250\n",
+"ave_graph_size : 4.674666666666667\n",
+"min_graph_size : 1\n",
+"max_graph_size : 9\n",
+"ave_graph_edge_num : 3.2057777777777776\n",
+"min_graph_edge_num : 0\n",
+"max_graph_edge_num : 7\n",
+"ave_graph_degree : 2.012888888888889\n",
+"min_graph_degree : 0\n",
+"max_graph_degree : 4\n",
+"node_label_num : 0\n",
+"edge_label_num : 0\n",
+"node_attr_dim : 2\n",
+"edge_attr_dim : 0\n",
+"class_number : 15\n",
+"\n",
+"\n",
"Mutagenicity:\n",
-"substructures : {'non linear', 'linear'}\n",
+"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : True\n",
"is_directed : False\n",
"dataset_size : 4337\n",
-"ave_graph_size : 30.3177311506\n",
+"ave_graph_size : 30.317731150564907\n",
"min_graph_size : 4\n",
"max_graph_size : 417\n",
-"ave_graph_edge_num : 30.7694258704\n",
+"ave_graph_edge_num : 30.76942587041734\n",
"min_graph_edge_num : 3\n",
"max_graph_edge_num : 112\n",
-"ave_graph_degree : 3.75651371916\n",
+"ave_graph_degree : 3.75651371916071\n",
"min_graph_degree : 3\n",
"max_graph_degree : 4\n",
"node_label_num : 14\n",
"edge_label_num : 3\n",
-"node_attr_dim : False\n",
-"edge_attr_dim : False\n",
+"node_attr_dim : 0\n",
+"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"AIDS:\n",
-"substructures : {'non linear', 'linear'}\n",
+"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : True\n",
"is_directed : False\n",
@@ -52,7 +74,7 @@
"node_label_num : 38\n",
"edge_label_num : 3\n",
"node_attr_dim : 4\n",
-"edge_attr_dim : False\n",
+"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
@@ -62,13 +84,13 @@
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 41\n",
-"ave_graph_size : 1377.26829268\n",
+"ave_graph_size : 1377.2682926829268\n",
"min_graph_size : 134\n",
"max_graph_size : 5037\n",
-"ave_graph_edge_num : 3074.09756098\n",
+"ave_graph_edge_num : 3074.0975609756097\n",
"min_graph_edge_num : 320\n",
"max_graph_edge_num : 10888\n",
-"ave_graph_degree : 7.85365853659\n",
+"ave_graph_degree : 7.853658536585366\n",
"min_graph_degree : 6\n",
"max_graph_degree : 10\n",
"node_label_num : 5\n",
@@ -79,51 +101,51 @@
"\n",
"\n",
"MSRC9:\n",
-"substructures : {'non linear', 'linear'}\n",
+"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 221\n",
-"ave_graph_size : 40.5791855204\n",
+"ave_graph_size : 40.57918552036199\n",
"min_graph_size : 25\n",
"max_graph_size : 55\n",
-"ave_graph_edge_num : 97.9366515837\n",
+"ave_graph_edge_num : 97.9366515837104\n",
"min_graph_edge_num : 53\n",
"max_graph_edge_num : 145\n",
-"ave_graph_degree : 10.1583710407\n",
+"ave_graph_degree : 10.158371040723981\n",
"min_graph_degree : 8\n",
"max_graph_degree : 16\n",
"node_label_num : 10\n",
"edge_label_num : 0\n",
-"node_attr_dim : False\n",
-"edge_attr_dim : False\n",
+"node_attr_dim : 0\n",
+"edge_attr_dim : 0\n",
"class_number : 8\n",
"\n",
"\n",
"MSRC21:\n",
-"substructures : {'non linear', 'linear'}\n",
+"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 563\n",
-"ave_graph_size : 77.5204262877\n",
+"ave_graph_size : 77.52042628774423\n",
"min_graph_size : 51\n",
"max_graph_size : 141\n",
-"ave_graph_edge_num : 198.323268206\n",
+"ave_graph_edge_num : 198.32326820603907\n",
"min_graph_edge_num : 121\n",
"max_graph_edge_num : 405\n",
-"ave_graph_degree : 11.4156305506\n",
+"ave_graph_degree : 11.41563055062167\n",
"min_graph_degree : 8\n",
"max_graph_degree : 23\n",
"node_label_num : 22\n",
"edge_label_num : 0\n",
-"node_attr_dim : False\n",
-"edge_attr_dim : False\n",
+"node_attr_dim : 0\n",
+"edge_attr_dim : 0\n",
"class_number : 20\n",
"\n",
"\n",
"SYNTHETIC:\n",
-"substructures : {'non linear', 'linear'}\n",
+"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
@@ -140,42 +162,42 @@
"node_label_num : 8\n",
"edge_label_num : 0\n",
"node_attr_dim : 1\n",
-"edge_attr_dim : False\n",
+"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"BZR:\n",
-"substructures : {'non linear', 'linear'}\n",
+"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 405\n",
-"ave_graph_size : 35.750617284\n",
+"ave_graph_size : 35.75061728395062\n",
"min_graph_size : 13\n",
"max_graph_size : 57\n",
-"ave_graph_edge_num : 38.3580246914\n",
+"ave_graph_edge_num : 38.358024691358025\n",
"min_graph_edge_num : 13\n",
"max_graph_edge_num : 60\n",
-"ave_graph_degree : 3.86419753086\n",
+"ave_graph_degree : 3.8641975308641974\n",
"min_graph_degree : 3\n",
"max_graph_degree : 4\n",
"node_label_num : 10\n",
"edge_label_num : 0\n",
"node_attr_dim : 3\n",
-"edge_attr_dim : False\n",
+"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"COX2:\n",
-"substructures : {'non linear', 'linear'}\n",
+"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 467\n",
-"ave_graph_size : 41.2248394004\n",
+"ave_graph_size : 41.224839400428266\n",
"min_graph_size : 32\n",
"max_graph_size : 56\n",
-"ave_graph_edge_num : 43.4453961456\n",
+"ave_graph_edge_num : 43.44539614561028\n",
"min_graph_edge_num : 34\n",
"max_graph_edge_num : 59\n",
"ave_graph_degree : 4.0\n",
@@ -184,152 +206,152 @@
"node_label_num : 8\n",
"edge_label_num : 0\n",
"node_attr_dim : 3\n",
-"edge_attr_dim : False\n",
+"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"DHFR:\n",
-"substructures : {'non linear', 'linear'}\n",
+"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 756\n",
-"ave_graph_size : 42.4272486772\n",
+"ave_graph_size : 42.42724867724868\n",
"min_graph_size : 20\n",
"max_graph_size : 71\n",
-"ave_graph_edge_num : 44.544973545\n",
+"ave_graph_edge_num : 44.544973544973544\n",
"min_graph_edge_num : 21\n",
"max_graph_edge_num : 73\n",
-"ave_graph_degree : 3.95502645503\n",
+"ave_graph_degree : 3.955026455026455\n",
"min_graph_degree : 3\n",
"max_graph_degree : 4\n",
"node_label_num : 9\n",
"edge_label_num : 0\n",
"node_attr_dim : 3\n",
-"edge_attr_dim : False\n",
+"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"ENZYMES:\n",
-"substructures : {'non linear', 'linear'}\n",
+"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 600\n",
-"ave_graph_size : 32.6333333333\n",
+"ave_graph_size : 32.63333333333333\n",
"min_graph_size : 2\n",
"max_graph_size : 126\n",
-"ave_graph_edge_num : 62.1366666667\n",
+"ave_graph_edge_num : 62.13666666666666\n",
"min_graph_edge_num : 1\n",
"max_graph_edge_num : 149\n",
-"ave_graph_degree : 6.08666666667\n",
+"ave_graph_degree : 6.086666666666667\n",
"min_graph_degree : 1\n",
"max_graph_degree : 9\n",
"node_label_num : 3\n",
"edge_label_num : 0\n",
"node_attr_dim : 18\n",
-"edge_attr_dim : False\n",
+"edge_attr_dim : 0\n",
"class_number : 6\n",
"\n",
"\n",
"PROTEINS:\n",
-"substructures : {'non linear', 'linear'}\n",
+"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 1113\n",
-"ave_graph_size : 39.0575022462\n",
+"ave_graph_size : 39.05750224618149\n",
"min_graph_size : 4\n",
"max_graph_size : 620\n",
-"ave_graph_edge_num : 72.8158131177\n",
+"ave_graph_edge_num : 72.8158131176999\n",
"min_graph_edge_num : 5\n",
"max_graph_edge_num : 1049\n",
-"ave_graph_degree : 5.79424977538\n",
+"ave_graph_degree : 5.794249775381851\n",
"min_graph_degree : 3\n",
"max_graph_degree : 25\n",
"node_label_num : 3\n",
"edge_label_num : 0\n",
"node_attr_dim : 1\n",
-"edge_attr_dim : False\n",
+"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"PROTEINS_full:\n",
-"substructures : {'non linear', 'linear'}\n",
+"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 1113\n",
-"ave_graph_size : 39.0575022462\n",
+"ave_graph_size : 39.05750224618149\n",
"min_graph_size : 4\n",
"max_graph_size : 620\n",
-"ave_graph_edge_num : 72.8158131177\n",
+"ave_graph_edge_num : 72.8158131176999\n",
"min_graph_edge_num : 5\n",
"max_graph_edge_num : 1049\n",
-"ave_graph_degree : 5.79424977538\n",
+"ave_graph_degree : 5.794249775381851\n",
"min_graph_degree : 3\n",
"max_graph_degree : 25\n",
"node_label_num : 3\n",
"edge_label_num : 0\n",
"node_attr_dim : 29\n",
-"edge_attr_dim : False\n",
+"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"D&D:\n",
-"substructures : {'non linear', 'linear'}\n",
+"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 1178\n",
-"ave_graph_size : 284.31663837\n",
+"ave_graph_size : 284.3166383701188\n",
"min_graph_size : 30\n",
"max_graph_size : 5748\n",
-"ave_graph_edge_num : 715.658743633\n",
+"ave_graph_edge_num : 715.6587436332767\n",
"min_graph_edge_num : 63\n",
"max_graph_edge_num : 14267\n",
-"ave_graph_degree : 9.50933786078\n",
+"ave_graph_degree : 9.509337860780985\n",
"min_graph_degree : 6\n",
"max_graph_degree : 19\n",
"node_label_num : 82\n",
"edge_label_num : 0\n",
-"node_attr_dim : False\n",
-"edge_attr_dim : False\n",
+"node_attr_dim : 0\n",
+"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"MUTAG:\n",
-"substructures : {'non linear', 'linear'}\n",
+"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : True\n",
"is_directed : False\n",
"dataset_size : 188\n",
-"ave_graph_size : 17.9308510638\n",
+"ave_graph_size : 17.930851063829788\n",
"min_graph_size : 10\n",
"max_graph_size : 28\n",
-"ave_graph_edge_num : 19.7925531915\n",
+"ave_graph_edge_num : 19.79255319148936\n",
"min_graph_edge_num : 10\n",
"max_graph_edge_num : 33\n",
-"ave_graph_degree : 3.00531914894\n",
+"ave_graph_degree : 3.00531914893617\n",
"min_graph_degree : 3\n",
"max_graph_degree : 4\n",
"node_label_num : 7\n",
"edge_label_num : 11\n",
-"node_attr_dim : False\n",
-"edge_attr_dim : False\n",
+"node_attr_dim : 0\n",
+"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"Alkane:\n",
-"substructures : {'non linear', 'linear'}\n",
+"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 150\n",
-"ave_graph_size : 8.87333333333\n",
+"ave_graph_size : 8.873333333333333\n",
"min_graph_size : 1\n",
"max_graph_size : 10\n",
-"ave_graph_edge_num : 7.87333333333\n",
+"ave_graph_edge_num : 7.873333333333333\n",
"min_graph_edge_num : 0\n",
"max_graph_edge_num : 9\n",
"ave_graph_degree : 3.36\n",
@@ -337,43 +359,43 @@
"max_graph_degree : 4\n",
"node_label_num : 2\n",
"edge_label_num : 1\n",
-"node_attr_dim : False\n",
-"edge_attr_dim : False\n",
+"node_attr_dim : 0\n",
+"edge_attr_dim : 0\n",
"class_number : 123\n",
"\n",
"\n",
"Acyclic:\n",
-"substructures : {'non linear', 'linear'}\n",
+"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 183\n",
-"ave_graph_size : 8.15300546448\n",
+"ave_graph_size : 8.153005464480874\n",
"min_graph_size : 3\n",
"max_graph_size : 11\n",
-"ave_graph_edge_num : 7.15300546448\n",
+"ave_graph_edge_num : 7.1530054644808745\n",
"min_graph_edge_num : 2\n",
"max_graph_edge_num : 10\n",
-"ave_graph_degree : 2.80327868852\n",
+"ave_graph_degree : 2.80327868852459\n",
"min_graph_degree : 2\n",
"max_graph_degree : 4\n",
"node_label_num : 3\n",
"edge_label_num : 1\n",
-"node_attr_dim : False\n",
-"edge_attr_dim : False\n",
+"node_attr_dim : 0\n",
+"edge_attr_dim : 0\n",
"class_number : 148\n",
"\n",
"\n",
"MAO:\n",
-"substructures : {'non linear', 'linear'}\n",
+"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : True\n",
"is_directed : False\n",
"dataset_size : 68\n",
-"ave_graph_size : 18.3823529412\n",
+"ave_graph_size : 18.38235294117647\n",
"min_graph_size : 11\n",
"max_graph_size : 27\n",
-"ave_graph_edge_num : 19.6323529412\n",
+"ave_graph_edge_num : 19.63235294117647\n",
"min_graph_edge_num : 12\n",
"max_graph_edge_num : 29\n",
"ave_graph_degree : 3.0\n",
@@ -381,107 +403,95 @@
"max_graph_degree : 3\n",
"node_label_num : 3\n",
"edge_label_num : 4\n",
-"node_attr_dim : False\n",
-"edge_attr_dim : False\n",
+"node_attr_dim : 0\n",
+"edge_attr_dim : 0\n",
"class_number : 2\n",
-"\n",
-"\n"
-]
-},
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
"\n",
"PAH:\n",
-"substructures : {'non linear', 'linear'}\n",
+"substructures : {'linear', 'non linear'}\n",
"node_labeled : False\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 94\n",
-"ave_graph_size : 20.7021276596\n",
+"ave_graph_size : 20.70212765957447\n",
"min_graph_size : 10\n",
"max_graph_size : 28\n",
-"ave_graph_edge_num : 24.4255319149\n",
+"ave_graph_edge_num : 24.425531914893618\n",
"min_graph_edge_num : 11\n",
"max_graph_edge_num : 34\n",
-"ave_graph_degree : 3.01063829787\n",
+"ave_graph_degree : 3.0106382978723403\n",
"min_graph_degree : 3\n",
"max_graph_degree : 4\n",
"node_label_num : 1\n",
"edge_label_num : 1\n",
-"node_attr_dim : False\n",
-"edge_attr_dim : False\n",
+"node_attr_dim : 0\n",
+"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"NCI1:\n",
-"substructures : {'non linear', 'linear'}\n",
+"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 4110\n",
-"ave_graph_size : 29.8654501217\n",
+"ave_graph_size : 29.8654501216545\n",
"min_graph_size : 3\n",
"max_graph_size : 111\n",
"ave_graph_edge_num : 32.3\n",
"min_graph_edge_num : 2\n",
"max_graph_edge_num : 119\n",
-"ave_graph_degree : 3.33600973236\n",
+"ave_graph_degree : 3.3360097323600972\n",
"min_graph_degree : 2\n",
"max_graph_degree : 4\n",
"node_label_num : 37\n",
"edge_label_num : 0\n",
-"node_attr_dim : False\n",
-"edge_attr_dim : False\n",
+"node_attr_dim : 0\n",
+"edge_attr_dim : 0\n",
"class_number : 2\n",
+"\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"\n",
"\n",
"NCI109:\n",
-"substructures : {'non linear', 'linear'}\n",
+"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 4127\n",
-"ave_graph_size : 29.6811243034\n",
+"ave_graph_size : 29.681124303368065\n",
"min_graph_size : 4\n",
"max_graph_size : 111\n",
-"ave_graph_edge_num : 32.1308456506\n",
+"ave_graph_edge_num : 32.13084565059365\n",
"min_graph_edge_num : 3\n",
"max_graph_edge_num : 119\n",
-"ave_graph_degree : 3.34383329295\n",
+"ave_graph_degree : 3.343833292948873\n",
"min_graph_degree : 2\n",
"max_graph_degree : 5\n",
"node_label_num : 38\n",
"edge_label_num : 0\n",
-"node_attr_dim : False\n",
-"edge_attr_dim : False\n",
+"node_attr_dim : 0\n",
+"edge_attr_dim : 0\n",
"class_number : 2\n",
-"\n",
-"load SDF: 100%|██████████| 4457424/4457424 [00:11<00:00, 397248.47it/s]\n",
-"ajust data: 100%|██████████| 42687/42687 [00:10<00:00, 3939.72it/s] \n",
-"\n",
-"NCI-HIV:\n",
-"substructures : {'non linear', 'linear'}\n",
-"node_labeled : True\n",
-"edge_labeled : True\n",
-"is_directed : False\n",
-"dataset_size : 42682\n",
-"ave_graph_size : 45.7094559768\n",
-"min_graph_size : 2\n",
-"max_graph_size : 438\n",
-"ave_graph_edge_num : 47.7137903566\n",
-"min_graph_edge_num : 1\n",
-"max_graph_edge_num : 441\n",
-"ave_graph_degree : 3.97605548006\n",
-"min_graph_degree : 1\n",
-"max_graph_degree : 12\n",
-"node_label_num : 63\n",
-"edge_label_num : 3\n",
-"node_attr_dim : False\n",
-"edge_attr_dim : False\n",
-"class_number : 3\n",
"\n"
]
+},
+{
+"ename": "ModuleNotFoundError",
+"evalue": "No module named 'tqdm'",
+"output_type": "error",
+"traceback": [
+"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
+"\u001b[0;32m<ipython-input-1-1e4da065c026>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0mds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'dataset'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0mfilename_y\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'dataset_y'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m'dataset_y'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mds\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 47\u001b[0;31m extra_params=(ds['extra_params'] if 'extra_params' in ds else None))\n\u001b[0m\u001b[1;32m 48\u001b[0m attrs = get_dataset_attributes(\n\u001b[1;32m 49\u001b[0m dataset, target=y, node_label='atom', edge_label='bond_type')\n",
+"\u001b[0;32m/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/utils/graphfiles.py\u001b[0m in \u001b[0;36mloadDataset\u001b[0;34m(filename, filename_y, extra_params)\u001b[0m\n\u001b[1;32m 377\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mextension\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"sdf\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 378\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 379\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mtqdm\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 380\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 381\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'tqdm'"
+]
}
],
"source": [
@@ -491,6 +501,7 @@
"from pygraph.utils.graphdataset import get_dataset_attributes\n",
"\n",
"dslist = [\n",
+" {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n",
" {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n",
" {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'},\n",
" {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'},\n",
@@ -557,7 +568,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
-"version": "3.5.2"
+"version": "3.6.5"
}
},
"nbformat": 4,
@@ -1,89 +1,95 @@
+# %load_ext line_profiler
+# %matplotlib inline
import functools
from libs import *
-from pygraph.kernels.spKernel import spkernel
-from pygraph.utils.kernels import deltakernel, kernelsum
+import multiprocessing
from sklearn.metrics.pairwise import rbf_kernel
-# dslist = [
-# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node symb
-# # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
-# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',}, # unlabeled
-# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',}, # node/edge symb
-# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
-# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
-# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
-# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',}, # contains single node graph, node symb
-# # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
-# # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
-# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # node/edge symb
-# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, # node symb/nsymb
-# # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
-# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
-# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
-# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
-# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
-# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
-# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb
+from pygraph.kernels.spKernel import spkernel, spkernel_do
+from pygraph.utils.kernels import deltakernel, kernelproduct
+from pygraph.utils.model_selection_precomputed import trial_do
-# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
-# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
-# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
-# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb
-# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
-# # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
-# # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
-# # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
-# # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
-# # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
-# # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb
+dslist = [
+# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
+# 'task': 'regression'}, # node symb
+# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
+# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, # contains single node graph, node symb
+# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb
+# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled
+# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
+# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
+{'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
+# # node symb/nsymb
+# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
+# # node/edge symb
+# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
+# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
+# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb
-# # # not working below
-# # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
-# # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
-# # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
-# # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
-# ]
+# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
+# # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
+# # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
+# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
+#
+# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
+# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
+# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
+# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
+# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb
-import ast
-ds = ast.literal_eval(sys.argv[1])
+# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
+# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
+# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
+# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
+# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
+# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
+# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
+# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
+# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb
+# # not working below
+# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
+# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
+# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
+# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
+]
estimator = spkernel
-mixkernel = functools.partial(kernelsum, deltakernel, rbf_kernel)
-param_grid_precomputed = {
-'node_kernels': [{
-'symb': deltakernel,
-'nsymb': rbf_kernel,
-'mix': mixkernel
-}]
-}
-param_grid = [{
-'C': np.logspace(-10, 10, num=41, base=10)
-}, {
-'alpha': np.logspace(-10, 10, num=41, base=10)
-}]
+mixkernel = functools.partial(kernelproduct, deltakernel, rbf_kernel)
+param_grid_precomputed = {'node_kernels': [
+{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]}
+param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
+{'alpha': np.logspace(-10, 10, num=41, base=10)}]
-print()
-print(ds['name'])
-model_selection_for_precomputed_kernel(
-ds['dataset'],
-estimator,
-param_grid_precomputed,
-(param_grid[1]
-if ('task' in ds and ds['task'] == 'regression') else param_grid[0]),
-(ds['task'] if 'task' in ds else 'classification'),
-NUM_TRIALS=30,
-datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
-extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
-ds_name=ds['name'])
+for ds in dslist:
+print()
+print(ds['name'])
+model_selection_for_precomputed_kernel(
+ds['dataset'],
+estimator,
+param_grid_precomputed,
+(param_grid[1] if ('task' in ds and ds['task']
+== 'regression') else param_grid[0]),
+(ds['task'] if 'task' in ds else 'classification'),
+NUM_TRIALS=30,
+datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
+extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
+ds_name=ds['name'],
+n_jobs=multiprocessing.cpu_count(),
+read_gm_from_file=False)
-# %lprun -f spkernel \
+# %lprun -f trial_do -f spkernel -f spkernel_do -f model_selection_for_precomputed_kernel \
# model_selection_for_precomputed_kernel( \
-# ds['dataset'], estimator, param_grid_precomputed, \
+# ds['dataset'], \
+# estimator, \
+# param_grid_precomputed, \
# (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \
-# (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30, \
+# (ds['task'] if 'task' in ds else 'classification'), \
+# NUM_TRIALS=30, \
# datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), \
-# extra_params=(ds['extra_params'] if 'extra_params' in ds else None))
-print()
+# extra_params=(ds['extra_params'] if 'extra_params' in ds else None), \
+# ds_name=ds['name'], \
+# n_jobs=multiprocessing.cpu_count())
+print()
# import functools
# from libs import *
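One substantive change in this script is combining the node kernels with kernelproduct instead of kernelsum. The implementations live in pygraph.utils.kernels and are not shown in this diff; below is a hypothetical sketch of what a product combiner of deltakernel and rbf_kernel could compute (both function signatures are assumptions):

import functools
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

def deltakernel(l1, l2):
    # Sketch: Kronecker delta on two symbolic node labels.
    return float(l1 == l2)

def kernelproduct(k1, k2, l1, l2, a1, a2):
    # Sketch: multiply the symbolic-label kernel with the
    # non-symbolic-attribute kernel (a sum combiner would add them instead).
    return k1(l1, l2) * k2(a1, a2)[0][0]

mix = functools.partial(kernelproduct, deltakernel, rbf_kernel)
print(mix('C', 'C', np.array([[0.1, 0.2]]), np.array([[0.1, 0.3]])))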
@@ -5,13 +5,12 @@
import sys
import pathlib
-sys.path.insert(0, "../")
-from tqdm import tqdm
import time
from itertools import combinations, combinations_with_replacement, product
from functools import partial
from joblib import Parallel, delayed
from multiprocessing import Pool
+from tqdm import tqdm
import networkx as nx
import numpy as np
@@ -19,6 +18,8 @@ import numpy as np
from pygraph.utils.utils import getSPGraph
from pygraph.utils.graphdataset import get_dataset_attributes
+sys.path.insert(0, "../")
def spkernel(*args,
node_label='atom',
@@ -48,13 +49,13 @@ def spkernel(*args,
Gn = args[0] if len(args) == 1 else [args[0], args[1]]
weight = None
-if edge_weight == None:
+if edge_weight is None:
print('\n None edge weight specified. Set all weight to 1.\n')
else:
try:
some_weight = list(
nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
-if isinstance(some_weight, float) or isinstance(some_weight, int):
+if isinstance(some_weight, (float, int)):
weight = edge_weight
else:
print(
@@ -241,7 +242,7 @@ def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij):
nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1],
e2[1])]
kn1 = nk11 * nk22
-Kmatrix += kn1 + kn2
+Kmatrix += kn1
else:
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
@@ -5,14 +5,15 @@ from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import KFold, train_test_split, ParameterGrid
-from joblib import Parallel, delayed
+#from joblib import Parallel, delayed
from multiprocessing import Pool
from functools import partial
import sys
sys.path.insert(0, "../")
import os
import time
-from os.path import basename, splitext
+import datetime
+#from os.path import basename, splitext
from pygraph.utils.graphfiles import loadDataset
from tqdm import tqdm
@@ -26,7 +27,8 @@ def model_selection_for_precomputed_kernel(datafile,
datafile_y=None,
extra_params=None,
ds_name='ds-unknown',
-n_jobs=1):
+n_jobs=1,
+read_gm_from_file=False):
"""Perform model selection, fitting and testing for precomputed kernels using nested cv. Print out neccessary data during the process then finally the results.
Parameters
@@ -45,6 +47,8 @@ def model_selection_for_precomputed_kernel(datafile,
Number of random trials of outer cv loop. The default is 30.
datafile_y : string
Path of file storing y data. This parameter is optional depending on the given dataset file.
+read_gm_from_file : boolean
+Whether gram matrices are loaded from file.
Examples
--------
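For reference, a sketch of a call that exercises the new read_gm_from_file flag, with the estimator, grids and dataset path taken from the experiment script earlier in this patch (a first run with the default False computes and saves the gram matrices; the sketch assumes a second run can then reuse them):

import functools
import multiprocessing
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel
from pygraph.kernels.spKernel import spkernel
from pygraph.utils.kernels import deltakernel, kernelproduct

mixkernel = functools.partial(kernelproduct, deltakernel, rbf_kernel)
param_grid_precomputed = {'node_kernels': [
    {'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]}

# In the experiment script, model_selection_for_precomputed_kernel is
# pulled in via 'from libs import *'.
model_selection_for_precomputed_kernel(
    '../datasets/Letter-med/Letter-med_A.txt',
    spkernel,
    param_grid_precomputed,
    {'C': np.logspace(-10, 10, num=41, base=10)},
    'classification',
    NUM_TRIALS=30,
    ds_name='Letter-med',
    n_jobs=multiprocessing.cpu_count(),
    read_gm_from_file=True)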
@@ -65,7 +69,8 @@
results_dir = '../notebooks/results/' + estimator.__name__
# a string to save all the results.
-str_fw = '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n'
+str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n'
+str_fw += '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n'
# setup the model type
model_type = model_type.lower()
@@ -75,119 +80,349 @@ def model_selection_for_precomputed_kernel(datafile, | |||||
) | ) | ||||
print() | print() | ||||
print('--- This is a %s problem ---' % model_type) | print('--- This is a %s problem ---' % model_type) | ||||
str_fw += 'This is a %s problem.\n\n' % model_type | |||||
# Load the dataset | |||||
print() | |||||
print('\nI. Loading dataset from file...') | |||||
dataset, y = loadDataset( | |||||
datafile, filename_y=datafile_y, extra_params=extra_params) | |||||
# import matplotlib.pyplot as plt | |||||
# import networkx as nx | |||||
# nx.draw_networkx(dataset[30]) | |||||
# plt.show() | |||||
# Grid of parameters with a discrete number of values for each. | |||||
param_list_precomputed = list(ParameterGrid(param_grid_precomputed)) | |||||
param_list = list(ParameterGrid(param_grid)) | |||||
# np.savetxt(results_name_pre + 'param_grid_precomputed.dt', | |||||
# [[key, value] for key, value in sorted(param_grid_precomputed)]) | |||||
# np.savetxt(results_name_pre + 'param_grid.dt', | |||||
# [[key, value] for key, value in sorted(param_grid)]) | |||||
gram_matrices = [ | |||||
] # a list to store gram matrices for all param_grid_precomputed | |||||
gram_matrix_time = [ | |||||
] # a list to store time to calculate gram matrices | |||||
param_list_pre_revised = [ | |||||
] # list to store param grids precomputed ignoring the useless ones | |||||
# calculate all gram matrices | |||||
print() | |||||
print('2. Calculating gram matrices. This could take a while...') | |||||
str_fw += '\nI. Gram matrices.\n\n' | |||||
tts = time.time() # start training time | |||||
nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN) | |||||
for idx, params_out in enumerate(param_list_precomputed): | |||||
params_out['n_jobs'] = n_jobs | |||||
rtn_data = estimator(dataset, **params_out) | |||||
Kmatrix = rtn_data[0] | |||||
current_run_time = rtn_data[1] | |||||
if len(rtn_data) == 3: | |||||
idx_trim = rtn_data[2] # the index of trimmed graph list | |||||
y = [y[idx] for idx in idx_trim] | |||||
Kmatrix_diag = Kmatrix.diagonal().copy() | |||||
# remove graphs whose kernels with themselves are zeros | |||||
nb_g_ignore = 0 | |||||
for idx, diag in enumerate(Kmatrix_diag): | |||||
if diag == 0: | |||||
Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=0) | |||||
Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=1) | |||||
nb_g_ignore += 1 | |||||
# normalization | |||||
for i in range(len(Kmatrix)): | |||||
for j in range(i, len(Kmatrix)): | |||||
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) | |||||
Kmatrix[j][i] = Kmatrix[i][j] | |||||
str_fw += 'This is a %s problem.\n' % model_type | |||||
# calculate gram matrices rather than read them from file. | |||||
if read_gm_from_file == False: | |||||
# Load the dataset | |||||
print() | print() | ||||
if params_out == {}: | |||||
print('the gram matrix is: ') | |||||
str_fw += 'the gram matrix is:\n\n' | |||||
else: | |||||
print('the gram matrix with parameters', params_out, 'is: ') | |||||
str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out | |||||
if len(Kmatrix) < 2: | |||||
nb_gm_ignore += 1 | |||||
print('ignored, as at most only one of all its diagonal value is non-zero.') | |||||
str_fw += 'ignored, as at most only one of all its diagonal value is non-zero.\n\n' | |||||
else: | |||||
if np.isnan(Kmatrix).any( | |||||
): # if the matrix contains elements that are not numbers | |||||
print('\n1. Loading dataset from file...') | |||||
dataset, y = loadDataset( | |||||
datafile, filename_y=datafile_y, extra_params=extra_params) | |||||
# import matplotlib.pyplot as plt | |||||
# import networkx as nx | |||||
# nx.draw_networkx(dataset[30]) | |||||
# plt.show() | |||||
# Grid of parameters with a discrete number of values for each. | |||||
param_list_precomputed = list(ParameterGrid(param_grid_precomputed)) | |||||
param_list = list(ParameterGrid(param_grid)) | |||||
gram_matrices = [ | |||||
] # a list to store gram matrices for all param_grid_precomputed | |||||
gram_matrix_time = [ | |||||
] # a list to store time to calculate gram matrices | |||||
param_list_pre_revised = [ | |||||
] # list to store param grids precomputed ignoring the useless ones | |||||
# calculate all gram matrices | |||||
print() | |||||
print('2. Calculating gram matrices. This could take a while...') | |||||
str_fw += '\nII. Gram matrices.\n\n' | |||||
tts = time.time() # start training time | |||||
nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN) | |||||
for idx, params_out in enumerate(param_list_precomputed): | |||||
params_out['n_jobs'] = n_jobs | |||||
rtn_data = estimator(dataset, **params_out) | |||||
Kmatrix = rtn_data[0] | |||||
current_run_time = rtn_data[1] | |||||
# for some kernels, some graphs in datasets may not meet the | |||||
# kernels' requirements for graph structure. These graphs are trimmed. | |||||
if len(rtn_data) == 3: | |||||
idx_trim = rtn_data[2] # the index of trimmed graph list | |||||
y = [y[idx] for idx in idx_trim] # trim y accordingly | |||||
Kmatrix_diag = Kmatrix.diagonal().copy() | |||||
# remove graphs whose kernels with themselves are zeros | |||||
nb_g_ignore = 0 | |||||
for idx, diag in enumerate(Kmatrix_diag): | |||||
if diag == 0: | |||||
Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=0) | |||||
Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=1) | |||||
nb_g_ignore += 1 | |||||
# normalization | |||||
for i in range(len(Kmatrix)): | |||||
for j in range(i, len(Kmatrix)): | |||||
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) | |||||
Kmatrix[j][i] = Kmatrix[i][j] | |||||
print() | |||||
if params_out == {}: | |||||
print('the gram matrix is: ') | |||||
str_fw += 'the gram matrix is:\n\n' | |||||
else: | |||||
print('the gram matrix with parameters', params_out, 'is: ') | |||||
str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out | |||||
if len(Kmatrix) < 2: | |||||
nb_gm_ignore += 1 | nb_gm_ignore += 1 | ||||
print('ignored, as it contains elements that are not numbers.') | |||||
str_fw += 'ignored, as it contains elements that are not numbers.\n\n' | |||||
print('ignored, as at most only one of all its diagonal value is non-zero.') | |||||
str_fw += 'ignored, as at most only one of all its diagonal value is non-zero.\n\n' | |||||
else: | |||||
if np.isnan(Kmatrix).any( | |||||
): # if the matrix contains elements that are not numbers | |||||
nb_gm_ignore += 1 | |||||
print('ignored, as it contains elements that are not numbers.') | |||||
str_fw += 'ignored, as it contains elements that are not numbers.\n\n' | |||||
else: | |||||
print(Kmatrix) | |||||
str_fw += np.array2string( | |||||
Kmatrix, | |||||
separator=',') + '\n\n' | |||||
# separator=',', | |||||
# threshold=np.inf, | |||||
# floatmode='unique') + '\n\n' | |||||
fig_file_name = results_dir + '/GM[ds]' + ds_name | |||||
if params_out != {}: | |||||
fig_file_name += '[params]' + str(idx) | |||||
plt.imshow(Kmatrix) | |||||
plt.colorbar() | |||||
plt.savefig(fig_file_name + '.eps', format='eps', dpi=300) | |||||
plt.show() | |||||
gram_matrices.append(Kmatrix) | |||||
gram_matrix_time.append(current_run_time) | |||||
param_list_pre_revised.append(params_out) | |||||
if nb_g_ignore > 0: | |||||
print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore) | |||||
str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore | |||||
print() | |||||
print( | |||||
'{} gram matrices are calculated, {} of which are ignored.'.format( | |||||
len(param_list_precomputed), nb_gm_ignore)) | |||||
str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore) | |||||
str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n' | |||||
str_fw += ''.join([ | |||||
'{}: {}\n'.format(idx, params_out) | |||||
for idx, params_out in enumerate(param_list_precomputed) | |||||
]) | |||||
print() | |||||
if len(gram_matrices) == 0: | |||||
print('all gram matrices are ignored, no results obtained.') | |||||
str_fw += '\nall gram matrices are ignored, no results obtained.\n\n' | |||||
else: | |||||
# save gram matrices to file. | |||||
np.savez(results_dir + '/' + ds_name + '.gm', | |||||
gms=gram_matrices, params=param_list_pre_revised, y=y, | |||||
gmtime=gram_matrix_time) | |||||
print( | |||||
'3. Fitting and predicting using nested cross validation. This could really take a while...' | |||||
) | |||||
pool = Pool(n_jobs) | |||||
trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type) | |||||
train_pref = [] | |||||
val_pref = [] | |||||
test_pref = [] | |||||
if NUM_TRIALS < 100: | |||||
chunksize, extra = divmod(NUM_TRIALS, n_jobs * 4) | |||||
if extra: | |||||
chunksize += 1 | |||||
else:
chunksize = 100
for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout):
train_pref.append(o1)
val_pref.append(o2)
test_pref.append(o3)
pool.close()
pool.join()
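# Note on chunksize: it sets how many trials each worker pulls from the
# task queue at once; divmod(NUM_TRIALS, n_jobs * 4) targets roughly four
# chunks per worker, the same heuristic multiprocessing's map() uses by
# default, while larger trial counts fall back to a flat chunk of 100.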
# # ---- use pool.map to run in parallel. ----
# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
# train_pref = [item[0] for item in result_perf]
# val_pref = [item[1] for item in result_perf]
# test_pref = [item[2] for item in result_perf]
# # ---- use joblib.Parallel to run in parallel and track progress. ----
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
# result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS))
# train_pref = [item[0] for item in result_perf]
# val_pref = [item[1] for item in result_perf]
# test_pref = [item[2] for item in result_perf]
# # ---- direct running, normally using a single CPU core. ----
# train_pref = []
# val_pref = []
# test_pref = []
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)
print()
print('4. Getting final performance...')
str_fw += '\nIII. Performance.\n\n'
# averages and confidences of performances on outer trials for each combination of parameters
average_train_scores = np.mean(train_pref, axis=0)
average_val_scores = np.mean(val_pref, axis=0)
average_perf_scores = np.mean(test_pref, axis=0)
# sample std is used here
std_train_scores = np.std(train_pref, axis=0, ddof=1)
std_val_scores = np.std(val_pref, axis=0, ddof=1)
std_perf_scores = np.std(test_pref, axis=0, ddof=1)
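# ddof=1 gives the sample standard deviation (dividing by N - 1 rather than
# N), the usual choice when the trials are treated as a sample of runs.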
if model_type == 'regression':
best_val_perf = np.amin(average_val_scores)
else:
best_val_perf = np.amax(average_val_scores)
best_params_index = np.where(average_val_scores == best_val_perf)
# find smallest val std with best val perf.
best_val_stds = [
std_val_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
min_val_std = np.amin(best_val_stds)
best_params_index = np.where(std_val_scores == min_val_std)
best_params_out = [
param_list_pre_revised[i] for i in best_params_index[0]
]
best_params_in = [param_list[i] for i in best_params_index[1]]
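# best_params_index is the (row-indices, column-indices) pair returned by
# np.where on the 2D score array: rows index the pre-computed (gram matrix)
# params, columns the other hyper-params. Selection is two-stage: best mean
# validation score first, then smallest validation std among the ties. Note
# the second np.where scans the whole std matrix, which assumes that this
# minimum std is not also attained at a cell without the best mean score.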
print('best_params_out: ', best_params_out)
print('best_params_in: ', best_params_in)
print()
print('best_val_perf: ', best_val_perf)
print('best_val_std: ', min_val_std)
str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out
str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in
str_fw += 'best_val_perf: %s\n' % best_val_perf
str_fw += 'best_val_std: %s\n' % min_val_std
final_performance = [
average_perf_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
final_confidence = [
std_perf_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
print('final_performance: ', final_performance)
print('final_confidence: ', final_confidence)
str_fw += 'final_performance: %s\n' % final_performance
str_fw += 'final_confidence: %s\n' % final_confidence
train_performance = [
average_train_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
train_std = [
std_train_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
print('train_performance: %s' % train_performance)
print('train_std: ', train_std)
str_fw += 'train_performance: %s\n' % train_performance
str_fw += 'train_std: %s\n\n' % train_std
print()
tt_total = time.time() - tts # training time for all hyper-parameters
average_gram_matrix_time = np.mean(gram_matrix_time)
std_gram_matrix_time = np.std(gram_matrix_time, ddof=1)
best_gram_matrix_time = [
gram_matrix_time[i] for i in best_params_index[0]
]
ave_bgmt = np.mean(best_gram_matrix_time)
std_bgmt = np.std(best_gram_matrix_time, ddof=1)
print(
'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
.format(average_gram_matrix_time, std_gram_matrix_time))
print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(
ave_bgmt, std_bgmt))
print(
'total training time with all hyper-param choices: {:.2f}s'.format(
tt_total))
str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time)
str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total)
# # save results to file
# np.savetxt(results_name_pre + 'average_train_scores.dt',
# average_train_scores)
# np.savetxt(results_name_pre + 'average_val_scores', average_val_scores)
# np.savetxt(results_name_pre + 'average_perf_scores.dt',
# average_perf_scores)
# np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores)
# np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores)
# np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores)
# np.save(results_name_pre + 'best_params_index', best_params_index)
# np.save(results_name_pre + 'best_params_pre.dt', best_params_out)
# np.save(results_name_pre + 'best_params_in.dt', best_params_in)
# np.save(results_name_pre + 'best_val_perf.dt', best_val_perf)
# np.save(results_name_pre + 'best_val_std.dt', best_val_std)
# np.save(results_name_pre + 'final_performance.dt', final_performance)
# np.save(results_name_pre + 'final_confidence.dt', final_confidence)
# np.save(results_name_pre + 'train_performance.dt', train_performance)
# np.save(results_name_pre + 'train_std.dt', train_std)
# np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time)
# np.save(results_name_pre + 'average_gram_matrix_time.dt',
# average_gram_matrix_time)
# np.save(results_name_pre + 'std_gram_matrix_time.dt',
# std_gram_matrix_time)
# np.save(results_name_pre + 'best_gram_matrix_time.dt',
# best_gram_matrix_time)
# print out results as a table.
from collections import OrderedDict
from tabulate import tabulate
table_dict = {}
if model_type == 'regression':
for param_in in param_list:
param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
else:
for param_in in param_list:
param_in['C'] = '{:.2e}'.format(param_in['C'])
table_dict['params'] = [{**param_out, **param_in}
for param_in in param_list for param_out in param_list_pre_revised]
table_dict['gram_matrix_time'] = [
'{:.2f}'.format(gram_matrix_time[index_out])
for param_in in param_list
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['valid_perf'] = [
'{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
std_val_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['test_perf'] = [
'{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
std_perf_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['train_perf'] = [
'{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
std_train_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
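# All four lists iterate param_list in the outer loop and
# param_list_pre_revised in the inner loop, so table row k maps back to
# index_in = k // len(param_list_pre_revised) and
# index_out = k % len(param_list_pre_revised).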
keyorder = [
'params', 'train_perf', 'valid_perf', 'test_perf',
'gram_matrix_time'
]
print()
tb_print = tabulate(
OrderedDict(
sorted(table_dict.items(),
key=lambda i: keyorder.index(i[0]))),
headers='keys')
print(tb_print)
str_fw += 'table of performance vs. hyper-params:\n\n%s\n\n' % tb_print
else:
# read gram matrices from file.
# Grid of parameters with a discrete number of values for each.
# param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
param_list = list(ParameterGrid(param_grid))
print()
print('2. Reading gram matrices from file...')
str_fw += '\nII. Gram matrices.\n\nGram matrices are read from file; see the last log for details.\n'
gmfile = np.load(results_dir + '/' + ds_name + '.gm.npz')
gram_matrices = gmfile['gms'] # gram matrices for all of param_grid_precomputed
param_list_pre_revised = gmfile['params'] # precomputed param grids, with the useless ones removed
y = gmfile['y'].tolist()
tts = time.time() # start training time
# nb_gm_ignore = 0 # the number of gram matrices that should not be considered, as they may contain elements that are not numbers (NaN)
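# Note: under the pinned numpy version, np.load hands these back as numpy
# arrays, so gmfile['params'] is an object array of parameter dicts and
# gmfile['y'] is converted back to a plain list via .tolist() above.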
print(
'3. Fitting and predicting using nested cross validation. This could really take a while...'
)
pool = Pool(n_jobs)
trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
train_pref = []
@@ -205,7 +440,7 @@ def model_selection_for_precomputed_kernel(datafile,
test_pref.append(o3)
pool.close()
pool.join()
# # ---- use pool.map to run in parallel. ----
# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
# train_pref = [item[0] for item in result_perf]
@@ -219,19 +454,19 @@ def model_selection_for_precomputed_kernel(datafile,
# val_pref = [item[1] for item in result_perf]
# test_pref = [item[2] for item in result_perf]
# # ---- direct running, normally using a single CPU core. ----
# train_pref = []
# val_pref = []
# test_pref = []
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)
print()
print('4. Getting final performance...')
str_fw += '\nIII. Performance.\n\n'
# averages and confidences of performances on outer trials for each combination of parameters
average_train_scores = np.mean(train_pref, axis=0)
average_val_scores = np.mean(val_pref, axis=0)
average_perf_scores = np.mean(test_pref, axis=0)
@@ -293,53 +528,25 @@ def model_selection_for_precomputed_kernel(datafile,
str_fw += 'train_std: %s\n\n' % train_std
print()
tt_poster = time.time() - tts # training time with hyper-param choices that did not participate in the calculation of gram matrices
# average_gram_matrix_time = np.mean(gram_matrix_time)
# std_gram_matrix_time = np.std(gram_matrix_time, ddof=1)
# best_gram_matrix_time = [
# gram_matrix_time[i] for i in best_params_index[0]
# ]
# ave_bgmt = np.mean(best_gram_matrix_time)
# std_bgmt = np.std(best_gram_matrix_time, ddof=1)
# print(
# 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
# .format(average_gram_matrix_time, std_gram_matrix_time))
# print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(
# ave_bgmt, std_bgmt))
print(
'training time with hyper-param choices that did not participate in the calculation of gram matrices: {:.2f}s'.format(
tt_poster))
# str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time)
# str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
str_fw += 'training time with hyper-param choices that did not participate in the calculation of gram matrices: {:.2f}s\n\n'.format(tt_poster)
# print out results as a table.
from collections import OrderedDict
@@ -353,11 +560,11 @@ def model_selection_for_precomputed_kernel(datafile,
param_in['C'] = '{:.2e}'.format(param_in['C'])
table_dict['params'] = [{**param_out, **param_in}
for param_in in param_list for param_out in param_list_pre_revised]
# table_dict['gram_matrix_time'] = [
# '{:.2f}'.format(gram_matrix_time[index_out])
# for param_in in param_list
# for index_out, _ in enumerate(param_list_pre_revised)
# ]
table_dict['valid_perf'] = [
'{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
std_val_scores[index_out][index_in])
@@ -377,8 +584,7 @@ def model_selection_for_precomputed_kernel(datafile,
for index_out, _ in enumerate(param_list_pre_revised)
]
keyorder = [
'params', 'train_perf', 'valid_perf', 'test_perf'
]
print()
tb_print = tabulate(
@@ -392,59 +598,62 @@ def model_selection_for_precomputed_kernel(datafile,
# open file to save all results for this dataset.
if not os.path.exists(results_dir):
os.makedirs(results_dir)
if not os.path.exists(results_dir + '/' + ds_name + '.output.txt'):
with open(results_dir + '/' + ds_name + '.output.txt', 'w') as f:
f.write(str_fw)
else:
with open(results_dir + '/' + ds_name + '.output.txt', 'r+') as f:
content = f.read()
f.seek(0, 0)
f.write(str_fw + '\n\n\n' + content)
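# The r+ branch prepends rather than appends: read the existing log, rewind
# to the start of the file with seek(0, 0), then write the fresh results
# followed by the old content, so the newest run always sits at the top.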
def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, trial): # Test set level
# Arrays to store scores
train_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
val_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
test_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
# loop for each outer param tuple
for index_out, params_out in enumerate(param_list_pre_revised):
# split gram matrix and y to app and test sets.
indices = range(len(y))
X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split(
gram_matrices[index_out], y, indices, test_size=0.1,
random_state=None, shuffle=True)
X_app = X_app[:, idx_app]
X_test = X_test[:, idx_app]
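# With a precomputed kernel, rows are samples and columns are kernel values
# against all samples, so after restricting both to the idx_app columns,
# X_app is the app-vs-app kernel block and X_test the test-vs-app block.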
y_app = np.array(y_app)
y_test = np.array(y_test)
# loop for each inner param tuple
for index_in, params_in in enumerate(param_list):
inner_cv = KFold(n_splits=10, shuffle=True, random_state=trial)
current_train_perf = []
current_valid_perf = []
current_test_perf = []
# For regression use the Kernel Ridge method
try:
if model_type == 'regression':
kr = KernelRidge(kernel='precomputed', **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
kr.fit(X_app[train_index, :][:, train_index],
y_app[train_index])
# predict on the train, validation and test set
y_pred_train = kr.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = kr.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = kr.predict(
X_test[:, train_index])
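# With kernel='precomputed', fit() and predict() take kernel values rather
# than feature vectors: rows are the query samples and columns the samples
# seen at fit time, hence every predict call above slices columns by
# train_index.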
# root mean squared errors
@@ -460,22 +669,23 @@ def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, t
np.sqrt(
mean_squared_error(
y_test, y_pred_test)))
# For classification use SVM
else:
svc = SVC(kernel='precomputed', cache_size=200,
verbose=False, **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
# np.savez("bug.npy", X_app[train_index, :][:, train_index], y_app[train_index])
svc.fit(X_app[train_index, :][:, train_index],
y_app[train_index])
# predict on the train, validation and test set
y_pred_train = svc.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = svc.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = svc.predict(
X_test[:, train_index])
# root mean squared errors