Browse Source

1. Fix a bug in computing kernels between non-symbolic node attributes in the spkernel and the structural shortest-path (ssp) kernel; add a Gaussian kernel implementation.

2. Update the pygraph.utils.graphdataset.get_dataset_attributes function so that it can still read the attribute dimensions of a dataset in which some attributes are missing.
v0.1
jajupmochi 6 years ago
parent
commit
ef19fcba5a
10 changed files with 41112 additions and 39087 deletions
  1. +38582
    -38584
      notebooks/check_gm/Letter-med.gm.eps
  2. +329
    -317
      notebooks/get_dataset_attributes.ipynb
  3. +1922
    -1
      notebooks/plot_all_graphs.ipynb
  4. +12
    -13
      notebooks/run_spkernel.py
  5. +4
    -5
      notebooks/run_structuralspkernel.py
  6. +77
    -0
      notebooks/test.py
  7. +84
    -84
      pygraph/kernels/spKernel.py
  8. +11
    -14
      pygraph/kernels/structuralspKernel.py
  9. +54
    -65
      pygraph/utils/graphdataset.py
  10. +37
    -4
      pygraph/utils/kernels.py

+ 38582
- 38584
notebooks/check_gm/Letter-med.gm.eps
File diff suppressed because it is too large
View File


+ 329
- 317
notebooks/get_dataset_attributes.ipynb View File

@@ -12,21 +12,131 @@
"output_type": "stream",
"text": [
"\n",
"Acyclic:\n",
"substructures : {'non linear', 'linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 183\n",
"ave_node_num : 8.153005464480874\n",
"min_node_num : 3\n",
"max_node_num : 11\n",
"ave_edge_num : 7.1530054644808745\n",
"min_edge_num : 2\n",
"max_edge_num : 10\n",
"ave_node_degree : 2.80327868852459\n",
"min_node_degree : 2\n",
"max_node_degree : 4\n",
"node_label_num : 3\n",
"edge_label_num : 1\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 148\n",
"\n",
"\n",
"Alkane:\n",
"substructures : {'non linear', 'linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 150\n",
"ave_node_num : 8.873333333333333\n",
"min_node_num : 1\n",
"max_node_num : 10\n",
"ave_edge_num : 7.873333333333333\n",
"min_edge_num : 0\n",
"max_edge_num : 9\n",
"ave_node_degree : 3.36\n",
"min_node_degree : 0\n",
"max_node_degree : 4\n",
"node_label_num : 2\n",
"edge_label_num : 1\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 123\n",
"\n",
"\n",
"MAO:\n",
"substructures : {'non linear', 'linear'}\n",
"node_labeled : True\n",
"edge_labeled : True\n",
"is_directed : False\n",
"dataset_size : 68\n",
"ave_node_num : 18.38235294117647\n",
"min_node_num : 11\n",
"max_node_num : 27\n",
"ave_edge_num : 19.63235294117647\n",
"min_edge_num : 12\n",
"max_edge_num : 29\n",
"ave_node_degree : 3.0\n",
"min_node_degree : 3\n",
"max_node_degree : 3\n",
"node_label_num : 3\n",
"edge_label_num : 4\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"PAH:\n",
"substructures : {'non linear', 'linear'}\n",
"node_labeled : False\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 94\n",
"ave_node_num : 20.70212765957447\n",
"min_node_num : 10\n",
"max_node_num : 28\n",
"ave_edge_num : 24.425531914893618\n",
"min_edge_num : 11\n",
"max_edge_num : 34\n",
"ave_node_degree : 3.0106382978723403\n",
"min_node_degree : 3\n",
"max_node_degree : 4\n",
"node_label_num : 1\n",
"edge_label_num : 1\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"MUTAG:\n",
"substructures : {'non linear', 'linear'}\n",
"node_labeled : True\n",
"edge_labeled : True\n",
"is_directed : False\n",
"dataset_size : 188\n",
"ave_node_num : 17.930851063829788\n",
"min_node_num : 10\n",
"max_node_num : 28\n",
"ave_edge_num : 19.79255319148936\n",
"min_edge_num : 10\n",
"max_edge_num : 33\n",
"ave_node_degree : 3.00531914893617\n",
"min_node_degree : 3\n",
"max_node_degree : 4\n",
"node_label_num : 7\n",
"edge_label_num : 11\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"Letter-med:\n",
"substructures : {'linear', 'non linear'}\n",
"substructures : {'non linear', 'linear'}\n",
"node_labeled : False\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 2250\n",
"ave_graph_size : 4.674666666666667\n",
"min_graph_size : 1\n",
"max_graph_size : 9\n",
"ave_graph_edge_num : 3.2057777777777776\n",
"min_graph_edge_num : 0\n",
"max_graph_edge_num : 7\n",
"ave_graph_degree : 2.012888888888889\n",
"min_graph_degree : 0\n",
"max_graph_degree : 4\n",
"ave_node_num : 4.674666666666667\n",
"min_node_num : 1\n",
"max_node_num : 9\n",
"ave_edge_num : 3.2057777777777776\n",
"min_edge_num : 0\n",
"max_edge_num : 7\n",
"ave_node_degree : 2.012888888888889\n",
"min_node_degree : 0\n",
"max_node_degree : 4\n",
"node_label_num : 0\n",
"edge_label_num : 0\n",
"node_attr_dim : 2\n",
@@ -34,21 +144,43 @@
"class_number : 15\n",
"\n",
"\n",
"ENZYMES:\n",
"substructures : {'non linear', 'linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 600\n",
"ave_node_num : 32.63333333333333\n",
"min_node_num : 2\n",
"max_node_num : 126\n",
"ave_edge_num : 62.13666666666666\n",
"min_edge_num : 1\n",
"max_edge_num : 149\n",
"ave_node_degree : 6.086666666666667\n",
"min_node_degree : 1\n",
"max_node_degree : 9\n",
"node_label_num : 3\n",
"edge_label_num : 0\n",
"node_attr_dim : 18\n",
"edge_attr_dim : 0\n",
"class_number : 6\n",
"\n",
"\n",
"Mutagenicity:\n",
"substructures : {'linear', 'non linear'}\n",
"substructures : {'non linear', 'linear'}\n",
"node_labeled : True\n",
"edge_labeled : True\n",
"is_directed : False\n",
"dataset_size : 4337\n",
"ave_graph_size : 30.317731150564907\n",
"min_graph_size : 4\n",
"max_graph_size : 417\n",
"ave_graph_edge_num : 30.76942587041734\n",
"min_graph_edge_num : 3\n",
"max_graph_edge_num : 112\n",
"ave_graph_degree : 3.75651371916071\n",
"min_graph_degree : 3\n",
"max_graph_degree : 4\n",
"ave_node_num : 30.317731150564907\n",
"min_node_num : 4\n",
"max_node_num : 417\n",
"ave_edge_num : 30.76942587041734\n",
"min_edge_num : 3\n",
"max_edge_num : 112\n",
"ave_node_degree : 3.75651371916071\n",
"min_node_degree : 3\n",
"max_node_degree : 4\n",
"node_label_num : 14\n",
"edge_label_num : 3\n",
"node_attr_dim : 0\n",
@@ -56,21 +188,43 @@
"class_number : 2\n",
"\n",
"\n",
"D&D:\n",
"substructures : {'non linear', 'linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 1178\n",
"ave_node_num : 284.3166383701188\n",
"min_node_num : 30\n",
"max_node_num : 5748\n",
"ave_edge_num : 715.6587436332767\n",
"min_edge_num : 63\n",
"max_edge_num : 14267\n",
"ave_node_degree : 9.509337860780985\n",
"min_node_degree : 6\n",
"max_node_degree : 19\n",
"node_label_num : 82\n",
"edge_label_num : 0\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"AIDS:\n",
"substructures : {'linear', 'non linear'}\n",
"substructures : {'non linear', 'linear'}\n",
"node_labeled : True\n",
"edge_labeled : True\n",
"is_directed : False\n",
"dataset_size : 2000\n",
"ave_graph_size : 15.6925\n",
"min_graph_size : 2\n",
"max_graph_size : 95\n",
"ave_graph_edge_num : 16.195\n",
"min_graph_edge_num : 1\n",
"max_graph_edge_num : 103\n",
"ave_graph_degree : 3.322\n",
"min_graph_degree : 1\n",
"max_graph_degree : 6\n",
"ave_node_num : 15.6925\n",
"min_node_num : 2\n",
"max_node_num : 95\n",
"ave_edge_num : 16.195\n",
"min_edge_num : 1\n",
"max_edge_num : 103\n",
"ave_node_degree : 3.322\n",
"min_node_degree : 1\n",
"max_node_degree : 6\n",
"node_label_num : 38\n",
"edge_label_num : 3\n",
"node_attr_dim : 4\n",
@@ -84,15 +238,15 @@
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 41\n",
"ave_graph_size : 1377.2682926829268\n",
"min_graph_size : 134\n",
"max_graph_size : 5037\n",
"ave_graph_edge_num : 3074.0975609756097\n",
"min_graph_edge_num : 320\n",
"max_graph_edge_num : 10888\n",
"ave_graph_degree : 7.853658536585366\n",
"min_graph_degree : 6\n",
"max_graph_degree : 10\n",
"ave_node_num : 1377.2682926829268\n",
"min_node_num : 134\n",
"max_node_num : 5037\n",
"ave_edge_num : 3074.0975609756097\n",
"min_edge_num : 320\n",
"max_edge_num : 10888\n",
"ave_node_degree : 7.853658536585366\n",
"min_node_degree : 6\n",
"max_node_degree : 10\n",
"node_label_num : 5\n",
"edge_label_num : 0\n",
"node_attr_dim : 1\n",
@@ -101,20 +255,20 @@
"\n",
"\n",
"MSRC9:\n",
"substructures : {'linear', 'non linear'}\n",
"substructures : {'non linear', 'linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 221\n",
"ave_graph_size : 40.57918552036199\n",
"min_graph_size : 25\n",
"max_graph_size : 55\n",
"ave_graph_edge_num : 97.9366515837104\n",
"min_graph_edge_num : 53\n",
"max_graph_edge_num : 145\n",
"ave_graph_degree : 10.158371040723981\n",
"min_graph_degree : 8\n",
"max_graph_degree : 16\n",
"ave_node_num : 40.57918552036199\n",
"min_node_num : 25\n",
"max_node_num : 55\n",
"ave_edge_num : 97.9366515837104\n",
"min_edge_num : 53\n",
"max_edge_num : 145\n",
"ave_node_degree : 10.158371040723981\n",
"min_node_degree : 8\n",
"max_node_degree : 16\n",
"node_label_num : 10\n",
"edge_label_num : 0\n",
"node_attr_dim : 0\n",
@@ -123,20 +277,20 @@
"\n",
"\n",
"MSRC21:\n",
"substructures : {'linear', 'non linear'}\n",
"substructures : {'non linear', 'linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 563\n",
"ave_graph_size : 77.52042628774423\n",
"min_graph_size : 51\n",
"max_graph_size : 141\n",
"ave_graph_edge_num : 198.32326820603907\n",
"min_graph_edge_num : 121\n",
"max_graph_edge_num : 405\n",
"ave_graph_degree : 11.41563055062167\n",
"min_graph_degree : 8\n",
"max_graph_degree : 23\n",
"ave_node_num : 77.52042628774423\n",
"min_node_num : 51\n",
"max_node_num : 141\n",
"ave_edge_num : 198.32326820603907\n",
"min_edge_num : 121\n",
"max_edge_num : 405\n",
"ave_node_degree : 11.41563055062167\n",
"min_node_degree : 8\n",
"max_node_degree : 23\n",
"node_label_num : 22\n",
"edge_label_num : 0\n",
"node_attr_dim : 0\n",
@@ -145,20 +299,20 @@
"\n",
"\n",
"SYNTHETIC:\n",
"substructures : {'linear', 'non linear'}\n",
"substructures : {'non linear', 'linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 300\n",
"ave_graph_size : 100.0\n",
"min_graph_size : 100\n",
"max_graph_size : 100\n",
"ave_graph_edge_num : 196.0\n",
"min_graph_edge_num : 196\n",
"max_graph_edge_num : 196\n",
"ave_graph_degree : 8.0\n",
"min_graph_degree : 8\n",
"max_graph_degree : 8\n",
"ave_node_num : 100.0\n",
"min_node_num : 100\n",
"max_node_num : 100\n",
"ave_edge_num : 196.0\n",
"min_edge_num : 196\n",
"max_edge_num : 196\n",
"ave_node_degree : 8.0\n",
"min_node_degree : 8\n",
"max_node_degree : 8\n",
"node_label_num : 8\n",
"edge_label_num : 0\n",
"node_attr_dim : 1\n",
@@ -167,20 +321,20 @@
"\n",
"\n",
"BZR:\n",
"substructures : {'linear', 'non linear'}\n",
"substructures : {'non linear', 'linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 405\n",
"ave_graph_size : 35.75061728395062\n",
"min_graph_size : 13\n",
"max_graph_size : 57\n",
"ave_graph_edge_num : 38.358024691358025\n",
"min_graph_edge_num : 13\n",
"max_graph_edge_num : 60\n",
"ave_graph_degree : 3.8641975308641974\n",
"min_graph_degree : 3\n",
"max_graph_degree : 4\n",
"ave_node_num : 35.75061728395062\n",
"min_node_num : 13\n",
"max_node_num : 57\n",
"ave_edge_num : 38.358024691358025\n",
"min_edge_num : 13\n",
"max_edge_num : 60\n",
"ave_node_degree : 3.8641975308641974\n",
"min_node_degree : 3\n",
"max_node_degree : 4\n",
"node_label_num : 10\n",
"edge_label_num : 0\n",
"node_attr_dim : 3\n",
@@ -189,20 +343,20 @@
"\n",
"\n",
"COX2:\n",
"substructures : {'linear', 'non linear'}\n",
"substructures : {'non linear', 'linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 467\n",
"ave_graph_size : 41.224839400428266\n",
"min_graph_size : 32\n",
"max_graph_size : 56\n",
"ave_graph_edge_num : 43.44539614561028\n",
"min_graph_edge_num : 34\n",
"max_graph_edge_num : 59\n",
"ave_graph_degree : 4.0\n",
"min_graph_degree : 4\n",
"max_graph_degree : 4\n",
"ave_node_num : 41.224839400428266\n",
"min_node_num : 32\n",
"max_node_num : 56\n",
"ave_edge_num : 43.44539614561028\n",
"min_edge_num : 34\n",
"max_edge_num : 59\n",
"ave_node_degree : 4.0\n",
"min_node_degree : 4\n",
"max_node_degree : 4\n",
"node_label_num : 8\n",
"edge_label_num : 0\n",
"node_attr_dim : 3\n",
@@ -211,20 +365,20 @@
"\n",
"\n",
"DHFR:\n",
"substructures : {'linear', 'non linear'}\n",
"substructures : {'non linear', 'linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 756\n",
"ave_graph_size : 42.42724867724868\n",
"min_graph_size : 20\n",
"max_graph_size : 71\n",
"ave_graph_edge_num : 44.544973544973544\n",
"min_graph_edge_num : 21\n",
"max_graph_edge_num : 73\n",
"ave_graph_degree : 3.955026455026455\n",
"min_graph_degree : 3\n",
"max_graph_degree : 4\n",
"ave_node_num : 42.42724867724868\n",
"min_node_num : 20\n",
"max_node_num : 71\n",
"ave_edge_num : 44.544973544973544\n",
"min_edge_num : 21\n",
"max_edge_num : 73\n",
"ave_node_degree : 3.955026455026455\n",
"min_node_degree : 3\n",
"max_node_degree : 4\n",
"node_label_num : 9\n",
"edge_label_num : 0\n",
"node_attr_dim : 3\n",
@@ -232,43 +386,21 @@
"class_number : 2\n",
"\n",
"\n",
"ENZYMES:\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 600\n",
"ave_graph_size : 32.63333333333333\n",
"min_graph_size : 2\n",
"max_graph_size : 126\n",
"ave_graph_edge_num : 62.13666666666666\n",
"min_graph_edge_num : 1\n",
"max_graph_edge_num : 149\n",
"ave_graph_degree : 6.086666666666667\n",
"min_graph_degree : 1\n",
"max_graph_degree : 9\n",
"node_label_num : 3\n",
"edge_label_num : 0\n",
"node_attr_dim : 18\n",
"edge_attr_dim : 0\n",
"class_number : 6\n",
"\n",
"\n",
"PROTEINS:\n",
"substructures : {'linear', 'non linear'}\n",
"substructures : {'non linear', 'linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 1113\n",
"ave_graph_size : 39.05750224618149\n",
"min_graph_size : 4\n",
"max_graph_size : 620\n",
"ave_graph_edge_num : 72.8158131176999\n",
"min_graph_edge_num : 5\n",
"max_graph_edge_num : 1049\n",
"ave_graph_degree : 5.794249775381851\n",
"min_graph_degree : 3\n",
"max_graph_degree : 25\n",
"ave_node_num : 39.05750224618149\n",
"min_node_num : 4\n",
"max_node_num : 620\n",
"ave_edge_num : 72.8158131176999\n",
"min_edge_num : 5\n",
"max_edge_num : 1049\n",
"ave_node_degree : 5.794249775381851\n",
"min_node_degree : 3\n",
"max_node_degree : 25\n",
"node_label_num : 3\n",
"edge_label_num : 0\n",
"node_attr_dim : 1\n",
@@ -277,135 +409,25 @@
"\n",
"\n",
"PROTEINS_full:\n",
"substructures : {'linear', 'non linear'}\n",
"substructures : {'non linear', 'linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 1113\n",
"ave_graph_size : 39.05750224618149\n",
"min_graph_size : 4\n",
"max_graph_size : 620\n",
"ave_graph_edge_num : 72.8158131176999\n",
"min_graph_edge_num : 5\n",
"max_graph_edge_num : 1049\n",
"ave_graph_degree : 5.794249775381851\n",
"min_graph_degree : 3\n",
"max_graph_degree : 25\n",
"ave_node_num : 39.05750224618149\n",
"min_node_num : 4\n",
"max_node_num : 620\n",
"ave_edge_num : 72.8158131176999\n",
"min_edge_num : 5\n",
"max_edge_num : 1049\n",
"ave_node_degree : 5.794249775381851\n",
"min_node_degree : 3\n",
"max_node_degree : 25\n",
"node_label_num : 3\n",
"edge_label_num : 0\n",
"node_attr_dim : 29\n",
"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"D&D:\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 1178\n",
"ave_graph_size : 284.3166383701188\n",
"min_graph_size : 30\n",
"max_graph_size : 5748\n",
"ave_graph_edge_num : 715.6587436332767\n",
"min_graph_edge_num : 63\n",
"max_graph_edge_num : 14267\n",
"ave_graph_degree : 9.509337860780985\n",
"min_graph_degree : 6\n",
"max_graph_degree : 19\n",
"node_label_num : 82\n",
"edge_label_num : 0\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"MUTAG:\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : True\n",
"is_directed : False\n",
"dataset_size : 188\n",
"ave_graph_size : 17.930851063829788\n",
"min_graph_size : 10\n",
"max_graph_size : 28\n",
"ave_graph_edge_num : 19.79255319148936\n",
"min_graph_edge_num : 10\n",
"max_graph_edge_num : 33\n",
"ave_graph_degree : 3.00531914893617\n",
"min_graph_degree : 3\n",
"max_graph_degree : 4\n",
"node_label_num : 7\n",
"edge_label_num : 11\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"Alkane:\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 150\n",
"ave_graph_size : 8.873333333333333\n",
"min_graph_size : 1\n",
"max_graph_size : 10\n",
"ave_graph_edge_num : 7.873333333333333\n",
"min_graph_edge_num : 0\n",
"max_graph_edge_num : 9\n",
"ave_graph_degree : 3.36\n",
"min_graph_degree : 0\n",
"max_graph_degree : 4\n",
"node_label_num : 2\n",
"edge_label_num : 1\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 123\n",
"\n",
"\n",
"Acyclic:\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 183\n",
"ave_graph_size : 8.153005464480874\n",
"min_graph_size : 3\n",
"max_graph_size : 11\n",
"ave_graph_edge_num : 7.1530054644808745\n",
"min_graph_edge_num : 2\n",
"max_graph_edge_num : 10\n",
"ave_graph_degree : 2.80327868852459\n",
"min_graph_degree : 2\n",
"max_graph_degree : 4\n",
"node_label_num : 3\n",
"edge_label_num : 1\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 148\n",
"\n",
"\n",
"MAO:\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : True\n",
"edge_labeled : True\n",
"is_directed : False\n",
"dataset_size : 68\n",
"ave_graph_size : 18.38235294117647\n",
"min_graph_size : 11\n",
"max_graph_size : 27\n",
"ave_graph_edge_num : 19.63235294117647\n",
"min_graph_edge_num : 12\n",
"max_graph_edge_num : 29\n",
"ave_graph_degree : 3.0\n",
"min_graph_degree : 3\n",
"max_graph_degree : 3\n",
"node_label_num : 3\n",
"edge_label_num : 4\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n"
]
},
@@ -414,43 +436,21 @@
"output_type": "stream",
"text": [
"\n",
"PAH:\n",
"substructures : {'linear', 'non linear'}\n",
"node_labeled : False\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 94\n",
"ave_graph_size : 20.70212765957447\n",
"min_graph_size : 10\n",
"max_graph_size : 28\n",
"ave_graph_edge_num : 24.425531914893618\n",
"min_graph_edge_num : 11\n",
"max_graph_edge_num : 34\n",
"ave_graph_degree : 3.0106382978723403\n",
"min_graph_degree : 3\n",
"max_graph_degree : 4\n",
"node_label_num : 1\n",
"edge_label_num : 1\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"\n",
"NCI1:\n",
"substructures : {'linear', 'non linear'}\n",
"substructures : {'non linear', 'linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 4110\n",
"ave_graph_size : 29.8654501216545\n",
"min_graph_size : 3\n",
"max_graph_size : 111\n",
"ave_graph_edge_num : 32.3\n",
"min_graph_edge_num : 2\n",
"max_graph_edge_num : 119\n",
"ave_graph_degree : 3.3360097323600972\n",
"min_graph_degree : 2\n",
"max_graph_degree : 4\n",
"ave_node_num : 29.8654501216545\n",
"min_node_num : 3\n",
"max_node_num : 111\n",
"ave_edge_num : 32.3\n",
"min_edge_num : 2\n",
"max_edge_num : 119\n",
"ave_node_degree : 3.3360097323600972\n",
"min_node_degree : 2\n",
"max_node_degree : 4\n",
"node_label_num : 37\n",
"edge_label_num : 0\n",
"node_attr_dim : 0\n",
@@ -459,39 +459,51 @@
"\n",
"\n",
"NCI109:\n",
"substructures : {'linear', 'non linear'}\n",
"substructures : {'non linear', 'linear'}\n",
"node_labeled : True\n",
"edge_labeled : False\n",
"is_directed : False\n",
"dataset_size : 4127\n",
"ave_graph_size : 29.681124303368065\n",
"min_graph_size : 4\n",
"max_graph_size : 111\n",
"ave_graph_edge_num : 32.13084565059365\n",
"min_graph_edge_num : 3\n",
"max_graph_edge_num : 119\n",
"ave_graph_degree : 3.343833292948873\n",
"min_graph_degree : 2\n",
"max_graph_degree : 5\n",
"ave_node_num : 29.681124303368065\n",
"min_node_num : 4\n",
"max_node_num : 111\n",
"ave_edge_num : 32.13084565059365\n",
"min_edge_num : 3\n",
"max_edge_num : 119\n",
"ave_node_degree : 3.343833292948873\n",
"min_node_degree : 2\n",
"max_node_degree : 5\n",
"node_label_num : 38\n",
"edge_label_num : 0\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 2\n",
"\n",
"load SDF: 100%|██████████| 4457424/4457424 [00:10<00:00, 430440.94it/s]\n",
"ajust data: 100%|██████████| 42687/42687 [00:09<00:00, 4352.25it/s] \n",
"\n",
"NCI-HIV:\n",
"substructures : {'non linear', 'linear'}\n",
"node_labeled : True\n",
"edge_labeled : True\n",
"is_directed : False\n",
"dataset_size : 42682\n",
"ave_node_num : 45.70945597675835\n",
"min_node_num : 2\n",
"max_node_num : 438\n",
"ave_edge_num : 47.7137903565906\n",
"min_edge_num : 1\n",
"max_edge_num : 441\n",
"ave_node_degree : 3.9760554800618526\n",
"min_node_degree : 1\n",
"max_node_degree : 12\n",
"node_label_num : 63\n",
"edge_label_num : 3\n",
"node_attr_dim : 0\n",
"edge_attr_dim : 0\n",
"class_number : 3\n",
"\n"
]
},
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'tqdm'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-1-1e4da065c026>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0mds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'dataset'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0mfilename_y\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'dataset_y'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m'dataset_y'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mds\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 47\u001b[0;31m extra_params=(ds['extra_params'] if 'extra_params' in ds else None))\n\u001b[0m\u001b[1;32m 48\u001b[0m attrs = get_dataset_attributes(\n\u001b[1;32m 49\u001b[0m dataset, target=y, node_label='atom', edge_label='bond_type')\n",
"\u001b[0;32m/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/utils/graphfiles.py\u001b[0m in \u001b[0;36mloadDataset\u001b[0;34m(filename, filename_y, extra_params)\u001b[0m\n\u001b[1;32m 377\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mextension\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"sdf\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 378\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 379\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mtqdm\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 380\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 381\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'tqdm'"
]
}
],
"source": [
@@ -501,8 +513,18 @@
"from pygraph.utils.graphdataset import get_dataset_attributes\n",
"\n",
"dslist = [\n",
" {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',},\n",
" {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds',\n",
" 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',},\n",
" {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',},\n",
" {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',},\n",
" {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n",
" 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}},\n",
" {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n",
" {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n",
" {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n",
" {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n",
" 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}},\n",
" {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'},\n",
" {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'},\n",
" {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'},\n",
@@ -510,19 +532,9 @@
" {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'},\n",
" {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'},\n",
" {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'},\n",
" {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'},\n",
" {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n",
" {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, \n",
" {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'},\n",
" {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'},\n",
" {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n",
" 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}},\n",
" {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n",
" 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}},\n",
" {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds',\n",
" 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',},\n",
" {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',},\n",
" {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',},\n",
" {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',},\n",
" {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, \n",
" {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',\n",
" 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}},\n",
" {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',\n",


+ 1922
- 1
notebooks/plot_all_graphs.ipynb
File diff suppressed because it is too large
View File


+ 12
- 13
notebooks/run_spkernel.py View File

@@ -1,22 +1,21 @@
import functools
from libs import *
import multiprocessing
from sklearn.metrics.pairwise import rbf_kernel

from pygraph.kernels.spKernel import spkernel
from pygraph.utils.kernels import deltakernel, kernelproduct
from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct
#from pygraph.utils.model_selection_precomputed import trial_do

dslist = [
{'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
'task': 'regression'}, # node symb
{'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', },
# contains single node graph, node symb
{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb
{'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled
{'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
# 'task': 'regression'}, # node symb
# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', },
# # contains single node graph, node symb
# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb
# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled
# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
{'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
# node nsymb
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
@@ -54,9 +53,9 @@ dslist = [
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
estimator = spkernel
mixkernel = functools.partial(kernelproduct, deltakernel, rbf_kernel)
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
param_grid_precomputed = {'node_kernels': [
{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]}
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}
param_grid = [{'C': np.logspace(-10, 3, num=27, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]



+ 4
- 5
notebooks/run_structuralspkernel.py View File

@@ -9,10 +9,9 @@ Created on Fri Sep 28 16:37:29 2018
import functools
from libs import *
import multiprocessing
from sklearn.metrics.pairwise import rbf_kernel

from pygraph.kernels.structuralspKernel import structuralspkernel
from pygraph.utils.kernels import deltakernel, kernelproduct
from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct

dslist = [
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
@@ -61,11 +60,11 @@ dslist = [
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
estimator = structuralspkernel
mixkernel = functools.partial(kernelproduct, deltakernel, rbf_kernel)
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
param_grid_precomputed = {'node_kernels':
[{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}],
[{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}],
'edge_kernels':
[{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]}
[{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]



+ 77
- 0
notebooks/test.py View File

@@ -0,0 +1,77 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 28 16:37:29 2018

@author: ljia
"""

import functools
from libs import *
import multiprocessing
from sklearn.metrics.pairwise import rbf_kernel

from pygraph.kernels.structuralspKernel import structuralspkernel
from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct

# Datasets to run model selection on. Each entry is a dict with a 'name' and a
# 'dataset' path, plus optional 'task', 'dataset_y' and 'extra_params' keys
# (these are the keys the driver loop at the bottom of this script reads).
# Only the MAO entry is active; every other entry is commented out.
# NOTE(review): the trailing "symb" / "nsymb" notes presumably mean symbolic
# labels / non-symbolic attributes of nodes and edges -- confirm.
dslist = [
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
# 'task': 'regression'}, # node symb
# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, # contains single node graph, node symb
{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb

# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
# # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
# # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
#
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb

# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb

# # not working below
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
# Graph kernel under evaluation.
estimator = structuralspkernel

# The structural shortest-path kernel hands each node/edge kernel the raw 1-D
# attribute vectors of a single pair of nodes/edges. sklearn's rbf_kernel
# only accepts 2-D sample matrices, so the pairwise gaussiankernel is used for
# the non-symbolic ('nsymb') part and inside the mixed ('mix') product kernel.
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
param_grid_precomputed = {
    'node_kernels': [
        {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}],
    'edge_kernels': [
        {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}

# param_grid[0] ('C') is used for classification tasks,
# param_grid[1] ('alpha') for regression tasks.
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
              {'alpha': np.logspace(-10, 10, num=41, base=10)}]

for ds in dslist:
    # Entries without an explicit 'task' are treated as classification.
    task = ds.get('task', 'classification')
    print()
    print(ds['name'])
    model_selection_for_precomputed_kernel(
        ds['dataset'],
        estimator,
        param_grid_precomputed,
        param_grid[1] if task == 'regression' else param_grid[0],
        task,
        NUM_TRIALS=30,
        datafile_y=ds.get('dataset_y'),
        extra_params=ds.get('extra_params'),
        ds_name=ds['name'],
        n_jobs=multiprocessing.cpu_count(),
        read_gm_from_file=False)
    print()

+ 84
- 84
pygraph/kernels/spKernel.py View File

@@ -53,7 +53,6 @@ def spkernel(*args,
"""
# pre-process
Gn = args[0] if len(args) == 1 else [args[0], args[1]]

weight = None
if edge_weight is None:
print('\n None edge weight specified. Set all weight to 1.\n')
@@ -76,7 +75,8 @@ def spkernel(*args,
attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
node_label=node_label)

# remove graphs with no edges, as no sp can be found in their structures, so the kernel between such a graph and itself will be zero.
# remove graphs with no edges, as no sp can be found in their structures,
# so the kernel between such a graph and itself will be zero.
len_gn = len(Gn)
Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
idx = [G[0] for G in Gn]
@@ -208,93 +208,93 @@ def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij):
g2 = Gn[j]
kernel = 0

try:
# compute shortest path matrices first, method borrowed from FCSP.
if ds_attrs['node_labeled']:
# node symb and non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['mix']
vk_dict = {} # shortest path matrices dict
for n1, n2 in product(
g1.nodes(data=True), g2.nodes(data=True)):
vk_dict[(n1[0], n2[0])] = kn(
n1[1][node_label], n2[1][node_label],
[n1[1]['attributes']], [n2[1]['attributes']])
# node symb labeled
else:
kn = node_kernels['symb']
vk_dict = {} # shortest path matrices dict
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
n2[1][node_label])
# try:
# compute shortest path matrices first, method borrowed from FCSP.
if ds_attrs['node_labeled']:
# node symb and non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['mix']
vk_dict = {} # shortest path matrices dict
for n1, n2 in product(
g1.nodes(data=True), g2.nodes(data=True)):
vk_dict[(n1[0], n2[0])] = kn(
n1[1][node_label], n2[1][node_label],
n1[1]['attributes'], n2[1]['attributes'])
# node symb labeled
else:
# node non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['nsymb']
vk_dict = {} # shortest path matrices dict
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn([n1[1]['attributes']],
[n2[1]['attributes']])
# node unlabeled
else:
for e1, e2 in product(
g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kernel += 1
return i, j, kernel

# compute graph kernels
if ds_attrs['is_directed']:
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1],
e2[1])]
kn1 = nk11 * nk22
kernel += kn1
kn = node_kernels['symb']
vk_dict = {} # shortest path matrices dict
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
n2[1][node_label])
else:
# node non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['nsymb']
vk_dict = {} # shortest path matrices dict
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
n2[1]['attributes'])
# node unlabeled
else:
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
for e1, e2 in product(
g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
# each edge walk is counted twice, starting from both its extreme nodes.
nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(
e1[0], e2[1])], vk_dict[(e1[1],
e2[0])], vk_dict[(e1[1],
e2[1])]
kn1 = nk11 * nk22
kn2 = nk12 * nk21
kernel += kn1 + kn2
kernel += 1
return i, j, kernel

# compute graph kernels
if ds_attrs['is_directed']:
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1],
e2[1])]
kn1 = nk11 * nk22
kernel += kn1
else:
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
# each edge walk is counted twice, starting from both its extreme nodes.
nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(
e1[0], e2[1])], vk_dict[(e1[1],
e2[0])], vk_dict[(e1[1],
e2[1])]
kn1 = nk11 * nk22
kn2 = nk12 * nk21
kernel += kn1 + kn2

# # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2], sadly it is slower than the current implementation
# # compute vertex kernels
# try:
# vk_mat = np.zeros((nx.number_of_nodes(g1),
# nx.number_of_nodes(g2)))
# g1nl = enumerate(g1.nodes(data=True))
# g2nl = enumerate(g2.nodes(data=True))
# for i1, n1 in g1nl:
# for i2, n2 in g2nl:
# vk_mat[i1][i2] = kn(
# n1[1][node_label], n2[1][node_label],
# [n1[1]['attributes']], [n2[1]['attributes']])
# # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2], sadly it is slower than the current implementation
# # compute vertex kernels
# try:
# vk_mat = np.zeros((nx.number_of_nodes(g1),
# nx.number_of_nodes(g2)))
# g1nl = enumerate(g1.nodes(data=True))
# g2nl = enumerate(g2.nodes(data=True))
# for i1, n1 in g1nl:
# for i2, n2 in g2nl:
# vk_mat[i1][i2] = kn(
# n1[1][node_label], n2[1][node_label],
# [n1[1]['attributes']], [n2[1]['attributes']])

# range1 = range(0, len(edge_w_g[i]))
# range2 = range(0, len(edge_w_g[j]))
# for i1 in range1:
# x1 = edge_x_g[i][i1]
# y1 = edge_y_g[i][i1]
# w1 = edge_w_g[i][i1]
# for i2 in range2:
# x2 = edge_x_g[j][i2]
# y2 = edge_y_g[j][i2]
# w2 = edge_w_g[j][i2]
# ke = (w1 == w2)
# if ke > 0:
# kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
# kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
# kernel += kn1 + kn2
except KeyError: # missing labels or attributes
pass
# range1 = range(0, len(edge_w_g[i]))
# range2 = range(0, len(edge_w_g[j]))
# for i1 in range1:
# x1 = edge_x_g[i][i1]
# y1 = edge_y_g[i][i1]
# w1 = edge_w_g[i][i1]
# for i2 in range2:
# x2 = edge_x_g[j][i2]
# y2 = edge_y_g[j][i2]
# w2 = edge_w_g[j][i2]
# ke = (w1 == w2)
# if ke > 0:
# kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
# kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
# kernel += kn1 + kn2
# except KeyError: # missing labels or attributes
# pass

return i, j, kernel



+ 11
- 14
pygraph/kernels/structuralspKernel.py View File

@@ -194,13 +194,12 @@ def structuralspkernel(*args,
# # ---- direct running, normally use single CPU core. ----
# itr = combinations_with_replacement(range(0, len(Gn)), 2)
# for gs in tqdm(itr, desc='calculating kernels', file=sys.stdout):
# if gs[0] == 24 and gs[1] == 411:
# i, j, kernel = structuralspkernel_do(Gn, splist, ds_attrs,
# node_label, edge_label, node_kernels, edge_kernels, gs)
# if(kernel > 1):
# print("error here ")
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel
# i, j, kernel = structuralspkernel_do(Gn, splist, ds_attrs,
# node_label, edge_label, node_kernels, edge_kernels, gs)
# if(kernel > 1):
# print("error here ")
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel

run_time = time.time() - start_time
print(
@@ -232,7 +231,7 @@ def structuralspkernel_do(Gn, splist, ds_attrs, node_label, edge_label,
g1.nodes(data=True), g2.nodes(data=True)):
vk_dict[(n1[0], n2[0])] = kn(
n1[1][node_label], n2[1][node_label],
[n1[1]['attributes']], [n2[1]['attributes']])
n1[1]['attributes'], n2[1]['attributes'])
# node symb labeled
else:
kn = node_kernels['symb']
@@ -248,8 +247,8 @@ def structuralspkernel_do(Gn, splist, ds_attrs, node_label, edge_label,
vk_dict = {} # shortest path matrices dict
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn([n1[1]['attributes']],
[n2[1]['attributes']])
vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
n2[1]['attributes'])
# node unlabeled
else:
vk_dict = {}
@@ -265,7 +264,7 @@ def structuralspkernel_do(Gn, splist, ds_attrs, node_label, edge_label,
for e1, e2 in product(
g1.edges(data=True), g2.edges(data=True)):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
[e1[2]['attributes']], [e2[2]['attributes']])
e1[2]['attributes'], e2[2]['attributes'])
ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
@@ -288,7 +287,7 @@ def structuralspkernel_do(Gn, splist, ds_attrs, node_label, edge_label,
ek_dict = {}
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = kn([e1[2]['attributes']], [e2[2]['attributes']])
ek_temp = kn(e1[2]['attributes'], e2[2]['attributes'])
ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
@@ -374,8 +373,6 @@ def structuralspkernel_do(Gn, splist, ds_attrs, node_label, edge_label,
# print("toto")
# pass

if(kernel > 1):
print("kernel error : ", ij)
return iglobal, jglobal, kernel




+ 54
- 65
pygraph/utils/graphdataset.py View File

@@ -15,29 +15,29 @@ def get_dataset_attributes(Gn,
def get_dataset_size(Gn):
return len(Gn)

def get_all_graph_size(Gn):
def get_all_node_num(Gn):
return [nx.number_of_nodes(G) for G in Gn]

def get_ave_graph_size(all_graph_size):
return np.mean(all_graph_size)
def get_ave_node_num(all_node_num):
return np.mean(all_node_num)

def get_min_graph_size(all_graph_size):
return np.amin(all_graph_size)
def get_min_node_num(all_node_num):
return np.amin(all_node_num)

def get_max_graph_size(Gn):
return np.amax(all_graph_size)
def get_max_node_num(all_node_num):
return np.amax(all_node_num)

def get_all_graph_edge_num(Gn):
def get_all_edge_num(Gn):
return [nx.number_of_edges(G) for G in Gn]

def get_ave_graph_edge_num(all_graph_edge_num):
return np.mean(all_graph_edge_num)
def get_ave_edge_num(all_edge_num):
return np.mean(all_edge_num)

def get_min_graph_edge_num(all_graph_edge_num):
return np.amin(all_graph_edge_num)
def get_min_edge_num(all_edge_num):
return np.amin(all_edge_num)

def get_max_graph_edge_num(all_graph_edge_num):
return np.amax(all_graph_edge_num)
def get_max_edge_num(all_edge_num):
return np.amax(all_edge_num)

def is_node_labeled(Gn):
return False if node_label is None else True
@@ -60,13 +60,13 @@ def get_dataset_attributes(Gn,
def is_directed(Gn):
return nx.is_directed(Gn[0])

def get_ave_graph_degree(Gn):
def get_ave_node_degree(Gn):
return np.mean([np.amax(list(dict(G.degree()).values())) for G in Gn])

def get_max_graph_degree(Gn):
def get_max_node_degree(Gn):
return np.amax([np.amax(list(dict(G.degree()).values())) for G in Gn])

def get_min_graph_degree(Gn):
def get_min_node_degree(Gn):
return np.amin([np.amax(list(dict(G.degree()).values())) for G in Gn])

def get_substructures(Gn):
@@ -107,11 +107,11 @@ def get_dataset_attributes(Gn,
return len(set(target))

def get_node_attr_dim(Gn):
attrs = Gn[0].nodes[0]
if 'attributes' in attrs:
return len(attrs['attributes'])
else:
return 0
for G in Gn:
for n in G.nodes(data=True):
if 'attributes' in n[1]:
return len(n[1]['attributes'])
return 0

def get_edge_attr_dim(Gn):
for G in Gn:
@@ -119,8 +119,6 @@ def get_dataset_attributes(Gn,
for e in G.edges(data=True):
if 'attributes' in e[2]:
return len(e[2]['attributes'])
else:
return 0
return 0

if attr_names == []:
@@ -130,15 +128,15 @@ def get_dataset_attributes(Gn,
'edge_labeled',
'is_directed',
'dataset_size',
'ave_graph_size',
'min_graph_size',
'max_graph_size',
'ave_graph_edge_num',
'min_graph_edge_num',
'max_graph_edge_num',
'ave_graph_degree',
'min_graph_degree',
'max_graph_degree',
'ave_node_num',
'min_node_num',
'max_node_num',
'ave_edge_num',
'min_edge_num',
'max_edge_num',
'ave_node_degree',
'min_node_degree',
'max_node_degree',
'node_label_num',
'edge_label_num',
'node_attr_dim',
@@ -151,50 +149,41 @@ def get_dataset_attributes(Gn,

attrs.update({'dataset_size': get_dataset_size(Gn)})

# graph size
# graph node number
if any(i in attr_names
for i in ['ave_graph_size', 'min_graph_size', 'max_graph_size']):
for i in ['ave_node_num', 'min_node_num', 'max_node_num']):

all_graph_size = get_all_graph_size(Gn)
all_node_num = get_all_node_num(Gn)

if 'ave_graph_size' in attr_names:
if 'ave_node_num' in attr_names:

attrs.update({'ave_graph_size': get_ave_graph_size(all_graph_size)})
attrs.update({'ave_node_num': get_ave_node_num(all_node_num)})

if 'min_graph_size' in attr_names:
if 'min_node_num' in attr_names:

attrs.update({'min_graph_size': get_min_graph_size(all_graph_size)})
attrs.update({'min_node_num': get_min_node_num(all_node_num)})

if 'max_graph_size' in attr_names:
if 'max_node_num' in attr_names:

attrs.update({'max_graph_size': get_max_graph_size(all_graph_size)})
attrs.update({'max_node_num': get_max_node_num(all_node_num)})

# graph edge number
if any(i in attr_names for i in
['ave_graph_edge_num', 'min_graph_edge_num', 'max_graph_edge_num']):
['ave_edge_num', 'min_edge_num', 'max_edge_num']):

all_graph_edge_num = get_all_graph_edge_num(Gn)
all_edge_num = get_all_edge_num(Gn)

if 'ave_graph_edge_num' in attr_names:
if 'ave_edge_num' in attr_names:

attrs.update({
'ave_graph_edge_num':
get_ave_graph_edge_num(all_graph_edge_num)
})
attrs.update({'ave_edge_num': get_ave_edge_num(all_edge_num)})

if 'max_graph_edge_num' in attr_names:
if 'max_edge_num' in attr_names:

attrs.update({
'max_graph_edge_num':
get_max_graph_edge_num(all_graph_edge_num)
})
attrs.update({'max_edge_num': get_max_edge_num(all_edge_num)})

if 'min_graph_edge_num' in attr_names:
if 'min_edge_num' in attr_names:

attrs.update({
'min_graph_edge_num':
get_min_graph_edge_num(all_graph_edge_num)
})
attrs.update({'min_edge_num': get_min_edge_num(all_edge_num)})

# label number
if any(i in attr_names for i in ['node_labeled', 'node_label_num']):
@@ -222,14 +211,14 @@ def get_dataset_attributes(Gn,
if 'is_directed' in attr_names:
attrs.update({'is_directed': is_directed(Gn)})

if 'ave_graph_degree' in attr_names:
attrs.update({'ave_graph_degree': get_ave_graph_degree(Gn)})
if 'ave_node_degree' in attr_names:
attrs.update({'ave_node_degree': get_ave_node_degree(Gn)})

if 'max_graph_degree' in attr_names:
attrs.update({'max_graph_degree': get_max_graph_degree(Gn)})
if 'max_node_degree' in attr_names:
attrs.update({'max_node_degree': get_max_node_degree(Gn)})

if 'min_graph_degree' in attr_names:
attrs.update({'min_graph_degree': get_min_graph_degree(Gn)})
if 'min_node_degree' in attr_names:
attrs.update({'min_node_degree': get_min_node_degree(Gn)})

if 'substructures' in attr_names:
attrs.update({'substructures': get_substructures(Gn)})


+ 37
- 4
pygraph/utils/kernels.py View File

@@ -1,6 +1,7 @@
"""Those who are not graph kernels. We can be kernels for nodes or edges!
These kernels are defined between pairs of vectors.
"""
import numpy as np

def deltakernel(x, y):
"""Delta kernel. Return 1 if x == y, 0 otherwise.
@@ -17,15 +18,47 @@ def deltakernel(x, y):

References
----------
[1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between labeled graphs. In Proceedings of the 20th International Conference on Machine Learning, Washington, DC, United States, 2003.
[1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between
labeled graphs. In Proceedings of the 20th International Conference on
Machine Learning, Washington, DC, United States, 2003.
"""
return x == y #(1 if condition else 0)


def gaussiankernel(x, y, gamma=None):
    """Gaussian (RBF) kernel between two attribute vectors.

    Compute

        K(x, y) = exp(-gamma * ||x - y||^2)

    for one pair of 1-D vectors and return a scalar. This differs from
    sklearn.metrics.pairwise.rbf_kernel, which takes 2-D arrays of samples
    and returns a kernel matrix.

    Parameters
    ----------
    x : sequence of shape (n_features,)
        Every item is converted with float(), so numeric strings are accepted.

    y : sequence of shape (n_features,)
        Must have the same length as x.

    gamma : float, default None
        If None, defaults to 1.0 / n_features.

    Returns
    -------
    kernel : float
        1.0 when x and y are identical (including when both are empty).

    Raises
    ------
    ValueError
        If x and y do not have the same length.
    """
    xt = np.array([float(itm) for itm in x], dtype=float)
    yt = np.array([float(itm) for itm in y], dtype=float)
    # Check explicitly: numpy would otherwise broadcast a length-1 vector
    # against a longer one and silently return a meaningless value.
    if xt.shape != yt.shape:
        raise ValueError(
            'x and y must have the same length, got %d and %d.'
            % (xt.size, yt.size))

    if gamma is None:
        # Two empty vectors are at distance 0; any finite gamma gives 1.0,
        # so avoid the division by zero.
        gamma = 1.0 / xt.size if xt.size > 0 else 1.0

    return np.exp(-gamma * np.sum((xt - yt) ** 2))


def kernelsum(k1, k2, d11, d12, d21=None, d22=None, lamda1=1, lamda2=1):


Loading…
Cancel
Save