* MOD treelet kernel, retrieve canonkeys of all graphs before calculate kernels, wildly speed it up.v0.1
@@ -17,13 +17,14 @@ The criteria used for prediction are SVM for classification and kernel Ridge reg | |||||
For predition we randomly divide the data in train and test subset, where 90% of entire dataset is for training and rest for testing. 10 splits are performed. For each split, we first train on the train data, then evaluate the performance on the test set. We choose the optimal parameters for the test set and finally provide the corresponding performance. The final results correspond to the average of the performances on the test sets. | For predition we randomly divide the data in train and test subset, where 90% of entire dataset is for training and rest for testing. 10 splits are performed. For each split, we first train on the train data, then evaluate the performance on the test set. We choose the optimal parameters for the test set and finally provide the corresponding performance. The final results correspond to the average of the performances on the test sets. | ||||
| Kernels | RMSE(℃) | STD(℃) | Parameter | k_time | | |||||
|---------------|:---------:|:--------:|-------------:|-------:| | |||||
| Shortest path | 35.19 | 4.50 | - | 14.58" | | |||||
| Marginalized | 18.02 | 6.29 | p_quit = 0.1 | 4'19" | | |||||
| Path | 14.00 | 6.93 | - | 36.21" | | |||||
| WL subtree | 7.55 | 2.33 | height = 1 | 0.84" | | |||||
| Treelet | 8.31 | 3.38 | - | 49.58" | | |||||
| Kernels | RMSE(℃) | STD(℃) | Parameter | k_time | | |||||
|---------------|:-------:|:------:|-------------:|-------:| | |||||
| Shortest path | 35.19 | 4.50 | - | 14.58" | | |||||
| Marginalized | 18.02 | 6.29 | p_quit = 0.1 | 4'19" | | |||||
| Path | 14.00 | 6.93 | - | 36.21" | | |||||
| WL subtree | 7.55 | 2.33 | height = 1 | 0.84" | | |||||
| Treelet | 8.31 | 3.38 | - | 0.50" | | |||||
| Path up to d | 7.43 | 2.69 | depth = 2 | 0.59" | | |||||
* RMSE stands for arithmetic mean of the root mean squared errors on all splits. | * RMSE stands for arithmetic mean of the root mean squared errors on all splits. | ||||
* STD stands for standard deviation of the root mean squared errors on all splits. | * STD stands for standard deviation of the root mean squared errors on all splits. | ||||
@@ -44,6 +45,9 @@ For predition we randomly divide the data in train and test subset, where 90% of | |||||
[5] Gaüzère B, Brun L, Villemin D. Two new graphs kernels in chemoinformatics. Pattern Recognition Letters. 2012 Nov 1;33(15):2038-47. | [5] Gaüzère B, Brun L, Villemin D. Two new graphs kernels in chemoinformatics. Pattern Recognition Letters. 2012 Nov 1;33(15):2038-47. | ||||
## Updates | ## Updates | ||||
### 2018.01.24 | |||||
* ADD *path kernel up to depth d* and its result on dataset Asyclic. | |||||
* MOD treelet kernel, retrieve canonkeys of all graphs before calculate kernels, wildly speed it up. | |||||
### 2018.01.17 | ### 2018.01.17 | ||||
* ADD comments to code of treelet kernel. - linlin | * ADD comments to code of treelet kernel. - linlin | ||||
### 2018.01.16 | ### 2018.01.16 | ||||
@@ -2,23 +2,24 @@ | |||||
"cells": [ | "cells": [ | ||||
{ | { | ||||
"cell_type": "code", | "cell_type": "code", | ||||
"execution_count": 1, | |||||
"execution_count": 2, | |||||
"metadata": {}, | "metadata": {}, | ||||
"outputs": [ | "outputs": [ | ||||
{ | { | ||||
"name": "stdout", | "name": "stdout", | ||||
"output_type": "stream", | "output_type": "stream", | ||||
"text": [ | "text": [ | ||||
"The line_profiler extension is already loaded. To reload it, use:\n", | |||||
" %reload_ext line_profiler\n", | |||||
"\n", | "\n", | ||||
" --- This is a regression problem ---\n", | " --- This is a regression problem ---\n", | ||||
"\n", | "\n", | ||||
"\n", | "\n", | ||||
"\n", | |||||
" Loading dataset from file...\n", | " Loading dataset from file...\n", | ||||
"\n", | "\n", | ||||
" Calculating kernel matrix, this could take a while...\n", | " Calculating kernel matrix, this could take a while...\n", | ||||
"\n", | "\n", | ||||
" --- treelet kernel matrix of size 185 built in 50.925347328186035 seconds ---\n", | |||||
" --- treelet kernel matrix of size 185 built in 0.48417091369628906 seconds ---\n", | |||||
"[[ 4.00000000e+00 2.60653066e+00 1.00000000e+00 ..., 1.26641655e-14\n", | "[[ 4.00000000e+00 2.60653066e+00 1.00000000e+00 ..., 1.26641655e-14\n", | ||||
" 1.26641655e-14 1.26641655e-14]\n", | " 1.26641655e-14 1.26641655e-14]\n", | ||||
" [ 2.60653066e+00 6.00000000e+00 1.00000000e+00 ..., 1.26641655e-14\n", | " [ 2.60653066e+00 6.00000000e+00 1.00000000e+00 ..., 1.26641655e-14\n", | ||||
@@ -42,19 +43,18 @@ | |||||
"With standard deviation: 5.035844\n", | "With standard deviation: 5.035844\n", | ||||
"\n", | "\n", | ||||
"\n", | "\n", | ||||
" RMSE_test std_test RMSE_train std_train k_time\n", | |||||
" rmse_test std_test rmse_train std_train k_time\n", | |||||
"----------- ---------- ------------ ----------- --------\n", | "----------- ---------- ------------ ----------- --------\n", | ||||
" 10.0997 5.03584 2.68803 1.54162 50.9253\n", | |||||
" 10.0997 5.03584 2.68803 1.54162 0.484171\n", | |||||
"\n", | "\n", | ||||
" --- This is a regression problem ---\n", | " --- This is a regression problem ---\n", | ||||
"\n", | "\n", | ||||
"\n", | "\n", | ||||
"\n", | |||||
" Loading dataset from file...\n", | " Loading dataset from file...\n", | ||||
"\n", | "\n", | ||||
" Calculating kernel matrix, this could take a while...\n", | " Calculating kernel matrix, this could take a while...\n", | ||||
"\n", | "\n", | ||||
" --- treelet kernel matrix of size 185 built in 49.581383228302 seconds ---\n", | |||||
" --- treelet kernel matrix of size 185 built in 0.5003015995025635 seconds ---\n", | |||||
"[[ 4.00000000e+00 2.60653066e+00 1.00000000e+00 ..., 1.26641655e-14\n", | "[[ 4.00000000e+00 2.60653066e+00 1.00000000e+00 ..., 1.26641655e-14\n", | ||||
" 1.26641655e-14 1.26641655e-14]\n", | " 1.26641655e-14 1.26641655e-14]\n", | ||||
" [ 2.60653066e+00 6.00000000e+00 1.00000000e+00 ..., 1.26641655e-14\n", | " [ 2.60653066e+00 6.00000000e+00 1.00000000e+00 ..., 1.26641655e-14\n", | ||||
@@ -78,9 +78,9 @@ | |||||
"With standard deviation: 3.378376\n", | "With standard deviation: 3.378376\n", | ||||
"\n", | "\n", | ||||
"\n", | "\n", | ||||
" RMSE_test std_test RMSE_train std_train k_time\n", | |||||
" rmse_test std_test rmse_train std_train k_time\n", | |||||
"----------- ---------- ------------ ----------- --------\n", | "----------- ---------- ------------ ----------- --------\n", | ||||
" 8.3079 3.37838 2.90887 1.2679 49.5814\n" | |||||
" 8.3079 3.37838 2.90887 1.2679 0.500302\n" | |||||
] | ] | ||||
} | } | ||||
], | ], | ||||
@@ -97,7 +97,7 @@ | |||||
"\n", | "\n", | ||||
"kernel_para = dict(node_label = 'atom', edge_label = 'bond_type', labeled = True)\n", | "kernel_para = dict(node_label = 'atom', edge_label = 'bond_type', labeled = True)\n", | ||||
"\n", | "\n", | ||||
"# kernel_train_test(datafile, kernel_file_path, treeletkernel, kernel_para, normalize = False)\n", | |||||
"kernel_train_test(datafile, kernel_file_path, treeletkernel, kernel_para, normalize = False)\n", | |||||
"\n", | "\n", | ||||
"kernel_train_test(datafile, kernel_file_path, treeletkernel, kernel_para, normalize = True)\n", | "kernel_train_test(datafile, kernel_file_path, treeletkernel, kernel_para, normalize = True)\n", | ||||
"\n", | "\n", | ||||
@@ -116,12 +116,12 @@ | |||||
"# with y normalization\n", | "# with y normalization\n", | ||||
" RMSE_test std_test RMSE_train std_train k_time\n", | " RMSE_test std_test RMSE_train std_train k_time\n", | ||||
"----------- ---------- ------------ ----------- --------\n", | "----------- ---------- ------------ ----------- --------\n", | ||||
" 8.3079 3.37838 2.90887 1.2679 49.5814\n", | |||||
" 8.3079 3.37838 2.90887 1.2679 0.500302\n", | |||||
"\n", | "\n", | ||||
"# without y normalization\n", | "# without y normalization\n", | ||||
" RMSE_test std_test RMSE_train std_train k_time\n", | " RMSE_test std_test RMSE_train std_train k_time\n", | ||||
"----------- ---------- ------------ ----------- --------\n", | "----------- ---------- ------------ ----------- --------\n", | ||||
" 10.0997 5.03584 2.68803 1.54162 50.9253" | |||||
" 10.0997 5.03584 2.68803 1.54162 0.484171" | |||||
] | ] | ||||
}, | }, | ||||
{ | { | ||||
@@ -2,23 +2,24 @@ | |||||
"cells": [ | "cells": [ | ||||
{ | { | ||||
"cell_type": "code", | "cell_type": "code", | ||||
"execution_count": 1, | |||||
"execution_count": 2, | |||||
"metadata": {}, | "metadata": {}, | ||||
"outputs": [ | "outputs": [ | ||||
{ | { | ||||
"name": "stdout", | "name": "stdout", | ||||
"output_type": "stream", | "output_type": "stream", | ||||
"text": [ | "text": [ | ||||
"The line_profiler extension is already loaded. To reload it, use:\n", | |||||
" %reload_ext line_profiler\n", | |||||
"\n", | "\n", | ||||
" --- This is a regression problem ---\n", | " --- This is a regression problem ---\n", | ||||
"\n", | "\n", | ||||
"\n", | "\n", | ||||
"\n", | |||||
" Loading dataset from file...\n", | " Loading dataset from file...\n", | ||||
"\n", | "\n", | ||||
" Calculating kernel matrix, this could take a while...\n", | " Calculating kernel matrix, this could take a while...\n", | ||||
"\n", | "\n", | ||||
" --- treelet kernel matrix of size 185 built in 50.925347328186035 seconds ---\n", | |||||
" --- treelet kernel matrix of size 185 built in 0.48417091369628906 seconds ---\n", | |||||
"[[ 4.00000000e+00 2.60653066e+00 1.00000000e+00 ..., 1.26641655e-14\n", | "[[ 4.00000000e+00 2.60653066e+00 1.00000000e+00 ..., 1.26641655e-14\n", | ||||
" 1.26641655e-14 1.26641655e-14]\n", | " 1.26641655e-14 1.26641655e-14]\n", | ||||
" [ 2.60653066e+00 6.00000000e+00 1.00000000e+00 ..., 1.26641655e-14\n", | " [ 2.60653066e+00 6.00000000e+00 1.00000000e+00 ..., 1.26641655e-14\n", | ||||
@@ -42,19 +43,18 @@ | |||||
"With standard deviation: 5.035844\n", | "With standard deviation: 5.035844\n", | ||||
"\n", | "\n", | ||||
"\n", | "\n", | ||||
" RMSE_test std_test RMSE_train std_train k_time\n", | |||||
" rmse_test std_test rmse_train std_train k_time\n", | |||||
"----------- ---------- ------------ ----------- --------\n", | "----------- ---------- ------------ ----------- --------\n", | ||||
" 10.0997 5.03584 2.68803 1.54162 50.9253\n", | |||||
" 10.0997 5.03584 2.68803 1.54162 0.484171\n", | |||||
"\n", | "\n", | ||||
" --- This is a regression problem ---\n", | " --- This is a regression problem ---\n", | ||||
"\n", | "\n", | ||||
"\n", | "\n", | ||||
"\n", | |||||
" Loading dataset from file...\n", | " Loading dataset from file...\n", | ||||
"\n", | "\n", | ||||
" Calculating kernel matrix, this could take a while...\n", | " Calculating kernel matrix, this could take a while...\n", | ||||
"\n", | "\n", | ||||
" --- treelet kernel matrix of size 185 built in 49.581383228302 seconds ---\n", | |||||
" --- treelet kernel matrix of size 185 built in 0.5003015995025635 seconds ---\n", | |||||
"[[ 4.00000000e+00 2.60653066e+00 1.00000000e+00 ..., 1.26641655e-14\n", | "[[ 4.00000000e+00 2.60653066e+00 1.00000000e+00 ..., 1.26641655e-14\n", | ||||
" 1.26641655e-14 1.26641655e-14]\n", | " 1.26641655e-14 1.26641655e-14]\n", | ||||
" [ 2.60653066e+00 6.00000000e+00 1.00000000e+00 ..., 1.26641655e-14\n", | " [ 2.60653066e+00 6.00000000e+00 1.00000000e+00 ..., 1.26641655e-14\n", | ||||
@@ -78,9 +78,9 @@ | |||||
"With standard deviation: 3.378376\n", | "With standard deviation: 3.378376\n", | ||||
"\n", | "\n", | ||||
"\n", | "\n", | ||||
" RMSE_test std_test RMSE_train std_train k_time\n", | |||||
" rmse_test std_test rmse_train std_train k_time\n", | |||||
"----------- ---------- ------------ ----------- --------\n", | "----------- ---------- ------------ ----------- --------\n", | ||||
" 8.3079 3.37838 2.90887 1.2679 49.5814\n" | |||||
" 8.3079 3.37838 2.90887 1.2679 0.500302\n" | |||||
] | ] | ||||
} | } | ||||
], | ], | ||||
@@ -97,7 +97,7 @@ | |||||
"\n", | "\n", | ||||
"kernel_para = dict(node_label = 'atom', edge_label = 'bond_type', labeled = True)\n", | "kernel_para = dict(node_label = 'atom', edge_label = 'bond_type', labeled = True)\n", | ||||
"\n", | "\n", | ||||
"# kernel_train_test(datafile, kernel_file_path, treeletkernel, kernel_para, normalize = False)\n", | |||||
"kernel_train_test(datafile, kernel_file_path, treeletkernel, kernel_para, normalize = False)\n", | |||||
"\n", | "\n", | ||||
"kernel_train_test(datafile, kernel_file_path, treeletkernel, kernel_para, normalize = True)\n", | "kernel_train_test(datafile, kernel_file_path, treeletkernel, kernel_para, normalize = True)\n", | ||||
"\n", | "\n", | ||||
@@ -116,12 +116,12 @@ | |||||
"# with y normalization\n", | "# with y normalization\n", | ||||
" RMSE_test std_test RMSE_train std_train k_time\n", | " RMSE_test std_test RMSE_train std_train k_time\n", | ||||
"----------- ---------- ------------ ----------- --------\n", | "----------- ---------- ------------ ----------- --------\n", | ||||
" 8.3079 3.37838 2.90887 1.2679 49.5814\n", | |||||
" 8.3079 3.37838 2.90887 1.2679 0.500302\n", | |||||
"\n", | "\n", | ||||
"# without y normalization\n", | "# without y normalization\n", | ||||
" RMSE_test std_test RMSE_train std_train k_time\n", | " RMSE_test std_test RMSE_train std_train k_time\n", | ||||
"----------- ---------- ------------ ----------- --------\n", | "----------- ---------- ------------ ----------- --------\n", | ||||
" 10.0997 5.03584 2.68803 1.54162 50.9253" | |||||
" 10.0997 5.03584 2.68803 1.54162 0.484171" | |||||
] | ] | ||||
}, | }, | ||||
{ | { | ||||
@@ -0,0 +1 @@ | |||||
ljia@ljia-Precision-7520.5692:1516782025 |
@@ -10,7 +10,7 @@ from pygraph.kernels.deltaKernel import deltakernel | |||||
def pathkernel(*args, node_label = 'atom', edge_label = 'bond_type'): | def pathkernel(*args, node_label = 'atom', edge_label = 'bond_type'): | ||||
"""Calculate mean average path kernels between graphs. | """Calculate mean average path kernels between graphs. | ||||
Parameters | Parameters | ||||
---------- | ---------- | ||||
Gn : List of NetworkX graph | Gn : List of NetworkX graph | ||||
@@ -19,15 +19,15 @@ def pathkernel(*args, node_label = 'atom', edge_label = 'bond_type'): | |||||
G1, G2 : NetworkX graphs | G1, G2 : NetworkX graphs | ||||
2 graphs between which the kernel is calculated. | 2 graphs between which the kernel is calculated. | ||||
node_label : string | node_label : string | ||||
node attribute used as label. The default node label is atom. | |||||
node attribute used as label. The default node label is atom. | |||||
edge_label : string | edge_label : string | ||||
edge attribute used as label. The default edge label is bond_type. | edge attribute used as label. The default edge label is bond_type. | ||||
Return | Return | ||||
------ | ------ | ||||
Kmatrix/kernel : Numpy matrix/float | Kmatrix/kernel : Numpy matrix/float | ||||
Kernel matrix, each element of which is the path kernel between 2 praphs. / Path kernel between 2 graphs. | Kernel matrix, each element of which is the path kernel between 2 praphs. / Path kernel between 2 graphs. | ||||
References | References | ||||
---------- | ---------- | ||||
[1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For Measuring Similarity of Shapes. InESANN 2007 Apr 25 (pp. 355-360). | [1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For Measuring Similarity of Shapes. InESANN 2007 Apr 25 (pp. 355-360). | ||||
@@ -35,13 +35,13 @@ def pathkernel(*args, node_label = 'atom', edge_label = 'bond_type'): | |||||
some_graph = args[0][0] if len(args) == 1 else args[0] # only edge attributes of type int or float can be used as edge weight to calculate the shortest paths. | some_graph = args[0][0] if len(args) == 1 else args[0] # only edge attributes of type int or float can be used as edge weight to calculate the shortest paths. | ||||
some_weight = list(nx.get_edge_attributes(some_graph, edge_label).values())[0] | some_weight = list(nx.get_edge_attributes(some_graph, edge_label).values())[0] | ||||
weight = edge_label if isinstance(some_weight, float) or isinstance(some_weight, int) else None | weight = edge_label if isinstance(some_weight, float) or isinstance(some_weight, int) else None | ||||
if len(args) == 1: # for a list of graphs | if len(args) == 1: # for a list of graphs | ||||
Gn = args[0] | |||||
Gn = args[0] | |||||
Kmatrix = np.zeros((len(Gn), len(Gn))) | Kmatrix = np.zeros((len(Gn), len(Gn))) | ||||
start_time = time.time() | start_time = time.time() | ||||
for i in range(0, len(Gn)): | for i in range(0, len(Gn)): | ||||
for j in range(i, len(Gn)): | for j in range(i, len(Gn)): | ||||
Kmatrix[i][j] = _pathkernel_do(Gn[i], Gn[j], node_label, edge_label, weight = weight) | Kmatrix[i][j] = _pathkernel_do(Gn[i], Gn[j], node_label, edge_label, weight = weight) | ||||
@@ -49,34 +49,34 @@ def pathkernel(*args, node_label = 'atom', edge_label = 'bond_type'): | |||||
run_time = time.time() - start_time | run_time = time.time() - start_time | ||||
print("\n --- mean average path kernel matrix of size %d built in %s seconds ---" % (len(Gn), run_time)) | print("\n --- mean average path kernel matrix of size %d built in %s seconds ---" % (len(Gn), run_time)) | ||||
return Kmatrix, run_time | return Kmatrix, run_time | ||||
else: # for only 2 graphs | else: # for only 2 graphs | ||||
start_time = time.time() | start_time = time.time() | ||||
kernel = _pathkernel_do(args[0], args[1], node_label, edge_label, weight = weight) | kernel = _pathkernel_do(args[0], args[1], node_label, edge_label, weight = weight) | ||||
run_time = time.time() - start_time | run_time = time.time() - start_time | ||||
print("\n --- mean average path kernel built in %s seconds ---" % (run_time)) | print("\n --- mean average path kernel built in %s seconds ---" % (run_time)) | ||||
return kernel, run_time | return kernel, run_time | ||||
def _pathkernel_do(G1, G2, node_label = 'atom', edge_label = 'bond_type', weight = None): | def _pathkernel_do(G1, G2, node_label = 'atom', edge_label = 'bond_type', weight = None): | ||||
"""Calculate mean average path kernel between 2 graphs. | """Calculate mean average path kernel between 2 graphs. | ||||
Parameters | Parameters | ||||
---------- | ---------- | ||||
G1, G2 : NetworkX graphs | G1, G2 : NetworkX graphs | ||||
2 graphs between which the kernel is calculated. | 2 graphs between which the kernel is calculated. | ||||
node_label : string | node_label : string | ||||
node attribute used as label. The default node label is atom. | |||||
node attribute used as label. The default node label is atom. | |||||
edge_label : string | edge_label : string | ||||
edge attribute used as label. The default edge label is bond_type. | edge attribute used as label. The default edge label is bond_type. | ||||
weight : string/None | weight : string/None | ||||
edge attribute used as weight to calculate the shortest path. The default edge label is None. | edge attribute used as weight to calculate the shortest path. The default edge label is None. | ||||
Return | Return | ||||
------ | ------ | ||||
kernel : float | kernel : float | ||||
@@ -88,7 +88,7 @@ def _pathkernel_do(G1, G2, node_label = 'atom', edge_label = 'bond_type', weight | |||||
for node1 in range(num_nodes): | for node1 in range(num_nodes): | ||||
for node2 in range(node1 + 1, num_nodes): | for node2 in range(node1 + 1, num_nodes): | ||||
sp1.append(nx.shortest_path(G1, node1, node2, weight = weight)) | sp1.append(nx.shortest_path(G1, node1, node2, weight = weight)) | ||||
sp2 = [] | sp2 = [] | ||||
num_nodes = G2.number_of_nodes() | num_nodes = G2.number_of_nodes() | ||||
for node1 in range(num_nodes): | for node1 in range(num_nodes): | ||||
@@ -108,5 +108,5 @@ def _pathkernel_do(G1, G2, node_label = 'atom', edge_label = 'bond_type', weight | |||||
kernel += kernel_path # add up kernels of all paths | kernel += kernel_path # add up kernels of all paths | ||||
kernel = kernel / (len(sp1) * len(sp2)) # calculate mean average | kernel = kernel / (len(sp1) * len(sp2)) # calculate mean average | ||||
return kernel | |||||
return kernel |
@@ -7,13 +7,14 @@ For predition we randomly divide the data in train and test subset, where 90% of | |||||
## Summary | ## Summary | ||||
| Kernels | RMSE(℃) | STD(℃) | Parameter | k_time | | |||||
|---------------|:---------:|:--------:|-------------:|-------:| | |||||
| Shortest path | 35.19 | 4.50 | - | 14.58" | | |||||
| Marginalized | 18.02 | 6.29 | p_quit = 0.1 | 4'19" | | |||||
| Path | 14.00 | 6.94 | - | 37.58" | | |||||
| WL subtree | 7.55 | 2.33 | height = 1 | 0.84" | | |||||
| Treelet | 8.31 | 3.38 | - | 49.58" | | |||||
| Kernels | RMSE(℃) | STD(℃) | Parameter | k_time | | |||||
|---------------|:-------:|:------:|-------------:|-------:| | |||||
| Shortest path | 35.19 | 4.50 | - | 14.58" | | |||||
| Marginalized | 18.02 | 6.29 | p_quit = 0.1 | 4'19" | | |||||
| Path | 14.00 | 6.94 | - | 37.58" | | |||||
| WL subtree | 7.55 | 2.33 | height = 1 | 0.84" | | |||||
| Treelet | 8.31 | 3.38 | - | 0.50" | | |||||
| Path up to d | 7.43 | 2.69 | depth = 2 | 0.52" | | |||||
* RMSE stands for arithmetic mean of the root mean squared errors on all splits. | * RMSE stands for arithmetic mean of the root mean squared errors on all splits. | ||||
* STD stands for standard deviation of the root mean squared errors on all splits. | * STD stands for standard deviation of the root mean squared errors on all splits. | ||||
@@ -76,9 +77,46 @@ The table below shows the results of the WL subtree under different subtree heig | |||||
``` | ``` | ||||
### Treelet kernel | ### Treelet kernel | ||||
**The targets of training data are normalized before calculating the kernel.** | |||||
**The targets of training data are normalized before calculating the kernel.** | |||||
``` | ``` | ||||
RMSE_test std_test RMSE_train std_train k_time | RMSE_test std_test RMSE_train std_train k_time | ||||
----------- ---------- ------------ ----------- -------- | ----------- ---------- ------------ ----------- -------- | ||||
8.3079 3.37838 2.90887 1.2679 49.5814 | |||||
``` | |||||
8.3079 3.37838 2.90887 1.2679 0.500302 | |||||
``` | |||||
### Path kernel up to depth *d* | |||||
The table below shows the results of the path kernel up to different depth *d*. | |||||
The first table is the results using Tanimoto kernel, where **The targets of training data are normalized before calculating the kernel.**. | |||||
``` | |||||
depth rmse_test std_test rmse_train std_train k_time | |||||
------- ----------- ---------- ------------ ----------- --------- | |||||
0 41.6202 6.453 43.6169 2.13212 0.0904737 | |||||
1 38.8446 6.44648 40.8329 3.44147 0.175414 | |||||
2 35.2915 4.7813 35.7461 1.61134 0.344896 | |||||
3 29.4845 3.90351 28.4646 3.00137 0.553939 | |||||
4 22.6693 6.28053 19.2517 3.42893 0.770649 | |||||
5 21.7956 5.5225 16.886 2.60519 1.01558 | |||||
6 20.6049 5.49983 13.1097 2.58431 1.33302 | |||||
7 20.3479 5.17631 12.0152 2.5928 1.60266 | |||||
8 19.8228 5.13769 10.7981 2.13082 1.81218 | |||||
9 19.8734 5.10369 10.7997 2.09549 2.21726 | |||||
10 19.8708 5.09217 10.7787 2.10002 2.41006 | |||||
``` | |||||
The second table is the results using MinMax kernel. | |||||
``` | |||||
depth rmse_test std_test rmse_train std_train k_time | |||||
------- ----------- ---------- ------------ ----------- -------- | |||||
0 12.58 2.73235 12.1209 0.500467 0.377576 | |||||
1 12.6215 2.18866 10.2243 0.734261 0.456332 | |||||
2 7.42903 2.69395 2.71885 0.732922 0.585278 | |||||
3 9.02468 2.50808 1.54 1.13813 0.706556 | |||||
4 10.0811 3.6477 1.36029 1.42399 0.847957 | |||||
5 11.3005 4.44163 1.08518 1.06206 1.00086 | |||||
6 12.186 4.88816 1.06443 1.00191 1.19792 | |||||
7 12.7534 5.14529 1.19912 1.34031 1.4372 | |||||
8 13.0471 5.27184 1.35822 1.84315 1.68449 | |||||
9 13.1789 5.27707 1.36002 1.84834 1.96545 | |||||
10 13.2538 5.26425 1.36208 1.85426 2.24943 | |||||
``` |
@@ -38,9 +38,13 @@ def treeletkernel(*args, node_label = 'atom', edge_label = 'bond_type', labeled | |||||
start_time = time.time() | start_time = time.time() | ||||
# get all canonical keys of all graphs before calculating kernels to save time, but this may cost a lot of memory for large dataset. | |||||
canonkeys = [ get_canonkeys(Gn[i], node_label = node_label, edge_label = edge_label, labeled = labeled) \ | |||||
for i in range(0, len(Gn)) ] | |||||
for i in range(0, len(Gn)): | for i in range(0, len(Gn)): | ||||
for j in range(i, len(Gn)): | for j in range(i, len(Gn)): | ||||
Kmatrix[i][j] = _treeletkernel_do(Gn[i], Gn[j], node_label = node_label, edge_label = edge_label, labeled = labeled) | |||||
Kmatrix[i][j] = _treeletkernel_do(canonkeys[i], canonkeys[j], node_label = node_label, edge_label = edge_label, labeled = labeled) | |||||
Kmatrix[j][i] = Kmatrix[i][j] | Kmatrix[j][i] = Kmatrix[i][j] | ||||
run_time = time.time() - start_time | run_time = time.time() - start_time | ||||
@@ -51,8 +55,11 @@ def treeletkernel(*args, node_label = 'atom', edge_label = 'bond_type', labeled | |||||
else: # for only 2 graphs | else: # for only 2 graphs | ||||
start_time = time.time() | start_time = time.time() | ||||
canonkey1 = get_canonkeys(args[0], node_label = node_label, edge_label = edge_label, labeled = labeled) | |||||
canonkey2 = get_canonkeys(args[1], node_label = node_label, edge_label = edge_label, labeled = labeled) | |||||
kernel = _treeletkernel_do(args[0], args[1], node_label = node_label, edge_label = edge_label, labeled = labeled) | |||||
kernel = _treeletkernel_do(canonkey1, canonkey2, node_label = node_label, edge_label = edge_label, labeled = labeled) | |||||
run_time = time.time() - start_time | run_time = time.time() - start_time | ||||
print("\n --- treelet kernel built in %s seconds ---" % (run_time)) | print("\n --- treelet kernel built in %s seconds ---" % (run_time)) | ||||
@@ -60,17 +67,17 @@ def treeletkernel(*args, node_label = 'atom', edge_label = 'bond_type', labeled | |||||
return kernel, run_time | return kernel, run_time | ||||
def _treeletkernel_do(G1, G2, node_label = 'atom', edge_label = 'bond_type', labeled = True): | |||||
def _treeletkernel_do(canonkey1, canonkey2, node_label = 'atom', edge_label = 'bond_type', labeled = True): | |||||
"""Calculate treelet graph kernel between 2 graphs. | """Calculate treelet graph kernel between 2 graphs. | ||||
Parameters | Parameters | ||||
---------- | ---------- | ||||
G1, G2 : NetworkX graphs | |||||
2 graphs between which the kernel is calculated. | |||||
canonkey1, canonkey2 : list | |||||
List of canonical keys in 2 graphs, where each key is represented by a string. | |||||
node_label : string | node_label : string | ||||
node attribute used as label. The default node label is atom. | |||||
Node attribute used as label. The default node label is atom. | |||||
edge_label : string | edge_label : string | ||||
edge attribute used as label. The default edge label is bond_type. | |||||
Edge attribute used as label. The default edge label is bond_type. | |||||
labeled : boolean | labeled : boolean | ||||
Whether the graphs are labeled. The default is True. | Whether the graphs are labeled. The default is True. | ||||
@@ -79,12 +86,9 @@ def _treeletkernel_do(G1, G2, node_label = 'atom', edge_label = 'bond_type', lab | |||||
kernel : float | kernel : float | ||||
Treelet Kernel between 2 graphs. | Treelet Kernel between 2 graphs. | ||||
""" | """ | ||||
canonkey1 = get_canonkeys(G1, node_label = node_label, edge_label = edge_label, labeled = labeled) | |||||
canonkey2 = get_canonkeys(G2, node_label = node_label, edge_label = edge_label, labeled = labeled) | |||||
keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs | keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs | ||||
vector1 = np.matrix([ (canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys ]) | |||||
vector2 = np.matrix([ (canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys ]) | |||||
vector1 = np.array([ (canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys ]) | |||||
vector2 = np.array([ (canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys ]) | |||||
kernel = np.sum(np.exp(- np.square(vector1 - vector2) / 2)) | kernel = np.sum(np.exp(- np.square(vector1 - vector2) / 2)) | ||||
return kernel | return kernel | ||||
@@ -0,0 +1,203 @@ | |||||
import sys | |||||
import pathlib | |||||
sys.path.insert(0, "../") | |||||
import time | |||||
from collections import Counter | |||||
import networkx as nx | |||||
import numpy as np | |||||
def untildpathkernel(*args, node_label = 'atom', edge_label = 'bond_type', labeled = True, depth = 10, k_func = 'tanimoto'): | |||||
"""Calculate path graph kernels up to depth d between graphs. | |||||
Parameters | |||||
---------- | |||||
Gn : List of NetworkX graph | |||||
List of graphs between which the kernels are calculated. | |||||
/ | |||||
G1, G2 : NetworkX graphs | |||||
2 graphs between which the kernel is calculated. | |||||
node_label : string | |||||
node attribute used as label. The default node label is atom. | |||||
edge_label : string | |||||
edge attribute used as label. The default edge label is bond_type. | |||||
labeled : boolean | |||||
Whether the graphs are labeled. The default is True. | |||||
depth : integer | |||||
Depth of search. Longest length of paths. | |||||
k_func : function | |||||
A kernel function used using different notions of fingerprint similarity. | |||||
Return | |||||
------ | |||||
Kmatrix/kernel : Numpy matrix/float | |||||
Kernel matrix, each element of which is the path kernel up to d between 2 praphs. / Path kernel up to d between 2 graphs. | |||||
""" | |||||
depth = int(depth) | |||||
if len(args) == 1: # for a list of graphs | |||||
Gn = args[0] | |||||
Kmatrix = np.zeros((len(Gn), len(Gn))) | |||||
start_time = time.time() | |||||
# get all paths of all graphs before calculating kernels to save time, but this may cost a lot of memory for large dataset. | |||||
all_paths = [ find_all_paths_until_length(Gn[i], depth, node_label = node_label, edge_label = edge_label, labeled = labeled) for i in range(0, len(Gn)) ] | |||||
for i in range(0, len(Gn)): | |||||
for j in range(i, len(Gn)): | |||||
Kmatrix[i][j] = _untildpathkernel_do(all_paths[i], all_paths[j], k_func, node_label = node_label, edge_label = edge_label, labeled = labeled) | |||||
Kmatrix[j][i] = Kmatrix[i][j] | |||||
run_time = time.time() - start_time | |||||
print("\n --- kernel matrix of path kernel up to %d of size %d built in %s seconds ---" % (depth, len(Gn), run_time)) | |||||
return Kmatrix, run_time | |||||
else: # for only 2 graphs | |||||
start_time = time.time() | |||||
all_paths1 = find_all_paths_until_length(args[0], depth, node_label = node_label, edge_label = edge_label, labeled = labeled) | |||||
all_paths2 = find_all_paths_until_length(args[1], depth, node_label = node_label, edge_label = edge_label, labeled = labeled) | |||||
kernel = _untildpathkernel_do(all_paths1, all_paths2, k_func, node_label = node_label, edge_label = edge_label, labeled = labeled) | |||||
run_time = time.time() - start_time | |||||
print("\n --- path kernel up to %d built in %s seconds ---" % (depth, run_time)) | |||||
return kernel, run_time | |||||
def _untildpathkernel_do(paths1, paths2, k_func, node_label = 'atom', edge_label = 'bond_type', labeled = True): | |||||
"""Calculate path graph kernels up to depth d between 2 graphs. | |||||
Parameters | |||||
---------- | |||||
paths1, paths2 : list | |||||
List of paths in 2 graphs, where for unlabeled graphs, each path is represented by a list of nodes; while for labeled graphs, each path is represented by a string consists of labels of nodes and edges on that path. | |||||
k_func : function | |||||
A kernel function used using different notions of fingerprint similarity. | |||||
node_label : string | |||||
node attribute used as label. The default node label is atom. | |||||
edge_label : string | |||||
edge attribute used as label. The default edge label is bond_type. | |||||
labeled : boolean | |||||
Whether the graphs are labeled. The default is True. | |||||
Return | |||||
------ | |||||
kernel : float | |||||
Treelet Kernel between 2 graphs. | |||||
""" | |||||
all_paths = list(set(paths1 + paths2)) | |||||
if k_func == 'tanimoto': | |||||
vector1 = [ (1 if path in paths1 else 0) for path in all_paths ] | |||||
vector2 = [ (1 if path in paths2 else 0) for path in all_paths ] | |||||
kernel_uv = np.dot(vector1, vector2) | |||||
kernel = kernel_uv / (len(set(paths1)) + len(set(paths2)) - kernel_uv) | |||||
else: # MinMax kernel | |||||
path_count1 = Counter(paths1) | |||||
path_count2 = Counter(paths2) | |||||
vector1 = [ (path_count1[key] if (key in path_count1.keys()) else 0) for key in all_paths ] | |||||
vector2 = [ (path_count2[key] if (key in path_count2.keys()) else 0) for key in all_paths ] | |||||
kernel = np.sum(np.minimum(vector1, vector2)) / np.sum(np.maximum(vector1, vector2)) | |||||
return kernel | |||||
# NOTE: paths are re-enumerated once per length from 0 to `length`, so the
# depth-first search repeats work for every prefix length; it could be faster.
def find_all_paths_until_length(G, length, node_label = 'atom', edge_label = 'bond_type', labeled = True):
    """Find all paths with a certain maximum length in a graph. A recursive depth first search is applied.

    Parameters
    ----------
    G : NetworkX graphs
        The graph in which paths are searched.
    length : integer
        The maximum length of paths.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.
    labeled : boolean
        Whether the graphs are labeled. The default is True.

    Return
    ------
    path : list
        List of paths retrieved, where for unlabeled graphs, each path is represented by a list of nodes; while for labeled graphs, each path is represented by a string consists of labels of nodes and edges on that path.
    """
    all_paths = []
    for i in range(0, length + 1):
        new_paths = find_all_paths(G, i)
        if new_paths == []:
            # No path of length i exists, so no longer path can exist either.
            break
        all_paths.extend(new_paths)
    if labeled:  # convert each node path to a string of node and edge labels
        path_strs = []
        for path in all_paths:
            # Walk consecutive node pairs with zip() instead of the previous
            # path.index(node) lookup, which cost O(n) per node and relied on
            # nodes appearing only once in the path.
            # (G.node is the NetworkX 1.x node-attribute API used by this file.)
            strlist = [G.node[u][node_label] + G[u][v][edge_label]
                       for u, v in zip(path[:-1], path[1:])]
            path_strs.append(''.join(strlist) + G.node[path[-1]][node_label])
        return path_strs
    return all_paths
def find_paths(G, source_node, length):
    """Find all paths with a certain length those start from a source node. A recursive depth first search is applied.

    Parameters
    ----------
    G : NetworkX graphs
        The graph in which paths are searched.
    source_node : integer
        The number of the node from where all paths start.
    length : integer
        The length of paths.

    Return
    ------
    path : list of list
        List of paths retrieved, where each path is represented by a list of nodes.
    """
    # A path of length 0 is just the source node itself.
    if length == 0:
        return [[source_node]]
    paths = []
    # Extend every simple path of length - 1 starting at each neighbor,
    # skipping extensions that would revisit the source node.
    for neighbor in G[source_node]:
        for tail in find_paths(G, neighbor, length - 1):
            if source_node not in tail:
                paths.append([source_node] + tail)
    return paths
def find_all_paths(G, length):
    """Find all paths with a certain length in a graph. A recursive depth first search is applied.

    Parameters
    ----------
    G : NetworkX graphs
        The graph in which paths are searched.
    length : integer
        The length of paths.

    Return
    ------
    path : list of list
        List of paths retrieved, where each path is represented by a list of nodes.
    """
    # Every path is discovered twice, once from each of its two end nodes.
    # The original article removes one of the two orientations; that
    # deduplication is deliberately NOT performed here.
    return [path for node in G for path in find_paths(G, node, length)]
        test_stds_list.append(test_std)
        print('\n')
    table_dict = {'rmse_test': test_means_list, 'std_test': test_stds_list, \
        'rmse_train': train_means_list, 'std_train': train_stds_list, 'k_time': kernel_time_list}
    if hyper_name == '':
        keyorder = ['rmse_test', 'std_test', 'rmse_train', 'std_train', 'k_time']
    else:
        table_dict[hyper_name] = hyper_range