@@ -1,6 +1,6 @@ | |||
# About graph kernels.
## (Random walk) Sylvester equation kernel.
### ImportError: cannot import name 'frange' from 'matplotlib.mlab' | |||
@@ -10,7 +10,7 @@ Update your `control` package. | |||
### Intel MKL FATAL ERROR: Cannot load libmkl_avx2.so or libmkl_def.so. | |||
The Intel Math Kernel Library (MKL) is missing or not properly set. I assume MKL is required by the `control` module.
Install MKL. Then add the following to your path: | |||
@@ -18,4 +18,6 @@ Install MKL. Then add the following to your path: | |||
export PATH=/opt/intel/bin:$PATH | |||
export LD_LIBRARY_PATH=/opt/intel/lib/intel64:/opt/intel/mkl/lib/intel64:$LD_LIBRARY_PATH | |||
export LD_PRELOAD=/opt/intel/mkl/lib/intel64/libmkl_def.so:/opt/intel/mkl/lib/intel64/libmkl_avx2.so:/opt/intel/mkl/lib/intel64/libmkl_core.so:/opt/intel/mkl/lib/intel64/libmkl_intel_lp64.so:/opt/intel/mkl/lib/intel64/libmkl_intel_thread.so:/opt/intel/lib/intel64_lin/libiomp5.so | |||
``` |
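One way to check whether NumPy is actually linked against MKL is to print its build configuration:

```
import numpy

# Shows the BLAS/LAPACK libraries NumPy was built against;
# entries mentioning "mkl" indicate that MKL is in use.
numpy.show_config()
```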
@@ -60,7 +60,7 @@ Check [`notebooks`](https://github.com/jajupmochi/graphkit-learn/tree/master/not | |||
The docs of the library can be found [here](https://graphkit-learn.readthedocs.io/en/master/?badge=master). | |||
## Main contents
### 1 List of graph kernels | |||
@@ -131,6 +131,20 @@ A comparison of performances of graph kernels on benchmark datasets can be found | |||
Fork the library and open a pull request! Make your own contribution to the community!
## Authors | |||
* [Linlin Jia](https://jajupmochi.github.io/), LITIS, INSA Rouen Normandie | |||
* [Benoit Gaüzère](http://pagesperso.litislab.fr/~bgauzere/#contact_en), LITIS, INSA Rouen Normandie | |||
* [Paul Honeine](http://honeine.fr/paul/Welcome.html), LITIS, Université de Rouen Normandie | |||
## Citation | |||
Still waiting... | |||
## Acknowledgments | |||
This research was supported by CSC (China Scholarship Council) and the French national research agency (ANR) under the grant APi (ANR-18-CE23-0014). The authors would like to thank the CRIANN (Le Centre Régional Informatique et d’Applications Numériques de Normandie) for providing computational resources. | |||
## References | |||
[1] Thomas Gärtner, Peter Flach, and Stefan Wrobel. On graph kernels: Hardness results and efficient alternatives. Learning Theory and Kernel Machines, pages 129–143, 2003. | |||
@@ -153,17 +167,3 @@ Fork the library and open a pull request! Make your own contribute to the commun | |||
[10] Gaüzere, B., Brun, L., Villemin, D., 2012. Two new graphs kernels in chemoinformatics. Pattern Recognition Letters 33, 2038–2047. | |||
[11] Shervashidze, N., Schweitzer, P., Leeuwen, E.J.v., Mehlhorn, K., Borgwardt, K.M., 2011. Weisfeiler-lehman graph kernels. Journal of Machine Learning Research 12, 2539–2561. | |||
## Authors | |||
* [Linlin Jia](https://jajupmochi.github.io/), LITIS, INSA Rouen Normandie | |||
* [Benoit Gaüzère](http://pagesperso.litislab.fr/~bgauzere/#contact_en), LITIS, INSA Rouen Normandie | |||
* [Paul Honeine](http://honeine.fr/paul/Welcome.html), LITIS, Université de Rouen Normandie | |||
## Citation | |||
Still waiting... | |||
## Acknowledgments | |||
This research was supported by CSC (China Scholarship Council) and the French national research agency (ANR) under the grant APi (ANR-18-CE23-0014). The authors would like to thank the CRIANN (Le Centre Régional Informatique et d’Applications Numériques de Normandie) for providing computational resources. |
@@ -10,19 +10,12 @@ from gklearn.utils.graphdataset import load_predefined_dataset | |||
import logging | |||
# def get_graphs(ds_name): | |||
# from gklearn.utils.graph_synthesizer import GraphSynthesizer | |||
# gsyzer = GraphSynthesizer() | |||
# graphs = gsyzer.unified_graphs(num_graphs=100, num_nodes=num_nodes, num_edges=int(num_nodes*2), num_node_labels=0, num_edge_labels=0, seed=None, directed=False) | |||
# return graphs | |||
def xp_runtimes_of_all_7cores(): | |||
def xp_runtimes_of_all_28cores(): | |||
# Run and save. | |||
import pickle | |||
import os | |||
save_dir = 'outputs/runtimes_of_all_7cores/' | |||
save_dir = 'outputs/runtimes_of_all_28cores/' | |||
if not os.path.exists(save_dir): | |||
os.makedirs(save_dir) | |||
@@ -41,16 +34,16 @@ def xp_runtimes_of_all_7cores(): | |||
graphs, _ = load_predefined_dataset(ds_name) | |||
# Compute Gram matrix. | |||
run_time = 'error' | |||
try: | |||
gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name, n_jobs=28) | |||
run_times[kernel_name].append(run_time) | |||
except Exception as exp: | |||
run_times[kernel_name].append('error') | |||
print('An exception occurred when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt' | |||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||
logging.exception('') | |||
print(repr(exp)) | |||
run_times[kernel_name].append(run_time) | |||
pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' + ds_name + '.pkl', 'wb')) | |||
@@ -61,4 +54,4 @@ def xp_runtimes_of_all_7cores(): | |||
if __name__ == '__main__': | |||
xp_runtimes_of_all_7cores() | |||
xp_runtimes_of_all_28cores() |
@@ -0,0 +1,62 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Mon Sep 21 10:34:26 2020 | |||
@author: ljia | |||
""" | |||
from utils import Graph_Kernel_List, Dataset_List, compute_graph_kernel | |||
from gklearn.utils.graphdataset import load_predefined_dataset | |||
import logging | |||
def xp_runtimes_diff_chunksizes(): | |||
# Run and save. | |||
import pickle | |||
import os | |||
save_dir = 'outputs/runtimes_diff_chunksizes/' | |||
if not os.path.exists(save_dir): | |||
os.makedirs(save_dir) | |||
run_times = {} | |||
for kernel_name in Graph_Kernel_List: | |||
print() | |||
print('Kernel:', kernel_name) | |||
run_times[kernel_name] = [] | |||
for ds_name in Dataset_List: | |||
print() | |||
print('Dataset:', ds_name) | |||
run_times[kernel_name].append([]) | |||
for chunksize in [1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000]: | |||
print() | |||
print('Chunksize:', chunksize) | |||
# get graphs. | |||
graphs, _ = load_predefined_dataset(ds_name) | |||
# Compute Gram matrix. | |||
run_time = 'error' | |||
try: | |||
gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name, chunksize=chunksize) | |||
except Exception as exp: | |||
print('An exception occurred when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt' | |||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||
logging.exception('') | |||
print(repr(exp)) | |||
run_times[kernel_name][-1].append(run_time) | |||
pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' + ds_name + '.' + str(chunksize) + '.pkl', 'wb')) | |||
# Save all. | |||
pickle.dump(run_times, open(save_dir + 'run_times.pkl', 'wb')) | |||
return | |||
if __name__ == '__main__': | |||
xp_runtimes_diff_chunksizes() |
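# Note (illustrative, not part of the original script): the pickled results can
# be reloaded later for analysis or plotting, e.g.
#
#   import pickle
#   with open('outputs/runtimes_diff_chunksizes/run_times.pkl', 'rb') as f:
#       run_times = pickle.load(f)
#   # run_times[kernel_name][dataset_index] holds one runtime (or 'error')
#   # per chunksize tried above.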
@@ -41,16 +41,16 @@ def xp_synthesied_graphs_dataset_size(): | |||
sub_graphs = [g.copy() for g in graphs[0:num_graphs]] | |||
run_time = 'error' | |||
try: | |||
gram_matrix, run_time = compute_graph_kernel(sub_graphs, kernel_name, n_jobs=1) | |||
run_times[kernel_name].append(run_time) | |||
except Exception as exp: | |||
run_times[kernel_name].append('error') | |||
print('An exception occurred when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt' | |||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||
logging.exception('') | |||
print(repr(exp)) | |||
run_times[kernel_name].append(run_time) | |||
pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' + str(num_graphs) + '.pkl', 'wb')) | |||
@@ -40,16 +40,16 @@ def xp_synthesied_graphs_degrees(): | |||
graphs = generate_graphs(degree) | |||
# Compute Gram matrix. | |||
run_time = 'error' | |||
try: | |||
gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name, n_jobs=1) | |||
run_times[kernel_name].append(run_time) | |||
except Exception as exp: | |||
run_times[kernel_name].append('error') | |||
print('An exception occurred when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt' | |||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||
logging.exception('') | |||
print(repr(exp)) | |||
run_times[kernel_name].append(run_time) | |||
pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' + str(degree) + '.pkl', 'wb')) | |||
@@ -40,16 +40,16 @@ def xp_synthesied_graphs_num_edge_label_alphabet(): | |||
graphs = generate_graphs(num_el_alp) | |||
# Compute Gram matrix. | |||
run_time = 'error' | |||
try: | |||
gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name, n_jobs=1) | |||
run_times[kernel_name].append(run_time) | |||
except Exception as exp: | |||
run_times[kernel_name].append('error') | |||
print('An exception occurred when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt' | |||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||
logging.exception('') | |||
print(repr(exp)) | |||
run_times[kernel_name].append(run_time) | |||
pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' + str(num_el_alp) + '.pkl', 'wb')) | |||
@@ -40,9 +40,9 @@ def xp_synthesied_graphs_num_node_label_alphabet(): | |||
graphs = generate_graphs(num_nl_alp) | |||
# Compute Gram matrix. | |||
run_time = 'error' | |||
try: | |||
gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name, n_jobs=1) | |||
run_times[kernel_name].append(run_time) | |||
except Exception as exp: | |||
run_times[kernel_name].append('error') | |||
print('An exception occurred when running this experiment:')
@@ -50,6 +50,7 @@ def xp_synthesied_graphs_num_node_label_alphabet(): | |||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||
logging.exception('') | |||
print(repr(exp)) | |||
run_times[kernel_name].append(run_time) | |||
pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' + str(num_nl_alp) + '.pkl', 'wb')) | |||
@@ -40,9 +40,9 @@ def xp_synthesied_graphs_num_nodes(): | |||
graphs = generate_graphs(num_nodes) | |||
# Compute Gram matrix. | |||
run_time = 'error' | |||
try: | |||
gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name, n_jobs=1) | |||
run_times[kernel_name].append(run_time) | |||
except Exception as exp: | |||
run_times[kernel_name].append('error') | |||
print('An exception occurred when running this experiment:')
@@ -50,6 +50,7 @@ def xp_synthesied_graphs_num_nodes(): | |||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||
logging.exception('') | |||
print(repr(exp)) | |||
run_times[kernel_name].append(run_time) | |||
pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' + str(num_nodes) + '.pkl', 'wb')) | |||
@@ -27,7 +27,7 @@ Graph_Kernel_List_ECon = ['ConjugateGradient', 'FixedPoint', 'StructuralSP'] | |||
Dataset_List = ['Alkane', 'Acyclic', 'MAO', 'PAH', 'MUTAG', 'Letter-med', 'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD'] | |||
def compute_graph_kernel(graphs, kernel_name, n_jobs=multiprocessing.cpu_count()): | |||
def compute_graph_kernel(graphs, kernel_name, n_jobs=multiprocessing.cpu_count(), chunksize=None): | |||
if kernel_name == 'CommonWalk': | |||
from gklearn.kernels.commonWalkKernel import commonwalkkernel | |||
@@ -105,6 +105,7 @@ def compute_graph_kernel(graphs, kernel_name, n_jobs=multiprocessing.cpu_count() | |||
# params['parallel'] = None | |||
params['n_jobs'] = n_jobs | |||
params['chunksize'] = chunksize | |||
params['verbose'] = True | |||
results = estimator(graphs, **params) | |||
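# Example (hypothetical call, mirroring how the experiment scripts above use
# this helper): time one kernel on one dataset with an explicit chunk size.
#
#   graphs, _ = load_predefined_dataset('MUTAG')
#   gram_matrix, run_time = compute_graph_kernel(graphs, 'FixedPoint',
#                                                n_jobs=4, chunksize=100)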
@@ -3,9 +3,9 @@ | |||
@references: | |||
[1] Thomas Gärtner, Peter Flach, and Stefan Wrobel. On graph kernels: | |||
Hardness results and efficient alternatives. Learning Theory and Kernel | |||
Machines, pages 129–143, 2003. | |||
[1] Thomas Gärtner, Peter Flach, and Stefan Wrobel. On graph kernels: | |||
Hardness results and efficient alternatives. Learning Theory and Kernel | |||
Machines, pages 129–143, 2003. | |||
""" | |||
import sys | |||
@@ -22,428 +22,429 @@ from gklearn.utils.parallel import parallel_gm | |||
def commonwalkkernel(*args, | |||
node_label='atom', | |||
edge_label='bond_type', | |||
# n=None, | |||
weight=1, | |||
compute_method=None, | |||
n_jobs=None, | |||
verbose=True): | |||
"""Calculate common walk graph kernels between graphs. | |||
Parameters | |||
---------- | |||
Gn : List of NetworkX graph | |||
List of graphs between which the kernels are calculated. | |||
G1, G2 : NetworkX graphs | |||
Two graphs between which the kernel is calculated. | |||
node_label : string | |||
Node attribute used as symbolic label. The default node label is 'atom'. | |||
edge_label : string | |||
Edge attribute used as symbolic label. The default edge label is 'bond_type'. | |||
weight: integer | |||
Weight coefficient of different lengths of walks, which represents beta | |||
in 'exp' method and gamma in 'geo'. | |||
compute_method : string | |||
Method used to compute the walk kernel. The following choices are
available:
'exp': method based on exponential series applied on the direct
product graph, as shown in reference [1]. The time complexity is O(n^6)
for graphs with n vertices.
'geo': method based on geometric series applied on the direct product
graph, as shown in reference [1]. The time complexity is O(n^6) for | |||
graphs with n vertices. | |||
n_jobs : int | |||
Number of jobs for parallelization. | |||
Return | |||
------ | |||
Kmatrix : Numpy matrix | |||
Kernel matrix, each element of which is a common walk kernel between 2 | |||
graphs. | |||
""" | |||
# n : integer | |||
# Longest length of walks. Only useful when applying the 'brute' method. | |||
# 'brute': brute force, simply search for all walks and compare them. | |||
compute_method = compute_method.lower() | |||
# arrange all graphs in a list | |||
Gn = args[0] if len(args) == 1 else [args[0], args[1]] | |||
# remove graphs with only 1 node, as they do not have adjacency matrices | |||
len_gn = len(Gn) | |||
Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_nodes(G) != 1] | |||
idx = [G[0] for G in Gn] | |||
Gn = [G[1] for G in Gn] | |||
if len(Gn) != len_gn: | |||
if verbose: | |||
print('\n %d graphs are removed as they have only 1 node.\n' % | |||
(len_gn - len(Gn))) | |||
ds_attrs = get_dataset_attributes( | |||
Gn, | |||
attr_names=['node_labeled', 'edge_labeled', 'is_directed'], | |||
node_label=node_label, edge_label=edge_label) | |||
if not ds_attrs['node_labeled']: | |||
for G in Gn: | |||
nx.set_node_attributes(G, '0', 'atom') | |||
if not ds_attrs['edge_labeled']: | |||
for G in Gn: | |||
nx.set_edge_attributes(G, '0', 'bond_type') | |||
if not ds_attrs['is_directed']: # convert | |||
Gn = [G.to_directed() for G in Gn] | |||
start_time = time.time() | |||
Kmatrix = np.zeros((len(Gn), len(Gn))) | |||
# ---- use pool.imap_unordered to parallel and track progress. ---- | |||
def init_worker(gn_toshare): | |||
global G_gn | |||
G_gn = gn_toshare | |||
# direct product graph method - exponential | |||
if compute_method == 'exp': | |||
do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight) | |||
# direct product graph method - geometric | |||
elif compute_method == 'geo': | |||
do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight) | |||
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | |||
glbv=(Gn,), n_jobs=n_jobs, verbose=verbose) | |||
# pool = Pool(n_jobs) | |||
# itr = zip(combinations_with_replacement(Gn, 2), | |||
# combinations_with_replacement(range(0, len(Gn)), 2)) | |||
# len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | |||
# if len_itr < 1000 * n_jobs: | |||
# chunksize = int(len_itr / n_jobs) + 1 | |||
# else: | |||
# chunksize = 1000 | |||
node_label='atom', | |||
edge_label='bond_type', | |||
# n=None, | |||
weight=1, | |||
compute_method=None, | |||
n_jobs=None, | |||
chunksize=None, | |||
verbose=True): | |||
"""Calculate common walk graph kernels between graphs. | |||
Parameters | |||
---------- | |||
Gn : List of NetworkX graph | |||
List of graphs between which the kernels are calculated. | |||
G1, G2 : NetworkX graphs | |||
Two graphs between which the kernel is calculated. | |||
node_label : string | |||
Node attribute used as symbolic label. The default node label is 'atom'. | |||
edge_label : string | |||
Edge attribute used as symbolic label. The default edge label is 'bond_type'. | |||
weight: integer | |||
Weight coefficient of different lengths of walks, which represents beta | |||
in 'exp' method and gamma in 'geo'. | |||
compute_method : string | |||
Method used to compute the walk kernel. The following choices are
available:
'exp': method based on exponential series applied on the direct
product graph, as shown in reference [1]. The time complexity is O(n^6)
for graphs with n vertices.
'geo': method based on geometric series applied on the direct product
graph, as shown in reference [1]. The time complexity is O(n^6) for | |||
graphs with n vertices. | |||
n_jobs : int | |||
Number of jobs for parallelization. | |||
Return | |||
------ | |||
Kmatrix : Numpy matrix | |||
Kernel matrix, each element of which is a common walk kernel between 2 | |||
graphs. | |||
""" | |||
# n : integer | |||
# Longest length of walks. Only useful when applying the 'brute' method. | |||
# 'brute': brute force, simply search for all walks and compare them. | |||
compute_method = compute_method.lower() | |||
# arrange all graphs in a list | |||
Gn = args[0] if len(args) == 1 else [args[0], args[1]] | |||
# remove graphs with only 1 node, as they do not have adjacency matrices | |||
len_gn = len(Gn) | |||
Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_nodes(G) != 1] | |||
idx = [G[0] for G in Gn] | |||
Gn = [G[1] for G in Gn] | |||
if len(Gn) != len_gn: | |||
if verbose: | |||
print('\n %d graphs are removed as they have only 1 node.\n' % | |||
(len_gn - len(Gn))) | |||
ds_attrs = get_dataset_attributes( | |||
Gn, | |||
attr_names=['node_labeled', 'edge_labeled', 'is_directed'], | |||
node_label=node_label, edge_label=edge_label) | |||
if not ds_attrs['node_labeled']: | |||
for G in Gn: | |||
nx.set_node_attributes(G, '0', 'atom') | |||
if not ds_attrs['edge_labeled']: | |||
for G in Gn: | |||
nx.set_edge_attributes(G, '0', 'bond_type') | |||
if not ds_attrs['is_directed']: # convert | |||
Gn = [G.to_directed() for G in Gn] | |||
start_time = time.time() | |||
Kmatrix = np.zeros((len(Gn), len(Gn))) | |||
# ---- use pool.imap_unordered to parallel and track progress. ---- | |||
def init_worker(gn_toshare): | |||
global G_gn | |||
G_gn = gn_toshare | |||
# direct product graph method - exponential | |||
if compute_method == 'exp': | |||
do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight) | |||
# direct product graph method - geometric | |||
elif compute_method == 'geo': | |||
do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight) | |||
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | |||
glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) | |||
# pool = Pool(n_jobs) | |||
# itr = zip(combinations_with_replacement(Gn, 2), | |||
# combinations_with_replacement(range(0, len(Gn)), 2)) | |||
# len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | |||
# if len_itr < 1000 * n_jobs: | |||
# chunksize = int(len_itr / n_jobs) + 1 | |||
# else: | |||
# chunksize = 1000 | |||
# | |||
# # direct product graph method - exponential | |||
# if compute_method == 'exp': | |||
# do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight) | |||
# # direct product graph method - geometric | |||
# elif compute_method == 'geo': | |||
# do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight) | |||
# # direct product graph method - exponential | |||
# if compute_method == 'exp': | |||
# do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight) | |||
# # direct product graph method - geometric | |||
# elif compute_method == 'geo': | |||
# do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight) | |||
# | |||
# for i, j, kernel in tqdm( | |||
# pool.imap_unordered(do_partial, itr, chunksize), | |||
# desc='calculating kernels', | |||
# file=sys.stdout): | |||
# Kmatrix[i][j] = kernel | |||
# Kmatrix[j][i] = kernel | |||
# pool.close() | |||
# pool.join() | |||
# # ---- direct running, normally use single CPU core. ---- | |||
# # direct product graph method - exponential | |||
# itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||
# if compute_method == 'exp': | |||
# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout): | |||
# Kmatrix[i][j] = _commonwalkkernel_exp(Gn[i], Gn[j], node_label, | |||
# edge_label, weight) | |||
# Kmatrix[j][i] = Kmatrix[i][j] | |||
# for i, j, kernel in tqdm( | |||
# pool.imap_unordered(do_partial, itr, chunksize), | |||
# desc='calculating kernels', | |||
# file=sys.stdout): | |||
# Kmatrix[i][j] = kernel | |||
# Kmatrix[j][i] = kernel | |||
# pool.close() | |||
# pool.join() | |||
# # ---- direct running, normally use single CPU core. ---- | |||
# # direct product graph method - exponential | |||
# itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||
# if compute_method == 'exp': | |||
# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout): | |||
# Kmatrix[i][j] = _commonwalkkernel_exp(Gn[i], Gn[j], node_label, | |||
# edge_label, weight) | |||
# Kmatrix[j][i] = Kmatrix[i][j] | |||
# | |||
# # direct product graph method - geometric | |||
# elif compute_method == 'geo': | |||
# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout): | |||
# Kmatrix[i][j] = _commonwalkkernel_geo(Gn[i], Gn[j], node_label, | |||
# edge_label, weight) | |||
# Kmatrix[j][i] = Kmatrix[i][j] | |||
# # search all paths use brute force. | |||
# elif compute_method == 'brute': | |||
# n = int(n) | |||
# # get all paths of all graphs before calculating kernels to save time, but this may cost a lot of memory for large dataset. | |||
# all_walks = [ | |||
# find_all_walks_until_length(Gn[i], n, node_label, edge_label) | |||
# for i in range(0, len(Gn)) | |||
# ] | |||
# # direct product graph method - geometric | |||
# elif compute_method == 'geo': | |||
# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout): | |||
# Kmatrix[i][j] = _commonwalkkernel_geo(Gn[i], Gn[j], node_label, | |||
# edge_label, weight) | |||
# Kmatrix[j][i] = Kmatrix[i][j] | |||
# # search all paths use brute force. | |||
# elif compute_method == 'brute': | |||
# n = int(n) | |||
# # get all paths of all graphs before calculating kernels to save time, but this may cost a lot of memory for large dataset. | |||
# all_walks = [ | |||
# find_all_walks_until_length(Gn[i], n, node_label, edge_label) | |||
# for i in range(0, len(Gn)) | |||
# ] | |||
# | |||
# for i in range(0, len(Gn)): | |||
# for j in range(i, len(Gn)): | |||
# Kmatrix[i][j] = _commonwalkkernel_brute( | |||
# all_walks[i], | |||
# all_walks[j], | |||
# node_label=node_label, | |||
# edge_label=edge_label) | |||
# Kmatrix[j][i] = Kmatrix[i][j] | |||
# for i in range(0, len(Gn)): | |||
# for j in range(i, len(Gn)): | |||
# Kmatrix[i][j] = _commonwalkkernel_brute( | |||
# all_walks[i], | |||
# all_walks[j], | |||
# node_label=node_label, | |||
# edge_label=edge_label) | |||
# Kmatrix[j][i] = Kmatrix[i][j] | |||
run_time = time.time() - start_time | |||
if verbose: | |||
print("\n --- kernel matrix of common walk kernel of size %d built in %s seconds ---" | |||
% (len(Gn), run_time)) | |||
run_time = time.time() - start_time | |||
if verbose: | |||
print("\n --- kernel matrix of common walk kernel of size %d built in %s seconds ---" | |||
% (len(Gn), run_time)) | |||
return Kmatrix, run_time, idx | |||
return Kmatrix, run_time, idx | |||
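# Example (illustrative sketch, not from the original module): computing a
# Gram matrix with the geometric variant on two small unlabeled NetworkX graphs.
#
#   import networkx as nx
#   gs = [nx.path_graph(4), nx.cycle_graph(5)]
#   K, run_time, idx = commonwalkkernel(gs, compute_method='geo',
#                                       weight=0.01, n_jobs=1)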
def _commonwalkkernel_exp(g1, g2, node_label, edge_label, beta): | |||
"""Calculate walk graph kernels up to n between 2 graphs using exponential | |||
series. | |||
Parameters | |||
---------- | |||
Gn : List of NetworkX graph | |||
List of graphs between which the kernels are calculated. | |||
node_label : string | |||
Node attribute used as label. | |||
edge_label : string | |||
Edge attribute used as label. | |||
beta : integer | |||
Weight. | |||
ij : tuple of integer | |||
Index of graphs between which the kernel is computed. | |||
Return | |||
------ | |||
kernel : float | |||
The common walk Kernel between 2 graphs. | |||
""" | |||
# get tensor product / direct product | |||
gp = direct_product(g1, g2, node_label, edge_label) | |||
# return 0 if the direct product graph has no more than 1 node.
if nx.number_of_nodes(gp) < 2: | |||
return 0 | |||
A = nx.adjacency_matrix(gp).todense() | |||
# print(A) | |||
# from matplotlib import pyplot as plt | |||
# nx.draw_networkx(G1) | |||
# plt.show() | |||
# nx.draw_networkx(G2) | |||
# plt.show() | |||
# nx.draw_networkx(gp) | |||
# plt.show() | |||
# print(G1.nodes(data=True)) | |||
# print(G2.nodes(data=True)) | |||
# print(gp.nodes(data=True)) | |||
# print(gp.edges(data=True)) | |||
ew, ev = np.linalg.eig(A) | |||
# print('ew: ', ew) | |||
# print(ev) | |||
# T = np.matrix(ev) | |||
# print('T: ', T) | |||
# T = ev.I | |||
D = np.zeros((len(ew), len(ew))) | |||
for i in range(len(ew)): | |||
D[i][i] = np.exp(beta * ew[i]) | |||
# print('D: ', D) | |||
# print('hshs: ', T.I * D * T) | |||
# print(np.exp(-2)) | |||
# print(D) | |||
# print(np.exp(weight * D)) | |||
# print(ev) | |||
# print(np.linalg.inv(ev)) | |||
exp_D = ev * D * ev.T | |||
# print(exp_D) | |||
# print(np.exp(weight * A)) | |||
# print('-------') | |||
return exp_D.sum() | |||
"""Calculate walk graph kernels up to n between 2 graphs using exponential | |||
series. | |||
Parameters | |||
---------- | |||
Gn : List of NetworkX graph | |||
List of graphs between which the kernels are calculated. | |||
node_label : string | |||
Node attribute used as label. | |||
edge_label : string | |||
Edge attribute used as label. | |||
beta : integer | |||
Weight. | |||
ij : tuple of integer | |||
Index of graphs between which the kernel is computed. | |||
Return | |||
------ | |||
kernel : float | |||
The common walk Kernel between 2 graphs. | |||
""" | |||
# get tensor product / direct product | |||
gp = direct_product(g1, g2, node_label, edge_label) | |||
# return 0 if the direct product graph has no more than 1 node.
if nx.number_of_nodes(gp) < 2: | |||
return 0 | |||
A = nx.adjacency_matrix(gp).todense() | |||
# print(A) | |||
# from matplotlib import pyplot as plt | |||
# nx.draw_networkx(G1) | |||
# plt.show() | |||
# nx.draw_networkx(G2) | |||
# plt.show() | |||
# nx.draw_networkx(gp) | |||
# plt.show() | |||
# print(G1.nodes(data=True)) | |||
# print(G2.nodes(data=True)) | |||
# print(gp.nodes(data=True)) | |||
# print(gp.edges(data=True)) | |||
ew, ev = np.linalg.eig(A) | |||
# print('ew: ', ew) | |||
# print(ev) | |||
# T = np.matrix(ev) | |||
# print('T: ', T) | |||
# T = ev.I | |||
D = np.zeros((len(ew), len(ew))) | |||
for i in range(len(ew)): | |||
D[i][i] = np.exp(beta * ew[i]) | |||
# print('D: ', D) | |||
# print('hshs: ', T.I * D * T) | |||
# print(np.exp(-2)) | |||
# print(D) | |||
# print(np.exp(weight * D)) | |||
# print(ev) | |||
# print(np.linalg.inv(ev)) | |||
exp_D = ev * D * ev.T | |||
# print(exp_D) | |||
# print(np.exp(weight * A)) | |||
# print('-------') | |||
return exp_D.sum() | |||
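# Note on the computation above: for a symmetric adjacency matrix A with
# eigendecomposition A = V diag(w) V^T, the matrix exponential satisfies
# exp(beta * A) = V diag(exp(beta * w)) V^T, which is what the loop over the
# eigenvalues and the product ev * D * ev.T compute; summing all entries of
# that matrix gives the kernel value returned here.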
def wrapper_cw_exp(node_label, edge_label, beta, itr): | |||
i = itr[0] | |||
j = itr[1] | |||
return i, j, _commonwalkkernel_exp(G_gn[i], G_gn[j], node_label, edge_label, beta) | |||
i = itr[0] | |||
j = itr[1] | |||
return i, j, _commonwalkkernel_exp(G_gn[i], G_gn[j], node_label, edge_label, beta) | |||
def _commonwalkkernel_geo(g1, g2, node_label, edge_label, gamma): | |||
"""Calculate common walk graph kernels up to n between 2 graphs using | |||
geometric series. | |||
Parameters | |||
---------- | |||
Gn : List of NetworkX graph | |||
List of graphs between which the kernels are calculated. | |||
node_label : string | |||
Node attribute used as label. | |||
edge_label : string | |||
Edge attribute used as label. | |||
gamma: integer | |||
Weight. | |||
ij : tuple of integer | |||
Index of graphs between which the kernel is computed. | |||
Return | |||
------ | |||
kernel : float | |||
The common walk Kernel between 2 graphs. | |||
""" | |||
# get tensor product / direct product | |||
gp = direct_product(g1, g2, node_label, edge_label) | |||
# return 0 if the direct product graph has no more than 1 node.
if nx.number_of_nodes(gp) < 2: | |||
return 0 | |||
A = nx.adjacency_matrix(gp).todense() | |||
mat = np.identity(len(A)) - gamma * A | |||
# try: | |||
return mat.I.sum() | |||
# except np.linalg.LinAlgError: | |||
# return np.nan | |||
"""Calculate common walk graph kernels up to n between 2 graphs using | |||
geometric series. | |||
Parameters | |||
---------- | |||
Gn : List of NetworkX graph | |||
List of graphs between which the kernels are calculated. | |||
node_label : string | |||
Node attribute used as label. | |||
edge_label : string | |||
Edge attribute used as label. | |||
gamma: integer | |||
Weight. | |||
ij : tuple of integer | |||
Index of graphs between which the kernel is computed. | |||
Return | |||
------ | |||
kernel : float | |||
The common walk Kernel between 2 graphs. | |||
""" | |||
# get tensor product / direct product | |||
gp = direct_product(g1, g2, node_label, edge_label) | |||
# return 0 if the direct product graph has no more than 1 node.
if nx.number_of_nodes(gp) < 2: | |||
return 0 | |||
A = nx.adjacency_matrix(gp).todense() | |||
mat = np.identity(len(A)) - gamma * A | |||
# try: | |||
return mat.I.sum() | |||
# except np.linalg.LinAlgError: | |||
# return np.nan | |||
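# Note on the geometric variant: it uses the closed form
#   sum_{k >= 0} gamma^k * A^k = (I - gamma * A)^{-1},
# which converges when gamma is smaller than 1 / (spectral radius of A),
# A being the adjacency matrix of the direct product graph. The kernel value
# is the sum of all entries of that inverse.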
def wrapper_cw_geo(node_label, edge_label, gama, itr): | |||
i = itr[0] | |||
j = itr[1] | |||
return i, j, _commonwalkkernel_geo(G_gn[i], G_gn[j], node_label, edge_label, gama) | |||
i = itr[0] | |||
j = itr[1] | |||
return i, j, _commonwalkkernel_geo(G_gn[i], G_gn[j], node_label, edge_label, gama) | |||
def _commonwalkkernel_brute(walks1, | |||
walks2, | |||
node_label='atom', | |||
edge_label='bond_type', | |||
labeled=True): | |||
"""Calculate walk graph kernels up to n between 2 graphs. | |||
Parameters | |||
---------- | |||
walks1, walks2 : list | |||
List of walks in 2 graphs, where for unlabeled graphs, each walk is | |||
represented by a list of nodes; while for labeled graphs, each walk is | |||
represented by a string consisting of the labels of nodes and edges on that
walk. | |||
node_label : string | |||
node attribute used as label. The default node label is atom. | |||
edge_label : string | |||
edge attribute used as label. The default edge label is bond_type. | |||
labeled : boolean | |||
Whether the graphs are labeled. The default is True. | |||
Return | |||
------ | |||
kernel : float | |||
Common walk kernel between 2 graphs.
""" | |||
counts_walks1 = dict(Counter(walks1)) | |||
counts_walks2 = dict(Counter(walks2)) | |||
all_walks = list(set(walks1 + walks2)) | |||
vector1 = [(counts_walks1[walk] if walk in walks1 else 0) | |||
for walk in all_walks] | |||
vector2 = [(counts_walks2[walk] if walk in walks2 else 0) | |||
for walk in all_walks] | |||
kernel = np.dot(vector1, vector2) | |||
return kernel | |||
walks2, | |||
node_label='atom', | |||
edge_label='bond_type', | |||
labeled=True): | |||
"""Calculate walk graph kernels up to n between 2 graphs. | |||
Parameters | |||
---------- | |||
walks1, walks2 : list | |||
List of walks in 2 graphs, where for unlabeled graphs, each walk is | |||
represented by a list of nodes; while for labeled graphs, each walk is | |||
represented by a string consisting of the labels of nodes and edges on that
walk. | |||
node_label : string | |||
node attribute used as label. The default node label is atom. | |||
edge_label : string | |||
edge attribute used as label. The default edge label is bond_type. | |||
labeled : boolean | |||
Whether the graphs are labeled. The default is True. | |||
Return | |||
------ | |||
kernel : float | |||
Common walk kernel between 2 graphs.
""" | |||
counts_walks1 = dict(Counter(walks1)) | |||
counts_walks2 = dict(Counter(walks2)) | |||
all_walks = list(set(walks1 + walks2)) | |||
vector1 = [(counts_walks1[walk] if walk in walks1 else 0) | |||
for walk in all_walks] | |||
vector2 = [(counts_walks2[walk] if walk in walks2 else 0) | |||
for walk in all_walks] | |||
kernel = np.dot(vector1, vector2) | |||
return kernel | |||
# this method find walks repetively, it could be faster. | |||
def find_all_walks_until_length(G, | |||
length, | |||
node_label='atom', | |||
edge_label='bond_type', | |||
labeled=True): | |||
"""Find all walks with a certain maximum length in a graph. | |||
A recursive depth first search is applied. | |||
Parameters | |||
---------- | |||
G : NetworkX graphs | |||
The graph in which walks are searched. | |||
length : integer | |||
The maximum length of walks. | |||
node_label : string | |||
node attribute used as label. The default node label is atom. | |||
edge_label : string | |||
edge attribute used as label. The default edge label is bond_type. | |||
labeled : boolean | |||
Whether the graphs are labeled. The default is True. | |||
Return | |||
------ | |||
walk : list | |||
List of walks retrieved, where for unlabeled graphs, each walk is | |||
represented by a list of nodes; while for labeled graphs, each walk | |||
is represented by a string consisting of the labels of nodes and edges on
that walk. | |||
""" | |||
all_walks = [] | |||
# @todo: in this way, the time complexity is close to N(d^n+d^(n+1)+...+1), which could be optimized to O(Nd^n) | |||
for i in range(0, length + 1): | |||
new_walks = find_all_walks(G, i) | |||
if new_walks == []: | |||
break | |||
all_walks.extend(new_walks) | |||
if labeled == True: # convert paths to strings | |||
walk_strs = [] | |||
for walk in all_walks: | |||
strlist = [ | |||
G.node[node][node_label] + | |||
G[node][walk[walk.index(node) + 1]][edge_label] | |||
for node in walk[:-1] | |||
] | |||
walk_strs.append(''.join(strlist) + G.node[walk[-1]][node_label]) | |||
return walk_strs | |||
return all_walks | |||
length, | |||
node_label='atom', | |||
edge_label='bond_type', | |||
labeled=True): | |||
"""Find all walks with a certain maximum length in a graph. | |||
A recursive depth first search is applied. | |||
Parameters | |||
---------- | |||
G : NetworkX graphs | |||
The graph in which walks are searched. | |||
length : integer | |||
The maximum length of walks. | |||
node_label : string | |||
node attribute used as label. The default node label is atom. | |||
edge_label : string | |||
edge attribute used as label. The default edge label is bond_type. | |||
labeled : boolean | |||
Whether the graphs are labeled. The default is True. | |||
Return | |||
------ | |||
walk : list | |||
List of walks retrieved, where for unlabeled graphs, each walk is | |||
represented by a list of nodes; while for labeled graphs, each walk | |||
is represented by a string consisting of the labels of nodes and edges on
that walk. | |||
""" | |||
all_walks = [] | |||
# @todo: in this way, the time complexity is close to N(d^n+d^(n+1)+...+1), which could be optimized to O(Nd^n) | |||
for i in range(0, length + 1): | |||
new_walks = find_all_walks(G, i) | |||
if new_walks == []: | |||
break | |||
all_walks.extend(new_walks) | |||
if labeled == True: # convert paths to strings | |||
walk_strs = [] | |||
for walk in all_walks: | |||
strlist = [ | |||
G.node[node][node_label] + | |||
G[node][walk[walk.index(node) + 1]][edge_label] | |||
for node in walk[:-1] | |||
] | |||
walk_strs.append(''.join(strlist) + G.node[walk[-1]][node_label]) | |||
return walk_strs | |||
return all_walks | |||
def find_walks(G, source_node, length): | |||
"""Find all walks with a certain length those start from a source node. A | |||
recursive depth first search is applied. | |||
Parameters | |||
---------- | |||
G : NetworkX graphs | |||
The graph in which walks are searched. | |||
source_node : integer | |||
The number of the node from where all walks start. | |||
length : integer | |||
The length of walks. | |||
Return | |||
------ | |||
walk : list of list | |||
List of walks retrieved, where each walk is represented by a list of | |||
nodes. | |||
""" | |||
return [[source_node]] if length == 0 else \ | |||
[[source_node] + walk for neighbor in G[source_node] | |||
for walk in find_walks(G, neighbor, length - 1)] | |||
"""Find all walks with a certain length those start from a source node. A | |||
recursive depth first search is applied. | |||
Parameters | |||
---------- | |||
G : NetworkX graphs | |||
The graph in which walks are searched. | |||
source_node : integer | |||
The number of the node from where all walks start. | |||
length : integer | |||
The length of walks. | |||
Return | |||
------ | |||
walk : list of list | |||
List of walks retrieved, where each walk is represented by a list of | |||
nodes. | |||
""" | |||
return [[source_node]] if length == 0 else \ | |||
[[source_node] + walk for neighbor in G[source_node] | |||
for walk in find_walks(G, neighbor, length - 1)] | |||
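# Illustrative example (not in the original module): on the path graph
# 0 - 1 - 2, find_walks(G, 0, 2) returns [[0, 1, 0], [0, 1, 2]]; walks may
# revisit nodes, which is what distinguishes them from paths.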
def find_all_walks(G, length): | |||
"""Find all walks with a certain length in a graph. A recursive depth first | |||
search is applied. | |||
Parameters | |||
---------- | |||
G : NetworkX graphs | |||
The graph in which walks are searched. | |||
length : integer | |||
The length of walks. | |||
Return | |||
------ | |||
walk : list of list | |||
List of walks retrieved, where each walk is represented by a list of | |||
nodes. | |||
""" | |||
all_walks = [] | |||
for node in G: | |||
all_walks.extend(find_walks(G, node, length)) | |||
# The following process is not carried out according to the original article | |||
# all_paths_r = [ path[::-1] for path in all_paths ] | |||
# # For each path, two presentation are retrieved from its two extremities. Remove one of them. | |||
# for idx, path in enumerate(all_paths[:-1]): | |||
# for path2 in all_paths_r[idx+1::]: | |||
# if path == path2: | |||
# all_paths[idx] = [] | |||
# break | |||
# return list(filter(lambda a: a != [], all_paths)) | |||
return all_walks | |||
"""Find all walks with a certain length in a graph. A recursive depth first | |||
search is applied. | |||
Parameters | |||
---------- | |||
G : NetworkX graphs | |||
The graph in which walks are searched. | |||
length : integer | |||
The length of walks. | |||
Return | |||
------ | |||
walk : list of list | |||
List of walks retrieved, where each walk is represented by a list of | |||
nodes. | |||
""" | |||
all_walks = [] | |||
for node in G: | |||
all_walks.extend(find_walks(G, node, length)) | |||
# The following process is not carried out according to the original article | |||
# all_paths_r = [ path[::-1] for path in all_paths ] | |||
# # For each path, two presentation are retrieved from its two extremities. Remove one of them. | |||
# for idx, path in enumerate(all_paths[:-1]): | |||
# for path2 in all_paths_r[idx+1::]: | |||
# if path == path2: | |||
# all_paths[idx] = [] | |||
# break | |||
# return list(filter(lambda a: a != [], all_paths)) | |||
return all_walks |
@@ -0,0 +1,245 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Thu Aug 20 16:09:51 2020 | |||
@author: ljia | |||
@references: | |||
[1] S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, 11(Apr):1201–1242, 2010. | |||
""" | |||
import sys | |||
from tqdm import tqdm | |||
import numpy as np | |||
import networkx as nx | |||
from control import dlyap | |||
from gklearn.utils.parallel import parallel_gm, parallel_me | |||
from gklearn.kernels import RandomWalk | |||
class FixedPoint(RandomWalk): | |||
def __init__(self, **kwargs): | |||
RandomWalk.__init__(self, **kwargs) | |||
def _compute_gm_series(self): | |||
self._check_edge_weight(self._graphs) | |||
self._check_graphs(self._graphs) | |||
if self._verbose >= 2: | |||
import warnings | |||
warnings.warn('All labels are ignored.') | |||
lmda = self._weight | |||
# compute Gram matrix. | |||
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | |||
if self._q == None: | |||
# don't normalize adjacency matrices if q is a uniform vector. Note | |||
# A_wave_list actually contains the transposes of the adjacency matrices. | |||
if self._verbose >= 2: | |||
iterator = tqdm(self._graphs, desc='compute adjacency matrices', file=sys.stdout) | |||
else: | |||
iterator = self._graphs | |||
A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] | |||
# # normalized adjacency matrices | |||
# A_wave_list = [] | |||
# for G in tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout): | |||
# A_tilde = nx.adjacency_matrix(G, eweight).todense().transpose() | |||
# norm = A_tilde.sum(axis=0) | |||
# norm[norm == 0] = 1 | |||
# A_wave_list.append(A_tilde / norm) | |||
if self._p == None: # p is uniform distribution as default. | |||
from itertools import combinations_with_replacement | |||
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
if self._verbose >= 2: | |||
iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout) | |||
else: | |||
iterator = itr | |||
for i, j in iterator: | |||
kernel = self.__kernel_do(A_wave_list[i], A_wave_list[j], lmda) | |||
gram_matrix[i][j] = kernel | |||
gram_matrix[j][i] = kernel | |||
else: # @todo | |||
pass | |||
else: # @todo | |||
pass | |||
return gram_matrix | |||
def _compute_gm_imap_unordered(self): | |||
self._check_edge_weight(self._graphs) | |||
self._check_graphs(self._graphs) | |||
if self._verbose >= 2: | |||
import warnings | |||
warnings.warn('All labels are ignored.') | |||
# compute Gram matrix. | |||
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | |||
if self._q == None: | |||
# don't normalize adjacency matrices if q is a uniform vector. Note | |||
# A_wave_list actually contains the transposes of the adjacency matrices. | |||
if self._verbose >= 2: | |||
iterator = tqdm(self._graphs, desc='compute adjacency matrices', file=sys.stdout) | |||
else: | |||
iterator = self._graphs | |||
A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel? | |||
if self._p == None: # p is uniform distribution as default. | |||
def init_worker(A_wave_list_toshare): | |||
global G_A_wave_list | |||
G_A_wave_list = A_wave_list_toshare | |||
do_fun = self._wrapper_kernel_do | |||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | |||
glbv=(A_wave_list,), n_jobs=self._n_jobs, verbose=self._verbose) | |||
else: # @todo | |||
pass | |||
else: # @todo | |||
pass | |||
return gram_matrix | |||
def _compute_kernel_list_series(self, g1, g_list): | |||
self._check_edge_weight(g_list + [g1]) | |||
self._check_graphs(g_list + [g1]) | |||
if self._verbose >= 2: | |||
import warnings | |||
warnings.warn('All labels are ignored.') | |||
lmda = self._weight | |||
# compute kernel list. | |||
kernel_list = [None] * len(g_list) | |||
if self._q == None: | |||
# don't normalize adjacency matrices if q is a uniform vector. Note | |||
# A_wave_list actually contains the transposes of the adjacency matrices. | |||
A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() | |||
if self._verbose >= 2: | |||
iterator = tqdm(range(len(g_list)), desc='compute adjacency matrices', file=sys.stdout) | |||
else: | |||
iterator = range(len(g_list)) | |||
A_wave_list = [nx.adjacency_matrix(g_list[i], self._edge_weight).todense().transpose() for i in iterator]  # iterate over indices into g_list
if self._p == None: # p is uniform distribution as default. | |||
if self._verbose >= 2: | |||
iterator = tqdm(range(len(g_list)), desc='calculating kernels', file=sys.stdout) | |||
else: | |||
iterator = range(len(g_list)) | |||
for i in iterator: | |||
kernel = self.__kernel_do(A_wave_1, A_wave_list[i], lmda) | |||
kernel_list[i] = kernel | |||
else: # @todo | |||
pass | |||
else: # @todo | |||
pass | |||
return kernel_list | |||
def _compute_kernel_list_imap_unordered(self, g1, g_list): | |||
self._check_edge_weight(g_list + [g1]) | |||
self._check_graphs(g_list + [g1]) | |||
if self._verbose >= 2: | |||
import warnings | |||
warnings.warn('All labels are ignored.') | |||
# compute kernel list. | |||
kernel_list = [None] * len(g_list) | |||
if self._q == None: | |||
# don't normalize adjacency matrices if q is a uniform vector. Note | |||
# A_wave_list actually contains the transposes of the adjacency matrices. | |||
A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() | |||
if self._verbose >= 2: | |||
iterator = tqdm(range(len(g_list)), desc='compute adjacency matrices', file=sys.stdout) | |||
else: | |||
iterator = range(len(g_list)) | |||
A_wave_list = [nx.adjacency_matrix(g_list[i], self._edge_weight).todense().transpose() for i in iterator] # @todo: parallel?
if self._p == None: # p is uniform distribution as default. | |||
def init_worker(A_wave_1_toshare, A_wave_list_toshare): | |||
global G_A_wave_1, G_A_wave_list | |||
G_A_wave_1 = A_wave_1_toshare | |||
G_A_wave_list = A_wave_list_toshare | |||
do_fun = self._wrapper_kernel_list_do | |||
def func_assign(result, var_to_assign): | |||
var_to_assign[result[0]] = result[1] | |||
itr = range(len(g_list)) | |||
len_itr = len(g_list) | |||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||
init_worker=init_worker, glbv=(A_wave_1, A_wave_list), method='imap_unordered', | |||
n_jobs=self._n_jobs, itr_desc='calculating kernels', verbose=self._verbose) | |||
else: # @todo | |||
pass | |||
else: # @todo | |||
pass | |||
return kernel_list | |||
def _wrapper_kernel_list_do(self, itr): | |||
return itr, self.__kernel_do(G_A_wave_1, G_A_wave_list[itr], self._weight)
def _compute_single_kernel_series(self, g1, g2): | |||
self._check_edge_weight([g1] + [g2]) | |||
self._check_graphs([g1] + [g2]) | |||
if self._verbose >= 2: | |||
import warnings | |||
warnings.warn('All labels are ignored.') | |||
lmda = self._weight | |||
if self._q == None: | |||
# don't normalize adjacency matrices if q is a uniform vector. Note | |||
# A_wave_list actually contains the transposes of the adjacency matrices. | |||
A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() | |||
A_wave_2 = nx.adjacency_matrix(g2, self._edge_weight).todense().transpose() | |||
if self._p == None: # p is uniform distribution as default. | |||
kernel = self.__kernel_do(A_wave_1, A_wave_2, lmda) | |||
else: # @todo | |||
pass | |||
else: # @todo | |||
pass | |||
return kernel | |||
def __kernel_do(self, A_wave1, A_wave2, lmda): | |||
S = lmda * A_wave2 | |||
T_t = A_wave1 | |||
# use uniform distribution if there is no prior knowledge. | |||
nb_pd = len(A_wave1) * len(A_wave2) | |||
p_times_uni = 1 / nb_pd | |||
M0 = np.full((len(A_wave2), len(A_wave1)), p_times_uni) | |||
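# The three-argument form of control.dlyap solves a discrete Sylvester-type
# equation of the form S * X * T_t - X + M0 = 0 (per the conventions of
# control.dlyap), i.e. X = lmda * A_wave2 * X * A_wave1 + M0, which is the
# matrix form of the fixed-point system x = p + lmda * W * x used by the
# fixed-point random-walk kernel of [1].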
X = dlyap(S, T_t, M0) | |||
X = np.reshape(X, (-1, 1), order='F') | |||
# use uniform distribution if there is no prior knowledge. | |||
q_times = np.full((1, nb_pd), p_times_uni) | |||
return np.dot(q_times, X) | |||
def _wrapper_kernel_do(self, itr): | |||
i = itr[0] | |||
j = itr[1] | |||
return i, j, self.__kernel_do(G_A_wave_list[i], G_A_wave_list[j], self._weight) |
@@ -3,14 +3,14 @@ | |||
@references: | |||
[1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between | |||
labeled graphs. In Proceedings of the 20th International Conference on | |||
Machine Learning, Washington, DC, United States, 2003. | |||
[2] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and | |||
Jean-Philippe Vert. Extensions of marginalized graph kernels. In | |||
Proceedings of the twenty-first international conference on Machine | |||
learning, page 70. ACM, 2004. | |||
[1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between | |||
labeled graphs. In Proceedings of the 20th International Conference on | |||
Machine Learning, Washington, DC, United States, 2003. | |||
[2] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and | |||
Jean-Philippe Vert. Extensions of marginalized graph kernels. In | |||
Proceedings of the twenty-first international conference on Machine | |||
learning, page 70. ACM, 2004. | |||
""" | |||
import sys | |||
@@ -31,275 +31,277 @@ from gklearn.utils.parallel import parallel_gm | |||
def marginalizedkernel(*args, | |||
node_label='atom', | |||
edge_label='bond_type', | |||
p_quit=0.5, | |||
n_iteration=20, | |||
remove_totters=False, | |||
n_jobs=None, | |||
verbose=True): | |||
"""Calculate marginalized graph kernels between graphs. | |||
Parameters | |||
---------- | |||
Gn : List of NetworkX graph | |||
List of graphs between which the kernels are calculated. | |||
G1, G2 : NetworkX graphs | |||
Two graphs between which the kernel is calculated. | |||
node_label : string | |||
Node attribute used as symbolic label. The default node label is 'atom'. | |||
edge_label : string | |||
Edge attribute used as symbolic label. The default edge label is 'bond_type'. | |||
p_quit : float
The termination probability in the random walks generating step.
n_iteration : integer
Number of iterations used to calculate R_inf.
remove_totters : boolean | |||
Whether to remove totterings by method introduced in [2]. The default | |||
value is False. | |||
n_jobs : int | |||
Number of jobs for parallelization. | |||
Return | |||
------ | |||
Kmatrix : Numpy matrix | |||
Kernel matrix, each element of which is the marginalized kernel between | |||
2 graphs.
""" | |||
# pre-process | |||
n_iteration = int(n_iteration) | |||
Gn = args[0][:] if len(args) == 1 else [args[0].copy(), args[1].copy()] | |||
Gn = [g.copy() for g in Gn] | |||
ds_attrs = get_dataset_attributes( | |||
Gn, | |||
attr_names=['node_labeled', 'edge_labeled', 'is_directed'], | |||
node_label=node_label, edge_label=edge_label) | |||
if not ds_attrs['node_labeled'] or node_label == None: | |||
node_label = 'atom' | |||
for G in Gn: | |||
nx.set_node_attributes(G, '0', 'atom') | |||
if not ds_attrs['edge_labeled'] or edge_label == None: | |||
edge_label = 'bond_type' | |||
for G in Gn: | |||
nx.set_edge_attributes(G, '0', 'bond_type') | |||
start_time = time.time() | |||
if remove_totters: | |||
# ---- use pool.imap_unordered to parallel and track progress. ---- | |||
pool = Pool(n_jobs) | |||
untotter_partial = partial(wrapper_untotter, Gn, node_label, edge_label) | |||
if len(Gn) < 100 * n_jobs: | |||
chunksize = int(len(Gn) / n_jobs) + 1 | |||
else: | |||
chunksize = 100 | |||
for i, g in tqdm( | |||
pool.imap_unordered( | |||
untotter_partial, range(0, len(Gn)), chunksize), | |||
desc='removing tottering', | |||
file=sys.stdout): | |||
Gn[i] = g | |||
pool.close() | |||
pool.join() | |||
# # ---- direct running, normally use single CPU core. ---- | |||
# Gn = [ | |||
# untotterTransformation(G, node_label, edge_label) | |||
# for G in tqdm(Gn, desc='removing tottering', file=sys.stdout) | |||
# ] | |||
Kmatrix = np.zeros((len(Gn), len(Gn))) | |||
# ---- use pool.imap_unordered to parallel and track progress. ---- | |||
def init_worker(gn_toshare): | |||
global G_gn | |||
G_gn = gn_toshare | |||
do_partial = partial(wrapper_marg_do, node_label, edge_label, | |||
p_quit, n_iteration) | |||
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | |||
glbv=(Gn,), n_jobs=n_jobs, verbose=verbose) | |||
# # ---- direct running, normally use single CPU core. ---- | |||
## pbar = tqdm( | |||
## total=(1 + len(Gn)) * len(Gn) / 2, | |||
## desc='calculating kernels', | |||
## file=sys.stdout) | |||
# for i in range(0, len(Gn)): | |||
# for j in range(i, len(Gn)): | |||
## print(i, j) | |||
# Kmatrix[i][j] = _marginalizedkernel_do(Gn[i], Gn[j], node_label, | |||
# edge_label, p_quit, n_iteration) | |||
# Kmatrix[j][i] = Kmatrix[i][j] | |||
## pbar.update(1) | |||
run_time = time.time() - start_time | |||
if verbose: | |||
print("\n --- marginalized kernel matrix of size %d built in %s seconds ---" | |||
% (len(Gn), run_time)) | |||
return Kmatrix, run_time | |||
node_label='atom', | |||
edge_label='bond_type', | |||
p_quit=0.5, | |||
n_iteration=20, | |||
remove_totters=False, | |||
n_jobs=None, | |||
chunksize=None, | |||
verbose=True): | |||
"""Calculate marginalized graph kernels between graphs. | |||
Parameters | |||
---------- | |||
Gn : List of NetworkX graph | |||
List of graphs between which the kernels are calculated. | |||
G1, G2 : NetworkX graphs | |||
Two graphs between which the kernel is calculated. | |||
node_label : string | |||
Node attribute used as symbolic label. The default node label is 'atom'. | |||
edge_label : string | |||
Edge attribute used as symbolic label. The default edge label is 'bond_type'. | |||
p_quit : float
The termination probability in the random walks generating step.
n_iteration : integer
Number of iterations used to calculate R_inf.
remove_totters : boolean | |||
Whether to remove totterings by method introduced in [2]. The default | |||
value is False. | |||
n_jobs : int | |||
Number of jobs for parallelization. | |||
Return | |||
------ | |||
Kmatrix : Numpy matrix | |||
Kernel matrix, each element of which is the marginalized kernel between | |||
2 graphs.
""" | |||
# pre-process | |||
n_iteration = int(n_iteration) | |||
Gn = args[0][:] if len(args) == 1 else [args[0].copy(), args[1].copy()] | |||
Gn = [g.copy() for g in Gn] | |||
ds_attrs = get_dataset_attributes( | |||
Gn, | |||
attr_names=['node_labeled', 'edge_labeled', 'is_directed'], | |||
node_label=node_label, edge_label=edge_label) | |||
if not ds_attrs['node_labeled'] or node_label == None: | |||
node_label = 'atom' | |||
for G in Gn: | |||
nx.set_node_attributes(G, '0', 'atom') | |||
if not ds_attrs['edge_labeled'] or edge_label == None: | |||
edge_label = 'bond_type' | |||
for G in Gn: | |||
nx.set_edge_attributes(G, '0', 'bond_type') | |||
start_time = time.time() | |||
if remove_totters: | |||
# ---- use pool.imap_unordered to parallel and track progress. ---- | |||
pool = Pool(n_jobs) | |||
untotter_partial = partial(wrapper_untotter, Gn, node_label, edge_label) | |||
if chunksize is None: | |||
if len(Gn) < 100 * n_jobs: | |||
chunksize = int(len(Gn) / n_jobs) + 1 | |||
else: | |||
chunksize = 100 | |||
for i, g in tqdm( | |||
pool.imap_unordered( | |||
untotter_partial, range(0, len(Gn)), chunksize), | |||
desc='removing tottering', | |||
file=sys.stdout): | |||
Gn[i] = g | |||
pool.close() | |||
pool.join() | |||
# # ---- direct running, normally use single CPU core. ---- | |||
# Gn = [ | |||
# untotterTransformation(G, node_label, edge_label) | |||
# for G in tqdm(Gn, desc='removing tottering', file=sys.stdout) | |||
# ] | |||
Kmatrix = np.zeros((len(Gn), len(Gn))) | |||
# ---- use pool.imap_unordered to parallel and track progress. ---- | |||
def init_worker(gn_toshare): | |||
global G_gn | |||
G_gn = gn_toshare | |||
do_partial = partial(wrapper_marg_do, node_label, edge_label, | |||
p_quit, n_iteration) | |||
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | |||
glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) | |||
# # ---- direct running, normally use single CPU core. ---- | |||
## pbar = tqdm( | |||
## total=(1 + len(Gn)) * len(Gn) / 2, | |||
## desc='calculating kernels', | |||
## file=sys.stdout) | |||
# for i in range(0, len(Gn)): | |||
# for j in range(i, len(Gn)): | |||
## print(i, j) | |||
# Kmatrix[i][j] = _marginalizedkernel_do(Gn[i], Gn[j], node_label, | |||
# edge_label, p_quit, n_iteration) | |||
# Kmatrix[j][i] = Kmatrix[i][j] | |||
## pbar.update(1) | |||
run_time = time.time() - start_time | |||
if verbose: | |||
print("\n --- marginalized kernel matrix of size %d built in %s seconds ---" | |||
% (len(Gn), run_time)) | |||
return Kmatrix, run_time | |||
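# Example (illustrative sketch, not from the original module): computing a
# Gram matrix for a list of labeled NetworkX graphs.
#
#   K, run_time = marginalizedkernel(graphs, node_label='atom',
#                                    edge_label='bond_type', p_quit=0.5,
#                                    n_iteration=20, n_jobs=1)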
def _marginalizedkernel_do(g1, g2, node_label, edge_label, p_quit, n_iteration): | |||
"""Calculate marginalized graph kernel between 2 graphs. | |||
Parameters | |||
---------- | |||
G1, G2 : NetworkX graphs | |||
2 graphs between which the kernel is calculated. | |||
node_label : string | |||
node attribute used as label. | |||
edge_label : string | |||
edge attribute used as label. | |||
p_quit : float
the termination probability in the random walks generating step.
n_iteration : integer
number of iterations used to calculate R_inf.
Return | |||
------ | |||
kernel : float | |||
Marginalized Kernel between 2 graphs. | |||
""" | |||
# init parameters | |||
kernel = 0 | |||
num_nodes_G1 = nx.number_of_nodes(g1) | |||
num_nodes_G2 = nx.number_of_nodes(g2) | |||
# the initial probability distribution in the random walks generating step | |||
# (uniform distribution over |G|) | |||
p_init_G1 = 1 / num_nodes_G1 | |||
p_init_G2 = 1 / num_nodes_G2 | |||
q = p_quit * p_quit | |||
r1 = q | |||
# # initial R_inf | |||
# # matrix to save all the R_inf for all pairs of nodes | |||
# R_inf = np.zeros([num_nodes_G1, num_nodes_G2]) | |||
"""Calculate marginalized graph kernel between 2 graphs. | |||
Parameters | |||
---------- | |||
G1, G2 : NetworkX graphs | |||
2 graphs between which the kernel is calculated. | |||
node_label : string | |||
node attribute used as label. | |||
edge_label : string | |||
edge attribute used as label. | |||
p_quit : integer | |||
the termination probability in the random walks generating step. | |||
n_iteration : integer | |||
time of iterations to calculate R_inf. | |||
Return | |||
------ | |||
kernel : float | |||
Marginalized Kernel between 2 graphs. | |||
""" | |||
# init parameters | |||
kernel = 0 | |||
num_nodes_G1 = nx.number_of_nodes(g1) | |||
num_nodes_G2 = nx.number_of_nodes(g2) | |||
# the initial probability distribution in the random walks generating step | |||
# (uniform distribution over |G|) | |||
p_init_G1 = 1 / num_nodes_G1 | |||
p_init_G2 = 1 / num_nodes_G2 | |||
q = p_quit * p_quit | |||
r1 = q | |||
# # initial R_inf | |||
# # matrix to save all the R_inf for all pairs of nodes | |||
# R_inf = np.zeros([num_nodes_G1, num_nodes_G2]) | |||
# | |||
# # calculate R_inf with a simple iterative method
# for i in range(1, n_iteration): | |||
# R_inf_new = np.zeros([num_nodes_G1, num_nodes_G2]) | |||
# R_inf_new.fill(r1) | |||
# | |||
# # calculate R_inf for each pair of nodes | |||
# for node1 in g1.nodes(data=True): | |||
# neighbor_n1 = g1[node1[0]] | |||
# # the transition probability distribution in the random walks | |||
# # generating step (uniform distribution over the vertices adjacent | |||
# # to the current vertex) | |||
# if len(neighbor_n1) > 0: | |||
# p_trans_n1 = (1 - p_quit) / len(neighbor_n1) | |||
# for node2 in g2.nodes(data=True): | |||
# neighbor_n2 = g2[node2[0]] | |||
# if len(neighbor_n2) > 0: | |||
# p_trans_n2 = (1 - p_quit) / len(neighbor_n2) | |||
# | |||
# for neighbor1 in neighbor_n1: | |||
# for neighbor2 in neighbor_n2: | |||
# t = p_trans_n1 * p_trans_n2 * \ | |||
# deltakernel(g1.node[neighbor1][node_label], | |||
# g2.node[neighbor2][node_label]) * \ | |||
# deltakernel( | |||
# neighbor_n1[neighbor1][edge_label], | |||
# neighbor_n2[neighbor2][edge_label]) | |||
# | |||
# R_inf_new[node1[0]][node2[0]] += t * R_inf[neighbor1][ | |||
# neighbor2] # ref [1] equation (8) | |||
# R_inf[:] = R_inf_new | |||
# | |||
# # add elements of R_inf up and calculate kernel | |||
# for node1 in g1.nodes(data=True): | |||
# for node2 in g2.nodes(data=True): | |||
# s = p_init_G1 * p_init_G2 * deltakernel( | |||
# node1[1][node_label], node2[1][node_label]) | |||
# kernel += s * R_inf[node1[0]][node2[0]] # ref [1] equation (6) | |||
R_inf = {} # dict to save all the R_inf for all pairs of nodes | |||
# initial R_inf, the 1st iteration. | |||
for node1 in g1.nodes(): | |||
for node2 in g2.nodes(): | |||
# R_inf[(node1[0], node2[0])] = r1 | |||
if len(g1[node1]) > 0: | |||
if len(g2[node2]) > 0: | |||
R_inf[(node1, node2)] = r1 | |||
else: | |||
R_inf[(node1, node2)] = p_quit | |||
else: | |||
if len(g2[node2]) > 0: | |||
R_inf[(node1, node2)] = p_quit | |||
else: | |||
R_inf[(node1, node2)] = 1 | |||
# compute all transition probabilities first.
t_dict = {} | |||
if n_iteration > 1: | |||
for node1 in g1.nodes(): | |||
neighbor_n1 = g1[node1] | |||
# the transition probability distribution in the random walks | |||
# generating step (uniform distribution over the vertices adjacent | |||
# to the current vertex) | |||
if len(neighbor_n1) > 0: | |||
p_trans_n1 = (1 - p_quit) / len(neighbor_n1) | |||
for node2 in g2.nodes(): | |||
neighbor_n2 = g2[node2] | |||
if len(neighbor_n2) > 0: | |||
p_trans_n2 = (1 - p_quit) / len(neighbor_n2) | |||
for neighbor1 in neighbor_n1: | |||
for neighbor2 in neighbor_n2: | |||
t_dict[(node1, node2, neighbor1, neighbor2)] = \ | |||
p_trans_n1 * p_trans_n2 * \ | |||
deltakernel(g1.nodes[neighbor1][node_label], | |||
g2.nodes[neighbor2][node_label]) * \ | |||
deltakernel( | |||
neighbor_n1[neighbor1][edge_label], | |||
neighbor_n2[neighbor2][edge_label]) | |||
# calculate R_inf with a simple iterative method
for i in range(2, n_iteration + 1): | |||
R_inf_old = R_inf.copy() | |||
# calculate R_inf for each pair of nodes | |||
for node1 in g1.nodes(): | |||
neighbor_n1 = g1[node1] | |||
# the transition probability distribution in the random walks | |||
# generating step (uniform distribution over the vertices adjacent | |||
# to the current vertex) | |||
if len(neighbor_n1) > 0: | |||
for node2 in g2.nodes(): | |||
neighbor_n2 = g2[node2] | |||
if len(neighbor_n2) > 0: | |||
R_inf[(node1, node2)] = r1 | |||
for neighbor1 in neighbor_n1: | |||
for neighbor2 in neighbor_n2: | |||
R_inf[(node1, node2)] += \ | |||
(t_dict[(node1, node2, neighbor1, neighbor2)] * \ | |||
R_inf_old[(neighbor1, neighbor2)]) # ref [1] equation (8) | |||
# add elements of R_inf up and calculate kernel | |||
for (n1, n2), value in R_inf.items(): | |||
s = p_init_G1 * p_init_G2 * deltakernel( | |||
g1.nodes[n1][node_label], g2.nodes[n2][node_label]) | |||
kernel += s * value # ref [1] equation (6) | |||
return kernel | |||
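Written out, the iteration above is the following fixed-point computation; this is a sketch read directly from the code (with q = p_quit², δ the `deltakernel`, and t the value cached in `t_dict`), not a restatement of the reference.

```latex
% Recursion implemented by the loop over i (cf. the "equation (8)" comments):
R^{(k)}(v_1, v_2) = q + \sum_{v_1' \in N(v_1)} \sum_{v_2' \in N(v_2)}
    t(v_1, v_2, v_1', v_2')\, R^{(k-1)}(v_1', v_2'),
\qquad
t(v_1, v_2, v_1', v_2') = \frac{(1 - p_{quit})^2}{|N(v_1)|\,|N(v_2)|}\,
    \delta\big(\ell(v_1'), \ell(v_2')\big)\,
    \delta\big(\ell(v_1 v_1'), \ell(v_2 v_2')\big)

% Final accumulation (cf. the "equation (6)" comment):
k(G_1, G_2) = \sum_{v_1 \in V_1} \sum_{v_2 \in V_2}
    \frac{1}{|V_1|\,|V_2|}\, \delta\big(\ell(v_1), \ell(v_2)\big)\, R(v_1, v_2)
```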
def wrapper_marg_do(node_label, edge_label, p_quit, n_iteration, itr): | |||
i = itr[0]
j = itr[1] | |||
return i, j, _marginalizedkernel_do(G_gn[i], G_gn[j], node_label, edge_label, p_quit, n_iteration) | |||
def wrapper_untotter(Gn, node_label, edge_label, i): | |||
return i, untotterTransformation(Gn[i], node_label, edge_label) | |||
@@ -373,8 +373,18 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func == None | |||
for key in all_paths] | |||
kernel = np.sum(np.minimum(vector1, vector2)) / \ | |||
np.sum(np.maximum(vector1, vector2)) | |||
elif self.__k_func is None: # no sub-kernel used; compare paths directly. | |||
path_count1 = Counter(paths1) | |||
path_count2 = Counter(paths2) | |||
vector1 = [(path_count1[key] if (key in path_count1.keys()) else 0) | |||
for key in all_paths] | |||
vector2 = [(path_count2[key] if (key in path_count2.keys()) else 0) | |||
for key in all_paths] | |||
kernel = np.dot(vector1, vector2) | |||
else: | |||
raise Exception('The given "k_func" cannot be recognized. Possible choices include: "tanimoto", "MinMax".') | |||
raise Exception('The given "k_func" cannot be recognized. Possible choices include: "tanimoto", "MinMax" and None.') | |||
return kernel | |||
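To make the new `k_func is None` branch concrete, the same computation on two hypothetical path lists looks like this (a standalone sketch; the path strings are made up for illustration and do not come from `PathUpToH`):

```python
# Standalone illustration of the dot-product branch; toy data, not library API.
from collections import Counter
import numpy as np

paths1 = ['A', 'AB', 'AB', 'ABC']   # hypothetical paths extracted from graph 1
paths2 = ['A', 'AB', 'BC']          # hypothetical paths extracted from graph 2
all_paths = sorted(set(paths1) | set(paths2))

count1, count2 = Counter(paths1), Counter(paths2)
vector1 = [count1.get(key, 0) for key in all_paths]
vector2 = [count2.get(key, 0) for key in all_paths]

kernel = np.dot(vector1, vector2)   # 1*1 + 2*1 + 1*0 + 0*1 = 3
print(kernel)
```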
@@ -2,9 +2,9 @@ | |||
@author: linlin | |||
@references: | |||
[1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In Data
Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE.
""" | |||
import sys | |||
@@ -22,303 +22,305 @@ from gklearn.utils.graphdataset import get_dataset_attributes | |||
from gklearn.utils.parallel import parallel_gm | |||
def spkernel(*args, | |||
node_label='atom', | |||
edge_weight=None, | |||
node_kernels=None, | |||
parallel='imap_unordered', | |||
n_jobs=None, | |||
chunksize=None, | |||
verbose=True): | |||
"""Calculate shortest-path kernels between graphs. | |||
Parameters | |||
---------- | |||
Gn : List of NetworkX graph | |||
List of graphs between which the kernels are calculated. | |||
G1, G2 : NetworkX graphs | |||
Two graphs between which the kernel is calculated. | |||
node_label : string | |||
Node attribute used as label. The default node label is atom. | |||
edge_weight : string | |||
Edge attribute name corresponding to the edge weight. | |||
node_kernels : dict
A dictionary of kernel functions for nodes, including 3 items: 'symb'
for symbolic node labels, 'nsymb' for non-symbolic node labels, and 'mix'
for both labels. The first 2 functions take two node labels as
parameters, and the 'mix' function takes 4 parameters: a symbolic and a
non-symbolic label for each of the two nodes. Each label is in the form
of a 2-D array (n_samples, n_features). Each function returns a number
as the kernel value. Ignored when nodes are unlabeled.
n_jobs : int
Number of jobs for parallelization.
chunksize : int
Chunk size for parallelization. If None, a suitable value is chosen
automatically.
Return
------
Kmatrix : Numpy matrix
Kernel matrix, each element of which is the sp kernel between 2 graphs.
""" | |||
# pre-process | |||
Gn = args[0] if len(args) == 1 else [args[0], args[1]] | |||
Gn = [g.copy() for g in Gn] | |||
weight = None | |||
if edge_weight is None: | |||
if verbose: | |||
print('\n No edge weight specified. Set all weights to 1.\n')
else: | |||
try: | |||
some_weight = list( | |||
nx.get_edge_attributes(Gn[0], edge_weight).values())[0] | |||
if isinstance(some_weight, (float, int)): | |||
weight = edge_weight | |||
else: | |||
if verbose: | |||
print( | |||
'\n Edge weight with name "%s" is not a float or an integer. Set all weights to 1.\n'
% edge_weight) | |||
except Exception:
if verbose: | |||
print( | |||
'\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n' | |||
% edge_weight) | |||
ds_attrs = get_dataset_attributes( | |||
Gn, | |||
attr_names=['node_labeled', 'node_attr_dim', 'is_directed'], | |||
node_label=node_label) | |||
# remove graphs with no edges, as no sp can be found in their structures, | |||
# so the kernel between such a graph and itself will be zero. | |||
len_gn = len(Gn) | |||
Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0] | |||
idx = [G[0] for G in Gn] | |||
Gn = [G[1] for G in Gn] | |||
if len(Gn) != len_gn: | |||
if verbose: | |||
print('\n %d graphs are removed as they don\'t contain edges.\n' % | |||
(len_gn - len(Gn))) | |||
start_time = time.time() | |||
if parallel == 'imap_unordered': | |||
pool = Pool(n_jobs) | |||
# get shortest path graphs of Gn | |||
getsp_partial = partial(wrapper_getSPGraph, weight) | |||
itr = zip(Gn, range(0, len(Gn))) | |||
if chunksize is None: | |||
if len(Gn) < 100 * n_jobs: | |||
# # use default chunksize as pool.map when iterable is less than 100 | |||
# chunksize, extra = divmod(len(Gn), n_jobs * 4) | |||
# if extra: | |||
# chunksize += 1 | |||
chunksize = int(len(Gn) / n_jobs) + 1 | |||
else: | |||
chunksize = 100 | |||
if verbose: | |||
iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize), | |||
desc='getting sp graphs', file=sys.stdout) | |||
else: | |||
iterator = pool.imap_unordered(getsp_partial, itr, chunksize) | |||
for i, g in iterator: | |||
Gn[i] = g | |||
pool.close() | |||
pool.join() | |||
elif parallel is None: | |||
pass | |||
# # ---- direct running, normally use single CPU core. ---- | |||
# for i in tqdm(range(len(Gn)), desc='getting sp graphs', file=sys.stdout): | |||
# i, Gn[i] = wrapper_getSPGraph(weight, (Gn[i], i)) | |||
# # ---- use pool.map to parallel ---- | |||
# result_sp = pool.map(getsp_partial, range(0, len(Gn))) | |||
# for i in result_sp: | |||
# Gn[i[0]] = i[1] | |||
# or | |||
# getsp_partial = partial(wrap_getSPGraph, Gn, weight) | |||
# for i, g in tqdm( | |||
# pool.map(getsp_partial, range(0, len(Gn))), | |||
# desc='getting sp graphs', | |||
# file=sys.stdout): | |||
# Gn[i] = g | |||
# # ---- only for the Fast Computation of Shortest Path Kernel (FCSP) | |||
# sp_ml = [0] * len(Gn) # shortest path matrices | |||
# for i in result_sp: | |||
# sp_ml[i[0]] = i[1] | |||
# edge_x_g = [[] for i in range(len(sp_ml))] | |||
# edge_y_g = [[] for i in range(len(sp_ml))] | |||
# edge_w_g = [[] for i in range(len(sp_ml))] | |||
# for idx, item in enumerate(sp_ml): | |||
# for i1 in range(len(item)): | |||
# for i2 in range(i1 + 1, len(item)): | |||
# if item[i1, i2] != np.inf: | |||
# edge_x_g[idx].append(i1) | |||
# edge_y_g[idx].append(i2) | |||
# edge_w_g[idx].append(item[i1, i2]) | |||
# print(len(edge_x_g[0])) | |||
# print(len(edge_y_g[0])) | |||
# print(len(edge_w_g[0])) | |||
Kmatrix = np.zeros((len(Gn), len(Gn))) | |||
# ---- use pool.imap_unordered to parallel and track progress. ---- | |||
def init_worker(gn_toshare): | |||
global G_gn | |||
G_gn = gn_toshare | |||
do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels) | |||
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | |||
glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) | |||
# # ---- use pool.map to parallel. ---- | |||
# # result_perf = pool.map(do_partial, itr) | |||
# do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels) | |||
# itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||
# for i, j, kernel in tqdm( | |||
# pool.map(do_partial, itr), desc='calculating kernels', | |||
# file=sys.stdout): | |||
# Kmatrix[i][j] = kernel | |||
# Kmatrix[j][i] = kernel | |||
# pool.close() | |||
# pool.join() | |||
# # ---- use joblib.Parallel to parallel and track progress. ---- | |||
# result_perf = Parallel( | |||
# n_jobs=n_jobs, verbose=10)( | |||
# delayed(do_partial)(ij) | |||
# for ij in combinations_with_replacement(range(0, len(Gn)), 2)) | |||
# result_perf = [ | |||
# do_partial(ij) | |||
# for ij in combinations_with_replacement(range(0, len(Gn)), 2) | |||
# ] | |||
# for i in result_perf: | |||
# Kmatrix[i[0]][i[1]] = i[2] | |||
# Kmatrix[i[1]][i[0]] = i[2] | |||
# # ---- direct running, normally use single CPU core. ---- | |||
# from itertools import combinations_with_replacement | |||
# itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||
# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout): | |||
# kernel = spkernel_do(Gn[i], Gn[j], ds_attrs, node_label, node_kernels) | |||
# Kmatrix[i][j] = kernel | |||
# Kmatrix[j][i] = kernel | |||
run_time = time.time() - start_time | |||
if verbose: | |||
print( | |||
"\n --- shortest path kernel matrix of size %d built in %s seconds ---" | |||
% (len(Gn), run_time)) | |||
return Kmatrix, run_time, idx | |||
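A minimal usage sketch of `spkernel` with a hand-rolled `node_kernels` dict follows. The import path is an assumption, and the three toy kernel functions simply mirror the call signatures used in `spkernel_do` below; they are illustrations, not the kernels shipped with the library.

```python
# Usage sketch; the import path and the example node kernels are assumptions.
import numpy as np
import networkx as nx
from gklearn.kernels.spKernel import spkernel  # assumed location

def kernel_symb(l1, l2):                 # symbolic labels: delta kernel
    return 1.0 if l1 == l2 else 0.0

def kernel_nsymb(a1, a2, gamma=1.0):     # attribute vectors: Gaussian kernel
    d = np.asarray(a1, dtype=float) - np.asarray(a2, dtype=float)
    return float(np.exp(-gamma * np.dot(d, d)))

def kernel_mix(l1, l2, a1, a2):          # both label types: product of the two
    return kernel_symb(l1, l2) * kernel_nsymb(a1, a2)

node_kernels = {'symb': kernel_symb, 'nsymb': kernel_nsymb, 'mix': kernel_mix}

g1, g2 = nx.path_graph(4), nx.cycle_graph(5)
for g in (g1, g2):
    nx.set_node_attributes(g, 'C', 'atom')   # give every node a symbolic label

Kmatrix, run_time, idx = spkernel([g1, g2], node_label='atom',
                                  node_kernels=node_kernels, n_jobs=2)
print(Kmatrix)
```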
def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels): | |||
kernel = 0 | |||
# compute shortest path matrices first, method borrowed from FCSP. | |||
vk_dict = {} # shortest path matrices dict | |||
if ds_attrs['node_labeled']: | |||
# node symb and non-symb labeled
if ds_attrs['node_attr_dim'] > 0: | |||
kn = node_kernels['mix'] | |||
for n1, n2 in product( | |||
g1.nodes(data=True), g2.nodes(data=True)): | |||
vk_dict[(n1[0], n2[0])] = kn( | |||
n1[1][node_label], n2[1][node_label], | |||
n1[1]['attributes'], n2[1]['attributes']) | |||
# node symb labeled | |||
else: | |||
kn = node_kernels['symb'] | |||
for n1 in g1.nodes(data=True): | |||
for n2 in g2.nodes(data=True): | |||
vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label], | |||
n2[1][node_label]) | |||
else: | |||
# node non-symb labeled
if ds_attrs['node_attr_dim'] > 0: | |||
kn = node_kernels['nsymb'] | |||
for n1 in g1.nodes(data=True): | |||
for n2 in g2.nodes(data=True): | |||
vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'], | |||
n2[1]['attributes']) | |||
# node unlabeled | |||
else: | |||
for e1, e2 in product( | |||
g1.edges(data=True), g2.edges(data=True)): | |||
if e1[2]['cost'] == e2[2]['cost']: | |||
kernel += 1 | |||
return kernel | |||
# compute graph kernels | |||
if ds_attrs['is_directed']: | |||
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): | |||
if e1[2]['cost'] == e2[2]['cost']: | |||
nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1], | |||
e2[1])] | |||
kn1 = nk11 * nk22 | |||
kernel += kn1 | |||
else: | |||
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): | |||
if e1[2]['cost'] == e2[2]['cost']: | |||
# each edge walk is counted twice, starting from both its extreme nodes. | |||
nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[( | |||
e1[0], e2[1])], vk_dict[(e1[1], | |||
e2[0])], vk_dict[(e1[1], | |||
e2[1])] | |||
kn1 = nk11 * nk22 | |||
kn2 = nk12 * nk21 | |||
kernel += kn1 + kn2 | |||
# # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2], sadly it is slower than the current implementation | |||
# # compute vertex kernels | |||
# try: | |||
# vk_mat = np.zeros((nx.number_of_nodes(g1), | |||
# nx.number_of_nodes(g2))) | |||
# g1nl = enumerate(g1.nodes(data=True)) | |||
# g2nl = enumerate(g2.nodes(data=True)) | |||
# for i1, n1 in g1nl: | |||
# for i2, n2 in g2nl: | |||
# vk_mat[i1][i2] = kn( | |||
# n1[1][node_label], n2[1][node_label], | |||
# [n1[1]['attributes']], [n2[1]['attributes']]) | |||
# range1 = range(0, len(edge_w_g[i])) | |||
# range2 = range(0, len(edge_w_g[j])) | |||
# for i1 in range1: | |||
# x1 = edge_x_g[i][i1] | |||
# y1 = edge_y_g[i][i1] | |||
# w1 = edge_w_g[i][i1] | |||
# for i2 in range2: | |||
# x2 = edge_x_g[j][i2] | |||
# y2 = edge_y_g[j][i2] | |||
# w2 = edge_w_g[j][i2] | |||
# ke = (w1 == w2) | |||
# if ke > 0: | |||
# kn1 = vk_mat[x1][x2] * vk_mat[y1][y2] | |||
# kn2 = vk_mat[x1][y2] * vk_mat[y1][x2] | |||
# kernel += kn1 + kn2 | |||
return kernel | |||
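In equation form, the undirected branch of `spkernel_do` accumulates, over every pair of shortest-path edges whose lengths ('cost') agree, the node-kernel products for both ways of pairing the endpoints; this is a sketch read off the loops above, with k_v the selected node kernel and S_1, S_2 the shortest-path graphs.

```latex
k(G_1, G_2) = \sum_{(u_1, v_1) \in E_{S_1}} \sum_{(u_2, v_2) \in E_{S_2}}
    \mathbf{1}\big[c(u_1, v_1) = c(u_2, v_2)\big]\,
    \big( k_v(u_1, u_2)\, k_v(v_1, v_2) + k_v(u_1, v_2)\, k_v(v_1, u_2) \big)
```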
def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr): | |||
i = itr[0] | |||
j = itr[1] | |||
return i, j, spkernel_do(G_gn[i], G_gn[j], ds_attrs, node_label, node_kernels) | |||
#def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr_item): | |||
# g1 = itr_item[0][0] | |||
# g2 = itr_item[0][1] | |||
# i = itr_item[1][0] | |||
# j = itr_item[1][1] | |||
# return i, j, spkernel_do(g1, g2, ds_attrs, node_label, node_kernels) | |||
def wrapper_getSPGraph(weight, itr_item): | |||
g = itr_item[0] | |||
i = itr_item[1] | |||
return i, getSPGraph(g, edge_weight=weight) | |||
# return i, nx.floyd_warshall_numpy(g, weight=weight) | |||
@@ -27,6 +27,7 @@ def treeletkernel(*args, | |||
edge_label='bond_type', | |||
parallel='imap_unordered', | |||
n_jobs=None, | |||
chunksize=None, | |||
verbose=True): | |||
"""Calculate treelet graph kernels between graphs. | |||
@@ -92,10 +93,11 @@ def treeletkernel(*args, | |||
# time, but this may cost a lot of memory for large dataset. | |||
pool = Pool(n_jobs) | |||
itr = zip(Gn, range(0, len(Gn))) | |||
if len(Gn) < 100 * n_jobs: | |||
chunksize = int(len(Gn) / n_jobs) + 1 | |||
else: | |||
chunksize = 100 | |||
if chunksize is None: | |||
if len(Gn) < 100 * n_jobs: | |||
chunksize = int(len(Gn) / n_jobs) + 1 | |||
else: | |||
chunksize = 100 | |||
canonkeys = [[] for _ in range(len(Gn))] | |||
get_partial = partial(wrapper_get_canonkeys, node_label, edge_label, | |||
labeled, ds_attrs['is_directed']) | |||
@@ -115,7 +117,7 @@ def treeletkernel(*args, | |||
G_canonkeys = canonkeys_toshare | |||
do_partial = partial(wrapper_treeletkernel_do, sub_kernel) | |||
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | |||
glbv=(canonkeys,), n_jobs=n_jobs, verbose=verbose) | |||
glbv=(canonkeys,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) | |||
# ---- do not use parallelization. ---- | |||
elif parallel == None: | |||
@@ -30,6 +30,7 @@ def weisfeilerlehmankernel(*args, | |||
base_kernel='subtree', | |||
parallel=None, | |||
n_jobs=None, | |||
chunksize=None, | |||
verbose=True): | |||
"""Calculate Weisfeiler-Lehman kernels between graphs. | |||
@@ -91,7 +92,7 @@ def weisfeilerlehmankernel(*args, | |||
# for WL subtree kernel | |||
if base_kernel == 'subtree': | |||
Kmatrix = _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose) | |||
Kmatrix = _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, chunksize, verbose) | |||
# for WL shortest path kernel | |||
elif base_kernel == 'sp': | |||
@@ -113,7 +114,7 @@ def weisfeilerlehmankernel(*args, | |||
return Kmatrix, run_time | |||
def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose): | |||
def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, chunksize, verbose): | |||
"""Calculate Weisfeiler-Lehman kernels between graphs. | |||
Parameters | |||
@@ -146,7 +147,7 @@ def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose) | |||
all_num_of_each_label.append(dict(Counter(labels_ori))) | |||
# calculate subtree kernel with the 0th iteration and add it to the final kernel | |||
compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, False) | |||
compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, chunksize, False) | |||
# iterate each height | |||
for h in range(1, height + 1): | |||
@@ -304,7 +305,7 @@ def wrapper_wl_iteration(node_label, itr_item): | |||
return i, all_multisets | |||
def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, verbose): | |||
def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, chunksize, verbose): | |||
"""Compute kernel matrix using the base kernel. | |||
""" | |||
if parallel == 'imap_unordered': | |||
@@ -314,7 +315,7 @@ def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, | |||
G_alllabels = alllabels_toshare | |||
do_partial = partial(wrapper_compute_subtree_kernel, Kmatrix) | |||
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | |||
glbv=(all_num_of_each_label,), n_jobs=n_jobs, verbose=verbose) | |||
glbv=(all_num_of_each_label,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) | |||
elif parallel == None: | |||
for i in range(len(Kmatrix)): | |||
for j in range(i, len(Kmatrix)): | |||
@@ -24,7 +24,7 @@ def parallel_me(func, func_assign, var_to_assign, itr, len_itr=None, init_worker | |||
n_jobs = multiprocessing.cpu_count() | |||
with Pool(processes=n_jobs, initializer=init_worker, | |||
initargs=glbv) as pool: | |||
if chunksize == None: | |||
if chunksize is None: | |||
if len_itr < 100 * n_jobs: | |||
chunksize = int(len_itr / n_jobs) + 1 | |||
else: | |||
@@ -39,7 +39,7 @@ def parallel_me(func, func_assign, var_to_assign, itr, len_itr=None, init_worker | |||
if n_jobs == None: | |||
n_jobs = multiprocessing.cpu_count() | |||
with Pool(processes=n_jobs) as pool: | |||
if chunksize == None: | |||
if chunksize is None: | |||
if len_itr < 100 * n_jobs: | |||
chunksize = int(len_itr / n_jobs) + 1 | |||
else: | |||
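The chunksize heuristic that these hunks thread through the kernel functions is small enough to state on its own; the helper below is hypothetical and only restates the default behaviour shown above.

```python
# Hypothetical helper mirroring the default chunksize logic used above.
def default_chunksize(len_itr, n_jobs):
    # Small workloads: split the iterable roughly evenly across the workers.
    # Large workloads: cap chunks at 100 items to keep progress reporting responsive.
    if len_itr < 100 * n_jobs:
        return int(len_itr / n_jobs) + 1
    return 100

print(default_chunksize(50, 4))     # 13
print(default_chunksize(10000, 4))  # 100
```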
@@ -8,7 +8,7 @@ with open('requirements_pypi.txt') as fp: | |||
setuptools.setup( | |||
name="graphkit-learn", | |||
version="0.2b4", | |||
version="0.2.0", | |||
author="Linlin Jia", | |||
author_email="linlin.jia@insa-rouen.fr", | |||
description="A Python library for graph kernels, graph edit distances, and graph pre-images", | |||