""" |
|
|
|
@author: linlin |
|
|
|
|
|
|
|
@references: |
|
|
|
|
|
|
|
[1] Liva Ralaivola, Sanjay J Swamidass, Hiroto Saigo, and Pierre |
|
|
|
Baldi. Graph kernels for chemical informatics. Neural networks, |
|
|
|
18(8):1093–1110, 2005. |
|
|
|
""" |
|
|
|
|
|
|
|
import sys
import time
from collections import Counter
from itertools import chain
from functools import partial
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

import networkx as nx
import numpy as np

from gklearn.utils.graphdataset import get_dataset_attributes
from gklearn.utils.parallel import parallel_gm
from gklearn.utils.trie import Trie


def untilhpathkernel(*args,
                     node_label='atom',
                     edge_label='bond_type',
                     depth=10,
                     k_func='MinMax',
                     compute_method='trie',
                     parallel='imap_unordered',
                     n_jobs=None,
                     chunksize=None,
                     verbose=True):
"""Calculate path graph kernels up to depth/hight h between graphs. |
|
|
|
|
|
|
|
Parameters |
|
|
|
---------- |
|
|
|
Gn : List of NetworkX graph |
|
|
|
List of graphs between which the kernels are calculated. |
|
|
|
|
|
|
|
G1, G2 : NetworkX graphs |
|
|
|
Two graphs between which the kernel is calculated. |
|
|
|
|
|
|
|
node_label : string |
|
|
|
Node attribute used as label. The default node label is atom. |
|
|
|
|
|
|
|
edge_label : string |
|
|
|
Edge attribute used as label. The default edge label is bond_type. |
|
|
|
|
|
|
|
depth : integer |
|
|
|
Depth of search. Longest length of paths. |
|
|
|
|
|
|
|
k_func : function |
|
|
|
A kernel function applied using different notions of fingerprint |
|
|
|
similarity, defining the type of feature map and normalization method |
|
|
|
applied for the graph kernel. The Following choices are available: |
|
|
|
|
|
|
|
'MinMax': use the MiniMax kernel and counting feature map. |
|
|
|
|
|
|
|
'tanimoto': use the Tanimoto kernel and binary feature map. |
|
|
|
|
|
|
|
None: no sub-kernel is used, the kernel is computed directly. |
|
|
|
|
|
|
|
compute_method : string |
|
|
|
Computation method to store paths and compute the graph kernel. The |
|
|
|
Following choices are available: |
|
|
|
|
|
|
|
'trie': store paths as tries. |
|
|
|
|
|
|
|
'naive': store paths to lists. |
|
|
|
|
|
|
|
n_jobs : int |
|
|
|
Number of jobs for parallelization. |
|
|
|
|
|
|
|
Return |
|
|
|
------ |
|
|
|
Kmatrix : Numpy matrix |
|
|
|
Kernel matrix, each element of which is the path kernel up to h between |
|
|
|
2 praphs. |
|
|
|
""" |
|
|
|
    # pre-process
    depth = int(depth)
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
                    'edge_attr_dim', 'is_directed'],
        node_label=node_label, edge_label=edge_label)
    if k_func is not None:
        # add dummy labels so that unlabeled graphs are treated uniformly.
        if not ds_attrs['node_labeled']:
            for G in Gn:
                nx.set_node_attributes(G, '0', 'atom')
        if not ds_attrs['edge_labeled']:
            for G in Gn:
                nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    if parallel == 'imap_unordered':
        # ---- use pool.imap_unordered to parallelize and track progress. ----
        # Get all paths of all graphs before calculating kernels to save time;
        # this may cost a lot of memory for large datasets.
        if n_jobs is None:
            n_jobs = cpu_count()
        pool = Pool(n_jobs)
        itr = zip(Gn, range(0, len(Gn)))
        if chunksize is None:
            if len(Gn) < 100 * n_jobs:
                chunksize = int(len(Gn) / n_jobs) + 1
            else:
                chunksize = 100
        all_paths = [[] for _ in range(len(Gn))]
        if compute_method == 'trie' and k_func is not None:
            getps_partial = partial(wrapper_find_all_path_as_trie, depth,
                                    ds_attrs, node_label, edge_label)
        elif compute_method != 'trie' and k_func is not None:
            getps_partial = partial(wrapper_find_all_paths_until_length, depth,
                                    ds_attrs, node_label, edge_label, True)
        else:
            getps_partial = partial(wrapper_find_all_paths_until_length, depth,
                                    ds_attrs, node_label, edge_label, False)
        if verbose:
            iterator = tqdm(pool.imap_unordered(getps_partial, itr, chunksize),
                            desc='getting paths', file=sys.stdout)
        else:
            iterator = pool.imap_unordered(getps_partial, itr, chunksize)
        for i, ps in iterator:
            all_paths[i] = ps
        pool.close()
        pool.join()

        if compute_method == 'trie' and k_func is not None:
            def init_worker(trie_toshare):
                global G_trie
                G_trie = trie_toshare
            do_partial = partial(wrapper_uhpath_do_trie, k_func)
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                        glbv=(all_paths,), n_jobs=n_jobs, chunksize=chunksize,
                        verbose=verbose)
        elif compute_method != 'trie' and k_func is not None:
            def init_worker(plist_toshare):
                global G_plist
                G_plist = plist_toshare
            do_partial = partial(wrapper_uhpath_do_naive, k_func)
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                        glbv=(all_paths,), n_jobs=n_jobs, chunksize=chunksize,
                        verbose=verbose)
        else:
            # no sub-kernel: compare the path lists directly.
            def init_worker(plist_toshare):
                global G_plist
                G_plist = plist_toshare
            do_partial = partial(wrapper_uhpath_do_kernelless, k_func)
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                        glbv=(all_paths,), n_jobs=n_jobs, chunksize=chunksize,
                        verbose=verbose)
|
|
|
|
    elif parallel is None:
        # ---- direct running, normally using a single CPU core. ----
        if compute_method == 'trie':
            all_paths = [
                find_all_path_as_trie(Gn[i],
                                      depth,
                                      ds_attrs,
                                      node_label=node_label,
                                      edge_label=edge_label)
                for i in tqdm(range(0, len(Gn)), desc='getting paths',
                              file=sys.stdout)
            ]
            pbar = tqdm(
                total=((len(Gn) + 1) * len(Gn) / 2),
                desc='calculating kernels',
                file=sys.stdout)
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    Kmatrix[i][j] = _untilhpathkernel_do_trie(
                        all_paths[i], all_paths[j], k_func)
                    Kmatrix[j][i] = Kmatrix[i][j]
                    pbar.update(1)
        else:
            all_paths = [
                find_all_paths_until_length(
                    Gn[i],
                    depth,
                    ds_attrs,
                    node_label=node_label,
                    edge_label=edge_label)
                for i in tqdm(range(0, len(Gn)), desc='getting paths',
                              file=sys.stdout)
            ]
            pbar = tqdm(
                total=((len(Gn) + 1) * len(Gn) / 2),
                desc='calculating kernels',
                file=sys.stdout)
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    Kmatrix[i][j] = _untilhpathkernel_do_naive(
                        all_paths[i], all_paths[j], k_func)
                    Kmatrix[j][i] = Kmatrix[i][j]
                    pbar.update(1)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- kernel matrix of path kernel up to %d of size %d built in %s seconds ---"
              % (depth, len(Gn), run_time))

    return Kmatrix, run_time
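

# A minimal usage sketch (illustrative, not part of the library's API docs):
# compute the 2x2 kernel matrix of two labeled graphs with the MinMax kernel,
# running serially to avoid multiprocessing overhead on toy input.
#
#     Kmatrix, run_time = untilhpathkernel(g1, g2, node_label='atom',
#                                          edge_label='bond_type', depth=2,
#                                          k_func='MinMax',
#                                          compute_method='trie',
#                                          parallel=None)
#
# See the runnable demo at the bottom of this file.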
|
|
|
|
|
|
|
|
|
|
|
def _untilhpathkernel_do_trie(trie1, trie2, k_func):
    """Calculate path graph kernels up to depth d between 2 graphs using tries.

    Parameters
    ----------
    trie1, trie2 : Trie
        Tries that contain all paths in the 2 graphs.
    k_func : string
        A kernel function applied using different notions of fingerprint
        similarity.

    Return
    ------
    kernel : float
        Path kernel up to h between 2 graphs.
    """
|
|
|
    if k_func == 'tanimoto':
        # traverse all paths in graph1 and search them in graph2. Depth-first
        # search is applied. (The mutable default pcurrent=[] is safe here,
        # since the closure is rebuilt on every call of the outer function.)
        def traverseTrie1t(root, trie2, setlist, pcurrent=[]):
            for key, node in root['children'].items():
                pcurrent.append(key)
                if node['isEndOfWord']:
                    setlist[1] += 1
                    count2 = trie2.searchWord(pcurrent)
                    if count2 != 0:
                        setlist[0] += 1
                if node['children'] != {}:
                    traverseTrie1t(node, trie2, setlist, pcurrent)
                else:
                    del pcurrent[-1]
            if pcurrent != []:
                del pcurrent[-1]

        # traverse all paths in graph2 and find out those that are not in
        # graph1. Depth-first search is applied.
        def traverseTrie2t(root, trie1, setlist, pcurrent=[]):
            for key, node in root['children'].items():
                pcurrent.append(key)
                if node['isEndOfWord']:
                    count1 = trie1.searchWord(pcurrent)
                    if count1 == 0:
                        setlist[1] += 1
                if node['children'] != {}:
                    traverseTrie2t(node, trie1, setlist, pcurrent)
                else:
                    del pcurrent[-1]
            if pcurrent != []:
                del pcurrent[-1]

        setlist = [0, 0]  # intersection and union of path sets of g1, g2.
        traverseTrie1t(trie1.root, trie2, setlist)
        traverseTrie2t(trie2.root, trie1, setlist)
        kernel = setlist[0] / setlist[1]
|
|
|
|
|
|
|
    else:  # MinMax kernel
        # traverse all paths in graph1 and search them in graph2. Depth-first
        # search is applied.
        def traverseTrie1m(root, trie2, sumlist, pcurrent=[]):
            for key, node in root['children'].items():
                pcurrent.append(key)
                if node['isEndOfWord']:
                    count1 = node['count']
                    count2 = trie2.searchWord(pcurrent)
                    sumlist[0] += min(count1, count2)
                    sumlist[1] += max(count1, count2)
                if node['children'] != {}:
                    traverseTrie1m(node, trie2, sumlist, pcurrent)
                else:
                    del pcurrent[-1]
            if pcurrent != []:
                del pcurrent[-1]

        # traverse all paths in graph2 and find out those that are not in
        # graph1. Depth-first search is applied.
        def traverseTrie2m(root, trie1, sumlist, pcurrent=[]):
            for key, node in root['children'].items():
                pcurrent.append(key)
                if node['isEndOfWord']:
                    count1 = trie1.searchWord(pcurrent)
                    if count1 == 0:
                        sumlist[1] += node['count']
                if node['children'] != {}:
                    traverseTrie2m(node, trie1, sumlist, pcurrent)
                else:
                    del pcurrent[-1]
            if pcurrent != []:
                del pcurrent[-1]

        sumlist = [0, 0]  # sum of mins and sum of maxs
        traverseTrie1m(trie1.root, trie2, sumlist)
        traverseTrie2m(trie2.root, trie1, sumlist)
        kernel = sumlist[0] / sumlist[1]

    return kernel
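

# Worked example (a hedged sketch, assuming gklearn's Trie API as used above):
# if trie1 holds the label sequences ('C', '1', 'O') and ('C',), and trie2
# holds ('C', '1', 'O') and ('N',), the path sets intersect in 1 sequence out
# of 3 in their union, so the Tanimoto kernel equals 1 / 3.
#
#     t1, t2 = Trie(), Trie()
#     for w in [('C', '1', 'O'), ('C',)]:
#         t1.insertWord(w)
#     for w in [('C', '1', 'O'), ('N',)]:
#         t2.insertWord(w)
#     assert abs(_untilhpathkernel_do_trie(t1, t2, 'tanimoto') - 1 / 3) < 1e-10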
|
|
|
|
|
|
|
|
|
|
|
def wrapper_uhpath_do_trie(k_func, itr):
    # G_trie is a worker-global list of tries set up by init_worker.
    i = itr[0]
    j = itr[1]
    return i, j, _untilhpathkernel_do_trie(G_trie[i], G_trie[j], k_func)
|
|
|
|
|
|
|
|
|
|
|
def _untilhpathkernel_do_naive(paths1, paths2, k_func):
    """Calculate path graph kernels up to depth d between 2 graphs naively.

    Parameters
    ----------
    paths1, paths2 : list
        Lists of paths in the 2 graphs, where for unlabeled graphs, each path
        is represented by a list of nodes; while for labeled graphs, each
        path is represented by a string consisting of the labels of the nodes
        and/or edges on that path.
    k_func : string
        A kernel function applied using different notions of fingerprint
        similarity.

    Return
    ------
    kernel : float
        Path kernel up to h between 2 graphs.
    """
    all_paths = list(set(paths1 + paths2))
|
|
|
|
|
|
|
    if k_func == 'tanimoto':
        length_union = len(all_paths)
        kernel = (len(set(paths1)) + len(set(paths2)) -
                  length_union) / length_union

    else:  # MinMax kernel
        path_count1 = Counter(paths1)
        path_count2 = Counter(paths2)
        # Counter returns 0 for missing keys, so no explicit guard is needed.
        vector1 = [path_count1[key] for key in all_paths]
        vector2 = [path_count2[key] for key in all_paths]
        kernel = np.sum(np.minimum(vector1, vector2)) / \
            np.sum(np.maximum(vector1, vector2))

    return kernel
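

# Worked example (an illustrative sketch): with
#     paths1 = [('C', '1', 'O'), ('C',)]
#     paths2 = [('C', '1', 'O'), ('C', '1', 'O'), ('N',)]
# the MinMax kernel compares per-path counts over the union of paths:
# (min(1, 2) + min(1, 0) + min(0, 1)) / (max(1, 2) + max(1, 0) + max(0, 1))
# = 1 / 4, while 'tanimoto' only compares the binary path sets: 1 / 3.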
|
|
|
|
|
|
|
|
|
|
|
def wrapper_uhpath_do_naive(k_func, itr):
    # G_plist is a worker-global list of path lists set up by init_worker.
    i = itr[0]
    j = itr[1]
    return i, j, _untilhpathkernel_do_naive(G_plist[i], G_plist[j], k_func)
|
|
|
|
|
|
|
|
|
|
|
# @todo: this currently mirrors _untilhpathkernel_do_naive; a truly
# kernel-less (direct) computation is not implemented yet.
def _untilhpathkernel_do_kernelless(paths1, paths2, k_func):
    """Calculate path graph kernels up to depth d between 2 graphs naively.

    Parameters
    ----------
    paths1, paths2 : list
        Lists of paths in the 2 graphs, where for unlabeled graphs, each path
        is represented by a list of nodes; while for labeled graphs, each
        path is represented by a string consisting of the labels of the nodes
        and/or edges on that path.
    k_func : string
        A kernel function applied using different notions of fingerprint
        similarity.

    Return
    ------
    kernel : float
        Path kernel up to h between 2 graphs.
    """
    all_paths = list(set(paths1 + paths2))

    if k_func == 'tanimoto':
        length_union = len(all_paths)
        kernel = (len(set(paths1)) + len(set(paths2)) -
                  length_union) / length_union

    else:  # MinMax kernel
        path_count1 = Counter(paths1)
        path_count2 = Counter(paths2)
        vector1 = [path_count1[key] for key in all_paths]
        vector2 = [path_count2[key] for key in all_paths]
        kernel = np.sum(np.minimum(vector1, vector2)) / \
            np.sum(np.maximum(vector1, vector2))

    return kernel


def wrapper_uhpath_do_kernelless(k_func, itr):
    # G_plist is a worker-global list of path lists set up by init_worker.
    i = itr[0]
    j = itr[1]
    return i, j, _untilhpathkernel_do_kernelless(G_plist[i], G_plist[j],
                                                 k_func)
|
|
|
|
|
|
|
|
|
|
|
# @todo: (can maybe be removed) this method finds paths repetitively; it
# could be faster.
def find_all_paths_until_length(G,
                                length,
                                ds_attrs,
                                node_label='atom',
                                edge_label='bond_type',
                                tolabelseqs=True):
    """Find all paths no longer than a certain maximum length in a graph. A
    recursive depth-first search is applied.

    Parameters
    ----------
    G : NetworkX graph
        The graph in which paths are searched.
    length : integer
        The maximum length of paths.
    ds_attrs : dict
        Dataset attributes.
    node_label : string
        Node attribute used as label. The default node label is 'atom'.
    edge_label : string
        Edge attribute used as label. The default edge label is 'bond_type'.
    tolabelseqs : boolean
        Whether to convert the paths found to label sequences.

    Return
    ------
    path : list
        List of paths retrieved, where for unlabeled graphs, each path is
        represented by a list of nodes; while for labeled graphs, each path
        is represented by a list of strings consisting of the labels of the
        nodes and/or edges on that path.
    """
    path_l = [[n] for n in G.nodes]  # paths of length l
    all_paths = [p.copy() for p in path_l]
    for l in range(1, length + 1):
        path_lplus1 = []
        for path in path_l:
            for neighbor in G[path[-1]]:
                if neighbor not in path:  # only simple paths are considered.
                    tmp = path + [neighbor]
                    path_lplus1.append(tmp)
        all_paths += path_lplus1
        path_l = [p.copy() for p in path_lplus1]

    # convert paths to label sequences when requested.
    return (paths2labelseqs(all_paths, G, ds_attrs, node_label, edge_label)
            if tolabelseqs else all_paths)
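

# Example (an illustrative sketch): on the triangle graph with nodes 0, 1, 2,
# find_all_paths_until_length(G, 1, ds_attrs, tolabelseqs=False) returns the
# 3 single-node paths plus all 6 directed edges as 2-node paths, e.g.
# [[0], [1], [2], [0, 1], [0, 2], [1, 0], [1, 2], [2, 0], [2, 1]];
# each undirected simple path hence appears once per direction.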
|
|
|
|
|
|
|
|
|
|
|
def wrapper_find_all_paths_until_length(length, ds_attrs, node_label,
                                        edge_label, tolabelseqs, itr_item):
    g = itr_item[0]
    i = itr_item[1]
    return i, find_all_paths_until_length(g, length, ds_attrs,
                                          node_label=node_label,
                                          edge_label=edge_label,
                                          tolabelseqs=tolabelseqs)
|
|
|
|
|
|
|
|
|
|
|
def find_all_path_as_trie(G,
                          length,
                          ds_attrs,
                          node_label='atom',
                          edge_label='bond_type'):
    """Find all paths no longer than length in a graph and store them, as
    label sequences, in a trie.
    """
    # traverse all paths up to length h in a graph and construct a trie with
    # them. Depth-first search is applied. Notice the reverse of each path is
    # also stored in the trie, since the search starts from every node.
    def traverseGraph(root, ptrie, length, G, ds_attrs, node_label, edge_label,
                      pcurrent=[]):
        if len(pcurrent) < length + 1:
            for neighbor in G[root]:
                if neighbor not in pcurrent:
                    pcurrent.append(neighbor)
                    plstr = paths2labelseqs([pcurrent], G, ds_attrs,
                                            node_label, edge_label)
                    ptrie.insertWord(plstr[0])
                    traverseGraph(neighbor, ptrie, length, G, ds_attrs,
                                  node_label, edge_label, pcurrent)
                    del pcurrent[-1]

    ptrie = Trie()
    # insert paths of length 0 (single nodes) first.
    path_l = [[n] for n in G.nodes]
    path_l_str = paths2labelseqs(path_l, G, ds_attrs, node_label, edge_label)
    for p in path_l_str:
        ptrie.insertWord(p)
    # then grow paths from every node by depth-first search.
    for n in G.nodes:
        traverseGraph(n, ptrie, length, G, ds_attrs, node_label, edge_label,
                      pcurrent=[n])

    return ptrie
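

# Example (an illustrative sketch): for a 2-node path graph with 'atom'
# labels 'C' and 'O' joined by a 'bond_type' label '1', the trie built by
# find_all_path_as_trie(G, 1, ds_attrs) stores ('C',), ('O',), and both
# orientations ('C', '1', 'O') and ('O', '1', 'C'), so that path lookups
# between graphs are direction-independent.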
|
|
|
|
|
|
|
|
|
|
|
def wrapper_find_all_path_as_trie(length, ds_attrs, node_label,
                                  edge_label, itr_item):
    g = itr_item[0]
    i = itr_item[1]
    return i, find_all_path_as_trie(g, length, ds_attrs,
                                    node_label=node_label,
                                    edge_label=edge_label)
|
|
|
|
|
|
|
|
|
|
|
def paths2labelseqs(plist, G, ds_attrs, node_label, edge_label):
    """Convert paths (lists of nodes) to tuples of node and/or edge labels."""
    if ds_attrs['node_labeled']:
        if ds_attrs['edge_labeled']:
            # interleave node labels with the labels of the traversed edges.
            path_strs = [
                tuple(
                    list(
                        chain.from_iterable(
                            (G.nodes[node][node_label],
                             G[node][path[idx + 1]][edge_label])
                            for idx, node in enumerate(path[:-1]))) +
                    [G.nodes[path[-1]][node_label]]) for path in plist
            ]
        else:
            path_strs = [
                tuple([G.nodes[node][node_label] for node in path])
                for path in plist
            ]
        return path_strs
    else:
        if ds_attrs['edge_labeled']:
            return [
                tuple([] if len(path) == 1 else [
                    G[node][path[idx + 1]][edge_label]
                    for idx, node in enumerate(path[:-1])
                ]) for path in plist
            ]
        else:
            # unlabeled graphs: all nodes share the dummy label '0'.
            return [tuple(['0' for node in path]) for path in plist]
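

# Example (an illustrative sketch): for a path [u, v] where u has 'atom'
# label 'C', v has 'atom' label 'O', and the edge (u, v) has 'bond_type'
# label '1', paths2labelseqs returns [('C', '1', 'O')] when both node and
# edge labels are present, [('C', 'O')] with node labels only, and [('1',)]
# with edge labels only.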
|
|
|
|
|
|
|
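if __name__ == '__main__':
    # Minimal smoke test (an illustrative sketch, assuming networkx and
    # gklearn are installed): two toy "molecules" that share a C-O bond but
    # differ in their third atom. Runs serially (parallel=None); diagonal
    # entries of the MinMax kernel matrix should equal 1.
    g1 = nx.Graph()
    g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}),
                       (2, {'atom': 'C'})])
    g1.add_edges_from([(0, 1, {'bond_type': '1'}),
                       (1, 2, {'bond_type': '1'})])
    g2 = nx.Graph()
    g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}),
                       (2, {'atom': 'N'})])
    g2.add_edges_from([(0, 1, {'bond_type': '1'}),
                       (1, 2, {'bond_type': '1'})])
    Kmatrix, run_time = untilhpathkernel(g1, g2, depth=2, k_func='MinMax',
                                         compute_method='trie', parallel=None)
    print(Kmatrix)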