From f435b840d16ea7e54c236ccff83d48ec93f67eee Mon Sep 17 00:00:00 2001
From: linlin
Date: Tue, 6 Oct 2020 17:25:51 +0200
Subject: [PATCH] New translations untilHPathKernel.py (Chinese Simplified)

---
 lang/zh/gklearn/kernels/untilHPathKernel.py | 726 ++++++++++++++++++++++++++++
 1 file changed, 726 insertions(+)
 create mode 100644 lang/zh/gklearn/kernels/untilHPathKernel.py

diff --git a/lang/zh/gklearn/kernels/untilHPathKernel.py b/lang/zh/gklearn/kernels/untilHPathKernel.py
new file mode 100644
index 0000000..9bac28b
--- /dev/null
+++ b/lang/zh/gklearn/kernels/untilHPathKernel.py
@@ -0,0 +1,726 @@
+"""
+@author: linlin
+
+@references:
+
+    [1] Liva Ralaivola, Sanjay J Swamidass, Hiroto Saigo, and Pierre
+    Baldi. Graph kernels for chemical informatics. Neural networks,
+    18(8):1093–1110, 2005.
+"""
+
+import sys
+import time
+from collections import Counter
+from itertools import chain
+from functools import partial
+from multiprocessing import Pool
+from tqdm import tqdm
+
+import networkx as nx
+import numpy as np
+
+from gklearn.utils.graphdataset import get_dataset_attributes
+from gklearn.utils.parallel import parallel_gm
+from gklearn.utils.trie import Trie
+
+
+def untilhpathkernel(*args,
+                     node_label='atom',
+                     edge_label='bond_type',
+                     depth=10,
+                     k_func='MinMax',
+                     compute_method='trie',
+                     parallel='imap_unordered',
+                     n_jobs=None,
+                     chunksize=None,
+                     verbose=True):
+    """Calculate path graph kernels up to depth/height h between graphs.
+
+    Parameters
+    ----------
+    Gn : List of NetworkX graphs
+        List of graphs between which the kernels are calculated.
+
+    G1, G2 : NetworkX graphs
+        Two graphs between which the kernel is calculated.
+
+    node_label : string
+        Node attribute used as label. The default node label is 'atom'.
+
+    edge_label : string
+        Edge attribute used as label. The default edge label is 'bond_type'.
+
+    depth : integer
+        Depth of the search, i.e., the longest length of paths.
+
+    k_func : string
+        A kernel function applied using different notions of fingerprint
+        similarity, defining the type of feature map and the normalization
+        method applied to the graph kernel. The following choices are
+        available:
+
+        'MinMax': use the MinMax kernel and counting feature map.
+
+        'tanimoto': use the Tanimoto kernel and binary feature map.
+
+        None: no sub-kernel is used, the kernel is computed directly.
+
+    compute_method : string
+        Computation method used to store paths and compute the graph
+        kernel. The following choices are available:
+
+        'trie': store paths as tries.
+
+        'naive': store paths in lists.
+
+    n_jobs : int
+        Number of jobs for parallelization.
+
+    Return
+    ------
+    Kmatrix : Numpy matrix
+        Kernel matrix, each element of which is the path kernel up to h
+        between 2 graphs.
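+
+    Examples
+    --------
+    An illustrative call (not a doctest; ``Gn`` is assumed to be a list
+    of labeled NetworkX graphs as described above):
+
+    >>> Kmatrix, run_time = untilhpathkernel(Gn, node_label='atom',
+    ...         edge_label='bond_type', depth=10, k_func='MinMax',
+    ...         compute_method='trie', parallel=None)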
+    """
+    # pre-process
+    depth = int(depth)
+    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
+    Gn = [g.copy() for g in Gn]
+    Kmatrix = np.zeros((len(Gn), len(Gn)))
+    ds_attrs = get_dataset_attributes(
+        Gn,
+        attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
+                    'edge_attr_dim', 'is_directed'],
+        node_label=node_label, edge_label=edge_label)
+    if k_func is not None:
+        if not ds_attrs['node_labeled']:
+            for G in Gn:
+                nx.set_node_attributes(G, '0', 'atom')
+        if not ds_attrs['edge_labeled']:
+            for G in Gn:
+                nx.set_edge_attributes(G, '0', 'bond_type')
+
+    start_time = time.time()
+
+    if parallel == 'imap_unordered':
+        # ---- use pool.imap_unordered to parallelize and track progress. ----
+        # get all paths of all graphs before calculating kernels to save
+        # time, but this may cost a lot of memory for large datasets.
+        pool = Pool(n_jobs)
+        itr = zip(Gn, range(0, len(Gn)))
+        if chunksize is None:
+            if len(Gn) < 100 * n_jobs:
+                chunksize = int(len(Gn) / n_jobs) + 1
+            else:
+                chunksize = 100
+        all_paths = [[] for _ in range(len(Gn))]
+        if compute_method == 'trie' and k_func is not None:
+            getps_partial = partial(wrapper_find_all_path_as_trie, depth,
+                                    ds_attrs, node_label, edge_label)
+        elif compute_method != 'trie' and k_func is not None:
+            getps_partial = partial(wrapper_find_all_paths_until_length, depth,
+                                    ds_attrs, node_label, edge_label, True)
+        else:
+            getps_partial = partial(wrapper_find_all_paths_until_length, depth,
+                                    ds_attrs, node_label, edge_label, False)
+        if verbose:
+            iterator = tqdm(pool.imap_unordered(getps_partial, itr, chunksize),
+                            desc='getting paths', file=sys.stdout)
+        else:
+            iterator = pool.imap_unordered(getps_partial, itr, chunksize)
+        for i, ps in iterator:
+            all_paths[i] = ps
+        pool.close()
+        pool.join()
+
+#        for g in Gn:
+#            if compute_method == 'trie' and k_func is not None:
+#                find_all_path_as_trie(g, depth, ds_attrs, node_label, edge_label)
+#            elif compute_method != 'trie' and k_func is not None:
+#                find_all_paths_until_length(g, depth, ds_attrs, node_label, edge_label)
+#            else:
+#                find_all_paths_until_length(g, depth, ds_attrs, node_label, edge_label, False)
+
+##        size = sys.getsizeof(all_paths)
+##        for item in all_paths:
+##            size += sys.getsizeof(item)
+##            for pppps in item:
+##                size += sys.getsizeof(pppps)
+##        print(size)
+#
+##        ttt = time.time()
+##        # ---- use pool.map to parallelize ----
+##        for i, ps in tqdm(
+##                pool.map(getps_partial, range(0, len(Gn))),
+##                desc='getting paths', file=sys.stdout):
+##            all_paths[i] = ps
+##        print(time.time() - ttt)
+
+        if compute_method == 'trie' and k_func is not None:
+            def init_worker(trie_toshare):
+                global G_trie
+                G_trie = trie_toshare
+            do_partial = partial(wrapper_uhpath_do_trie, k_func)
+            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
+                        glbv=(all_paths,), n_jobs=n_jobs, chunksize=chunksize,
+                        verbose=verbose)
+        elif compute_method != 'trie' and k_func is not None:
+            def init_worker(plist_toshare):
+                global G_plist
+                G_plist = plist_toshare
+            do_partial = partial(wrapper_uhpath_do_naive, k_func)
+            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
+                        glbv=(all_paths,), n_jobs=n_jobs, chunksize=chunksize,
+                        verbose=verbose)
+        else:
+            def init_worker(plist_toshare):
+                global G_plist
+                G_plist = plist_toshare
+            # the wrapper takes k_func as its first positional argument.
+            do_partial = partial(wrapper_uhpath_do_kernelless, k_func)
+            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
+                        glbv=(all_paths,), n_jobs=n_jobs, chunksize=chunksize,
+                        verbose=verbose)
+
+    elif parallel is None:
+#        from pympler import asizeof
+        # ---- direct running, normally use a single CPU core. ----
+#        print(asizeof.asized(all_paths, detail=1).format())
+
+        if compute_method == 'trie':
+            all_paths = [
+                find_all_path_as_trie(Gn[i],
+                                      depth,
+                                      ds_attrs,
+                                      node_label=node_label,
+                                      edge_label=edge_label) for i in tqdm(
+                    range(0, len(Gn)), desc='getting paths', file=sys.stdout)
+            ]
+#            sizeof_allpaths = asizeof.asizeof(all_paths)
+#            print(sizeof_allpaths)
+            pbar = tqdm(
+                total=((len(Gn) + 1) * len(Gn) / 2),
+                desc='calculating kernels',
+                file=sys.stdout)
+            for i in range(0, len(Gn)):
+                for j in range(i, len(Gn)):
+                    Kmatrix[i][j] = _untilhpathkernel_do_trie(all_paths[i],
+                                                              all_paths[j],
+                                                              k_func)
+                    Kmatrix[j][i] = Kmatrix[i][j]
+                    pbar.update(1)
+        else:
+            all_paths = [
+                find_all_paths_until_length(
+                    Gn[i],
+                    depth,
+                    ds_attrs,
+                    node_label=node_label,
+                    edge_label=edge_label) for i in tqdm(
+                    range(0, len(Gn)), desc='getting paths', file=sys.stdout)
+            ]
+#            sizeof_allpaths = asizeof.asizeof(all_paths)
+#            print(sizeof_allpaths)
+            pbar = tqdm(
+                total=((len(Gn) + 1) * len(Gn) / 2),
+                desc='calculating kernels',
+                file=sys.stdout)
+            for i in range(0, len(Gn)):
+                for j in range(i, len(Gn)):
+                    Kmatrix[i][j] = _untilhpathkernel_do_naive(all_paths[i],
+                                                               all_paths[j],
+                                                               k_func)
+                    Kmatrix[j][i] = Kmatrix[i][j]
+                    pbar.update(1)
+
+    run_time = time.time() - start_time
+    if verbose:
+        print("\n --- kernel matrix of path kernel up to %d of size %d built in %s seconds ---"
+              % (depth, len(Gn), run_time))
+
+#    print(Kmatrix[0][0:10])
+    return Kmatrix, run_time
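+
+
+# For reference, the two sub-kernels computed by the helpers below (cf. [1]):
+# with P(G) the multiset of label sequences of all paths of length <= h in G,
+# and c_G(p) the number of occurrences of path p in G,
+#
+#     tanimoto(G1, G2) = |set(P(G1)) & set(P(G2))| / |set(P(G1)) | set(P(G2))|
+#     MinMax(G1, G2)   = sum_p min(c_G1(p), c_G2(p)) / sum_p max(c_G1(p), c_G2(p))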
+
+
+def _untilhpathkernel_do_trie(trie1, trie2, k_func):
+    """Calculate path graph kernels up to depth h between 2 graphs using tries.
+
+    Parameters
+    ----------
+    trie1, trie2 : Trie
+        Tries that contain all paths in the 2 graphs.
+    k_func : string
+        A kernel function applied using different notions of fingerprint
+        similarity.
+
+    Return
+    ------
+    kernel : float
+        Path kernel up to h between 2 graphs.
+    """
+    if k_func == 'tanimoto':
+        # traverse all paths in graph1 and search them in graph2. Depth-first
+        # search is applied.
+        def traverseTrie1t(root, trie2, setlist, pcurrent=[]):
+            for key, node in root['children'].items():
+                pcurrent.append(key)
+                if node['isEndOfWord']:
+                    setlist[1] += 1
+                    count2 = trie2.searchWord(pcurrent)
+                    if count2 != 0:
+                        setlist[0] += 1
+                if node['children'] != {}:
+                    traverseTrie1t(node, trie2, setlist, pcurrent)
+                else:
+                    del pcurrent[-1]
+            if pcurrent != []:
+                del pcurrent[-1]
+
+        # traverse all paths in graph2 and find out those that are not in
+        # graph1. Depth-first search is applied.
+        def traverseTrie2t(root, trie1, setlist, pcurrent=[]):
+            for key, node in root['children'].items():
+                pcurrent.append(key)
+                if node['isEndOfWord']:
+#                    print(node['count'])
+                    count1 = trie1.searchWord(pcurrent)
+                    if count1 == 0:
+                        setlist[1] += 1
+                if node['children'] != {}:
+                    traverseTrie2t(node, trie1, setlist, pcurrent)
+                else:
+                    del pcurrent[-1]
+            if pcurrent != []:
+                del pcurrent[-1]
+
+        setlist = [0, 0]  # intersection and union of path sets of g1, g2.
+#        print(trie1.root)
+#        print(trie2.root)
+        traverseTrie1t(trie1.root, trie2, setlist)
+#        print(setlist)
+        traverseTrie2t(trie2.root, trie1, setlist)
+#        print(setlist)
+        kernel = setlist[0] / setlist[1]
+
+    else:  # MinMax kernel
+        # traverse all paths in graph1 and search them in graph2. Depth-first
+        # search is applied.
+        def traverseTrie1m(root, trie2, sumlist, pcurrent=[]):
+            for key, node in root['children'].items():
+                pcurrent.append(key)
+                if node['isEndOfWord']:
+#                    print(node['count'])
+                    count1 = node['count']
+                    count2 = trie2.searchWord(pcurrent)
+                    sumlist[0] += min(count1, count2)
+                    sumlist[1] += max(count1, count2)
+                if node['children'] != {}:
+                    traverseTrie1m(node, trie2, sumlist, pcurrent)
+                else:
+                    del pcurrent[-1]
+            if pcurrent != []:
+                del pcurrent[-1]
+
+        # traverse all paths in graph2 and find out those that are not in
+        # graph1. Depth-first search is applied.
+        def traverseTrie2m(root, trie1, sumlist, pcurrent=[]):
+            for key, node in root['children'].items():
+                pcurrent.append(key)
+                if node['isEndOfWord']:
+#                    print(node['count'])
+                    count1 = trie1.searchWord(pcurrent)
+                    if count1 == 0:
+                        sumlist[1] += node['count']
+                if node['children'] != {}:
+                    traverseTrie2m(node, trie1, sumlist, pcurrent)
+                else:
+                    del pcurrent[-1]
+            if pcurrent != []:
+                del pcurrent[-1]
+
+        sumlist = [0, 0]  # sum of mins and sum of maxs
+#        print(trie1.root)
+#        print(trie2.root)
+        traverseTrie1m(trie1.root, trie2, sumlist)
+#        print(sumlist)
+        traverseTrie2m(trie2.root, trie1, sumlist)
+#        print(sumlist)
+        kernel = sumlist[0] / sumlist[1]
+
+    return kernel
+
+
+def wrapper_uhpath_do_trie(k_func, itr):
+    i = itr[0]
+    j = itr[1]
+    return i, j, _untilhpathkernel_do_trie(G_trie[i], G_trie[j], k_func)
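+
+
+# For orientation (an illustrative sketch of the Trie layout assumed by the
+# traversals above): each node is a dict {'children': {label: node, ...},
+# 'isEndOfWord': bool, 'count': int}, and searchWord(path) returns the count
+# stored for that label sequence (0 if absent). E.g., after inserting
+# ('C', 'C') and ('C', 'O'), root['children'] has one entry 'C', whose own
+# 'children' has entries 'C' and 'O', both with isEndOfWord == True.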
+
+
+def _untilhpathkernel_do_naive(paths1, paths2, k_func):
+    """Calculate path graph kernels up to depth h between 2 graphs naively.
+
+    Parameters
+    ----------
+    paths1, paths2 : list
+        Lists of paths in the 2 graphs, where for unlabeled graphs, each
+        path is represented by a list of nodes; while for labeled graphs,
+        each path is represented by a string consisting of labels of nodes
+        and/or edges on that path.
+    k_func : string
+        A kernel function applied using different notions of fingerprint
+        similarity.
+
+    Return
+    ------
+    kernel : float
+        Path kernel up to h between 2 graphs.
+    """
+    all_paths = list(set(paths1 + paths2))
+
+    if k_func == 'tanimoto':
+        length_union = len(set(paths1 + paths2))
+        kernel = (len(set(paths1)) + len(set(paths2)) -
+                  length_union) / length_union
+#        vector1 = [(1 if path in paths1 else 0) for path in all_paths]
+#        vector2 = [(1 if path in paths2 else 0) for path in all_paths]
+#        kernel_uv = np.dot(vector1, vector2)
+#        kernel = kernel_uv / (len(set(paths1)) + len(set(paths2)) - kernel_uv)
+
+    else:  # MinMax kernel
+        path_count1 = Counter(paths1)
+        path_count2 = Counter(paths2)
+        # a Counter returns 0 for missing keys, so no explicit check is needed.
+        vector1 = [path_count1[key] for key in all_paths]
+        vector2 = [path_count2[key] for key in all_paths]
+        kernel = np.sum(np.minimum(vector1, vector2)) / \
+            np.sum(np.maximum(vector1, vector2))
+
+    return kernel
+
+
+def wrapper_uhpath_do_naive(k_func, itr):
+    i = itr[0]
+    j = itr[1]
+    return i, j, _untilhpathkernel_do_naive(G_plist[i], G_plist[j], k_func)
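+
+
+# A small worked example (illustrative values, not from the original code):
+# paths1 = ['a', 'a', 'b'] and paths2 = ['a', 'c'] give
+#     tanimoto: |{a, b} & {a, c}| / |{a, b} | {a, c}| = 1 / 3
+#     MinMax:   (min(2, 1) + min(1, 0) + min(0, 1))
+#             / (max(2, 1) + max(1, 0) + max(0, 1)) = 1 / 4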
+
+
+# @todo: this is currently identical to _untilhpathkernel_do_naive; it is
+# kept as the entry point for the k_func=None branch.
+def _untilhpathkernel_do_kernelless(paths1, paths2, k_func):
+    """Calculate path graph kernels up to depth h between 2 graphs naively.
+
+    Parameters
+    ----------
+    paths1, paths2 : list
+        Lists of paths in the 2 graphs, where for unlabeled graphs, each
+        path is represented by a list of nodes; while for labeled graphs,
+        each path is represented by a string consisting of labels of nodes
+        and/or edges on that path.
+    k_func : string
+        A kernel function applied using different notions of fingerprint
+        similarity.
+
+    Return
+    ------
+    kernel : float
+        Path kernel up to h between 2 graphs.
+    """
+    all_paths = list(set(paths1 + paths2))
+
+    if k_func == 'tanimoto':
+        length_union = len(set(paths1 + paths2))
+        kernel = (len(set(paths1)) + len(set(paths2)) -
+                  length_union) / length_union
+#        vector1 = [(1 if path in paths1 else 0) for path in all_paths]
+#        vector2 = [(1 if path in paths2 else 0) for path in all_paths]
+#        kernel_uv = np.dot(vector1, vector2)
+#        kernel = kernel_uv / (len(set(paths1)) + len(set(paths2)) - kernel_uv)
+
+    else:  # MinMax kernel
+        path_count1 = Counter(paths1)
+        path_count2 = Counter(paths2)
+        # a Counter returns 0 for missing keys, so no explicit check is needed.
+        vector1 = [path_count1[key] for key in all_paths]
+        vector2 = [path_count2[key] for key in all_paths]
+        kernel = np.sum(np.minimum(vector1, vector2)) / \
+            np.sum(np.maximum(vector1, vector2))
+
+    return kernel
+
+
+def wrapper_uhpath_do_kernelless(k_func, itr):
+    i = itr[0]
+    j = itr[1]
+    return i, j, _untilhpathkernel_do_kernelless(G_plist[i], G_plist[j], k_func)
+
+
+# @todo: (can maybe be removed) this method finds paths repetitively; it
+# could be faster.
+def find_all_paths_until_length(G,
+                                length,
+                                ds_attrs,
+                                node_label='atom',
+                                edge_label='bond_type',
+                                tolabelseqs=True):
+    """Find all paths no longer than a certain maximum length in a graph. A
+    recursive depth-first search is applied.
+
+    Parameters
+    ----------
+    G : NetworkX graph
+        The graph in which paths are searched.
+    length : integer
+        The maximum length of paths.
+    ds_attrs : dict
+        Dataset attributes.
+    node_label : string
+        Node attribute used as label. The default node label is 'atom'.
+    edge_label : string
+        Edge attribute used as label. The default edge label is 'bond_type'.
+    tolabelseqs : boolean
+        Whether to convert the retrieved paths to label sequences. The
+        default is True.
+
+    Return
+    ------
+    path : list
+        List of paths retrieved, where for unlabeled graphs, each path is
+        represented by a list of nodes; while for labeled graphs, each path
+        is represented by a list of strings consisting of labels of nodes
+        and/or edges on that path.
+    """
+    # path_l = [tuple([n]) for n in G.nodes]  # paths of length l
+    # all_paths = path_l[:]
+    # for l in range(1, length + 1):
+    #     path_l_new = []
+    #     for path in path_l:
+    #         for neighbor in G[path[-1]]:
+    #             if len(path) < 2 or neighbor != path[-2]:
+    #                 tmp = path + (neighbor, )
+    #                 if tuple(tmp[::-1]) not in path_l_new:
+    #                     path_l_new.append(tuple(tmp))
+
+    #     all_paths += path_l_new
+    #     path_l = path_l_new[:]
+
+    path_l = [[n] for n in G.nodes]  # paths with l edges, starting from l = 0
+    all_paths = [p.copy() for p in path_l]
+    for l in range(1, length + 1):
+        path_lplus1 = []
+        for path in path_l:
+            for neighbor in G[path[-1]]:
+                if neighbor not in path:
+                    tmp = path + [neighbor]
+#                    if tmp[::-1] not in path_lplus1:
+                    path_lplus1.append(tmp)
+
+        all_paths += path_lplus1
+        path_l = [p.copy() for p in path_lplus1]
+
+    # for i in range(0, length + 1):
+    #     new_paths = find_all_paths(G, i)
+    #     if new_paths == []:
+    #         break
+    #     all_paths.extend(new_paths)
+
+    # consider labels
+#    print(paths2labelseqs(all_paths, G, ds_attrs, node_label, edge_label))
+#    print()
+    return (paths2labelseqs(all_paths, G, ds_attrs, node_label, edge_label)
+            if tolabelseqs else all_paths)
+
+
+def wrapper_find_all_paths_until_length(length, ds_attrs, node_label,
+                                        edge_label, tolabelseqs, itr_item):
+    g = itr_item[0]
+    i = itr_item[1]
+    return i, find_all_paths_until_length(g, length, ds_attrs,
+                                          node_label=node_label,
+                                          edge_label=edge_label,
+                                          tolabelseqs=tolabelseqs)
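+
+
+# Example (illustrative): for a triangle graph with nodes {0, 1, 2} and
+# length=2, the search above enumerates 3 paths of one node, 6 of two nodes
+# and 6 of three nodes (each path is kept in both directions, e.g.
+# [0, 1, 2] and [2, 1, 0]), i.e. 15 paths in total.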
+
+
+def find_all_path_as_trie(G,
+                          length,
+                          ds_attrs,
+                          node_label='atom',
+                          edge_label='bond_type'):
+    """Find all paths no longer than a certain maximum length in a graph and
+    store them, as label sequences, in a trie. A recursive depth-first
+    search is applied.
+    """
+#    time1 = time.time()
+
+#    all_path = find_all_paths_until_length(G, length, ds_attrs,
+#                                           node_label=node_label,
+#                                           edge_label=edge_label)
+#    ptrie = Trie()
+#    for path in all_path:
+#        ptrie.insertWord(path)
+
+#    ptrie = Trie()
+#    path_l = [[n] for n in G.nodes]  # paths of length l
+#    path_l_str = paths2labelseqs(path_l, G, ds_attrs, node_label, edge_label)
+#    for p in path_l_str:
+#        ptrie.insertWord(p)
+#    for l in range(1, length + 1):
+#        path_lplus1 = []
+#        for path in path_l:
+#            for neighbor in G[path[-1]]:
+#                if neighbor not in path:
+#                    tmp = path + [neighbor]
+##                    if tmp[::-1] not in path_lplus1:
+#                    path_lplus1.append(tmp)
+#        path_l = path_lplus1[:]
+#        # consider labels
+#        path_l_str = paths2labelseqs(path_l, G, ds_attrs, node_label, edge_label)
+#        for p in path_l_str:
+#            ptrie.insertWord(p)
+#
+#    print(time.time() - time1)
+#    print(ptrie.root)
+#    print()
+
+    # traverse all paths up to length h in a graph and construct a trie with
+    # them. Depth-first search is applied. Notice that the reverse of each
+    # path is also stored in the trie.
+    def traverseGraph(root, ptrie, length, G, ds_attrs, node_label, edge_label,
+                      pcurrent=[]):
+        if len(pcurrent) < length + 1:
+            for neighbor in G[root]:
+                if neighbor not in pcurrent:
+                    pcurrent.append(neighbor)
+                    plstr = paths2labelseqs([pcurrent], G, ds_attrs,
+                                            node_label, edge_label)
+                    ptrie.insertWord(plstr[0])
+                    traverseGraph(neighbor, ptrie, length, G, ds_attrs,
+                                  node_label, edge_label, pcurrent)
+                    del pcurrent[-1]
+
+    ptrie = Trie()
+    path_l = [[n] for n in G.nodes]  # single-node paths (length 0)
+    path_l_str = paths2labelseqs(path_l, G, ds_attrs, node_label, edge_label)
+    for p in path_l_str:
+        ptrie.insertWord(p)
+    for n in G.nodes:
+        traverseGraph(n, ptrie, length, G, ds_attrs, node_label, edge_label,
+                      pcurrent=[n])
+
+#    def traverseGraph(root, all_paths, length, G, ds_attrs, node_label, edge_label,
+#                      pcurrent=[]):
+#        if len(pcurrent) < length + 1:
+#            for neighbor in G[root]:
+#                if neighbor not in pcurrent:
+#                    pcurrent.append(neighbor)
+#                    plstr = paths2labelseqs([pcurrent], G, ds_attrs,
+#                                            node_label, edge_label)
+#                    all_paths.append(pcurrent[:])
+#                    traverseGraph(neighbor, all_paths, length, G, ds_attrs,
+#                                  node_label, edge_label, pcurrent)
+#                    del pcurrent[-1]
+#
+#
+#    path_l = [[n] for n in G.nodes]  # paths of length l
+#    all_paths = path_l[:]
+#    path_l_str = paths2labelseqs(path_l, G, ds_attrs, node_label, edge_label)
+##    for p in path_l_str:
+##        ptrie.insertWord(p)
+#    for n in G.nodes:
+#        traverseGraph(n, all_paths, length, G, ds_attrs, node_label, edge_label,
+#                      pcurrent=[n])
+
+#    print(ptrie.root)
+    return ptrie
+
+
+def wrapper_find_all_path_as_trie(length, ds_attrs, node_label,
+                                  edge_label, itr_item):
+    g = itr_item[0]
+    i = itr_item[1]
+    return i, find_all_path_as_trie(g, length, ds_attrs,
+                                    node_label=node_label,
+                                    edge_label=edge_label)
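+
+
+# Note: storing both orientations of every path (the DFS above starts from
+# every node) presumably makes the searchWord() lookups in the kernel
+# computation independent of the direction in which a path was traversed.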
+
+
+def paths2labelseqs(plist, G, ds_attrs, node_label, edge_label):
+    """Convert a list of paths to a list of label sequences, according to
+    whether nodes and/or edges of the graph are labeled.
+    """
+    if ds_attrs['node_labeled']:
+        if ds_attrs['edge_labeled']:
+            path_strs = [
+                tuple(
+                    list(
+                        chain.from_iterable(
+                            (G.nodes[node][node_label],
+                             G[node][path[idx + 1]][edge_label])
+                            for idx, node in enumerate(path[:-1]))) +
+                    [G.nodes[path[-1]][node_label]]) for path in plist
+            ]
+            # path_strs = []
+            # for path in all_paths:
+            #     strlist = list(
+            #         chain.from_iterable((G.node[node][node_label],
+            #                              G[node][path[idx + 1]][edge_label])
+            #                             for idx, node in enumerate(path[:-1])))
+            #     strlist.append(G.node[path[-1]][node_label])
+            #     path_strs.append(tuple(strlist))
+        else:
+            path_strs = [
+                tuple([G.nodes[node][node_label] for node in path])
+                for path in plist
+            ]
+        return path_strs
+    else:
+        if ds_attrs['edge_labeled']:
+            return [
+                tuple([] if len(path) == 1 else [
+                    G[node][path[idx + 1]][edge_label]
+                    for idx, node in enumerate(path[:-1])
+                ]) for path in plist
+            ]
+        else:
+            return [tuple(['0' for node in path]) for path in plist]
+#            return [tuple([len(path)]) for path in all_paths]
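+
+
+# Example (illustrative): for a path [0, 1] where node 0 is labeled 'C',
+# node 1 is labeled 'O' and the edge (0, 1) is labeled '1', the label
+# sequence is ('C', '1', 'O') when both node and edge labels are present,
+# ('C', 'O') with node labels only, and ('1',) with edge labels only.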
+# """ +# return [[source_node]] if length == 0 else \ +# [[source_node] + path for neighbor in G[source_node] +# for path in find_paths(G, neighbor, length - 1) if source_node not in path] + +# def find_all_paths(G, length): +# """Find all paths with a certain length in a graph. A recursive depth first search is applied. + +# Parameters +# ---------- +# G : NetworkX graphs +# The graph in which paths are searched. +# length : integer +# The length of paths. + +# Return +# ------ +# path : list of list +# List of paths retrieved, where each path is represented by a list of nodes. +# """ +# all_paths = [] +# for node in G: +# all_paths.extend(find_paths(G, node, length)) + +# # The following process is not carried out according to the original article +# # all_paths_r = [ path[::-1] for path in all_paths ] + +# # # For each path, two presentation are retrieved from its two extremities. Remove one of them. +# # for idx, path in enumerate(all_paths[:-1]): +# # for path2 in all_paths_r[idx+1::]: +# # if path == path2: +# # all_paths[idx] = [] +# # break + +# # return list(filter(lambda a: a != [], all_paths)) +# return all_paths