# Provenance: recovered from git patch 1728c798e5411f00c4951e2b76d683249a6c56a7
# ("New translations commonWalkKernel.py (French)", linlin, 2020-10-05), which
# created lang/fr/gklearn/kernels/commonWalkKernel.py.
"""
@author: linlin

@references:

    [1] Thomas Gärtner, Peter Flach, and Stefan Wrobel. On graph kernels:
    Hardness results and efficient alternatives. Learning Theory and Kernel
    Machines, pages 129–143, 2003.
"""

import sys
import time
from collections import Counter
from functools import partial

import networkx as nx
import numpy as np

from gklearn.utils.utils import direct_product
from gklearn.utils.graphdataset import get_dataset_attributes
from gklearn.utils.parallel import parallel_gm


def commonwalkkernel(*args,
                     node_label='atom',
                     edge_label='bond_type',
                     weight=1,
                     compute_method=None,
                     n_jobs=None,
                     chunksize=None,
                     verbose=True):
    """Calculate common walk graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated. Passed as
        the single positional argument.

    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated. Passed as two
        positional arguments instead of `Gn`.
    node_label : string
        Node attribute used as symbolic label. The default node label is
        'atom'.
    edge_label : string
        Edge attribute used as symbolic label. The default edge label is
        'bond_type'.
    weight : integer
        Weight coefficient of different lengths of walks, which represents
        beta in 'exp' method and gamma in 'geo'.
    compute_method : string
        Method used to compute walk kernel (case-insensitive). The following
        choices are available:

        'exp': method based on exponential series applied on the direct
        product graph, as shown in reference [1]. The time complexity is
        O(n^6) for graphs with n vertices.

        'geo': method based on geometric series applied on the direct product
        graph, as shown in reference [1]. The time complexity is O(n^6) for
        graphs with n vertices.
    n_jobs : int
        Number of jobs for parallelization (forwarded to `parallel_gm`).
    chunksize : int
        Chunk size forwarded to `parallel_gm`.
    verbose : boolean
        Whether to print progress information.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is a common walk kernel between
        2 graphs.
    run_time : float
        Wall-clock seconds spent building the kernel matrix.
    idx : list of integer
        Positions, in the input list, of the graphs that were kept (graphs
        with exactly one node are dropped).

    Raises
    ------
    ValueError
        If `compute_method` is not 'exp' or 'geo'.

    Notes
    -----
    When the dataset is reported as unlabeled, a constant label '0' is
    written onto the input graphs in place.
    """
    # Validate up front: the default (None) and any unknown name used to fail
    # later with an AttributeError / unbound `do_partial`.
    # A 'brute' method (walks up to a length n) existed once but is disabled.
    wrappers = {'exp': wrapper_cw_exp, 'geo': wrapper_cw_geo}
    if not isinstance(compute_method, str) \
            or compute_method.lower() not in wrappers:
        raise ValueError(
            "compute_method must be 'exp' or 'geo', got %r" % (compute_method,))
    compute_method = compute_method.lower()

    # arrange all graphs in a list
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]

    # remove graphs with only 1 node, as they do not have adjacency matrices
    len_gn = len(Gn)
    kept = [(pos, G) for pos, G in enumerate(Gn)
            if nx.number_of_nodes(G) != 1]
    idx = [pos for pos, _ in kept]
    Gn = [G for _, G in kept]
    if len(Gn) != len_gn and verbose:
        print('\n %d graphs are removed as they have only 1 node.\n' %
              (len_gn - len(Gn)))

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label, edge_label=edge_label)
    # Write the placeholder label under the attribute names that are handed
    # to the kernels below (previously hard-coded to 'atom' / 'bond_type',
    # which only matched when the defaults were used).
    if not ds_attrs['node_labeled']:
        for G in Gn:
            nx.set_node_attributes(G, '0', node_label)
    if not ds_attrs['edge_labeled']:
        for G in Gn:
            nx.set_edge_attributes(G, '0', edge_label)
    if not ds_attrs['is_directed']:  # convert
        Gn = [G.to_directed() for G in Gn]

    start_time = time.time()

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # Worker initializer: publishes the graph list as the module-level global
    # `G_gn`, which wrapper_cw_exp / wrapper_cw_geo read.
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrappers[compute_method], node_label, edge_label,
                         weight)
    # NOTE(review): parallel_gm is presumably responsible for filling Kmatrix
    # in place from the (i, j, kernel) triples - confirm in gklearn.utils.
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize,
                verbose=verbose)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- kernel matrix of common walk kernel of size %d built in %s seconds ---"
              % (len(Gn), run_time))

    return Kmatrix, run_time, idx


def _dense_adjacency(graph):
    """Return the adjacency matrix of `graph` as a plain 2-D ndarray."""
    # np.asarray strips the np.matrix subclass, so `*` / `@` below have
    # ndarray semantics regardless of what `todense()` returns.
    return np.asarray(nx.adjacency_matrix(graph).todense())


def _commonwalkkernel_exp(g1, g2, node_label, edge_label, beta):
    """Calculate the common walk kernel between 2 graphs using exponential
    series.

    The kernel is the sum of all entries of exp(beta * A), where A is the
    adjacency matrix of the direct product graph of `g1` and `g2`.

    Parameters
    ----------
    g1, g2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    beta : integer
        Weight.

    Return
    ------
    kernel : float
        The common walk Kernel between 2 graphs.
    """
    # get tensor product / direct product
    gp = direct_product(g1, g2, node_label, edge_label)
    # return 0 if the direct product graph have no more than 1 node.
    if nx.number_of_nodes(gp) < 2:
        return 0
    A = _dense_adjacency(gp)

    # exp(beta * A) = V diag(exp(beta * w)) V^-1 for A = V diag(w) V^-1.
    if np.array_equal(A, A.T):
        # Symmetric case: eigh guarantees real eigenvalues and orthonormal
        # eigenvectors, so the transpose really is the inverse.
        ew, ev = np.linalg.eigh(A)
        ev_inv = ev.T
    else:
        # General case: the transpose is not the inverse and eigenvalues may
        # be complex. Raises LinAlgError if `ev` is singular (defective A).
        ew, ev = np.linalg.eig(A)
        ev_inv = np.linalg.inv(ev)
    # Scaling the columns of `ev` equals right-multiplying by the diagonal.
    exp_A = (ev * np.exp(beta * ew)) @ ev_inv

    # A is real, so any imaginary part is round-off only.
    return float(np.real(exp_A.sum()))


def wrapper_cw_exp(node_label, edge_label, beta, itr):
    """Compute the 'exp' kernel for the index pair `itr` = (i, j).

    Reads the graphs from the global `G_gn` set by the worker initializer and
    returns (i, j, kernel).
    """
    i = itr[0]
    j = itr[1]
    return i, j, _commonwalkkernel_exp(G_gn[i], G_gn[j], node_label,
                                       edge_label, beta)


def _commonwalkkernel_geo(g1, g2, node_label, edge_label, gamma):
    """Calculate the common walk kernel between 2 graphs using geometric
    series.

    The kernel is the sum of all entries of (I - gamma * A)^-1, where A is the
    adjacency matrix of the direct product graph of `g1` and `g2`.

    Parameters
    ----------
    g1, g2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    gamma : integer
        Weight.

    Return
    ------
    kernel : float
        The common walk Kernel between 2 graphs.

    Raises
    ------
    numpy.linalg.LinAlgError
        If (I - gamma * A) is singular.
    """
    # get tensor product / direct product
    gp = direct_product(g1, g2, node_label, edge_label)
    # return 0 if the direct product graph have no more than 1 node.
    if nx.number_of_nodes(gp) < 2:
        return 0
    A = _dense_adjacency(gp)
    mat = np.identity(len(A)) - gamma * A
    # np.linalg.inv instead of `.I`, which only exists on np.matrix.
    return np.linalg.inv(mat).sum()


def wrapper_cw_geo(node_label, edge_label, gama, itr):
    """Compute the 'geo' kernel for the index pair `itr` = (i, j).

    Reads the graphs from the global `G_gn` set by the worker initializer and
    returns (i, j, kernel).
    """
    i = itr[0]
    j = itr[1]
    return i, j, _commonwalkkernel_geo(G_gn[i], G_gn[j], node_label,
                                       edge_label, gama)


def _commonwalkkernel_brute(walks1,
                            walks2,
                            node_label='atom',
                            edge_label='bond_type',
                            labeled=True):
    """Calculate the common walk kernel between 2 graphs from their
    pre-computed walks.

    The kernel is the dot product of the two walk-count vectors.

    Parameters
    ----------
    walks1, walks2 : list
        List of walks in 2 graphs, where for unlabeled graphs, each walk is
        represented by a list of nodes; while for labeled graphs, each walk is
        represented by a string consists of labels of nodes and edges on that
        walk.
    node_label : string
        Unused; kept for interface compatibility.
    edge_label : string
        Unused; kept for interface compatibility.
    labeled : boolean
        Unused; kept for interface compatibility.

    Return
    ------
    kernel : float
        Common walk kernel between 2 graphs.
    """
    def as_key(walk):
        # Node-list walks are unhashable; count them as tuples.
        return tuple(walk) if isinstance(walk, list) else walk

    counts_walks1 = Counter(as_key(walk) for walk in walks1)
    counts_walks2 = Counter(as_key(walk) for walk in walks2)
    all_walks = list(counts_walks1.keys() | counts_walks2.keys())

    # A Counter yields 0 for missing keys, which replaces the per-walk list
    # membership scan.
    vector1 = [counts_walks1[walk] for walk in all_walks]
    vector2 = [counts_walks2[walk] for walk in all_walks]
    return np.dot(vector1, vector2)


# TODO: this function recomputes shorter walks for every length; it could be
# faster.
def find_all_walks_until_length(G,
                                length,
                                node_label='atom',
                                edge_label='bond_type',
                                labeled=True):
    """Find all walks with a certain maximum length in a graph.
    A recursive depth first search is applied.

    Parameters
    ----------
    G : NetworkX graphs
        The graph in which walks are searched.
    length : integer
        The maximum length of walks.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.
    labeled : boolean
        Whether the graphs are labeled. The default is True.

    Return
    ------
    walk : list
        List of walks retrieved, where for unlabeled graphs, each walk is
        represented by a list of nodes; while for labeled graphs, each walk
        is represented by a string consists of labels of nodes and edges on
        that walk.
    """
    all_walks = []
    # TODO: in this way, the time complexity is close to
    # N(d^n+d^(n+1)+...+1), which could be optimized to O(Nd^n)
    for walk_length in range(length + 1):
        new_walks = find_all_walks(G, walk_length)
        if not new_walks:
            break
        all_walks.extend(new_walks)

    if not labeled:
        return all_walks

    # convert walks to strings; labels are concatenated, so they are assumed
    # to be strings.
    walk_strs = []
    for walk in all_walks:
        # Pair consecutive nodes positionally: a walk may visit a node more
        # than once, so looking the successor up via walk.index(node) would
        # pick the wrong edge.
        parts = [G.nodes[u][node_label] + G[u][v][edge_label]
                 for u, v in zip(walk, walk[1:])]
        parts.append(G.nodes[walk[-1]][node_label])
        walk_strs.append(''.join(parts))
    return walk_strs


def find_walks(G, source_node, length):
    """Find all walks with a certain length those start from a source node. A
    recursive depth first search is applied.

    Parameters
    ----------
    G : NetworkX graphs
        The graph in which walks are searched.
    source_node : integer
        The number of the node from where all walks start.
    length : integer
        The length of walks.

    Return
    ------
    walk : list of list
        List of walks retrieved, where each walk is represented by a list of
        nodes.
    """
    if length == 0:
        return [[source_node]]
    return [[source_node] + walk
            for neighbor in G[source_node]
            for walk in find_walks(G, neighbor, length - 1)]


def find_all_walks(G, length):
    """Find all walks with a certain length in a graph. A recursive depth first
    search is applied.

    Parameters
    ----------
    G : NetworkX graphs
        The graph in which walks are searched.
    length : integer
        The length of walks.

    Return
    ------
    walk : list of list
        List of walks retrieved, where each walk is represented by a list of
        nodes.
    """
    all_walks = []
    for node in G:
        all_walks.extend(find_walks(G, node, length))
    # A walk and its reverse are both kept; the filtering of reversed
    # duplicates that was once sketched here is intentionally not applied.
    return all_walks