"""
@author: linlin
@references: Thomas Gärtner, Peter Flach, and Stefan Wrobel. On graph kernels:
    Hardness results and efficient alternatives. Learning Theory and Kernel
    Machines, pages 129–143, 2003.
"""

import sys
import pathlib
sys.path.insert(0, "../")
import time

from collections import Counter

try:
    # ``nx`` is not referenced by name in this module: graphs are only accessed
    # through their mapping interface (iteration, ``G[u]``, ``G[u][v]``,
    # ``G.nodes``). The import is kept for code relying on it, but made
    # optional so the module also loads where networkx is not installed.
    import networkx as nx
except ImportError:
    nx = None
import numpy as np


def untilnwalkkernel(*args, node_label='atom', edge_label='bond_type',
                     labeled=True, n=10):
    """Calculate common walk graph kernels up to length n between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.
    labeled : boolean
        Whether the graphs are labeled. The default is True.
    n : integer
        Longest length of walks.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the walk kernel up to length
        n between 2 graphs.
    run_time : float
        Wall-clock time in seconds spent building the kernel matrix.
    """
    # arrange all graphs in a list
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    n = int(n)

    start_time = time.time()

    # get all walks of all graphs before calculating kernels to save time,
    # but this may cost a lot of memory for large dataset.
    all_walks = [
        find_all_walks_until_length(
            G, n, node_label=node_label, edge_label=edge_label, labeled=labeled)
        for G in Gn
    ]

    # the kernel is symmetric, so only the upper triangle is computed.
    for i in range(len(Gn)):
        for j in range(i, len(Gn)):
            Kmatrix[i][j] = _untilnwalkkernel_do(
                all_walks[i], all_walks[j], node_label=node_label,
                edge_label=edge_label, labeled=labeled)
            Kmatrix[j][i] = Kmatrix[i][j]

    run_time = time.time() - start_time
    print("\n --- kernel matrix of walk kernel up to %d of size %d built in %s seconds ---" % (n, len(Gn), run_time))

    return Kmatrix, run_time


def _hashable_walk(walk):
    """Return walk in a form usable as a dictionary key (lists become tuples)."""
    return tuple(walk) if isinstance(walk, list) else walk


def _untilnwalkkernel_do(walks1, walks2, node_label='atom',
                         edge_label='bond_type', labeled=True):
    """Calculate walk graph kernels up to n between 2 graphs.

    The kernel is the dot product of the two walk-count vectors, i.e. the sum
    over all distinct walks of (occurrences in graph 1) * (occurrences in
    graph 2).

    Parameters
    ----------
    walks1, walks2 : list
        List of walks in 2 graphs, where for unlabeled graphs, each walk is
        represented by a list of nodes; while for labeled graphs, each walk is
        represented by a string consisting of labels of nodes and edges on
        that walk.
    node_label : string
        node attribute used as label. Unused here; kept for a uniform
        signature.
    edge_label : string
        edge attribute used as label. Unused here; kept for a uniform
        signature.
    labeled : boolean
        Whether the graphs are labeled. Unused here; kept for a uniform
        signature.

    Return
    ------
    kernel : numpy scalar
        Walk kernel between 2 graphs.
    """
    # Unlabeled walks are lists of nodes, which cannot be counted directly
    # because lists are unhashable; count their tuple form instead.
    counts_walks1 = Counter(_hashable_walk(walk) for walk in walks1)
    counts_walks2 = Counter(_hashable_walk(walk) for walk in walks2)

    all_walks = list(counts_walks1.keys() | counts_walks2.keys())
    # A Counter returns 0 for a missing key, so no membership test is needed.
    vector1 = [counts_walks1[walk] for walk in all_walks]
    vector2 = [counts_walks2[walk] for walk in all_walks]

    return np.dot(vector1, vector2)


def find_all_walks_until_length(G, length, node_label='atom',
                                edge_label='bond_type', labeled=True):
    """Find all walks with a certain maximum length in a graph.

    Walks of length k + 1 are obtained by extending every walk of length k by
    one step, so shorter walks are not searched again for each length.

    Parameters
    ----------
    G : NetworkX graphs
        The graph in which walks are searched.
    length : integer
        The maximum length of walks.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.
    labeled : boolean
        Whether the graphs are labeled. The default is True.

    Return
    ------
    walk : list
        List of walks retrieved, ordered by increasing length, where for
        unlabeled graphs, each walk is represented by a list of nodes; while
        for labeled graphs, each walk is represented by a string consisting of
        labels of nodes and edges on that walk.
    """
    all_walks = []
    current = []
    for depth in range(length + 1):
        if depth == 0:
            current = [[node] for node in G]
        else:
            current = [walk + [neighbor]
                       for walk in current for neighbor in G[walk[-1]]]
        if not current:
            # no walk of this length exists, hence no longer one either.
            break
        all_walks.extend(current)

    if not labeled:
        return all_walks

    # ``G.node`` was removed in NetworkX 2.4; fall back to ``G.nodes`` there.
    node_attrs = G.node if hasattr(G, 'node') else G.nodes

    # convert walks to strings
    walk_strs = []
    for walk in all_walks:
        parts = []
        # Pair each node with its actual successor by position. Looking the
        # successor up with ``walk.index(node)`` picks the wrong edge as soon
        # as a walk visits the same node twice.
        for node, successor in zip(walk, walk[1:]):
            parts.append(node_attrs[node][node_label])
            parts.append(G[node][successor][edge_label])
        parts.append(node_attrs[walk[-1]][node_label])
        walk_strs.append(''.join(parts))

    return walk_strs


def find_walks(G, source_node, length):
    """Find all walks with a certain length those start from a source node.

    A recursive depth first search is applied.

    Parameters
    ----------
    G : NetworkX graphs
        The graph in which walks are searched.
    source_node : integer
        The number of the node from where all walks start.
    length : integer
        The length of walks.

    Return
    ------
    walk : list of list
        List of walks retrieved, where each walk is represented by a list of
        nodes. Empty for a negative length.
    """
    if length < 0:
        # guard against unbounded recursion; there is no walk of negative length.
        return []
    if length == 0:
        return [[source_node]]
    return [[source_node] + walk
            for neighbor in G[source_node]
            for walk in find_walks(G, neighbor, length - 1)]


def find_all_walks(G, length):
    """Find all walks with a certain length in a graph.

    A recursive depth first search is applied from every node of the graph.

    Parameters
    ----------
    G : NetworkX graphs
        The graph in which walks are searched.
    length : integer
        The length of walks.

    Return
    ------
    walk : list of list
        List of walks retrieved, where each walk is represented by a list of
        nodes.
    """
    # A walk and its reverse are both kept: removing one of the two
    # representations is not carried out according to the original article.
    return [walk for node in G for walk in find_walks(G, node, length)]