""" @author: linlin @references: [1] Tamás Horváth, Thomas Gärtner, and Stefan Wrobel. Cyclic pattern kernels for predictive graph mining. In Proceedings of the tenth ACM SIGKDD international conference on Knowledge discovery and data mining, pages 158–167. ACM, 2004. [2] Hopcroft, J.; Tarjan, R. (1973). “Efficient algorithms for graph manipulation”. Communications of the ACM 16: 372–378. doi:10.1145/362248.362272. [3] Finding all the elementary circuits of a directed graph. D. B. Johnson, SIAM Journal on Computing 4, no. 1, 77-84, 1975. http://dx.doi.org/10.1137/0204007 """ import sys import pathlib sys.path.insert(0, "../") import time import networkx as nx import numpy as np from tqdm import tqdm def cyclicpatternkernel(*args, node_label = 'atom', edge_label = 'bond_type', labeled = True, cycle_bound = None): """Calculate cyclic pattern graph kernels between graphs. Parameters ---------- Gn : List of NetworkX graph List of graphs between which the kernels are calculated. / G1, G2 : NetworkX graphs 2 graphs between which the kernel is calculated. node_label : string node attribute used as label. The default node label is atom. edge_label : string edge attribute used as label. The default edge label is bond_type. labeled : boolean Whether the graphs are labeled. The default is True. depth : integer Depth of search. Longest length of paths. Return ------ Kmatrix : Numpy matrix Kernel matrix, each element of which is the path kernel up to d between 2 praphs. """ Gn = args[0] if len(args) == 1 else [args[0], args[1]] # arrange all graphs in a list Kmatrix = np.zeros((len(Gn), len(Gn))) start_time = time.time() # get all cyclic and tree patterns of all graphs before calculating kernels to save time, but this may consume a lot of memory for large dataset. all_patterns = [ get_patterns(Gn[i], node_label = node_label, edge_label = edge_label, labeled = labeled, cycle_bound = cycle_bound) for i in tqdm(range(0, len(Gn)), desc = 'retrieve patterns', file=sys.stdout) ] for i in tqdm(range(0, len(Gn)), desc = 'calculate kernels', file=sys.stdout): for j in range(i, len(Gn)): Kmatrix[i][j] = _cyclicpatternkernel_do(all_patterns[i], all_patterns[j]) Kmatrix[j][i] = Kmatrix[i][j] run_time = time.time() - start_time print("\n --- kernel matrix of cyclic pattern kernel of size %d built in %s seconds ---" % (len(Gn), run_time)) return Kmatrix, run_time def _cyclicpatternkernel_do(patterns1, patterns2): """Calculate path graph kernels up to depth d between 2 graphs. Parameters ---------- paths1, paths2 : list List of paths in 2 graphs, where for unlabeled graphs, each path is represented by a list of nodes; while for labeled graphs, each path is represented by a string consists of labels of nodes and edges on that path. k_func : function A kernel function used using different notions of fingerprint similarity. node_label : string node attribute used as label. The default node label is atom. edge_label : string edge attribute used as label. The default edge label is bond_type. labeled : boolean Whether the graphs are labeled. The default is True. Return ------ kernel : float Treelet Kernel between 2 graphs. """ return len(set(patterns1) & set(patterns2)) def get_patterns(G, node_label = 'atom', edge_label = 'bond_type', labeled = True, cycle_bound = None): """Find all cyclic and tree patterns in a graph. Parameters ---------- G : NetworkX graphs The graph in which paths are searched. length : integer The maximum length of paths. node_label : string node attribute used as label. The default node label is atom. edge_label : string edge attribute used as label. The default edge label is bond_type. labeled : boolean Whether the graphs are labeled. The default is True. Return ------ path : list List of paths retrieved, where for unlabeled graphs, each path is represented by a list of nodes; while for labeled graphs, each path is represented by a string consists of labels of nodes and edges on that path. """ number_simplecycles = 0 bridges = nx.Graph() patterns = [] bicomponents = nx.biconnected_component_subgraphs(G) # all biconnected components of G. this function use algorithm in reference [2], which (i guess) is slightly different from the one used in paper [1] for subgraph in bicomponents: if nx.number_of_edges(subgraph) > 1: simple_cycles = list(nx.simple_cycles(G.to_directed())) # all simple cycles in biconnected components. this function use algorithm in reference [3], which has time complexity O((n+e)(N+1)) for n nodes, e edges and N simple cycles. Which might be slower than the algorithm applied in paper [1] if cycle_bound != None and len(simple_cycles) > cycle_bound - number_simplecycles: # in paper [1], when applying another algorithm (subroutine RT), this becomes len(simple_cycles) == cycle_bound - number_simplecycles + 1, check again. return [] else: # calculate canonical representation for each simple cycle all_canonkeys = [] for cycle in simple_cycles: canonlist = [ G.node[node][node_label] + G[node][cycle[cycle.index(node) + 1]][edge_label] for node in cycle[:-1] ] canonkey = ''.join(canonlist) canonkey = canonkey if canonkey < canonkey[::-1] else canonkey[::-1] for i in range(1, len(cycle[:-1])): canonlist = [ G.node[node][node_label] + G[node][cycle[cycle.index(node) + 1]][edge_label] for node in cycle[i:-1] + cycle[:i] ] canonkey_t = ''.join(canonlist) canonkey_t = canonkey_t if canonkey_t < canonkey_t[::-1] else canonkey_t[::-1] canonkey = canonkey if canonkey < canonkey_t else canonkey_t all_canonkeys.append(canonkey) patterns = list(set(patterns) | set(all_canonkeys)) number_simplecycles += len(simple_cycles) else: bridges.add_edges_from(subgraph.edges(data=True)) # calculate canonical representation for each connected component in bridge set components = list(nx.connected_component_subgraphs(bridges)) # all connected components in the bridge tree_patterns = [] for tree in components: break # patterns += pi(bridges) return patterns