""" @author: linlin @references: Pierre Mahé and Jean-Philippe Vert. Graph kernels based on tree patterns for molecules. Machine learning, 75(1):3–35, 2009. """ import sys import pathlib sys.path.insert(0, "../") import time from collections import Counter import networkx as nx import numpy as np def treepatternkernel(*args, node_label = 'atom', edge_label = 'bond_type', labeled = True, kernel_type = 'untiln', lmda = 1, h = 1): """Calculate tree pattern graph kernels between graphs. Parameters ---------- Gn : List of NetworkX graph List of graphs between which the kernels are calculated. / G1, G2 : NetworkX graphs 2 graphs between which the kernel is calculated. node_label : string node attribute used as label. The default node label is atom. edge_label : string edge attribute used as label. The default edge label is bond_type. labeled : boolean Whether the graphs are labeled. The default is True. kernel_type : string Type of tree pattern kernel, could be 'untiln', 'size' or 'branching'. lmda : float Weight to decide whether linear patterns or trees pattern of increasing complexity are favored. h : integer The upper bound of the height of tree patterns. Return ------ Kmatrix: Numpy matrix Kernel matrix, each element of which is the tree pattern graph kernel between 2 praphs. """ if h < 1: raise Exception('h > 0 is requested.') kernel_type = kernel_type.lower() Gn = args[0] if len(args) == 1 else [args[0], args[1]] # arrange all graphs in a list Kmatrix = np.zeros((len(Gn), len(Gn))) h = int(h) start_time = time.time() for i in range(0, len(Gn)): for j in range(i, len(Gn)): Kmatrix[i][j] = _treepatternkernel_do(Gn[i], Gn[j], node_label, edge_label, labeled, kernel_type, lmda, h) Kmatrix[j][i] = Kmatrix[i][j] run_time = time.time() - start_time print("\n --- kernel matrix of tree pattern kernel of size %d built in %s seconds ---" % (len(Gn), run_time)) return Kmatrix, run_time def _treepatternkernel_do(G1, G2, node_label, edge_label, labeled, kernel_type, lmda, h): """Calculate tree pattern graph kernels between 2 graphs. Parameters ---------- paths1, paths2 : list List of paths in 2 graphs, where for unlabeled graphs, each path is represented by a list of nodes; while for labeled graphs, each path is represented by a string consists of labels of nodes and edges on that path. k_func : function A kernel function used using different notions of fingerprint similarity. node_label : string node attribute used as label. The default node label is atom. edge_label : string edge attribute used as label. The default edge label is bond_type. labeled : boolean Whether the graphs are labeled. The default is True. kernel_type : string Type of tree pattern kernel, could be 'untiln', 'size' or 'branching'. lmda : float Weight to decide whether linear patterns or trees pattern of increasing complexity are favored. h : integer The upper bound of the height of tree patterns. Return ------ kernel : float Treelet Kernel between 2 graphs. """ def matchingset(n1, n2): """Get neiborhood matching set of two nodes in two graphs. """ def mset_com(allpairs, length): """Find all sets R of pairs by combination. """ if length == 1: mset = [ [pair] for pair in allpairs ] return mset, mset else: mset, mset_l = mset_com(allpairs, length - 1) mset_tmp = [] for pairset in mset_l: # for each pair set of length l-1 nodeset1 = [ pair[0] for pair in pairset ] # nodes already in the set nodeset2 = [ pair[1] for pair in pairset ] for pair in allpairs: if (pair[0] not in nodeset1) and (pair[1] not in nodeset2): # nodes in R should be unique mset_tmp.append(pairset + [pair]) # add this pair to the pair set of length l-1, constructing a new set of length l nodeset1.append(pair[0]) nodeset2.append(pair[1]) mset.extend(mset_tmp) return mset, mset_tmp allpairs = [] # all pairs those have the same node labels and edge labels for neighbor1 in G1[n1]: for neighbor2 in G2[n2]: if G1.node[neighbor1][node_label] == G2.node[neighbor2][node_label] \ and G1[n1][neighbor1][edge_label] == G2[n2][neighbor2][edge_label]: allpairs.append([neighbor1, neighbor2]) if allpairs != []: mset, _ = mset_com(allpairs, len(allpairs)) else: mset = [] return mset def kernel_h(h): """Calculate kernel of h-th iteration. """ if kernel_type == 'untiln': all_kh = { str(n1) + '.' + str(n2) : (G1.node[n1][node_label] == G2.node[n2][node_label]) \ for n1 in G1.nodes() for n2 in G2.nodes() } # kernels between all pair of nodes with h = 1 ] all_kh_tmp = all_kh.copy() for i in range(2, h + 1): for n1 in G1.nodes(): for n2 in G2.nodes(): kh = 0 mset = all_msets[str(n1) + '.' + str(n2)] for R in mset: kh_tmp = 1 for pair in R: kh_tmp *= lmda * all_kh[str(pair[0]) + '.' + str(pair[1])] kh += 1 / lmda * kh_tmp kh = (G1.node[n1][node_label] == G2.node[n2][node_label]) * (1 + kh) all_kh_tmp[str(n1) + '.' + str(n2)] = kh all_kh = all_kh_tmp.copy() elif kernel_type == 'size': all_kh = { str(n1) + '.' + str(n2) : lmda * (G1.node[n1][node_label] == G2.node[n2][node_label]) \ for n1 in G1.nodes() for n2 in G2.nodes() } # kernels between all pair of nodes with h = 1 ] all_kh_tmp = all_kh.copy() for i in range(2, h + 1): for n1 in G1.nodes(): for n2 in G2.nodes(): kh = 0 mset = all_msets[str(n1) + '.' + str(n2)] for R in mset: kh_tmp = 1 for pair in R: kh_tmp *= lmda * all_kh[str(pair[0]) + '.' + str(pair[1])] kh += kh_tmp kh *= lmda * (G1.node[n1][node_label] == G2.node[n2][node_label]) all_kh_tmp[str(n1) + '.' + str(n2)] = kh all_kh = all_kh_tmp.copy() elif kernel_type == 'branching': all_kh = { str(n1) + '.' + str(n2) : (G1.node[n1][node_label] == G2.node[n2][node_label]) \ for n1 in G1.nodes() for n2 in G2.nodes() } # kernels between all pair of nodes with h = 1 ] all_kh_tmp = all_kh.copy() for i in range(2, h + 1): for n1 in G1.nodes(): for n2 in G2.nodes(): kh = 0 mset = all_msets[str(n1) + '.' + str(n2)] for R in mset: kh_tmp = 1 for pair in R: kh_tmp *= lmda * all_kh[str(pair[0]) + '.' + str(pair[1])] kh += 1 / lmda * kh_tmp kh *= (G1.node[n1][node_label] == G2.node[n2][node_label]) all_kh_tmp[str(n1) + '.' + str(n2)] = kh all_kh = all_kh_tmp.copy() return all_kh # calculate matching sets for every pair of nodes at first to avoid calculating in every iteration. all_msets = ({ str(node1) + '.' + str(node2) : matchingset(node1, node2) for node1 in G1.nodes() \ for node2 in G2.nodes() } if h > 1 else {}) all_kh = kernel_h(h) kernel = sum(all_kh.values()) if kernel_type == 'size': kernel = kernel / (lmda ** h) return kernel