|
- """
- @author: linlin
- @references: Pierre Mahé and Jean-Philippe Vert. Graph kernels based on tree patterns for molecules. Machine learning, 75(1):3–35, 2009.
- """
-
- import sys
- import pathlib
- sys.path.insert(0, "../")
- import time
-
- from collections import Counter
-
- import networkx as nx
- import numpy as np
-
-
def treepatternkernel(*args, node_label='atom', edge_label='bond_type', labeled=True, kernel_type='untiln', lmda=1, h=1):
    """Calculate tree pattern graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.
    labeled : boolean
        Whether the graphs are labeled. The default is True. It is only
        forwarded to the pairwise routine; this function does not read it.
    kernel_type : string
        Type of tree pattern kernel, could be 'untiln', 'size' or
        'branching' (case-insensitive).
    lmda : float
        Weight to decide whether linear patterns or tree patterns of
        increasing complexity are favored.
    h : integer
        The upper bound of the height of tree patterns. It is truncated
        with int() after the h >= 1 check.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the tree pattern graph
        kernel between 2 graphs.
    run_time : float
        Wall-clock time in seconds spent building the kernel matrix.

    Raises
    ------
    ValueError
        If h < 1 or kernel_type is not one of the supported types.
    """
    if h < 1:
        raise ValueError('h > 0 is requested.')
    kernel_type = kernel_type.lower()
    if kernel_type not in ('untiln', 'size', 'branching'):
        # Fail early with a clear message instead of an UnboundLocalError
        # deep inside the pairwise computation.
        raise ValueError(
            "kernel_type must be 'untiln', 'size' or 'branching', got %r." % (kernel_type,))
    h = int(h)

    # Arrange all graphs in a list: either the single list argument, or the
    # first two positional graphs.
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    n_graphs = len(Gn)
    Kmatrix = np.zeros((n_graphs, n_graphs))

    start_time = time.time()

    # Only the upper triangle is computed; each value is mirrored to the
    # lower triangle.
    for i in range(n_graphs):
        for j in range(i, n_graphs):
            value = _treepatternkernel_do(
                Gn[i], Gn[j], node_label, edge_label, labeled, kernel_type, lmda, h)
            Kmatrix[i, j] = value
            Kmatrix[j, i] = value

    run_time = time.time() - start_time
    print("\n --- kernel matrix of tree pattern kernel of size %d built in %s seconds ---" % (n_graphs, run_time))

    return Kmatrix, run_time
-
-
- def _treepatternkernel_do(G1, G2, node_label, edge_label, labeled, kernel_type, lmda, h):
- """Calculate tree pattern graph kernels between 2 graphs.
-
- Parameters
- ----------
- paths1, paths2 : list
- List of paths in 2 graphs, where for unlabeled graphs, each path is represented by a list of nodes; while for labeled graphs, each path is represented by a string consists of labels of nodes and edges on that path.
- k_func : function
- A kernel function used using different notions of fingerprint similarity.
- node_label : string
- node attribute used as label. The default node label is atom.
- edge_label : string
- edge attribute used as label. The default edge label is bond_type.
- labeled : boolean
- Whether the graphs are labeled. The default is True.
- kernel_type : string
- Type of tree pattern kernel, could be 'untiln', 'size' or 'branching'.
- lmda : float
- Weight to decide whether linear patterns or trees pattern of increasing complexity are favored.
- h : integer
- The upper bound of the height of tree patterns.
-
- Return
- ------
- kernel : float
- Treelet Kernel between 2 graphs.
- """
-
- def matchingset(n1, n2):
- """Get neiborhood matching set of two nodes in two graphs.
- """
-
- def mset_com(allpairs, length):
- """Find all sets R of pairs by combination.
- """
- if length == 1:
- mset = [ [pair] for pair in allpairs ]
- return mset, mset
- else:
- mset, mset_l = mset_com(allpairs, length - 1)
- mset_tmp = []
- for pairset in mset_l: # for each pair set of length l-1
- nodeset1 = [ pair[0] for pair in pairset ] # nodes already in the set
- nodeset2 = [ pair[1] for pair in pairset ]
- for pair in allpairs:
- if (pair[0] not in nodeset1) and (pair[1] not in nodeset2): # nodes in R should be unique
- mset_tmp.append(pairset + [pair]) # add this pair to the pair set of length l-1, constructing a new set of length l
- nodeset1.append(pair[0])
- nodeset2.append(pair[1])
-
- mset.extend(mset_tmp)
-
- return mset, mset_tmp
-
-
- allpairs = [] # all pairs those have the same node labels and edge labels
- for neighbor1 in G1[n1]:
- for neighbor2 in G2[n2]:
- if G1.node[neighbor1][node_label] == G2.node[neighbor2][node_label] \
- and G1[n1][neighbor1][edge_label] == G2[n2][neighbor2][edge_label]:
- allpairs.append([neighbor1, neighbor2])
-
- if allpairs != []:
- mset, _ = mset_com(allpairs, len(allpairs))
- else:
- mset = []
-
- return mset
-
-
- def kernel_h(h):
- """Calculate kernel of h-th iteration.
- """
-
- if kernel_type == 'untiln':
- all_kh = { str(n1) + '.' + str(n2) : (G1.node[n1][node_label] == G2.node[n2][node_label]) \
- for n1 in G1.nodes() for n2 in G2.nodes() } # kernels between all pair of nodes with h = 1 ]
- all_kh_tmp = all_kh.copy()
- for i in range(2, h + 1):
- for n1 in G1.nodes():
- for n2 in G2.nodes():
- kh = 0
- mset = all_msets[str(n1) + '.' + str(n2)]
- for R in mset:
- kh_tmp = 1
- for pair in R:
- kh_tmp *= lmda * all_kh[str(pair[0]) + '.' + str(pair[1])]
- kh += 1 / lmda * kh_tmp
- kh = (G1.node[n1][node_label] == G2.node[n2][node_label]) * (1 + kh)
- all_kh_tmp[str(n1) + '.' + str(n2)] = kh
- all_kh = all_kh_tmp.copy()
-
- elif kernel_type == 'size':
- all_kh = { str(n1) + '.' + str(n2) : lmda * (G1.node[n1][node_label] == G2.node[n2][node_label]) \
- for n1 in G1.nodes() for n2 in G2.nodes() } # kernels between all pair of nodes with h = 1 ]
- all_kh_tmp = all_kh.copy()
- for i in range(2, h + 1):
- for n1 in G1.nodes():
- for n2 in G2.nodes():
- kh = 0
- mset = all_msets[str(n1) + '.' + str(n2)]
- for R in mset:
- kh_tmp = 1
- for pair in R:
- kh_tmp *= lmda * all_kh[str(pair[0]) + '.' + str(pair[1])]
- kh += kh_tmp
- kh *= lmda * (G1.node[n1][node_label] == G2.node[n2][node_label])
- all_kh_tmp[str(n1) + '.' + str(n2)] = kh
- all_kh = all_kh_tmp.copy()
-
- elif kernel_type == 'branching':
- all_kh = { str(n1) + '.' + str(n2) : (G1.node[n1][node_label] == G2.node[n2][node_label]) \
- for n1 in G1.nodes() for n2 in G2.nodes() } # kernels between all pair of nodes with h = 1 ]
- all_kh_tmp = all_kh.copy()
- for i in range(2, h + 1):
- for n1 in G1.nodes():
- for n2 in G2.nodes():
- kh = 0
- mset = all_msets[str(n1) + '.' + str(n2)]
- for R in mset:
- kh_tmp = 1
- for pair in R:
- kh_tmp *= lmda * all_kh[str(pair[0]) + '.' + str(pair[1])]
- kh += 1 / lmda * kh_tmp
- kh *= (G1.node[n1][node_label] == G2.node[n2][node_label])
- all_kh_tmp[str(n1) + '.' + str(n2)] = kh
- all_kh = all_kh_tmp.copy()
-
- return all_kh
-
-
-
- # calculate matching sets for every pair of nodes at first to avoid calculating in every iteration.
- all_msets = ({ str(node1) + '.' + str(node2) : matchingset(node1, node2) for node1 in G1.nodes() \
- for node2 in G2.nodes() } if h > 1 else {})
-
- all_kh = kernel_h(h)
- kernel = sum(all_kh.values())
-
- if kernel_type == 'size':
- kernel = kernel / (lmda ** h)
-
- return kernel
|