You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

untildPathKernel.py 8.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
  1. """
  2. @author: linlin
  3. @references: Liva Ralaivola, Sanjay J Swamidass, Hiroto Saigo, and Pierre Baldi. Graph kernels for chemical informatics. Neural networks, 18(8):1093–1110, 2005.
  4. """
  5. import sys
  6. import pathlib
  7. sys.path.insert(0, "../")
  8. import time
  9. from collections import Counter
  10. import networkx as nx
  11. import numpy as np
  12. def untildpathkernel(*args, node_label='atom', edge_label='bond_type', labeled=True, depth=10, k_func='tanimoto'):
  13. """Calculate path graph kernels up to depth d between graphs.
  14. Parameters
  15. ----------
  16. Gn : List of NetworkX graph
  17. List of graphs between which the kernels are calculated.
  18. /
  19. G1, G2 : NetworkX graphs
  20. 2 graphs between which the kernel is calculated.
  21. node_label : string
  22. node attribute used as label. The default node label is atom.
  23. edge_label : string
  24. edge attribute used as label. The default edge label is bond_type.
  25. labeled : boolean
  26. Whether the graphs are labeled. The default is True.
  27. depth : integer
  28. Depth of search. Longest length of paths.
  29. k_func : function
  30. A kernel function used using different notions of fingerprint similarity.
  31. Return
  32. ------
  33. Kmatrix : Numpy matrix
  34. Kernel matrix, each element of which is the path kernel up to d between 2 praphs.
  35. """
  36. depth = int(depth)
  37. if len(args) == 1: # for a list of graphs
  38. Gn = args[0]
  39. Kmatrix = np.zeros((len(Gn), len(Gn)))
  40. start_time = time.time()
  41. # get all paths of all graphs before calculating kernels to save time, but this may cost a lot of memory for large dataset.
  42. all_paths = [find_all_paths_until_length(
  43. Gn[i], depth, node_label=node_label, edge_label=edge_label, labeled=labeled) for i in range(0, len(Gn))]
  44. for i in range(0, len(Gn)):
  45. for j in range(i, len(Gn)):
  46. Kmatrix[i][j] = _untildpathkernel_do(
  47. all_paths[i], all_paths[j], k_func, node_label=node_label, edge_label=edge_label, labeled=labeled)
  48. Kmatrix[j][i] = Kmatrix[i][j]
  49. run_time = time.time() - start_time
  50. print("\n --- kernel matrix of path kernel up to %d of size %d built in %s seconds ---" %
  51. (depth, len(Gn), run_time))
  52. return Kmatrix, run_time
  53. else: # for only 2 graphs
  54. start_time = time.time()
  55. all_paths1 = find_all_paths_until_length(
  56. args[0], depth, node_label=node_label, edge_label=edge_label, labeled=labeled)
  57. all_paths2 = find_all_paths_until_length(
  58. args[1], depth, node_label=node_label, edge_label=edge_label, labeled=labeled)
  59. kernel = _untildpathkernel_do(
  60. all_paths1, all_paths2, k_func, node_label=node_label, edge_label=edge_label, labeled=labeled)
  61. run_time = time.time() - start_time
  62. print("\n --- path kernel up to %d built in %s seconds ---" %
  63. (depth, run_time))
  64. return kernel, run_time
  65. def _untildpathkernel_do(paths1, paths2, k_func, node_label='atom', edge_label='bond_type', labeled=True):
  66. """Calculate path graph kernels up to depth d between 2 graphs.
  67. Parameters
  68. ----------
  69. paths1, paths2 : list
  70. List of paths in 2 graphs, where for unlabeled graphs, each path is represented by a list of nodes; while for labeled graphs, each path is represented by a string consists of labels of nodes and edges on that path.
  71. k_func : function
  72. A kernel function used using different notions of fingerprint similarity.
  73. node_label : string
  74. node attribute used as label. The default node label is atom.
  75. edge_label : string
  76. edge attribute used as label. The default edge label is bond_type.
  77. labeled : boolean
  78. Whether the graphs are labeled. The default is True.
  79. Return
  80. ------
  81. kernel : float
  82. Treelet Kernel between 2 graphs.
  83. """
  84. all_paths = list(set(paths1 + paths2))
  85. if k_func == 'tanimoto':
  86. vector1 = [(1 if path in paths1 else 0) for path in all_paths]
  87. vector2 = [(1 if path in paths2 else 0) for path in all_paths]
  88. kernel_uv = np.dot(vector1, vector2)
  89. kernel = kernel_uv / (len(set(paths1)) + len(set(paths2)) - kernel_uv)
  90. else: # MinMax kernel
  91. path_count1 = Counter(paths1)
  92. path_count2 = Counter(paths2)
  93. vector1 = [(path_count1[key] if (key in path_count1.keys()) else 0)
  94. for key in all_paths]
  95. vector2 = [(path_count2[key] if (key in path_count2.keys()) else 0)
  96. for key in all_paths]
  97. kernel = np.sum(np.minimum(vector1, vector2)) / \
  98. np.sum(np.maximum(vector1, vector2))
  99. return kernel
  100. # this method find paths repetively, it could be faster.
  101. def find_all_paths_until_length(G, length, node_label='atom', edge_label='bond_type', labeled=True):
  102. """Find all paths with a certain maximum length in a graph. A recursive depth first search is applied.
  103. Parameters
  104. ----------
  105. G : NetworkX graphs
  106. The graph in which paths are searched.
  107. length : integer
  108. The maximum length of paths.
  109. node_label : string
  110. node attribute used as label. The default node label is atom.
  111. edge_label : string
  112. edge attribute used as label. The default edge label is bond_type.
  113. labeled : boolean
  114. Whether the graphs are labeled. The default is True.
  115. Return
  116. ------
  117. path : list
  118. List of paths retrieved, where for unlabeled graphs, each path is represented by a list of nodes; while for labeled graphs, each path is represented by a string consists of labels of nodes and edges on that path.
  119. """
  120. all_paths = []
  121. for i in range(0, length + 1):
  122. new_paths = find_all_paths(G, i)
  123. if new_paths == []:
  124. break
  125. all_paths.extend(new_paths)
  126. if labeled == True: # convert paths to strings
  127. path_strs = []
  128. for path in all_paths:
  129. strlist = [G.node[node][node_label] + G[node]
  130. [path[path.index(node) + 1]][edge_label] for node in path[:-1]]
  131. path_strs.append(''.join(strlist) + G.node[path[-1]][node_label])
  132. return path_strs
  133. return all_paths
  134. def find_paths(G, source_node, length):
  135. """Find all paths with a certain length those start from a source node. A recursive depth first search is applied.
  136. Parameters
  137. ----------
  138. G : NetworkX graphs
  139. The graph in which paths are searched.
  140. source_node : integer
  141. The number of the node from where all paths start.
  142. length : integer
  143. The length of paths.
  144. Return
  145. ------
  146. path : list of list
  147. List of paths retrieved, where each path is represented by a list of nodes.
  148. """
  149. return [[source_node]] if length == 0 else \
  150. [[source_node] + path for neighbor in G[source_node]
  151. for path in find_paths(G, neighbor, length - 1) if source_node not in path]
  152. def find_all_paths(G, length):
  153. """Find all paths with a certain length in a graph. A recursive depth first search is applied.
  154. Parameters
  155. ----------
  156. G : NetworkX graphs
  157. The graph in which paths are searched.
  158. length : integer
  159. The length of paths.
  160. Return
  161. ------
  162. path : list of list
  163. List of paths retrieved, where each path is represented by a list of nodes.
  164. """
  165. all_paths = []
  166. for node in G:
  167. all_paths.extend(find_paths(G, node, length))
  168. # The following process is not carried out according to the original article
  169. # all_paths_r = [ path[::-1] for path in all_paths ]
  170. # # For each path, two presentation are retrieved from its two extremities. Remove one of them.
  171. # for idx, path in enumerate(all_paths[:-1]):
  172. # for path2 in all_paths_r[idx+1::]:
  173. # if path == path2:
  174. # all_paths[idx] = []
  175. # break
  176. # return list(filter(lambda a: a != [], all_paths))
  177. return all_paths

A Python package for graph kernels, graph edit distances and graph pre-image problem.