You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

untildPathKernel.py 7.8 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. import sys
  2. import pathlib
  3. sys.path.insert(0, "../")
  4. import time
  5. from collections import Counter
  6. import networkx as nx
  7. import numpy as np
  8. def untildpathkernel(*args, node_label = 'atom', edge_label = 'bond_type', labeled = True, depth = 10, k_func = 'tanimoto'):
  9. """Calculate path graph kernels up to depth d between graphs.
  10. Parameters
  11. ----------
  12. Gn : List of NetworkX graph
  13. List of graphs between which the kernels are calculated.
  14. /
  15. G1, G2 : NetworkX graphs
  16. 2 graphs between which the kernel is calculated.
  17. node_label : string
  18. node attribute used as label. The default node label is atom.
  19. edge_label : string
  20. edge attribute used as label. The default edge label is bond_type.
  21. labeled : boolean
  22. Whether the graphs are labeled. The default is True.
  23. depth : integer
  24. Depth of search. Longest length of paths.
  25. k_func : function
  26. A kernel function used using different notions of fingerprint similarity.
  27. Return
  28. ------
  29. Kmatrix/kernel : Numpy matrix/float
  30. Kernel matrix, each element of which is the path kernel up to d between 2 praphs. / Path kernel up to d between 2 graphs.
  31. """
  32. depth = int(depth)
  33. if len(args) == 1: # for a list of graphs
  34. Gn = args[0]
  35. Kmatrix = np.zeros((len(Gn), len(Gn)))
  36. start_time = time.time()
  37. # get all paths of all graphs before calculating kernels to save time, but this may cost a lot of memory for large dataset.
  38. all_paths = [ find_all_paths_until_length(Gn[i], depth, node_label = node_label, edge_label = edge_label, labeled = labeled) for i in range(0, len(Gn)) ]
  39. for i in range(0, len(Gn)):
  40. for j in range(i, len(Gn)):
  41. Kmatrix[i][j] = _untildpathkernel_do(all_paths[i], all_paths[j], k_func, node_label = node_label, edge_label = edge_label, labeled = labeled)
  42. Kmatrix[j][i] = Kmatrix[i][j]
  43. run_time = time.time() - start_time
  44. print("\n --- kernel matrix of path kernel up to %d of size %d built in %s seconds ---" % (depth, len(Gn), run_time))
  45. return Kmatrix, run_time
  46. else: # for only 2 graphs
  47. start_time = time.time()
  48. all_paths1 = find_all_paths_until_length(args[0], depth, node_label = node_label, edge_label = edge_label, labeled = labeled)
  49. all_paths2 = find_all_paths_until_length(args[1], depth, node_label = node_label, edge_label = edge_label, labeled = labeled)
  50. kernel = _untildpathkernel_do(all_paths1, all_paths2, k_func, node_label = node_label, edge_label = edge_label, labeled = labeled)
  51. run_time = time.time() - start_time
  52. print("\n --- path kernel up to %d built in %s seconds ---" % (depth, run_time))
  53. return kernel, run_time
  54. def _untildpathkernel_do(paths1, paths2, k_func, node_label = 'atom', edge_label = 'bond_type', labeled = True):
  55. """Calculate path graph kernels up to depth d between 2 graphs.
  56. Parameters
  57. ----------
  58. paths1, paths2 : list
  59. List of paths in 2 graphs, where for unlabeled graphs, each path is represented by a list of nodes; while for labeled graphs, each path is represented by a string consists of labels of nodes and edges on that path.
  60. k_func : function
  61. A kernel function used using different notions of fingerprint similarity.
  62. node_label : string
  63. node attribute used as label. The default node label is atom.
  64. edge_label : string
  65. edge attribute used as label. The default edge label is bond_type.
  66. labeled : boolean
  67. Whether the graphs are labeled. The default is True.
  68. Return
  69. ------
  70. kernel : float
  71. Treelet Kernel between 2 graphs.
  72. """
  73. all_paths = list(set(paths1 + paths2))
  74. if k_func == 'tanimoto':
  75. vector1 = [ (1 if path in paths1 else 0) for path in all_paths ]
  76. vector2 = [ (1 if path in paths2 else 0) for path in all_paths ]
  77. kernel_uv = np.dot(vector1, vector2)
  78. kernel = kernel_uv / (len(set(paths1)) + len(set(paths2)) - kernel_uv)
  79. else: # MinMax kernel
  80. path_count1 = Counter(paths1)
  81. path_count2 = Counter(paths2)
  82. vector1 = [ (path_count1[key] if (key in path_count1.keys()) else 0) for key in all_paths ]
  83. vector2 = [ (path_count2[key] if (key in path_count2.keys()) else 0) for key in all_paths ]
  84. kernel = np.sum(np.minimum(vector1, vector2)) / np.sum(np.maximum(vector1, vector2))
  85. return kernel
  86. # this method find paths repetively, it could be faster.
  87. def find_all_paths_until_length(G, length, node_label = 'atom', edge_label = 'bond_type', labeled = True):
  88. """Find all paths with a certain maximum length in a graph. A recursive depth first search is applied.
  89. Parameters
  90. ----------
  91. G : NetworkX graphs
  92. The graph in which paths are searched.
  93. length : integer
  94. The maximum length of paths.
  95. node_label : string
  96. node attribute used as label. The default node label is atom.
  97. edge_label : string
  98. edge attribute used as label. The default edge label is bond_type.
  99. labeled : boolean
  100. Whether the graphs are labeled. The default is True.
  101. Return
  102. ------
  103. path : list
  104. List of paths retrieved, where for unlabeled graphs, each path is represented by a list of nodes; while for labeled graphs, each path is represented by a string consists of labels of nodes and edges on that path.
  105. """
  106. all_paths = []
  107. for i in range(0, length + 1):
  108. new_paths = find_all_paths(G, i)
  109. if new_paths == []:
  110. break
  111. all_paths.extend(new_paths)
  112. if labeled == True: # convert paths to strings
  113. path_strs = []
  114. for path in all_paths:
  115. strlist = [ G.node[node][node_label] + G[node][path[path.index(node) + 1]][edge_label] for node in path[:-1] ]
  116. path_strs.append(''.join(strlist) + G.node[path[-1]][node_label])
  117. return path_strs
  118. return all_paths
  119. def find_paths(G, source_node, length):
  120. """Find all paths with a certain length those start from a source node. A recursive depth first search is applied.
  121. Parameters
  122. ----------
  123. G : NetworkX graphs
  124. The graph in which paths are searched.
  125. source_node : integer
  126. The number of the node from where all paths start.
  127. length : integer
  128. The length of paths.
  129. Return
  130. ------
  131. path : list of list
  132. List of paths retrieved, where each path is represented by a list of nodes.
  133. """
  134. return [[source_node]] if length == 0 else \
  135. [ [source_node] + path for neighbor in G[source_node] \
  136. for path in find_paths(G, neighbor, length - 1) if source_node not in path ]
  137. def find_all_paths(G, length):
  138. """Find all paths with a certain length in a graph. A recursive depth first search is applied.
  139. Parameters
  140. ----------
  141. G : NetworkX graphs
  142. The graph in which paths are searched.
  143. length : integer
  144. The length of paths.
  145. Return
  146. ------
  147. path : list of list
  148. List of paths retrieved, where each path is represented by a list of nodes.
  149. """
  150. all_paths = []
  151. for node in G:
  152. all_paths.extend(find_paths(G, node, length))
  153. ### The following process is not carried out according to the original article
  154. # all_paths_r = [ path[::-1] for path in all_paths ]
  155. # # For each path, two presentation are retrieved from its two extremities. Remove one of them.
  156. # for idx, path in enumerate(all_paths[:-1]):
  157. # for path2 in all_paths_r[idx+1::]:
  158. # if path == path2:
  159. # all_paths[idx] = []
  160. # break
  161. # return list(filter(lambda a: a != [], all_paths))
  162. return all_paths

A Python package for graph kernels, graph edit distances and graph pre-image problem.