You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

untilHPathKernel.py 9.8 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276
  1. """
  2. @author: linlin
  3. @references: Liva Ralaivola, Sanjay J Swamidass, Hiroto Saigo, and Pierre Baldi. Graph kernels for chemical informatics. Neural networks, 18(8):1093–1110, 2005.
  4. """
  5. import sys
  6. import pathlib
  7. sys.path.insert(0, "../")
  8. import time
  9. from collections import Counter
  10. from itertools import chain
  11. from tqdm import tqdm
  12. import networkx as nx
  13. import numpy as np
  14. from pygraph.utils.graphdataset import get_dataset_attributes
  15. def untilhpathkernel(*args,
  16. node_label='atom',
  17. edge_label='bond_type',
  18. depth=10,
  19. k_func='tanimoto'):
  20. """Calculate path graph kernels up to depth d between graphs.
  21. Parameters
  22. ----------
  23. Gn : List of NetworkX graph
  24. List of graphs between which the kernels are calculated.
  25. /
  26. G1, G2 : NetworkX graphs
  27. 2 graphs between which the kernel is calculated.
  28. node_label : string
  29. node attribute used as label. The default node label is atom.
  30. edge_label : string
  31. edge attribute used as label. The default edge label is bond_type.
  32. depth : integer
  33. Depth of search. Longest length of paths.
  34. k_func : function
  35. A kernel function used using different notions of fingerprint similarity.
  36. Return
  37. ------
  38. Kmatrix : Numpy matrix
  39. Kernel matrix, each element of which is the path kernel up to d between 2 praphs.
  40. """
  41. depth = int(depth)
  42. Gn = args[0] if len(args) == 1 else [args[0], args[1]]
  43. Kmatrix = np.zeros((len(Gn), len(Gn)))
  44. ds_attrs = get_dataset_attributes(
  45. Gn,
  46. attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
  47. node_label=node_label,
  48. edge_label=edge_label)
  49. start_time = time.time()
  50. # get all paths of all graphs before calculating kernels to save time, but this may cost a lot of memory for large datasets.
  51. all_paths = [
  52. find_all_paths_until_length(
  53. Gn[i],
  54. depth,
  55. ds_attrs,
  56. node_label=node_label,
  57. edge_label=edge_label) for i in tqdm(
  58. range(0, len(Gn)), desc='getting paths', file=sys.stdout)
  59. ]
  60. pbar = tqdm(
  61. total=((len(Gn) + 1) * len(Gn) / 2),
  62. desc='calculating kernels',
  63. file=sys.stdout)
  64. for i in range(0, len(Gn)):
  65. for j in range(i, len(Gn)):
  66. Kmatrix[i][j] = _untilhpathkernel_do(all_paths[i], all_paths[j],
  67. k_func)
  68. Kmatrix[j][i] = Kmatrix[i][j]
  69. pbar.update(1)
  70. run_time = time.time() - start_time
  71. print(
  72. "\n --- kernel matrix of path kernel up to %d of size %d built in %s seconds ---"
  73. % (depth, len(Gn), run_time))
  74. return Kmatrix, run_time
  75. def _untilhpathkernel_do(paths1, paths2, k_func):
  76. """Calculate path graph kernels up to depth d between 2 graphs.
  77. Parameters
  78. ----------
  79. paths1, paths2 : list
  80. List of paths in 2 graphs, where for unlabeled graphs, each path is represented by a list of nodes; while for labeled graphs, each path is represented by a string consists of labels of nodes and edges on that path.
  81. k_func : function
  82. A kernel function used using different notions of fingerprint similarity.
  83. Return
  84. ------
  85. kernel : float
  86. Treelet Kernel between 2 graphs.
  87. """
  88. all_paths = list(set(paths1 + paths2))
  89. if k_func == 'tanimoto':
  90. vector1 = [(1 if path in paths1 else 0) for path in all_paths]
  91. vector2 = [(1 if path in paths2 else 0) for path in all_paths]
  92. kernel_uv = np.dot(vector1, vector2)
  93. kernel = kernel_uv / (len(set(paths1)) + len(set(paths2)) - kernel_uv)
  94. else: # MinMax kernel
  95. path_count1 = Counter(paths1)
  96. path_count2 = Counter(paths2)
  97. vector1 = [(path_count1[key] if (key in path_count1.keys()) else 0)
  98. for key in all_paths]
  99. vector2 = [(path_count2[key] if (key in path_count2.keys()) else 0)
  100. for key in all_paths]
  101. kernel = np.sum(np.minimum(vector1, vector2)) / \
  102. np.sum(np.maximum(vector1, vector2))
  103. return kernel
# NOTE: this method finds paths repeatedly; it could be made faster.
  105. def find_all_paths_until_length(G,
  106. length,
  107. ds_attrs,
  108. node_label='atom',
  109. edge_label='bond_type'):
  110. """Find all paths no longer than a certain maximum length in a graph. A recursive depth first search is applied.
  111. Parameters
  112. ----------
  113. G : NetworkX graphs
  114. The graph in which paths are searched.
  115. length : integer
  116. The maximum length of paths.
  117. ds_attrs: dict
  118. Dataset attributes.
  119. node_label : string
  120. Node attribute used as label. The default node label is atom.
  121. edge_label : string
  122. Edge attribute used as label. The default edge label is bond_type.
  123. Return
  124. ------
  125. path : list
  126. List of paths retrieved, where for unlabeled graphs, each path is represented by a list of nodes; while for labeled graphs, each path is represented by a list of strings consists of labels of nodes and/or edges on that path.
  127. """
  128. # path_l = [tuple([n]) for n in G.nodes] # paths of length l
  129. # all_paths = path_l[:]
  130. # for l in range(1, length + 1):
  131. # path_l_new = []
  132. # for path in path_l:
  133. # for neighbor in G[path[-1]]:
  134. # if len(path) < 2 or neighbor != path[-2]:
  135. # tmp = path + (neighbor, )
  136. # if tuple(tmp[::-1]) not in path_l_new:
  137. # path_l_new.append(tuple(tmp))
  138. # all_paths += path_l_new
  139. # path_l = path_l_new[:]
  140. path_l = [[n] for n in G.nodes] # paths of length l
  141. all_paths = path_l[:]
  142. for l in range(1, length + 1):
  143. path_l_new = []
  144. for path in path_l:
  145. for neighbor in G[path[-1]]:
  146. if len(path) < 2 or neighbor != path[-2]:
  147. tmp = path + [neighbor]
  148. if tmp[::-1] not in path_l_new:
  149. path_l_new.append(tmp)
  150. all_paths += path_l_new
  151. path_l = path_l_new[:]
  152. # for i in range(0, length + 1):
  153. # new_paths = find_all_paths(G, i)
  154. # if new_paths == []:
  155. # break
  156. # all_paths.extend(new_paths)
  157. # consider labels
  158. if ds_attrs['node_labeled']:
  159. if ds_attrs['edge_labeled']:
  160. path_strs = [
  161. tuple(
  162. list(
  163. chain.from_iterable(
  164. (G.node[node][node_label],
  165. G[node][path[idx + 1]][edge_label])
  166. for idx, node in enumerate(path[:-1]))) +
  167. [G.node[path[-1]][node_label]]) for path in all_paths
  168. ]
  169. # path_strs = []
  170. # for path in all_paths:
  171. # strlist = list(
  172. # chain.from_iterable((G.node[node][node_label],
  173. # G[node][path[idx + 1]][edge_label])
  174. # for idx, node in enumerate(path[:-1])))
  175. # strlist.append(G.node[path[-1]][node_label])
  176. # path_strs.append(tuple(strlist))
  177. else:
  178. path_strs = [
  179. tuple([G.node[node][node_label] for node in path])
  180. for path in all_paths
  181. ]
  182. return path_strs
  183. else:
  184. if ds_attrs['edge_labeled']:
  185. return [
  186. tuple([] if len(path) == 1 else [
  187. G[node][path[idx + 1]][edge_label]
  188. for idx, node in enumerate(path[:-1])
  189. ]) for path in all_paths
  190. ]
  191. else:
  192. return [tuple([len(path)]) for path in all_paths]
  193. # def find_paths(G, source_node, length):
  194. # """Find all paths no longer than a certain length those start from a source node. A recursive depth first search is applied.
  195. # Parameters
  196. # ----------
  197. # G : NetworkX graphs
  198. # The graph in which paths are searched.
  199. # source_node : integer
  200. # The number of the node from where all paths start.
  201. # length : integer
  202. # The length of paths.
  203. # Return
  204. # ------
  205. # path : list of list
  206. # List of paths retrieved, where each path is represented by a list of nodes.
  207. # """
  208. # return [[source_node]] if length == 0 else \
  209. # [[source_node] + path for neighbor in G[source_node]
  210. # for path in find_paths(G, neighbor, length - 1) if source_node not in path]
  211. # def find_all_paths(G, length):
  212. # """Find all paths with a certain length in a graph. A recursive depth first search is applied.
  213. # Parameters
  214. # ----------
  215. # G : NetworkX graphs
  216. # The graph in which paths are searched.
  217. # length : integer
  218. # The length of paths.
  219. # Return
  220. # ------
  221. # path : list of list
  222. # List of paths retrieved, where each path is represented by a list of nodes.
  223. # """
  224. # all_paths = []
  225. # for node in G:
  226. # all_paths.extend(find_paths(G, node, length))
  227. # # The following process is not carried out according to the original article
  228. # # all_paths_r = [ path[::-1] for path in all_paths ]
  229. # # # For each path, two presentation are retrieved from its two extremities. Remove one of them.
  230. # # for idx, path in enumerate(all_paths[:-1]):
  231. # # for path2 in all_paths_r[idx+1::]:
  232. # # if path == path2:
  233. # # all_paths[idx] = []
  234. # # break
  235. # # return list(filter(lambda a: a != [], all_paths))
  236. # return all_paths

A Python package for graph kernels, graph edit distances and graph pre-image problem.