You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

untilnWalkKernel.py 6.5 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182
  1. """
  2. @author: linlin
  3. @references: Thomas Gärtner, Peter Flach, and Stefan Wrobel. On graph kernels: Hardness results and efficient alternatives. Learning Theory and Kernel Machines, pages 129–143, 2003.
  4. """
  5. import sys
  6. import pathlib
  7. sys.path.insert(0, "../")
  8. import time
  9. from collections import Counter
  10. import networkx as nx
  11. import numpy as np
  12. def untilnwalkkernel(*args, node_label = 'atom', edge_label = 'bond_type', labeled = True, n = 10):
  13. """Calculate common walk graph kernels up to depth d between graphs.
  14. Parameters
  15. ----------
  16. Gn : List of NetworkX graph
  17. List of graphs between which the kernels are calculated.
  18. /
  19. G1, G2 : NetworkX graphs
  20. 2 graphs between which the kernel is calculated.
  21. node_label : string
  22. node attribute used as label. The default node label is atom.
  23. edge_label : string
  24. edge attribute used as label. The default edge label is bond_type.
  25. labeled : boolean
  26. Whether the graphs are labeled. The default is True.
  27. n : integer
  28. Longest length of walks.
  29. Return
  30. ------
  31. Kmatrix : Numpy matrix
  32. Kernel matrix, each element of which is the path kernel up to d between 2 praphs.
  33. """
  34. Gn = args[0] if len(args) == 1 else [args[0], args[1]] # arrange all graphs in a list
  35. Kmatrix = np.zeros((len(Gn), len(Gn)))
  36. n = int(n)
  37. start_time = time.time()
  38. # get all paths of all graphs before calculating kernels to save time, but this may cost a lot of memory for large dataset.
  39. all_walks = [ find_all_walks_until_length(Gn[i], n, node_label = node_label, edge_label = edge_label, labeled = labeled) for i in range(0, len(Gn)) ]
  40. for i in range(0, len(Gn)):
  41. for j in range(i, len(Gn)):
  42. Kmatrix[i][j] = _untilnwalkkernel_do(all_walks[i], all_walks[j], node_label = node_label, edge_label = edge_label, labeled = labeled)
  43. Kmatrix[j][i] = Kmatrix[i][j]
  44. run_time = time.time() - start_time
  45. print("\n --- kernel matrix of walk kernel up to %d of size %d built in %s seconds ---" % (n, len(Gn), run_time))
  46. return Kmatrix, run_time
  47. def _untilnwalkkernel_do(walks1, walks2, node_label = 'atom', edge_label = 'bond_type', labeled = True):
  48. """Calculate walk graph kernels up to n between 2 graphs.
  49. Parameters
  50. ----------
  51. walks1, walks2 : list
  52. List of walks in 2 graphs, where for unlabeled graphs, each walk is represented by a list of nodes; while for labeled graphs, each walk is represented by a string consists of labels of nodes and edges on that walk.
  53. node_label : string
  54. node attribute used as label. The default node label is atom.
  55. edge_label : string
  56. edge attribute used as label. The default edge label is bond_type.
  57. labeled : boolean
  58. Whether the graphs are labeled. The default is True.
  59. Return
  60. ------
  61. kernel : float
  62. Treelet Kernel between 2 graphs.
  63. """
  64. counts_walks1 = dict(Counter(walks1))
  65. counts_walks2 = dict(Counter(walks2))
  66. all_walks = list(set(walks1 + walks2))
  67. vector1 = [ (counts_walks1[walk] if walk in walks1 else 0) for walk in all_walks ]
  68. vector2 = [ (counts_walks2[walk] if walk in walks2 else 0) for walk in all_walks ]
  69. kernel = np.dot(vector1, vector2)
  70. return kernel
  71. # this method find walks repetively, it could be faster.
  72. def find_all_walks_until_length(G, length, node_label = 'atom', edge_label = 'bond_type', labeled = True):
  73. """Find all walks with a certain maximum length in a graph. A recursive depth first search is applied.
  74. Parameters
  75. ----------
  76. G : NetworkX graphs
  77. The graph in which walks are searched.
  78. length : integer
  79. The maximum length of walks.
  80. node_label : string
  81. node attribute used as label. The default node label is atom.
  82. edge_label : string
  83. edge attribute used as label. The default edge label is bond_type.
  84. labeled : boolean
  85. Whether the graphs are labeled. The default is True.
  86. Return
  87. ------
  88. walk : list
  89. List of walks retrieved, where for unlabeled graphs, each walk is represented by a list of nodes; while for labeled graphs, each walk is represented by a string consists of labels of nodes and edges on that walk.
  90. """
  91. all_walks = []
  92. for i in range(0, length + 1):
  93. new_walks = find_all_walks(G, i)
  94. if new_walks == []:
  95. break
  96. all_walks.extend(new_walks)
  97. if labeled == True: # convert paths to strings
  98. walk_strs = []
  99. for walk in all_walks:
  100. strlist = [ G.node[node][node_label] + G[node][walk[walk.index(node) + 1]][edge_label] for node in walk[:-1] ]
  101. walk_strs.append(''.join(strlist) + G.node[walk[-1]][node_label])
  102. return walk_strs
  103. return all_walks
  104. def find_walks(G, source_node, length):
  105. """Find all walks with a certain length those start from a source node. A recursive depth first search is applied.
  106. Parameters
  107. ----------
  108. G : NetworkX graphs
  109. The graph in which walks are searched.
  110. source_node : integer
  111. The number of the node from where all walks start.
  112. length : integer
  113. The length of walks.
  114. Return
  115. ------
  116. walk : list of list
  117. List of walks retrieved, where each walk is represented by a list of nodes.
  118. """
  119. return [[source_node]] if length == 0 else \
  120. [ [source_node] + walk for neighbor in G[source_node] \
  121. for walk in find_walks(G, neighbor, length - 1) ]
  122. def find_all_walks(G, length):
  123. """Find all walks with a certain length in a graph. A recursive depth first search is applied.
  124. Parameters
  125. ----------
  126. G : NetworkX graphs
  127. The graph in which walks are searched.
  128. length : integer
  129. The length of walks.
  130. Return
  131. ------
  132. walk : list of list
  133. List of walks retrieved, where each walk is represented by a list of nodes.
  134. """
  135. all_walks = []
  136. for node in G:
  137. all_walks.extend(find_walks(G, node, length))
  138. ### The following process is not carried out according to the original article
  139. # all_paths_r = [ path[::-1] for path in all_paths ]
  140. # # For each path, two presentation are retrieved from its two extremities. Remove one of them.
  141. # for idx, path in enumerate(all_paths[:-1]):
  142. # for path2 in all_paths_r[idx+1::]:
  143. # if path == path2:
  144. # all_paths[idx] = []
  145. # break
  146. # return list(filter(lambda a: a != [], all_paths))
  147. return all_walks

A Python package for graph kernels, graph edit distances and graph pre-image problem.