You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

weisfeilerLehmanKernel.py 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299
  1. import sys
  2. import pathlib
  3. sys.path.insert(0, "../")
  4. import networkx as nx
  5. import numpy as np
  6. import time
  7. from pygraph.kernels.spkernel import spkernel
  8. from pygraph.kernels.pathKernel import pathkernel
  9. import sys
  10. import pathlib
  11. from collections import Counter
  12. sys.path.insert(0, "../")
  13. import networkx as nx
  14. import numpy as np
  15. import time
  16. from pygraph.kernels.spkernel import spkernel
  17. from pygraph.kernels.pathKernel import pathkernel
  18. def weisfeilerlehmankernel(*args, node_label = 'atom', edge_label = 'bond_type', height = 0, base_kernel = 'subtree'):
  19. """Calculate Weisfeiler-Lehman kernels between graphs.
  20. Parameters
  21. ----------
  22. Gn : List of NetworkX graph
  23. List of graphs between which the kernels are calculated.
  24. /
  25. G1, G2 : NetworkX graphs
  26. 2 graphs between which the kernel is calculated.
  27. node_label : string
  28. node attribute used as label. The default node label is atom.
  29. edge_label : string
  30. edge attribute used as label. The default edge label is bond_type.
  31. height : int
  32. subtree height
  33. base_kernel : string
  34. base kernel used in each iteration of WL kernel. The default base kernel is subtree kernel.
  35. Return
  36. ------
  37. Kmatrix/kernel : Numpy matrix/float
  38. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. / Weisfeiler-Lehman kernel between 2 graphs.
  39. Notes
  40. -----
  41. This function now supports WL subtree kernel and WL shortest path kernel.
  42. References
  43. ----------
  44. [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM. Weisfeiler-lehman graph kernels. Journal of Machine Learning Research. 2011;12(Sep):2539-61.
  45. """
  46. if len(args) == 1: # for a list of graphs
  47. start_time = time.time()
  48. # for WL subtree kernel
  49. if base_kernel == 'subtree':
  50. Kmatrix = _wl_subtreekernel_do(args[0], node_label, edge_label, height = height, base_kernel = 'subtree')
  51. # for WL edge kernel
  52. elif base_kernel == 'edge':
  53. print('edge')
  54. # for WL shortest path kernel
  55. elif base_kernel == 'sp':
  56. Gn = args[0]
  57. Kmatrix = np.zeros((len(Gn), len(Gn)))
  58. for i in range(0, len(Gn)):
  59. for j in range(i, len(Gn)):
  60. Kmatrix[i][j] = _weisfeilerlehmankernel_do(Gn[i], Gn[j], height = height)
  61. Kmatrix[j][i] = Kmatrix[i][j]
  62. run_time = time.time() - start_time
  63. print("\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---" % (base_kernel, len(args[0]), run_time))
  64. return Kmatrix, run_time
  65. else: # for only 2 graphs
  66. start_time = time.time()
  67. # for WL subtree kernel
  68. if base_kernel == 'subtree':
  69. args = [args[0], args[1]]
  70. kernel = _wl_subtreekernel_do(args, node_label, edge_label, height = height, base_kernel = 'subtree')
  71. # for WL edge kernel
  72. elif base_kernel == 'edge':
  73. print('edge')
  74. # for WL shortest path kernel
  75. elif base_kernel == 'sp':
  76. kernel = _pathkernel_do(args[0], args[1])
  77. run_time = time.time() - start_time
  78. print("\n --- Weisfeiler-Lehman %s kernel built in %s seconds ---" % (base_kernel, run_time))
  79. return kernel, run_time
  80. def _wl_subtreekernel_do(*args, node_label = 'atom', edge_label = 'bond_type', height = 0, base_kernel = 'subtree'):
  81. """Calculate Weisfeiler-Lehman subtree kernels between graphs.
  82. Parameters
  83. ----------
  84. Gn : List of NetworkX graph
  85. List of graphs between which the kernels are calculated.
  86. node_label : string
  87. node attribute used as label. The default node label is atom.
  88. edge_label : string
  89. edge attribute used as label. The default edge label is bond_type.
  90. height : int
  91. subtree height
  92. base_kernel : string
  93. base kernel used in each iteration of WL kernel. The default base kernel is subtree kernel.
  94. Return
  95. ------
  96. Kmatrix/kernel : Numpy matrix/float
  97. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
  98. """
  99. height = int(height)
  100. Gn = args[0]
  101. Kmatrix = np.zeros((len(Gn), len(Gn)))
  102. all_num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
  103. # initial for height = 0
  104. all_labels_ori = set() # all unique orignal labels in all graphs in this iteration
  105. all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration
  106. all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
  107. num_of_labels_occured = all_num_of_labels_occured # number of the set of letters that occur before as node labels at least once in all graphs
  108. # for each graph
  109. for G in Gn:
  110. # get the set of original labels
  111. labels_ori = list(nx.get_node_attributes(G, node_label).values())
  112. all_labels_ori.update(labels_ori)
  113. num_of_each_label = dict(Counter(labels_ori)) # number of occurence of each label in graph
  114. all_num_of_each_label.append(num_of_each_label)
  115. num_of_labels = len(num_of_each_label) # number of all unique labels
  116. all_labels_ori.update(labels_ori)
  117. all_num_of_labels_occured += len(all_labels_ori)
  118. # calculate subtree kernel with the 0th iteration and add it to the final kernel
  119. for i in range(0, len(Gn)):
  120. for j in range(i, len(Gn)):
  121. labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys()))
  122. vector1 = np.matrix([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ])
  123. vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ])
  124. Kmatrix[i][j] += np.dot(vector1, vector2.transpose())
  125. Kmatrix[j][i] = Kmatrix[i][j]
  126. # iterate each height
  127. for h in range(1, height + 1):
  128. all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
  129. num_of_labels_occured = all_num_of_labels_occured # number of the set of letters that occur before as node labels at least once in all graphs
  130. all_labels_ori = set()
  131. all_num_of_each_label = []
  132. # for each graph
  133. for idx, G in enumerate(Gn):
  134. set_multisets = []
  135. for node in G.nodes(data = True):
  136. # Multiset-label determination.
  137. multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ]
  138. # sorting each multiset
  139. multiset.sort()
  140. multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
  141. set_multisets.append(multiset)
  142. # label compression
  143. set_unique = list(set(set_multisets)) # set of unique multiset labels
  144. # a dictionary mapping original labels to new ones.
  145. set_compressed = {}
  146. # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label
  147. for value in set_unique:
  148. if value in all_set_compressed.keys():
  149. set_compressed.update({ value : all_set_compressed[value] })
  150. else:
  151. set_compressed.update({ value : str(num_of_labels_occured + 1) })
  152. num_of_labels_occured += 1
  153. all_set_compressed.update(set_compressed)
  154. # relabel nodes
  155. for node in G.nodes(data = True):
  156. node[1][node_label] = set_compressed[set_multisets[node[0]]]
  157. # get the set of compressed labels
  158. labels_comp = list(nx.get_node_attributes(G, node_label).values())
  159. all_labels_ori.update(labels_comp)
  160. num_of_each_label = dict(Counter(labels_comp))
  161. all_num_of_each_label.append(num_of_each_label)
  162. all_num_of_labels_occured += len(all_labels_ori)
  163. # calculate subtree kernel with h iterations and add it to the final kernel
  164. for i in range(0, len(Gn)):
  165. for j in range(i, len(Gn)):
  166. labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys()))
  167. vector1 = np.matrix([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ])
  168. vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ])
  169. Kmatrix[i][j] += np.dot(vector1, vector2.transpose())
  170. Kmatrix[j][i] = Kmatrix[i][j]
  171. return Kmatrix
  172. def _weisfeilerlehmankernel_do(G1, G2, height = 0):
  173. """Calculate Weisfeiler-Lehman kernels between 2 graphs. This kernel use shortest path kernel to calculate kernel between two graphs in each iteration.
  174. Parameters
  175. ----------
  176. G1, G2 : NetworkX graphs
  177. 2 graphs between which the kernel is calculated.
  178. Return
  179. ------
  180. kernel : float
  181. Weisfeiler-Lehman kernel between 2 graphs.
  182. """
  183. # init.
  184. height = int(height)
  185. kernel = 0 # init kernel
  186. num_nodes1 = G1.number_of_nodes()
  187. num_nodes2 = G2.number_of_nodes()
  188. # the first iteration.
  189. # labelset1 = { G1.nodes(data = True)[i]['label'] for i in range(num_nodes1) }
  190. # labelset2 = { G2.nodes(data = True)[i]['label'] for i in range(num_nodes2) }
  191. kernel += spkernel(G1, G2) # change your base kernel here (and one more below)
  192. for h in range(0, height + 1):
  193. # if labelset1 != labelset2:
  194. # break
  195. # Weisfeiler-Lehman test of graph isomorphism.
  196. relabel(G1)
  197. relabel(G2)
  198. # calculate kernel
  199. kernel += spkernel(G1, G2) # change your base kernel here (and one more before)
  200. # get label sets of both graphs
  201. # labelset1 = { G1.nodes(data = True)[i]['label'] for i in range(num_nodes1) }
  202. # labelset2 = { G2.nodes(data = True)[i]['label'] for i in range(num_nodes2) }
  203. return kernel
  204. def relabel(G):
  205. '''
  206. Relabel nodes in graph G in one iteration of the 1-dim. WL test of graph isomorphism.
  207. Parameters
  208. ----------
  209. G : NetworkX graph
  210. The graphs whose nodes are relabeled.
  211. '''
  212. # get the set of original labels
  213. labels_ori = list(nx.get_node_attributes(G, 'label').values())
  214. num_of_each_label = dict(Counter(labels_ori))
  215. num_of_labels = len(num_of_each_label)
  216. set_multisets = []
  217. for node in G.nodes(data = True):
  218. # Multiset-label determination.
  219. multiset = [ G.node[neighbors]['label'] for neighbors in G[node[0]] ]
  220. # sorting each multiset
  221. multiset.sort()
  222. multiset = node[1]['label'] + ''.join(multiset) # concatenate to a string and add the prefix
  223. set_multisets.append(multiset)
  224. # label compression
  225. # set_multisets.sort() # this is unnecessary
  226. set_unique = list(set(set_multisets)) # set of unique multiset labels
  227. set_compressed = { value : str(set_unique.index(value) + num_of_labels + 1) for value in set_unique } # assign new labels
  228. # relabel nodes
  229. # nx.relabel_nodes(G, set_compressed, copy = False)
  230. for node in G.nodes(data = True):
  231. node[1]['label'] = set_compressed[set_multisets[node[0]]]
  232. # get the set of compressed labels
  233. labels_comp = list(nx.get_node_attributes(G, 'label').values())
  234. num_of_each_label.update(dict(Counter(labels_comp)))

A Python package for graph kernels, graph edit distances and the graph pre-image problem.