You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

weisfeilerLehmanKernel.py 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299
  1. import sys
  2. import pathlib
  3. sys.path.insert(0, "../")
  4. import networkx as nx
  5. import numpy as np
  6. import time
  7. from pygraph.kernels.spkernel import spkernel
  8. from pygraph.kernels.pathKernel import pathkernel
  9. # test of WL subtree kernel on many graphs
  10. import sys
  11. import pathlib
  12. from collections import Counter
  13. sys.path.insert(0, "../")
  14. import networkx as nx
  15. import numpy as np
  16. import time
  17. from pygraph.kernels.spkernel import spkernel
  18. from pygraph.kernels.pathKernel import pathkernel
  19. def weisfeilerlehmankernel(*args, node_label = 'atom', edge_label = 'bond_type', height = 0, base_kernel = 'subtree'):
  20. """Calculate Weisfeiler-Lehman kernels between graphs.
  21. Parameters
  22. ----------
  23. Gn : List of NetworkX graph
  24. List of graphs between which the kernels are calculated.
  25. /
  26. G1, G2 : NetworkX graphs
  27. 2 graphs between which the kernel is calculated.
  28. node_label : string
  29. node attribute used as label. The default node label is atom.
  30. edge_label : string
  31. edge attribute used as label. The default edge label is bond_type.
  32. height : int
  33. subtree height
  34. base_kernel : string
  35. base kernel used in each iteration of WL kernel. The default base kernel is subtree kernel.
  36. Return
  37. ------
  38. Kmatrix/Kernel : Numpy matrix/int
  39. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. / Weisfeiler-Lehman Kernel between 2 graphs.
  40. Notes
  41. -----
  42. This function now supports WL subtree kernel and WL shortest path kernel.
  43. References
  44. ----------
  45. [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM. Weisfeiler-lehman graph kernels. Journal of Machine Learning Research. 2011;12(Sep):2539-61.
  46. """
  47. if len(args) == 1: # for a list of graphs
  48. start_time = time.time()
  49. # for WL subtree kernel
  50. if base_kernel == 'subtree':
  51. Kmatrix = _wl_subtreekernel_do(args[0], node_label, edge_label, height = height, base_kernel = 'subtree')
  52. # for WL edge kernel
  53. elif base_kernel == 'edge':
  54. print('edge')
  55. # for WL shortest path kernel
  56. elif base_kernel == 'sp':
  57. Gn = args[0]
  58. Kmatrix = np.zeros((len(Gn), len(Gn)))
  59. for i in range(0, len(Gn)):
  60. for j in range(i, len(Gn)):
  61. Kmatrix[i][j] = _weisfeilerlehmankernel_do(Gn[i], Gn[j], height = height)
  62. Kmatrix[j][i] = Kmatrix[i][j]
  63. run_time = time.time() - start_time
  64. print("\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---" % (base_kernel, len(args[0]), run_time))
  65. return Kmatrix, run_time
  66. else: # for only 2 graphs
  67. start_time = time.time()
  68. # for WL subtree kernel
  69. if base_kernel == 'subtree':
  70. args = [args[0], args[1]]
  71. kernel = _wl_subtreekernel_do(args, node_label, edge_label, height = height, base_kernel = 'subtree')
  72. # for WL edge kernel
  73. elif base_kernel == 'edge':
  74. print('edge')
  75. # for WL shortest path kernel
  76. elif base_kernel == 'sp':
  77. kernel = _pathkernel_do(args[0], args[1])
  78. run_time = time.time() - start_time
  79. print("\n --- Weisfeiler-Lehman %s kernel built in %s seconds ---" % (base_kernel, run_time))
  80. return kernel, run_time
  81. def _wl_subtreekernel_do(*args, node_label = 'atom', edge_label = 'bond_type', height = 0, base_kernel = 'subtree'):
  82. """Calculate Weisfeiler-Lehman subtree kernels between graphs.
  83. Parameters
  84. ----------
  85. Gn : List of NetworkX graph
  86. List of graphs between which the kernels are calculated.
  87. node_label : string
  88. node attribute used as label. The default node label is atom.
  89. edge_label : string
  90. edge attribute used as label. The default edge label is bond_type.
  91. height : int
  92. subtree height
  93. base_kernel : string
  94. base kernel used in each iteration of WL kernel. The default base kernel is subtree kernel.
  95. Return
  96. ------
  97. Kmatrix/Kernel : Numpy matrix/int
  98. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
  99. """
  100. Gn = args[0]
  101. Kmatrix = np.zeros((len(Gn), len(Gn)))
  102. all_num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
  103. # initial for height = 0
  104. all_labels_ori = set() # all unique orignal labels in all graphs in this iteration
  105. all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration
  106. all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
  107. num_of_labels_occured = all_num_of_labels_occured # number of the set of letters that occur before as node labels at least once in all graphs
  108. # for each graph
  109. for G in Gn:
  110. # get the set of original labels
  111. labels_ori = list(nx.get_node_attributes(G, node_label).values())
  112. all_labels_ori.update(labels_ori)
  113. num_of_each_label = dict(Counter(labels_ori)) # number of occurence of each label in graph
  114. all_num_of_each_label.append(num_of_each_label)
  115. num_of_labels = len(num_of_each_label) # number of all unique labels
  116. all_labels_ori.update(labels_ori)
  117. all_num_of_labels_occured += len(all_labels_ori)
  118. # calculate subtree kernel with the 0th iteration and add it to the final kernel
  119. for i in range(0, len(Gn)):
  120. for j in range(i, len(Gn)):
  121. labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys()))
  122. vector1 = np.matrix([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ])
  123. vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ])
  124. Kmatrix[i][j] += np.dot(vector1, vector2.transpose())
  125. Kmatrix[j][i] = Kmatrix[i][j]
  126. # iterate each height
  127. for h in range(1, height + 1):
  128. all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
  129. num_of_labels_occured = all_num_of_labels_occured # number of the set of letters that occur before as node labels at least once in all graphs
  130. all_labels_ori = set()
  131. all_num_of_each_label = []
  132. # for each graph
  133. for idx, G in enumerate(Gn):
  134. set_multisets = []
  135. for node in G.nodes(data = True):
  136. # Multiset-label determination.
  137. multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ]
  138. # sorting each multiset
  139. multiset.sort()
  140. multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
  141. set_multisets.append(multiset)
  142. # label compression
  143. set_unique = list(set(set_multisets)) # set of unique multiset labels
  144. # a dictionary mapping original labels to new ones.
  145. set_compressed = {}
  146. # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label
  147. for value in set_unique:
  148. if value in all_set_compressed.keys():
  149. set_compressed.update({ value : all_set_compressed[value] })
  150. else:
  151. set_compressed.update({ value : str(num_of_labels_occured + 1) })
  152. num_of_labels_occured += 1
  153. all_set_compressed.update(set_compressed)
  154. # relabel nodes
  155. for node in G.nodes(data = True):
  156. node[1][node_label] = set_compressed[set_multisets[node[0]]]
  157. # get the set of compressed labels
  158. labels_comp = list(nx.get_node_attributes(G, node_label).values())
  159. all_labels_ori.update(labels_comp)
  160. num_of_each_label = dict(Counter(labels_comp))
  161. all_num_of_each_label.append(num_of_each_label)
  162. all_num_of_labels_occured += len(all_labels_ori)
  163. # calculate subtree kernel with h iterations and add it to the final kernel
  164. for i in range(0, len(Gn)):
  165. for j in range(i, len(Gn)):
  166. labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys()))
  167. vector1 = np.matrix([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ])
  168. vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ])
  169. Kmatrix[i][j] += np.dot(vector1, vector2.transpose())
  170. Kmatrix[j][i] = Kmatrix[i][j]
  171. return Kmatrix
  172. def _weisfeilerlehmankernel_do(G1, G2, height = 0):
  173. """Calculate Weisfeiler-Lehman kernels between 2 graphs. This kernel use shortest path kernel to calculate kernel between two graphs in each iteration.
  174. Parameters
  175. ----------
  176. G1, G2 : NetworkX graphs
  177. 2 graphs between which the kernel is calculated.
  178. Return
  179. ------
  180. Kernel : int
  181. Weisfeiler-Lehman Kernel between 2 graphs.
  182. """
  183. # init.
  184. kernel = 0 # init kernel
  185. num_nodes1 = G1.number_of_nodes()
  186. num_nodes2 = G2.number_of_nodes()
  187. # the first iteration.
  188. # labelset1 = { G1.nodes(data = True)[i]['label'] for i in range(num_nodes1) }
  189. # labelset2 = { G2.nodes(data = True)[i]['label'] for i in range(num_nodes2) }
  190. kernel += spkernel(G1, G2) # change your base kernel here (and one more below)
  191. for h in range(0, height + 1):
  192. # if labelset1 != labelset2:
  193. # break
  194. # Weisfeiler-Lehman test of graph isomorphism.
  195. relabel(G1)
  196. relabel(G2)
  197. # calculate kernel
  198. kernel += spkernel(G1, G2) # change your base kernel here (and one more before)
  199. # get label sets of both graphs
  200. # labelset1 = { G1.nodes(data = True)[i]['label'] for i in range(num_nodes1) }
  201. # labelset2 = { G2.nodes(data = True)[i]['label'] for i in range(num_nodes2) }
  202. return kernel
  203. def relabel(G):
  204. '''
  205. Relabel nodes in graph G in one iteration of the 1-dim. WL test of graph isomorphism.
  206. Parameters
  207. ----------
  208. G : NetworkX graph
  209. The graphs whose nodes are relabeled.
  210. '''
  211. # get the set of original labels
  212. labels_ori = list(nx.get_node_attributes(G, 'label').values())
  213. num_of_each_label = dict(Counter(labels_ori))
  214. num_of_labels = len(num_of_each_label)
  215. set_multisets = []
  216. for node in G.nodes(data = True):
  217. # Multiset-label determination.
  218. multiset = [ G.node[neighbors]['label'] for neighbors in G[node[0]] ]
  219. # sorting each multiset
  220. multiset.sort()
  221. multiset = node[1]['label'] + ''.join(multiset) # concatenate to a string and add the prefix
  222. set_multisets.append(multiset)
  223. # label compression
  224. # set_multisets.sort() # this is unnecessary
  225. set_unique = list(set(set_multisets)) # set of unique multiset labels
  226. set_compressed = { value : str(set_unique.index(value) + num_of_labels + 1) for value in set_unique } # assign new labels
  227. # relabel nodes
  228. # nx.relabel_nodes(G, set_compressed, copy = False)
  229. for node in G.nodes(data = True):
  230. node[1]['label'] = set_compressed[set_multisets[node[0]]]
  231. # get the set of compressed labels
  232. labels_comp = list(nx.get_node_attributes(G, 'label').values())
  233. num_of_each_label.update(dict(Counter(labels_comp)))

A Python package for graph kernels, graph edit distances and graph pre-image problem.