You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

weisfeilerLehmanKernel.py 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289
  1. import sys
  2. import pathlib
  3. sys.path.insert(0, "../")
  4. import networkx as nx
  5. import numpy as np
  6. import time
  7. from pygraph.kernels.spkernel import spkernel
  8. from pygraph.kernels.pathKernel import pathkernel
  9. # test of WL subtree kernel on many graphs
  10. import sys
  11. import pathlib
  12. from collections import Counter
  13. sys.path.insert(0, "../")
  14. import networkx as nx
  15. import numpy as np
  16. import time
  17. from pygraph.kernels.spkernel import spkernel
  18. from pygraph.kernels.pathKernel import pathkernel
  19. def weisfeilerlehmankernel(*args, height = 0, base_kernel = 'subtree'):
  20. """Calculate Weisfeiler-Lehman kernels between graphs.
  21. Parameters
  22. ----------
  23. Gn : List of NetworkX graph
  24. List of graphs between which the kernels are calculated.
  25. /
  26. G1, G2 : NetworkX graphs
  27. 2 graphs between which the kernel is calculated.
  28. height : subtree height
  29. base_kernel : base kernel used in each iteration of WL kernel
  30. the default base kernel is subtree kernel
  31. Return
  32. ------
  33. Kmatrix/Kernel : Numpy matrix/int
  34. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. / Weisfeiler-Lehman Kernel between 2 graphs.
  35. Notes
  36. -----
  37. This function now supports WL subtree kernel and WL shortest path kernel.
  38. References
  39. ----------
  40. [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM. Weisfeiler-lehman graph kernels. Journal of Machine Learning Research. 2011;12(Sep):2539-61.
  41. """
  42. if len(args) == 1: # for a list of graphs
  43. start_time = time.time()
  44. # for WL subtree kernel
  45. if base_kernel == 'subtree':
  46. Kmatrix = _wl_subtreekernel_do(args[0], height = height, base_kernel = 'subtree')
  47. # for WL edge kernel
  48. elif base_kernel == 'edge':
  49. print('edge')
  50. # for WL shortest path kernel
  51. elif base_kernel == 'sp':
  52. Gn = args[0]
  53. Kmatrix = np.zeros((len(Gn), len(Gn)))
  54. for i in range(0, len(Gn)):
  55. for j in range(i, len(Gn)):
  56. Kmatrix[i][j] = _weisfeilerlehmankernel_do(Gn[i], Gn[j], height = height)
  57. Kmatrix[j][i] = Kmatrix[i][j]
  58. print("\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---" % (base_kernel, len(args[0]), (time.time() - start_time)))
  59. return Kmatrix
  60. else: # for only 2 graphs
  61. start_time = time.time()
  62. # for WL subtree kernel
  63. if base_kernel == 'subtree':
  64. args = [args[0], args[1]]
  65. kernel = _wl_subtreekernel_do(args, height = height, base_kernel = 'subtree')
  66. # for WL edge kernel
  67. elif base_kernel == 'edge':
  68. print('edge')
  69. # for WL shortest path kernel
  70. elif base_kernel == 'sp':
  71. kernel = _pathkernel_do(args[0], args[1])
  72. print("\n --- Weisfeiler-Lehman %s kernel built in %s seconds ---" % (base_kernel, time.time() - start_time))
  73. return kernel
  74. def _wl_subtreekernel_do(*args, height = 0, base_kernel = 'subtree'):
  75. """Calculate Weisfeiler-Lehman subtree kernels between graphs.
  76. Parameters
  77. ----------
  78. Gn : List of NetworkX graph
  79. List of graphs between which the kernels are calculated.
  80. Return
  81. ------
  82. Kmatrix/Kernel : Numpy matrix/int
  83. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
  84. """
  85. Gn = args[0]
  86. Kmatrix = np.zeros((len(Gn), len(Gn)))
  87. all_num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
  88. # initial
  89. # for each graph
  90. for idx, G in enumerate(Gn):
  91. # get the set of original labels
  92. labels_ori = list(nx.get_node_attributes(G, 'label').values())
  93. num_of_each_label = dict(Counter(labels_ori)) # number of occurence of each label in graph
  94. num_of_labels = len(num_of_each_label) # number of all unique labels
  95. all_labels_ori.update(labels_ori)
  96. # # calculate subtree kernel while h = 0 and add it to the final kernel
  97. # for i in range(0, len(Gn)):
  98. # for j in range(i, len(Gn)):
  99. # labels = set(list(nx.get_node_attributes(Gn[i], 'label').values()) + list(nx.get_node_attributes(Gn[j], 'label').values()))
  100. # vector1 = np.matrix([ (nx.get_node_attributes(Gn[i], 'label').values()[label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ])
  101. # vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ])
  102. # Kmatrix[i][j] += np.dot(vector1, vector2.transpose())
  103. # Kmatrix[j][i] = Kmatrix[i][j]
  104. # iterate each height
  105. for h in range(height + 1):
  106. all_labels_ori = set() # all unique orignal labels in all graphs in this iteration
  107. all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration
  108. all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
  109. num_of_labels_occured = all_num_of_labels_occured # number of the set of letters that occur before as node labels at least once in all graphs
  110. # for each graph
  111. for idx, G in enumerate(Gn):
  112. # get the set of original labels
  113. labels_ori = list(nx.get_node_attributes(G, 'label').values())
  114. num_of_each_label = dict(Counter(labels_ori)) # number of occurence of each label in graph
  115. num_of_labels = len(num_of_each_label) # number of all unique labels
  116. all_labels_ori.update(labels_ori)
  117. num_of_labels_occured = all_num_of_labels_occured + len(all_labels_ori) + len(all_set_compressed)
  118. set_multisets = []
  119. for node in G.nodes(data = True):
  120. # Multiset-label determination.
  121. multiset = [ G.node[neighbors]['label'] for neighbors in G[node[0]] ]
  122. # sorting each multiset
  123. multiset.sort()
  124. multiset = node[1]['label'] + ''.join(multiset) # concatenate to a string and add the prefix
  125. set_multisets.append(multiset)
  126. # label compression
  127. # set_multisets.sort() # this is unnecessary
  128. set_unique = list(set(set_multisets)) # set of unique multiset labels
  129. # a dictionary mapping original labels to new ones.
  130. set_compressed = {}
  131. # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label
  132. for value in set_unique:
  133. if value in all_set_compressed.keys():
  134. set_compressed.update({ value : all_set_compressed[value] })
  135. else:
  136. set_compressed.update({ value : str(num_of_labels_occured + 1) })
  137. num_of_labels_occured += 1
  138. # set_compressed = { value : (all_set_compressed[value] if value in all_set_compressed.keys() else str(set_unique.index(value) + num_of_labels_occured + 1)) for value in set_unique }
  139. all_set_compressed.update(set_compressed)
  140. # num_of_labels_occured += len(set_compressed) #@todo not precise
  141. # relabel nodes
  142. # nx.relabel_nodes(G, set_compressed, copy = False)
  143. for node in G.nodes(data = True):
  144. node[1]['label'] = set_compressed[set_multisets[node[0]]]
  145. # get the set of compressed labels
  146. labels_comp = list(nx.get_node_attributes(G, 'label').values())
  147. num_of_each_label.update(dict(Counter(labels_comp)))
  148. all_num_of_each_label.append(num_of_each_label)
  149. # calculate subtree kernel with h iterations and add it to the final kernel
  150. for i in range(0, len(Gn)):
  151. for j in range(i, len(Gn)):
  152. labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys()))
  153. vector1 = np.matrix([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ])
  154. vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ])
  155. Kmatrix[i][j] += np.dot(vector1, vector2.transpose())
  156. Kmatrix[j][i] = Kmatrix[i][j]
  157. all_num_of_labels_occured += len(all_labels_ori)
  158. return Kmatrix
  159. def _weisfeilerlehmankernel_do(G1, G2, height = 0):
  160. """Calculate Weisfeiler-Lehman kernels between 2 graphs. This kernel use shortest path kernel to calculate kernel between two graphs in each iteration.
  161. Parameters
  162. ----------
  163. G1, G2 : NetworkX graphs
  164. 2 graphs between which the kernel is calculated.
  165. Return
  166. ------
  167. Kernel : int
  168. Weisfeiler-Lehman Kernel between 2 graphs.
  169. """
  170. # init.
  171. kernel = 0 # init kernel
  172. num_nodes1 = G1.number_of_nodes()
  173. num_nodes2 = G2.number_of_nodes()
  174. # the first iteration.
  175. # labelset1 = { G1.nodes(data = True)[i]['label'] for i in range(num_nodes1) }
  176. # labelset2 = { G2.nodes(data = True)[i]['label'] for i in range(num_nodes2) }
  177. kernel += spkernel(G1, G2) # change your base kernel here (and one more below)
  178. for h in range(0, height + 1):
  179. # if labelset1 != labelset2:
  180. # break
  181. # Weisfeiler-Lehman test of graph isomorphism.
  182. relabel(G1)
  183. relabel(G2)
  184. # calculate kernel
  185. kernel += spkernel(G1, G2) # change your base kernel here (and one more before)
  186. # get label sets of both graphs
  187. # labelset1 = { G1.nodes(data = True)[i]['label'] for i in range(num_nodes1) }
  188. # labelset2 = { G2.nodes(data = True)[i]['label'] for i in range(num_nodes2) }
  189. return kernel
  190. def relabel(G):
  191. '''
  192. Relabel nodes in graph G in one iteration of the 1-dim. WL test of graph isomorphism.
  193. Parameters
  194. ----------
  195. G : NetworkX graph
  196. The graphs whose nodes are relabeled.
  197. '''
  198. # get the set of original labels
  199. labels_ori = list(nx.get_node_attributes(G, 'label').values())
  200. num_of_each_label = dict(Counter(labels_ori))
  201. num_of_labels = len(num_of_each_label)
  202. set_multisets = []
  203. for node in G.nodes(data = True):
  204. # Multiset-label determination.
  205. multiset = [ G.node[neighbors]['label'] for neighbors in G[node[0]] ]
  206. # sorting each multiset
  207. multiset.sort()
  208. multiset = node[1]['label'] + ''.join(multiset) # concatenate to a string and add the prefix
  209. set_multisets.append(multiset)
  210. # label compression
  211. # set_multisets.sort() # this is unnecessary
  212. set_unique = list(set(set_multisets)) # set of unique multiset labels
  213. set_compressed = { value : str(set_unique.index(value) + num_of_labels + 1) for value in set_unique } # assign new labels
  214. # relabel nodes
  215. # nx.relabel_nodes(G, set_compressed, copy = False)
  216. for node in G.nodes(data = True):
  217. node[1]['label'] = set_compressed[set_multisets[node[0]]]
  218. # get the set of compressed labels
  219. labels_comp = list(nx.get_node_attributes(G, 'label').values())
  220. num_of_each_label.update(dict(Counter(labels_comp)))

A Python package for graph kernels, graph edit distances and graph pre-image problem.