
weisfeilerLehmanKernel.py

import sys
import time
from collections import Counter

sys.path.insert(0, "../")
import networkx as nx
import numpy as np

from pygraph.kernels.spkernel import spkernel
from pygraph.kernels.pathKernel import pathkernel
def weisfeilerlehmankernel(*args, height = 0, base_kernel = 'subtree'):
    """Calculate Weisfeiler-Lehman kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    height : int
        Subtree height.
    base_kernel : string
        Base kernel used in each iteration of the WL kernel. The default base kernel is the subtree kernel.

    Return
    ------
    Kmatrix/kernel : Numpy matrix/float
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs, or the Weisfeiler-Lehman kernel between 2 graphs.

    Notes
    -----
    This function currently supports the WL subtree kernel and the WL shortest path kernel.

    References
    ----------
    [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM. Weisfeiler-Lehman graph kernels. Journal of Machine Learning Research. 2011;12(Sep):2539-61.
    """
    if len(args) == 1:  # for a list of graphs
        start_time = time.time()

        # for WL subtree kernel
        if base_kernel == 'subtree':
            Kmatrix = _wl_subtreekernel_do(args[0], height = height, base_kernel = 'subtree')

        # for WL edge kernel
        elif base_kernel == 'edge':
            raise NotImplementedError('The WL edge kernel is not implemented yet.')

        # for WL shortest path kernel
        elif base_kernel == 'sp':
            Gn = args[0]
            Kmatrix = np.zeros((len(Gn), len(Gn)))
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    Kmatrix[i][j] = _weisfeilerlehmankernel_do(Gn[i], Gn[j])
                    Kmatrix[j][i] = Kmatrix[i][j]

        print("\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---" % (base_kernel, len(args[0]), (time.time() - start_time)))

        return Kmatrix

    else:  # for only 2 graphs
        start_time = time.time()

        # for WL subtree kernel
        if base_kernel == 'subtree':
            kernel = _wl_subtreekernel_do([args[0], args[1]], height = height, base_kernel = 'subtree')

        # for WL edge kernel
        elif base_kernel == 'edge':
            raise NotImplementedError('The WL edge kernel is not implemented yet.')

        # for WL shortest path kernel
        elif base_kernel == 'sp':
            kernel = _weisfeilerlehmankernel_do(args[0], args[1])

        print("\n --- Weisfeiler-Lehman %s kernel built in %s seconds ---" % (base_kernel, time.time() - start_time))

        return kernel
def _wl_subtreekernel_do(*args, height = 0, base_kernel = 'subtree'):
    """Calculate Weisfeiler-Lehman subtree kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
    """
    Gn = args[0]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    all_num_of_labels_occured = 0  # number of distinct labels that have occurred as node labels at least once over all graphs

    # iterate each height
    for h in range(height + 1):
        all_labels_ori = set()  # all unique original labels in all graphs in this iteration
        all_num_of_each_label = []  # number of occurrences of each label in each graph in this iteration
        all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
        num_of_labels_occured = all_num_of_labels_occured  # number of distinct labels that have occurred as node labels at least once over all graphs

        # for each graph
        for idx, G in enumerate(Gn):
            # get the set of original labels
            labels_ori = list(nx.get_node_attributes(G, 'label').values())
            num_of_each_label = dict(Counter(labels_ori))  # number of occurrences of each label in this graph
            num_of_labels = len(num_of_each_label)  # number of all unique labels
            all_labels_ori.update(labels_ori)
            # num_of_labels_occured += num_of_labels  # @todo: not precise
            num_of_labels_occured = all_num_of_labels_occured + len(all_labels_ori) + len(all_set_compressed)

            # multiset-label determination: for each node, sort the labels of its
            # neighbours and prepend the node's own label
            all_multisets = {}
            for node in G.nodes(data = True):
                multiset = [ G.nodes[neighbor]['label'] for neighbor in G[node[0]] ]
                multiset.sort()
                all_multisets[node[0]] = node[1]['label'] + ''.join(multiset)  # concatenate to a string and add the prefix

            # label compression
            set_unique = list(set(all_multisets.values()))  # set of unique multiset labels
            # a dictionary mapping original labels to new ones
            set_compressed = {}
            # if a label occurred before, assign its former compressed label; else assign (number of labels occurred + 1) as the compressed label
            for value in set_unique:
                if value in all_set_compressed.keys():
                    set_compressed.update({ value : all_set_compressed[value] })
                else:
                    set_compressed.update({ value : str(num_of_labels_occured + 1) })
                    num_of_labels_occured += 1

            all_set_compressed.update(set_compressed)
            # num_of_labels_occured += len(set_compressed)  # @todo: not precise

            # relabel nodes (in place; node label attributes are overwritten)
            for node in G.nodes(data = True):
                node[1]['label'] = set_compressed[all_multisets[node[0]]]

            # get the set of compressed labels
            labels_comp = list(nx.get_node_attributes(G, 'label').values())
            num_of_each_label.update(dict(Counter(labels_comp)))
            all_num_of_each_label.append(num_of_each_label)

        # calculate subtree kernel with h iterations and add it to the final kernel
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys()))
                vector1 = np.array([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ])
                vector2 = np.array([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ])
                Kmatrix[i][j] += np.dot(vector1, vector2)
                Kmatrix[j][i] = Kmatrix[i][j]

        all_num_of_labels_occured += len(all_labels_ori)

    return Kmatrix
def _weisfeilerlehmankernel_do(G1, G2):
    """Calculate the Weisfeiler-Lehman kernel between 2 graphs, using a path kernel (pathkernel) as the base kernel in each iteration.

    Parameters
    ----------
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.

    Return
    ------
    kernel : float
        Weisfeiler-Lehman kernel between the 2 graphs.
    """
    # init.
    kernel = 0  # init kernel
    num_nodes1 = G1.number_of_nodes()
    num_nodes2 = G2.number_of_nodes()
    height = 12  # @todo: how to determine the upper bound of the height? e.g. min(num_nodes1, num_nodes2)

    # the first iteration.
    # labelset1 = { G1.nodes(data = True)[i]['label'] for i in range(num_nodes1) }
    # labelset2 = { G2.nodes(data = True)[i]['label'] for i in range(num_nodes2) }
    kernel += pathkernel(G1, G2)  # change your base kernel here (and one more below)

    for h in range(0, height):
        # if labelset1 != labelset2:
        #     break

        # Weisfeiler-Lehman test of graph isomorphism.
        relabel(G1)
        relabel(G2)

        # calculate kernel
        kernel += pathkernel(G1, G2)  # change your base kernel here (and one more above)

        # get label sets of both graphs
        # labelset1 = { G1.nodes(data = True)[i]['label'] for i in range(num_nodes1) }
        # labelset2 = { G2.nodes(data = True)[i]['label'] for i in range(num_nodes2) }

    return kernel
def relabel(G):
    '''
    Relabel the nodes of graph G in one iteration of the 1-dimensional WL test of graph isomorphism.

    Parameters
    ----------
    G : NetworkX graph
        The graph whose nodes are relabeled (in place).
    '''
    # get the set of original labels
    labels_ori = list(nx.get_node_attributes(G, 'label').values())
    num_of_each_label = dict(Counter(labels_ori))
    num_of_labels = len(num_of_each_label)

    # multiset-label determination: for each node, sort the labels of its
    # neighbours and prepend the node's own label
    all_multisets = {}
    for node in G.nodes(data = True):
        multiset = [ G.nodes[neighbor]['label'] for neighbor in G[node[0]] ]
        multiset.sort()
        all_multisets[node[0]] = node[1]['label'] + ''.join(multiset)  # concatenate to a string and add the prefix

    # label compression: assign a new label to each unique multiset label
    set_unique = list(set(all_multisets.values()))  # set of unique multiset labels
    set_compressed = { value : str(set_unique.index(value) + num_of_labels + 1) for value in set_unique }

    # relabel nodes (in place; node label attributes are overwritten)
    for node in G.nodes(data = True):
        node[1]['label'] = set_compressed[all_multisets[node[0]]]

    # get the set of compressed labels
    labels_comp = list(nx.get_node_attributes(G, 'label').values())
    num_of_each_label.update(dict(Counter(labels_comp)))

A Python package for graph kernels, graph edit distances and graph pre-image problem.
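A minimal usage sketch, not part of the package itself: it assumes the module is importable as pygraph.kernels.weisfeilerLehmanKernel (inferred from the file name and the sibling imports above) and that every node carries a string 'label' attribute, which is what the functions read via nx.get_node_attributes(G, 'label'). The graphs here are hypothetical example data. Note that the WL relabeling overwrites node labels in place, so pass copies if the original labels are needed afterwards.

import networkx as nx
from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel

# two small graphs with string node labels (hypothetical example data)
G1 = nx.Graph()
G1.add_nodes_from([(0, {'label': 'C'}), (1, {'label': 'O'}), (2, {'label': 'C'})])
G1.add_edges_from([(0, 1), (1, 2)])

G2 = nx.Graph()
G2.add_nodes_from([(0, {'label': 'C'}), (1, {'label': 'C'}), (2, {'label': 'O'})])
G2.add_edges_from([(0, 1), (1, 2), (2, 0)])

# WL subtree kernel matrix over a list of graphs, using 2 WL iterations
Kmatrix = weisfeilerlehmankernel([G1, G2], height = 2, base_kernel = 'subtree')
print(Kmatrix)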