You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

weisfeilerLehmanKernel.py 18 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403
  1. """
  2. @author: linlin
  3. @references:
  4. [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM. Weisfeiler-lehman graph kernels. Journal of Machine Learning Research. 2011;12(Sep):2539-61.
  5. """
  6. import sys
  7. import pathlib
  8. from collections import Counter
  9. sys.path.insert(0, "../")
  10. import networkx as nx
  11. import numpy as np
  12. import time
  13. from pygraph.kernels.pathKernel import pathkernel
  14. def weisfeilerlehmankernel(*args, node_label = 'atom', edge_label = 'bond_type', height = 0, base_kernel = 'subtree'):
  15. """Calculate Weisfeiler-Lehman kernels between graphs.
  16. Parameters
  17. ----------
  18. Gn : List of NetworkX graph
  19. List of graphs between which the kernels are calculated.
  20. /
  21. G1, G2 : NetworkX graphs
  22. 2 graphs between which the kernel is calculated.
  23. node_label : string
  24. node attribute used as label. The default node label is atom.
  25. edge_label : string
  26. edge attribute used as label. The default edge label is bond_type.
  27. height : int
  28. subtree height
  29. base_kernel : string
  30. base kernel used in each iteration of WL kernel. The default base kernel is subtree kernel. For user-defined kernel, base_kernel is the name of the base kernel function used in each iteration of WL kernel. This function returns a Numpy matrix, each element of which is the user-defined Weisfeiler-Lehman kernel between 2 praphs.
  31. Return
  32. ------
  33. Kmatrix : Numpy matrix
  34. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
  35. Notes
  36. -----
  37. This function now supports WL subtree kernel, WL shortest path kernel and WL edge kernel.
  38. """
  39. base_kernel = base_kernel.lower()
  40. Gn = args[0] if len(args) == 1 else [args[0], args[1]] # arrange all graphs in a list
  41. Kmatrix = np.zeros((len(Gn), len(Gn)))
  42. start_time = time.time()
  43. # for WL subtree kernel
  44. if base_kernel == 'subtree':
  45. Kmatrix = _wl_subtreekernel_do(args[0], node_label, edge_label, height)
  46. # for WL shortest path kernel
  47. elif base_kernel == 'sp':
  48. Kmatrix = _wl_spkernel_do(args[0], node_label, edge_label, height)
  49. # for WL edge kernel
  50. elif base_kernel == 'edge':
  51. Kmatrix = _wl_edgekernel_do(args[0], node_label, edge_label, height)
  52. # for user defined base kernel
  53. else:
  54. Kmatrix = _wl_userkernel_do(args[0], node_label, edge_label, height, base_kernel)
  55. run_time = time.time() - start_time
  56. print("\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---" % (base_kernel, len(args[0]), run_time))
  57. return Kmatrix, run_time
  58. def _wl_subtreekernel_do(Gn, node_label, edge_label, height):
  59. """Calculate Weisfeiler-Lehman subtree kernels between graphs.
  60. Parameters
  61. ----------
  62. Gn : List of NetworkX graph
  63. List of graphs between which the kernels are calculated.
  64. node_label : string
  65. node attribute used as label.
  66. edge_label : string
  67. edge attribute used as label.
  68. height : int
  69. subtree height.
  70. Return
  71. ------
  72. Kmatrix : Numpy matrix
  73. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
  74. """
  75. height = int(height)
  76. Kmatrix = np.zeros((len(Gn), len(Gn)))
  77. all_num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
  78. # initial for height = 0
  79. all_labels_ori = set() # all unique orignal labels in all graphs in this iteration
  80. all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration
  81. all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
  82. num_of_labels_occured = all_num_of_labels_occured # number of the set of letters that occur before as node labels at least once in all graphs
  83. # for each graph
  84. for G in Gn:
  85. # get the set of original labels
  86. labels_ori = list(nx.get_node_attributes(G, node_label).values())
  87. all_labels_ori.update(labels_ori)
  88. num_of_each_label = dict(Counter(labels_ori)) # number of occurence of each label in graph
  89. all_num_of_each_label.append(num_of_each_label)
  90. num_of_labels = len(num_of_each_label) # number of all unique labels
  91. all_labels_ori.update(labels_ori)
  92. all_num_of_labels_occured += len(all_labels_ori)
  93. # calculate subtree kernel with the 0th iteration and add it to the final kernel
  94. for i in range(0, len(Gn)):
  95. for j in range(i, len(Gn)):
  96. labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys()))
  97. vector1 = np.matrix([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ])
  98. vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ])
  99. Kmatrix[i][j] += np.dot(vector1, vector2.transpose())
  100. Kmatrix[j][i] = Kmatrix[i][j]
  101. # iterate each height
  102. for h in range(1, height + 1):
  103. all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
  104. num_of_labels_occured = all_num_of_labels_occured # number of the set of letters that occur before as node labels at least once in all graphs
  105. all_labels_ori = set()
  106. all_num_of_each_label = []
  107. # for each graph
  108. for idx, G in enumerate(Gn):
  109. set_multisets = []
  110. for node in G.nodes(data = True):
  111. # Multiset-label determination.
  112. multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ]
  113. # sorting each multiset
  114. multiset.sort()
  115. multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
  116. set_multisets.append(multiset)
  117. # label compression
  118. set_unique = list(set(set_multisets)) # set of unique multiset labels
  119. # a dictionary mapping original labels to new ones.
  120. set_compressed = {}
  121. # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label
  122. for value in set_unique:
  123. if value in all_set_compressed.keys():
  124. set_compressed.update({ value : all_set_compressed[value] })
  125. else:
  126. set_compressed.update({ value : str(num_of_labels_occured + 1) })
  127. num_of_labels_occured += 1
  128. all_set_compressed.update(set_compressed)
  129. # relabel nodes
  130. for node in G.nodes(data = True):
  131. node[1][node_label] = set_compressed[set_multisets[node[0]]]
  132. # get the set of compressed labels
  133. labels_comp = list(nx.get_node_attributes(G, node_label).values())
  134. all_labels_ori.update(labels_comp)
  135. num_of_each_label = dict(Counter(labels_comp))
  136. all_num_of_each_label.append(num_of_each_label)
  137. all_num_of_labels_occured += len(all_labels_ori)
  138. # calculate subtree kernel with h iterations and add it to the final kernel
  139. for i in range(0, len(Gn)):
  140. for j in range(i, len(Gn)):
  141. labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys()))
  142. vector1 = np.matrix([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ])
  143. vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ])
  144. Kmatrix[i][j] += np.dot(vector1, vector2.transpose())
  145. Kmatrix[j][i] = Kmatrix[i][j]
  146. return Kmatrix
  147. def _wl_spkernel_do(Gn, node_label, edge_label, height):
  148. """Calculate Weisfeiler-Lehman shortest path kernels between graphs.
  149. Parameters
  150. ----------
  151. Gn : List of NetworkX graph
  152. List of graphs between which the kernels are calculated.
  153. node_label : string
  154. node attribute used as label.
  155. edge_label : string
  156. edge attribute used as label.
  157. height : int
  158. subtree height.
  159. Return
  160. ------
  161. Kmatrix : Numpy matrix
  162. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
  163. """
  164. from pygraph.utils.utils import getSPGraph
  165. # init.
  166. height = int(height)
  167. Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel
  168. Gn = [ getSPGraph(G, edge_weight = edge_label) for G in Gn ] # get shortest path graphs of Gn
  169. # initial for height = 0
  170. for i in range(0, len(Gn)):
  171. for j in range(i, len(Gn)):
  172. for e1 in Gn[i].edges(data = True):
  173. for e2 in Gn[j].edges(data = True):
  174. if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
  175. Kmatrix[i][j] += 1
  176. Kmatrix[j][i] = Kmatrix[i][j]
  177. # iterate each height
  178. for h in range(1, height + 1):
  179. all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
  180. num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
  181. for G in Gn: # for each graph
  182. set_multisets = []
  183. for node in G.nodes(data = True):
  184. # Multiset-label determination.
  185. multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ]
  186. # sorting each multiset
  187. multiset.sort()
  188. multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
  189. set_multisets.append(multiset)
  190. # label compression
  191. set_unique = list(set(set_multisets)) # set of unique multiset labels
  192. # a dictionary mapping original labels to new ones.
  193. set_compressed = {}
  194. # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label
  195. for value in set_unique:
  196. if value in all_set_compressed.keys():
  197. set_compressed.update({ value : all_set_compressed[value] })
  198. else:
  199. set_compressed.update({ value : str(num_of_labels_occured + 1) })
  200. num_of_labels_occured += 1
  201. all_set_compressed.update(set_compressed)
  202. # relabel nodes
  203. for node in G.nodes(data = True):
  204. node[1][node_label] = set_compressed[set_multisets[node[0]]]
  205. # calculate subtree kernel with h iterations and add it to the final kernel
  206. for i in range(0, len(Gn)):
  207. for j in range(i, len(Gn)):
  208. for e1 in Gn[i].edges(data = True):
  209. for e2 in Gn[j].edges(data = True):
  210. if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
  211. Kmatrix[i][j] += 1
  212. Kmatrix[j][i] = Kmatrix[i][j]
  213. return Kmatrix
  214. def _wl_edgekernel_do(Gn, node_label, edge_label, height):
  215. """Calculate Weisfeiler-Lehman edge kernels between graphs.
  216. Parameters
  217. ----------
  218. Gn : List of NetworkX graph
  219. List of graphs between which the kernels are calculated.
  220. node_label : string
  221. node attribute used as label.
  222. edge_label : string
  223. edge attribute used as label.
  224. height : int
  225. subtree height.
  226. Return
  227. ------
  228. Kmatrix : Numpy matrix
  229. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
  230. """
  231. # init.
  232. height = int(height)
  233. Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel
  234. # initial for height = 0
  235. for i in range(0, len(Gn)):
  236. for j in range(i, len(Gn)):
  237. for e1 in Gn[i].edges(data = True):
  238. for e2 in Gn[j].edges(data = True):
  239. if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
  240. Kmatrix[i][j] += 1
  241. Kmatrix[j][i] = Kmatrix[i][j]
  242. # iterate each height
  243. for h in range(1, height + 1):
  244. all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
  245. num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
  246. for G in Gn: # for each graph
  247. set_multisets = []
  248. for node in G.nodes(data = True):
  249. # Multiset-label determination.
  250. multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ]
  251. # sorting each multiset
  252. multiset.sort()
  253. multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
  254. set_multisets.append(multiset)
  255. # label compression
  256. set_unique = list(set(set_multisets)) # set of unique multiset labels
  257. # a dictionary mapping original labels to new ones.
  258. set_compressed = {}
  259. # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label
  260. for value in set_unique:
  261. if value in all_set_compressed.keys():
  262. set_compressed.update({ value : all_set_compressed[value] })
  263. else:
  264. set_compressed.update({ value : str(num_of_labels_occured + 1) })
  265. num_of_labels_occured += 1
  266. all_set_compressed.update(set_compressed)
  267. # relabel nodes
  268. for node in G.nodes(data = True):
  269. node[1][node_label] = set_compressed[set_multisets[node[0]]]
  270. # calculate subtree kernel with h iterations and add it to the final kernel
  271. for i in range(0, len(Gn)):
  272. for j in range(i, len(Gn)):
  273. for e1 in Gn[i].edges(data = True):
  274. for e2 in Gn[j].edges(data = True):
  275. if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
  276. Kmatrix[i][j] += 1
  277. Kmatrix[j][i] = Kmatrix[i][j]
  278. return Kmatrix
  279. def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel):
  280. """Calculate Weisfeiler-Lehman kernels based on user-defined kernel between graphs.
  281. Parameters
  282. ----------
  283. Gn : List of NetworkX graph
  284. List of graphs between which the kernels are calculated.
  285. node_label : string
  286. node attribute used as label.
  287. edge_label : string
  288. edge attribute used as label.
  289. height : int
  290. subtree height.
  291. base_kernel : string
  292. Name of the base kernel function used in each iteration of WL kernel. This function returns a Numpy matrix, each element of which is the user-defined Weisfeiler-Lehman kernel between 2 praphs.
  293. Return
  294. ------
  295. Kmatrix : Numpy matrix
  296. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
  297. """
  298. # init.
  299. height = int(height)
  300. Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel
  301. # initial for height = 0
  302. Kmatrix = base_kernel(Gn, node_label, edge_label)
  303. # iterate each height
  304. for h in range(1, height + 1):
  305. all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
  306. num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
  307. for G in Gn: # for each graph
  308. set_multisets = []
  309. for node in G.nodes(data = True):
  310. # Multiset-label determination.
  311. multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ]
  312. # sorting each multiset
  313. multiset.sort()
  314. multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
  315. set_multisets.append(multiset)
  316. # label compression
  317. set_unique = list(set(set_multisets)) # set of unique multiset labels
  318. # a dictionary mapping original labels to new ones.
  319. set_compressed = {}
  320. # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label
  321. for value in set_unique:
  322. if value in all_set_compressed.keys():
  323. set_compressed.update({ value : all_set_compressed[value] })
  324. else:
  325. set_compressed.update({ value : str(num_of_labels_occured + 1) })
  326. num_of_labels_occured += 1
  327. all_set_compressed.update(set_compressed)
  328. # relabel nodes
  329. for node in G.nodes(data = True):
  330. node[1][node_label] = set_compressed[set_multisets[node[0]]]
  331. # calculate kernel with h iterations and add it to the final kernel
  332. Kmatrix += base_kernel(Gn, node_label, edge_label)
  333. return Kmatrix

A Python package for graph kernels, graph edit distances and the graph pre-image problem.