You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

weisfeiler_lehman.py 17 kB

5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Tue Apr 14 15:16:34 2020
  5. @author: ljia
  6. @references:
  7. [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM.
  8. Weisfeiler-lehman graph kernels. Journal of Machine Learning Research.
  9. 2011;12(Sep):2539-61.
  10. """
  11. import numpy as np
  12. import networkx as nx
  13. from collections import Counter
  14. from functools import partial
  15. from gklearn.utils import SpecialLabel
  16. from gklearn.utils.parallel import parallel_gm
  17. from gklearn.kernels import GraphKernel
  18. class WeisfeilerLehman(GraphKernel): # @todo: total parallelization and sp, edge user kernel.
  19. def __init__(self, **kwargs):
  20. GraphKernel.__init__(self)
  21. self._node_labels = kwargs.get('node_labels', [])
  22. self._edge_labels = kwargs.get('edge_labels', [])
  23. self._height = int(kwargs.get('height', 0))
  24. self._base_kernel = kwargs.get('base_kernel', 'subtree')
  25. self._ds_infos = kwargs.get('ds_infos', {})
  26. def _compute_gm_series(self):
  27. if self._verbose >= 2:
  28. import warnings
  29. warnings.warn('A part of the computation is parallelized.')
  30. self._add_dummy_node_labels(self._graphs)
  31. # for WL subtree kernel
  32. if self._base_kernel == 'subtree':
  33. gram_matrix = self._subtree_kernel_do(self._graphs)
  34. # for WL shortest path kernel
  35. elif self._base_kernel == 'sp':
  36. gram_matrix = self._sp_kernel_do(self._graphs)
  37. # for WL edge kernel
  38. elif self._base_kernel == 'edge':
  39. gram_matrix = self._edge_kernel_do(self._graphs)
  40. # for user defined base kernel
  41. else:
  42. gram_matrix = self._user_kernel_do(self._graphs)
  43. return gram_matrix
  44. def _compute_gm_imap_unordered(self):
  45. if self._verbose >= 2:
  46. import warnings
  47. warnings.warn('Only a part of the computation is parallelized due to the structure of this kernel.')
  48. return self._compute_gm_series()
  49. def _compute_kernel_list_series(self, g1, g_list): # @todo: this should be better.
  50. if self._verbose >= 2:
  51. import warnings
  52. warnings.warn('A part of the computation is parallelized.')
  53. self._add_dummy_node_labels(g_list + [g1])
  54. # for WL subtree kernel
  55. if self._base_kernel == 'subtree':
  56. gram_matrix = self._subtree_kernel_do(g_list + [g1])
  57. # for WL shortest path kernel
  58. elif self._base_kernel == 'sp':
  59. gram_matrix = self._sp_kernel_do(g_list + [g1])
  60. # for WL edge kernel
  61. elif self._base_kernel == 'edge':
  62. gram_matrix = self._edge_kernel_do(g_list + [g1])
  63. # for user defined base kernel
  64. else:
  65. gram_matrix = self._user_kernel_do(g_list + [g1])
  66. return list(gram_matrix[-1][0:-1])
  67. def _compute_kernel_list_imap_unordered(self, g1, g_list):
  68. if self._verbose >= 2:
  69. import warnings
  70. warnings.warn('Only a part of the computation is parallelized due to the structure of this kernel.')
  71. return self._compute_kernel_list_series(g1, g_list)
	def _wrapper_kernel_list_do(self, itr):
		# Unused placeholder: the kernel-list computation falls back to the
		# series path (see _compute_kernel_list_imap_unordered), so no
		# per-pair parallel worker is implemented here.
		pass
  74. def _compute_single_kernel_series(self, g1, g2): # @todo: this should be better.
  75. self._add_dummy_node_labels([g1] + [g2])
  76. # for WL subtree kernel
  77. if self._base_kernel == 'subtree':
  78. gram_matrix = self._subtree_kernel_do([g1] + [g2])
  79. # for WL shortest path kernel
  80. elif self._base_kernel == 'sp':
  81. gram_matrix = self._sp_kernel_do([g1] + [g2])
  82. # for WL edge kernel
  83. elif self._base_kernel == 'edge':
  84. gram_matrix = self._edge_kernel_do([g1] + [g2])
  85. # for user defined base kernel
  86. else:
  87. gram_matrix = self._user_kernel_do([g1] + [g2])
  88. return gram_matrix[0][1]
	def _subtree_kernel_do(self, Gn):
		"""Compute Weisfeiler-Lehman subtree kernels between graphs.

		Parameters
		----------
		Gn : List of NetworkX graph
			List of graphs between which the kernels are computed.

		Return
		------
		gram_matrix : Numpy matrix
			Kernel matrix, each element of which is the Weisfeiler-Lehman
			kernel between 2 praphs.

		Notes
		-----
		Mutates the graphs in ``Gn``: every node gets a 'label_tuple'
		attribute that is overwritten at each WL iteration.
		"""
		gram_matrix = np.zeros((len(Gn), len(Gn)))
		# initial for height = 0
		all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration
		# for each graph
		for G in Gn:
			# set all labels into a tuple.
			for nd, attrs in G.nodes(data=True): # @todo: there may be a better way.
				G.nodes[nd]['label_tuple'] = tuple(attrs[name] for name in self._node_labels)
			# get the set of original labels
			labels_ori = list(nx.get_node_attributes(G, 'label_tuple').values())
			# number of occurence of each label in G
			all_num_of_each_label.append(dict(Counter(labels_ori)))
		# Compute subtree kernel with the 0th iteration and add it to the final kernel.
		self._compute_gram_matrix(gram_matrix, all_num_of_each_label, Gn)
		# iterate each height
		for h in range(1, self._height + 1):
			all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
			num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
			# all_labels_ori = set() # all unique orignal labels in all graphs in this iteration
			all_num_of_each_label = [] # number of occurence of each label in G
			# @todo: parallel this part.
			for idx, G in enumerate(Gn):
				all_multisets = []
				for node, attrs in G.nodes(data=True):
					# Multiset-label determination.
					multiset = [G.nodes[neighbors]['label_tuple'] for neighbors in G[node]]
					# sorting each multiset
					multiset.sort()
					multiset = [attrs['label_tuple']] + multiset # add the prefix
					all_multisets.append(tuple(multiset))
				# label compression
				set_unique = list(set(all_multisets)) # set of unique multiset labels
				# a dictionary mapping original labels to new ones.
				set_compressed = {}
				# if a label occured before, assign its former compressed label,
				# else assign the number of labels occured + 1 as the compressed label.
				for value in set_unique:
					if value in all_set_compressed.keys():
						set_compressed.update({value: all_set_compressed[value]})
					else:
						set_compressed.update({value: str(num_of_labels_occured + 1)})
						num_of_labels_occured += 1
				all_set_compressed.update(set_compressed)
				# relabel nodes
				# NOTE(review): the inner `idx` shadows the outer loop index
				# (harmless, since enumerate reassigns it on the next outer
				# iteration), and this relies on G.nodes() iterating in the
				# same order as the data=True pass above — confirm for the
				# targeted networkx version.
				for idx, node in enumerate(G.nodes()):
					G.nodes[node]['label_tuple'] = set_compressed[all_multisets[idx]]
				# get the set of compressed labels
				labels_comp = list(nx.get_node_attributes(G, 'label_tuple').values())
				# all_labels_ori.update(labels_comp)
				all_num_of_each_label.append(dict(Counter(labels_comp)))
			# Compute subtree kernel with h iterations and add it to the final kernel
			self._compute_gram_matrix(gram_matrix, all_num_of_each_label, Gn)
		return gram_matrix
	def _compute_gram_matrix(self, gram_matrix, all_num_of_each_label, Gn):
		"""Compute Gram matrix using the base kernel.

		Adds the current WL iteration's contribution to ``gram_matrix``
		in place.

		Parameters
		----------
		gram_matrix : Numpy matrix
			Running Gram matrix, updated in place.
		all_num_of_each_label : list of dict
			Per-graph mapping from (compressed) node label to its number
			of occurrences in the current iteration.
		Gn : List of NetworkX graph
			The graphs; passed through to the parallel helper.
		"""
		if self._parallel == 'imap_unordered':
			# compute kernels.
			def init_worker(alllabels_toshare):
				# Share the label counts with worker processes via a
				# module-level global, read back in
				# _wrapper_compute_subtree_kernel.
				global G_alllabels
				G_alllabels = alllabels_toshare
			do_partial = partial(self._wrapper_compute_subtree_kernel, gram_matrix)
			parallel_gm(do_partial, gram_matrix, Gn, init_worker=init_worker,
						glbv=(all_num_of_each_label,), n_jobs=self._n_jobs, verbose=self._verbose)
		elif self._parallel is None:
			# Serial path: fill the upper triangle, then mirror each entry.
			for i in range(len(gram_matrix)):
				for j in range(i, len(gram_matrix)):
					gram_matrix[i][j] = self._compute_subtree_kernel(all_num_of_each_label[i],
						all_num_of_each_label[j], gram_matrix[i][j])
					gram_matrix[j][i] = gram_matrix[i][j]
		# NOTE(review): any other value of self._parallel silently leaves
		# gram_matrix untouched — confirm that is intended.
  170. def _compute_subtree_kernel(self, num_of_each_label1, num_of_each_label2, kernel):
  171. """Compute the subtree kernel.
  172. """
  173. labels = set(list(num_of_each_label1.keys()) + list(num_of_each_label2.keys()))
  174. vector1 = np.array([(num_of_each_label1[label]
  175. if (label in num_of_each_label1.keys()) else 0)
  176. for label in labels])
  177. vector2 = np.array([(num_of_each_label2[label]
  178. if (label in num_of_each_label2.keys()) else 0)
  179. for label in labels])
  180. kernel += np.dot(vector1, vector2)
  181. return kernel
	def _wrapper_compute_subtree_kernel(self, gram_matrix, itr):
		# Worker for the imap_unordered path: unpack the (i, j) index pair
		# and compute that single Gram-matrix entry. G_alllabels is the
		# per-graph label-count list shared with workers through the
		# init_worker global set up in _compute_gram_matrix.
		i = itr[0]
		j = itr[1]
		return i, j, self._compute_subtree_kernel(G_alllabels[i], G_alllabels[j], gram_matrix[i][j])
	def _wl_spkernel_do(Gn, node_label, edge_label, height):
		"""Compute Weisfeiler-Lehman shortest path kernels between graphs.

		Parameters
		----------
		Gn : List of NetworkX graph
			List of graphs between which the kernels are computed.
		node_label : string
			node attribute used as label.
		edge_label : string
			edge attribute used as label.
		height : int
			subtree height.

		Return
		------
		gram_matrix : Numpy matrix
			Kernel matrix, each element of which is the Weisfeiler-Lehman
			kernel between 2 praphs.

		Notes (review)
		--------------
		Legacy code, apparently not wired to the dispatch above (which
		calls ``self._sp_kernel_do``):
		- The signature lacks ``self``; called as a bound method, the
		  instance would be passed as ``Gn``.
		- The leading ``pass`` is a no-op — the body below still executes.
		- ``G.node[...]`` was removed in networkx 2.4; this fails on
		  modern networkx — confirm the targeted version.
		- ``set_multisets[node[0]]`` indexes a list by node id, which only
		  works when nodes are numbered 0..n-1 in insertion order.
		"""
		pass
		from gklearn.utils.utils import getSPGraph
		# init.
		height = int(height)
		gram_matrix = np.zeros((len(Gn), len(Gn))) # init kernel
		Gn = [ getSPGraph(G, edge_weight = edge_label) for G in Gn ] # get shortest path graphs of Gn
		# initial for height = 0
		for i in range(0, len(Gn)):
			for j in range(i, len(Gn)):
				for e1 in Gn[i].edges(data = True):
					for e2 in Gn[j].edges(data = True):
						# Count matching non-zero-cost edges between the two
						# shortest-path graphs (endpoints in either order).
						if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
							gram_matrix[i][j] += 1
				gram_matrix[j][i] = gram_matrix[i][j]
		# iterate each height
		for h in range(1, height + 1):
			all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
			num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
			for G in Gn: # for each graph
				set_multisets = []
				for node in G.nodes(data = True):
					# Multiset-label determination.
					multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ]
					# sorting each multiset
					multiset.sort()
					multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
					set_multisets.append(multiset)
				# label compression
				set_unique = list(set(set_multisets)) # set of unique multiset labels
				# a dictionary mapping original labels to new ones.
				set_compressed = {}
				# if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label
				for value in set_unique:
					if value in all_set_compressed.keys():
						set_compressed.update({ value : all_set_compressed[value] })
					else:
						set_compressed.update({ value : str(num_of_labels_occured + 1) })
						num_of_labels_occured += 1
				all_set_compressed.update(set_compressed)
				# relabel nodes
				for node in G.nodes(data = True):
					node[1][node_label] = set_compressed[set_multisets[node[0]]]
			# Compute subtree kernel with h iterations and add it to the final kernel
			for i in range(0, len(Gn)):
				for j in range(i, len(Gn)):
					for e1 in Gn[i].edges(data = True):
						for e2 in Gn[j].edges(data = True):
							if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
								gram_matrix[i][j] += 1
					gram_matrix[j][i] = gram_matrix[i][j]
		return gram_matrix
	def _wl_edgekernel_do(Gn, node_label, edge_label, height):
		"""Compute Weisfeiler-Lehman edge kernels between graphs.

		Parameters
		----------
		Gn : List of NetworkX graph
			List of graphs between which the kernels are computed.
		node_label : string
			node attribute used as label.
		edge_label : string
			edge attribute used as label.
		height : int
			subtree height.

		Return
		------
		gram_matrix : Numpy matrix
			Kernel matrix, each element of which is the Weisfeiler-Lehman
			kernel between 2 praphs.

		Notes (review)
		--------------
		Legacy code with the same issues as ``_wl_spkernel_do``: missing
		``self``, a no-op leading ``pass``, ``G.node`` (removed in
		networkx 2.4), and list indexing by node id in the relabel step.
		"""
		pass
		# init.
		height = int(height)
		gram_matrix = np.zeros((len(Gn), len(Gn))) # init kernel
		# initial for height = 0
		for i in range(0, len(Gn)):
			for j in range(i, len(Gn)):
				for e1 in Gn[i].edges(data = True):
					for e2 in Gn[j].edges(data = True):
						# Count edges with equal labels and matching
						# endpoints (in either order).
						if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
							gram_matrix[i][j] += 1
				gram_matrix[j][i] = gram_matrix[i][j]
		# iterate each height
		for h in range(1, height + 1):
			all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
			num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
			for G in Gn: # for each graph
				set_multisets = []
				for node in G.nodes(data = True):
					# Multiset-label determination.
					multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ]
					# sorting each multiset
					multiset.sort()
					multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
					set_multisets.append(multiset)
				# label compression
				set_unique = list(set(set_multisets)) # set of unique multiset labels
				# a dictionary mapping original labels to new ones.
				set_compressed = {}
				# if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label
				for value in set_unique:
					if value in all_set_compressed.keys():
						set_compressed.update({ value : all_set_compressed[value] })
					else:
						set_compressed.update({ value : str(num_of_labels_occured + 1) })
						num_of_labels_occured += 1
				all_set_compressed.update(set_compressed)
				# relabel nodes
				for node in G.nodes(data = True):
					node[1][node_label] = set_compressed[set_multisets[node[0]]]
			# Compute subtree kernel with h iterations and add it to the final kernel
			for i in range(0, len(Gn)):
				for j in range(i, len(Gn)):
					for e1 in Gn[i].edges(data = True):
						for e2 in Gn[j].edges(data = True):
							if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
								gram_matrix[i][j] += 1
					gram_matrix[j][i] = gram_matrix[i][j]
		return gram_matrix
	def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel):
		"""Compute Weisfeiler-Lehman kernels based on user-defined kernel between graphs.

		Parameters
		----------
		Gn : List of NetworkX graph
			List of graphs between which the kernels are computed.
		node_label : string
			node attribute used as label.
		edge_label : string
			edge attribute used as label.
		height : int
			subtree height.
		base_kernel : string
			Name of the base kernel function used in each iteration of WL kernel. This function returns a Numpy matrix, each element of which is the user-defined Weisfeiler-Lehman kernel between 2 praphs.

		Return
		------
		gram_matrix : Numpy matrix
			Kernel matrix, each element of which is the Weisfeiler-Lehman
			kernel between 2 praphs.

		Notes (review)
		--------------
		Legacy code with the same issues as ``_wl_spkernel_do`` (missing
		``self``, no-op ``pass``, ``G.node``, list indexing by node id).
		Also, ``base_kernel`` is documented as a string but is called as a
		function, and the ``np.zeros`` init below is immediately
		overwritten by the first ``base_kernel`` call.
		"""
		pass
		# init.
		height = int(height)
		gram_matrix = np.zeros((len(Gn), len(Gn))) # init kernel
		# initial for height = 0
		gram_matrix = base_kernel(Gn, node_label, edge_label)
		# iterate each height
		for h in range(1, height + 1):
			all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
			num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
			for G in Gn: # for each graph
				set_multisets = []
				for node in G.nodes(data = True):
					# Multiset-label determination.
					multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ]
					# sorting each multiset
					multiset.sort()
					multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
					set_multisets.append(multiset)
				# label compression
				set_unique = list(set(set_multisets)) # set of unique multiset labels
				# a dictionary mapping original labels to new ones.
				set_compressed = {}
				# if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label
				for value in set_unique:
					if value in all_set_compressed.keys():
						set_compressed.update({ value : all_set_compressed[value] })
					else:
						set_compressed.update({ value : str(num_of_labels_occured + 1) })
						num_of_labels_occured += 1
				all_set_compressed.update(set_compressed)
				# relabel nodes
				for node in G.nodes(data = True):
					node[1][node_label] = set_compressed[set_multisets[node[0]]]
			# Compute kernel with h iterations and add it to the final kernel
			gram_matrix += base_kernel(Gn, node_label, edge_label)
		return gram_matrix
  376. def _add_dummy_node_labels(self, Gn):
  377. if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY):
  378. for i in range(len(Gn)):
  379. nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY)
  380. self._node_labels = [SpecialLabel.DUMMY]
  381. class WLSubtree(WeisfeilerLehman):
  382. def __init__(self, **kwargs):
  383. kwargs['base_kernel'] = 'subtree'
  384. super().__init__(**kwargs)

A Python package for graph kernels, graph edit distances and the graph pre-image problem.