
weisfeiler_lehman.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 14 15:16:34 2020

@author: ljia

@references:

    [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM.
    Weisfeiler-Lehman graph kernels. Journal of Machine Learning Research.
    2011;12(Sep):2539-61.
"""
import numpy as np
import networkx as nx
from collections import Counter
from functools import partial
from gklearn.utils.parallel import parallel_gm
from gklearn.kernels import GraphKernel


class WeisfeilerLehman(GraphKernel):  # @todo: total parallelization and sp, edge and user kernels.

    def __init__(self, **kwargs):
        GraphKernel.__init__(self)
        self.__node_labels = kwargs.get('node_labels', [])
        self.__edge_labels = kwargs.get('edge_labels', [])
        self.__height = int(kwargs.get('height', 0))
        self.__base_kernel = kwargs.get('base_kernel', 'subtree')
        self.__ds_infos = kwargs.get('ds_infos', {})

    def _compute_gm_series(self):
        if self._verbose >= 2:
            import warnings
            warnings.warn('A part of the computation is parallelized.')

        self.__add_dummy_node_labels(self._graphs)

        # for WL subtree kernel
        if self.__base_kernel == 'subtree':
            gram_matrix = self.__subtree_kernel_do(self._graphs)

        # for WL shortest path kernel
        # (@todo: the 'sp', 'edge' and user-defined branches call methods that
        # do not exist yet; only the legacy functions _wl_spkernel_do,
        # _wl_edgekernel_do and _wl_userkernel_do are provided below.)
        elif self.__base_kernel == 'sp':
            gram_matrix = self.__sp_kernel_do(self._graphs)

        # for WL edge kernel
        elif self.__base_kernel == 'edge':
            gram_matrix = self.__edge_kernel_do(self._graphs)

        # for user defined base kernel
        else:
            gram_matrix = self.__user_kernel_do(self._graphs)

        return gram_matrix

    def _compute_gm_imap_unordered(self):
        if self._verbose >= 2:
            import warnings
            warnings.warn('Only a part of the computation is parallelized due to the structure of this kernel.')
        return self._compute_gm_series()

    def _compute_kernel_list_series(self, g1, g_list):  # @todo: this should be better.
        if self._verbose >= 2:
            import warnings
            warnings.warn('A part of the computation is parallelized.')

        self.__add_dummy_node_labels(g_list + [g1])

        # for WL subtree kernel
        if self.__base_kernel == 'subtree':
            gram_matrix = self.__subtree_kernel_do(g_list + [g1])

        # for WL shortest path kernel
        elif self.__base_kernel == 'sp':
            gram_matrix = self.__sp_kernel_do(g_list + [g1])

        # for WL edge kernel
        elif self.__base_kernel == 'edge':
            gram_matrix = self.__edge_kernel_do(g_list + [g1])

        # for user defined base kernel
        else:
            gram_matrix = self.__user_kernel_do(g_list + [g1])

        return list(gram_matrix[-1][0:-1])

    def _compute_kernel_list_imap_unordered(self, g1, g_list):
        if self._verbose >= 2:
            import warnings
            warnings.warn('Only a part of the computation is parallelized due to the structure of this kernel.')
        return self._compute_kernel_list_series(g1, g_list)

    def _wrapper_kernel_list_do(self, itr):
        pass

    def _compute_single_kernel_series(self, g1, g2):  # @todo: this should be better.
        self.__add_dummy_node_labels([g1] + [g2])

        # for WL subtree kernel
        if self.__base_kernel == 'subtree':
            gram_matrix = self.__subtree_kernel_do([g1] + [g2])

        # for WL shortest path kernel
        elif self.__base_kernel == 'sp':
            gram_matrix = self.__sp_kernel_do([g1] + [g2])

        # for WL edge kernel
        elif self.__base_kernel == 'edge':
            gram_matrix = self.__edge_kernel_do([g1] + [g2])

        # for user defined base kernel
        else:
            gram_matrix = self.__user_kernel_do([g1] + [g2])

        return gram_matrix[0][1]

    def __subtree_kernel_do(self, Gn):
        """Calculate Weisfeiler-Lehman kernels between graphs.

        Parameters
        ----------
        Gn : List of NetworkX graph
            List of graphs between which the kernels are calculated.

        Returns
        -------
        gram_matrix : Numpy matrix
            Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
        """
        gram_matrix = np.zeros((len(Gn), len(Gn)))

        # initial for height = 0
        all_num_of_each_label = []  # number of occurrences of each label in each graph in this iteration

        # for each graph
        for G in Gn:
            # set all labels into a tuple.
            for nd, attrs in G.nodes(data=True):  # @todo: there may be a better way.
                G.nodes[nd]['label_tuple'] = tuple(attrs[name] for name in self.__node_labels)
            # get the set of original labels
            labels_ori = list(nx.get_node_attributes(G, 'label_tuple').values())
            # number of occurrences of each label in G
            all_num_of_each_label.append(dict(Counter(labels_ori)))

        # calculate subtree kernel with the 0th iteration and add it to the final kernel.
        self.__compute_gram_matrix(gram_matrix, all_num_of_each_label, Gn)

        # iterate each height
        for h in range(1, self.__height + 1):
            all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
            num_of_labels_occured = 0  # number of distinct compressed labels seen so far across all graphs
            # all_labels_ori = set()  # all unique original labels in all graphs in this iteration
            all_num_of_each_label = []  # number of occurrences of each label in G

            # @todo: parallel this part.
            for G in Gn:
                all_multisets = []
                for node, attrs in G.nodes(data=True):
                    # Multiset-label determination.
                    multiset = [G.nodes[neighbor]['label_tuple'] for neighbor in G[node]]
                    # sorting each multiset
                    multiset.sort()
                    multiset = [attrs['label_tuple']] + multiset  # add the prefix
                    all_multisets.append(tuple(multiset))

                # label compression
                set_unique = list(set(all_multisets))  # set of unique multiset labels
                # a dictionary mapping original labels to new ones.
                set_compressed = {}
                # if a label occurred before, assign its former compressed label;
                # else assign the number of labels occurred + 1 as the compressed label.
                for value in set_unique:
                    if value in all_set_compressed.keys():
                        set_compressed.update({value: all_set_compressed[value]})
                    else:
                        set_compressed.update({value: str(num_of_labels_occured + 1)})
                        num_of_labels_occured += 1

                all_set_compressed.update(set_compressed)

                # relabel nodes
                for idx, node in enumerate(G.nodes()):
                    G.nodes[node]['label_tuple'] = set_compressed[all_multisets[idx]]

                # get the set of compressed labels
                labels_comp = list(nx.get_node_attributes(G, 'label_tuple').values())
                # all_labels_ori.update(labels_comp)
                all_num_of_each_label.append(dict(Counter(labels_comp)))

            # calculate subtree kernel with h iterations and add it to the final kernel
            self.__compute_gram_matrix(gram_matrix, all_num_of_each_label, Gn)

        return gram_matrix
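
    # Worked example of one relabeling iteration in __subtree_kernel_do
    # (illustration only, not from the original file): for a path graph with
    # node labels a - b - a, the multiset labels are
    #     node 0: (a | b),   node 1: (b | a, a),   node 2: (a | b),
    # i.e. each node's own label followed by the sorted labels of its
    # neighbors. Compression then maps (a | b) -> '1' and (b | a, a) -> '2',
    # so the new labels are ['1', '2', '1'] and the label histogram passed to
    # __compute_gram_matrix for this iteration is {'1': 2, '2': 1}.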

    def __compute_gram_matrix(self, gram_matrix, all_num_of_each_label, Gn):
        """Compute the Gram matrix using the base kernel.
        """
        if self._parallel == 'imap_unordered':
            # compute kernels.
            def init_worker(alllabels_toshare):
                global G_alllabels
                G_alllabels = alllabels_toshare
            do_partial = partial(self._wrapper_compute_subtree_kernel, gram_matrix)
            parallel_gm(do_partial, gram_matrix, Gn, init_worker=init_worker,
                        glbv=(all_num_of_each_label,), n_jobs=self._n_jobs, verbose=self._verbose)
        elif self._parallel is None:
            for i in range(len(gram_matrix)):
                for j in range(i, len(gram_matrix)):
                    gram_matrix[i][j] = self.__compute_subtree_kernel(
                        all_num_of_each_label[i], all_num_of_each_label[j], gram_matrix[i][j])
                    gram_matrix[j][i] = gram_matrix[i][j]

    def __compute_subtree_kernel(self, num_of_each_label1, num_of_each_label2, kernel):
        """Compute the subtree kernel.
        """
        labels = set(list(num_of_each_label1.keys()) + list(num_of_each_label2.keys()))
        vector1 = np.array([(num_of_each_label1[label]
                             if (label in num_of_each_label1.keys()) else 0)
                            for label in labels])
        vector2 = np.array([(num_of_each_label2[label]
                             if (label in num_of_each_label2.keys()) else 0)
                            for label in labels])
        kernel += np.dot(vector1, vector2)
        return kernel
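
    # Worked example for __compute_subtree_kernel (illustration only, not from
    # the original file): with label counts {'1': 2, '2': 1} and
    # {'1': 1, '2': 3}, the feature vectors over the label set {'1', '2'} are
    # [2, 1] and [1, 3], so the running kernel value is incremented by
    # 2 * 1 + 1 * 3 = 5.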

    def _wrapper_compute_subtree_kernel(self, gram_matrix, itr):
        i = itr[0]
        j = itr[1]
        return i, j, self.__compute_subtree_kernel(G_alllabels[i], G_alllabels[j], gram_matrix[i][j])

    def _wl_spkernel_do(Gn, node_label, edge_label, height):
        """Calculate Weisfeiler-Lehman shortest path kernels between graphs.

        Parameters
        ----------
        Gn : List of NetworkX graph
            List of graphs between which the kernels are calculated.
        node_label : string
            Node attribute used as label.
        edge_label : string
            Edge attribute used as label.
        height : int
            Subtree height.

        Returns
        -------
        gram_matrix : Numpy matrix
            Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
        """
        # @todo: legacy implementation, not wired into the class yet (note the
        # missing `self` parameter); kept for reference.
        from gklearn.utils.utils import getSPGraph

        # init.
        height = int(height)
        gram_matrix = np.zeros((len(Gn), len(Gn)))  # init kernel

        Gn = [getSPGraph(G, edge_weight=edge_label) for G in Gn]  # get shortest path graphs of Gn

        # initial for height = 0
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                for e1 in Gn[i].edges(data=True):
                    for e2 in Gn[j].edges(data=True):
                        if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
                            gram_matrix[i][j] += 1
                gram_matrix[j][i] = gram_matrix[i][j]

        # iterate each height
        for h in range(1, height + 1):
            all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
            num_of_labels_occured = 0  # number of distinct compressed labels seen so far across all graphs
            for G in Gn:  # for each graph
                set_multisets = []
                for node in G.nodes(data=True):
                    # Multiset-label determination.
                    multiset = [G.nodes[neighbor][node_label] for neighbor in G[node[0]]]
                    # sorting each multiset
                    multiset.sort()
                    multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
                    set_multisets.append(multiset)

                # label compression
                set_unique = list(set(set_multisets))  # set of unique multiset labels
                # a dictionary mapping original labels to new ones.
                set_compressed = {}
                # if a label occurred before, assign its former compressed label;
                # else assign the number of labels occurred + 1 as the compressed label.
                for value in set_unique:
                    if value in all_set_compressed.keys():
                        set_compressed.update({value: all_set_compressed[value]})
                    else:
                        set_compressed.update({value: str(num_of_labels_occured + 1)})
                        num_of_labels_occured += 1

                all_set_compressed.update(set_compressed)

                # relabel nodes (set_multisets is ordered as G.nodes, so index
                # by position rather than by node id)
                for idx, node in enumerate(G.nodes(data=True)):
                    node[1][node_label] = set_compressed[set_multisets[idx]]

            # calculate subtree kernel with h iterations and add it to the final kernel
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    for e1 in Gn[i].edges(data=True):
                        for e2 in Gn[j].edges(data=True):
                            if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
                                gram_matrix[i][j] += 1
                    gram_matrix[j][i] = gram_matrix[i][j]

        return gram_matrix
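
    # Note on the edge-matching test above (added for clarity): two edges of
    # the shortest-path graphs match when they have equal nonzero 'cost'
    # (shortest-path length) and connect the same pair of node identifiers in
    # either orientation, e.g. an edge (0, 2, cost=2) in Gn[i] matches
    # (2, 0, cost=2) in Gn[j]. This implicitly assumes node identifiers are
    # comparable across graphs.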

    def _wl_edgekernel_do(Gn, node_label, edge_label, height):
        """Calculate Weisfeiler-Lehman edge kernels between graphs.

        Parameters
        ----------
        Gn : List of NetworkX graph
            List of graphs between which the kernels are calculated.
        node_label : string
            Node attribute used as label.
        edge_label : string
            Edge attribute used as label.
        height : int
            Subtree height.

        Returns
        -------
        gram_matrix : Numpy matrix
            Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
        """
        # @todo: legacy implementation, not wired into the class yet (note the
        # missing `self` parameter); kept for reference.

        # init.
        height = int(height)
        gram_matrix = np.zeros((len(Gn), len(Gn)))  # init kernel

        # initial for height = 0
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                for e1 in Gn[i].edges(data=True):
                    for e2 in Gn[j].edges(data=True):
                        if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
                            gram_matrix[i][j] += 1
                gram_matrix[j][i] = gram_matrix[i][j]

        # iterate each height
        for h in range(1, height + 1):
            all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
            num_of_labels_occured = 0  # number of distinct compressed labels seen so far across all graphs
            for G in Gn:  # for each graph
                set_multisets = []
                for node in G.nodes(data=True):
                    # Multiset-label determination.
                    multiset = [G.nodes[neighbor][node_label] for neighbor in G[node[0]]]
                    # sorting each multiset
                    multiset.sort()
                    multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
                    set_multisets.append(multiset)

                # label compression
                set_unique = list(set(set_multisets))  # set of unique multiset labels
                # a dictionary mapping original labels to new ones.
                set_compressed = {}
                # if a label occurred before, assign its former compressed label;
                # else assign the number of labels occurred + 1 as the compressed label.
                for value in set_unique:
                    if value in all_set_compressed.keys():
                        set_compressed.update({value: all_set_compressed[value]})
                    else:
                        set_compressed.update({value: str(num_of_labels_occured + 1)})
                        num_of_labels_occured += 1

                all_set_compressed.update(set_compressed)

                # relabel nodes (set_multisets is ordered as G.nodes, so index
                # by position rather than by node id)
                for idx, node in enumerate(G.nodes(data=True)):
                    node[1][node_label] = set_compressed[set_multisets[idx]]

            # calculate subtree kernel with h iterations and add it to the final kernel
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    for e1 in Gn[i].edges(data=True):
                        for e2 in Gn[j].edges(data=True):
                            if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
                                gram_matrix[i][j] += 1
                    gram_matrix[j][i] = gram_matrix[i][j]

        return gram_matrix

    def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel):
        """Calculate Weisfeiler-Lehman kernels based on a user-defined base kernel between graphs.

        Parameters
        ----------
        Gn : List of NetworkX graph
            List of graphs between which the kernels are calculated.
        node_label : string
            Node attribute used as label.
        edge_label : string
            Edge attribute used as label.
        height : int
            Subtree height.
        base_kernel : function
            The base kernel function used in each iteration of the WL kernel. It takes (Gn, node_label, edge_label) and returns a Numpy matrix, each element of which is the user-defined Weisfeiler-Lehman kernel between 2 graphs.

        Returns
        -------
        gram_matrix : Numpy matrix
            Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
        """
        # @todo: legacy implementation, not wired into the class yet (note the
        # missing `self` parameter); kept for reference.

        # init.
        height = int(height)
        gram_matrix = np.zeros((len(Gn), len(Gn)))  # init kernel

        # initial for height = 0
        gram_matrix = base_kernel(Gn, node_label, edge_label)

        # iterate each height
        for h in range(1, height + 1):
            all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
            num_of_labels_occured = 0  # number of distinct compressed labels seen so far across all graphs
            for G in Gn:  # for each graph
                set_multisets = []
                for node in G.nodes(data=True):
                    # Multiset-label determination.
                    multiset = [G.nodes[neighbor][node_label] for neighbor in G[node[0]]]
                    # sorting each multiset
                    multiset.sort()
                    multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
                    set_multisets.append(multiset)

                # label compression
                set_unique = list(set(set_multisets))  # set of unique multiset labels
                # a dictionary mapping original labels to new ones.
                set_compressed = {}
                # if a label occurred before, assign its former compressed label;
                # else assign the number of labels occurred + 1 as the compressed label.
                for value in set_unique:
                    if value in all_set_compressed.keys():
                        set_compressed.update({value: all_set_compressed[value]})
                    else:
                        set_compressed.update({value: str(num_of_labels_occured + 1)})
                        num_of_labels_occured += 1

                all_set_compressed.update(set_compressed)

                # relabel nodes (set_multisets is ordered as G.nodes, so index
                # by position rather than by node id)
                for idx, node in enumerate(G.nodes(data=True)):
                    node[1][node_label] = set_compressed[set_multisets[idx]]

            # calculate kernel with h iterations and add it to the final kernel
            gram_matrix += base_kernel(Gn, node_label, edge_label)

        return gram_matrix
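
    # The callable passed to _wl_userkernel_do as `base_kernel` follows the
    # contract used above: base_kernel(Gn, node_label, edge_label) returns a
    # len(Gn) x len(Gn) Numpy matrix of pairwise kernel values. A minimal
    # hypothetical example (not part of gklearn) comparing node label
    # histograms:
    #
    #     def vertex_histogram_kernel(Gn, node_label, edge_label):
    #         counts = [Counter(nx.get_node_attributes(G, node_label).values())
    #                   for G in Gn]
    #         gm = np.zeros((len(Gn), len(Gn)))
    #         for i in range(len(Gn)):
    #             for j in range(i, len(Gn)):
    #                 gm[i][j] = sum(c * counts[j][l]
    #                                for l, c in counts[i].items())
    #                 gm[j][i] = gm[i][j]
    #         return gm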

    def __add_dummy_node_labels(self, Gn):
        if len(self.__node_labels) == 0:
            for G in Gn:
                nx.set_node_attributes(G, '0', 'dummy')
            self.__node_labels.append('dummy')


class WLSubtree(WeisfeilerLehman):

    def __init__(self, **kwargs):
        kwargs['base_kernel'] = 'subtree'
        super().__init__(**kwargs)
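

# Minimal usage sketch (illustration only, not part of the original module).
# It assumes the gklearn 0.2.x GraphKernel base class, whose compute() method
# takes a list of graphs plus 'parallel', 'n_jobs' and 'verbose' keyword
# arguments and returns the Gram matrix together with the run time; adjust to
# the installed version of gklearn.
if __name__ == '__main__':
    # two toy graphs with a categorical node label 'atom'.
    g1 = nx.path_graph(3)
    nx.set_node_attributes(g1, {0: 'C', 1: 'O', 2: 'C'}, 'atom')
    g2 = nx.cycle_graph(3)
    nx.set_node_attributes(g2, {0: 'C', 1: 'C', 2: 'O'}, 'atom')

    # WL subtree kernel of height 2, computed serially.
    kernel = WLSubtree(node_labels=['atom'], height=2,
                       ds_infos={'directed': False})
    gram_matrix, run_time = kernel.compute([g1, g2], parallel=None, verbose=0)
    print(gram_matrix)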

A Python package for graph kernels, graph edit distances and graph pre-image problem.