
weisfeiler_lehman.py 35 kB

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 14 15:16:34 2020

@author: ljia

@references:

    [1] Shervashidze N, Schweitzer P, van Leeuwen EJ, Mehlhorn K,
    Borgwardt KM. Weisfeiler-Lehman graph kernels. Journal of Machine
    Learning Research. 2011;12(Sep):2539-61.
"""
import sys
from collections import Counter
# from functools import partial
from itertools import combinations_with_replacement

import numpy as np
import networkx as nx

from gklearn.utils import SpecialLabel
from gklearn.utils.parallel import parallel_gm, parallel_me
from gklearn.kernels import GraphKernel
from gklearn.utils.iters import get_iters


class WeisfeilerLehman(GraphKernel):  # @todo: sp, edge user kernel.

    def __init__(self, **kwargs):
        GraphKernel.__init__(self, **{k: kwargs.get(k) for k in
                                      ['parallel', 'n_jobs', 'chunksize', 'normalize',
                                       'copy_graphs', 'verbose'] if k in kwargs})
        self.node_labels = kwargs.get('node_labels', [])
        self.edge_labels = kwargs.get('edge_labels', [])
        self.height = int(kwargs.get('height', 0))
        self._base_kernel = kwargs.get('base_kernel', 'subtree')
        self._ds_infos = kwargs.get('ds_infos', {})

    ##########################################################################
    # The following is the 1st paradigm to compute the kernel matrix, which
    # is compatible with `scikit-learn`.
    # ----------------------------------------------------------------------
    # Special thanks to the "GraKeL" library for providing an excellent
    # template!
    ##########################################################################

    ##########################################################################
    # The following is the 2nd paradigm to compute the kernel matrix. It is
    # simplified and not compatible with `scikit-learn`.
    ##########################################################################

    def _compute_gm_series(self, graphs):
        # if self.verbose >= 2:
        #     import warnings
        #     warnings.warn('A part of the computation is parallelized.')

        # self._add_dummy_node_labels(self._graphs)

        # for WL subtree kernel
        if self._base_kernel == 'subtree':
            gram_matrix = self._subtree_kernel_do(graphs)
        # for WL shortest path kernel
        elif self._base_kernel == 'sp':
            gram_matrix = self._sp_kernel_do(graphs)
        # for WL edge kernel
        elif self._base_kernel == 'edge':
            gram_matrix = self._edge_kernel_do(graphs)
        # for user defined base kernel
        else:
            gram_matrix = self._user_kernel_do(graphs)

        return gram_matrix

    def _compute_gm_imap_unordered(self):
        # self._add_dummy_node_labels(self._graphs)

        if self._base_kernel == 'subtree':
            gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

            # for i in range(len(self._graphs)):
            #     for j in range(i, len(self._graphs)):
            #         gram_matrix[i][j] = self.pairwise_kernel(self._graphs[i], self._graphs[j])
            #         gram_matrix[j][i] = gram_matrix[i][j]

            def init_worker(gn_toshare):
                global G_gn
                G_gn = gn_toshare

            do_fun = self._wrapper_pairwise
            parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
                        glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose)
            return gram_matrix
        else:
            if self.verbose >= 2:
                import warnings
                warnings.warn('This base kernel is not parallelized. The serial computation is used instead.')
            return self._compute_gm_series(self._graphs)

    def _compute_kernel_list_series(self, g1, g_list):  # @todo: this should be better.
        # if self.verbose >= 2:
        #     import warnings
        #     warnings.warn('A part of the computation is parallelized.')

        self._add_dummy_node_labels(g_list + [g1])

        # for WL subtree kernel
        if self._base_kernel == 'subtree':
            gram_matrix = self._subtree_kernel_do(g_list + [g1])
        # for WL shortest path kernel
        elif self._base_kernel == 'sp':
            gram_matrix = self._sp_kernel_do(g_list + [g1])
        # for WL edge kernel
        elif self._base_kernel == 'edge':
            gram_matrix = self._edge_kernel_do(g_list + [g1])
        # for user defined base kernel
        else:
            gram_matrix = self._user_kernel_do(g_list + [g1])

        return list(gram_matrix[-1][0:-1])

    def _compute_kernel_list_imap_unordered(self, g1, g_list):
        self._add_dummy_node_labels(g_list + [g1])

        if self._base_kernel == 'subtree':
            kernel_list = [None] * len(g_list)

            def init_worker(g1_toshare, g_list_toshare):
                global G_g1, G_g_list
                G_g1 = g1_toshare
                G_g_list = g_list_toshare

            do_fun = self._wrapper_kernel_list_do

            def func_assign(result, var_to_assign):
                var_to_assign[result[0]] = result[1]

            itr = range(len(g_list))
            len_itr = len(g_list)
            parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
                        init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered',
                        n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose)
            return kernel_list
        else:
            if self.verbose >= 2:
                import warnings
                warnings.warn('This base kernel is not parallelized. The serial computation is used instead.')
            return self._compute_kernel_list_series(g1, g_list)

    def _wrapper_kernel_list_do(self, itr):
        return itr, self.pairwise_kernel(G_g1, G_g_list[itr])

    def _compute_single_kernel_series(self, g1, g2):  # @todo: this should be better.
        self._add_dummy_node_labels([g1] + [g2])

        # for WL subtree kernel
        if self._base_kernel == 'subtree':
            gram_matrix = self._subtree_kernel_do([g1] + [g2])
        # for WL shortest path kernel
        elif self._base_kernel == 'sp':
            gram_matrix = self._sp_kernel_do([g1] + [g2])
        # for WL edge kernel
        elif self._base_kernel == 'edge':
            gram_matrix = self._edge_kernel_do([g1] + [g2])
        # for user defined base kernel
        else:
            gram_matrix = self._user_kernel_do([g1] + [g2])

        return gram_matrix[0][1]

    ##########################################################################
    # The following are the methods used by both paradigms.
    ##########################################################################

    def validate_parameters(self):
        """Validate all parameters for the transformer.

        Returns
        -------
        None.
        """
        super().validate_parameters()

        if len(self.node_labels) == 0:
            if len(self.edge_labels) == 0:
                self._subtree_kernel_do = self._subtree_kernel_do_unlabeled
            else:
                self._subtree_kernel_do = self._subtree_kernel_do_el
        else:
            if len(self.edge_labels) == 0:
                self._subtree_kernel_do = self._subtree_kernel_do_nl
            else:
                self._subtree_kernel_do = self._subtree_kernel_do_labeled

    def pairwise_kernel(self, g1, g2):
        # Gn = [g1.copy(), g2.copy()]  # @todo: make sure it is a full deep copy. and faster!
        Gn = [g1, g2]

        # for WL subtree kernel
        if self._base_kernel == 'subtree':
            kernel = self._subtree_kernel_do(Gn, return_mat=False)
        # @todo: other subkernels.

        return kernel

    def _wrapper_pairwise(self, itr):
        i = itr[0]
        j = itr[1]
        return i, j, self.pairwise_kernel(G_gn[i], G_gn[j])

    def _compute_kernel_itr(self, kernel, all_num_of_each_label):
        labels = set(list(all_num_of_each_label[0].keys()) +
                     list(all_num_of_each_label[1].keys()))
        vector1 = np.array([(all_num_of_each_label[0][label]
                             if (label in all_num_of_each_label[0].keys()) else 0)
                            for label in labels])
        vector2 = np.array([(all_num_of_each_label[1][label]
                             if (label in all_num_of_each_label[1].keys()) else 0)
                            for label in labels])
        kernel += np.dot(vector1, vector2)

        return kernel

    def _subtree_kernel_do_nl(self, Gn, return_mat=True):
        """Compute Weisfeiler-Lehman kernels between graphs with node labels.

        Parameters
        ----------
        Gn : List of NetworkX graph
            List of graphs between which the kernels are computed.

        Returns
        -------
        kernel_matrix : Numpy matrix / float
            Kernel matrix, each element of which is the Weisfeiler-Lehman
            kernel between 2 graphs.
        """
        kernel_matrix = (np.zeros((len(Gn), len(Gn))) if return_mat else 0)
        gram_itr_fun = (self._compute_gram_itr if return_mat else self._compute_kernel_itr)

        # initial for height = 0
        all_num_of_each_label = []  # number of occurrences of each label in each graph in this iteration

        # for each graph
        if self.verbose >= 2:
            iterator = get_iters(Gn, desc='Setting all labels into a tuple')
        else:
            iterator = Gn
        for G in iterator:
            # set all labels into a tuple.  # @todo: remove the original labels or not?
            for nd, attrs in G.nodes(data=True):  # @todo: there may be a better way.
                G.nodes[nd]['lt'] = tuple(attrs[name] for name in self.node_labels)
            # get the set of original labels
            labels_ori = list(nx.get_node_attributes(G, 'lt').values())
            # number of occurrences of each label in G
            all_num_of_each_label.append(dict(Counter(labels_ori)))

        # Compute the subtree kernel at the 0th iteration and add it to the final kernel.
        kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)

        # iterate each height
        for h in range(1, self.height + 1):
            all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
            num_of_labels_occured = 0  # number of the set of letters that occur before as node labels at least once in all graphs
            # all_labels_ori = set()  # all unique original labels in all graphs in this iteration
            all_num_of_each_label = []  # number of occurrences of each label in G

            # @todo: parallel this part.
            # if self.verbose >= 2:
            #     iterator = get_iters(enumerate(Gn), desc='Going through iteration ' + str(h), length=len(Gn))
            # else:
            #     iterator = enumerate(Gn)
            for G in Gn:
                num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)

            # Compute the subtree kernel at iteration h and add it to the final kernel.
            kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)

        return kernel_matrix
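
    # In the notation of reference [1], the value accumulated above is
    #     k_WL(G, G') = sum_{h=0}^{height} k(G_h, G'_h),
    # where G_h is the graph after h relabeling iterations and k counts
    # pairs of nodes carrying identical labels (a dot product of label-count
    # vectors; see _compute_subtree_kernel below).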

    def _subtree_kernel_do_el(self, Gn, return_mat=True):
        """Compute Weisfeiler-Lehman kernels between graphs with edge labels.

        Parameters
        ----------
        Gn : List of NetworkX graph
            List of graphs between which the kernels are computed.

        Returns
        -------
        kernel_matrix : Numpy matrix
            Kernel matrix, each element of which is the Weisfeiler-Lehman
            kernel between 2 graphs.
        """
        kernel_matrix = (np.zeros((len(Gn), len(Gn))) if return_mat else 0)
        gram_itr_fun = (self._compute_gram_itr if return_mat else self._compute_kernel_itr)

        # initial for height = 0
        all_num_of_each_label = []  # number of occurrences of each label in each graph in this iteration

        # Compute the subtree kernel at the 0th iteration and add it to the final kernel.
        iterator = combinations_with_replacement(range(0, len(kernel_matrix)), 2)
        for i, j in iterator:  # @todo: not correct if return_mat == False.
            kernel_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j])
            kernel_matrix[j][i] = kernel_matrix[i][j]

        # if h >= 1.
        if self.height > 0:
            # Set all edge labels into a tuple.  # @todo: remove the original labels or not?
            if self.verbose >= 2:
                iterator = get_iters(Gn, desc='Setting all labels into a tuple')
            else:
                iterator = Gn
            for G in iterator:
                for n1, n2, attrs in G.edges(data=True):  # @todo: there may be a better way.
                    G.edges[(n1, n2)]['lt'] = tuple(attrs[name] for name in self.edge_labels)

            # When h == 1, compute the kernel.
            all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
            num_of_labels_occured = 0  # number of the set of letters that occur before as node labels at least once in all graphs
            all_num_of_each_label = []  # number of occurrences of each label in G

            # @todo: parallel this part.
            for G in Gn:
                num_of_labels_occured = self._subtree_1graph_el(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)

            # Compute the subtree kernel at iteration h and add it to the final kernel.
            kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)

        # Iterate along heights (>= 2).
        for h in range(2, self.height + 1):
            all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
            num_of_labels_occured = 0  # number of the set of letters that occur before as node labels at least once in all graphs
            all_num_of_each_label = []  # number of occurrences of each label in G

            # @todo: parallel this part.
            for G in Gn:
                num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)

            # Compute the subtree kernel at iteration h and add it to the final kernel.
            kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)

        return kernel_matrix

    def _subtree_kernel_do_labeled(self, Gn, return_mat=True):
        """Compute Weisfeiler-Lehman kernels between graphs with both node and
        edge labels.

        Parameters
        ----------
        Gn : List of NetworkX graph
            List of graphs between which the kernels are computed.

        Returns
        -------
        kernel_matrix : Numpy matrix
            Kernel matrix, each element of which is the Weisfeiler-Lehman
            kernel between 2 graphs.
        """
        kernel_matrix = (np.zeros((len(Gn), len(Gn))) if return_mat else 0)
        gram_itr_fun = (self._compute_gram_itr if return_mat else self._compute_kernel_itr)

        # initial for height = 0
        all_num_of_each_label = []  # number of occurrences of each label in each graph in this iteration

        # Set all node labels into a tuple and count the occurrences of each label.
        if self.verbose >= 2:
            iterator = get_iters(Gn, desc='Setting all node labels into a tuple')
        else:
            iterator = Gn
        for G in iterator:
            # Set all node labels into a tuple.  # @todo: remove the original labels or not?
            for nd, attrs in G.nodes(data=True):  # @todo: there may be a better way.
                G.nodes[nd]['lt'] = tuple(attrs[name] for name in self.node_labels)
            # Get the set of original labels.
            labels_ori = list(nx.get_node_attributes(G, 'lt').values())
            # number of occurrences of each label in G
            all_num_of_each_label.append(dict(Counter(labels_ori)))

        # Compute the subtree kernel at the 0th iteration and add it to the final kernel.
        kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)

        # if h >= 1:
        if self.height > 0:
            # Set all edge labels into a tuple.  # @todo: remove the original labels or not?
            if self.verbose >= 2:
                iterator = get_iters(Gn, desc='Setting all edge labels into a tuple')
            else:
                iterator = Gn
            for G in iterator:
                for n1, n2, attrs in G.edges(data=True):  # @todo: there may be a better way.
                    G.edges[(n1, n2)]['lt'] = tuple(attrs[name] for name in self.edge_labels)

            # When h == 1, compute the kernel.
            all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
            num_of_labels_occured = 0  # number of the set of letters that occur before as node labels at least once in all graphs
            all_num_of_each_label = []  # number of occurrences of each label in G

            # @todo: parallel this part.
            for G in Gn:
                num_of_labels_occured = self._subtree_1graph_labeled(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)

            # Compute the subtree kernel at iteration h and add it to the final kernel.
            kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)

        # Iterate along heights (>= 2).
        for h in range(2, self.height + 1):
            all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
            num_of_labels_occured = 0  # number of the set of letters that occur before as node labels at least once in all graphs
            all_num_of_each_label = []  # number of occurrences of each label in G

            # @todo: parallel this part.
            for G in Gn:
                num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)

            # Compute the subtree kernel at iteration h and add it to the final kernel.
            kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)

        return kernel_matrix

    def _subtree_kernel_do_unlabeled(self, Gn, return_mat=True):
        """Compute Weisfeiler-Lehman kernels between graphs without labels.

        Parameters
        ----------
        Gn : List of NetworkX graph
            List of graphs between which the kernels are computed.

        Returns
        -------
        kernel_matrix : Numpy matrix
            Kernel matrix, each element of which is the Weisfeiler-Lehman
            kernel between 2 graphs.
        """
        kernel_matrix = (np.zeros((len(Gn), len(Gn))) if return_mat else 0)
        gram_itr_fun = (self._compute_gram_itr if return_mat else self._compute_kernel_itr)

        # initial for height = 0
        all_num_of_each_label = []  # number of occurrences of each label in each graph in this iteration

        # Compute the subtree kernel at the 0th iteration and add it to the final kernel.
        iterator = combinations_with_replacement(range(0, len(kernel_matrix)), 2)
        for i, j in iterator:  # @todo: not correct if return_mat == False.
            kernel_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j])
            kernel_matrix[j][i] = kernel_matrix[i][j]

        # if h >= 1.
        if self.height > 0:
            # When h == 1, compute the kernel.
            all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
            num_of_labels_occured = 0  # number of the set of letters that occur before as node labels at least once in all graphs
            all_num_of_each_label = []  # number of occurrences of each label in G

            # @todo: parallel this part.
            for G in Gn:
                num_of_labels_occured = self._subtree_1graph_unlabeled(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)

            # Compute the subtree kernel at iteration h and add it to the final kernel.
            kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)

        # Iterate along heights (>= 2).
        for h in range(2, self.height + 1):
            all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
            num_of_labels_occured = 0  # number of the set of letters that occur before as node labels at least once in all graphs
            all_num_of_each_label = []  # number of occurrences of each label in G

            # @todo: parallel this part.
            for G in Gn:
                num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)

            # Compute the subtree kernel at iteration h and add it to the final kernel.
            kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)

        return kernel_matrix

    def _subtree_1graph_nl(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured):
        all_multisets = []
        for node, attrs in G.nodes(data=True):
            # Multiset-label determination.
            multiset = [G.nodes[neighbors]['lt'] for neighbors in G[node]]
            # sorting each multiset
            multiset.sort()
            multiset = [attrs['lt']] + multiset  # add the prefix
            all_multisets.append(tuple(multiset))

        # label compression
        set_unique = list(set(all_multisets))  # set of unique multiset labels
        # a dictionary mapping original labels to new ones.
        set_compressed = {}
        # If a label occurred before, assign its former compressed label;
        # otherwise assign the number of labels that have occurred + 1 as the
        # compressed label.
        for value in set_unique:
            if value in all_set_compressed.keys():  # @todo: put keys() function out of for loop?
                set_compressed[value] = all_set_compressed[value]
            else:
                set_compressed[value] = str(num_of_labels_occured + 1)  # @todo: remove str? and what if num_of_labels_occured is extremely big.
                num_of_labels_occured += 1

        all_set_compressed.update(set_compressed)

        # Relabel nodes.
        for idx, node in enumerate(G.nodes()):
            G.nodes[node]['lt'] = set_compressed[all_multisets[idx]]

        # Get the set of compressed labels.
        labels_comp = list(nx.get_node_attributes(G, 'lt').values())
        all_num_of_each_label.append(dict(Counter(labels_comp)))

        return num_of_labels_occured
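
    # Illustration (toy example, not from the original file): for a path
    # with node labels a - b - a, the multisets built in _subtree_1graph_nl
    # are
    #     (('a',), ('b',)), (('b',), ('a',), ('a',)), (('a',), ('b',)),
    # i.e. each node's own label followed by its sorted neighbor labels.
    # The two identical tuples map to the same compressed label, so the
    # relabeled graph has label counts such as {'1': 2, '2': 1}.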

    def _subtree_1graph_el(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured):
        all_multisets = []
        # for node, attrs in G.nodes(data=True):
        for node in G.nodes():
            # Multiset-label determination.
            multiset = [G.edges[(node, neighbors)]['lt'] for neighbors in G[node]]  # @todo: check reference for this.
            # sorting each multiset
            multiset.sort()
            # multiset = [attrs['lt']] + multiset  # add the prefix
            all_multisets.append(tuple(multiset))

        # label compression
        set_unique = list(set(all_multisets))  # set of unique multiset labels
        # a dictionary mapping original labels to new ones.
        set_compressed = {}
        # If a label occurred before, assign its former compressed label;
        # otherwise assign the number of labels that have occurred + 1 as the
        # compressed label.
        for value in set_unique:
            if value in all_set_compressed.keys():  # @todo: put keys() function out of for loop?
                set_compressed[value] = all_set_compressed[value]
            else:
                set_compressed[value] = str(num_of_labels_occured + 1)  # @todo: remove str?
                num_of_labels_occured += 1

        all_set_compressed.update(set_compressed)

        # Relabel nodes.
        for idx, node in enumerate(G.nodes()):
            G.nodes[node]['lt'] = set_compressed[all_multisets[idx]]

        # Get the set of compressed labels.
        labels_comp = list(nx.get_node_attributes(G, 'lt').values())  # @todo: maybe can be faster.
        all_num_of_each_label.append(dict(Counter(labels_comp)))

        return num_of_labels_occured

    def _subtree_1graph_labeled(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured):
        all_multisets = []
        for node, attrs in G.nodes(data=True):
            # Multiset-label determination.
            multiset = [tuple((G.edges[(node, neighbors)]['lt'], G.nodes[neighbors]['lt'])) for neighbors in G[node]]  # @todo: check reference for this.
            # sorting each multiset
            multiset.sort()
            multiset = [attrs['lt']] + multiset  # add the prefix
            all_multisets.append(tuple(multiset))

        # label compression
        set_unique = list(set(all_multisets))  # set of unique multiset labels
        # a dictionary mapping original labels to new ones.
        set_compressed = {}
        # If a label occurred before, assign its former compressed label;
        # otherwise assign the number of labels that have occurred + 1 as the
        # compressed label.
        for value in set_unique:
            if value in all_set_compressed.keys():  # @todo: put keys() function out of for loop?
                set_compressed[value] = all_set_compressed[value]
            else:
                set_compressed[value] = str(num_of_labels_occured + 1)  # @todo: remove str?
                num_of_labels_occured += 1

        all_set_compressed.update(set_compressed)

        # Relabel nodes.
        for idx, node in enumerate(G.nodes()):
            G.nodes[node]['lt'] = set_compressed[all_multisets[idx]]

        # Get the set of compressed labels.
        labels_comp = list(nx.get_node_attributes(G, 'lt').values())
        all_num_of_each_label.append(dict(Counter(labels_comp)))

        return num_of_labels_occured

    def _subtree_1graph_unlabeled(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured):
        # all_multisets = []
        # for node, attrs in G.nodes(data=True):  # @todo: it can be better.
        #     # Multiset-label determination.
        #     multiset = [0 for neighbors in G[node]]
        #     # sorting each multiset
        #     multiset.sort()
        #     multiset = [0] + multiset  # add the prefix
        #     all_multisets.append(tuple(multiset))
        all_multisets = [len(G[node]) for node in G.nodes()]

        # label compression
        set_unique = list(set(all_multisets))  # set of unique multiset labels
        # a dictionary mapping original labels to new ones.
        set_compressed = {}
        # If a label occurred before, assign its former compressed label;
        # otherwise assign the number of labels that have occurred + 1 as the
        # compressed label.
        for value in set_unique:
            if value in all_set_compressed.keys():  # @todo: put keys() function out of for loop?
                set_compressed[value] = all_set_compressed[value]
            else:
                set_compressed[value] = str(num_of_labels_occured + 1)  # @todo: remove str?
                num_of_labels_occured += 1

        all_set_compressed.update(set_compressed)

        # Relabel nodes.
        for idx, node in enumerate(G.nodes()):
            G.nodes[node]['lt'] = set_compressed[all_multisets[idx]]

        # Get the set of compressed labels.
        labels_comp = list(nx.get_node_attributes(G, 'lt').values())
        all_num_of_each_label.append(dict(Counter(labels_comp)))

        return num_of_labels_occured
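
    # Note: without labels, the "multiset" above reduces to the node degree
    # (len(G[node])); e.g. in a triangle all nodes have degree 2, so all
    # three receive the same compressed label in the first iteration.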

    def _compute_gram_itr(self, gram_matrix, all_num_of_each_label):
        """Compute the Gram matrix using the base kernel.
        """
        # if self.parallel == 'imap_unordered':
        #     # compute kernels.
        #     def init_worker(alllabels_toshare):
        #         global G_alllabels
        #         G_alllabels = alllabels_toshare
        #     do_partial = partial(self._wrapper_compute_subtree_kernel, gram_matrix)
        #     parallel_gm(do_partial, gram_matrix, Gn, init_worker=init_worker,
        #                 glbv=(all_num_of_each_label,), n_jobs=self.n_jobs, verbose=self.verbose)
        # elif self.parallel is None:
        itr = combinations_with_replacement(range(0, len(gram_matrix)), 2)
        len_itr = int(len(gram_matrix) * (len(gram_matrix) + 1) / 2)
        iterator = get_iters(itr, desc='Computing Gram matrix for this iteration',
                             file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2))
        # for i in iterator:
        #     for j in range(i, len(gram_matrix)):
        for i, j in iterator:
            gram_matrix[i][j] += self._compute_subtree_kernel(all_num_of_each_label[i],
                                                              all_num_of_each_label[j])
            gram_matrix[j][i] = gram_matrix[i][j]

        return gram_matrix

    def _compute_subtree_kernel(self, num_of_each_label1, num_of_each_label2):
        """Compute the subtree kernel.
        """
        labels = set(list(num_of_each_label1.keys()) + list(num_of_each_label2.keys()))
        vector1 = np.array([(num_of_each_label1[label]
                             if (label in num_of_each_label1.keys()) else 0)
                            for label in labels])
        vector2 = np.array([(num_of_each_label2[label]
                             if (label in num_of_each_label2.keys()) else 0)
                            for label in labels])
        kernel = np.dot(vector1, vector2)

        return kernel
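
    # Example (illustrative values): for counts {'a': 2, 'b': 1} and
    # {'a': 1, 'c': 3}, the label union is {'a', 'b', 'c'}, the vectors are
    # [2, 1, 0] and [1, 0, 3], and the kernel value is the dot product
    # 2*1 + 1*0 + 0*3 = 2.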

    # def _wrapper_compute_subtree_kernel(self, gram_matrix, itr):
    #     i = itr[0]
    #     j = itr[1]
    #     return i, j, self._compute_subtree_kernel(G_alllabels[i], G_alllabels[j], gram_matrix[i][j])

    def _wl_spkernel_do(Gn, node_label, edge_label, height):
        """Compute Weisfeiler-Lehman shortest path kernels between graphs.

        Parameters
        ----------
        Gn : List of NetworkX graph
            List of graphs between which the kernels are computed.
        node_label : string
            Node attribute used as label.
        edge_label : string
            Edge attribute used as label.
        height : int
            Subtree height.

        Returns
        -------
        gram_matrix : Numpy matrix
            Kernel matrix, each element of which is the Weisfeiler-Lehman
            kernel between 2 graphs.
        """
        from gklearn.utils.utils import getSPGraph

        # init.
        height = int(height)
        gram_matrix = np.zeros((len(Gn), len(Gn)))  # init kernel
        Gn = [getSPGraph(G, edge_weight=edge_label) for G in Gn]  # get shortest path graphs of Gn

        # initial for height = 0
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                for e1 in Gn[i].edges(data=True):
                    for e2 in Gn[j].edges(data=True):
                        if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
                            gram_matrix[i][j] += 1
                gram_matrix[j][i] = gram_matrix[i][j]

        # iterate each height
        for h in range(1, height + 1):
            all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
            num_of_labels_occured = 0  # number of the set of letters that occur before as node labels at least once in all graphs
            for G in Gn:  # for each graph
                set_multisets = []
                for node in G.nodes(data=True):
                    # Multiset-label determination.
                    multiset = [G.nodes[neighbors][node_label] for neighbors in G[node[0]]]
                    # sorting each multiset
                    multiset.sort()
                    multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
                    set_multisets.append(multiset)

                # label compression
                set_unique = list(set(set_multisets))  # set of unique multiset labels
                # a dictionary mapping original labels to new ones.
                set_compressed = {}
                # If a label occurred before, assign its former compressed label;
                # otherwise assign the number of labels that have occurred + 1 as the compressed label.
                for value in set_unique:
                    if value in all_set_compressed.keys():
                        set_compressed[value] = all_set_compressed[value]
                    else:
                        set_compressed[value] = str(num_of_labels_occured + 1)
                        num_of_labels_occured += 1

                all_set_compressed.update(set_compressed)

                # relabel nodes
                for idx, node in enumerate(G.nodes(data=True)):
                    node[1][node_label] = set_compressed[set_multisets[idx]]

            # Compute the shortest path kernel at iteration h and add it to the final kernel.
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    for e1 in Gn[i].edges(data=True):
                        for e2 in Gn[j].edges(data=True):
                            if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
                                gram_matrix[i][j] += 1
                    gram_matrix[j][i] = gram_matrix[i][j]

        return gram_matrix

    def _wl_edgekernel_do(Gn, node_label, edge_label, height):
        """Compute Weisfeiler-Lehman edge kernels between graphs.

        Parameters
        ----------
        Gn : List of NetworkX graph
            List of graphs between which the kernels are computed.
        node_label : string
            Node attribute used as label.
        edge_label : string
            Edge attribute used as label.
        height : int
            Subtree height.

        Returns
        -------
        gram_matrix : Numpy matrix
            Kernel matrix, each element of which is the Weisfeiler-Lehman
            kernel between 2 graphs.
        """
        # init.
        height = int(height)
        gram_matrix = np.zeros((len(Gn), len(Gn)))  # init kernel

        # initial for height = 0
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                for e1 in Gn[i].edges(data=True):
                    for e2 in Gn[j].edges(data=True):
                        if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
                            gram_matrix[i][j] += 1
                gram_matrix[j][i] = gram_matrix[i][j]

        # iterate each height
        for h in range(1, height + 1):
            all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
            num_of_labels_occured = 0  # number of the set of letters that occur before as node labels at least once in all graphs
            for G in Gn:  # for each graph
                set_multisets = []
                for node in G.nodes(data=True):
                    # Multiset-label determination.
                    multiset = [G.nodes[neighbors][node_label] for neighbors in G[node[0]]]
                    # sorting each multiset
                    multiset.sort()
                    multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
                    set_multisets.append(multiset)

                # label compression
                set_unique = list(set(set_multisets))  # set of unique multiset labels
                # a dictionary mapping original labels to new ones.
                set_compressed = {}
                # If a label occurred before, assign its former compressed label;
                # otherwise assign the number of labels that have occurred + 1 as the compressed label.
                for value in set_unique:
                    if value in all_set_compressed.keys():
                        set_compressed[value] = all_set_compressed[value]
                    else:
                        set_compressed[value] = str(num_of_labels_occured + 1)
                        num_of_labels_occured += 1

                all_set_compressed.update(set_compressed)

                # relabel nodes
                for idx, node in enumerate(G.nodes(data=True)):
                    node[1][node_label] = set_compressed[set_multisets[idx]]

            # Compute the edge kernel at iteration h and add it to the final kernel.
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    for e1 in Gn[i].edges(data=True):
                        for e2 in Gn[j].edges(data=True):
                            if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
                                gram_matrix[i][j] += 1
                    gram_matrix[j][i] = gram_matrix[i][j]

        return gram_matrix

    def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel):
        """Compute Weisfeiler-Lehman kernels based on a user-defined kernel between graphs.

        Parameters
        ----------
        Gn : List of NetworkX graph
            List of graphs between which the kernels are computed.
        node_label : string
            Node attribute used as label.
        edge_label : string
            Edge attribute used as label.
        height : int
            Subtree height.
        base_kernel : callable
            Base kernel function used in each iteration of the WL kernel. It
            returns a Numpy matrix, each element of which is the user-defined
            Weisfeiler-Lehman kernel between 2 graphs.

        Returns
        -------
        gram_matrix : Numpy matrix
            Kernel matrix, each element of which is the Weisfeiler-Lehman
            kernel between 2 graphs.
        """
        # init.
        height = int(height)
        gram_matrix = np.zeros((len(Gn), len(Gn)))  # init kernel

        # initial for height = 0
        gram_matrix = base_kernel(Gn, node_label, edge_label)

        # iterate each height
        for h in range(1, height + 1):
            all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
            num_of_labels_occured = 0  # number of the set of letters that occur before as node labels at least once in all graphs
            for G in Gn:  # for each graph
                set_multisets = []
                for node in G.nodes(data=True):
                    # Multiset-label determination.
                    multiset = [G.nodes[neighbors][node_label] for neighbors in G[node[0]]]
                    # sorting each multiset
                    multiset.sort()
                    multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
                    set_multisets.append(multiset)

                # label compression
                set_unique = list(set(set_multisets))  # set of unique multiset labels
                # a dictionary mapping original labels to new ones.
                set_compressed = {}
                # If a label occurred before, assign its former compressed label;
                # otherwise assign the number of labels that have occurred + 1 as the compressed label.
                for value in set_unique:
                    if value in all_set_compressed.keys():
                        set_compressed[value] = all_set_compressed[value]
                    else:
                        set_compressed[value] = str(num_of_labels_occured + 1)
                        num_of_labels_occured += 1

                all_set_compressed.update(set_compressed)

                # relabel nodes
                for idx, node in enumerate(G.nodes(data=True)):
                    node[1][node_label] = set_compressed[set_multisets[idx]]

            # Compute the kernel at iteration h and add it to the final kernel.
            gram_matrix += base_kernel(Gn, node_label, edge_label)

        return gram_matrix

    def _add_dummy_node_labels(self, Gn):
        if len(self.node_labels) == 0 or (len(self.node_labels) == 1 and self.node_labels[0] == SpecialLabel.DUMMY):
            for i in range(len(Gn)):
                nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY)
            self.node_labels = [SpecialLabel.DUMMY]


class WLSubtree(WeisfeilerLehman):

    def __init__(self, **kwargs):
        kwargs['base_kernel'] = 'subtree'
        super().__init__(**kwargs)
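
A minimal usage sketch (not part of the file above): it compares two toy graphs with the WL subtree kernel. The node label name 'atom' is illustrative, and the sketch assumes gklearn and networkx are installed; validate_parameters() (inherited from GraphKernel) is called before pairwise_kernel(), since that call binds the label-specific subtree implementation.

import networkx as nx
from gklearn.kernels import WeisfeilerLehman

# Two toy molecule-like graphs with a node label 'atom' (hypothetical name).
g1 = nx.Graph()
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})])
g1.add_edges_from([(0, 1), (1, 2)])

g2 = nx.Graph()
g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
g2.add_edge(0, 1)

# height=2 adds two WL relabeling iterations on top of the height-0 label counts.
wl = WeisfeilerLehman(node_labels=['atom'], height=2, base_kernel='subtree')
wl.validate_parameters()  # binds _subtree_kernel_do_nl (node labels only)
print(wl.pairwise_kernel(g1, g2))  # scalar WL subtree kernel value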

A Python package for graph kernels, graph edit distances and graph pre-image problem.