
weisfeiler_lehman.py 36 kB

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 14 15:16:34 2020
@author: ljia
@references:
    [1] Shervashidze N, Schweitzer P, van Leeuwen EJ, Mehlhorn K,
    Borgwardt KM. Weisfeiler-Lehman graph kernels. Journal of Machine
    Learning Research. 2011;12(Sep):2539-61.
"""
import sys
from collections import Counter
from itertools import combinations_with_replacement

import numpy as np
import networkx as nx

from gklearn.utils import SpecialLabel
from gklearn.utils.parallel import parallel_gm, parallel_me
from gklearn.kernels import GraphKernel
from gklearn.utils.iters import get_iters


class WeisfeilerLehman(GraphKernel):  # @todo: sp, edge, user kernel.

    def __init__(self, **kwargs):
        GraphKernel.__init__(self)
        self.node_labels = kwargs.get('node_labels', [])
        self.edge_labels = kwargs.get('edge_labels', [])
        self.height = int(kwargs.get('height', 0))
        self._base_kernel = kwargs.get('base_kernel', 'subtree')
        self._ds_infos = kwargs.get('ds_infos', {})

    ##########################################################################
    # The following is the 1st paradigm to compute the kernel matrix, which is
    # compatible with `scikit-learn`.
    # ------------------------------------------------------------------------
    # Special thanks to the "GraKeL" library for providing an excellent
    # template!
    ##########################################################################

    ##########################################################################
    # The following is the 2nd paradigm to compute the kernel matrix. It is
    # simplified and not compatible with `scikit-learn`.
    ##########################################################################

    def _compute_gm_series(self):
        # self._add_dummy_node_labels(self._graphs)

        # WL subtree kernel
        if self._base_kernel == 'subtree':
            gram_matrix = self._subtree_kernel_do(self._graphs)
        # WL shortest-path kernel
        elif self._base_kernel == 'sp':
            gram_matrix = self._sp_kernel_do(self._graphs)
        # WL edge kernel
        elif self._base_kernel == 'edge':
            gram_matrix = self._edge_kernel_do(self._graphs)
        # user-defined base kernel
        else:
            gram_matrix = self._user_kernel_do(self._graphs)

        return gram_matrix

    def _compute_gm_imap_unordered(self):
        # self._add_dummy_node_labels(self._graphs)

        if self._base_kernel == 'subtree':
            gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

            def init_worker(gn_toshare):
                global G_gn
                G_gn = gn_toshare

            do_fun = self._wrapper_pairwise
            parallel_gm(do_fun, gram_matrix, self._graphs,
                        init_worker=init_worker, glbv=(self._graphs,),
                        n_jobs=self.n_jobs, verbose=self.verbose)
            return gram_matrix
        else:
            if self.verbose >= 2:
                import warnings
                warnings.warn('This base kernel is not parallelized. '
                              'The serial computation is used instead.')
            return self._compute_gm_series()

    def _compute_kernel_list_series(self, g1, g_list):  # @todo: this should be better.
        self._add_dummy_node_labels(g_list + [g1])

        # WL subtree kernel
        if self._base_kernel == 'subtree':
            gram_matrix = self._subtree_kernel_do(g_list + [g1])
        # WL shortest-path kernel
        elif self._base_kernel == 'sp':
            gram_matrix = self._sp_kernel_do(g_list + [g1])
        # WL edge kernel
        elif self._base_kernel == 'edge':
            gram_matrix = self._edge_kernel_do(g_list + [g1])
        # user-defined base kernel
        else:
            gram_matrix = self._user_kernel_do(g_list + [g1])

        return list(gram_matrix[-1][0:-1])

    def _compute_kernel_list_imap_unordered(self, g1, g_list):
        self._add_dummy_node_labels(g_list + [g1])

        if self._base_kernel == 'subtree':
            kernel_list = [None] * len(g_list)

            def init_worker(g1_toshare, g_list_toshare):
                global G_g1, G_g_list
                G_g1 = g1_toshare
                G_g_list = g_list_toshare

            do_fun = self._wrapper_kernel_list_do

            def func_assign(result, var_to_assign):
                var_to_assign[result[0]] = result[1]

            itr = range(len(g_list))
            len_itr = len(g_list)
            parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
                        init_worker=init_worker, glbv=(g1, g_list),
                        method='imap_unordered', n_jobs=self.n_jobs,
                        itr_desc='Computing kernels', verbose=self.verbose)
            return kernel_list
        else:
            if self.verbose >= 2:
                import warnings
                warnings.warn('This base kernel is not parallelized. '
                              'The serial computation is used instead.')
            return self._compute_kernel_list_series(g1, g_list)

    def _wrapper_kernel_list_do(self, itr):
        return itr, self.pairwise_kernel(G_g1, G_g_list[itr])

    def _compute_single_kernel_series(self, g1, g2):  # @todo: this should be better.
        self._add_dummy_node_labels([g1] + [g2])

        # WL subtree kernel
        if self._base_kernel == 'subtree':
            gram_matrix = self._subtree_kernel_do([g1] + [g2])
        # WL shortest-path kernel
        elif self._base_kernel == 'sp':
            gram_matrix = self._sp_kernel_do([g1] + [g2])
        # WL edge kernel
        elif self._base_kernel == 'edge':
            gram_matrix = self._edge_kernel_do([g1] + [g2])
        # user-defined base kernel
        else:
            gram_matrix = self._user_kernel_do([g1] + [g2])

        return gram_matrix[0][1]

    ##########################################################################
    # The following are the methods used by both paradigms.
    ##########################################################################

    def validate_parameters(self):
        """Validate all parameters for the transformer.

        Returns
        -------
        None.
        """
        super().validate_parameters()
        # Dispatch to the subtree implementation matching the available
        # label types.
        if len(self.node_labels) == 0:
            if len(self.edge_labels) == 0:
                self._subtree_kernel_do = self._subtree_kernel_do_unlabeled
            else:
                self._subtree_kernel_do = self._subtree_kernel_do_el
        else:
            if len(self.edge_labels) == 0:
                self._subtree_kernel_do = self._subtree_kernel_do_nl
            else:
                self._subtree_kernel_do = self._subtree_kernel_do_labeled

    def pairwise_kernel(self, g1, g2):
        Gn = [g1.copy(), g2.copy()]  # @todo: make sure it is a full deep copy, and faster!
        kernel = 0

        # initialization for height = 0
        all_num_of_each_label = []  # number of occurrences of each label in each graph in this iteration

        # for each graph
        for G in Gn:
            # set all labels into a tuple.
            for nd, attrs in G.nodes(data=True):  # @todo: there may be a better way.
                G.nodes[nd]['lt'] = tuple(attrs[name] for name in self.node_labels)
            # get the list of original labels
            labels_ori = list(nx.get_node_attributes(G, 'lt').values())
            # number of occurrences of each label in G
            all_num_of_each_label.append(dict(Counter(labels_ori)))

        # Compute the subtree kernel for the 0th iteration and add it to the
        # final kernel.
        kernel = self._compute_kernel_itr(kernel, all_num_of_each_label)

        # iterate over heights
        for h in range(1, self.height + 1):
            all_set_compressed = {}  # maps original labels to new ones in all graphs in this iteration
            num_of_labels_occured = 0  # number of distinct labels seen so far as node labels in all graphs
            all_num_of_each_label = []  # number of occurrences of each label in G

            # @todo: parallelize this part.
            for G in Gn:
                all_multisets = []
                for node, attrs in G.nodes(data=True):
                    # Multiset-label determination.
                    multiset = [G.nodes[neighbor]['lt'] for neighbor in G[node]]
                    # sort each multiset
                    multiset.sort()
                    multiset = [attrs['lt']] + multiset  # add the prefix
                    all_multisets.append(tuple(multiset))

                # label compression
                set_unique = list(set(all_multisets))  # set of unique multiset labels
                # a dictionary mapping original labels to new ones.
                set_compressed = {}
                # If a label occurred before, assign its former compressed
                # label; otherwise assign (number of labels occurred + 1) as
                # the compressed label.
                for value in set_unique:
                    if value in all_set_compressed.keys():
                        set_compressed[value] = all_set_compressed[value]
                    else:
                        set_compressed[value] = str(num_of_labels_occured + 1)
                        num_of_labels_occured += 1

                all_set_compressed.update(set_compressed)

                # relabel nodes
                for idx, node in enumerate(G.nodes()):
                    G.nodes[node]['lt'] = set_compressed[all_multisets[idx]]

                # get the list of compressed labels
                labels_comp = list(nx.get_node_attributes(G, 'lt').values())
                all_num_of_each_label.append(dict(Counter(labels_comp)))

            # Compute the subtree kernel for iteration h and add it to the
            # final kernel.
            kernel = self._compute_kernel_itr(kernel, all_num_of_each_label)

        return kernel
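
    # ------------------------------------------------------------------
    # Illustrative trace of one WL iteration in `pairwise_kernel` (a
    # hypothetical example, not executed). For a path a-b-c with initial
    # tuple labels lt(a) = ('C',), lt(b) = ('O',), lt(c) = ('C',):
    #   multiset(a) = (('C',), ('O',))             # own label + sorted neighbors
    #   multiset(b) = (('O',), ('C',), ('C',))
    #   multiset(c) = (('C',), ('O',))
    # The two distinct multisets compress to fresh labels, say '1' and
    # '2', giving the histogram {'1': 2, '2': 1}. Because
    # `all_set_compressed` is shared across graphs, identical multisets in
    # the other graph receive the same compressed labels, which is what
    # makes the histogram dot product meaningful.
    # ------------------------------------------------------------------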

    def _wrapper_pairwise(self, itr):
        i = itr[0]
        j = itr[1]
        return i, j, self.pairwise_kernel(G_gn[i], G_gn[j])

    def _compute_kernel_itr(self, kernel, all_num_of_each_label):
        labels = set(list(all_num_of_each_label[0].keys()) +
                     list(all_num_of_each_label[1].keys()))
        vector1 = np.array([(all_num_of_each_label[0][label]
                             if (label in all_num_of_each_label[0].keys()) else 0)
                            for label in labels])
        vector2 = np.array([(all_num_of_each_label[1][label]
                             if (label in all_num_of_each_label[1].keys()) else 0)
                            for label in labels])
        kernel += np.dot(vector1, vector2)
        return kernel
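
    # For example (hypothetical histograms): with label counts
    # {'1': 2, '2': 1} and {'1': 1, '3': 4}, the union of labels is
    # {'1', '2', '3'}, the count vectors are [2, 1, 0] and [1, 0, 4],
    # and the iteration contributes 2*1 + 1*0 + 0*4 = 2 to the kernel.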

    def _subtree_kernel_do_nl(self, Gn):
        """Compute Weisfeiler-Lehman kernels between graphs with node labels.

        Parameters
        ----------
        Gn : List of NetworkX graph
            List of graphs between which the kernels are computed.

        Returns
        -------
        gram_matrix : Numpy matrix
            Kernel matrix, each element of which is the Weisfeiler-Lehman
            kernel between 2 graphs.
        """
        gram_matrix = np.zeros((len(Gn), len(Gn)))

        # initialization for height = 0
        all_num_of_each_label = []  # number of occurrences of each label in each graph in this iteration

        # for each graph
        if self.verbose >= 2:
            iterator = get_iters(Gn, desc='Setting all labels into a tuple')
        else:
            iterator = Gn
        for G in iterator:
            # set all labels into a tuple.  # @todo: remove the original labels or not?
            for nd, attrs in G.nodes(data=True):  # @todo: there may be a better way.
                G.nodes[nd]['lt'] = tuple(attrs[name] for name in self.node_labels)
            # get the list of original labels
            labels_ori = list(nx.get_node_attributes(G, 'lt').values())
            # number of occurrences of each label in G
            all_num_of_each_label.append(dict(Counter(labels_ori)))

        # Compute the subtree kernel for the 0th iteration and add it to the
        # final kernel.
        self._compute_gram_itr(gram_matrix, all_num_of_each_label)

        # iterate over heights
        for h in range(1, self.height + 1):
            all_set_compressed = {}  # maps original labels to new ones in all graphs in this iteration
            num_of_labels_occured = 0  # number of distinct labels seen so far as node labels in all graphs
            all_num_of_each_label = []  # number of occurrences of each label in G

            # @todo: parallelize this part.
            for G in Gn:
                num_of_labels_occured = self._subtree_1graph_nl(
                    G, all_set_compressed, all_num_of_each_label,
                    num_of_labels_occured)

            # Compute the subtree kernel for iteration h and add it to the
            # final kernel.
            self._compute_gram_itr(gram_matrix, all_num_of_each_label)

        return gram_matrix

    def _subtree_kernel_do_el(self, Gn):
        """Compute Weisfeiler-Lehman kernels between graphs with edge labels.

        Parameters
        ----------
        Gn : List of NetworkX graph
            List of graphs between which the kernels are computed.

        Returns
        -------
        gram_matrix : Numpy matrix
            Kernel matrix, each element of which is the Weisfeiler-Lehman
            kernel between 2 graphs.
        """
        gram_matrix = np.zeros((len(Gn), len(Gn)))

        # initialization for height = 0
        all_num_of_each_label = []  # number of occurrences of each label in each graph in this iteration

        # Compute the subtree kernel for the 0th iteration and add it to the
        # final kernel.
        iterator = combinations_with_replacement(range(0, len(gram_matrix)), 2)
        for i, j in iterator:
            gram_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j])
            gram_matrix[j][i] = gram_matrix[i][j]
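
        # (At h == 0 no edge labels have been propagated into node labels
        # yet, so every node of G_i matches every node of G_j and the term
        # is simply |V_i| * |V_j|; e.g. two graphs with 3 and 2 nodes
        # contribute 6.)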

        # if h >= 1.
        if self.height > 0:
            # Set all edge labels into a tuple.  # @todo: remove the original labels or not?
            if self.verbose >= 2:
                iterator = get_iters(Gn, desc='Setting all labels into a tuple')
            else:
                iterator = Gn
            for G in iterator:
                for n1, n2, attrs in G.edges(data=True):  # @todo: there may be a better way.
                    G.edges[(n1, n2)]['lt'] = tuple(attrs[name] for name in self.edge_labels)

            # When h == 1, compute the kernel.
            all_set_compressed = {}  # maps original labels to new ones in all graphs in this iteration
            num_of_labels_occured = 0  # number of distinct labels seen so far as node labels in all graphs
            all_num_of_each_label = []  # number of occurrences of each label in G

            # @todo: parallelize this part.
            for G in Gn:
                num_of_labels_occured = self._subtree_1graph_el(
                    G, all_set_compressed, all_num_of_each_label,
                    num_of_labels_occured)

            # Compute the subtree kernel for this iteration and add it to the
            # final kernel.
            self._compute_gram_itr(gram_matrix, all_num_of_each_label)

            # Iterate over heights >= 2.
            for h in range(2, self.height + 1):
                all_set_compressed = {}
                num_of_labels_occured = 0
                all_num_of_each_label = []

                # @todo: parallelize this part.
                for G in Gn:
                    num_of_labels_occured = self._subtree_1graph_nl(
                        G, all_set_compressed, all_num_of_each_label,
                        num_of_labels_occured)

                self._compute_gram_itr(gram_matrix, all_num_of_each_label)

        return gram_matrix

    def _subtree_kernel_do_labeled(self, Gn):
        """Compute Weisfeiler-Lehman kernels between graphs with both node and
        edge labels.

        Parameters
        ----------
        Gn : List of NetworkX graph
            List of graphs between which the kernels are computed.

        Returns
        -------
        gram_matrix : Numpy matrix
            Kernel matrix, each element of which is the Weisfeiler-Lehman
            kernel between 2 graphs.
        """
        gram_matrix = np.zeros((len(Gn), len(Gn)))

        # initialization for height = 0
        all_num_of_each_label = []  # number of occurrences of each label in each graph in this iteration

        # Set all node labels into a tuple and count the occurrences of each
        # label.
        if self.verbose >= 2:
            iterator = get_iters(Gn, desc='Setting all node labels into a tuple')
        else:
            iterator = Gn
        for G in iterator:
            # Set all node labels into a tuple.  # @todo: remove the original labels or not?
            for nd, attrs in G.nodes(data=True):  # @todo: there may be a better way.
                G.nodes[nd]['lt'] = tuple(attrs[name] for name in self.node_labels)
            # Get the list of original labels.
            labels_ori = list(nx.get_node_attributes(G, 'lt').values())
            # number of occurrences of each label in G
            all_num_of_each_label.append(dict(Counter(labels_ori)))

        # Compute the subtree kernel for the 0th iteration and add it to the
        # final kernel.
        self._compute_gram_itr(gram_matrix, all_num_of_each_label)

        # if h >= 1.
        if self.height > 0:
            # Set all edge labels into a tuple.  # @todo: remove the original labels or not?
            if self.verbose >= 2:
                iterator = get_iters(Gn, desc='Setting all edge labels into a tuple')
            else:
                iterator = Gn
            for G in iterator:
                for n1, n2, attrs in G.edges(data=True):  # @todo: there may be a better way.
                    G.edges[(n1, n2)]['lt'] = tuple(attrs[name] for name in self.edge_labels)

            # When h == 1, compute the kernel.
            all_set_compressed = {}  # maps original labels to new ones in all graphs in this iteration
            num_of_labels_occured = 0  # number of distinct labels seen so far as node labels in all graphs
            all_num_of_each_label = []  # number of occurrences of each label in G

            # @todo: parallelize this part.
            for G in Gn:
                num_of_labels_occured = self._subtree_1graph_labeled(
                    G, all_set_compressed, all_num_of_each_label,
                    num_of_labels_occured)

            # Compute the subtree kernel for this iteration and add it to the
            # final kernel.
            self._compute_gram_itr(gram_matrix, all_num_of_each_label)

            # Iterate over heights >= 2.
            for h in range(2, self.height + 1):
                all_set_compressed = {}
                num_of_labels_occured = 0
                all_num_of_each_label = []

                # @todo: parallelize this part.
                for G in Gn:
                    num_of_labels_occured = self._subtree_1graph_nl(
                        G, all_set_compressed, all_num_of_each_label,
                        num_of_labels_occured)

                self._compute_gram_itr(gram_matrix, all_num_of_each_label)

        return gram_matrix

    def _subtree_kernel_do_unlabeled(self, Gn):
        """Compute Weisfeiler-Lehman kernels between graphs without labels.

        Parameters
        ----------
        Gn : List of NetworkX graph
            List of graphs between which the kernels are computed.

        Returns
        -------
        gram_matrix : Numpy matrix
            Kernel matrix, each element of which is the Weisfeiler-Lehman
            kernel between 2 graphs.
        """
        gram_matrix = np.zeros((len(Gn), len(Gn)))

        # initialization for height = 0
        all_num_of_each_label = []  # number of occurrences of each label in each graph in this iteration

        # Compute the subtree kernel for the 0th iteration and add it to the
        # final kernel.
        iterator = combinations_with_replacement(range(0, len(gram_matrix)), 2)
        for i, j in iterator:
            gram_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j])
            gram_matrix[j][i] = gram_matrix[i][j]

        # if h >= 1.
        if self.height > 0:
            # When h == 1, compute the kernel.
            all_set_compressed = {}  # maps original labels to new ones in all graphs in this iteration
            num_of_labels_occured = 0  # number of distinct labels seen so far as node labels in all graphs
            all_num_of_each_label = []  # number of occurrences of each label in G

            # @todo: parallelize this part.
            for G in Gn:
                num_of_labels_occured = self._subtree_1graph_unlabeled(
                    G, all_set_compressed, all_num_of_each_label,
                    num_of_labels_occured)

            # Compute the subtree kernel for this iteration and add it to the
            # final kernel.
            self._compute_gram_itr(gram_matrix, all_num_of_each_label)

            # Iterate over heights >= 2.
            for h in range(2, self.height + 1):
                all_set_compressed = {}
                num_of_labels_occured = 0
                all_num_of_each_label = []

                # @todo: parallelize this part.
                for G in Gn:
                    num_of_labels_occured = self._subtree_1graph_nl(
                        G, all_set_compressed, all_num_of_each_label,
                        num_of_labels_occured)

                self._compute_gram_itr(gram_matrix, all_num_of_each_label)

        return gram_matrix

    def _subtree_1graph_nl(self, G, all_set_compressed, all_num_of_each_label,
                           num_of_labels_occured):
        all_multisets = []
        for node, attrs in G.nodes(data=True):
            # Multiset-label determination.
            multiset = [G.nodes[neighbor]['lt'] for neighbor in G[node]]
            # sort each multiset
            multiset.sort()
            multiset = [attrs['lt']] + multiset  # add the prefix
            all_multisets.append(tuple(multiset))

        # label compression
        set_unique = list(set(all_multisets))  # set of unique multiset labels
        # a dictionary mapping original labels to new ones.
        set_compressed = {}
        # If a label occurred before, assign its former compressed label;
        # otherwise assign (number of labels occurred + 1) as the compressed
        # label.
        for value in set_unique:
            if value in all_set_compressed.keys():  # @todo: hoist keys() out of the loop?
                set_compressed[value] = all_set_compressed[value]
            else:
                set_compressed[value] = str(num_of_labels_occured + 1)  # @todo: remove str? and what if num_of_labels_occured gets extremely big?
                num_of_labels_occured += 1

        all_set_compressed.update(set_compressed)

        # Relabel nodes.
        for idx, node in enumerate(G.nodes()):
            G.nodes[node]['lt'] = set_compressed[all_multisets[idx]]

        # Get the list of compressed labels.
        labels_comp = list(nx.get_node_attributes(G, 'lt').values())
        all_num_of_each_label.append(dict(Counter(labels_comp)))

        return num_of_labels_occured

    def _subtree_1graph_el(self, G, all_set_compressed, all_num_of_each_label,
                           num_of_labels_occured):
        all_multisets = []
        for node in G.nodes():
            # Multiset-label determination: edge labels only, no node-label
            # prefix. # @todo: check reference for this.
            multiset = [G.edges[(node, neighbor)]['lt'] for neighbor in G[node]]
            # sort each multiset
            multiset.sort()
            all_multisets.append(tuple(multiset))

        # label compression
        set_unique = list(set(all_multisets))  # set of unique multiset labels
        # a dictionary mapping original labels to new ones.
        set_compressed = {}
        # If a label occurred before, assign its former compressed label;
        # otherwise assign (number of labels occurred + 1) as the compressed
        # label.
        for value in set_unique:
            if value in all_set_compressed.keys():  # @todo: hoist keys() out of the loop?
                set_compressed[value] = all_set_compressed[value]
            else:
                set_compressed[value] = str(num_of_labels_occured + 1)  # @todo: remove str?
                num_of_labels_occured += 1

        all_set_compressed.update(set_compressed)

        # Relabel nodes.
        for idx, node in enumerate(G.nodes()):
            G.nodes[node]['lt'] = set_compressed[all_multisets[idx]]

        # Get the list of compressed labels.
        labels_comp = list(nx.get_node_attributes(G, 'lt').values())  # @todo: maybe can be faster.
        all_num_of_each_label.append(dict(Counter(labels_comp)))

        return num_of_labels_occured

    def _subtree_1graph_labeled(self, G, all_set_compressed,
                                all_num_of_each_label, num_of_labels_occured):
        all_multisets = []
        for node, attrs in G.nodes(data=True):
            # Multiset-label determination: pairs of (edge label, neighbor
            # node label). # @todo: check reference for this.
            multiset = [tuple((G.edges[(node, neighbor)]['lt'],
                               G.nodes[neighbor]['lt'])) for neighbor in G[node]]
            # sort each multiset
            multiset.sort()
            multiset = [attrs['lt']] + multiset  # add the prefix
            all_multisets.append(tuple(multiset))

        # label compression
        set_unique = list(set(all_multisets))  # set of unique multiset labels
        # a dictionary mapping original labels to new ones.
        set_compressed = {}
        # If a label occurred before, assign its former compressed label;
        # otherwise assign (number of labels occurred + 1) as the compressed
        # label.
        for value in set_unique:
            if value in all_set_compressed.keys():  # @todo: hoist keys() out of the loop?
                set_compressed[value] = all_set_compressed[value]
            else:
                set_compressed[value] = str(num_of_labels_occured + 1)  # @todo: remove str?
                num_of_labels_occured += 1

        all_set_compressed.update(set_compressed)

        # Relabel nodes.
        for idx, node in enumerate(G.nodes()):
            G.nodes[node]['lt'] = set_compressed[all_multisets[idx]]

        # Get the list of compressed labels.
        labels_comp = list(nx.get_node_attributes(G, 'lt').values())
        all_num_of_each_label.append(dict(Counter(labels_comp)))

        return num_of_labels_occured

    def _subtree_1graph_unlabeled(self, G, all_set_compressed,
                                  all_num_of_each_label, num_of_labels_occured):
        all_multisets = [len(G[node]) for node in G.nodes()]  # the multiset reduces to the node degree

        # label compression
        set_unique = list(set(all_multisets))  # set of unique multiset labels
        # a dictionary mapping original labels to new ones.
        set_compressed = {}
        # If a label occurred before, assign its former compressed label;
        # otherwise assign (number of labels occurred + 1) as the compressed
        # label.
        for value in set_unique:
            if value in all_set_compressed.keys():  # @todo: hoist keys() out of the loop?
                set_compressed[value] = all_set_compressed[value]
            else:
                set_compressed[value] = str(num_of_labels_occured + 1)  # @todo: remove str?
                num_of_labels_occured += 1

        all_set_compressed.update(set_compressed)

        # Relabel nodes.
        for idx, node in enumerate(G.nodes()):
            G.nodes[node]['lt'] = set_compressed[all_multisets[idx]]

        # Get the list of compressed labels.
        labels_comp = list(nx.get_node_attributes(G, 'lt').values())
        all_num_of_each_label.append(dict(Counter(labels_comp)))

        return num_of_labels_occured
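
    # Note on the unlabeled case above: with no initial labels, the first
    # WL iteration is fully determined by node degrees. For example, an
    # unlabeled path on three nodes yields the multisets [1, 2, 1], which
    # compress to two distinct labels (the degree-1 endpoints and the
    # degree-2 middle node).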

    def _compute_gram_itr(self, gram_matrix, all_num_of_each_label):
        """Compute the Gram matrix for the current iteration with the base
        kernel.
        """
        itr = combinations_with_replacement(range(0, len(gram_matrix)), 2)
        len_itr = int(len(gram_matrix) * (len(gram_matrix) + 1) / 2)
        iterator = get_iters(itr, desc='Computing Gram matrix for this iteration',
                             file=sys.stdout, length=len_itr,
                             verbose=(self.verbose >= 2))
        for i, j in iterator:
            gram_matrix[i][j] += self._compute_subtree_kernel(
                all_num_of_each_label[i], all_num_of_each_label[j])
            gram_matrix[j][i] = gram_matrix[i][j]

    def _compute_subtree_kernel(self, num_of_each_label1, num_of_each_label2):
        """Compute the subtree kernel between two label histograms.
        """
        labels = set(list(num_of_each_label1.keys()) + list(num_of_each_label2.keys()))
        vector1 = np.array([(num_of_each_label1[label]
                             if (label in num_of_each_label1.keys()) else 0)
                            for label in labels])
        vector2 = np.array([(num_of_each_label2[label]
                             if (label in num_of_each_label2.keys()) else 0)
                            for label in labels])
        kernel = np.dot(vector1, vector2)
        return kernel

    # ------------------------------------------------------------------
    # Legacy implementations of the 'sp', 'edge' and user-defined base
    # kernels. They are not yet wired into the `_base_kernel` dispatch
    # above (see the @todo on the class) and are kept for reference.
    # ------------------------------------------------------------------

    def _wl_spkernel_do(self, Gn, node_label, edge_label, height):
        """Compute Weisfeiler-Lehman shortest path kernels between graphs.

        Parameters
        ----------
        Gn : List of NetworkX graph
            List of graphs between which the kernels are computed.
        node_label : string
            Node attribute used as label.
        edge_label : string
            Edge attribute used as label.
        height : int
            Subtree height.

        Returns
        -------
        gram_matrix : Numpy matrix
            Kernel matrix, each element of which is the Weisfeiler-Lehman
            kernel between 2 graphs.
        """
        from gklearn.utils.utils import getSPGraph

        # init.
        height = int(height)
        gram_matrix = np.zeros((len(Gn), len(Gn)))  # init kernel
        Gn = [getSPGraph(G, edge_weight=edge_label) for G in Gn]  # get shortest path graphs of Gn

        # initialization for height = 0
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                for e1 in Gn[i].edges(data=True):
                    for e2 in Gn[j].edges(data=True):
                        if (e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost']
                                and ((e1[0] == e2[0] and e1[1] == e2[1])
                                     or (e1[0] == e2[1] and e1[1] == e2[0]))):
                            gram_matrix[i][j] += 1
                gram_matrix[j][i] = gram_matrix[i][j]

        # iterate over heights
        for h in range(1, height + 1):
            all_set_compressed = {}  # maps original labels to new ones in all graphs in this iteration
            num_of_labels_occured = 0  # number of distinct labels seen so far as node labels in all graphs
            for G in Gn:  # for each graph
                set_multisets = []
                for node in G.nodes(data=True):
                    # Multiset-label determination.
                    multiset = [G.nodes[neighbor][node_label] for neighbor in G[node[0]]]
                    # sort each multiset
                    multiset.sort()
                    multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
                    set_multisets.append(multiset)

                # label compression
                set_unique = list(set(set_multisets))  # set of unique multiset labels
                # a dictionary mapping original labels to new ones.
                set_compressed = {}
                # If a label occurred before, assign its former compressed
                # label; otherwise assign (number of labels occurred + 1) as
                # the compressed label.
                for value in set_unique:
                    if value in all_set_compressed.keys():
                        set_compressed[value] = all_set_compressed[value]
                    else:
                        set_compressed[value] = str(num_of_labels_occured + 1)
                        num_of_labels_occured += 1

                all_set_compressed.update(set_compressed)

                # relabel nodes (set_multisets is in node iteration order)
                for idx, node in enumerate(G.nodes(data=True)):
                    node[1][node_label] = set_compressed[set_multisets[idx]]

            # Compute the subtree kernel for iteration h and add it to the
            # final kernel.
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    for e1 in Gn[i].edges(data=True):
                        for e2 in Gn[j].edges(data=True):
                            if (e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost']
                                    and ((e1[0] == e2[0] and e1[1] == e2[1])
                                         or (e1[0] == e2[1] and e1[1] == e2[0]))):
                                gram_matrix[i][j] += 1
                    gram_matrix[j][i] = gram_matrix[i][j]

        return gram_matrix

    def _wl_edgekernel_do(self, Gn, node_label, edge_label, height):
        """Compute Weisfeiler-Lehman edge kernels between graphs.

        Parameters
        ----------
        Gn : List of NetworkX graph
            List of graphs between which the kernels are computed.
        node_label : string
            Node attribute used as label.
        edge_label : string
            Edge attribute used as label.
        height : int
            Subtree height.

        Returns
        -------
        gram_matrix : Numpy matrix
            Kernel matrix, each element of which is the Weisfeiler-Lehman
            kernel between 2 graphs.
        """
        # init.
        height = int(height)
        gram_matrix = np.zeros((len(Gn), len(Gn)))  # init kernel

        # initialization for height = 0
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                for e1 in Gn[i].edges(data=True):
                    for e2 in Gn[j].edges(data=True):
                        if (e1[2][edge_label] == e2[2][edge_label]
                                and ((e1[0] == e2[0] and e1[1] == e2[1])
                                     or (e1[0] == e2[1] and e1[1] == e2[0]))):
                            gram_matrix[i][j] += 1
                gram_matrix[j][i] = gram_matrix[i][j]

        # iterate over heights
        for h in range(1, height + 1):
            all_set_compressed = {}  # maps original labels to new ones in all graphs in this iteration
            num_of_labels_occured = 0  # number of distinct labels seen so far as node labels in all graphs
            for G in Gn:  # for each graph
                set_multisets = []
                for node in G.nodes(data=True):
                    # Multiset-label determination.
                    multiset = [G.nodes[neighbor][node_label] for neighbor in G[node[0]]]
                    # sort each multiset
                    multiset.sort()
                    multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
                    set_multisets.append(multiset)

                # label compression
                set_unique = list(set(set_multisets))  # set of unique multiset labels
                # a dictionary mapping original labels to new ones.
                set_compressed = {}
                # If a label occurred before, assign its former compressed
                # label; otherwise assign (number of labels occurred + 1) as
                # the compressed label.
                for value in set_unique:
                    if value in all_set_compressed.keys():
                        set_compressed[value] = all_set_compressed[value]
                    else:
                        set_compressed[value] = str(num_of_labels_occured + 1)
                        num_of_labels_occured += 1

                all_set_compressed.update(set_compressed)

                # relabel nodes (set_multisets is in node iteration order)
                for idx, node in enumerate(G.nodes(data=True)):
                    node[1][node_label] = set_compressed[set_multisets[idx]]

            # Compute the edge kernel for iteration h and add it to the final
            # kernel.
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    for e1 in Gn[i].edges(data=True):
                        for e2 in Gn[j].edges(data=True):
                            if (e1[2][edge_label] == e2[2][edge_label]
                                    and ((e1[0] == e2[0] and e1[1] == e2[1])
                                         or (e1[0] == e2[1] and e1[1] == e2[0]))):
                                gram_matrix[i][j] += 1
                    gram_matrix[j][i] = gram_matrix[i][j]

        return gram_matrix

    def _wl_userkernel_do(self, Gn, node_label, edge_label, height, base_kernel):
        """Compute Weisfeiler-Lehman kernels based on a user-defined base
        kernel between graphs.

        Parameters
        ----------
        Gn : List of NetworkX graph
            List of graphs between which the kernels are computed.
        node_label : string
            Node attribute used as label.
        edge_label : string
            Edge attribute used as label.
        height : int
            Subtree height.
        base_kernel : function
            Base kernel applied in each iteration of the WL kernel. It returns
            a Numpy matrix, each element of which is the user-defined
            Weisfeiler-Lehman kernel between 2 graphs.

        Returns
        -------
        gram_matrix : Numpy matrix
            Kernel matrix, each element of which is the Weisfeiler-Lehman
            kernel between 2 graphs.
        """
        # init.
        height = int(height)
        gram_matrix = np.zeros((len(Gn), len(Gn)))  # init kernel

        # initialization for height = 0
        gram_matrix = base_kernel(Gn, node_label, edge_label)

        # iterate over heights
        for h in range(1, height + 1):
            all_set_compressed = {}  # maps original labels to new ones in all graphs in this iteration
            num_of_labels_occured = 0  # number of distinct labels seen so far as node labels in all graphs
            for G in Gn:  # for each graph
                set_multisets = []
                for node in G.nodes(data=True):
                    # Multiset-label determination.
                    multiset = [G.nodes[neighbor][node_label] for neighbor in G[node[0]]]
                    # sort each multiset
                    multiset.sort()
                    multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
                    set_multisets.append(multiset)

                # label compression
                set_unique = list(set(set_multisets))  # set of unique multiset labels
                # a dictionary mapping original labels to new ones.
                set_compressed = {}
                # If a label occurred before, assign its former compressed
                # label; otherwise assign (number of labels occurred + 1) as
                # the compressed label.
                for value in set_unique:
                    if value in all_set_compressed.keys():
                        set_compressed[value] = all_set_compressed[value]
                    else:
                        set_compressed[value] = str(num_of_labels_occured + 1)
                        num_of_labels_occured += 1

                all_set_compressed.update(set_compressed)

                # relabel nodes (set_multisets is in node iteration order)
                for idx, node in enumerate(G.nodes(data=True)):
                    node[1][node_label] = set_compressed[set_multisets[idx]]

            # Compute the kernel for iteration h and add it to the final
            # kernel.
            gram_matrix += base_kernel(Gn, node_label, edge_label)

        return gram_matrix

    def _add_dummy_node_labels(self, Gn):
        if len(self.node_labels) == 0 or (len(self.node_labels) == 1
                                          and self.node_labels[0] == SpecialLabel.DUMMY):
            for i in range(len(Gn)):
                nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY)
            self.node_labels = [SpecialLabel.DUMMY]


class WLSubtree(WeisfeilerLehman):

    def __init__(self, **kwargs):
        kwargs['base_kernel'] = 'subtree'
        super().__init__(**kwargs)
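

# ----------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the module): computing
# the WL subtree kernel between two small node-labeled graphs via
# `pairwise_kernel`. The attribute name 'atom' is a hypothetical example;
# running this requires gklearn to be importable for the imports above.
# ----------------------------------------------------------------------
if __name__ == '__main__':
    g1 = nx.Graph()
    g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})])
    g1.add_edges_from([(0, 1), (1, 2)])

    g2 = nx.Graph()
    g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
    g2.add_edge(0, 1)

    # height=2 accumulates the base kernel over WL iterations h = 0, 1, 2.
    kernel = WLSubtree(node_labels=['atom'], edge_labels=[], height=2)
    print(kernel.pairwise_kernel(g1, g2))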
