marginalized.py 11 kB

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 3 22:22:57 2020

@author: ljia

@references:

    [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between
    labeled graphs. In Proceedings of the 20th International Conference on
    Machine Learning, Washington, DC, United States, 2003.

    [2] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and
    Jean-Philippe Vert. Extensions of marginalized graph kernels. In
    Proceedings of the twenty-first international conference on Machine
    learning, page 70. ACM, 2004.
"""
import sys
from itertools import combinations_with_replacement
from multiprocessing import Pool

import numpy as np
import networkx as nx

from gklearn.utils import get_iters, SpecialLabel
from gklearn.utils.kernels import deltakernel
from gklearn.utils.parallel import parallel_gm, parallel_me
from gklearn.utils.utils import untotterTransformation
from gklearn.kernels import GraphKernel
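
# The marginalized kernel of [1] compares two graphs through random walks:
# informally, k(G1, G2) = sum over walk pairs (h1, h2) of
# p(h1 | G1) * p(h2 | G2) * K_L(h1, h2), where p(h | G) is the probability of
# generating walk h (start at a uniformly chosen node, move to a uniformly
# chosen neighbour, stop with probability p_quit) and K_L is a delta kernel on
# the node and edge labels along the walks. _kernel_do below evaluates this
# sum via the fixed-point recursion of equations (6) and (8) in [1].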

class Marginalized(GraphKernel):

    def __init__(self, **kwargs):
        GraphKernel.__init__(self)
        self._node_labels = kwargs.get('node_labels', [])
        self._edge_labels = kwargs.get('edge_labels', [])
        self._p_quit = kwargs.get('p_quit', 0.5)
        self._n_iteration = int(kwargs.get('n_iteration', 10))
        self._remove_totters = kwargs.get('remove_totters', False)
        self._ds_infos = kwargs.get('ds_infos', {})

    def _compute_gm_series(self):
        self._add_dummy_labels(self._graphs)

        if self._remove_totters:
            iterator = get_iters(self._graphs, desc='removing tottering',
                                 file=sys.stdout, verbose=(self.verbose >= 2))
            # @todo: this may not work.
            self._graphs = [untotterTransformation(G, self._node_labels, self._edge_labels) for G in iterator]

        # compute Gram matrix (upper triangle only; the kernel is symmetric,
        # so each value is mirrored to the lower triangle).
        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
        itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
        len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
        iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout,
                             length=len_itr, verbose=(self.verbose >= 2))
        for i, j in iterator:
            kernel = self._kernel_do(self._graphs[i], self._graphs[j])
            gram_matrix[i][j] = kernel
            gram_matrix[j][i] = kernel  # @todo: no directed graph considered?

        return gram_matrix

    def _compute_gm_imap_unordered(self):
        self._add_dummy_labels(self._graphs)

        if self._remove_totters:
            pool = Pool(self.n_jobs)
            itr = range(0, len(self._graphs))
            if len(self._graphs) < 100 * self.n_jobs:
                chunksize = int(len(self._graphs) / self.n_jobs) + 1
            else:
                chunksize = 100
            remove_fun = self._wrapper_untotter
            iterator = get_iters(pool.imap_unordered(remove_fun, itr, chunksize),
                                 desc='removing tottering', file=sys.stdout,
                                 length=len(self._graphs), verbose=(self.verbose >= 2))
            for i, g in iterator:
                self._graphs[i] = g
            pool.close()
            pool.join()

        # compute Gram matrix.
        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

        # init_worker shares the graph list with the worker processes once at
        # pool start-up (via the module-level G_gn) instead of pickling it for
        # every task; parallel_gm then fills gram_matrix from the (i, j,
        # kernel) triples returned by _wrapper_kernel_do.
        def init_worker(gn_toshare):
            global G_gn
            G_gn = gn_toshare

        do_fun = self._wrapper_kernel_do
        parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
                    glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose)

        return gram_matrix

    def _compute_kernel_list_series(self, g1, g_list):
        self._add_dummy_labels(g_list + [g1])

        if self._remove_totters:
            g1 = untotterTransformation(g1, self._node_labels, self._edge_labels)  # @todo: this may not work.
            iterator = get_iters(g_list, desc='removing tottering',
                                 file=sys.stdout, verbose=(self.verbose >= 2))
            # @todo: this may not work.
            g_list = [untotterTransformation(G, self._node_labels, self._edge_labels) for G in iterator]

        # compute kernel list.
        kernel_list = [None] * len(g_list)
        iterator = get_iters(range(len(g_list)), desc='Computing kernels',
                             file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2))
        for i in iterator:
            kernel = self._kernel_do(g1, g_list[i])
            kernel_list[i] = kernel

        return kernel_list

    def _compute_kernel_list_imap_unordered(self, g1, g_list):
        self._add_dummy_labels(g_list + [g1])

        if self._remove_totters:
            g1 = untotterTransformation(g1, self._node_labels, self._edge_labels)  # @todo: this may not work.
            pool = Pool(self.n_jobs)
            itr = range(0, len(g_list))
            if len(g_list) < 100 * self.n_jobs:
                chunksize = int(len(g_list) / self.n_jobs) + 1
            else:
                chunksize = 100
            # @todo: _wrapper_untotter indexes self._graphs, which may not
            # correspond to g_list here.
            remove_fun = self._wrapper_untotter
            iterator = get_iters(pool.imap_unordered(remove_fun, itr, chunksize),
                                 desc='removing tottering', file=sys.stdout,
                                 length=len(g_list), verbose=(self.verbose >= 2))
            for i, g in iterator:
                g_list[i] = g
            pool.close()
            pool.join()

        # compute kernel list.
        kernel_list = [None] * len(g_list)

        def init_worker(g1_toshare, g_list_toshare):
            global G_g1, G_g_list
            G_g1 = g1_toshare
            G_g_list = g_list_toshare

        do_fun = self._wrapper_kernel_list_do

        def func_assign(result, var_to_assign):
            var_to_assign[result[0]] = result[1]

        itr = range(len(g_list))
        len_itr = len(g_list)
        parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
                    init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered',
                    n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose)

        return kernel_list

    def _wrapper_kernel_list_do(self, itr):
        return itr, self._kernel_do(G_g1, G_g_list[itr])

    def _compute_single_kernel_series(self, g1, g2):
        self._add_dummy_labels([g1] + [g2])
        if self._remove_totters:
            g1 = untotterTransformation(g1, self._node_labels, self._edge_labels)  # @todo: this may not work.
            g2 = untotterTransformation(g2, self._node_labels, self._edge_labels)
        kernel = self._kernel_do(g1, g2)
        return kernel

    def _kernel_do(self, g1, g2):
        """Compute the marginalized graph kernel between two graphs.

        Parameters
        ----------
        g1, g2 : NetworkX graphs
            Two graphs between which the kernel is computed.

        Returns
        -------
        kernel : float
            Marginalized kernel between the two graphs.
        """
        # init parameters
        kernel = 0
        num_nodes_G1 = nx.number_of_nodes(g1)
        num_nodes_G2 = nx.number_of_nodes(g2)
        # the initial probability distribution in the random walks generating
        # step (uniform distribution over |G|)
        p_init_G1 = 1 / num_nodes_G1
        p_init_G2 = 1 / num_nodes_G2
        q = self._p_quit * self._p_quit
        r1 = q

#        # initial R_inf
#        # matrix to save all the R_inf for all pairs of nodes
#        R_inf = np.zeros([num_nodes_G1, num_nodes_G2])
#
#        # Compute R_inf with a simple iterative method
#        for i in range(1, n_iteration):
#            R_inf_new = np.zeros([num_nodes_G1, num_nodes_G2])
#            R_inf_new.fill(r1)
#
#            # Compute R_inf for each pair of nodes
#            for node1 in g1.nodes(data=True):
#                neighbor_n1 = g1[node1[0]]
#                # the transition probability distribution in the random walks
#                # generating step (uniform distribution over the vertices
#                # adjacent to the current vertex)
#                if len(neighbor_n1) > 0:
#                    p_trans_n1 = (1 - p_quit) / len(neighbor_n1)
#                    for node2 in g2.nodes(data=True):
#                        neighbor_n2 = g2[node2[0]]
#                        if len(neighbor_n2) > 0:
#                            p_trans_n2 = (1 - p_quit) / len(neighbor_n2)
#
#                            for neighbor1 in neighbor_n1:
#                                for neighbor2 in neighbor_n2:
#                                    t = p_trans_n1 * p_trans_n2 * \
#                                        deltakernel(g1.node[neighbor1][node_label],
#                                                    g2.node[neighbor2][node_label]) * \
#                                        deltakernel(
#                                            neighbor_n1[neighbor1][edge_label],
#                                            neighbor_n2[neighbor2][edge_label])
#
#                                    R_inf_new[node1[0]][node2[0]] += t * R_inf[
#                                        neighbor1][neighbor2]  # ref [1] equation (8)
#            R_inf[:] = R_inf_new
#
#        # add elements of R_inf up and compute kernel
#        for node1 in g1.nodes(data=True):
#            for node2 in g2.nodes(data=True):
#                s = p_init_G1 * p_init_G2 * deltakernel(
#                    node1[1][node_label], node2[1][node_label])
#                kernel += s * R_inf[node1[0]][node2[0]]  # ref [1] equation (6)

        R_inf = {}  # dict to save all the R_inf for all pairs of nodes
        # initial R_inf, the 1st iteration.
        for node1 in g1.nodes():
            for node2 in g2.nodes():
#                R_inf[(node1[0], node2[0])] = r1
                if len(g1[node1]) > 0:
                    if len(g2[node2]) > 0:
                        R_inf[(node1, node2)] = r1
                    else:
                        R_inf[(node1, node2)] = self._p_quit
                else:
                    if len(g2[node2]) > 0:
                        R_inf[(node1, node2)] = self._p_quit
                    else:
                        R_inf[(node1, node2)] = 1

        # compute all transition probabilities first.
        t_dict = {}
        if self._n_iteration > 1:
            for node1 in g1.nodes():
                neighbor_n1 = g1[node1]
                # the transition probability distribution in the random walks
                # generating step (uniform distribution over the vertices
                # adjacent to the current vertex)
                if len(neighbor_n1) > 0:
                    p_trans_n1 = (1 - self._p_quit) / len(neighbor_n1)
                    for node2 in g2.nodes():
                        neighbor_n2 = g2[node2]
                        if len(neighbor_n2) > 0:
                            p_trans_n2 = (1 - self._p_quit) / len(neighbor_n2)
                            for neighbor1 in neighbor_n1:
                                for neighbor2 in neighbor_n2:
                                    t_dict[(node1, node2, neighbor1, neighbor2)] = (
                                        p_trans_n1 * p_trans_n2
                                        * deltakernel(
                                            tuple(g1.nodes[neighbor1][nl] for nl in self._node_labels),
                                            tuple(g2.nodes[neighbor2][nl] for nl in self._node_labels))
                                        * deltakernel(
                                            tuple(neighbor_n1[neighbor1][el] for el in self._edge_labels),
                                            tuple(neighbor_n2[neighbor2][el] for el in self._edge_labels)))

        # Compute R_inf with a simple iterative method
        for i in range(2, self._n_iteration + 1):
            R_inf_old = R_inf.copy()

            # Compute R_inf for each pair of nodes
            for node1 in g1.nodes():
                neighbor_n1 = g1[node1]
                # the transition probability distribution in the random walks
                # generating step (uniform distribution over the vertices
                # adjacent to the current vertex)
                if len(neighbor_n1) > 0:
                    for node2 in g2.nodes():
                        neighbor_n2 = g2[node2]
                        if len(neighbor_n2) > 0:
                            R_inf[(node1, node2)] = r1
                            for neighbor1 in neighbor_n1:
                                for neighbor2 in neighbor_n2:
                                    R_inf[(node1, node2)] += (
                                        t_dict[(node1, node2, neighbor1, neighbor2)]
                                        * R_inf_old[(neighbor1, neighbor2)])  # ref [1] equation (8)

        # add elements of R_inf up and compute the kernel.
        for (n1, n2), value in R_inf.items():
            s = p_init_G1 * p_init_G2 * deltakernel(
                tuple(g1.nodes[n1][nl] for nl in self._node_labels),
                tuple(g2.nodes[n2][nl] for nl in self._node_labels))
            kernel += s * value  # ref [1] equation (6)

        return kernel
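
    # A rough cost note for _kernel_do above: with t_dict precomputed, each
    # pass of the fixed-point loop visits every node pair and every pair of
    # their neighbours, i.e. O(|V1| * |V2| * d1 * d2) work for average degrees
    # d1 and d2, so the full evaluation takes
    # O(n_iteration * |V1| * |V2| * d1 * d2) time, plus
    # O(|V1| * |V2| * d1 * d2) memory for t_dict.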

    def _wrapper_kernel_do(self, itr):
        i = itr[0]
        j = itr[1]
        return i, j, self._kernel_do(G_gn[i], G_gn[j])

    def _wrapper_untotter(self, i):
        return i, untotterTransformation(self._graphs[i], self._node_labels, self._edge_labels)  # @todo: this may not work.

    def _add_dummy_labels(self, Gn):
        if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY):
            for i in range(len(Gn)):
                nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY)
            self._node_labels = [SpecialLabel.DUMMY]
        if len(self._edge_labels) == 0 or (len(self._edge_labels) == 1 and self._edge_labels[0] == SpecialLabel.DUMMY):
            for i in range(len(Gn)):
                nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY)
            self._edge_labels = [SpecialLabel.DUMMY]
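
A minimal usage sketch (not part of marginalized.py): it builds two small labelled graphs and computes their 2x2 Gram matrix. The toy graphs, the 'atom' and 'bond_type' attribute names, and the compute() entry point inherited from GraphKernel (following gklearn's usual "gram_matrix, run_time = kernel.compute(...)" pattern) are illustrative assumptions, not requirements fixed by this file.

import networkx as nx
from gklearn.kernels import Marginalized

def toy_graph(node_labels, edges):
    # Build a small labelled graph; the attribute names are arbitrary.
    g = nx.Graph()
    for i, label in enumerate(node_labels):
        g.add_node(i, atom=label)
    for u, v, bond in edges:
        g.add_edge(u, v, bond_type=bond)
    return g

g1 = toy_graph(['C', 'O', 'C'], [(0, 1, '1'), (1, 2, '1')])
g2 = toy_graph(['C', 'O'], [(0, 1, '1')])

kernel = Marginalized(node_labels=['atom'], edge_labels=['bond_type'],
                      p_quit=0.5, n_iteration=10, remove_totters=False)
# compute() is assumed from the GraphKernel base class; parallel=None is
# assumed to select the series code path (_compute_gm_series).
gram_matrix, run_time = kernel.compute([g1, g2], parallel=None, verbose=0)
print(gram_matrix)  # symmetric 2x2 Gram matrix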

A Python package for graph kernels, graph edit distances and graph pre-image problem.