
marginalized.py 12 kB

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 3 22:22:57 2020

@author: ljia

@references:

    [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between
        labeled graphs. In Proceedings of the 20th International Conference
        on Machine Learning, Washington, DC, United States, 2003.

    [2] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and
        Jean-Philippe Vert. Extensions of marginalized graph kernels. In
        Proceedings of the Twenty-First International Conference on Machine
        Learning, page 70. ACM, 2004.
"""
import sys
from itertools import combinations_with_replacement
from multiprocessing import Pool

import numpy as np
import networkx as nx
from tqdm import tqdm

from gklearn.utils import SpecialLabel
from gklearn.utils.kernels import deltakernel
from gklearn.utils.parallel import parallel_gm, parallel_me
from gklearn.utils.utils import untotterTransformation
from gklearn.kernels import GraphKernel


class Marginalized(GraphKernel):

    def __init__(self, **kwargs):
        GraphKernel.__init__(self)
        self.__node_labels = kwargs.get('node_labels', [])
        self.__edge_labels = kwargs.get('edge_labels', [])
        self.__p_quit = kwargs.get('p_quit', 0.5)
        self.__n_iteration = int(kwargs.get('n_iteration', 10))
        self.__remove_totters = kwargs.get('remove_totters', False)
        self.__ds_infos = kwargs.get('ds_infos', {})
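
    # Notes on the constructor arguments above, inferred from how they are
    # used below: p_quit is the probability that a random walk stops at each
    # step, n_iteration bounds the fixed-point iteration that approximates
    # R_inf in __kernel_do, and remove_totters applies the tottering-removal
    # transformation of ref [2] before any kernel is computed.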

    def _compute_gm_series(self):
        self.__add_dummy_labels(self._graphs)

        if self.__remove_totters:
            if self._verbose >= 2:
                iterator = tqdm(self._graphs, desc='removing tottering', file=sys.stdout)
            else:
                iterator = self._graphs
            # @todo: this may not work. self.__node_label and self.__edge_label
            # are never defined; only the __node_labels / __edge_labels lists
            # are set in __init__.
            self._graphs = [untotterTransformation(G, self.__node_label, self.__edge_label) for G in iterator]

        # compute Gram matrix.
        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

        itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
        if self._verbose >= 2:
            iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout)
        else:
            iterator = itr
        for i, j in iterator:
            kernel = self.__kernel_do(self._graphs[i], self._graphs[j])
            gram_matrix[i][j] = kernel
            gram_matrix[j][i] = kernel  # @todo: no directed graph considered?

        return gram_matrix

    def _compute_gm_imap_unordered(self):
        self.__add_dummy_labels(self._graphs)

        if self.__remove_totters:
            pool = Pool(self._n_jobs)
            itr = range(0, len(self._graphs))
            if len(self._graphs) < 100 * self._n_jobs:
                chunksize = int(len(self._graphs) / self._n_jobs) + 1
            else:
                chunksize = 100
            remove_fun = self._wrapper_untotter
            if self._verbose >= 2:
                iterator = tqdm(pool.imap_unordered(remove_fun, itr, chunksize),
                                desc='removing tottering', file=sys.stdout)
            else:
                iterator = pool.imap_unordered(remove_fun, itr, chunksize)
            for i, g in iterator:
                self._graphs[i] = g
            pool.close()
            pool.join()

        # compute Gram matrix.
        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

        # init_worker shares the graph list with worker processes through a
        # module-level global, so it is pickled once per worker rather than
        # once per task.
        def init_worker(gn_toshare):
            global G_gn
            G_gn = gn_toshare
        do_fun = self._wrapper_kernel_do
        parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
                    glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)

        return gram_matrix

    def _compute_kernel_list_series(self, g1, g_list):
        self.__add_dummy_labels(g_list + [g1])

        if self.__remove_totters:
            # @todo: this may not work.
            g1 = untotterTransformation(g1, self.__node_label, self.__edge_label)
            if self._verbose >= 2:
                iterator = tqdm(g_list, desc='removing tottering', file=sys.stdout)
            else:
                iterator = g_list
            # @todo: this may not work.
            g_list = [untotterTransformation(G, self.__node_label, self.__edge_label) for G in iterator]

        # compute kernel list.
        kernel_list = [None] * len(g_list)
        if self._verbose >= 2:
            iterator = tqdm(range(len(g_list)), desc='calculating kernels', file=sys.stdout)
        else:
            iterator = range(len(g_list))
        for i in iterator:
            kernel = self.__kernel_do(g1, g_list[i])
            kernel_list[i] = kernel

        return kernel_list

    def _compute_kernel_list_imap_unordered(self, g1, g_list):
        self.__add_dummy_labels(g_list + [g1])

        if self.__remove_totters:
            # @todo: this may not work.
            g1 = untotterTransformation(g1, self.__node_label, self.__edge_label)
            pool = Pool(self._n_jobs)
            itr = range(0, len(g_list))
            if len(g_list) < 100 * self._n_jobs:
                chunksize = int(len(g_list) / self._n_jobs) + 1
            else:
                chunksize = 100
            remove_fun = self._wrapper_untotter
            if self._verbose >= 2:
                iterator = tqdm(pool.imap_unordered(remove_fun, itr, chunksize),
                                desc='removing tottering', file=sys.stdout)
            else:
                iterator = pool.imap_unordered(remove_fun, itr, chunksize)
            for i, g in iterator:
                g_list[i] = g
            pool.close()
            pool.join()

        # compute kernel list.
        kernel_list = [None] * len(g_list)

        def init_worker(g1_toshare, g_list_toshare):
            global G_g1, G_g_list
            G_g1 = g1_toshare
            G_g_list = g_list_toshare
        do_fun = self._wrapper_kernel_list_do

        def func_assign(result, var_to_assign):
            var_to_assign[result[0]] = result[1]
        itr = range(len(g_list))
        len_itr = len(g_list)
        parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
                    init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered',
                    n_jobs=self._n_jobs, itr_desc='calculating kernels', verbose=self._verbose)

        return kernel_list

    def _wrapper_kernel_list_do(self, itr):
        return itr, self.__kernel_do(G_g1, G_g_list[itr])

    def _compute_single_kernel_series(self, g1, g2):
        self.__add_dummy_labels([g1] + [g2])
        if self.__remove_totters:
            # @todo: this may not work.
            g1 = untotterTransformation(g1, self.__node_label, self.__edge_label)
            g2 = untotterTransformation(g2, self.__node_label, self.__edge_label)
        kernel = self.__kernel_do(g1, g2)
        return kernel

    def __kernel_do(self, g1, g2):
        """Compute the marginalized graph kernel between two graphs.

        Parameters
        ----------
        g1, g2 : NetworkX graphs
            Two graphs between which the kernel is computed.

        Returns
        -------
        kernel : float
            Marginalized kernel between the two graphs.
        """
        # init parameters
        kernel = 0
        num_nodes_G1 = nx.number_of_nodes(g1)
        num_nodes_G2 = nx.number_of_nodes(g2)
        # the initial probability distribution in the random-walk generating
        # step (uniform distribution over |G|)
        p_init_G1 = 1 / num_nodes_G1
        p_init_G2 = 1 / num_nodes_G2
        q = self.__p_quit * self.__p_quit
        r1 = q

        # # initial R_inf
        # # matrix to save all the R_inf for all pairs of nodes
        # R_inf = np.zeros([num_nodes_G1, num_nodes_G2])
        #
        # # calculate R_inf with a simple iterative method
        # for i in range(1, n_iteration):
        #     R_inf_new = np.zeros([num_nodes_G1, num_nodes_G2])
        #     R_inf_new.fill(r1)
        #
        #     # calculate R_inf for each pair of nodes
        #     for node1 in g1.nodes(data=True):
        #         neighbor_n1 = g1[node1[0]]
        #         # the transition probability distribution in the random-walk
        #         # generating step (uniform distribution over the vertices
        #         # adjacent to the current vertex)
        #         if len(neighbor_n1) > 0:
        #             p_trans_n1 = (1 - p_quit) / len(neighbor_n1)
        #             for node2 in g2.nodes(data=True):
        #                 neighbor_n2 = g2[node2[0]]
        #                 if len(neighbor_n2) > 0:
        #                     p_trans_n2 = (1 - p_quit) / len(neighbor_n2)
        #
        #                     for neighbor1 in neighbor_n1:
        #                         for neighbor2 in neighbor_n2:
        #                             t = p_trans_n1 * p_trans_n2 * \
        #                                 deltakernel(g1.node[neighbor1][node_label],
        #                                             g2.node[neighbor2][node_label]) * \
        #                                 deltakernel(neighbor_n1[neighbor1][edge_label],
        #                                             neighbor_n2[neighbor2][edge_label])
        #
        #                             R_inf_new[node1[0]][node2[0]] += t * \
        #                                 R_inf[neighbor1][neighbor2]  # ref [1] equation (8)
        #     R_inf[:] = R_inf_new
        #
        # # add elements of R_inf up and calculate kernel
        # for node1 in g1.nodes(data=True):
        #     for node2 in g2.nodes(data=True):
        #         s = p_init_G1 * p_init_G2 * deltakernel(
        #             node1[1][node_label], node2[1][node_label])
        #         kernel += s * R_inf[node1[0]][node2[0]]  # ref [1] equation (6)

        R_inf = {}  # dict to save all the R_inf for all pairs of nodes
        # initial R_inf, the 1st iteration.
        for node1 in g1.nodes():
            for node2 in g2.nodes():
                # R_inf[(node1[0], node2[0])] = r1
                if len(g1[node1]) > 0:
                    if len(g2[node2]) > 0:
                        R_inf[(node1, node2)] = r1
                    else:
                        R_inf[(node1, node2)] = self.__p_quit
                else:
                    if len(g2[node2]) > 0:
                        R_inf[(node1, node2)] = self.__p_quit
                    else:
                        R_inf[(node1, node2)] = 1
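        # Base cases above: a node with no neighbors forces the walk to stop
        # there with probability 1 instead of p_quit, so the initial value is
        # p_quit ** 2, p_quit or 1 depending on whether both, one or neither
        # of the two nodes has neighbors.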

        # compute all transition probabilities first.
        t_dict = {}
        if self.__n_iteration > 1:
            for node1 in g1.nodes():
                neighbor_n1 = g1[node1]
                # the transition probability distribution in the random-walk
                # generating step (uniform distribution over the vertices
                # adjacent to the current vertex)
                if len(neighbor_n1) > 0:
                    p_trans_n1 = (1 - self.__p_quit) / len(neighbor_n1)
                    for node2 in g2.nodes():
                        neighbor_n2 = g2[node2]
                        if len(neighbor_n2) > 0:
                            p_trans_n2 = (1 - self.__p_quit) / len(neighbor_n2)
                            for neighbor1 in neighbor_n1:
                                for neighbor2 in neighbor_n2:
                                    t_dict[(node1, node2, neighbor1, neighbor2)] = \
                                        p_trans_n1 * p_trans_n2 * \
                                        deltakernel(tuple(g1.nodes[neighbor1][nl] for nl in self.__node_labels),
                                                    tuple(g2.nodes[neighbor2][nl] for nl in self.__node_labels)) * \
                                        deltakernel(tuple(neighbor_n1[neighbor1][el] for el in self.__edge_labels),
                                                    tuple(neighbor_n2[neighbor2][el] for el in self.__edge_labels))
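        # Caching t_dict means each label delta kernel is evaluated once per
        # (node, neighbor) pair rather than once per round of the fixed-point
        # iteration below.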

        # calculate R_inf with a simple iterative method
        for i in range(2, self.__n_iteration + 1):
            R_inf_old = R_inf.copy()

            # calculate R_inf for each pair of nodes
            for node1 in g1.nodes():
                neighbor_n1 = g1[node1]
                # the transition probability distribution in the random-walk
                # generating step (uniform distribution over the vertices
                # adjacent to the current vertex)
                if len(neighbor_n1) > 0:
                    for node2 in g2.nodes():
                        neighbor_n2 = g2[node2]
                        if len(neighbor_n2) > 0:
                            R_inf[(node1, node2)] = r1
                            for neighbor1 in neighbor_n1:
                                for neighbor2 in neighbor_n2:
                                    R_inf[(node1, node2)] += \
                                        (t_dict[(node1, node2, neighbor1, neighbor2)] *
                                         R_inf_old[(neighbor1, neighbor2)])  # ref [1] equation (8)

        # add elements of R_inf up and calculate kernel
        for (n1, n2), value in R_inf.items():
            s = p_init_G1 * p_init_G2 * deltakernel(
                tuple(g1.nodes[n1][nl] for nl in self.__node_labels),
                tuple(g2.nodes[n2][nl] for nl in self.__node_labels))
            kernel += s * value  # ref [1] equation (6)

        return kernel

    def _wrapper_kernel_do(self, itr):
        i = itr[0]
        j = itr[1]
        return i, j, self.__kernel_do(G_gn[i], G_gn[j])

    def _wrapper_untotter(self, i):
        # @todo: this may not work.
        return i, untotterTransformation(self._graphs[i], self.__node_label, self.__edge_label)

    def __add_dummy_labels(self, Gn):
        if len(self.__node_labels) == 0 or (len(self.__node_labels) == 1 and self.__node_labels[0] == SpecialLabel.DUMMY):
            for i in range(len(Gn)):
                nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY)
            self.__node_labels = [SpecialLabel.DUMMY]
        if len(self.__edge_labels) == 0 or (len(self.__edge_labels) == 1 and self.__edge_labels[0] == SpecialLabel.DUMMY):
            for i in range(len(Gn)):
                nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY)
            self.__edge_labels = [SpecialLabel.DUMMY]

A Python package for graph kernels, graph edit distances and the graph pre-image problem.
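
A minimal usage sketch (the two toy graphs and their 'atom'/'bond_type' labels are made up for illustration; it assumes the compute() entry point of graphkit-learn's GraphKernel base class, which is not shown in this file and whose signature may differ between versions):

import networkx as nx
from gklearn.kernels import Marginalized

# two toy molecule-like graphs with node label 'atom' and edge label 'bond_type'
g1 = nx.Graph()
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
g1.add_edge(0, 1, bond_type='1')

g2 = nx.Graph()
g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'O'})])
g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '2'})])

# remove_totters is left False because the untotterTransformation calls are
# flagged @todo in this file.
kernel = Marginalized(node_labels=['atom'], edge_labels=['bond_type'],
                      p_quit=0.5, n_iteration=20, remove_totters=False)
gram_matrix, run_time = kernel.compute([g1, g2], parallel='imap_unordered',
                                       n_jobs=2, verbose=0)
print(gram_matrix)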