
marginalizedKernel.py
  1. """
  2. @author: linlin
  3. @references:
  4. [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between
  5. labeled graphs. In Proceedings of the 20th International Conference on
  6. Machine Learning, Washington, DC, United States, 2003.
  7. [2] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and
  8. Jean-Philippe Vert. Extensions of marginalized graph kernels. In
  9. Proceedings of the twenty-first international conference on Machine
  10. learning, page 70. ACM, 2004.
  11. """
  12. import sys
  13. import time
  14. from functools import partial
  15. from multiprocessing import Pool
  16. from tqdm import tqdm
  17. tqdm.monitor_interval = 0
  18. #import traceback
  19. import networkx as nx
  20. import numpy as np
  21. from gklearn.utils.kernels import deltakernel
  22. from gklearn.utils.utils import untotterTransformation
  23. from gklearn.utils.graphdataset import get_dataset_attributes
  24. from gklearn.utils.parallel import parallel_gm


def marginalizedkernel(*args,
                       node_label='atom',
                       edge_label='bond_type',
                       p_quit=0.5,
                       n_iteration=20,
                       remove_totters=False,
                       n_jobs=None,
                       chunksize=None,
                       verbose=True):
  34. """Compute marginalized graph kernels between graphs.
  35. Parameters
  36. ----------
  37. Gn : List of NetworkX graph
  38. List of graphs between which the kernels are computed.
  39. G1, G2 : NetworkX graphs
  40. Two graphs between which the kernel is computed.
  41. node_label : string
  42. Node attribute used as symbolic label. The default node label is 'atom'.
  43. edge_label : string
  44. Edge attribute used as symbolic label. The default edge label is 'bond_type'.
  45. p_quit : integer
  46. The termination probability in the random walks generating step.
  47. n_iteration : integer
  48. Time of iterations to compute R_inf.
  49. remove_totters : boolean
  50. Whether to remove totterings by method introduced in [2]. The default
  51. value is False.
  52. n_jobs : int
  53. Number of jobs for parallelization.
  54. Return
  55. ------
  56. Kmatrix : Numpy matrix
  57. Kernel matrix, each element of which is the marginalized kernel between
  58. 2 praphs.
  59. """
    # pre-process
    n_iteration = int(n_iteration)
    Gn = args[0][:] if len(args) == 1 else [args[0].copy(), args[1].copy()]
    Gn = [g.copy() for g in Gn]

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label, edge_label=edge_label)
    if not ds_attrs['node_labeled'] or node_label is None:
        node_label = 'atom'
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
    if not ds_attrs['edge_labeled'] or edge_label is None:
        edge_label = 'bond_type'
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    if remove_totters:
        # ---- use pool.imap_unordered to parallelize and track progress. ----
        pool = Pool(n_jobs)
        untotter_partial = partial(wrapper_untotter, Gn, node_label, edge_label)
        if chunksize is None:
            # guard against the default n_jobs=None when sizing chunks.
            n_workers = n_jobs if n_jobs is not None else cpu_count()
            if len(Gn) < 100 * n_workers:
                chunksize = int(len(Gn) / n_workers) + 1
            else:
                chunksize = 100
        for i, g in tqdm(
                pool.imap_unordered(
                    untotter_partial, range(0, len(Gn)), chunksize),
                desc='removing tottering',
                file=sys.stdout):
            Gn[i] = g
        pool.close()
        pool.join()

        # # ---- direct running, normally use single CPU core. ----
        # Gn = [
        #     untotterTransformation(G, node_label, edge_label)
        #     for G in tqdm(Gn, desc='removing tottering', file=sys.stdout)
        # ]

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallelize and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare
    do_partial = partial(wrapper_marg_do, node_label, edge_label,
                         p_quit, n_iteration)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose)

    # # ---- direct running, normally use single CPU core. ----
    # pbar = tqdm(
    #     total=(1 + len(Gn)) * len(Gn) / 2,
    #     desc='Computing kernels',
    #     file=sys.stdout)
    # for i in range(0, len(Gn)):
    #     for j in range(i, len(Gn)):
    #         Kmatrix[i][j] = _marginalizedkernel_do(Gn[i], Gn[j], node_label,
    #                                                edge_label, p_quit,
    #                                                n_iteration)
    #         Kmatrix[j][i] = Kmatrix[i][j]
    #         pbar.update(1)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- marginalized kernel matrix of size %d built in %s seconds ---"
              % (len(Gn), run_time))

    return Kmatrix, run_time


def _marginalizedkernel_do(g1, g2, node_label, edge_label, p_quit, n_iteration):
    """Compute the marginalized graph kernel between two graphs.

    Parameters
    ----------
    g1, g2 : NetworkX graphs
        Two graphs between which the kernel is computed.

    node_label : string
        Node attribute used as label.

    edge_label : string
        Edge attribute used as label.

    p_quit : float
        The termination probability in the random-walk generating step.

    n_iteration : integer
        Number of iterations used to compute R_inf.

    Returns
    -------
    kernel : float
        Marginalized kernel between the two graphs.
    """
    # init parameters
    kernel = 0
    num_nodes_G1 = nx.number_of_nodes(g1)
    num_nodes_G2 = nx.number_of_nodes(g2)
    # the initial probability distribution in the random-walk generating step
    # (uniform distribution over |G|)
    p_init_G1 = 1 / num_nodes_G1
    p_init_G2 = 1 / num_nodes_G2

    q = p_quit * p_quit
    r1 = q
    # # initial R_inf
    # # matrix to save all the R_inf for all pairs of nodes
    # R_inf = np.zeros([num_nodes_G1, num_nodes_G2])
    #
    # # Compute R_inf with a simple iterative method
    # for i in range(1, n_iteration):
    #     R_inf_new = np.zeros([num_nodes_G1, num_nodes_G2])
    #     R_inf_new.fill(r1)
    #
    #     # Compute R_inf for each pair of nodes
    #     for node1 in g1.nodes(data=True):
    #         neighbor_n1 = g1[node1[0]]
    #         # the transition probability distribution in the random walks
    #         # generating step (uniform distribution over the vertices
    #         # adjacent to the current vertex)
    #         if len(neighbor_n1) > 0:
    #             p_trans_n1 = (1 - p_quit) / len(neighbor_n1)
    #             for node2 in g2.nodes(data=True):
    #                 neighbor_n2 = g2[node2[0]]
    #                 if len(neighbor_n2) > 0:
    #                     p_trans_n2 = (1 - p_quit) / len(neighbor_n2)
    #
    #                     for neighbor1 in neighbor_n1:
    #                         for neighbor2 in neighbor_n2:
    #                             t = p_trans_n1 * p_trans_n2 * \
    #                                 deltakernel(g1.nodes[neighbor1][node_label],
    #                                             g2.nodes[neighbor2][node_label]) * \
    #                                 deltakernel(
    #                                     neighbor_n1[neighbor1][edge_label],
    #                                     neighbor_n2[neighbor2][edge_label])
    #
    #                             R_inf_new[node1[0]][node2[0]] += t * R_inf[neighbor1][
    #                                 neighbor2]  # ref [1] equation (8)
    #     R_inf[:] = R_inf_new
    #
    # # add elements of R_inf up and compute kernel.
    # for node1 in g1.nodes(data=True):
    #     for node2 in g2.nodes(data=True):
    #         s = p_init_G1 * p_init_G2 * deltakernel(
    #             node1[1][node_label], node2[1][node_label])
    #         kernel += s * R_inf[node1[0]][node2[0]]  # ref [1] equation (6)
    R_inf = {}  # dict to save all the R_inf for all pairs of nodes

    # initial R_inf, the 1st iteration.
    for node1 in g1.nodes():
        for node2 in g2.nodes():
            if len(g1[node1]) > 0:
                if len(g2[node2]) > 0:
                    R_inf[(node1, node2)] = r1
                else:
                    R_inf[(node1, node2)] = p_quit
            else:
                if len(g2[node2]) > 0:
                    R_inf[(node1, node2)] = p_quit
                else:
                    R_inf[(node1, node2)] = 1

    # compute all transition probabilities first.
    t_dict = {}
    if n_iteration > 1:
        for node1 in g1.nodes():
            neighbor_n1 = g1[node1]
            # the transition probability distribution in the random-walk
            # generating step (uniform distribution over the vertices adjacent
            # to the current vertex)
            if len(neighbor_n1) > 0:
                p_trans_n1 = (1 - p_quit) / len(neighbor_n1)
                for node2 in g2.nodes():
                    neighbor_n2 = g2[node2]
                    if len(neighbor_n2) > 0:
                        p_trans_n2 = (1 - p_quit) / len(neighbor_n2)
                        for neighbor1 in neighbor_n1:
                            for neighbor2 in neighbor_n2:
                                t_dict[(node1, node2, neighbor1, neighbor2)] = \
                                    p_trans_n1 * p_trans_n2 * \
                                    deltakernel(g1.nodes[neighbor1][node_label],
                                                g2.nodes[neighbor2][node_label]) * \
                                    deltakernel(
                                        neighbor_n1[neighbor1][edge_label],
                                        neighbor_n2[neighbor2][edge_label])
    # Compute R_inf with a simple iterative method
    for i in range(2, n_iteration + 1):
        R_inf_old = R_inf.copy()

        # Compute R_inf for each pair of nodes
        for node1 in g1.nodes():
            neighbor_n1 = g1[node1]
            # the transition probability distribution in the random-walk
            # generating step (uniform distribution over the vertices adjacent
            # to the current vertex)
            if len(neighbor_n1) > 0:
                for node2 in g2.nodes():
                    neighbor_n2 = g2[node2]
                    if len(neighbor_n2) > 0:
                        R_inf[(node1, node2)] = r1
                        for neighbor1 in neighbor_n1:
                            for neighbor2 in neighbor_n2:
                                R_inf[(node1, node2)] += \
                                    (t_dict[(node1, node2, neighbor1, neighbor2)] *
                                     R_inf_old[(neighbor1, neighbor2)])  # ref [1] equation (8)

    # add elements of R_inf up and compute the kernel.
    for (n1, n2), value in R_inf.items():
        s = p_init_G1 * p_init_G2 * deltakernel(
            g1.nodes[n1][node_label], g2.nodes[n2][node_label])
        kernel += s * value  # ref [1] equation (6)

    return kernel


def wrapper_marg_do(node_label, edge_label, p_quit, n_iteration, itr):
    i = itr[0]
    j = itr[1]
    return i, j, _marginalizedkernel_do(G_gn[i], G_gn[j], node_label,
                                        edge_label, p_quit, n_iteration)


def wrapper_untotter(Gn, node_label, edge_label, i):
    return i, untotterTransformation(Gn[i], node_label, edge_label)
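

# A minimal smoke test (an illustrative sketch; the two hand-built graphs
# below are hypothetical examples): evaluate the pairwise kernel directly
# via _marginalizedkernel_do, which avoids multiprocessing.
if __name__ == '__main__':
    g1 = nx.Graph()
    g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
    g1.add_edge(0, 1, bond_type='1')

    g2 = nx.Graph()
    g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}),
                       (2, {'atom': 'C'})])
    g2.add_edges_from([(0, 1, {'bond_type': '1'}),
                       (1, 2, {'bond_type': '1'})])

    # the kernel value grows with the number of matching labeled walks.
    print(_marginalizedkernel_do(g1, g2, 'atom', 'bond_type', 0.5, 20))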

A Python package for graph kernels, graph edit distances and the graph pre-image problem.