You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

marginalizedKernel.py 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293
  1. """
  2. @author: linlin
  3. @references:
  4. [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between
  5. labeled graphs. In Proceedings of the 20th International Conference on
  6. Machine Learning, Washington, DC, United States, 2003.
  7. [2] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and
  8. Jean-Philippe Vert. Extensions of marginalized graph kernels. In
  9. Proceedings of the twenty-first international conference on Machine
  10. learning, page 70. ACM, 2004.
  11. """
import sys
import time
from functools import partial
from multiprocessing import Pool, cpu_count

import networkx as nx
import numpy as np
from tqdm import tqdm
tqdm.monitor_interval = 0
#import traceback

# make the local pygraph package importable when run from this directory
sys.path.insert(0, "../")
from pygraph.utils.kernels import deltakernel
from pygraph.utils.utils import untotterTransformation
from pygraph.utils.graphdataset import get_dataset_attributes
from pygraph.utils.parallel import parallel_gm
  26. def marginalizedkernel(*args,
  27. node_label='atom',
  28. edge_label='bond_type',
  29. p_quit=0.5,
  30. n_iteration=20,
  31. remove_totters=False,
  32. n_jobs=None,
  33. verbose=True):
  34. """Calculate marginalized graph kernels between graphs.
  35. Parameters
  36. ----------
  37. Gn : List of NetworkX graph
  38. List of graphs between which the kernels are calculated.
  39. /
  40. G1, G2 : NetworkX graphs
  41. 2 graphs between which the kernel is calculated.
  42. node_label : string
  43. node attribute used as label. The default node label is atom.
  44. edge_label : string
  45. edge attribute used as label. The default edge label is bond_type.
  46. p_quit : integer
  47. the termination probability in the random walks generating step
  48. n_iteration : integer
  49. time of iterations to calculate R_inf
  50. remove_totters : boolean
  51. whether to remove totters. The default value is True.
  52. Return
  53. ------
  54. Kmatrix : Numpy matrix
  55. Kernel matrix, each element of which is the marginalized kernel between
  56. 2 praphs.
  57. """
  58. # pre-process
  59. n_iteration = int(n_iteration)
  60. Gn = args[0][:] if len(args) == 1 else [args[0].copy(), args[1].copy()]
  61. ds_attrs = get_dataset_attributes(
  62. Gn,
  63. attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
  64. node_label=node_label, edge_label=edge_label)
  65. if not ds_attrs['node_labeled'] or node_label == None:
  66. node_label = 'atom'
  67. for G in Gn:
  68. nx.set_node_attributes(G, '0', 'atom')
  69. if not ds_attrs['edge_labeled'] or edge_label == None:
  70. edge_label = 'bond_type'
  71. for G in Gn:
  72. nx.set_edge_attributes(G, '0', 'bond_type')
  73. start_time = time.time()
  74. if remove_totters:
  75. # ---- use pool.imap_unordered to parallel and track progress. ----
  76. pool = Pool(n_jobs)
  77. untotter_partial = partial(wrapper_untotter, Gn, node_label, edge_label)
  78. if len(Gn) < 100 * n_jobs:
  79. chunksize = int(len(Gn) / n_jobs) + 1
  80. else:
  81. chunksize = 100
  82. for i, g in tqdm(
  83. pool.imap_unordered(
  84. untotter_partial, range(0, len(Gn)), chunksize),
  85. desc='removing tottering',
  86. file=sys.stdout):
  87. Gn[i] = g
  88. pool.close()
  89. pool.join()
  90. # # ---- direct running, normally use single CPU core. ----
  91. # Gn = [
  92. # untotterTransformation(G, node_label, edge_label)
  93. # for G in tqdm(Gn, desc='removing tottering', file=sys.stdout)
  94. # ]
  95. Kmatrix = np.zeros((len(Gn), len(Gn)))
  96. # ---- use pool.imap_unordered to parallel and track progress. ----
  97. def init_worker(gn_toshare):
  98. global G_gn
  99. G_gn = gn_toshare
  100. do_partial = partial(wrapper_marg_do, node_label, edge_label,
  101. p_quit, n_iteration)
  102. parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  103. glbv=(Gn,), n_jobs=n_jobs, verbose=verbose)
  104. # # ---- direct running, normally use single CPU core. ----
  105. ## pbar = tqdm(
  106. ## total=(1 + len(Gn)) * len(Gn) / 2,
  107. ## desc='calculating kernels',
  108. ## file=sys.stdout)
  109. # for i in range(0, len(Gn)):
  110. # for j in range(i, len(Gn)):
  111. ## print(i, j)
  112. # Kmatrix[i][j] = _marginalizedkernel_do(Gn[i], Gn[j], node_label,
  113. # edge_label, p_quit, n_iteration)
  114. # Kmatrix[j][i] = Kmatrix[i][j]
  115. ## pbar.update(1)
  116. run_time = time.time() - start_time
  117. if verbose:
  118. print("\n --- marginalized kernel matrix of size %d built in %s seconds ---"
  119. % (len(Gn), run_time))
  120. return Kmatrix, run_time
  121. def _marginalizedkernel_do(g1, g2, node_label, edge_label, p_quit, n_iteration):
  122. """Calculate marginalized graph kernel between 2 graphs.
  123. Parameters
  124. ----------
  125. G1, G2 : NetworkX graphs
  126. 2 graphs between which the kernel is calculated.
  127. node_label : string
  128. node attribute used as label.
  129. edge_label : string
  130. edge attribute used as label.
  131. p_quit : integer
  132. the termination probability in the random walks generating step.
  133. n_iteration : integer
  134. time of iterations to calculate R_inf.
  135. Return
  136. ------
  137. kernel : float
  138. Marginalized Kernel between 2 graphs.
  139. """
  140. # init parameters
  141. kernel = 0
  142. num_nodes_G1 = nx.number_of_nodes(g1)
  143. num_nodes_G2 = nx.number_of_nodes(g2)
  144. # the initial probability distribution in the random walks generating step
  145. # (uniform distribution over |G|)
  146. p_init_G1 = 1 / num_nodes_G1
  147. p_init_G2 = 1 / num_nodes_G2
  148. q = p_quit * p_quit
  149. r1 = q
  150. # # initial R_inf
  151. # # matrix to save all the R_inf for all pairs of nodes
  152. # R_inf = np.zeros([num_nodes_G1, num_nodes_G2])
  153. #
  154. # # calculate R_inf with a simple interative method
  155. # for i in range(1, n_iteration):
  156. # R_inf_new = np.zeros([num_nodes_G1, num_nodes_G2])
  157. # R_inf_new.fill(r1)
  158. #
  159. # # calculate R_inf for each pair of nodes
  160. # for node1 in g1.nodes(data=True):
  161. # neighbor_n1 = g1[node1[0]]
  162. # # the transition probability distribution in the random walks
  163. # # generating step (uniform distribution over the vertices adjacent
  164. # # to the current vertex)
  165. # if len(neighbor_n1) > 0:
  166. # p_trans_n1 = (1 - p_quit) / len(neighbor_n1)
  167. # for node2 in g2.nodes(data=True):
  168. # neighbor_n2 = g2[node2[0]]
  169. # if len(neighbor_n2) > 0:
  170. # p_trans_n2 = (1 - p_quit) / len(neighbor_n2)
  171. #
  172. # for neighbor1 in neighbor_n1:
  173. # for neighbor2 in neighbor_n2:
  174. # t = p_trans_n1 * p_trans_n2 * \
  175. # deltakernel(g1.node[neighbor1][node_label],
  176. # g2.node[neighbor2][node_label]) * \
  177. # deltakernel(
  178. # neighbor_n1[neighbor1][edge_label],
  179. # neighbor_n2[neighbor2][edge_label])
  180. #
  181. # R_inf_new[node1[0]][node2[0]] += t * R_inf[neighbor1][
  182. # neighbor2] # ref [1] equation (8)
  183. # R_inf[:] = R_inf_new
  184. #
  185. # # add elements of R_inf up and calculate kernel
  186. # for node1 in g1.nodes(data=True):
  187. # for node2 in g2.nodes(data=True):
  188. # s = p_init_G1 * p_init_G2 * deltakernel(
  189. # node1[1][node_label], node2[1][node_label])
  190. # kernel += s * R_inf[node1[0]][node2[0]] # ref [1] equation (6)
  191. R_inf = {} # dict to save all the R_inf for all pairs of nodes
  192. # initial R_inf, the 1st iteration.
  193. for node1 in g1.nodes(data=True):
  194. for node2 in g2.nodes(data=True):
  195. # R_inf[(node1[0], node2[0])] = r1
  196. if len(g1[node1[0]]) > 0:
  197. if len(g2[node2[0]]) > 0:
  198. R_inf[(node1[0], node2[0])] = r1
  199. else:
  200. R_inf[(node1[0], node2[0])] = p_quit
  201. else:
  202. if len(g2[node2[0]]) > 0:
  203. R_inf[(node1[0], node2[0])] = p_quit
  204. else:
  205. R_inf[(node1[0], node2[0])] = 1
  206. # compute all transition probability first.
  207. t_dict = {}
  208. if n_iteration > 1:
  209. for node1 in g1.nodes(data=True):
  210. neighbor_n1 = g1[node1[0]]
  211. # the transition probability distribution in the random walks
  212. # generating step (uniform distribution over the vertices adjacent
  213. # to the current vertex)
  214. if len(neighbor_n1) > 0:
  215. p_trans_n1 = (1 - p_quit) / len(neighbor_n1)
  216. for node2 in g2.nodes(data=True):
  217. neighbor_n2 = g2[node2[0]]
  218. if len(neighbor_n2) > 0:
  219. p_trans_n2 = (1 - p_quit) / len(neighbor_n2)
  220. for neighbor1 in neighbor_n1:
  221. for neighbor2 in neighbor_n2:
  222. t_dict[(node1[0], node2[0], neighbor1, neighbor2)] = \
  223. p_trans_n1 * p_trans_n2 * \
  224. deltakernel(g1.node[neighbor1][node_label],
  225. g2.node[neighbor2][node_label]) * \
  226. deltakernel(
  227. neighbor_n1[neighbor1][edge_label],
  228. neighbor_n2[neighbor2][edge_label])
  229. # calculate R_inf with a simple interative method
  230. for i in range(2, n_iteration + 1):
  231. R_inf_old = R_inf.copy()
  232. # calculate R_inf for each pair of nodes
  233. for node1 in g1.nodes(data=True):
  234. neighbor_n1 = g1[node1[0]]
  235. # the transition probability distribution in the random walks
  236. # generating step (uniform distribution over the vertices adjacent
  237. # to the current vertex)
  238. if len(neighbor_n1) > 0:
  239. for node2 in g2.nodes(data=True):
  240. neighbor_n2 = g2[node2[0]]
  241. if len(neighbor_n2) > 0:
  242. R_inf[(node1[0], node2[0])] = r1
  243. for neighbor1 in neighbor_n1:
  244. for neighbor2 in neighbor_n2:
  245. R_inf[(node1[0], node2[0])] += \
  246. (t_dict[(node1[0], node2[0], neighbor1, neighbor2)] * \
  247. R_inf_old[(neighbor1, neighbor2)]) # ref [1] equation (8)
  248. # add elements of R_inf up and calculate kernel
  249. for (n1, n2), value in R_inf.items():
  250. s = p_init_G1 * p_init_G2 * deltakernel(
  251. g1.nodes[n1][node_label], g2.nodes[n2][node_label])
  252. kernel += s * value # ref [1] equation (6)
  253. return kernel
  254. def wrapper_marg_do(node_label, edge_label, p_quit, n_iteration, itr):
  255. i= itr[0]
  256. j = itr[1]
  257. return i, j, _marginalizedkernel_do(G_gn[i], G_gn[j], node_label, edge_label, p_quit, n_iteration)
  258. def wrapper_untotter(Gn, node_label, edge_label, i):
  259. return i, untotterTransformation(Gn[i], node_label, edge_label)

A Python package for graph kernels, graph edit distances and the graph pre-image problem.