You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

marginalizedKernel.py 8.3 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232
  1. """
  2. @author: linlin
  3. @references:
  4. [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between
  5. labeled graphs. In Proceedings of the 20th International Conference on
  6. Machine Learning, Washington, DC, United States, 2003.
  7. [2] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and
  8. Jean-Philippe Vert. Extensions of marginalized graph kernels. In
  9. Proceedings of the twenty-first international conference on Machine
  10. learning, page 70. ACM, 2004.
  11. """
  12. import sys
  13. import time
  14. from itertools import combinations_with_replacement
  15. from functools import partial
  16. from multiprocessing import Pool
  17. from tqdm import tqdm
  18. tqdm.monitor_interval = 0
  19. import traceback
  20. import networkx as nx
  21. import numpy as np
  22. from pygraph.utils.kernels import deltakernel
  23. from pygraph.utils.utils import untotterTransformation
  24. from pygraph.utils.graphdataset import get_dataset_attributes
  25. sys.path.insert(0, "../")
  26. def marginalizedkernel(*args,
  27. node_label='atom',
  28. edge_label='bond_type',
  29. p_quit=0.5,
  30. n_iteration=20,
  31. remove_totters=True,
  32. n_jobs=None):
  33. """Calculate marginalized graph kernels between graphs.
  34. Parameters
  35. ----------
  36. Gn : List of NetworkX graph
  37. List of graphs between which the kernels are calculated.
  38. /
  39. G1, G2 : NetworkX graphs
  40. 2 graphs between which the kernel is calculated.
  41. node_label : string
  42. node attribute used as label. The default node label is atom.
  43. edge_label : string
  44. edge attribute used as label. The default edge label is bond_type.
  45. p_quit : integer
  46. the termination probability in the random walks generating step
  47. n_iteration : integer
  48. time of iterations to calculate R_inf
  49. remove_totters : boolean
  50. whether to remove totters. The default value is True.
  51. Return
  52. ------
  53. Kmatrix : Numpy matrix
  54. Kernel matrix, each element of which is the marginalized kernel between
  55. 2 praphs.
  56. """
  57. # pre-process
  58. n_iteration = int(n_iteration)
  59. Gn = args[0] if len(args) == 1 else [args[0], args[1]]
  60. ds_attrs = get_dataset_attributes(
  61. Gn,
  62. attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
  63. node_label=node_label, edge_label=edge_label)
  64. if not ds_attrs['node_labeled']:
  65. for G in Gn:
  66. nx.set_node_attributes(G, '0', 'atom')
  67. if not ds_attrs['edge_labeled']:
  68. for G in Gn:
  69. nx.set_edge_attributes(G, '0', 'bond_type')
  70. start_time = time.time()
  71. if remove_totters:
  72. # ---- use pool.imap_unordered to parallel and track progress. ----
  73. pool = Pool(n_jobs)
  74. untotter_partial = partial(wrap_untotter, Gn, node_label, edge_label)
  75. if len(Gn) < 1000 * n_jobs:
  76. chunksize = int(len(Gn) / n_jobs) + 1
  77. else:
  78. chunksize = 1000
  79. for i, g in tqdm(
  80. pool.imap_unordered(
  81. untotter_partial, range(0, len(Gn)), chunksize),
  82. desc='removing tottering',
  83. file=sys.stdout):
  84. Gn[i] = g
  85. pool.close()
  86. pool.join()
  87. # # ---- direct running, normally use single CPU core. ----
  88. # Gn = [
  89. # untotterTransformation(G, node_label, edge_label)
  90. # for G in tqdm(Gn, desc='removing tottering', file=sys.stdout)
  91. # ]
  92. Kmatrix = np.zeros((len(Gn), len(Gn)))
  93. # ---- use pool.imap_unordered to parallel and track progress. ----
  94. pool = Pool(n_jobs)
  95. do_partial = partial(_marginalizedkernel_do, Gn, node_label, edge_label,
  96. p_quit, n_iteration)
  97. itr = combinations_with_replacement(range(0, len(Gn)), 2)
  98. len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
  99. if len_itr < 1000 * n_jobs:
  100. chunksize = int(len_itr / n_jobs) + 1
  101. else:
  102. chunksize = 1000
  103. for i, j, kernel in tqdm(
  104. pool.imap_unordered(do_partial, itr, chunksize),
  105. desc='calculating kernels',
  106. file=sys.stdout):
  107. Kmatrix[i][j] = kernel
  108. Kmatrix[j][i] = kernel
  109. pool.close()
  110. pool.join()
  111. # # ---- direct running, normally use single CPU core. ----
  112. # pbar = tqdm(
  113. # total=(1 + len(Gn)) * len(Gn) / 2,
  114. # desc='calculating kernels',
  115. # file=sys.stdout)
  116. # for i in range(0, len(Gn)):
  117. # for j in range(i, len(Gn)):
  118. # Kmatrix[i][j] = _marginalizedkernel_do(Gn[i], Gn[j], node_label,
  119. # edge_label, p_quit, n_iteration)
  120. # Kmatrix[j][i] = Kmatrix[i][j]
  121. # pbar.update(1)
  122. run_time = time.time() - start_time
  123. print(
  124. "\n --- marginalized kernel matrix of size %d built in %s seconds ---"
  125. % (len(Gn), run_time))
  126. return Kmatrix, run_time
def _marginalizedkernel_do(Gn, node_label, edge_label, p_quit, n_iteration, ij):
    """Calculate marginalized graph kernel between 2 graphs.

    Worker for pool.imap_unordered: receives the whole graph list plus a
    pair of indices, so only small picklable tuples travel through the
    multiprocessing queue.

    Parameters
    ----------
    Gn : list of NetworkX graphs
        the full dataset; the pair to compare is selected via ``ij``.
    node_label : string
        node attribute used as label.
    edge_label : string
        edge attribute used as label.
    p_quit : float
        the termination probability in the random walks generating step.
    n_iteration : integer
        time of iterations to calculate R_inf.
    ij : tuple of two integers
        (i, j) indices into Gn of the pair of graphs to compare.

    Return
    ------
    (iglobal, jglobal, kernel) : tuple of (int, int, float)
        the input indices echoed back (so the caller can place the result
        in the kernel matrix) and the marginalized kernel between the pair.
    """
    try:
        # init parameters
        iglobal = ij[0]
        jglobal = ij[1]
        g1 = Gn[iglobal]
        g2 = Gn[jglobal]
        kernel = 0
        num_nodes_G1 = nx.number_of_nodes(g1)
        num_nodes_G2 = nx.number_of_nodes(g2)
        # the initial probability distribution in the random walks generating step
        # (uniform distribution over |G|)
        p_init_G1 = 1 / num_nodes_G1
        p_init_G2 = 1 / num_nodes_G2

        # joint termination probability of the two walks
        q = p_quit * p_quit
        r1 = q

        # initial R_inf
        # matrix to save all the R_inf for all pairs of nodes
        # NOTE(review): indexing R_inf with node ids directly assumes nodes
        # are labeled 0..n-1 — TODO confirm against the dataset loader.
        R_inf = np.zeros([num_nodes_G1, num_nodes_G2])

        # calculate R_inf with a simple interative method
        for i in range(1, n_iteration):
            R_inf_new = np.zeros([num_nodes_G1, num_nodes_G2])
            R_inf_new.fill(r1)

            # calculate R_inf for each pair of nodes
            for node1 in g1.nodes(data=True):
                neighbor_n1 = g1[node1[0]]
                # the transition probability distribution in the random walks
                # generating step (uniform distribution over the vertices adjacent
                # to the current vertex)
                # NOTE(review): divides by the degree — assumes no isolated
                # nodes, otherwise this raises ZeroDivisionError.
                p_trans_n1 = (1 - p_quit) / len(neighbor_n1)
                for node2 in g2.nodes(data=True):
                    neighbor_n2 = g2[node2[0]]
                    p_trans_n2 = (1 - p_quit) / len(neighbor_n2)

                    # sum over neighbor pairs, matching node and edge labels
                    # via deltakernel (uses the legacy networkx `.node` view)
                    for neighbor1 in neighbor_n1:
                        for neighbor2 in neighbor_n2:
                            t = p_trans_n1 * p_trans_n2 * \
                                deltakernel(g1.node[neighbor1][node_label],
                                            g2.node[neighbor2][node_label]) * \
                                deltakernel(
                                    neighbor_n1[neighbor1][edge_label],
                                    neighbor_n2[neighbor2][edge_label])

                            R_inf_new[node1[0]][node2[0]] += t * R_inf[neighbor1][
                                neighbor2]  # ref [1] equation (8)
            R_inf[:] = R_inf_new

        # add elements of R_inf up and calculate kernel
        for node1 in g1.nodes(data=True):
            for node2 in g2.nodes(data=True):
                s = p_init_G1 * p_init_G2 * deltakernel(
                    node1[1][node_label], node2[1][node_label])
                kernel += s * R_inf[node1[0]][node2[0]]  # ref [1] equation (6)

        return iglobal, jglobal, kernel

    except Exception as e:
        # print the traceback here: inside a Pool worker the stack context
        # would otherwise be lost before the exception reaches the parent
        traceback.print_exc()
        print('')
        raise e
  200. def wrap_untotter(Gn, node_label, edge_label, i):
  201. return i, untotterTransformation(Gn[i], node_label, edge_label)

A Python package for graph kernels, graph edit distances and graph pre-image problem.