
shortest_path.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 7 15:24:58 2020

@author: ljia

@references:

    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In
    Proceedings of the Fifth IEEE International Conference on Data Mining
    (ICDM'05), Nov. 2005, 8 pp. IEEE.
"""
import os
import pickle
import sys
import time
from itertools import product, combinations_with_replacement

import networkx as nx
import numpy as np
from pympler import asizeof

from gklearn.kernels import ShortestPath
from gklearn.utils import get_iters
from gklearn.utils.utils import getSPGraph
def load_results(file_name, fcsp):
    """Load checkpointed results from file_name, or initialize a fresh record."""
    if os.path.isfile(file_name):
        with open(file_name, 'rb') as f:
            return pickle.load(f)
    else:
        results = {'nb_comparison': [], 'i': -1, 'j': -1, 'completed': False}
        if fcsp:
            results['vk_dict_mem'] = []
        return results


def save_results(file_name, results):
    with open(file_name, 'wb') as f:
        pickle.dump(results, f)
def estimate_vk_memory(obj, nb_nodes1, nb_nodes2):
    """Estimate the memory footprint of a vertex-kernel dict, in bytes.

    A fast analytic stand-in for the exact but slow pympler measurement
    asizeof.asizeof(obj).
    """
    key, val = next(iter(obj.items()))
    dict_flat = sys.getsizeof(obj)
    key_mem = 64  # a 2-tuple of node indices
    if isinstance(val, float):
        val_mem = 24
        mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2)
    else:  # value is True or False
        mem = key_mem * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2)
    # print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n')
    return mem
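
# A rough sanity check of the analytic estimate against pympler's measured
# size (a sketch with made-up data; the dict mimics what _sp_do_fcsp builds):
# >>> vk = {(i, j): float(i * j) for i in range(10) for j in range(10)}
# >>> estimate_vk_memory(vk, 10, 10)  # analytic estimate, in bytes
# >>> asizeof.asizeof(vk)             # pympler measurement, for comparison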
def compute_stats(file_name, results):
    del results['i']
    del results['j']
    results['nb_comparison'] = np.mean(results['nb_comparison'])
    results['completed'] = True
    if 'vk_dict_mem' in results and len(results['vk_dict_mem']) > 0:
        results['vk_dict_mem'] = np.mean(results['vk_dict_mem'])
    save_results(file_name, results)
class SPSpace(ShortestPath):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._file_name = kwargs.get('file_name')

    # @profile
    def _compute_gm_series(self):
        self._all_graphs_have_edges(self._graphs)
        # get shortest path graph of each graph.
        iterator = get_iters(self._graphs, desc='getting sp graphs',
                             file=sys.stdout, verbose=(self._verbose >= 2))
        self._graphs = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator]

        results = load_results(self._file_name, self._fcsp)

        # compute Gram matrix.
        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

        itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
        len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
        iterator = get_iters(itr, desc='Computing kernels',
                             length=len_itr, file=sys.stdout, verbose=(self._verbose >= 2))

        time0 = time.time()
        for i, j in iterator:
            # skip pairs already covered by a previous (checkpointed) run.
            if i > results['i'] or (i == results['i'] and j > results['j']):
                data = self._sp_do_space(self._graphs[i], self._graphs[j])
                if self._fcsp:
                    results['nb_comparison'].append(data[0])
                    if data[1] != {}:
                        results['vk_dict_mem'].append(estimate_vk_memory(
                            data[1],
                            nx.number_of_nodes(self._graphs[i]),
                            nx.number_of_nodes(self._graphs[j])))
                else:
                    results['nb_comparison'].append(data)
                results['i'] = i
                results['j'] = j

                # checkpoint at most every 10 minutes.
                time1 = time.time()
                if time1 - time0 > 600:
                    save_results(self._file_name, results)
                    time0 = time1

        compute_stats(self._file_name, results)

        return gram_matrix
    def _sp_do_space(self, g1, g2):
        if self._fcsp:  # @todo: it may be put outside the _sp_do().
            return self._sp_do_fcsp(g1, g2)
        else:
            return self._sp_do_naive(g1, g2)
    def _sp_do_fcsp(self, g1, g2):
        nb_comparison = 0

        # compute shortest path matrices first, method borrowed from FCSP.
        vk_dict = {}  # shortest path matrices dict
        if len(self._node_labels) > 0:  # @todo: it may be put outside the _sp_do().
            # node symb and non-symb labeled
            if len(self._node_attrs) > 0:
                kn = self._node_kernels['mix']
                for n1, n2 in product(
                        g1.nodes(data=True), g2.nodes(data=True)):
                    n1_labels = [n1[1][nl] for nl in self._node_labels]
                    n2_labels = [n2[1][nl] for nl in self._node_labels]
                    n1_attrs = [n1[1][na] for na in self._node_attrs]
                    n2_attrs = [n2[1][na] for na in self._node_attrs]
                    vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
                    nb_comparison += 1
            # node symb labeled
            else:
                kn = self._node_kernels['symb']
                for n1 in g1.nodes(data=True):
                    for n2 in g2.nodes(data=True):
                        n1_labels = [n1[1][nl] for nl in self._node_labels]
                        n2_labels = [n2[1][nl] for nl in self._node_labels]
                        vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels)
                        nb_comparison += 1
        else:
            # node non-symb labeled
            if len(self._node_attrs) > 0:
                kn = self._node_kernels['nsymb']
                for n1 in g1.nodes(data=True):
                    for n2 in g2.nodes(data=True):
                        n1_attrs = [n1[1][na] for na in self._node_attrs]
                        n2_attrs = [n2[1][na] for na in self._node_attrs]
                        vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs)
                        nb_comparison += 1
            # node unlabeled
            else:
                for e1, e2 in product(
                        g1.edges(data=True), g2.edges(data=True)):
                    pass
                    # if e1[2]['cost'] == e2[2]['cost']:
                    #     kernel += 1
                    #     nb_comparison += 1

        return nb_comparison, vk_dict
        # The actual kernel computation from ShortestPath, kept here for
        # reference but disabled: this class only records comparison counts.
        # # compute graph kernels
        # if self._ds_infos['directed']:
        #     for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
        #         if e1[2]['cost'] == e2[2]['cost']:
        #             nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1], e2[1])]
        #             kn1 = nk11 * nk22
        #             kernel += kn1
        # else:
        #     for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
        #         if e1[2]['cost'] == e2[2]['cost']:
        #             # each edge walk is counted twice, starting from both its extreme nodes.
        #             nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(
        #                 e1[0], e2[1])], vk_dict[(e1[1], e2[0])], vk_dict[(e1[1], e2[1])]
        #             kn1 = nk11 * nk22
        #             kn2 = nk12 * nk21
        #             kernel += kn1 + kn2
    def _sp_do_naive(self, g1, g2):
        nb_comparison = 0

        # Define the function to compute kernels between vertices in each condition.
        if len(self._node_labels) > 0:
            # node symb and non-symb labeled
            if len(self._node_attrs) > 0:
                def compute_vk(n1, n2):
                    kn = self._node_kernels['mix']
                    n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
                    n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
                    n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
                    n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
                    return kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
            # node symb labeled
            else:
                def compute_vk(n1, n2):
                    kn = self._node_kernels['symb']
                    n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
                    n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
                    return kn(n1_labels, n2_labels)
        else:
            # node non-symb labeled
            if len(self._node_attrs) > 0:
                def compute_vk(n1, n2):
                    kn = self._node_kernels['nsymb']
                    n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
                    n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
                    return kn(n1_attrs, n2_attrs)
            # node unlabeled
            else:
                # for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
                #     if e1[2]['cost'] == e2[2]['cost']:
                #         kernel += 1
                return 0

        # compute graph kernels
        if self._ds_infos['directed']:
            for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
                if e1[2]['cost'] == e2[2]['cost']:
                    # nk11, nk22 = compute_vk(e1[0], e2[0]), compute_vk(e1[1], e2[1])
                    # kn1 = nk11 * nk22
                    # kernel += kn1
                    nb_comparison += 2
        else:
            for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
                if e1[2]['cost'] == e2[2]['cost']:
                    # each edge walk is counted twice, starting from both its extreme nodes.
                    # nk11, nk12, nk21, nk22 = compute_vk(e1[0], e2[0]), compute_vk(
                    #     e1[0], e2[1]), compute_vk(e1[1], e2[0]), compute_vk(e1[1], e2[1])
                    # kn1 = nk11 * nk22
                    # kn2 = nk12 * nk21
                    # kernel += kn1 + kn2
                    nb_comparison += 4
        return nb_comparison
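

# Example driver (a sketch, not part of the original module). The toy graphs,
# the node-kernel choices and the `compute` call follow the parent
# ShortestPath kernel's usual gklearn interface; treat the exact argument
# names as assumptions to adapt. Note that the Gram matrix returned by this
# class is all zeros by design: its purpose is to record comparison counts
# (and, with fcsp, vertex-kernel dict memory estimates) in `file_name`.
if __name__ == '__main__':
    import functools
    from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct

    # Two toy graphs with a symbolic node label 'atom'.
    g1 = nx.Graph()
    g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})])
    g1.add_edges_from([(0, 1), (1, 2)])
    g2 = nx.Graph()
    g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'})])
    g2.add_edge(0, 1)

    mix_kernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    kernel = SPSpace(
        node_labels=['atom'], node_attrs=[],
        ds_infos={'directed': False},
        node_kernels={'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mix_kernel},
        fcsp=True, file_name='sp_space_stats.pkl')  # hypothetical output file
    # Run in series (this class only implements _compute_gm_series); skip
    # normalization since the diagonal of the returned matrix is zero.
    gram, run_time = kernel.compute([g1, g2], parallel=None, normalize=False, verbose=2)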

A Python package for graph kernels, graph edit distances and the graph pre-image problem.