
structural_sp.py 15 kB

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 30 11:59:57 2020
@author: ljia
@references:
    [1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths for
    Measuring Similarity of Shapes. In ESANN 2007 Apr 25 (pp. 355-360).
"""
import sys
from itertools import product
from gklearn.utils import get_iters
import numpy as np
import time
import os, errno
import pickle
from pympler import asizeof
import networkx as nx
from gklearn.utils.utils import get_shortest_paths
from gklearn.kernels import StructuralSP
def load_splist(file_name):
    if os.path.isfile(file_name):
        with open(file_name, 'rb') as f:
            return pickle.load(f)
    else:
        results_path = {'splist': [], 'i': -1, 'completed': False}
        return results_path
def load_results(file_name, fcsp):
    if os.path.isfile(file_name):
        with open(file_name, 'rb') as f:
            return pickle.load(f)
    else:
        results = {'nb_v_comparison': [], 'nb_e_comparison': [], 'i': -1, 'j': -1, 'completed': False}
        if fcsp:
            results['vk_dict_mem'] = []
            results['ek_dict_mem'] = []
        return results
def save_results(file_name, results):
    with open(file_name, 'wb') as f:
        pickle.dump(results, f)
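
# --- Illustrative addition, not part of the original experiment script. ---
# A minimal sketch of how the checkpoint helpers above fit together: load
# (or initialise) a results dict, record some progress, persist it, and
# resume from it. The file name `demo_results.pkl` is hypothetical.
def _demo_checkpoint_roundtrip(tmp_file='demo_results.pkl'):
    results = load_results(tmp_file, fcsp=True)  # fresh dict on a first run
    results['nb_v_comparison'].append(42)  # pretend one pair was computed
    results['i'], results['j'] = 0, 1  # last completed (i, j) pair
    save_results(tmp_file, results)
    resumed = load_results(tmp_file, fcsp=True)  # a rerun resumes from here
    assert (resumed['i'], resumed['j']) == (0, 1)
    os.remove(tmp_file)  # clean up the demo checkpoint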
def estimate_vk_memory(obj, nb_nodes1, nb_nodes2):
    # Estimate the memory of the node kernel dict from rough CPython object
    # sizes, instead of walking it with pympler (accurate but slow):
    # 	asizeof.asized(obj, detail=1).format()
    # 	return asizeof.asizeof(obj)
    key, val = next(iter(obj.items()))
    dict_flat = sys.getsizeof(obj)  # flat size of the dict structure itself
    key_mem = 64  # estimated size of a 2-tuple key
    if isinstance(val, float):
        val_mem = 24  # size of a float object
        mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2)
    else:  # value is True or False (shared singletons, hence the flat 52)
        mem = key_mem * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2)
    # print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n')
    return mem
def estimate_ek_memory(obj, nb_nodes1, nb_nodes2):
    # Same memory model as estimate_vk_memory, but each key is a pair of
    # 2-tuples of node ids, hence the larger per-key estimate.
    key, val = next(iter(obj.items()))
    dict_flat = sys.getsizeof(obj)  # flat size of the dict structure itself
    key_mem = 192  # estimated size of a ((u, v), (x, y)) key
    if isinstance(val, float):
        val_mem = 24  # size of a float object
        mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2)
    else:  # value is True or False (shared singletons, hence the flat 52)
        mem = key_mem * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2)
    # print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n')
    return mem
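
# --- Illustrative addition, not part of the original experiment script. ---
# A worked example of the memory model used by the two estimators above:
# 64 bytes per 2-tuple key (192 for a pair of 2-tuples in the edge case),
# 24 bytes per float value, sys.getsizeof() for the flat dict structure,
# and 28 bytes per int node id in both graphs. The assertion only restates
# the formula; it does not measure real memory (asizeof does that, slowly).
def _demo_memory_model():
    vk_dict = {(0, 0): 1.0, (0, 1): 0.0, (1, 0): 0.0, (1, 1): 1.0}
    est = estimate_vk_memory(vk_dict, nb_nodes1=2, nb_nodes2=2)
    assert est == (64 + 24) * 4 + sys.getsizeof(vk_dict) + 28 * 4
    print('estimated:', est, 'bytes; measured:', asizeof.asizeof(vk_dict), 'bytes')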
def compute_stats(file_name, results, splist):
    del results['i']
    del results['j']
    results['nb_v_comparison'] = np.mean(results['nb_v_comparison'])
    # if len(results['nb_e_comparison']) > 0:
    results['nb_e_comparison'] = np.mean(results['nb_e_comparison'])
    results['completed'] = True
    if 'vk_dict_mem' in results and len(results['vk_dict_mem']) > 0:
        results['vk_dict_mem'] = np.mean(results['vk_dict_mem'])
    if 'ek_dict_mem' in results and len(results['ek_dict_mem']) > 0:
        results['ek_dict_mem'] = np.mean(results['ek_dict_mem'])
    results['nb_sp_ave'] = np.mean([len(ps) for ps in splist])  # average number of shortest paths per graph
    results['sp_len_ave'] = np.mean([np.mean([len(p) for p in ps]) for ps in splist])  # average path length
    results['sp_mem_all'] = asizeof.asizeof(splist)
    save_results(file_name, results)
class SSPSpace(StructuralSP):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._file_name = kwargs.get('file_name')

    # @profile
    def _compute_gm_series(self):
        # get shortest paths of each graph in the dataset.
        fn_paths = os.path.splitext(self._file_name)[0] + '.paths.pkl'
        results_path = load_splist(fn_paths)

        if not results_path['completed']:
            iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2))
            if self._compute_method == 'trie':
                for g in iterator:
                    results_path['splist'].append(self._get_sps_as_trie(g))
            else:
                time0 = time.time()
                for i, g in enumerate(iterator):
                    if i > results_path['i']:  # skip graphs done in a previous run
                        results_path['splist'].append(get_shortest_paths(g, self._edge_weight, self._ds_infos['directed']))
                        results_path['i'] = i
                        time1 = time.time()
                        if time1 - time0 > 600:  # checkpoint every 10 minutes
                            save_results(fn_paths, results_path)
                            time0 = time1
            del results_path['i']
            results_path['completed'] = True
            save_results(fn_paths, results_path)

        splist = results_path['splist']
        results = load_results(self._file_name, self._fcsp)

        # compute the Gram matrix.
        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

        from itertools import combinations_with_replacement
        itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
        len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
        iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout,
                             length=len_itr, verbose=(self._verbose >= 2))
        if self._compute_method == 'trie':
            for i, j in iterator:
                kernel = self._ssp_do_trie(self._graphs[i], self._graphs[j], splist[i], splist[j])
                gram_matrix[i][j] = kernel
                gram_matrix[j][i] = kernel
        else:
            time0 = time.time()
            for i, j in iterator:
                if i > results['i'] or (i == results['i'] and j > results['j']):  # resume after the last saved pair
                    data = self._ssp_do_naive_space(self._graphs[i], self._graphs[j], splist[i], splist[j])
                    results['nb_v_comparison'].append(data[0])
                    results['nb_e_comparison'].append(data[1])
                    if self._fcsp:
                        if data[2] != {}:
                            results['vk_dict_mem'].append(estimate_vk_memory(
                                data[2],
                                nx.number_of_nodes(self._graphs[i]),
                                nx.number_of_nodes(self._graphs[j])))
                        if data[3] != {}:
                            results['ek_dict_mem'].append(estimate_ek_memory(
                                data[3],
                                nx.number_of_nodes(self._graphs[i]),
                                nx.number_of_nodes(self._graphs[j])))
                    results['i'] = i
                    results['j'] = j
                    time1 = time.time()
                    if time1 - time0 > 600:  # checkpoint every 10 minutes
                        save_results(self._file_name, results)
                        time0 = time1

            compute_stats(self._file_name, results, splist)
            # @todo: the path file may not be removed if the program stops exactly here.
            try:
                os.remove(fn_paths)
            except OSError as e:
                if e.errno != errno.ENOENT:
                    raise

        return gram_matrix
    def _ssp_do_naive_space(self, g1, g2, spl1, spl2):
        if self._fcsp:  # @todo: this test could be moved outside _sp_do().
            return self._sp_do_naive_fcsp(g1, g2, spl1, spl2)
        else:
            return self._sp_do_naive_naive(g1, g2, spl1, spl2)
    def _sp_do_naive_fcsp(self, g1, g2, spl1, spl2):
        # First, compute kernels between all pairs of nodes, using the method
        # borrowed from FCSP.
        vk_dict, nb_v_comparison = self._get_all_node_kernels(g1, g2)

        # Then, compute kernels between all pairs of edges, extending the FCSP
        # idea. This suits sparse graphs, which is the most common case we
        # encountered; for dense graphs it would be slow.
        ek_dict, nb_e_comparison = self._get_all_edge_kernels(g1, g2)

        return nb_v_comparison, nb_e_comparison, vk_dict, ek_dict
    def _sp_do_naive_naive(self, g1, g2, spl1, spl2):
        nb_v_comparison = 0
        nb_e_comparison = 0

        # Define the function that computes kernels between vertices for each
        # labeling condition.
        if len(self._node_labels) > 0:
            # nodes are symbolically and non-symbolically labeled
            if len(self._node_attrs) > 0:
                def compute_vk(n1, n2):
                    kn = self._node_kernels['mix']
                    n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
                    n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
                    n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
                    n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
                    return kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
            # nodes are only symbolically labeled
            else:
                def compute_vk(n1, n2):
                    kn = self._node_kernels['symb']
                    n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
                    n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
                    return kn(n1_labels, n2_labels)
        else:
            # nodes are only non-symbolically labeled
            if len(self._node_attrs) > 0:
                def compute_vk(n1, n2):
                    kn = self._node_kernels['nsymb']
                    n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
                    n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
                    return kn(n1_attrs, n2_attrs)
            # # nodes are unlabeled
            # else:
            #     for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
            #         if e1[2]['cost'] == e2[2]['cost']:
            #             kernel += 1
            #     return kernel

        # Define the function that computes kernels between edges for each
        # labeling condition.
        if len(self._edge_labels) > 0:
            # edges are symbolically and non-symbolically labeled
            if len(self._edge_attrs) > 0:
                def compute_ek(e1, e2):
                    ke = self._edge_kernels['mix']
                    e1_labels = [g1.edges[e1][el] for el in self._edge_labels]
                    e2_labels = [g2.edges[e2][el] for el in self._edge_labels]
                    e1_attrs = [g1.edges[e1][ea] for ea in self._edge_attrs]
                    e2_attrs = [g2.edges[e2][ea] for ea in self._edge_attrs]
                    return ke(e1_labels, e2_labels, e1_attrs, e2_attrs)
            # edges are only symbolically labeled
            else:
                def compute_ek(e1, e2):
                    ke = self._edge_kernels['symb']
                    e1_labels = [g1.edges[e1][el] for el in self._edge_labels]
                    e2_labels = [g2.edges[e2][el] for el in self._edge_labels]
                    return ke(e1_labels, e2_labels)
        else:
            # edges are only non-symbolically labeled
            if len(self._edge_attrs) > 0:
                def compute_ek(e1, e2):
                    ke = self._edge_kernels['nsymb']
                    e1_attrs = [g1.edges[e1][ea] for ea in self._edge_attrs]
                    e2_attrs = [g2.edges[e2][ea] for ea in self._edge_attrs]
                    return ke(e1_attrs, e2_attrs)

        # compute graph kernels: compare each pair of equal-length shortest
        # paths, multiplying node and edge kernels along the paths.
        if len(self._node_labels) > 0 or len(self._node_attrs) > 0:
            if len(self._edge_labels) > 0 or len(self._edge_attrs) > 0:
                for p1, p2 in product(spl1, spl2):
                    if len(p1) == len(p2):
                        kpath = compute_vk(p1[0], p2[0])
                        nb_v_comparison += 1
                        if kpath:
                            for idx in range(1, len(p1)):
                                kpath *= compute_vk(p1[idx], p2[idx]) * \
                                         compute_ek((p1[idx-1], p1[idx]),
                                                    (p2[idx-1], p2[idx]))
                                nb_v_comparison += 1
                                nb_e_comparison += 1
                                if not kpath:
                                    break
                            # kernel += kpath  # add up kernels of all paths
            else:
                for p1, p2 in product(spl1, spl2):
                    if len(p1) == len(p2):
                        kpath = compute_vk(p1[0], p2[0])
                        nb_v_comparison += 1
                        if kpath:
                            for idx in range(1, len(p1)):
                                kpath *= compute_vk(p1[idx], p2[idx])
                                nb_v_comparison += 1
                                if not kpath:
                                    break
                            # kernel += kpath  # add up kernels of all paths
        else:
            if len(self._edge_labels) > 0 or len(self._edge_attrs) > 0:
                for p1, p2 in product(spl1, spl2):
                    if len(p1) == len(p2) and len(p1) > 0:
                        kpath = 1
                        for idx in range(0, len(p1) - 1):
                            kpath *= compute_ek((p1[idx], p1[idx+1]),
                                                (p2[idx], p2[idx+1]))
                            nb_e_comparison += 1
                            if not kpath:
                                break
            else:
                pass
                # for p1, p2 in product(spl1, spl2):
                #     if len(p1) == len(p2):
                #         kernel += 1
        # try:
        #     kernel = kernel / (len(spl1) * len(spl2))  # compute the mean
        # except ZeroDivisionError:
        #     print(spl1, spl2)
        #     print(g1.nodes(data=True))
        #     print(g1.edges(data=True))
        #     raise

        return nb_v_comparison, nb_e_comparison
    def _get_all_node_kernels(self, g1, g2):
        nb_comparison = 0

        vk_dict = {}  # dict of kernels between all node pairs
        if len(self._node_labels) > 0:
            # nodes are symbolically and non-symbolically labeled
            if len(self._node_attrs) > 0:
                kn = self._node_kernels['mix']
                for n1 in g1.nodes(data=True):
                    for n2 in g2.nodes(data=True):
                        n1_labels = [n1[1][nl] for nl in self._node_labels]
                        n2_labels = [n2[1][nl] for nl in self._node_labels]
                        n1_attrs = [n1[1][na] for na in self._node_attrs]
                        n2_attrs = [n2[1][na] for na in self._node_attrs]
                        vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
                        nb_comparison += 1
            # nodes are only symbolically labeled
            else:
                kn = self._node_kernels['symb']
                for n1 in g1.nodes(data=True):
                    for n2 in g2.nodes(data=True):
                        n1_labels = [n1[1][nl] for nl in self._node_labels]
                        n2_labels = [n2[1][nl] for nl in self._node_labels]
                        vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels)
                        nb_comparison += 1
        else:
            # nodes are only non-symbolically labeled
            if len(self._node_attrs) > 0:
                kn = self._node_kernels['nsymb']
                for n1 in g1.nodes(data=True):
                    for n2 in g2.nodes(data=True):
                        n1_attrs = [n1[1][na] for na in self._node_attrs]
                        n2_attrs = [n2[1][na] for na in self._node_attrs]
                        vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs)
                        nb_comparison += 1
            # nodes are unlabeled
            else:
                pass  # @todo: add edge weights.
                # for e1 in g1.edges(data=True):
                #     for e2 in g2.edges(data=True):
                #         if e1[2]['cost'] == e2[2]['cost']:
                #             kernel += 1
                # return kernel

        return vk_dict, nb_comparison
    def _get_all_edge_kernels(self, g1, g2):
        nb_comparison = 0

        # Compute kernels between all pairs of edges, extending the FCSP idea.
        # This suits sparse graphs, which is the most common case we
        # encountered; for dense graphs it would be slow.
        ek_dict = {}  # dict of kernels between all edge pairs
        if len(self._edge_labels) > 0:
            # edges are symbolically and non-symbolically labeled
            if len(self._edge_attrs) > 0:
                ke = self._edge_kernels['mix']
                for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
                    e1_labels = [e1[2][el] for el in self._edge_labels]
                    e2_labels = [e2[2][el] for el in self._edge_labels]
                    e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
                    e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
                    ek_temp = ke(e1_labels, e2_labels, e1_attrs, e2_attrs)
                    # store the kernel under both orientations of each edge
                    ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
                    ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
                    ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
                    ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
                    nb_comparison += 1
            # edges are only symbolically labeled
            else:
                ke = self._edge_kernels['symb']
                for e1 in g1.edges(data=True):
                    for e2 in g2.edges(data=True):
                        e1_labels = [e1[2][el] for el in self._edge_labels]
                        e2_labels = [e2[2][el] for el in self._edge_labels]
                        ek_temp = ke(e1_labels, e2_labels)
                        ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
                        ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
                        ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
                        ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
                        nb_comparison += 1
        else:
            # edges are only non-symbolically labeled
            if len(self._edge_attrs) > 0:
                ke = self._edge_kernels['nsymb']
                for e1 in g1.edges(data=True):
                    for e2 in g2.edges(data=True):
                        e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
                        e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
                        ek_temp = ke(e1_attrs, e2_attrs)
                        ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
                        ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
                        ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
                        ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
                        nb_comparison += 1
            # edges are unlabeled
            else:
                pass

        return ek_dict, nb_comparison
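
# --- Illustrative addition, not part of the original experiment script. ---
# A hypothetical way to run SSPSpace on two toy graphs. The constructor
# keyword names (node_labels, edge_labels, node_kernels, edge_kernels,
# compute_method, ds_infos) are assumptions inferred from the attributes
# this class reads (self._node_labels, self._node_kernels, ...); check the
# signature of gklearn.kernels.StructuralSP before relying on them. Note
# that in 'naive' mode this subclass records comparison counts and memory
# estimates in file_name rather than filling in the Gram matrix.
def _demo_ssp_space():
    def delta(x, y):  # toy kernel on lists of symbolic labels
        return 1.0 if x == y else 0.0

    g1 = nx.Graph()
    g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
    g1.add_edge(0, 1, bond='single')
    g2 = g1.copy()

    ssp = SSPSpace(node_labels=['atom'], edge_labels=['bond'],
                   node_kernels={'symb': delta}, edge_kernels={'symb': delta},
                   compute_method='naive', fcsp=True,
                   ds_infos={'directed': False},
                   file_name='demo_ssp_results.pkl')
    ssp._graphs = [g1, g2]  # bypasses the public API; a sketch only
    ssp._compute_gm_series()  # writes comparison counts and memory stats
    print(load_results('demo_ssp_results.pkl', fcsp=True))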

A Python package for graph kernels, graph edit distances and the graph pre-image problem.