
structuralspKernel.py 33 kB

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 27 10:56:23 2018
@author: linlin

@references: Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths for
    Measuring Similarity of Shapes. In ESANN 2007 Apr 25 (pp. 355-360).
"""

import sys
import time
from itertools import combinations, product
from functools import partial
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

import networkx as nx
import numpy as np

sys.path.insert(0, "../")
from pygraph.utils.graphdataset import get_dataset_attributes
from pygraph.utils.parallel import parallel_gm
from pygraph.utils.trie import Trie

def structuralspkernel(*args,
                       node_label='atom',
                       edge_weight=None,
                       edge_label='bond_type',
                       node_kernels=None,
                       edge_kernels=None,
                       compute_method='naive',
                       n_jobs=None,
                       verbose=True):
    """Calculate mean average structural shortest path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is 'atom'.
    edge_weight : string
        Edge attribute name corresponding to the edge weight.
    edge_label : string
        Edge attribute used as label. The default edge label is 'bond_type'.
    node_kernels : dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, and
        'mix' for both labels. The first 2 functions take two node labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and
        a non-symbolic label for each of the two nodes. Each label is in the
        form of a 2D array (n_samples, n_features). Each function returns a
        number as the kernel value. Ignored when nodes are unlabeled.
    edge_kernels : dict
        A dictionary of kernel functions for edges, including 3 items: 'symb'
        for symbolic edge labels, 'nsymb' for non-symbolic edge labels, and
        'mix' for both labels. The first 2 functions take two edge labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and
        a non-symbolic label for each of the two edges. Each label is in the
        form of a 2D array (n_samples, n_features). Each function returns a
        number as the kernel value. Ignored when edges are unlabeled.
    compute_method : string
        Method used to store and compare shortest paths, 'naive' or 'trie'.
    n_jobs : int
        Number of parallel jobs. Defaults to the number of available cores.
    verbose : boolean
        Whether to print progress and timing information.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the mean average structural
        shortest path kernel between 2 graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    weight = None
    if edge_weight is None:
        if verbose:
            print('\n No edge weight specified. Setting all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                if verbose:
                    print('\n Edge weight with name %s is not float or integer. Setting all weights to 1.\n'
                          % edge_weight)
        except Exception:
            if verbose:
                print('\n Edge weight with name "%s" is not found in the edge attributes. Setting all weights to 1.\n'
                      % edge_weight)

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
                    'edge_attr_dim', 'is_directed'],
        node_label=node_label, edge_label=edge_label)

    start_time = time.time()

    if n_jobs is None:  # default to all available cores
        n_jobs = cpu_count()

    # get shortest paths of each graph in Gn
    splist = [None] * len(Gn)
    pool = Pool(n_jobs)
    itr = zip(Gn, range(0, len(Gn)))
    if len(Gn) < 100 * n_jobs:
        chunksize = int(len(Gn) / n_jobs) + 1
    else:
        chunksize = 100
    # get shortest path graphs of Gn
    if compute_method == 'trie':
        getsp_partial = partial(wrapper_getSP_trie, weight, ds_attrs['is_directed'])
    else:
        getsp_partial = partial(wrapper_getSP_naive, weight, ds_attrs['is_directed'])
    if verbose:
        iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
                        desc='getting shortest paths', file=sys.stdout)
    else:
        iterator = pool.imap_unordered(getsp_partial, itr, chunksize)
    for i, sp in iterator:
        splist[i] = sp
        # time.sleep(10)
    pool.close()
    pool.join()
#    ss = 0
#    ss += sys.getsizeof(splist)
#    for spss in splist:
#        ss += sys.getsizeof(spss)
#        for spp in spss:
#            ss += sys.getsizeof(spp)
#    time.sleep(20)

#    # ---- direct running, normally use single CPU core. ----
#    splist = []
#    if compute_method == 'trie':
#        for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout):
#            splist.append(get_sps_as_trie(g, weight, ds_attrs['is_directed']))
#    else:
#        for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout):
#            splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed']))

#    # ---- only for the Fast Computation of Shortest Path Kernel (FCSP) ----
#    sp_ml = [0] * len(Gn)  # shortest path matrices
#    for i in result_sp:
#        sp_ml[i[0]] = i[1]
#    edge_x_g = [[] for i in range(len(sp_ml))]
#    edge_y_g = [[] for i in range(len(sp_ml))]
#    edge_w_g = [[] for i in range(len(sp_ml))]
#    for idx, item in enumerate(sp_ml):
#        for i1 in range(len(item)):
#            for i2 in range(i1 + 1, len(item)):
#                if item[i1, i2] != np.inf:
#                    edge_x_g[idx].append(i1)
#                    edge_y_g[idx].append(i2)
#                    edge_w_g[idx].append(item[i1, i2])
#    print(len(edge_x_g[0]))
#    print(len(edge_y_g[0]))
#    print(len(edge_w_g[0]))
    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(spl_toshare, gs_toshare):
        global G_spl, G_gs
        G_spl = spl_toshare
        G_gs = gs_toshare
    if compute_method == 'trie':
        do_partial = partial(wrapper_ssp_do_trie, ds_attrs, node_label, edge_label,
                             node_kernels, edge_kernels)
        parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                    glbv=(splist, Gn), n_jobs=n_jobs, verbose=verbose)
    else:
        do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
                             node_kernels, edge_kernels)
        parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                    glbv=(splist, Gn), n_jobs=n_jobs, verbose=verbose)
#    # ---- use pool.map to parallel. ----
#    pool = Pool(n_jobs)
#    do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
#                         node_kernels, edge_kernels)
#    itr = zip(combinations_with_replacement(Gn, 2),
#              combinations_with_replacement(splist, 2),
#              combinations_with_replacement(range(0, len(Gn)), 2))
#    for i, j, kernel in tqdm(
#            pool.map(do_partial, itr), desc='calculating kernels',
#            file=sys.stdout):
#        Kmatrix[i][j] = kernel
#        Kmatrix[j][i] = kernel
#    pool.close()
#    pool.join()

#    # ---- use pool.imap_unordered to parallel and track progress. ----
#    do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
#                         node_kernels, edge_kernels)
#    itr = zip(combinations_with_replacement(Gn, 2),
#              combinations_with_replacement(splist, 2),
#              combinations_with_replacement(range(0, len(Gn)), 2))
#    len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
#    if len_itr < 1000 * n_jobs:
#        chunksize = int(len_itr / n_jobs) + 1
#    else:
#        chunksize = 1000
#    from contextlib import closing
#    with closing(Pool(n_jobs)) as pool:
#        for i, j, kernel in tqdm(
#                pool.imap_unordered(do_partial, itr, 1000),
#                desc='calculating kernels',
#                file=sys.stdout):
#            Kmatrix[i][j] = kernel
#            Kmatrix[j][i] = kernel
#    pool.close()
#    pool.join()

#    # ---- direct running, normally use single CPU core. ----
#    from itertools import combinations_with_replacement
#    itr = combinations_with_replacement(range(0, len(Gn)), 2)
#    if compute_method == 'trie':
#        for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
#            kernel = ssp_do_trie(Gn[i], Gn[j], splist[i], splist[j],
#                                 ds_attrs, node_label, edge_label,
#                                 node_kernels, edge_kernels)
#            Kmatrix[i][j] = kernel
#            Kmatrix[j][i] = kernel
#    else:
#        for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
#            kernel = structuralspkernel_do(Gn[i], Gn[j], splist[i], splist[j],
#                                           ds_attrs, node_label, edge_label,
#                                           node_kernels, edge_kernels)
#            # if (kernel > 1):
#            #     print("error here")
#            Kmatrix[i][j] = kernel
#            Kmatrix[j][i] = kernel
    run_time = time.time() - start_time
    if verbose:
        print("\n --- shortest path kernel matrix of size %d built in %s seconds ---"
              % (len(Gn), run_time))

    return Kmatrix, run_time

def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label,
                          node_kernels, edge_kernels):

    kernel = 0

    # First, compute shortest path matrices, method borrowed from FCSP.
    vk_dict = getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs)
    # Then, compute kernels between all pairs of edges, an extension of FCSP.
    # It suits sparse graphs, which is the most common case here. For dense
    # graphs, this would be slow.
    ek_dict = getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs)

    # compute graph kernels
    if vk_dict:
        if ek_dict:
            for p1, p2 in product(spl1, spl2):
                if len(p1) == len(p2):
                    kpath = vk_dict[(p1[0], p2[0])]
                    if kpath:
                        for idx in range(1, len(p1)):
                            kpath *= vk_dict[(p1[idx], p2[idx])] * \
                                ek_dict[((p1[idx-1], p1[idx]),
                                         (p2[idx-1], p2[idx]))]
                            if not kpath:
                                break
                        kernel += kpath  # add up kernels of all paths
        else:
            for p1, p2 in product(spl1, spl2):
                if len(p1) == len(p2):
                    kpath = vk_dict[(p1[0], p2[0])]
                    if kpath:
                        for idx in range(1, len(p1)):
                            kpath *= vk_dict[(p1[idx], p2[idx])]
                            if not kpath:
                                break
                        kernel += kpath  # add up kernels of all paths
    else:
        if ek_dict:
            for p1, p2 in product(spl1, spl2):
                if len(p1) == len(p2):
                    if len(p1) == 0:
                        kernel += 1
                    else:
                        kpath = 1
                        for idx in range(0, len(p1) - 1):
                            kpath *= ek_dict[((p1[idx], p1[idx+1]),
                                              (p2[idx], p2[idx+1]))]
                            if not kpath:
                                break
                        kernel += kpath  # add up kernels of all paths
        else:
            for p1, p2 in product(spl1, spl2):
                if len(p1) == len(p2):
                    kernel += 1

    kernel = kernel / (len(spl1) * len(spl2))  # calculate mean average

#    # ---- exact implementation of the Fast Computation of Shortest Path
#    # Kernel (FCSP), reference [2]; sadly it is slower than the current
#    # implementation.
#    # compute vertex kernel matrix
#    try:
#        vk_mat = np.zeros((nx.number_of_nodes(g1),
#                           nx.number_of_nodes(g2)))
#        g1nl = enumerate(g1.nodes(data=True))
#        g2nl = enumerate(g2.nodes(data=True))
#        for i1, n1 in g1nl:
#            for i2, n2 in g2nl:
#                vk_mat[i1][i2] = kn(
#                    n1[1][node_label], n2[1][node_label],
#                    [n1[1]['attributes']], [n2[1]['attributes']])
#
#        range1 = range(0, len(edge_w_g[i]))
#        range2 = range(0, len(edge_w_g[j]))
#        for i1 in range1:
#            x1 = edge_x_g[i][i1]
#            y1 = edge_y_g[i][i1]
#            w1 = edge_w_g[i][i1]
#            for i2 in range2:
#                x2 = edge_x_g[j][i2]
#                y2 = edge_y_g[j][i2]
#                w2 = edge_w_g[j][i2]
#                ke = (w1 == w2)
#                if ke > 0:
#                    kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
#                    kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
#                    Kmatrix += kn1 + kn2

    return kernel
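
# For intuition, a small worked example (illustrative, not from the paper):
# for equal-length paths p1 = (a, b, c) and p2 = (x, y, z), the loop above
# accumulates the product
#     kv(a, x) * kv(b, y) * ke((a, b), (x, y)) * kv(c, z) * ke((b, c), (y, z)),
# i.e. node kernels along the two paths interleaved with edge kernels, and the
# final value is the mean of these products over all pairs of shortest paths.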

def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels,
                   edge_kernels, itr):
    i = itr[0]
    j = itr[1]
    return i, j, structuralspkernel_do(G_gs[i], G_gs[j], G_spl[i], G_spl[j],
                                       ds_attrs, node_label, edge_label,
                                       node_kernels, edge_kernels)

def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label,
                node_kernels, edge_kernels):

#    # traverse all paths in graph1. Depth-first search is applied.
#    def traverseBothTrie(root, trie2, kernel, pcurrent=[]):
#        for key, node in root['children'].items():
#            pcurrent.append(key)
#            if node['isEndOfWord']:
#                # print(node['count'])
#                traverseTrie2(trie2.root, pcurrent, kernel,
#                              pcurrent=[])
#            if node['children'] != {}:
#                traverseBothTrie(node, trie2, kernel, pcurrent)
#            else:
#                del pcurrent[-1]
#        if pcurrent != []:
#            del pcurrent[-1]
#
#
#    # traverse all paths in graph2 and compute kernels with the current
#    # path in graph1. Depth-first search is applied.
#    def traverseTrie2(root, p1, kernel, pcurrent=[]):
#        for key, node in root['children'].items():
#            pcurrent.append(key)
#            if node['isEndOfWord']:
#                # print(node['count'])
#                kernel[0] += computePathKernel(p1, pcurrent, vk_dict, ek_dict)
#            if node['children'] != {}:
#                traverseTrie2(node, p1, kernel, pcurrent)
#            else:
#                del pcurrent[-1]
#        if pcurrent != []:
#            del pcurrent[-1]
#
#
#    kernel = [0]
#
#    # First, compute shortest path matrices, method borrowed from FCSP.
#    vk_dict = getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs)
#    # Then, compute kernels between all pairs of edges, an extension of FCSP.
#    # It suits sparse graphs, which is the most common case here. For dense
#    # graphs, this would be slow.
#    ek_dict = getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs)
#
#    # compute graph kernels
#    traverseBothTrie(trie1[0].root, trie2[0], kernel)
#
#    kernel = kernel[0] / (trie1[1] * trie2[1])  # calculate mean average

#    # traverse all paths in graph1. Depth-first search is applied.
#    def traverseBothTrie(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
#        for key, node in root['children'].items():
#            pcurrent.append(key)
#            if node['isEndOfWord']:
#                # print(node['count'])
#                traverseTrie2(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
#                              pcurrent=[])
#            if node['children'] != {}:
#                traverseBothTrie(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
#            else:
#                del pcurrent[-1]
#        if pcurrent != []:
#            del pcurrent[-1]
#
#
#    # traverse all paths in graph2 and compute kernels with the current
#    # path in graph1. Depth-first search is applied.
#    def traverseTrie2(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
#        for key, node in root['children'].items():
#            pcurrent.append(key)
#            if node['isEndOfWord']:
#                # print(node['count'])
#                kernel[0] += computePathKernel(p1, pcurrent, vk_dict, ek_dict)
#            if node['children'] != {}:
#                traverseTrie2(node, p1, kernel, vk_dict, ek_dict, pcurrent)
#            else:
#                del pcurrent[-1]
#        if pcurrent != []:
#            del pcurrent[-1]

    kernel = [0]

    # First, compute shortest path matrices, method borrowed from FCSP.
    vk_dict = getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs)
    # Then, compute kernels between all pairs of edges, an extension of FCSP.
    # It suits sparse graphs, which is the most common case here. For dense
    # graphs, this would be slow.
    ek_dict = getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs)

    # compute graph kernels
#    traverseBothTrie(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)
    if vk_dict:
        if ek_dict:
            traverseBothTriem(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)
        else:
            traverseBothTriev(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)
    else:
        if ek_dict:
            traverseBothTriee(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)
        else:
            traverseBothTrieu(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)

    kernel = kernel[0] / (trie1[1] * trie2[1])  # calculate mean average

    return kernel

def wrapper_ssp_do_trie(ds_attrs, node_label, edge_label, node_kernels,
                        edge_kernels, itr):
    i = itr[0]
    j = itr[1]
    return i, j, ssp_do_trie(G_gs[i], G_gs[j], G_spl[i], G_spl[j], ds_attrs,
                             node_label, edge_label, node_kernels, edge_kernels)

def getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs):
    # compute shortest path matrices, method borrowed from FCSP.
    vk_dict = {}  # shortest path matrices dict
    if ds_attrs['node_labeled']:
        # node symbolic and non-symbolic labeled
        if ds_attrs['node_attr_dim'] > 0:
            kn = node_kernels['mix']
            for n1, n2 in product(
                    g1.nodes(data=True), g2.nodes(data=True)):
                vk_dict[(n1[0], n2[0])] = kn(
                    n1[1][node_label], n2[1][node_label],
                    n1[1]['attributes'], n2[1]['attributes'])
        # node symbolic labeled
        else:
            kn = node_kernels['symb']
            for n1 in g1.nodes(data=True):
                for n2 in g2.nodes(data=True):
                    vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
                                                 n2[1][node_label])
    else:
        # node non-symbolic labeled
        if ds_attrs['node_attr_dim'] > 0:
            kn = node_kernels['nsymb']
            for n1 in g1.nodes(data=True):
                for n2 in g2.nodes(data=True):
                    vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
                                                 n2[1]['attributes'])
        # node unlabeled
        else:
            pass

    return vk_dict

def getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs):
    # compute kernels between all pairs of edges, an extension of FCSP. It
    # suits sparse graphs, which is the most common case here. For dense
    # graphs, this would be slow.
    ek_dict = {}  # dict of edge kernels
    if ds_attrs['edge_labeled']:
        # edge symbolic and non-symbolic labeled
        if ds_attrs['edge_attr_dim'] > 0:
            ke = edge_kernels['mix']
            for e1, e2 in product(
                    g1.edges(data=True), g2.edges(data=True)):
                ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
                             e1[2]['attributes'], e2[2]['attributes'])
                ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
                ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
                ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
                ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
        # edge symbolic labeled
        else:
            ke = edge_kernels['symb']
            for e1 in g1.edges(data=True):
                for e2 in g2.edges(data=True):
                    ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
                    ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
                    ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
                    ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
                    ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
    else:
        # edge non-symbolic labeled
        if ds_attrs['edge_attr_dim'] > 0:
            ke = edge_kernels['nsymb']
            for e1 in g1.edges(data=True):
                for e2 in g2.edges(data=True):
                    ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
                    ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
                    ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
                    ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
                    ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
        # edge unlabeled
        else:
            pass

    return ek_dict
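
# Note: for each pair of edges, all four key orientations are stored above,
# e.g. for e1 = (u, v) and e2 = (x, y) the keys ((u, v), (x, y)),
# ((v, u), (x, y)), ((u, v), (y, x)) and ((v, u), (y, x)) all map to the same
# kernel value, so path traversal can look an edge up in whichever direction
# it is walked.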

# traverse all paths in graph1. Depth-first search is applied.
def traverseBothTriem(root, trie2, kernel, vk_dict, ek_dict, pcurrent=None):
    if pcurrent is None:  # avoid the mutable default argument pitfall
        pcurrent = []
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            # print(node['count'])
            traverseTrie2m(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
                           pcurrent=[])
        if node['children'] != {}:
            traverseBothTriem(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
        else:
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph2 and compute kernels with the current path in
# graph1. Depth-first search is applied.
def traverseTrie2m(root, p1, kernel, vk_dict, ek_dict, pcurrent=None):
    if pcurrent is None:
        pcurrent = []
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            # print(node['count'])
            if len(p1) == len(pcurrent):
                kpath = vk_dict[(p1[0], pcurrent[0])]
                if kpath:
                    for idx in range(1, len(p1)):
                        kpath *= vk_dict[(p1[idx], pcurrent[idx])] * \
                            ek_dict[((p1[idx-1], p1[idx]),
                                     (pcurrent[idx-1], pcurrent[idx]))]
                        if not kpath:
                            break
                    kernel[0] += kpath  # add up kernels of all paths
        if node['children'] != {}:
            traverseTrie2m(node, p1, kernel, vk_dict, ek_dict, pcurrent)
        else:
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]

# traverse all paths in graph1. Depth-first search is applied.
def traverseBothTriev(root, trie2, kernel, vk_dict, ek_dict, pcurrent=None):
    if pcurrent is None:
        pcurrent = []
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            # print(node['count'])
            traverseTrie2v(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
                           pcurrent=[])
        if node['children'] != {}:
            traverseBothTriev(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
        else:
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph2 and compute kernels with the current path in
# graph1. Depth-first search is applied.
def traverseTrie2v(root, p1, kernel, vk_dict, ek_dict, pcurrent=None):
    if pcurrent is None:
        pcurrent = []
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            # print(node['count'])
            if len(p1) == len(pcurrent):
                kpath = vk_dict[(p1[0], pcurrent[0])]
                if kpath:
                    for idx in range(1, len(p1)):
                        kpath *= vk_dict[(p1[idx], pcurrent[idx])]
                        if not kpath:
                            break
                    kernel[0] += kpath  # add up kernels of all paths
        if node['children'] != {}:
            traverseTrie2v(node, p1, kernel, vk_dict, ek_dict, pcurrent)
        else:
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]

# traverse all paths in graph1. Depth-first search is applied.
def traverseBothTriee(root, trie2, kernel, vk_dict, ek_dict, pcurrent=None):
    if pcurrent is None:
        pcurrent = []
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            # print(node['count'])
            traverseTrie2e(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
                           pcurrent=[])
        if node['children'] != {}:
            traverseBothTriee(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
        else:
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph2 and compute kernels with the current path in
# graph1. Depth-first search is applied.
def traverseTrie2e(root, p1, kernel, vk_dict, ek_dict, pcurrent=None):
    if pcurrent is None:
        pcurrent = []
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            # print(node['count'])
            if len(p1) == len(pcurrent):
                if len(p1) == 0:
                    kernel[0] += 1
                else:
                    kpath = 1
                    for idx in range(0, len(p1) - 1):
                        kpath *= ek_dict[((p1[idx], p1[idx+1]),
                                          (pcurrent[idx], pcurrent[idx+1]))]
                        if not kpath:
                            break
                    kernel[0] += kpath  # add up kernels of all paths
        if node['children'] != {}:
            traverseTrie2e(node, p1, kernel, vk_dict, ek_dict, pcurrent)
        else:
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]

# traverse all paths in graph1. Depth-first search is applied.
def traverseBothTrieu(root, trie2, kernel, vk_dict, ek_dict, pcurrent=None):
    if pcurrent is None:
        pcurrent = []
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            # print(node['count'])
            traverseTrie2u(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
                           pcurrent=[])
        if node['children'] != {}:
            traverseBothTrieu(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
        else:
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph2 and compute kernels with the current path in
# graph1. Depth-first search is applied.
def traverseTrie2u(root, p1, kernel, vk_dict, ek_dict, pcurrent=None):
    if pcurrent is None:
        pcurrent = []
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            # print(node['count'])
            if len(p1) == len(pcurrent):
                kernel[0] += 1
        if node['children'] != {}:
            traverseTrie2u(node, p1, kernel, vk_dict, ek_dict, pcurrent)
        else:
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]

#def computePathKernel(p1, p2, vk_dict, ek_dict):
#    kernel = 0
#    if vk_dict:
#        if ek_dict:
#            if len(p1) == len(p2):
#                kpath = vk_dict[(p1[0], p2[0])]
#                if kpath:
#                    for idx in range(1, len(p1)):
#                        kpath *= vk_dict[(p1[idx], p2[idx])] * \
#                            ek_dict[((p1[idx-1], p1[idx]),
#                                     (p2[idx-1], p2[idx]))]
#                        if not kpath:
#                            break
#                    kernel += kpath  # add up kernels of all paths
#        else:
#            if len(p1) == len(p2):
#                kpath = vk_dict[(p1[0], p2[0])]
#                if kpath:
#                    for idx in range(1, len(p1)):
#                        kpath *= vk_dict[(p1[idx], p2[idx])]
#                        if not kpath:
#                            break
#                    kernel += kpath  # add up kernels of all paths
#    else:
#        if ek_dict:
#            if len(p1) == len(p2):
#                if len(p1) == 0:
#                    kernel += 1
#                else:
#                    kpath = 1
#                    for idx in range(0, len(p1) - 1):
#                        kpath *= ek_dict[((p1[idx], p1[idx+1]),
#                                          (p2[idx], p2[idx+1]))]
#                        if not kpath:
#                            break
#                    kernel += kpath  # add up kernels of all paths
#        else:
#            if len(p1) == len(p2):
#                kernel += 1
#
#    return kernel

def get_shortest_paths(G, weight, directed):
    """Get all shortest paths of a graph.

    Parameters
    ----------
    G : NetworkX graph
        The graph whose paths are calculated.
    weight : string/None
        Edge attribute used as weight to calculate the shortest path.
    directed : boolean
        Whether the graph is directed.

    Return
    ------
    sp : list of list
        List of shortest paths of the graph, where each path is represented
        by a list of nodes.
    """
    sp = []
    for n1, n2 in combinations(G.nodes(), 2):
        try:
            spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
        except nx.NetworkXNoPath:  # nodes not connected
            # sp.append([])
            pass
        else:
            sp += spltemp
            # each path is counted twice, starting from both its extreme nodes.
            if not directed:
                sp += [sptemp[::-1] for sptemp in spltemp]
    # add single nodes as length-0 paths.
    sp += [[n] for n in G.nodes()]
    return sp
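
# For example, for an undirected path graph with nodes 0 - 1 - 2, this returns
# the 6 node-to-node shortest paths [0, 1], [1, 0], [0, 1, 2], [2, 1, 0],
# [1, 2], [2, 1], plus the single-node paths [0], [1] and [2].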

def wrapper_getSP_naive(weight, directed, itr_item):
    g = itr_item[0]
    i = itr_item[1]
    return i, get_shortest_paths(g, weight, directed)

def get_sps_as_trie(G, weight, directed):
    """Get all shortest paths of a graph and insert them into a trie.

    Parameters
    ----------
    G : NetworkX graph
        The graph whose paths are calculated.
    weight : string/None
        Edge attribute used as weight to calculate the shortest path.
    directed : boolean
        Whether the graph is directed.

    Return
    ------
    sptrie : Trie
        Trie containing all shortest paths of the graph.
    lensp : int
        Number of shortest paths inserted, including single-node paths.
    """
    sptrie = Trie()
    lensp = 0
    for n1, n2 in combinations(G.nodes(), 2):
        try:
            spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
        except nx.NetworkXNoPath:  # nodes not connected
            pass
        else:
            lensp += len(spltemp)
            if not directed:
                lensp += len(spltemp)
            for sp in spltemp:
                sptrie.insertWord(sp)
                # each path is counted twice, starting from both its extreme nodes.
                if not directed:
                    sptrie.insertWord(sp[::-1])
    # add single nodes as length-0 paths.
    for n in G.nodes():
        sptrie.insertWord([n])
    return sptrie, lensp + nx.number_of_nodes(G)
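
# The returned count matches the naive representation: for the path graph
# 0 - 1 - 2 above, get_sps_as_trie returns the trie of those paths together
# with the count 9 (6 node-to-node shortest paths plus 3 single-node paths).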

def wrapper_getSP_trie(weight, directed, itr_item):
    g = itr_item[0]
    i = itr_item[1]
    return i, get_sps_as_trie(g, weight, directed)
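
# A minimal usage sketch. It assumes the generic kernel helpers deltakernel,
# gaussiankernel and kernelproduct in pygraph.utils.kernels; adapt the names
# to your setup if they differ.
if __name__ == '__main__':
    import functools
    from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct

    # two toy molecule-like graphs with symbolic node/edge labels
    g1 = nx.Graph()
    g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
    g1.add_edge(0, 1, bond_type='1')
    g2 = nx.Graph()
    g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'O'})])
    g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '2'})])

    # the 'mix' kernel combines a symbolic and a non-symbolic kernel
    mix = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mix}
    Kmatrix, run_time = structuralspkernel([g1, g2], node_kernels=sub_kernels,
                                           edge_kernels=sub_kernels,
                                           compute_method='naive', n_jobs=2)
    print(Kmatrix)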

A Python package for graph kernels, graph edit distances and the graph pre-image problem.