
structuralspKernel.py 34 kB

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 27 10:56:23 2018

@author: linlin

@references:

    [1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths for
    Measuring Similarity of Shapes. In ESANN 2007 Apr 25 (pp. 355-360).
"""

import sys
import time
from itertools import combinations, product
from functools import partial
from multiprocessing import Pool
from tqdm import tqdm

import networkx as nx
import numpy as np

from gklearn.utils.graphdataset import get_dataset_attributes
from gklearn.utils.parallel import parallel_gm
from gklearn.utils.trie import Trie


def structuralspkernel(*args,
                       node_label='atom',
                       edge_weight=None,
                       edge_label='bond_type',
                       node_kernels=None,
                       edge_kernels=None,
                       compute_method='naive',
                       parallel='imap_unordered',
#                       parallel=None,
                       n_jobs=None,
                       verbose=True):
    """Calculate mean average structural shortest path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.

    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.

    node_label : string
        Node attribute used as label. The default node label is 'atom'.

    edge_weight : string
        Edge attribute name corresponding to the edge weight, applied to the
        computation of the shortest paths.

    edge_label : string
        Edge attribute used as label. The default edge label is 'bond_type'.

    node_kernels : dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, and
        'mix' for both labels. The first 2 functions take two node labels as
        parameters, and the 'mix' function takes 4 parameters: a symbolic and
        a non-symbolic label for each of the two nodes. Each label is in the
        form of a 2-D array (n_samples, n_features). Each function returns a
        number as the kernel value. Ignored when nodes are unlabeled.

    edge_kernels : dict
        A dictionary of kernel functions for edges, including 3 items: 'symb'
        for symbolic edge labels, 'nsymb' for non-symbolic edge labels, and
        'mix' for both labels. The first 2 functions take two edge labels as
        parameters, and the 'mix' function takes 4 parameters: a symbolic and
        a non-symbolic label for each of the two edges. Each label is in the
        form of a 2-D array (n_samples, n_features). Each function returns a
        number as the kernel value. Ignored when edges are unlabeled.

    compute_method : string
        Method used to store the shortest paths and compute the graph kernel.
        The following choices are available:

        'trie': store paths as tries.

        'naive': store paths in lists.

    n_jobs : int
        Number of jobs for parallelization.

    Returns
    -------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the mean average structural
        shortest path kernel between 2 graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    weight = None
    if edge_weight is None:
        if verbose:
            print('\n No edge weight is specified. Set all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                if verbose:
                    print('\n Edge weight with name %s is not a float or an '
                          'integer. Set all weights to 1.\n' % edge_weight)
        except Exception:
            if verbose:
                print('\n Edge weight with name "%s" is not found in the edge '
                      'attributes. Set all weights to 1.\n' % edge_weight)

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
                    'edge_attr_dim', 'is_directed'],
        node_label=node_label, edge_label=edge_label)

    start_time = time.time()

    # get shortest paths of each graph in Gn
    if parallel == 'imap_unordered':
        splist = [None] * len(Gn)
        pool = Pool(n_jobs)
        itr = zip(Gn, range(0, len(Gn)))
        if len(Gn) < 100 * n_jobs:
            chunksize = int(len(Gn) / n_jobs) + 1
        else:
            chunksize = 100
        # get shortest path graphs of Gn
        if compute_method == 'trie':
            getsp_partial = partial(wrapper_getSP_trie, weight,
                                    ds_attrs['is_directed'])
        else:
            getsp_partial = partial(wrapper_getSP_naive, weight,
                                    ds_attrs['is_directed'])
        if verbose:
            iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
                            desc='getting shortest paths', file=sys.stdout)
        else:
            iterator = pool.imap_unordered(getsp_partial, itr, chunksize)
        for i, sp in iterator:
            splist[i] = sp
#            time.sleep(10)
        pool.close()
        pool.join()
    # ---- direct running, normally uses a single CPU core. ----
    elif parallel is None:
        splist = []
        if verbose:
            iterator = tqdm(Gn, desc='getting sp graphs', file=sys.stdout)
        else:
            iterator = Gn
        if compute_method == 'trie':
            for g in iterator:
                splist.append(get_sps_as_trie(g, weight, ds_attrs['is_directed']))
        else:
            for g in iterator:
                splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed']))

#    ss = 0
#    ss += sys.getsizeof(splist)
#    for spss in splist:
#        ss += sys.getsizeof(spss)
#        for spp in spss:
#            ss += sys.getsizeof(spp)

#    time.sleep(20)

#    # ---- only for the Fast Computation of Shortest Path Kernel (FCSP) ----
#    sp_ml = [0] * len(Gn)  # shortest path matrices
#    for i in result_sp:
#        sp_ml[i[0]] = i[1]
#    edge_x_g = [[] for i in range(len(sp_ml))]
#    edge_y_g = [[] for i in range(len(sp_ml))]
#    edge_w_g = [[] for i in range(len(sp_ml))]
#    for idx, item in enumerate(sp_ml):
#        for i1 in range(len(item)):
#            for i2 in range(i1 + 1, len(item)):
#                if item[i1, i2] != np.inf:
#                    edge_x_g[idx].append(i1)
#                    edge_y_g[idx].append(i2)
#                    edge_w_g[idx].append(item[i1, i2])
#    print(len(edge_x_g[0]))
#    print(len(edge_y_g[0]))
#    print(len(edge_w_g[0]))

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallelize and track progress. ----
    if parallel == 'imap_unordered':
        def init_worker(spl_toshare, gs_toshare):
            global G_spl, G_gs
            G_spl = spl_toshare
            G_gs = gs_toshare
        if compute_method == 'trie':
            do_partial = partial(wrapper_ssp_do_trie, ds_attrs, node_label,
                                 edge_label, node_kernels, edge_kernels)
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                        glbv=(splist, Gn), n_jobs=n_jobs, verbose=verbose)
        else:
            do_partial = partial(wrapper_ssp_do, ds_attrs, node_label,
                                 edge_label, node_kernels, edge_kernels)
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                        glbv=(splist, Gn), n_jobs=n_jobs, verbose=verbose)
    # ---- direct running, normally uses a single CPU core. ----
    elif parallel is None:
        from itertools import combinations_with_replacement
        itr = combinations_with_replacement(range(0, len(Gn)), 2)
        if verbose:
            iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout)
        else:
            iterator = itr
        if compute_method == 'trie':
            for i, j in iterator:
                kernel = ssp_do_trie(Gn[i], Gn[j], splist[i], splist[j],
                                     ds_attrs, node_label, edge_label,
                                     node_kernels, edge_kernels)
                Kmatrix[i][j] = kernel
                Kmatrix[j][i] = kernel
        else:
            for i, j in iterator:
                kernel = structuralspkernel_do(Gn[i], Gn[j], splist[i],
                                               splist[j], ds_attrs, node_label,
                                               edge_label, node_kernels,
                                               edge_kernels)
#                if kernel > 1:
#                    print("error here")
                Kmatrix[i][j] = kernel
                Kmatrix[j][i] = kernel

#    # ---- use pool.map to parallelize. ----
#    pool = Pool(n_jobs)
#    do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
#                         node_kernels, edge_kernels)
#    itr = zip(combinations_with_replacement(Gn, 2),
#              combinations_with_replacement(splist, 2),
#              combinations_with_replacement(range(0, len(Gn)), 2))
#    for i, j, kernel in tqdm(
#            pool.map(do_partial, itr), desc='calculating kernels',
#            file=sys.stdout):
#        Kmatrix[i][j] = kernel
#        Kmatrix[j][i] = kernel
#    pool.close()
#    pool.join()

#    # ---- use pool.imap_unordered to parallelize and track progress. ----
#    do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
#                         node_kernels, edge_kernels)
#    itr = zip(combinations_with_replacement(Gn, 2),
#              combinations_with_replacement(splist, 2),
#              combinations_with_replacement(range(0, len(Gn)), 2))
#    len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
#    if len_itr < 1000 * n_jobs:
#        chunksize = int(len_itr / n_jobs) + 1
#    else:
#        chunksize = 1000
#    from contextlib import closing
#    with closing(Pool(n_jobs)) as pool:
#        for i, j, kernel in tqdm(
#                pool.imap_unordered(do_partial, itr, 1000),
#                desc='calculating kernels',
#                file=sys.stdout):
#            Kmatrix[i][j] = kernel
#            Kmatrix[j][i] = kernel
#    pool.close()
#    pool.join()

    run_time = time.time() - start_time
    if verbose:
        print("\n --- shortest path kernel matrix of size %d built in %s seconds ---"
              % (len(Gn), run_time))

    return Kmatrix, run_time


def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label,
                          node_kernels, edge_kernels):

    kernel = 0

    # First, compute shortest path matrices; method borrowed from FCSP.
    vk_dict = getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs)
    # Then, compute kernels between all pairs of edges, which extends the idea
    # of FCSP. It suits sparse graphs, which are the most common case we deal
    # with. For dense graphs, this would be slow.
    ek_dict = getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs)

    # compute graph kernels
    if vk_dict:
        if ek_dict:
            for p1, p2 in product(spl1, spl2):
                if len(p1) == len(p2):
                    kpath = vk_dict[(p1[0], p2[0])]
                    if kpath:
                        for idx in range(1, len(p1)):
                            kpath *= vk_dict[(p1[idx], p2[idx])] * \
                                ek_dict[((p1[idx-1], p1[idx]),
                                         (p2[idx-1], p2[idx]))]
                            if not kpath:
                                break
                        kernel += kpath  # add up kernels of all paths
        else:
            for p1, p2 in product(spl1, spl2):
                if len(p1) == len(p2):
                    kpath = vk_dict[(p1[0], p2[0])]
                    if kpath:
                        for idx in range(1, len(p1)):
                            kpath *= vk_dict[(p1[idx], p2[idx])]
                            if not kpath:
                                break
                        kernel += kpath  # add up kernels of all paths
    else:
        if ek_dict:
            for p1, p2 in product(spl1, spl2):
                if len(p1) == len(p2):
                    if len(p1) == 0:
                        kernel += 1
                    else:
                        kpath = 1
                        for idx in range(0, len(p1) - 1):
                            kpath *= ek_dict[((p1[idx], p1[idx+1]),
                                              (p2[idx], p2[idx+1]))]
                            if not kpath:
                                break
                        kernel += kpath  # add up kernels of all paths
        else:
            for p1, p2 in product(spl1, spl2):
                if len(p1) == len(p2):
                    kernel += 1

    try:
        kernel = kernel / (len(spl1) * len(spl2))  # calculate mean average
    except ZeroDivisionError:
        print(spl1, spl2)
        print(g1.nodes(data=True))
        print(g1.edges(data=True))
        raise

#    # ---- exact implementation of the Fast Computation of Shortest Path
#    # Kernel (FCSP), reference [2]; sadly it is slower than the current
#    # implementation.
#    # compute vertex kernel matrix
#    try:
#        vk_mat = np.zeros((nx.number_of_nodes(g1),
#                           nx.number_of_nodes(g2)))
#        g1nl = enumerate(g1.nodes(data=True))
#        g2nl = enumerate(g2.nodes(data=True))
#        for i1, n1 in g1nl:
#            for i2, n2 in g2nl:
#                vk_mat[i1][i2] = kn(
#                    n1[1][node_label], n2[1][node_label],
#                    [n1[1]['attributes']], [n2[1]['attributes']])
#
#    range1 = range(0, len(edge_w_g[i]))
#    range2 = range(0, len(edge_w_g[j]))
#    for i1 in range1:
#        x1 = edge_x_g[i][i1]
#        y1 = edge_y_g[i][i1]
#        w1 = edge_w_g[i][i1]
#        for i2 in range2:
#            x2 = edge_x_g[j][i2]
#            y2 = edge_y_g[j][i2]
#            w2 = edge_w_g[j][i2]
#            ke = (w1 == w2)
#            if ke > 0:
#                kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
#                kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
#                Kmatrix += kn1 + kn2

    return kernel
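
# Worked example of the products above: writing kn for the node kernel and
# ke for the edge kernel, two equal-length paths p1 = (u0, u1, u2) and
# p2 = (v0, v1, v2) contribute
#     kpath = kn(u0, v0) * kn(u1, v1) * ke((u0, u1), (v0, v1))
#                        * kn(u2, v2) * ke((u1, u2), (v1, v2)),
# with an early exit as soon as any factor is 0; the kernel is the mean of
# kpath over all pairs of shortest paths of the two graphs.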


def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels,
                   edge_kernels, itr):
    i = itr[0]
    j = itr[1]
    return i, j, structuralspkernel_do(G_gs[i], G_gs[j], G_spl[i], G_spl[j],
                                       ds_attrs, node_label, edge_label,
                                       node_kernels, edge_kernels)


def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label,
                node_kernels, edge_kernels):

#    # traverse all paths in graph1. Depth-first search is applied.
#    def traverseBothTrie(root, trie2, kernel, pcurrent=[]):
#        for key, node in root['children'].items():
#            pcurrent.append(key)
#            if node['isEndOfWord']:
#                # print(node['count'])
#                traverseTrie2(trie2.root, pcurrent, kernel,
#                              pcurrent=[])
#            if node['children'] != {}:
#                traverseBothTrie(node, trie2, kernel, pcurrent)
#            else:
#                del pcurrent[-1]
#        if pcurrent != []:
#            del pcurrent[-1]
#
#    # traverse all paths in graph2 and find out those that are not in
#    # graph1. Depth-first search is applied.
#    def traverseTrie2(root, p1, kernel, pcurrent=[]):
#        for key, node in root['children'].items():
#            pcurrent.append(key)
#            if node['isEndOfWord']:
#                # print(node['count'])
#                kernel[0] += computePathKernel(p1, pcurrent, vk_dict, ek_dict)
#            if node['children'] != {}:
#                traverseTrie2(node, p1, kernel, pcurrent)
#            else:
#                del pcurrent[-1]
#        if pcurrent != []:
#            del pcurrent[-1]
#
#    kernel = [0]
#
#    # First, compute shortest path matrices; method borrowed from FCSP.
#    vk_dict = getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs)
#    # Then, compute kernels between all pairs of edges, which extends the
#    # idea of FCSP. It suits sparse graphs, which are the most common case
#    # we deal with. For dense graphs, this would be slow.
#    ek_dict = getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs)
#
#    # compute graph kernels
#    traverseBothTrie(trie1[0].root, trie2[0], kernel)
#
#    kernel = kernel[0] / (trie1[1] * trie2[1])  # calculate mean average

#    # traverse all paths in graph1. Depth-first search is applied.
#    def traverseBothTrie(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
#        for key, node in root['children'].items():
#            pcurrent.append(key)
#            if node['isEndOfWord']:
#                # print(node['count'])
#                traverseTrie2(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
#                              pcurrent=[])
#            if node['children'] != {}:
#                traverseBothTrie(node, trie2, kernel, vk_dict, ek_dict,
#                                 pcurrent)
#            else:
#                del pcurrent[-1]
#        if pcurrent != []:
#            del pcurrent[-1]
#
#    # traverse all paths in graph2 and find out those that are not in
#    # graph1. Depth-first search is applied.
#    def traverseTrie2(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
#        for key, node in root['children'].items():
#            pcurrent.append(key)
#            if node['isEndOfWord']:
#                # print(node['count'])
#                kernel[0] += computePathKernel(p1, pcurrent, vk_dict, ek_dict)
#            if node['children'] != {}:
#                traverseTrie2(node, p1, kernel, vk_dict, ek_dict, pcurrent)
#            else:
#                del pcurrent[-1]
#        if pcurrent != []:
#            del pcurrent[-1]

    kernel = [0]

    # First, compute shortest path matrices; method borrowed from FCSP.
    vk_dict = getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs)
    # Then, compute kernels between all pairs of edges, which extends the idea
    # of FCSP. It suits sparse graphs, which are the most common case we deal
    # with. For dense graphs, this would be slow.
    ek_dict = getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs)

    # compute graph kernels
#    traverseBothTrie(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)
    # Dispatch on the available label kernels; the traversal variants are
    # suffixed m (mixed: node and edge kernels), v (vertex kernels only),
    # e (edge kernels only) and u (unlabeled).
    if vk_dict:
        if ek_dict:
            traverseBothTriem(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)
        else:
            traverseBothTriev(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)
    else:
        if ek_dict:
            traverseBothTriee(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)
        else:
            traverseBothTrieu(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)

    kernel = kernel[0] / (trie1[1] * trie2[1])  # calculate mean average

    return kernel


def wrapper_ssp_do_trie(ds_attrs, node_label, edge_label, node_kernels,
                        edge_kernels, itr):
    i = itr[0]
    j = itr[1]
    return i, j, ssp_do_trie(G_gs[i], G_gs[j], G_spl[i], G_spl[j], ds_attrs,
                             node_label, edge_label, node_kernels,
                             edge_kernels)


def getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs):
    # compute shortest path matrices; method borrowed from FCSP.
    vk_dict = {}  # shortest path matrices dict
    if ds_attrs['node_labeled']:
        # node symb and non-symb labeled
        if ds_attrs['node_attr_dim'] > 0:
            kn = node_kernels['mix']
            for n1, n2 in product(
                    g1.nodes(data=True), g2.nodes(data=True)):
                vk_dict[(n1[0], n2[0])] = kn(
                    n1[1][node_label], n2[1][node_label],
                    n1[1]['attributes'], n2[1]['attributes'])
        # node symb labeled
        else:
            kn = node_kernels['symb']
            for n1 in g1.nodes(data=True):
                for n2 in g2.nodes(data=True):
                    vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
                                                 n2[1][node_label])
    else:
        # node non-symb labeled
        if ds_attrs['node_attr_dim'] > 0:
            kn = node_kernels['nsymb']
            for n1 in g1.nodes(data=True):
                for n2 in g2.nodes(data=True):
                    vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
                                                 n2[1]['attributes'])
        # node unlabeled
        else:
            pass

    return vk_dict
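
# For example, with a Dirac kernel on symbolic labels (an assumed choice,
# not fixed by this module), node 0 of g1 and node 0 of g2 both carrying
# {'atom': 'C'} give vk_dict[(0, 0)] == 1, while a 'C'/'O' pair gives 0.
# Keys are (node-of-g1, node-of-g2) tuples.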


def getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs):
    # compute kernels between all pairs of edges, which extends the idea of
    # FCSP. It suits sparse graphs, which are the most common case we deal
    # with. For dense graphs, this would be slow.
    ek_dict = {}  # dict of edge kernels
    if ds_attrs['edge_labeled']:
        # edge symb and non-symb labeled
        if ds_attrs['edge_attr_dim'] > 0:
            ke = edge_kernels['mix']
            for e1, e2 in product(
                    g1.edges(data=True), g2.edges(data=True)):
                ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
                             e1[2]['attributes'], e2[2]['attributes'])
                ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
                ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
                ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
                ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
        # edge symb labeled
        else:
            ke = edge_kernels['symb']
            for e1 in g1.edges(data=True):
                for e2 in g2.edges(data=True):
                    ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
                    ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
                    ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
                    ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
                    ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
    else:
        # edge non-symb labeled
        if ds_attrs['edge_attr_dim'] > 0:
            ke = edge_kernels['nsymb']
            for e1 in g1.edges(data=True):
                for e2 in g2.edges(data=True):
                    ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
                    ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
                    ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
                    ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
                    ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
        # edge unlabeled
        else:
            pass

    return ek_dict
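
# Each edge pair above is stored under all four orientation combinations, so
# path lookups succeed regardless of the direction in which a path traverses
# an edge. A minimal sketch of kernel dictionaries that fit the
# 'symb'/'nsymb'/'mix' contract (the helper names are assumptions, not
# gklearn API):
#
#     def dirac(x, y):                  # symbolic labels
#         return 1 if x == y else 0
#
#     def gaussian(x, y, gamma=1.0):    # non-symbolic attribute vectors
#         x, y = np.asarray(x, float), np.asarray(y, float)
#         return np.exp(-gamma * np.sum((x - y) ** 2))
#
#     def mix(sx, sy, ax, ay):          # both label types combined
#         return dirac(sx, sy) * gaussian(ax, ay)
#
#     node_kernels = {'symb': dirac, 'nsymb': gaussian, 'mix': mix}
#     edge_kernels = {'symb': dirac, 'nsymb': gaussian, 'mix': mix}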


# traverse all paths in graph1. Depth-first search is applied.
def traverseBothTriem(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            # print(node['count'])
            traverseTrie2m(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
                           pcurrent=[])
        if node['children'] != {}:
            traverseBothTriem(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
        else:
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph2 and compute the kernel between each of them
# and path p1 from graph1. Depth-first search is applied.
def traverseTrie2m(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            # print(node['count'])
            if len(p1) == len(pcurrent):
                kpath = vk_dict[(p1[0], pcurrent[0])]
                if kpath:
                    for idx in range(1, len(p1)):
                        kpath *= vk_dict[(p1[idx], pcurrent[idx])] * \
                            ek_dict[((p1[idx-1], p1[idx]),
                                     (pcurrent[idx-1], pcurrent[idx]))]
                        if not kpath:
                            break
                    kernel[0] += kpath  # add up kernels of all paths
        if node['children'] != {}:
            traverseTrie2m(node, p1, kernel, vk_dict, ek_dict, pcurrent)
        else:
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph1. Depth-first search is applied.
def traverseBothTriev(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            # print(node['count'])
            traverseTrie2v(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
                           pcurrent=[])
        if node['children'] != {}:
            traverseBothTriev(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
        else:
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph2 and compute the kernel between each of them
# and path p1 from graph1. Depth-first search is applied.
def traverseTrie2v(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            # print(node['count'])
            if len(p1) == len(pcurrent):
                kpath = vk_dict[(p1[0], pcurrent[0])]
                if kpath:
                    for idx in range(1, len(p1)):
                        kpath *= vk_dict[(p1[idx], pcurrent[idx])]
                        if not kpath:
                            break
                    kernel[0] += kpath  # add up kernels of all paths
        if node['children'] != {}:
            traverseTrie2v(node, p1, kernel, vk_dict, ek_dict, pcurrent)
        else:
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph1. Depth-first search is applied.
def traverseBothTriee(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            # print(node['count'])
            traverseTrie2e(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
                           pcurrent=[])
        if node['children'] != {}:
            traverseBothTriee(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
        else:
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph2 and compute the kernel between each of them
# and path p1 from graph1. Depth-first search is applied.
def traverseTrie2e(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            # print(node['count'])
            if len(p1) == len(pcurrent):
                if len(p1) == 0:
                    kernel[0] += 1
                else:
                    kpath = 1
                    for idx in range(0, len(p1) - 1):
                        kpath *= ek_dict[((p1[idx], p1[idx+1]),
                                          (pcurrent[idx], pcurrent[idx+1]))]
                        if not kpath:
                            break
                    kernel[0] += kpath  # add up kernels of all paths
        if node['children'] != {}:
            traverseTrie2e(node, p1, kernel, vk_dict, ek_dict, pcurrent)
        else:
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph1. Depth-first search is applied.
def traverseBothTrieu(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            # print(node['count'])
            traverseTrie2u(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
                           pcurrent=[])
        if node['children'] != {}:
            traverseBothTrieu(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
        else:
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph2 and compute the kernel between each of them
# and path p1 from graph1. Depth-first search is applied.
def traverseTrie2u(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            # print(node['count'])
            if len(p1) == len(pcurrent):
                kernel[0] += 1
        if node['children'] != {}:
            traverseTrie2u(node, p1, kernel, vk_dict, ek_dict, pcurrent)
        else:
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]
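
# Worked example of the trie traversal: if trie1 holds the paths (0, 1) and
# (1, 0) while trie2 holds (2, 3), traverseBothTrie* enumerates each
# end-of-word of trie1 depth-first and, for each one, traverseTrie2*
# enumerates trie2, pairing only equal-length paths; here the pairs
# ((0, 1), (2, 3)) and ((1, 0), (2, 3)) each contribute one kpath term to
# kernel[0].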


#def computePathKernel(p1, p2, vk_dict, ek_dict):
#    kernel = 0
#    if vk_dict:
#        if ek_dict:
#            if len(p1) == len(p2):
#                kpath = vk_dict[(p1[0], p2[0])]
#                if kpath:
#                    for idx in range(1, len(p1)):
#                        kpath *= vk_dict[(p1[idx], p2[idx])] * \
#                            ek_dict[((p1[idx-1], p1[idx]),
#                                     (p2[idx-1], p2[idx]))]
#                        if not kpath:
#                            break
#                    kernel += kpath  # add up kernels of all paths
#        else:
#            if len(p1) == len(p2):
#                kpath = vk_dict[(p1[0], p2[0])]
#                if kpath:
#                    for idx in range(1, len(p1)):
#                        kpath *= vk_dict[(p1[idx], p2[idx])]
#                        if not kpath:
#                            break
#                    kernel += kpath  # add up kernels of all paths
#    else:
#        if ek_dict:
#            if len(p1) == len(p2):
#                if len(p1) == 0:
#                    kernel += 1
#                else:
#                    kpath = 1
#                    for idx in range(0, len(p1) - 1):
#                        kpath *= ek_dict[((p1[idx], p1[idx+1]),
#                                          (p2[idx], p2[idx+1]))]
#                        if not kpath:
#                            break
#                    kernel += kpath  # add up kernels of all paths
#        else:
#            if len(p1) == len(p2):
#                kernel += 1
#
#    return kernel


def get_shortest_paths(G, weight, directed):
    """Get all shortest paths of a graph.

    Parameters
    ----------
    G : NetworkX graph
        The graph whose paths are calculated.
    weight : string/None
        Edge attribute used as weight to calculate the shortest paths.
    directed : boolean
        Whether the graph is directed.

    Returns
    -------
    sp : list of list
        List of shortest paths of the graph, where each path is represented
        by a list of nodes.
    """
    sp = []
    for n1, n2 in combinations(G.nodes(), 2):
        try:
            spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
        except nx.NetworkXNoPath:  # nodes not connected
#            sp.append([])
            pass
        else:
            sp += spltemp
            # each shortest path is counted twice, once from each of its two
            # end nodes.
            if not directed:
                sp += [sptemp[::-1] for sptemp in spltemp]
    # add single nodes as length-0 paths.
    sp += [[n] for n in G.nodes()]
    return sp
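
# For instance, on the undirected path graph 0 - 1 - 2 with unit weights,
# get_shortest_paths returns (up to ordering) [[0, 1], [1, 0], [0, 1, 2],
# [2, 1, 0], [1, 2], [2, 1], [0], [1], [2]].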


def wrapper_getSP_naive(weight, directed, itr_item):
    g = itr_item[0]
    i = itr_item[1]
    return i, get_shortest_paths(g, weight, directed)


def get_sps_as_trie(G, weight, directed):
    """Get all shortest paths of a graph and insert them into a trie.

    Parameters
    ----------
    G : NetworkX graph
        The graph whose paths are calculated.
    weight : string/None
        Edge attribute used as weight to calculate the shortest paths.
    directed : boolean
        Whether the graph is directed.

    Returns
    -------
    sptrie : Trie
        Trie containing all shortest paths of the graph, where each path is
        represented by a list of nodes.
    lensp : int
        Number of paths stored, with single nodes counted as length-0 paths.
    """
    sptrie = Trie()
    lensp = 0
    for n1, n2 in combinations(G.nodes(), 2):
        try:
            spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
        except nx.NetworkXNoPath:  # nodes not connected
            pass
        else:
            lensp += len(spltemp)
            if not directed:
                lensp += len(spltemp)
            for sp in spltemp:
                sptrie.insertWord(sp)
                # each shortest path is counted twice, once from each of its
                # two end nodes.
                if not directed:
                    sptrie.insertWord(sp[::-1])
    # add single nodes as length-0 paths.
    for n in G.nodes():
        sptrie.insertWord([n])
    return sptrie, lensp + nx.number_of_nodes(G)


def wrapper_getSP_trie(weight, directed, itr_item):
    g = itr_item[0]
    i = itr_item[1]
    return i, get_sps_as_trie(g, weight, directed)
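

if __name__ == '__main__':
    # Illustrative usage sketch: the two toy molecule-like graphs and the
    # _dirac helper below are local assumptions, not gklearn API or data.
    # It computes the 2x2 kernel matrix with Dirac kernels on the symbolic
    # 'atom'/'bond_type' labels, serially and with the naive path storage.
    def _dirac(x, y):
        # Dirac kernel: 1 if the two symbolic labels match, else 0.
        return 1 if x == y else 0

    g1 = nx.Graph()
    g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}),
                       (2, {'atom': 'O'})])
    g1.add_edges_from([(0, 1, {'bond_type': '1'}),
                       (1, 2, {'bond_type': '2'})])
    g2 = nx.Graph()
    g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
    g2.add_edge(0, 1, bond_type='1')

    kernels = {'symb': _dirac, 'nsymb': None, 'mix': None}
    Kmatrix, run_time = structuralspkernel(
        [g1, g2], node_label='atom', edge_label='bond_type',
        node_kernels=kernels, edge_kernels=kernels,
        compute_method='naive', parallel=None, verbose=False)
    print(Kmatrix)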
