untilHPathKernel.py

  1. """
  2. @author: linlin
  3. @references:
  4. [1] Liva Ralaivola, Sanjay J Swamidass, Hiroto Saigo, and Pierre
  5. Baldi. Graph kernels for chemical informatics. Neural networks,
  6. 18(8):1093–1110, 2005.
  7. """
  8. import sys
  9. sys.path.insert(0, "../")
  10. import time
  11. from collections import Counter
  12. from itertools import chain
  13. from functools import partial
  14. from multiprocessing import Pool
  15. from tqdm import tqdm
  16. import networkx as nx
  17. import numpy as np
  18. from gklearn.utils.graphdataset import get_dataset_attributes
  19. from gklearn.utils.parallel import parallel_gm
  20. from gklearn.utils.trie import Trie
  21. def untilhpathkernel(*args,
  22. node_label='atom',
  23. edge_label='bond_type',
  24. depth=10,
  25. k_func='MinMax',
  26. compute_method='trie',
  27. parallel=True,
  28. n_jobs=None,
  29. verbose=True):
  30. """Calculate path graph kernels up to depth/hight h between graphs.
  31. Parameters
  32. ----------
  33. Gn : List of NetworkX graph
  34. List of graphs between which the kernels are calculated.
  35. G1, G2 : NetworkX graphs
  36. Two graphs between which the kernel is calculated.
  37. node_label : string
  38. Node attribute used as label. The default node label is atom.
  39. edge_label : string
  40. Edge attribute used as label. The default edge label is bond_type.
  41. depth : integer
  42. Depth of search. Longest length of paths.
  43. k_func : function
  44. A kernel function applied using different notions of fingerprint
  45. similarity, defining the type of feature map and normalization method
  46. applied for the graph kernel. The Following choices are available:
  47. 'MinMax': use the MiniMax kernel and counting feature map.
  48. 'tanimoto': use the Tanimoto kernel and binary feature map.
  49. None: no sub-kernel is used, the kernel is computed directly.
  50. compute_method : string
  51. Computation method to store paths and compute the graph kernel. The
  52. Following choices are available:
  53. 'trie': store paths as tries.
  54. 'naive': store paths to lists.
  55. n_jobs : int
  56. Number of jobs for parallelization.
  57. Return
  58. ------
  59. Kmatrix : Numpy matrix
  60. Kernel matrix, each element of which is the path kernel up to h between
  61. 2 praphs.
  62. """
    # pre-process
    depth = int(depth)
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
                    'edge_attr_dim', 'is_directed'],
        node_label=node_label, edge_label=edge_label)
    if k_func is not None:
        if not ds_attrs['node_labeled']:
            for G in Gn:
                nx.set_node_attributes(G, '0', 'atom')
        if not ds_attrs['edge_labeled']:
            for G in Gn:
                nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    if parallel == 'imap_unordered':
        # ---- use pool.imap_unordered to parallelize and track progress. ----
        # get all paths of all graphs before calculating kernels to save time,
        # but this may cost a lot of memory for large datasets.
        if n_jobs is None:
            # the chunksize computation below needs a concrete job count.
            n_jobs = cpu_count()
        pool = Pool(n_jobs)
        itr = zip(Gn, range(0, len(Gn)))
        if len(Gn) < 100 * n_jobs:
            chunksize = int(len(Gn) / n_jobs) + 1
        else:
            chunksize = 100
        all_paths = [[] for _ in range(len(Gn))]
        if compute_method == 'trie' and k_func is not None:
            getps_partial = partial(wrapper_find_all_path_as_trie, depth,
                                    ds_attrs, node_label, edge_label)
        elif compute_method != 'trie' and k_func is not None:
            getps_partial = partial(wrapper_find_all_paths_until_length, depth,
                                    ds_attrs, node_label, edge_label, True)
        else:
            getps_partial = partial(wrapper_find_all_paths_until_length, depth,
                                    ds_attrs, node_label, edge_label, False)
        if verbose:
            iterator = tqdm(pool.imap_unordered(getps_partial, itr, chunksize),
                            desc='getting paths', file=sys.stdout)
        else:
            iterator = pool.imap_unordered(getps_partial, itr, chunksize)
        for i, ps in iterator:
            all_paths[i] = ps
        pool.close()
        pool.join()
        if compute_method == 'trie' and k_func is not None:
            def init_worker(trie_toshare):
                global G_trie
                G_trie = trie_toshare
            do_partial = partial(wrapper_uhpath_do_trie, k_func)
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                        glbv=(all_paths,), n_jobs=n_jobs, verbose=verbose)
        elif compute_method != 'trie' and k_func is not None:
            def init_worker(plist_toshare):
                global G_plist
                G_plist = plist_toshare
            do_partial = partial(wrapper_uhpath_do_naive, k_func)
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                        glbv=(all_paths,), n_jobs=n_jobs, verbose=verbose)
        else:
            def init_worker(plist_toshare):
                global G_plist
                G_plist = plist_toshare
            # pass k_func (None here) to match the wrapper's signature; the
            # original partial referenced the undefined name edge_kernels.
            do_partial = partial(wrapper_uhpath_do_kernelless, k_func)
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                        glbv=(all_paths,), n_jobs=n_jobs, verbose=verbose)
    elif parallel is None:
        # ---- direct running, normally use a single CPU core. ----
        if compute_method == 'trie':
            all_paths = [
                find_all_path_as_trie(Gn[i],
                                      depth,
                                      ds_attrs,
                                      node_label=node_label,
                                      edge_label=edge_label) for i in tqdm(
                    range(0, len(Gn)), desc='getting paths', file=sys.stdout)
            ]
            pbar = tqdm(
                total=((len(Gn) + 1) * len(Gn) / 2),
                desc='calculating kernels',
                file=sys.stdout)
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    Kmatrix[i][j] = _untilhpathkernel_do_trie(all_paths[i],
                                                              all_paths[j], k_func)
                    Kmatrix[j][i] = Kmatrix[i][j]
                    pbar.update(1)
        else:
            all_paths = [
                find_all_paths_until_length(
                    Gn[i],
                    depth,
                    ds_attrs,
                    node_label=node_label,
                    edge_label=edge_label) for i in tqdm(
                    range(0, len(Gn)), desc='getting paths', file=sys.stdout)
            ]
            pbar = tqdm(
                total=((len(Gn) + 1) * len(Gn) / 2),
                desc='calculating kernels',
                file=sys.stdout)
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    Kmatrix[i][j] = _untilhpathkernel_do_naive(all_paths[i],
                                                               all_paths[j], k_func)
                    Kmatrix[j][i] = Kmatrix[i][j]
                    pbar.update(1)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- kernel matrix of path kernel up to %d of size %d built in %s seconds ---"
              % (depth, len(Gn), run_time))

    # the original also returned sizeof_allpaths, which was only computed in
    # commented-out profiling code; return the kernel matrix and run time.
    return Kmatrix, run_time
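
# A minimal usage sketch added for illustration; it is not part of the original
# module. It assumes two small molecule-like graphs whose nodes carry an 'atom'
# label and whose edges a 'bond_type' label, matching the defaults above.
def _example_untilhpathkernel():
    g1 = nx.path_graph(3)
    nx.set_node_attributes(g1, {0: 'C', 1: 'C', 2: 'O'}, 'atom')
    nx.set_edge_attributes(g1, '1', 'bond_type')
    g2 = nx.path_graph(4)
    nx.set_node_attributes(g2, {0: 'C', 1: 'O', 2: 'C', 3: 'C'}, 'atom')
    nx.set_edge_attributes(g2, '1', 'bond_type')
    # run serially on the two graphs, comparing paths up to length 2
    Kmatrix, run_time = untilhpathkernel(g1, g2, depth=2, k_func='MinMax',
                                         compute_method='naive', parallel=None)
    return Kmatrix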

def _untilhpathkernel_do_trie(trie1, trie2, k_func):
    """Calculate path graph kernels up to depth d between 2 graphs using tries.

    Parameters
    ----------
    trie1, trie2 : Trie
        Tries that contain all paths of the 2 graphs.
    k_func : string
        A kernel function applied using different notions of fingerprint
        similarity.

    Return
    ------
    kernel : float
        Path kernel up to h between 2 graphs.
    """
    if k_func == 'tanimoto':
        # traverse all paths in graph1 and search them in graph2. Depth-first
        # search is applied.
        def traverseTrie1t(root, trie2, setlist, pcurrent=None):
            if pcurrent is None:  # avoid a shared mutable default argument
                pcurrent = []
            for key, node in root['children'].items():
                pcurrent.append(key)
                if node['isEndOfWord']:
                    setlist[1] += 1
                    count2 = trie2.searchWord(pcurrent)
                    if count2 != 0:
                        setlist[0] += 1
                if node['children'] != {}:
                    traverseTrie1t(node, trie2, setlist, pcurrent)
                else:
                    del pcurrent[-1]
            if pcurrent != []:
                del pcurrent[-1]

        # traverse all paths in graph2 and find out those that are not in
        # graph1. Depth-first search is applied.
        def traverseTrie2t(root, trie1, setlist, pcurrent=None):
            if pcurrent is None:
                pcurrent = []
            for key, node in root['children'].items():
                pcurrent.append(key)
                if node['isEndOfWord']:
                    count1 = trie1.searchWord(pcurrent)
                    if count1 == 0:
                        setlist[1] += 1
                if node['children'] != {}:
                    traverseTrie2t(node, trie1, setlist, pcurrent)
                else:
                    del pcurrent[-1]
            if pcurrent != []:
                del pcurrent[-1]

        setlist = [0, 0]  # intersection and union of path sets of g1, g2.
        traverseTrie1t(trie1.root, trie2, setlist)
        traverseTrie2t(trie2.root, trie1, setlist)
        kernel = setlist[0] / setlist[1]
    else:  # MinMax kernel
        # traverse all paths in graph1 and search them in graph2. Depth-first
        # search is applied.
        def traverseTrie1m(root, trie2, sumlist, pcurrent=None):
            if pcurrent is None:
                pcurrent = []
            for key, node in root['children'].items():
                pcurrent.append(key)
                if node['isEndOfWord']:
                    count1 = node['count']
                    count2 = trie2.searchWord(pcurrent)
                    sumlist[0] += min(count1, count2)
                    sumlist[1] += max(count1, count2)
                if node['children'] != {}:
                    traverseTrie1m(node, trie2, sumlist, pcurrent)
                else:
                    del pcurrent[-1]
            if pcurrent != []:
                del pcurrent[-1]

        # traverse all paths in graph2 and find out those that are not in
        # graph1. Depth-first search is applied.
        def traverseTrie2m(root, trie1, sumlist, pcurrent=None):
            if pcurrent is None:
                pcurrent = []
            for key, node in root['children'].items():
                pcurrent.append(key)
                if node['isEndOfWord']:
                    count1 = trie1.searchWord(pcurrent)
                    if count1 == 0:
                        sumlist[1] += node['count']
                if node['children'] != {}:
                    traverseTrie2m(node, trie1, sumlist, pcurrent)
                else:
                    del pcurrent[-1]
            if pcurrent != []:
                del pcurrent[-1]

        sumlist = [0, 0]  # sum of mins and sum of maxs
        traverseTrie1m(trie1.root, trie2, sumlist)
        traverseTrie2m(trie2.root, trie1, sumlist)
        kernel = sumlist[0] / sumlist[1]
    return kernel

def wrapper_uhpath_do_trie(k_func, itr):
    i = itr[0]
    j = itr[1]
    return i, j, _untilhpathkernel_do_trie(G_trie[i], G_trie[j], k_func)

def _untilhpathkernel_do_naive(paths1, paths2, k_func):
    """Calculate path graph kernels up to depth d between 2 graphs naively.

    Parameters
    ----------
    paths1, paths2 : list
        Lists of paths in the 2 graphs, where for unlabeled graphs, each path
        is represented by a list of nodes; while for labeled graphs, each path
        is represented by a string consisting of labels of nodes and/or edges
        on that path.
    k_func : string
        A kernel function applied using different notions of fingerprint
        similarity.

    Return
    ------
    kernel : float
        Path kernel up to h between 2 graphs.
    """
    all_paths = list(set(paths1 + paths2))

    if k_func == 'tanimoto':
        length_union = len(all_paths)
        kernel = (len(set(paths1)) + len(set(paths2)) -
                  length_union) / length_union
    else:  # MinMax kernel
        path_count1 = Counter(paths1)
        path_count2 = Counter(paths2)
        # a Counter returns 0 for missing keys, so no membership test is needed
        vector1 = [path_count1[key] for key in all_paths]
        vector2 = [path_count2[key] for key in all_paths]
        kernel = np.sum(np.minimum(vector1, vector2)) / \
                 np.sum(np.maximum(vector1, vector2))
    return kernel
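
# An illustrative check added for clarity; it is not part of the original
# module. With label sequences as tuples, paths1 and paths2 below share the
# path set {('C',), ('O',)}, so the Tanimoto kernel is (2 + 2 - 2) / 2 = 1.0,
# while the MinMax kernel compares counts: sum of mins = 1 + 1 = 2 and sum of
# maxs = 2 + 2 = 4, giving 0.5.
def _example_do_naive():
    paths1 = [('C',), ('C',), ('O',)]
    paths2 = [('C',), ('O',), ('O',)]
    assert _untilhpathkernel_do_naive(paths1, paths2, 'tanimoto') == 1.0
    assert _untilhpathkernel_do_naive(paths1, paths2, 'MinMax') == 0.5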

def wrapper_uhpath_do_naive(k_func, itr):
    i = itr[0]
    j = itr[1]
    return i, j, _untilhpathkernel_do_naive(G_plist[i], G_plist[j], k_func)

def _untilhpathkernel_do_kernelless(paths1, paths2, k_func):
    """Calculate path graph kernels up to depth d between 2 graphs without a
    sub-kernel; the computation is currently identical to the naive method.

    Parameters
    ----------
    paths1, paths2 : list
        Lists of paths in the 2 graphs, where for unlabeled graphs, each path
        is represented by a list of nodes; while for labeled graphs, each path
        is represented by a string consisting of labels of nodes and/or edges
        on that path.
    k_func : string
        A kernel function applied using different notions of fingerprint
        similarity.

    Return
    ------
    kernel : float
        Path kernel up to h between 2 graphs.
    """
    all_paths = list(set(paths1 + paths2))

    if k_func == 'tanimoto':
        length_union = len(all_paths)
        kernel = (len(set(paths1)) + len(set(paths2)) -
                  length_union) / length_union
    else:  # MinMax kernel
        path_count1 = Counter(paths1)
        path_count2 = Counter(paths2)
        vector1 = [path_count1[key] for key in all_paths]
        vector2 = [path_count2[key] for key in all_paths]
        kernel = np.sum(np.minimum(vector1, vector2)) / \
                 np.sum(np.maximum(vector1, vector2))
    return kernel


def wrapper_uhpath_do_kernelless(k_func, itr):
    i = itr[0]
    j = itr[1]
    return i, j, _untilhpathkernel_do_kernelless(G_plist[i], G_plist[j], k_func)

# @todo: (can maybe be removed) this method finds paths repeatedly; it could be faster.
def find_all_paths_until_length(G,
                                length,
                                ds_attrs,
                                node_label='atom',
                                edge_label='bond_type',
                                tolabelseqs=True):
    """Find all paths no longer than a certain maximum length in a graph. A
    recursive depth first search is applied.

    Parameters
    ----------
    G : NetworkX graphs
        The graph in which paths are searched.
    length : integer
        The maximum length of paths.
    ds_attrs : dict
        Dataset attributes.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.
    tolabelseqs : boolean
        Whether to convert the node paths to label sequences.

    Return
    ------
    path : list
        List of paths retrieved, where for unlabeled graphs, each path is
        represented by a list of nodes; while for labeled graphs, each path is
        represented by a list of strings consisting of labels of nodes and/or
        edges on that path.
    """
    path_l = [[n] for n in G.nodes]  # paths of length l
    all_paths = [p.copy() for p in path_l]
    for l in range(1, length + 1):
        path_lplus1 = []
        for path in path_l:
            for neighbor in G[path[-1]]:
                if neighbor not in path:  # extend simple paths only
                    tmp = path + [neighbor]
                    path_lplus1.append(tmp)
        all_paths += path_lplus1
        path_l = [p.copy() for p in path_lplus1]
    # consider labels. Note that raw node paths are lists, so callers that need
    # hashable paths should set tolabelseqs=True or convert them to tuples.
    return (paths2labelseqs(all_paths, G, ds_attrs, node_label, edge_label)
            if tolabelseqs else all_paths)
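
# An illustrative sketch added for clarity; it is not part of the original
# module. On an unlabeled triangle, the paths up to length 1 are the 3 single
# nodes plus the 6 directed single-edge walks, i.e. 9 node sequences.
def _example_find_all_paths_until_length():
    g = nx.cycle_graph(3)  # triangle on nodes 0, 1, 2
    ds_attrs = {'node_labeled': False, 'edge_labeled': False}
    paths = find_all_paths_until_length(g, 1, ds_attrs, tolabelseqs=False)
    assert len(paths) == 9  # 3 length-0 paths + 6 length-1 paths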

def wrapper_find_all_paths_until_length(length, ds_attrs, node_label,
                                        edge_label, tolabelseqs, itr_item):
    g = itr_item[0]
    i = itr_item[1]
    return i, find_all_paths_until_length(g, length, ds_attrs,
                                          node_label=node_label,
                                          edge_label=edge_label,
                                          tolabelseqs=tolabelseqs)

def find_all_path_as_trie(G,
                          length,
                          ds_attrs,
                          node_label='atom',
                          edge_label='bond_type'):
    """Find all paths no longer than a maximum length in a graph and store
    them in a trie of label sequences.
    """
    # traverse all paths up to length h in a graph and construct a trie with
    # them. Depth-first search is applied. Notice the reverse of each path is
    # also stored in the trie, since the traversal starts from every node.
    def traverseGraph(root, ptrie, length, G, ds_attrs, node_label, edge_label,
                      pcurrent=None):
        if pcurrent is None:  # avoid a shared mutable default argument
            pcurrent = []
        if len(pcurrent) < length + 1:
            for neighbor in G[root]:
                if neighbor not in pcurrent:
                    pcurrent.append(neighbor)
                    plstr = paths2labelseqs([pcurrent], G, ds_attrs,
                                            node_label, edge_label)
                    ptrie.insertWord(plstr[0])
                    traverseGraph(neighbor, ptrie, length, G, ds_attrs,
                                  node_label, edge_label, pcurrent)
                    del pcurrent[-1]

    ptrie = Trie()
    path_l = [[n] for n in G.nodes]  # paths of length 0, i.e. single nodes
    path_l_str = paths2labelseqs(path_l, G, ds_attrs, node_label, edge_label)
    for p in path_l_str:
        ptrie.insertWord(p)
    for n in G.nodes:
        traverseGraph(n, ptrie, length, G, ds_attrs, node_label, edge_label,
                      pcurrent=[n])
    return ptrie
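
# An illustrative sketch added for clarity; it is not part of the original
# module. It assumes the Trie class from gklearn.utils.trie, whose searchWord
# returns the number of times a label sequence was inserted (0 if absent), as
# relied upon by the trie traversals above.
def _example_find_all_path_as_trie():
    g = nx.path_graph(2)  # a single edge: 0 - 1
    nx.set_node_attributes(g, {0: 'C', 1: 'O'}, 'atom')
    nx.set_edge_attributes(g, '1', 'bond_type')
    ds_attrs = {'node_labeled': True, 'edge_labeled': True}
    ptrie = find_all_path_as_trie(g, 1, ds_attrs)
    # the length-1 path 0 -> 1 is stored as the label sequence ('C', '1', 'O')
    assert ptrie.searchWord(['C', '1', 'O']) > 0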

def wrapper_find_all_path_as_trie(length, ds_attrs, node_label,
                                  edge_label, itr_item):
    g = itr_item[0]
    i = itr_item[1]
    return i, find_all_path_as_trie(g, length, ds_attrs,
                                    node_label=node_label,
                                    edge_label=edge_label)

def paths2labelseqs(plist, G, ds_attrs, node_label, edge_label):
    # G.node was removed in NetworkX 2.4; G.nodes is used instead.
    if ds_attrs['node_labeled']:
        if ds_attrs['edge_labeled']:
            path_strs = [
                tuple(
                    list(
                        chain.from_iterable(
                            (G.nodes[node][node_label],
                             G[node][path[idx + 1]][edge_label])
                            for idx, node in enumerate(path[:-1]))) +
                    [G.nodes[path[-1]][node_label]]) for path in plist
            ]
        else:
            path_strs = [
                tuple([G.nodes[node][node_label] for node in path])
                for path in plist
            ]
        return path_strs
    else:
        if ds_attrs['edge_labeled']:
            return [
                tuple([] if len(path) == 1 else [
                    G[node][path[idx + 1]][edge_label]
                    for idx, node in enumerate(path[:-1])
                ]) for path in plist
            ]
        else:
            return [tuple(['0' for node in path]) for path in plist]
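
# An illustrative sketch added for clarity; it is not part of the original
# module. Label sequences interleave node and edge labels along a path, so the
# node path [0, 1] over a labeled edge becomes the single tuple ('C', '1', 'O').
def _example_paths2labelseqs():
    g = nx.path_graph(2)
    nx.set_node_attributes(g, {0: 'C', 1: 'O'}, 'atom')
    nx.set_edge_attributes(g, '1', 'bond_type')
    ds_attrs = {'node_labeled': True, 'edge_labeled': True}
    seqs = paths2labelseqs([[0, 1]], g, ds_attrs, 'atom', 'bond_type')
    assert seqs == [('C', '1', 'O')]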

A Python package for graph kernels, graph edit distances and the graph pre-image problem.