You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

structuralspKernel.py 17 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Thu Sep 27 10:56:23 2018
  5. @author: linlin
  6. @references: Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For
  7. Measuring Similarity of Shapes. In ESANN 2007 Apr 25 (pp. 355-360).
  8. """
  9. import sys
  10. import time
  11. from itertools import combinations, combinations_with_replacement, product
  12. from functools import partial
  13. from multiprocessing import Pool
  14. from tqdm import tqdm
  15. import networkx as nx
  16. import numpy as np
  17. from pygraph.utils.graphdataset import get_dataset_attributes
  18. sys.path.insert(0, "../")
  19. def structuralspkernel(*args,
  20. node_label='atom',
  21. edge_weight=None,
  22. edge_label='bond_type',
  23. node_kernels=None,
  24. edge_kernels=None,
  25. n_jobs=None):
  26. """Calculate mean average structural shortest path kernels between graphs.
  27. Parameters
  28. ----------
  29. Gn : List of NetworkX graph
  30. List of graphs between which the kernels are calculated.
  31. /
  32. G1, G2 : NetworkX graphs
  33. 2 graphs between which the kernel is calculated.
  34. node_label : string
  35. node attribute used as label. The default node label is atom.
  36. edge_weight : string
  37. Edge attribute name corresponding to the edge weight.
  38. edge_label : string
  39. edge attribute used as label. The default edge label is bond_type.
  40. node_kernels: dict
  41. A dictionary of kernel functions for nodes, including 3 items: 'symb'
  42. for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
  43. for both labels. The first 2 functions take two node labels as
  44. parameters, and the 'mix' function takes 4 parameters, a symbolic and a
  45. non-symbolic label for each the two nodes. Each label is in form of 2-D
  46. dimension array (n_samples, n_features). Each function returns a number
  47. as the kernel value. Ignored when nodes are unlabeled.
  48. edge_kernels: dict
  49. A dictionary of kernel functions for edges, including 3 items: 'symb'
  50. for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix'
  51. for both labels. The first 2 functions take two edge labels as
  52. parameters, and the 'mix' function takes 4 parameters, a symbolic and a
  53. non-symbolic label for each the two edges. Each label is in form of 2-D
  54. dimension array (n_samples, n_features). Each function returns a number
  55. as the kernel value. Ignored when edges are unlabeled.
  56. Return
  57. ------
  58. Kmatrix : Numpy matrix
  59. Kernel matrix, each element of which is the mean average structural
  60. shortest path kernel between 2 praphs.
  61. """
  62. # pre-process
  63. Gn = args[0] if len(args) == 1 else [args[0], args[1]]
  64. weight = None
  65. if edge_weight is None:
  66. print('\n None edge weight specified. Set all weight to 1.\n')
  67. else:
  68. try:
  69. some_weight = list(
  70. nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
  71. if isinstance(some_weight, (float, int)):
  72. weight = edge_weight
  73. else:
  74. print(
  75. '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
  76. % edge_weight)
  77. except:
  78. print(
  79. '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
  80. % edge_weight)
  81. ds_attrs = get_dataset_attributes(
  82. Gn,
  83. attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
  84. 'edge_attr_dim', 'is_directed'],
  85. node_label=node_label, edge_label=edge_label)
  86. start_time = time.time()
  87. # get shortest paths of each graph in Gn
  88. splist = [None] * len(Gn)
  89. pool = Pool(n_jobs)
  90. # get shortest path graphs of Gn
  91. getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed'])
  92. itr = zip(Gn, range(0, len(Gn)))
  93. if len(Gn) < 1000 * n_jobs:
  94. chunksize = int(len(Gn) / n_jobs) + 1
  95. else:
  96. chunksize = 1000
  97. # chunksize = 300 # int(len(list(itr)) / n_jobs)
  98. for i, sp in tqdm(
  99. pool.imap_unordered(getsp_partial, itr, chunksize),
  100. desc='getting shortest paths',
  101. file=sys.stdout):
  102. splist[i] = sp
  103. # time.sleep(10)
  104. pool.close()
  105. pool.join()
  106. # # get shortest paths of each graph in Gn
  107. # splist = [[] for _ in range(len(Gn))]
  108. # # get shortest path graphs of Gn
  109. # getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed'])
  110. # itr = zip(Gn, range(0, len(Gn)))
  111. # if len(Gn) < 1000 * n_jobs:
  112. # chunksize = int(len(Gn) / n_jobs) + 1
  113. # else:
  114. # chunksize = 1000
  115. # # chunksize = 300 # int(len(list(itr)) / n_jobs)
  116. # from contextlib import closing
  117. # with closing(Pool(n_jobs)) as pool:
  118. ## for i, sp in tqdm(
  119. # res = pool.imap_unordered(getsp_partial, itr, 10)
  120. ## desc='getting shortest paths',
  121. ## file=sys.stdout):
  122. ## splist[i] = sp
  123. ## time.sleep(10)
  124. # pool.close()
  125. # pool.join()
  126. # ss = 0
  127. # ss += sys.getsizeof(splist)
  128. # for spss in splist:
  129. # ss += sys.getsizeof(spss)
  130. # for spp in spss:
  131. # ss += sys.getsizeof(spp)
  132. # time.sleep(20)
  133. # # ---- direct running, normally use single CPU core. ----
  134. # splist = []
  135. # for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout):
  136. # splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed']))
  137. # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP)
  138. # sp_ml = [0] * len(Gn) # shortest path matrices
  139. # for i in result_sp:
  140. # sp_ml[i[0]] = i[1]
  141. # edge_x_g = [[] for i in range(len(sp_ml))]
  142. # edge_y_g = [[] for i in range(len(sp_ml))]
  143. # edge_w_g = [[] for i in range(len(sp_ml))]
  144. # for idx, item in enumerate(sp_ml):
  145. # for i1 in range(len(item)):
  146. # for i2 in range(i1 + 1, len(item)):
  147. # if item[i1, i2] != np.inf:
  148. # edge_x_g[idx].append(i1)
  149. # edge_y_g[idx].append(i2)
  150. # edge_w_g[idx].append(item[i1, i2])
  151. # print(len(edge_x_g[0]))
  152. # print(len(edge_y_g[0]))
  153. # print(len(edge_w_g[0]))
  154. Kmatrix = np.zeros((len(Gn), len(Gn)))
  155. # ---- use pool.imap_unordered to parallel and track progress. ----
  156. pool = Pool(n_jobs)
  157. do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
  158. node_kernels, edge_kernels)
  159. itr = zip(combinations_with_replacement(Gn, 2),
  160. combinations_with_replacement(splist, 2),
  161. combinations_with_replacement(range(0, len(Gn)), 2))
  162. len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
  163. if len_itr < 1000 * n_jobs:
  164. chunksize = int(len_itr / n_jobs) + 1
  165. else:
  166. chunksize = 1000
  167. for i, j, kernel in tqdm(
  168. pool.imap_unordered(do_partial, itr, chunksize),
  169. desc='calculating kernels',
  170. file=sys.stdout):
  171. Kmatrix[i][j] = kernel
  172. Kmatrix[j][i] = kernel
  173. pool.close()
  174. pool.join()
  175. # # ---- use pool.imap_unordered to parallel and track progress. ----
  176. # do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
  177. # node_kernels, edge_kernels)
  178. # itr = zip(combinations_with_replacement(Gn, 2),
  179. # combinations_with_replacement(splist, 2),
  180. # combinations_with_replacement(range(0, len(Gn)), 2))
  181. # len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
  182. # if len_itr < 1000 * n_jobs:
  183. # chunksize = int(len_itr / n_jobs) + 1
  184. # else:
  185. # chunksize = 1000
  186. # from contextlib import closing
  187. # with closing(Pool(n_jobs)) as pool:
  188. # for i, j, kernel in tqdm(
  189. # pool.imap_unordered(do_partial, itr, 1000),
  190. # desc='calculating kernels',
  191. # file=sys.stdout):
  192. # Kmatrix[i][j] = kernel
  193. # Kmatrix[j][i] = kernel
  194. # pool.close()
  195. # pool.join()
  196. # # ---- direct running, normally use single CPU core. ----
  197. # itr = zip(combinations_with_replacement(Gn, 2),
  198. # combinations_with_replacement(splist, 2),
  199. # combinations_with_replacement(range(0, len(Gn)), 2))
  200. # for gs in tqdm(itr, desc='calculating kernels', file=sys.stdout):
  201. # i, j, kernel = wrapper_ssp_do(ds_attrs, node_label, edge_label,
  202. # node_kernels, edge_kernels, gs)
  203. # if(kernel > 1):
  204. # print("error here ")
  205. # Kmatrix[i][j] = kernel
  206. # Kmatrix[j][i] = kernel
  207. run_time = time.time() - start_time
  208. print(
  209. "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
  210. % (len(Gn), run_time))
  211. return Kmatrix, run_time
  212. def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label,
  213. node_kernels, edge_kernels):
  214. kernel = 0
  215. # First, compute shortest path matrices, method borrowed from FCSP.
  216. if ds_attrs['node_labeled']:
  217. # node symb and non-synb labeled
  218. if ds_attrs['node_attr_dim'] > 0:
  219. kn = node_kernels['mix']
  220. vk_dict = {} # shortest path matrices dict
  221. for n1, n2 in product(
  222. g1.nodes(data=True), g2.nodes(data=True)):
  223. vk_dict[(n1[0], n2[0])] = kn(
  224. n1[1][node_label], n2[1][node_label],
  225. n1[1]['attributes'], n2[1]['attributes'])
  226. # node symb labeled
  227. else:
  228. kn = node_kernels['symb']
  229. vk_dict = {} # shortest path matrices dict
  230. for n1 in g1.nodes(data=True):
  231. for n2 in g2.nodes(data=True):
  232. vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
  233. n2[1][node_label])
  234. else:
  235. # node non-synb labeled
  236. if ds_attrs['node_attr_dim'] > 0:
  237. kn = node_kernels['nsymb']
  238. vk_dict = {} # shortest path matrices dict
  239. for n1 in g1.nodes(data=True):
  240. for n2 in g2.nodes(data=True):
  241. vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
  242. n2[1]['attributes'])
  243. # node unlabeled
  244. else:
  245. vk_dict = {}
  246. # Then, compute kernels between all pairs of edges, which idea is an
  247. # extension of FCSP. It suits sparse graphs, which is the most case we
  248. # went though. For dense graphs, it would be slow.
  249. if ds_attrs['edge_labeled']:
  250. # edge symb and non-synb labeled
  251. if ds_attrs['edge_attr_dim'] > 0:
  252. ke = edge_kernels['mix']
  253. ek_dict = {} # dict of edge kernels
  254. for e1, e2 in product(
  255. g1.edges(data=True), g2.edges(data=True)):
  256. ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
  257. e1[2]['attributes'], e2[2]['attributes'])
  258. ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
  259. ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
  260. ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
  261. ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
  262. # edge symb labeled
  263. else:
  264. ke = edge_kernels['symb']
  265. ek_dict = {}
  266. for e1 in g1.edges(data=True):
  267. for e2 in g2.edges(data=True):
  268. ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
  269. ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
  270. ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
  271. ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
  272. ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
  273. else:
  274. # edge non-synb labeled
  275. if ds_attrs['edge_attr_dim'] > 0:
  276. ke = edge_kernels['nsymb']
  277. ek_dict = {}
  278. for e1 in g1.edges(data=True):
  279. for e2 in g2.edges(data=True):
  280. ek_temp = kn(e1[2]['attributes'], e2[2]['attributes'])
  281. ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
  282. ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
  283. ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
  284. ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
  285. # edge unlabeled
  286. else:
  287. ek_dict = {}
  288. # compute graph kernels
  289. if vk_dict:
  290. if ek_dict:
  291. for p1, p2 in product(spl1, spl2):
  292. if len(p1) == len(p2):
  293. kpath = vk_dict[(p1[0], p2[0])]
  294. if kpath:
  295. for idx in range(1, len(p1)):
  296. kpath *= vk_dict[(p1[idx], p2[idx])] * \
  297. ek_dict[((p1[idx-1], p1[idx]),
  298. (p2[idx-1], p2[idx]))]
  299. if not kpath:
  300. break
  301. kernel += kpath # add up kernels of all paths
  302. else:
  303. for p1, p2 in product(spl1, spl2):
  304. if len(p1) == len(p2):
  305. kpath = vk_dict[(p1[0], p2[0])]
  306. if kpath:
  307. for idx in range(1, len(p1)):
  308. kpath *= vk_dict[(p1[idx], p2[idx])]
  309. if not kpath:
  310. break
  311. kernel += kpath # add up kernels of all paths
  312. else:
  313. if ek_dict:
  314. for p1, p2 in product(spl1, spl2):
  315. if len(p1) == len(p2):
  316. if len(p1) == 0:
  317. kernel += 1
  318. else:
  319. kpath = 1
  320. for idx in range(0, len(p1) - 1):
  321. kpath *= ek_dict[((p1[idx], p1[idx+1]),
  322. (p2[idx], p2[idx+1]))]
  323. if not kpath:
  324. break
  325. kernel += kpath # add up kernels of all paths
  326. else:
  327. for p1, p2 in product(spl1, spl2):
  328. if len(p1) == len(p2):
  329. kernel += 1
  330. kernel = kernel / (len(spl1) * len(spl2)) # calculate mean average
  331. # # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2], sadly it is slower than the current implementation
  332. # # compute vertex kernel matrix
  333. # try:
  334. # vk_mat = np.zeros((nx.number_of_nodes(g1),
  335. # nx.number_of_nodes(g2)))
  336. # g1nl = enumerate(g1.nodes(data=True))
  337. # g2nl = enumerate(g2.nodes(data=True))
  338. # for i1, n1 in g1nl:
  339. # for i2, n2 in g2nl:
  340. # vk_mat[i1][i2] = kn(
  341. # n1[1][node_label], n2[1][node_label],
  342. # [n1[1]['attributes']], [n2[1]['attributes']])
  343. # range1 = range(0, len(edge_w_g[i]))
  344. # range2 = range(0, len(edge_w_g[j]))
  345. # for i1 in range1:
  346. # x1 = edge_x_g[i][i1]
  347. # y1 = edge_y_g[i][i1]
  348. # w1 = edge_w_g[i][i1]
  349. # for i2 in range2:
  350. # x2 = edge_x_g[j][i2]
  351. # y2 = edge_y_g[j][i2]
  352. # w2 = edge_w_g[j][i2]
  353. # ke = (w1 == w2)
  354. # if ke > 0:
  355. # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
  356. # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
  357. # Kmatrix += kn1 + kn2
  358. return kernel
  359. def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels,
  360. edge_kernels, itr_item):
  361. g1 = itr_item[0][0]
  362. g2 = itr_item[0][1]
  363. spl1 = itr_item[1][0]
  364. spl2 = itr_item[1][1]
  365. i = itr_item[2][0]
  366. j = itr_item[2][1]
  367. return i, j, structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs,
  368. node_label, edge_label, node_kernels, edge_kernels)
  369. def get_shortest_paths(G, weight, directed):
  370. """Get all shortest paths of a graph.
  371. Parameters
  372. ----------
  373. G : NetworkX graphs
  374. The graphs whose paths are calculated.
  375. weight : string/None
  376. edge attribute used as weight to calculate the shortest path.
  377. directed: boolean
  378. Whether graph is directed.
  379. Return
  380. ------
  381. sp : list of list
  382. List of shortest paths of the graph, where each path is represented by a list of nodes.
  383. """
  384. sp = []
  385. for n1, n2 in combinations(G.nodes(), 2):
  386. try:
  387. spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
  388. except nx.NetworkXNoPath: # nodes not connected
  389. # sp.append([])
  390. pass
  391. else:
  392. sp += spltemp
  393. # each edge walk is counted twice, starting from both its extreme nodes.
  394. if not directed:
  395. sp += [sptemp[::-1] for sptemp in spltemp]
  396. # add single nodes as length 0 paths.
  397. sp += [[n] for n in G.nodes()]
  398. return sp
  399. def wrapper_getSP(weight, directed, itr_item):
  400. g = itr_item[0]
  401. i = itr_item[1]
  402. return i, get_shortest_paths(g, weight, directed)

A Python package for graph kernels, graph edit distances and graph pre-image problem.