You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

structuralspKernel.py 16 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Thu Sep 27 10:56:23 2018
  5. @author: linlin
  6. @references: Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For
  7. Measuring Similarity of Shapes. InESANN 2007 Apr 25 (pp. 355-360).
  8. """
  9. import sys
  10. import time
  11. from itertools import combinations, combinations_with_replacement, product
  12. from functools import partial
  13. from joblib import Parallel, delayed
  14. from multiprocessing import Pool
  15. from tqdm import tqdm
  16. import networkx as nx
  17. import numpy as np
  18. from pygraph.utils.graphdataset import get_dataset_attributes
  19. sys.path.insert(0, "../")
  20. def structuralspkernel(*args,
  21. node_label='atom',
  22. edge_weight=None,
  23. edge_label='bond_type',
  24. node_kernels=None,
  25. edge_kernels=None,
  26. n_jobs=None):
  27. """Calculate mean average structural shortest path kernels between graphs.
  28. Parameters
  29. ----------
  30. Gn : List of NetworkX graph
  31. List of graphs between which the kernels are calculated.
  32. /
  33. G1, G2 : NetworkX graphs
  34. 2 graphs between which the kernel is calculated.
  35. node_label : string
  36. node attribute used as label. The default node label is atom.
  37. edge_weight : string
  38. Edge attribute name corresponding to the edge weight.
  39. edge_label : string
  40. edge attribute used as label. The default edge label is bond_type.
  41. node_kernels: dict
  42. A dictionary of kernel functions for nodes, including 3 items: 'symb'
  43. for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
  44. for both labels. The first 2 functions take two node labels as
  45. parameters, and the 'mix' function takes 4 parameters, a symbolic and a
  46. non-symbolic label for each the two nodes. Each label is in form of 2-D
  47. dimension array (n_samples, n_features). Each function returns a number
  48. as the kernel value. Ignored when nodes are unlabeled.
  49. edge_kernels: dict
  50. A dictionary of kernel functions for edges, including 3 items: 'symb'
  51. for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix'
  52. for both labels. The first 2 functions take two edge labels as
  53. parameters, and the 'mix' function takes 4 parameters, a symbolic and a
  54. non-symbolic label for each the two edges. Each label is in form of 2-D
  55. dimension array (n_samples, n_features). Each function returns a number
  56. as the kernel value. Ignored when edges are unlabeled.
  57. Return
  58. ------
  59. Kmatrix : Numpy matrix
  60. Kernel matrix, each element of which is the mean average structural
  61. shortest path kernel between 2 praphs.
  62. """
  63. # pre-process
  64. Gn = args[0] if len(args) == 1 else [args[0], args[1]]
  65. weight = None
  66. if edge_weight is None:
  67. print('\n None edge weight specified. Set all weight to 1.\n')
  68. else:
  69. try:
  70. some_weight = list(
  71. nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
  72. if isinstance(some_weight, (float, int)):
  73. weight = edge_weight
  74. else:
  75. print(
  76. '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
  77. % edge_weight)
  78. except:
  79. print(
  80. '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
  81. % edge_weight)
  82. ds_attrs = get_dataset_attributes(
  83. Gn,
  84. attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
  85. 'edge_attr_dim', 'is_directed'],
  86. node_label=node_label, edge_label=edge_label)
  87. start_time = time.time()
  88. # get shortest paths of each graph in Gn
  89. splist = [[] for _ in range(len(Gn))]
  90. pool = Pool(n_jobs)
  91. # get shortest path graphs of Gn
  92. getsp_partial = partial(wrap_getSP, Gn, weight, ds_attrs['is_directed'])
  93. if len(Gn) < 1000 * n_jobs:
  94. chunksize = int(len(Gn) / n_jobs) + 1
  95. else:
  96. chunksize = 1000
  97. # chunksize = 300 # int(len(list(itr)) / n_jobs)
  98. for i, sp in tqdm(
  99. pool.imap_unordered(getsp_partial, range(0, len(Gn)), chunksize),
  100. desc='getting shortest paths',
  101. file=sys.stdout):
  102. splist[i] = sp
  103. pool.close()
  104. pool.join()
  105. # # ---- use pool.map to parallel ----
  106. # result_sp = pool.map(getsp_partial, range(0, len(Gn)))
  107. # for i in result_sp:
  108. # Gn[i[0]] = i[1]
  109. # or
  110. # getsp_partial = partial(wrap_getSP, Gn, weight)
  111. # for i, g in tqdm(
  112. # pool.map(getsp_partial, range(0, len(Gn))),
  113. # desc='getting sp graphs',
  114. # file=sys.stdout):
  115. # Gn[i] = g
  116. # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP)
  117. # sp_ml = [0] * len(Gn) # shortest path matrices
  118. # for i in result_sp:
  119. # sp_ml[i[0]] = i[1]
  120. # edge_x_g = [[] for i in range(len(sp_ml))]
  121. # edge_y_g = [[] for i in range(len(sp_ml))]
  122. # edge_w_g = [[] for i in range(len(sp_ml))]
  123. # for idx, item in enumerate(sp_ml):
  124. # for i1 in range(len(item)):
  125. # for i2 in range(i1 + 1, len(item)):
  126. # if item[i1, i2] != np.inf:
  127. # edge_x_g[idx].append(i1)
  128. # edge_y_g[idx].append(i2)
  129. # edge_w_g[idx].append(item[i1, i2])
  130. # print(len(edge_x_g[0]))
  131. # print(len(edge_y_g[0]))
  132. # print(len(edge_w_g[0]))
  133. Kmatrix = np.zeros((len(Gn), len(Gn)))
  134. # ---- use pool.imap_unordered to parallel and track progress. ----
  135. pool = Pool(n_jobs)
  136. do_partial = partial(structuralspkernel_do, Gn, splist, ds_attrs,
  137. node_label, edge_label, node_kernels, edge_kernels)
  138. itr = combinations_with_replacement(range(0, len(Gn)), 2)
  139. len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
  140. if len_itr < 1000 * n_jobs:
  141. chunksize = int(len_itr / n_jobs) + 1
  142. else:
  143. chunksize = 1000
  144. for i, j, kernel in tqdm(
  145. pool.imap_unordered(do_partial, itr, chunksize),
  146. desc='calculating kernels',
  147. file=sys.stdout):
  148. Kmatrix[i][j] = kernel
  149. Kmatrix[j][i] = kernel
  150. pool.close()
  151. pool.join()
  152. # # ---- use pool.map to parallel. ----
  153. # # result_perf = pool.map(do_partial, itr)
  154. # do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
  155. # itr = combinations_with_replacement(range(0, len(Gn)), 2)
  156. # for i, j, kernel in tqdm(
  157. # pool.map(do_partial, itr), desc='calculating kernels',
  158. # file=sys.stdout):
  159. # Kmatrix[i][j] = kernel
  160. # Kmatrix[j][i] = kernel
  161. # pool.close()
  162. # pool.join()
  163. # # ---- use joblib.Parallel to parallel and track progress. ----
  164. # result_perf = Parallel(
  165. # n_jobs=n_jobs, verbose=10)(
  166. # delayed(do_partial)(ij)
  167. # for ij in combinations_with_replacement(range(0, len(Gn)), 2))
  168. # result_perf = [
  169. # do_partial(ij)
  170. # for ij in combinations_with_replacement(range(0, len(Gn)), 2)
  171. # ]
  172. # for i in result_perf:
  173. # Kmatrix[i[0]][i[1]] = i[2]
  174. # Kmatrix[i[1]][i[0]] = i[2]
  175. # # ---- direct running, normally use single CPU core. ----
  176. # itr = combinations_with_replacement(range(0, len(Gn)), 2)
  177. # for gs in tqdm(itr, desc='calculating kernels', file=sys.stdout):
  178. # if gs[0] == 24 and gs[1] == 411:
  179. # i, j, kernel = structuralspkernel_do(Gn, splist, ds_attrs,
  180. # node_label, edge_label, node_kernels, edge_kernels, gs)
  181. # if(kernel > 1):
  182. # print("error here ")
  183. # Kmatrix[i][j] = kernel
  184. # Kmatrix[j][i] = kernel
  185. run_time = time.time() - start_time
  186. print(
  187. "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
  188. % (len(Gn), run_time))
  189. return Kmatrix, run_time
  190. def structuralspkernel_do(Gn, splist, ds_attrs, node_label, edge_label,
  191. node_kernels, edge_kernels, ij):
  192. iglobal = ij[0]
  193. jglobal = ij[1]
  194. g1 = Gn[iglobal]
  195. g2 = Gn[jglobal]
  196. spl1 = splist[iglobal]
  197. spl2 = splist[jglobal]
  198. kernel = 0
  199. #try:
  200. # First, compute shortest path matrices, method borrowed from FCSP.
  201. if ds_attrs['node_labeled']:
  202. # node symb and non-synb labeled
  203. if ds_attrs['node_attr_dim'] > 0:
  204. kn = node_kernels['mix']
  205. vk_dict = {} # shortest path matrices dict
  206. for n1, n2 in product(
  207. g1.nodes(data=True), g2.nodes(data=True)):
  208. vk_dict[(n1[0], n2[0])] = kn(
  209. n1[1][node_label], n2[1][node_label],
  210. [n1[1]['attributes']], [n2[1]['attributes']])
  211. # node symb labeled
  212. else:
  213. kn = node_kernels['symb']
  214. vk_dict = {} # shortest path matrices dict
  215. for n1 in g1.nodes(data=True):
  216. for n2 in g2.nodes(data=True):
  217. vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
  218. n2[1][node_label])
  219. else:
  220. # node non-synb labeled
  221. if ds_attrs['node_attr_dim'] > 0:
  222. kn = node_kernels['nsymb']
  223. vk_dict = {} # shortest path matrices dict
  224. for n1 in g1.nodes(data=True):
  225. for n2 in g2.nodes(data=True):
  226. vk_dict[(n1[0], n2[0])] = kn([n1[1]['attributes']],
  227. [n2[1]['attributes']])
  228. # node unlabeled
  229. else:
  230. vk_dict = {}
  231. # Then, compute kernels between all pairs of edges, which idea is an
  232. # extension of FCSP. It suits sparse graphs, which is the most case we
  233. # went though. For dense graphs, it would be slow.
  234. if ds_attrs['edge_labeled']:
  235. # edge symb and non-synb labeled
  236. if ds_attrs['edge_attr_dim'] > 0:
  237. ke = edge_kernels['mix']
  238. ek_dict = {} # dict of edge kernels
  239. for e1, e2 in product(
  240. g1.edges(data=True), g2.edges(data=True)):
  241. ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
  242. [e1[2]['attributes']], [e2[2]['attributes']])
  243. ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
  244. ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
  245. ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
  246. ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
  247. # edge symb labeled
  248. else:
  249. ke = edge_kernels['symb']
  250. ek_dict = {}
  251. for e1 in g1.edges(data=True):
  252. for e2 in g2.edges(data=True):
  253. ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
  254. ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
  255. ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
  256. ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
  257. ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
  258. else:
  259. # edge non-synb labeled
  260. if ds_attrs['edge_attr_dim'] > 0:
  261. ke = edge_kernels['nsymb']
  262. ek_dict = {}
  263. for e1 in g1.edges(data=True):
  264. for e2 in g2.edges(data=True):
  265. ek_temp = kn([e1[2]['attributes']], [e2[2]['attributes']])
  266. ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
  267. ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
  268. ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
  269. ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
  270. # edge unlabeled
  271. else:
  272. ek_dict = {}
  273. # compute graph kernels
  274. if vk_dict:
  275. if ek_dict:
  276. for p1, p2 in product(spl1, spl2):
  277. if len(p1) == len(p2):
  278. kpath = vk_dict[(p1[0], p2[0])]
  279. if kpath:
  280. for idx in range(1, len(p1)):
  281. kpath *= vk_dict[(p1[idx], p2[idx])] * \
  282. ek_dict[((p1[idx-1], p1[idx]),
  283. (p2[idx-1], p2[idx]))]
  284. if not kpath:
  285. break
  286. kernel += kpath # add up kernels of all paths
  287. else:
  288. for p1, p2 in product(spl1, spl2):
  289. if len(p1) == len(p2):
  290. kpath = vk_dict[(p1[0], p2[0])]
  291. if kpath:
  292. for idx in range(1, len(p1)):
  293. kpath *= vk_dict[(p1[idx], p2[idx])]
  294. if not kpath:
  295. break
  296. kernel += kpath # add up kernels of all paths
  297. else:
  298. if ek_dict:
  299. for p1, p2 in product(spl1, spl2):
  300. if len(p1) == len(p2):
  301. if len(p1) == 0:
  302. kernel += 1
  303. else:
  304. kpath = 1
  305. for idx in range(0, len(p1) - 1):
  306. kpath *= ek_dict[((p1[idx], p1[idx+1]),
  307. (p2[idx], p2[idx+1]))]
  308. if not kpath:
  309. break
  310. kernel += kpath # add up kernels of all paths
  311. else:
  312. for p1, p2 in product(spl1, spl2):
  313. if len(p1) == len(p2):
  314. kernel += 1
  315. kernel = kernel / (len(spl1) * len(spl2)) # calculate mean average
  316. # # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2], sadly it is slower than the current implementation
  317. # # compute vertex kernel matrix
  318. # try:
  319. # vk_mat = np.zeros((nx.number_of_nodes(g1),
  320. # nx.number_of_nodes(g2)))
  321. # g1nl = enumerate(g1.nodes(data=True))
  322. # g2nl = enumerate(g2.nodes(data=True))
  323. # for i1, n1 in g1nl:
  324. # for i2, n2 in g2nl:
  325. # vk_mat[i1][i2] = kn(
  326. # n1[1][node_label], n2[1][node_label],
  327. # [n1[1]['attributes']], [n2[1]['attributes']])
  328. # range1 = range(0, len(edge_w_g[i]))
  329. # range2 = range(0, len(edge_w_g[j]))
  330. # for i1 in range1:
  331. # x1 = edge_x_g[i][i1]
  332. # y1 = edge_y_g[i][i1]
  333. # w1 = edge_w_g[i][i1]
  334. # for i2 in range2:
  335. # x2 = edge_x_g[j][i2]
  336. # y2 = edge_y_g[j][i2]
  337. # w2 = edge_w_g[j][i2]
  338. # ke = (w1 == w2)
  339. # if ke > 0:
  340. # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
  341. # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
  342. # Kmatrix += kn1 + kn2
  343. #except KeyError: # missing labels or attributes
  344. # print("toto")
  345. # pass
  346. if(kernel > 1):
  347. print("kernel error : ", ij)
  348. return iglobal, jglobal, kernel
  349. def get_shortest_paths(G, weight, directed):
  350. """Get all shortest paths of a graph.
  351. Parameters
  352. ----------
  353. G : NetworkX graphs
  354. The graphs whose paths are calculated.
  355. weight : string/None
  356. edge attribute used as weight to calculate the shortest path.
  357. directed: boolean
  358. Whether graph is directed.
  359. Return
  360. ------
  361. sp : list of list
  362. List of shortest paths of the graph, where each path is represented by a list of nodes.
  363. """
  364. sp = []
  365. for n1, n2 in combinations(G.nodes(), 2):
  366. try:
  367. spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
  368. sp += spltemp
  369. # each edge walk is counted twice, starting from both its extreme nodes.
  370. if not directed:
  371. sp += [sptemp[::-1] for sptemp in spltemp]
  372. except nx.NetworkXNoPath: # nodes not connected
  373. # sp.append([])
  374. pass
  375. # add single nodes as length 0 paths.
  376. sp += [[n] for n in G.nodes()]
  377. return sp
  378. def wrap_getSP(Gn, weight, directed, i):
  379. return i, get_shortest_paths(Gn[i], weight, directed)

A Python package for graph kernels, graph edit distances and graph pre-image problem.