You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

spKernel.py 21 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462
  1. """
  2. @author: linlin
  3. @references: Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. InData Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE.
  4. """
import sys
import time
import pathlib
from functools import partial
from itertools import combinations, combinations_with_replacement, product
from multiprocessing import Pool, cpu_count

from joblib import Parallel, delayed
from tqdm import tqdm
import networkx as nx
import numpy as np

sys.path.insert(0, "../")
from pygraph.utils.utils import getSPGraph
from pygraph.utils.graphdataset import get_dataset_attributes
  18. def spkernel(*args,
  19. node_label='atom',
  20. edge_weight=None,
  21. node_kernels=None,
  22. n_jobs=None):
  23. """Calculate shortest-path kernels between graphs.
  24. Parameters
  25. ----------
  26. Gn : List of NetworkX graph
  27. List of graphs between which the kernels are calculated.
  28. /
  29. G1, G2 : NetworkX graphs
  30. 2 graphs between which the kernel is calculated.
  31. edge_weight : string
  32. Edge attribute name corresponding to the edge weight.
  33. node_kernels: dict
  34. A dictionary of kernel functions for nodes, including 3 items: 'symb' for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' for both labels. The first 2 functions take two node labels as parameters, and the 'mix' function takes 4 parameters, a symbolic and a non-symbolic label for each the two nodes. Each label is in form of 2-D dimension array (n_samples, n_features). Each function returns an number as the kernel value. Ignored when nodes are unlabeled.
  35. Return
  36. ------
  37. Kmatrix : Numpy matrix
  38. Kernel matrix, each element of which is the sp kernel between 2 praphs.
  39. """
  40. # pre-process
  41. Gn = args[0] if len(args) == 1 else [args[0], args[1]]
  42. weight = None
  43. if edge_weight == None:
  44. print('\n None edge weight specified. Set all weight to 1.\n')
  45. else:
  46. try:
  47. some_weight = list(
  48. nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
  49. if isinstance(some_weight, float) or isinstance(some_weight, int):
  50. weight = edge_weight
  51. else:
  52. print(
  53. '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
  54. % edge_weight)
  55. except:
  56. print(
  57. '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
  58. % edge_weight)
  59. ds_attrs = get_dataset_attributes(
  60. Gn,
  61. attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
  62. node_label=node_label)
  63. # remove graphs with no edges, as no sp can be found in their structures, so the kernel between such a graph and itself will be zero.
  64. len_gn = len(Gn)
  65. Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
  66. idx = [G[0] for G in Gn]
  67. Gn = [G[1] for G in Gn]
  68. if len(Gn) != len_gn:
  69. print('\n %d graphs are removed as they don\'t contain edges.\n' %
  70. (len_gn - len(Gn)))
  71. start_time = time.time()
  72. pool = Pool(n_jobs)
  73. # get shortest path graphs of Gn
  74. getsp_partial = partial(wrap_getSPGraph, Gn, edge_weight)
  75. if len(Gn) < 100:
  76. # use default chunksize as pool.map when iterable is less than 100
  77. chunksize, extra = divmod(len(Gn), n_jobs * 4)
  78. if extra:
  79. chunksize += 1
  80. else:
  81. chunksize = 100
  82. # chunksize = 300 # int(len(list(itr)) / n_jobs)
  83. for i, g in tqdm(
  84. pool.imap_unordered(getsp_partial, range(0, len(Gn)), chunksize),
  85. desc='getting sp graphs',
  86. file=sys.stdout):
  87. Gn[i] = g
  88. # # ---- use pool.map to parallel ----
  89. # result_sp = pool.map(getsp_partial, range(0, len(Gn)))
  90. # for i in result_sp:
  91. # Gn[i[0]] = i[1]
  92. # or
  93. # getsp_partial = partial(wrap_getSPGraph, Gn, edge_weight)
  94. # for i, g in tqdm(
  95. # pool.map(getsp_partial, range(0, len(Gn))),
  96. # desc='getting sp graphs',
  97. # file=sys.stdout):
  98. # Gn[i] = g
  99. # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP)
  100. # sp_ml = [0] * len(Gn) # shortest path matrices
  101. # for i in result_sp:
  102. # sp_ml[i[0]] = i[1]
  103. # edge_x_g = [[] for i in range(len(sp_ml))]
  104. # edge_y_g = [[] for i in range(len(sp_ml))]
  105. # edge_w_g = [[] for i in range(len(sp_ml))]
  106. # for idx, item in enumerate(sp_ml):
  107. # for i1 in range(len(item)):
  108. # for i2 in range(i1 + 1, len(item)):
  109. # if item[i1, i2] != np.inf:
  110. # edge_x_g[idx].append(i1)
  111. # edge_y_g[idx].append(i2)
  112. # edge_w_g[idx].append(item[i1, i2])
  113. # print(len(edge_x_g[0]))
  114. # print(len(edge_y_g[0]))
  115. # print(len(edge_w_g[0]))
  116. Kmatrix = np.zeros((len(Gn), len(Gn)))
  117. # ---- use pool.imap_unordered to parallel and track progress. ----
  118. do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
  119. itr = combinations_with_replacement(range(0, len(Gn)), 2)
  120. len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
  121. if len_itr < 100:
  122. chunksize, extra = divmod(len_itr, n_jobs * 4)
  123. if extra:
  124. chunksize += 1
  125. else:
  126. chunksize = 100
  127. for i, j, kernel in tqdm(
  128. pool.imap_unordered(do_partial, itr, chunksize),
  129. desc='calculating kernels',
  130. file=sys.stdout):
  131. Kmatrix[i][j] = kernel
  132. Kmatrix[j][i] = kernel
  133. pool.close()
  134. pool.join()
  135. # # ---- use pool.map to parallel. ----
  136. # # result_perf = pool.map(do_partial, itr)
  137. # do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
  138. # itr = combinations_with_replacement(range(0, len(Gn)), 2)
  139. # for i, j, kernel in tqdm(
  140. # pool.map(do_partial, itr), desc='calculating kernels',
  141. # file=sys.stdout):
  142. # Kmatrix[i][j] = kernel
  143. # Kmatrix[j][i] = kernel
  144. # pool.close()
  145. # pool.join()
  146. # # ---- use joblib.Parallel to parallel and track progress. ----
  147. # result_perf = Parallel(
  148. # n_jobs=n_jobs, verbose=10)(
  149. # delayed(do_partial)(ij)
  150. # for ij in combinations_with_replacement(range(0, len(Gn)), 2))
  151. # result_perf = [
  152. # do_partial(ij)
  153. # for ij in combinations_with_replacement(range(0, len(Gn)), 2)
  154. # ]
  155. # for i in result_perf:
  156. # Kmatrix[i[0]][i[1]] = i[2]
  157. # Kmatrix[i[1]][i[0]] = i[2]
  158. # # ---- direct running, normally use single CPU core. ----
  159. # itr = combinations_with_replacement(range(0, len(Gn)), 2)
  160. # for gs in tqdm(itr, desc='calculating kernels', file=sys.stdout):
  161. # i, j, kernel = spkernel_do(Gn, ds_attrs, node_label, node_kernels, gs)
  162. # Kmatrix[i][j] = kernel
  163. # Kmatrix[j][i] = kernel
  164. run_time = time.time() - start_time
  165. print(
  166. "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
  167. % (len(Gn), run_time))
  168. return Kmatrix, run_time, idx
  169. def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij):
  170. i = ij[0]
  171. j = ij[1]
  172. g1 = Gn[i]
  173. g2 = Gn[j]
  174. Kmatrix = 0
  175. try:
  176. # compute shortest path matrices first, method borrowed from FCSP.
  177. if ds_attrs['node_labeled']:
  178. # node symb and non-synb labeled
  179. if ds_attrs['node_attr_dim'] > 0:
  180. kn = node_kernels['mix']
  181. vk_dict = {} # shortest path matrices dict
  182. for n1, n2 in product(
  183. g1.nodes(data=True), g2.nodes(data=True)):
  184. vk_dict[(n1[0], n2[0])] = kn(
  185. n1[1][node_label], n2[1][node_label],
  186. [n1[1]['attributes']], [n2[1]['attributes']])
  187. # node symb labeled
  188. else:
  189. kn = node_kernels['symb']
  190. vk_dict = {} # shortest path matrices dict
  191. for n1 in g1.nodes(data=True):
  192. for n2 in g2.nodes(data=True):
  193. vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
  194. n2[1][node_label])
  195. else:
  196. # node non-synb labeled
  197. if ds_attrs['node_attr_dim'] > 0:
  198. kn = node_kernels['nsymb']
  199. vk_dict = {} # shortest path matrices dict
  200. for n1 in g1.nodes(data=True):
  201. for n2 in g2.nodes(data=True):
  202. vk_dict[(n1[0], n2[0])] = kn([n1[1]['attributes']],
  203. [n2[1]['attributes']])
  204. # node unlabeled
  205. else:
  206. for e1, e2 in product(
  207. Gn[i].edges(data=True), Gn[j].edges(data=True)):
  208. if e1[2]['cost'] == e2[2]['cost']:
  209. Kmatrix += 1
  210. return i, j, Kmatrix
  211. # compute graph kernels
  212. if ds_attrs['is_directed']:
  213. for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
  214. if e1[2]['cost'] == e2[2]['cost']:
  215. # each edge walk is counted twice, starting from both its extreme nodes.
  216. nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1],
  217. e2[1])]
  218. kn1 = nk11 * nk22
  219. Kmatrix += kn1 + kn2
  220. else:
  221. for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
  222. if e1[2]['cost'] == e2[2]['cost']:
  223. # each edge walk is counted twice, starting from both its extreme nodes.
  224. nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(
  225. e1[0], e2[1])], vk_dict[(e1[1],
  226. e2[0])], vk_dict[(e1[1],
  227. e2[1])]
  228. kn1 = nk11 * nk22
  229. kn2 = nk12 * nk21
  230. Kmatrix += kn1 + kn2
  231. # # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2], sadly it is slower than the current implementation
  232. # # compute vertex kernel matrix
  233. # try:
  234. # vk_mat = np.zeros((nx.number_of_nodes(g1),
  235. # nx.number_of_nodes(g2)))
  236. # g1nl = enumerate(g1.nodes(data=True))
  237. # g2nl = enumerate(g2.nodes(data=True))
  238. # for i1, n1 in g1nl:
  239. # for i2, n2 in g2nl:
  240. # vk_mat[i1][i2] = kn(
  241. # n1[1][node_label], n2[1][node_label],
  242. # [n1[1]['attributes']], [n2[1]['attributes']])
  243. # range1 = range(0, len(edge_w_g[i]))
  244. # range2 = range(0, len(edge_w_g[j]))
  245. # for i1 in range1:
  246. # x1 = edge_x_g[i][i1]
  247. # y1 = edge_y_g[i][i1]
  248. # w1 = edge_w_g[i][i1]
  249. # for i2 in range2:
  250. # x2 = edge_x_g[j][i2]
  251. # y2 = edge_y_g[j][i2]
  252. # w2 = edge_w_g[j][i2]
  253. # ke = (w1 == w2)
  254. # if ke > 0:
  255. # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
  256. # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
  257. # Kmatrix += kn1 + kn2
  258. except KeyError: # missing labels or attributes
  259. pass
  260. return i, j, Kmatrix
  261. def wrap_getSPGraph(Gn, weight, i):
  262. return i, getSPGraph(Gn[i], edge_weight=weight)
  263. # return i, nx.floyd_warshall_numpy(Gn[i], weight=weight)
  264. # def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij):
  265. # i = ij[0]
  266. # j = ij[1]
  267. # g1 = Gn[i]
  268. # g2 = Gn[j]
  269. # Kmatrix = 0
  270. # if ds_attrs['node_labeled']:
  271. # # node symb and non-synb labeled
  272. # if ds_attrs['node_attr_dim'] > 0:
  273. # if ds_attrs['is_directed']:
  274. # for e1, e2 in product(
  275. # Gn[i].edges(data=True), Gn[j].edges(data=True)):
  276. # if e1[2]['cost'] == e2[2]['cost']:
  277. # kn = node_kernels['mix']
  278. # try:
  279. # n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
  280. # i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
  281. # j].nodes[e2[1]]
  282. # kn1 = kn(
  283. # n11[node_label], n21[node_label],
  284. # [n11['attributes']], [n21['attributes']]) * kn(
  285. # n12[node_label], n22[node_label],
  286. # [n12['attributes']], [n22['attributes']])
  287. # Kmatrix += kn1
  288. # except KeyError: # missing labels or attributes
  289. # pass
  290. # else:
  291. # kn = node_kernels['mix']
  292. # try:
  293. # # compute shortest path matrices first, method borrowed from FCSP.
  294. # vk_dict = {} # shortest path matrices dict
  295. # for n1 in g1.nodes(data=True):
  296. # for n2 in g2.nodes(data=True):
  297. # vk_dict[(n1[0], n2[0])] = kn(
  298. # n1[1][node_label], n2[1][node_label],
  299. # [n1[1]['attributes']], [n2[1]['attributes']])
  300. # for e1, e2 in product(
  301. # g1.edges(data=True), g2.edges(data=True)):
  302. # if e1[2]['cost'] == e2[2]['cost']:
  303. # # each edge walk is counted twice, starting from both its extreme nodes.
  304. # nk11, nk12, nk21, nk22 = vk_dict[(
  305. # e1[0],
  306. # e2[0])], vk_dict[(e1[0], e2[1])], vk_dict[(
  307. # e1[1], e2[0])], vk_dict[(e1[1], e2[1])]
  308. # kn1 = nk11 * nk22
  309. # kn2 = nk12 * nk21
  310. # Kmatrix += kn1 + kn2
  311. # # # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2], sadly it is slower than the current implementation
  312. # # # compute vertex kernel matrix
  313. # # try:
  314. # # vk_mat = np.zeros((nx.number_of_nodes(g1),
  315. # # nx.number_of_nodes(g2)))
  316. # # g1nl = enumerate(g1.nodes(data=True))
  317. # # g2nl = enumerate(g2.nodes(data=True))
  318. # # for i1, n1 in g1nl:
  319. # # for i2, n2 in g2nl:
  320. # # vk_mat[i1][i2] = kn(
  321. # # n1[1][node_label], n2[1][node_label],
  322. # # [n1[1]['attributes']], [n2[1]['attributes']])
  323. # # range1 = range(0, len(edge_w_g[i]))
  324. # # range2 = range(0, len(edge_w_g[j]))
  325. # # for i1 in range1:
  326. # # x1 = edge_x_g[i][i1]
  327. # # y1 = edge_y_g[i][i1]
  328. # # w1 = edge_w_g[i][i1]
  329. # # for i2 in range2:
  330. # # x2 = edge_x_g[j][i2]
  331. # # y2 = edge_y_g[j][i2]
  332. # # w2 = edge_w_g[j][i2]
  333. # # ke = (w1 == w2)
  334. # # if ke > 0:
  335. # # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
  336. # # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
  337. # # Kmatrix += kn1 + kn2
  338. # except KeyError: # missing labels or attributes
  339. # pass
  340. # # node symb labeled
  341. # else:
  342. # if ds_attrs['is_directed']:
  343. # for e1, e2 in product(
  344. # Gn[i].edges(data=True), Gn[j].edges(data=True)):
  345. # if e1[2]['cost'] == e2[2]['cost']:
  346. # kn = node_kernels['symb']
  347. # try:
  348. # n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
  349. # i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
  350. # j].nodes[e2[1]]
  351. # kn1 = kn(n11[node_label], n21[node_label]) * kn(
  352. # n12[node_label], n22[node_label])
  353. # Kmatrix += kn1
  354. # except KeyError: # missing labels
  355. # pass
  356. # else:
  357. # kn = node_kernels['symb']
  358. # try:
  359. # # compute shortest path matrices first, method borrowed from FCSP.
  360. # vk_dict = {} # shortest path matrices dict
  361. # for n1 in g1.nodes(data=True):
  362. # for n2 in g2.nodes(data=True):
  363. # vk_dict[(n1[0], n2[0])] = kn(
  364. # n1[1][node_label], n2[1][node_label])
  365. # for e1, e2 in product(
  366. # g1.edges(data=True), g2.edges(data=True)):
  367. # if e1[2]['cost'] == e2[2]['cost']:
  368. # # each edge walk is counted twice, starting from both its extreme nodes.
  369. # nk11, nk12, nk21, nk22 = vk_dict[(
  370. # e1[0],
  371. # e2[0])], vk_dict[(e1[0], e2[1])], vk_dict[(
  372. # e1[1], e2[0])], vk_dict[(e1[1], e2[1])]
  373. # kn1 = nk11 * nk22
  374. # kn2 = nk12 * nk21
  375. # Kmatrix += kn1 + kn2
  376. # except KeyError: # missing labels
  377. # pass
  378. # else:
  379. # # node non-synb labeled
  380. # if ds_attrs['node_attr_dim'] > 0:
  381. # if ds_attrs['is_directed']:
  382. # for e1, e2 in product(
  383. # Gn[i].edges(data=True), Gn[j].edges(data=True)):
  384. # if e1[2]['cost'] == e2[2]['cost']:
  385. # kn = node_kernels['nsymb']
  386. # try:
  387. # # each edge walk is counted twice, starting from both its extreme nodes.
  388. # n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
  389. # i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
  390. # j].nodes[e2[1]]
  391. # kn1 = kn(
  392. # [n11['attributes']], [n21['attributes']]) * kn(
  393. # [n12['attributes']], [n22['attributes']])
  394. # Kmatrix += kn1
  395. # except KeyError: # missing attributes
  396. # pass
  397. # else:
  398. # for e1, e2 in product(
  399. # Gn[i].edges(data=True), Gn[j].edges(data=True)):
  400. # if e1[2]['cost'] == e2[2]['cost']:
  401. # kn = node_kernels['nsymb']
  402. # try:
  403. # # each edge walk is counted twice, starting from both its extreme nodes.
  404. # n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
  405. # i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
  406. # j].nodes[e2[1]]
  407. # kn1 = kn(
  408. # [n11['attributes']], [n21['attributes']]) * kn(
  409. # [n12['attributes']], [n22['attributes']])
  410. # kn2 = kn(
  411. # [n11['attributes']], [n22['attributes']]) * kn(
  412. # [n12['attributes']], [n21['attributes']])
  413. # Kmatrix += kn1 + kn2
  414. # except KeyError: # missing attributes
  415. # pass
  416. # # node unlabeled
  417. # else:
  418. # for e1, e2 in product(
  419. # Gn[i].edges(data=True), Gn[j].edges(data=True)):
  420. # if e1[2]['cost'] == e2[2]['cost']:
  421. # Kmatrix += 1
  422. # return i, j, Kmatrix

A Python package for graph kernels, graph edit distances and the graph pre-image problem.