You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

spKernel.py 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307
  1. """
  2. @author: linlin
  3. @references: Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In Data
  4. Mining, Fifth IEEE International Conference on, 2005 Nov 27 (pp. 8-pp). IEEE.
  5. """
  6. import sys
  7. import time
  8. from itertools import combinations_with_replacement, product
  9. from functools import partial
  10. from multiprocessing import Pool
  11. from tqdm import tqdm
  12. import networkx as nx
  13. import numpy as np
  14. from pygraph.utils.utils import getSPGraph
  15. from pygraph.utils.graphdataset import get_dataset_attributes
  16. sys.path.insert(0, "../")
  17. def spkernel(*args,
  18. node_label='atom',
  19. edge_weight=None,
  20. node_kernels=None,
  21. n_jobs=None):
  22. """Calculate shortest-path kernels between graphs.
  23. Parameters
  24. ----------
  25. Gn : List of NetworkX graph
  26. List of graphs between which the kernels are calculated.
  27. /
  28. G1, G2 : NetworkX graphs
  29. 2 graphs between which the kernel is calculated.
  30. node_label : string
  31. node attribute used as label. The default node label is atom.
  32. edge_weight : string
  33. Edge attribute name corresponding to the edge weight.
  34. node_kernels: dict
  35. A dictionary of kernel functions for nodes, including 3 items: 'symb'
  36. for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
  37. for both labels. The first 2 functions take two node labels as
  38. parameters, and the 'mix' function takes 4 parameters, a symbolic and a
  39. non-symbolic label for each the two nodes. Each label is in form of 2-D
  40. dimension array (n_samples, n_features). Each function returns an
  41. number as the kernel value. Ignored when nodes are unlabeled.
  42. Return
  43. ------
  44. Kmatrix : Numpy matrix
  45. Kernel matrix, each element of which is the sp kernel between 2 praphs.
  46. """
  47. # pre-process
  48. Gn = args[0] if len(args) == 1 else [args[0], args[1]]
  49. weight = None
  50. if edge_weight is None:
  51. print('\n None edge weight specified. Set all weight to 1.\n')
  52. else:
  53. try:
  54. some_weight = list(
  55. nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
  56. if isinstance(some_weight, (float, int)):
  57. weight = edge_weight
  58. else:
  59. print(
  60. '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
  61. % edge_weight)
  62. except:
  63. print(
  64. '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
  65. % edge_weight)
  66. ds_attrs = get_dataset_attributes(
  67. Gn,
  68. attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
  69. node_label=node_label)
  70. # remove graphs with no edges, as no sp can be found in their structures,
  71. # so the kernel between such a graph and itself will be zero.
  72. len_gn = len(Gn)
  73. Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
  74. idx = [G[0] for G in Gn]
  75. Gn = [G[1] for G in Gn]
  76. if len(Gn) != len_gn:
  77. print('\n %d graphs are removed as they don\'t contain edges.\n' %
  78. (len_gn - len(Gn)))
  79. start_time = time.time()
  80. pool = Pool(n_jobs)
  81. # get shortest path graphs of Gn
  82. getsp_partial = partial(wrapper_getSPGraph, weight)
  83. itr = zip(Gn, range(0, len(Gn)))
  84. if len(Gn) < 1000 * n_jobs:
  85. # # use default chunksize as pool.map when iterable is less than 100
  86. # chunksize, extra = divmod(len(Gn), n_jobs * 4)
  87. # if extra:
  88. # chunksize += 1
  89. chunksize = int(len(Gn) / n_jobs) + 1
  90. else:
  91. chunksize = 1000
  92. for i, g in tqdm(
  93. pool.imap_unordered(getsp_partial, itr, chunksize),
  94. desc='getting sp graphs', file=sys.stdout):
  95. Gn[i] = g
  96. pool.close()
  97. pool.join()
  98. # # ---- direct running, normally use single CPU core. ----
  99. # for i in tqdm(range(len(Gn)), desc='getting sp graphs', file=sys.stdout):
  100. # i, Gn[i] = wrap_getSPGraph(Gn, weight, i)
  101. # # ---- use pool.map to parallel ----
  102. # result_sp = pool.map(getsp_partial, range(0, len(Gn)))
  103. # for i in result_sp:
  104. # Gn[i[0]] = i[1]
  105. # or
  106. # getsp_partial = partial(wrap_getSPGraph, Gn, weight)
  107. # for i, g in tqdm(
  108. # pool.map(getsp_partial, range(0, len(Gn))),
  109. # desc='getting sp graphs',
  110. # file=sys.stdout):
  111. # Gn[i] = g
  112. # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP)
  113. # sp_ml = [0] * len(Gn) # shortest path matrices
  114. # for i in result_sp:
  115. # sp_ml[i[0]] = i[1]
  116. # edge_x_g = [[] for i in range(len(sp_ml))]
  117. # edge_y_g = [[] for i in range(len(sp_ml))]
  118. # edge_w_g = [[] for i in range(len(sp_ml))]
  119. # for idx, item in enumerate(sp_ml):
  120. # for i1 in range(len(item)):
  121. # for i2 in range(i1 + 1, len(item)):
  122. # if item[i1, i2] != np.inf:
  123. # edge_x_g[idx].append(i1)
  124. # edge_y_g[idx].append(i2)
  125. # edge_w_g[idx].append(item[i1, i2])
  126. # print(len(edge_x_g[0]))
  127. # print(len(edge_y_g[0]))
  128. # print(len(edge_w_g[0]))
  129. Kmatrix = np.zeros((len(Gn), len(Gn)))
  130. # ---- use pool.imap_unordered to parallel and track progress. ----
  131. pool = Pool(n_jobs)
  132. do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
  133. itr = zip(combinations_with_replacement(Gn, 2),
  134. combinations_with_replacement(range(0, len(Gn)), 2))
  135. len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
  136. if len_itr < 1000 * n_jobs:
  137. chunksize = int(len_itr / n_jobs) + 1
  138. else:
  139. chunksize = 1000
  140. for i, j, kernel in tqdm(
  141. pool.imap_unordered(do_partial, itr, chunksize),
  142. desc='calculating kernels',
  143. file=sys.stdout):
  144. Kmatrix[i][j] = kernel
  145. Kmatrix[j][i] = kernel
  146. pool.close()
  147. pool.join()
  148. # # ---- use pool.map to parallel. ----
  149. # # result_perf = pool.map(do_partial, itr)
  150. # do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
  151. # itr = combinations_with_replacement(range(0, len(Gn)), 2)
  152. # for i, j, kernel in tqdm(
  153. # pool.map(do_partial, itr), desc='calculating kernels',
  154. # file=sys.stdout):
  155. # Kmatrix[i][j] = kernel
  156. # Kmatrix[j][i] = kernel
  157. # pool.close()
  158. # pool.join()
  159. # # ---- use joblib.Parallel to parallel and track progress. ----
  160. # result_perf = Parallel(
  161. # n_jobs=n_jobs, verbose=10)(
  162. # delayed(do_partial)(ij)
  163. # for ij in combinations_with_replacement(range(0, len(Gn)), 2))
  164. # result_perf = [
  165. # do_partial(ij)
  166. # for ij in combinations_with_replacement(range(0, len(Gn)), 2)
  167. # ]
  168. # for i in result_perf:
  169. # Kmatrix[i[0]][i[1]] = i[2]
  170. # Kmatrix[i[1]][i[0]] = i[2]
  171. # # ---- direct running, normally use single CPU core. ----
  172. # itr = combinations_with_replacement(range(0, len(Gn)), 2)
  173. # for gs in tqdm(itr, desc='calculating kernels', file=sys.stdout):
  174. # i, j, kernel = spkernel_do(Gn, ds_attrs, node_label, node_kernels, gs)
  175. # Kmatrix[i][j] = kernel
  176. # Kmatrix[j][i] = kernel
  177. run_time = time.time() - start_time
  178. print(
  179. "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
  180. % (len(Gn), run_time))
  181. return Kmatrix, run_time, idx
  182. def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels):
  183. kernel = 0
  184. # compute shortest path matrices first, method borrowed from FCSP.
  185. if ds_attrs['node_labeled']:
  186. # node symb and non-synb labeled
  187. if ds_attrs['node_attr_dim'] > 0:
  188. kn = node_kernels['mix']
  189. vk_dict = {} # shortest path matrices dict
  190. for n1, n2 in product(
  191. g1.nodes(data=True), g2.nodes(data=True)):
  192. vk_dict[(n1[0], n2[0])] = kn(
  193. n1[1][node_label], n2[1][node_label],
  194. n1[1]['attributes'], n2[1]['attributes'])
  195. # node symb labeled
  196. else:
  197. kn = node_kernels['symb']
  198. vk_dict = {} # shortest path matrices dict
  199. for n1 in g1.nodes(data=True):
  200. for n2 in g2.nodes(data=True):
  201. vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
  202. n2[1][node_label])
  203. else:
  204. # node non-synb labeled
  205. if ds_attrs['node_attr_dim'] > 0:
  206. kn = node_kernels['nsymb']
  207. vk_dict = {} # shortest path matrices dict
  208. for n1 in g1.nodes(data=True):
  209. for n2 in g2.nodes(data=True):
  210. vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
  211. n2[1]['attributes'])
  212. # node unlabeled
  213. else:
  214. for e1, e2 in product(
  215. g1.edges(data=True), g2.edges(data=True)):
  216. if e1[2]['cost'] == e2[2]['cost']:
  217. kernel += 1
  218. return kernel
  219. # compute graph kernels
  220. if ds_attrs['is_directed']:
  221. for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
  222. if e1[2]['cost'] == e2[2]['cost']:
  223. nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1],
  224. e2[1])]
  225. kn1 = nk11 * nk22
  226. kernel += kn1
  227. else:
  228. for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
  229. if e1[2]['cost'] == e2[2]['cost']:
  230. # each edge walk is counted twice, starting from both its extreme nodes.
  231. nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(
  232. e1[0], e2[1])], vk_dict[(e1[1],
  233. e2[0])], vk_dict[(e1[1],
  234. e2[1])]
  235. kn1 = nk11 * nk22
  236. kn2 = nk12 * nk21
  237. kernel += kn1 + kn2
  238. # # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2], sadly it is slower than the current implementation
  239. # # compute vertex kernels
  240. # try:
  241. # vk_mat = np.zeros((nx.number_of_nodes(g1),
  242. # nx.number_of_nodes(g2)))
  243. # g1nl = enumerate(g1.nodes(data=True))
  244. # g2nl = enumerate(g2.nodes(data=True))
  245. # for i1, n1 in g1nl:
  246. # for i2, n2 in g2nl:
  247. # vk_mat[i1][i2] = kn(
  248. # n1[1][node_label], n2[1][node_label],
  249. # [n1[1]['attributes']], [n2[1]['attributes']])
  250. # range1 = range(0, len(edge_w_g[i]))
  251. # range2 = range(0, len(edge_w_g[j]))
  252. # for i1 in range1:
  253. # x1 = edge_x_g[i][i1]
  254. # y1 = edge_y_g[i][i1]
  255. # w1 = edge_w_g[i][i1]
  256. # for i2 in range2:
  257. # x2 = edge_x_g[j][i2]
  258. # y2 = edge_y_g[j][i2]
  259. # w2 = edge_w_g[j][i2]
  260. # ke = (w1 == w2)
  261. # if ke > 0:
  262. # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
  263. # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
  264. # kernel += kn1 + kn2
  265. return kernel
  266. def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr_item):
  267. g1 = itr_item[0][0]
  268. g2 = itr_item[0][1]
  269. i = itr_item[1][0]
  270. j = itr_item[1][1]
  271. return i, j, spkernel_do(g1, g2, ds_attrs, node_label, node_kernels)
  272. def wrapper_getSPGraph(weight, itr_item):
  273. g = itr_item[0]
  274. i = itr_item[1]
  275. return i, getSPGraph(g, edge_weight=weight)
  276. # return i, nx.floyd_warshall_numpy(g, weight=weight)

A Python package for graph kernels, graph edit distances, and the graph pre-image problem.