You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

ssp_sym.py 18 kB

5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Sun Dec 23 16:42:48 2018
  5. @author: ljia
  6. """
  7. import sys
  8. import time
  9. from itertools import combinations, product
  10. from functools import partial
  11. from multiprocessing import Pool
  12. from tqdm import tqdm
  13. import networkx as nx
  14. import numpy as np
  15. from gklearn.utils.graphdataset import get_dataset_attributes
  16. from gklearn.utils.parallel import parallel_gm
  17. sys.path.insert(0, "../")
  18. def structuralspkernel(*args,
  19. node_label='atom',
  20. edge_weight=None,
  21. edge_label='bond_type',
  22. node_kernels=None,
  23. edge_kernels=None,
  24. n_jobs=None):
  25. """Calculate mean average structural shortest path kernels between graphs.
  26. Parameters
  27. ----------
  28. Gn : List of NetworkX graph
  29. List of graphs between which the kernels are calculated.
  30. /
  31. G1, G2 : NetworkX graphs
  32. 2 graphs between which the kernel is calculated.
  33. node_label : string
  34. node attribute used as label. The default node label is atom.
  35. edge_weight : string
  36. Edge attribute name corresponding to the edge weight.
  37. edge_label : string
  38. edge attribute used as label. The default edge label is bond_type.
  39. node_kernels: dict
  40. A dictionary of kernel functions for nodes, including 3 items: 'symb'
  41. for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
  42. for both labels. The first 2 functions take two node labels as
  43. parameters, and the 'mix' function takes 4 parameters, a symbolic and a
  44. non-symbolic label for each the two nodes. Each label is in form of 2-D
  45. dimension array (n_samples, n_features). Each function returns a number
  46. as the kernel value. Ignored when nodes are unlabeled.
  47. edge_kernels: dict
  48. A dictionary of kernel functions for edges, including 3 items: 'symb'
  49. for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix'
  50. for both labels. The first 2 functions take two edge labels as
  51. parameters, and the 'mix' function takes 4 parameters, a symbolic and a
  52. non-symbolic label for each the two edges. Each label is in form of 2-D
  53. dimension array (n_samples, n_features). Each function returns a number
  54. as the kernel value. Ignored when edges are unlabeled.
  55. Return
  56. ------
  57. Kmatrix : Numpy matrix
  58. Kernel matrix, each element of which is the mean average structural
  59. shortest path kernel between 2 praphs.
  60. """
  61. # pre-process
  62. Gn = args[0] if len(args) == 1 else [args[0], args[1]]
  63. weight = None
  64. if edge_weight is None:
  65. print('\n None edge weight specified. Set all weight to 1.\n')
  66. else:
  67. try:
  68. some_weight = list(
  69. nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
  70. if isinstance(some_weight, (float, int)):
  71. weight = edge_weight
  72. else:
  73. print(
  74. '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
  75. % edge_weight)
  76. except:
  77. print(
  78. '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
  79. % edge_weight)
  80. ds_attrs = get_dataset_attributes(
  81. Gn,
  82. attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
  83. 'edge_attr_dim', 'is_directed'],
  84. node_label=node_label, edge_label=edge_label)
  85. ds_attrs['node_attr_dim'] = 0
  86. ds_attrs['edge_attr_dim'] = 0
  87. start_time = time.time()
  88. # get shortest paths of each graph in Gn
  89. splist = [None] * len(Gn)
  90. pool = Pool(n_jobs)
  91. # get shortest path graphs of Gn
  92. getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed'])
  93. itr = zip(Gn, range(0, len(Gn)))
  94. if len(Gn) < 100 * n_jobs:
  95. chunksize = int(len(Gn) / n_jobs) + 1
  96. else:
  97. chunksize = 100
  98. # chunksize = 300 # int(len(list(itr)) / n_jobs)
  99. for i, sp in tqdm(
  100. pool.imap_unordered(getsp_partial, itr, chunksize),
  101. desc='getting shortest paths',
  102. file=sys.stdout):
  103. splist[i] = sp
  104. # time.sleep(10)
  105. pool.close()
  106. pool.join()
  107. # # get shortest paths of each graph in Gn
  108. # splist = [[] for _ in range(len(Gn))]
  109. # # get shortest path graphs of Gn
  110. # getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed'])
  111. # itr = zip(Gn, range(0, len(Gn)))
  112. # if len(Gn) < 1000 * n_jobs:
  113. # chunksize = int(len(Gn) / n_jobs) + 1
  114. # else:
  115. # chunksize = 1000
  116. # # chunksize = 300 # int(len(list(itr)) / n_jobs)
  117. # from contextlib import closing
  118. # with closing(Pool(n_jobs)) as pool:
  119. ## for i, sp in tqdm(
  120. # res = pool.imap_unordered(getsp_partial, itr, 10)
  121. ## desc='getting shortest paths',
  122. ## file=sys.stdout):
  123. ## splist[i] = sp
  124. ## time.sleep(10)
  125. # pool.close()
  126. # pool.join()
  127. # ss = 0
  128. # ss += sys.getsizeof(splist)
  129. # for spss in splist:
  130. # ss += sys.getsizeof(spss)
  131. # for spp in spss:
  132. # ss += sys.getsizeof(spp)
  133. # time.sleep(20)
  134. # # ---- direct running, normally use single CPU core. ----
  135. # splist = []
  136. # for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout):
  137. # splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed']))
  138. # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP)
  139. # sp_ml = [0] * len(Gn) # shortest path matrices
  140. # for i in result_sp:
  141. # sp_ml[i[0]] = i[1]
  142. # edge_x_g = [[] for i in range(len(sp_ml))]
  143. # edge_y_g = [[] for i in range(len(sp_ml))]
  144. # edge_w_g = [[] for i in range(len(sp_ml))]
  145. # for idx, item in enumerate(sp_ml):
  146. # for i1 in range(len(item)):
  147. # for i2 in range(i1 + 1, len(item)):
  148. # if item[i1, i2] != np.inf:
  149. # edge_x_g[idx].append(i1)
  150. # edge_y_g[idx].append(i2)
  151. # edge_w_g[idx].append(item[i1, i2])
  152. # print(len(edge_x_g[0]))
  153. # print(len(edge_y_g[0]))
  154. # print(len(edge_w_g[0]))
  155. Kmatrix = np.zeros((len(Gn), len(Gn)))
  156. # ---- use pool.imap_unordered to parallel and track progress. ----
  157. def init_worker(spl_toshare, gs_toshare):
  158. global G_spl, G_gs
  159. G_spl = spl_toshare
  160. G_gs = gs_toshare
  161. do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
  162. node_kernels, edge_kernels)
  163. parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  164. glbv=(splist, Gn), n_jobs=n_jobs)
  165. # # ---- use pool.imap_unordered to parallel and track progress. ----
  166. # pool = Pool(n_jobs)
  167. # do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
  168. # node_kernels, edge_kernels)
  169. # itr = zip(combinations_with_replacement(Gn, 2),
  170. # combinations_with_replacement(splist, 2),
  171. # combinations_with_replacement(range(0, len(Gn)), 2))
  172. # len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
  173. # if len_itr < 1000 * n_jobs:
  174. # chunksize = int(len_itr / n_jobs) + 1
  175. # else:
  176. # chunksize = 1000
  177. # for i, j, kernel in tqdm(
  178. # pool.imap_unordered(do_partial, itr, chunksize),
  179. # desc='calculating kernels',
  180. # file=sys.stdout):
  181. # Kmatrix[i][j] = kernel
  182. # Kmatrix[j][i] = kernel
  183. # pool.close()
  184. # pool.join()
  185. # # ---- use pool.map to parallel. ----
  186. # pool = Pool(n_jobs)
  187. # do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
  188. # node_kernels, edge_kernels)
  189. # itr = zip(combinations_with_replacement(Gn, 2),
  190. # combinations_with_replacement(splist, 2),
  191. # combinations_with_replacement(range(0, len(Gn)), 2))
  192. # for i, j, kernel in tqdm(
  193. # pool.map(do_partial, itr), desc='calculating kernels',
  194. # file=sys.stdout):
  195. # Kmatrix[i][j] = kernel
  196. # Kmatrix[j][i] = kernel
  197. # pool.close()
  198. # pool.join()
  199. # # ---- use pool.imap_unordered to parallel and track progress. ----
  200. # do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
  201. # node_kernels, edge_kernels)
  202. # itr = zip(combinations_with_replacement(Gn, 2),
  203. # combinations_with_replacement(splist, 2),
  204. # combinations_with_replacement(range(0, len(Gn)), 2))
  205. # len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
  206. # if len_itr < 1000 * n_jobs:
  207. # chunksize = int(len_itr / n_jobs) + 1
  208. # else:
  209. # chunksize = 1000
  210. # from contextlib import closing
  211. # with closing(Pool(n_jobs)) as pool:
  212. # for i, j, kernel in tqdm(
  213. # pool.imap_unordered(do_partial, itr, 1000),
  214. # desc='calculating kernels',
  215. # file=sys.stdout):
  216. # Kmatrix[i][j] = kernel
  217. # Kmatrix[j][i] = kernel
  218. # pool.close()
  219. # pool.join()
  220. # # ---- direct running, normally use single CPU core. ----
  221. # from itertools import combinations_with_replacement
  222. # itr = combinations_with_replacement(range(0, len(Gn)), 2)
  223. # for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
  224. # kernel = structuralspkernel_do(Gn[i], Gn[j], splist[i], splist[j],
  225. # ds_attrs, node_label, edge_label, node_kernels, edge_kernels)
  226. ## if(kernel > 1):
  227. ## print("error here ")
  228. # Kmatrix[i][j] = kernel
  229. # Kmatrix[j][i] = kernel
  230. run_time = time.time() - start_time
  231. print(
  232. "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
  233. % (len(Gn), run_time))
  234. return Kmatrix, run_time
  235. def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label,
  236. node_kernels, edge_kernels):
  237. kernel = 0
  238. # First, compute shortest path matrices, method borrowed from FCSP.
  239. vk_dict = {} # shortest path matrices dict
  240. if ds_attrs['node_labeled']:
  241. # node symb and non-synb labeled
  242. if ds_attrs['node_attr_dim'] > 0:
  243. kn = node_kernels['mix']
  244. for n1, n2 in product(
  245. g1.nodes(data=True), g2.nodes(data=True)):
  246. vk_dict[(n1[0], n2[0])] = kn(
  247. n1[1][node_label], n2[1][node_label],
  248. n1[1]['attributes'], n2[1]['attributes'])
  249. # node symb labeled
  250. else:
  251. kn = node_kernels['symb']
  252. for n1 in g1.nodes(data=True):
  253. for n2 in g2.nodes(data=True):
  254. vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
  255. n2[1][node_label])
  256. else:
  257. # node non-synb labeled
  258. if ds_attrs['node_attr_dim'] > 0:
  259. kn = node_kernels['nsymb']
  260. for n1 in g1.nodes(data=True):
  261. for n2 in g2.nodes(data=True):
  262. vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
  263. n2[1]['attributes'])
  264. # node unlabeled
  265. else:
  266. pass
  267. # Then, compute kernels between all pairs of edges, which idea is an
  268. # extension of FCSP. It suits sparse graphs, which is the most case we
  269. # went though. For dense graphs, this would be slow.
  270. ek_dict = {} # dict of edge kernels
  271. if ds_attrs['edge_labeled']:
  272. # edge symb and non-synb labeled
  273. if ds_attrs['edge_attr_dim'] > 0:
  274. ke = edge_kernels['mix']
  275. for e1, e2 in product(
  276. g1.edges(data=True), g2.edges(data=True)):
  277. ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
  278. e1[2]['attributes'], e2[2]['attributes'])
  279. ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
  280. ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
  281. ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
  282. ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
  283. # edge symb labeled
  284. else:
  285. ke = edge_kernels['symb']
  286. for e1 in g1.edges(data=True):
  287. for e2 in g2.edges(data=True):
  288. ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
  289. ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
  290. ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
  291. ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
  292. ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
  293. else:
  294. # edge non-synb labeled
  295. if ds_attrs['edge_attr_dim'] > 0:
  296. ke = edge_kernels['nsymb']
  297. for e1 in g1.edges(data=True):
  298. for e2 in g2.edges(data=True):
  299. ek_temp = kn(e1[2]['attributes'], e2[2]['attributes'])
  300. ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
  301. ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
  302. ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
  303. ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
  304. # edge unlabeled
  305. else:
  306. pass
  307. # compute graph kernels
  308. if vk_dict:
  309. if ek_dict:
  310. for p1, p2 in product(spl1, spl2):
  311. if len(p1) == len(p2):
  312. kpath = vk_dict[(p1[0], p2[0])]
  313. if kpath:
  314. for idx in range(1, len(p1)):
  315. kpath *= vk_dict[(p1[idx], p2[idx])] * \
  316. ek_dict[((p1[idx-1], p1[idx]),
  317. (p2[idx-1], p2[idx]))]
  318. if not kpath:
  319. break
  320. kernel += kpath # add up kernels of all paths
  321. else:
  322. for p1, p2 in product(spl1, spl2):
  323. if len(p1) == len(p2):
  324. kpath = vk_dict[(p1[0], p2[0])]
  325. if kpath:
  326. for idx in range(1, len(p1)):
  327. kpath *= vk_dict[(p1[idx], p2[idx])]
  328. if not kpath:
  329. break
  330. kernel += kpath # add up kernels of all paths
  331. else:
  332. if ek_dict:
  333. for p1, p2 in product(spl1, spl2):
  334. if len(p1) == len(p2):
  335. if len(p1) == 0:
  336. kernel += 1
  337. else:
  338. kpath = 1
  339. for idx in range(0, len(p1) - 1):
  340. kpath *= ek_dict[((p1[idx], p1[idx+1]),
  341. (p2[idx], p2[idx+1]))]
  342. if not kpath:
  343. break
  344. kernel += kpath # add up kernels of all paths
  345. else:
  346. for p1, p2 in product(spl1, spl2):
  347. if len(p1) == len(p2):
  348. kernel += 1
  349. kernel = kernel / (len(spl1) * len(spl2)) # calculate mean average
  350. # # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2], sadly it is slower than the current implementation
  351. # # compute vertex kernel matrix
  352. # try:
  353. # vk_mat = np.zeros((nx.number_of_nodes(g1),
  354. # nx.number_of_nodes(g2)))
  355. # g1nl = enumerate(g1.nodes(data=True))
  356. # g2nl = enumerate(g2.nodes(data=True))
  357. # for i1, n1 in g1nl:
  358. # for i2, n2 in g2nl:
  359. # vk_mat[i1][i2] = kn(
  360. # n1[1][node_label], n2[1][node_label],
  361. # [n1[1]['attributes']], [n2[1]['attributes']])
  362. # range1 = range(0, len(edge_w_g[i]))
  363. # range2 = range(0, len(edge_w_g[j]))
  364. # for i1 in range1:
  365. # x1 = edge_x_g[i][i1]
  366. # y1 = edge_y_g[i][i1]
  367. # w1 = edge_w_g[i][i1]
  368. # for i2 in range2:
  369. # x2 = edge_x_g[j][i2]
  370. # y2 = edge_y_g[j][i2]
  371. # w2 = edge_w_g[j][i2]
  372. # ke = (w1 == w2)
  373. # if ke > 0:
  374. # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
  375. # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
  376. # Kmatrix += kn1 + kn2
  377. return kernel
  378. def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels,
  379. edge_kernels, itr):
  380. i = itr[0]
  381. j = itr[1]
  382. return i, j, structuralspkernel_do(G_gs[i], G_gs[j], G_spl[i], G_spl[j],
  383. ds_attrs, node_label, edge_label,
  384. node_kernels, edge_kernels)
  385. def get_shortest_paths(G, weight, directed):
  386. """Get all shortest paths of a graph.
  387. Parameters
  388. ----------
  389. G : NetworkX graphs
  390. The graphs whose paths are calculated.
  391. weight : string/None
  392. edge attribute used as weight to calculate the shortest path.
  393. directed: boolean
  394. Whether graph is directed.
  395. Return
  396. ------
  397. sp : list of list
  398. List of shortest paths of the graph, where each path is represented by a list of nodes.
  399. """
  400. sp = []
  401. for n1, n2 in combinations(G.nodes(), 2):
  402. try:
  403. spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
  404. except nx.NetworkXNoPath: # nodes not connected
  405. # sp.append([])
  406. pass
  407. else:
  408. sp += spltemp
  409. # each edge walk is counted twice, starting from both its extreme nodes.
  410. if not directed:
  411. sp += [sptemp[::-1] for sptemp in spltemp]
  412. # add single nodes as length 0 paths.
  413. sp += [[n] for n in G.nodes()]
  414. return sp
  415. def wrapper_getSP(weight, directed, itr_item):
  416. g = itr_item[0]
  417. i = itr_item[1]
  418. return i, get_shortest_paths(g, weight, directed)

A Python package for graph kernels, graph edit distances and graph pre-image problem.