You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

sp_sym.py 7.2 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Fri Dec 21 18:02:00 2018
  5. @author: ljia
  6. """
  7. import sys
  8. import time
  9. from itertools import product
  10. from functools import partial
  11. from multiprocessing import Pool
  12. from tqdm import tqdm
  13. import networkx as nx
  14. import numpy as np
  15. from gklearn.utils.utils import getSPGraph
  16. from gklearn.utils.graphdataset import get_dataset_attributes
  17. from gklearn.utils.parallel import parallel_gm
  18. sys.path.insert(0, "../")
  19. def spkernel(*args,
  20. node_label='atom',
  21. edge_weight=None,
  22. node_kernels=None,
  23. n_jobs=None):
  24. """Calculate shortest-path kernels between graphs.
  25. Parameters
  26. ----------
  27. Gn : List of NetworkX graph
  28. List of graphs between which the kernels are calculated.
  29. /
  30. G1, G2 : NetworkX graphs
  31. 2 graphs between which the kernel is calculated.
  32. node_label : string
  33. node attribute used as label. The default node label is atom.
  34. edge_weight : string
  35. Edge attribute name corresponding to the edge weight.
  36. node_kernels: dict
  37. A dictionary of kernel functions for nodes, including 3 items: 'symb'
  38. for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
  39. for both labels. The first 2 functions take two node labels as
  40. parameters, and the 'mix' function takes 4 parameters, a symbolic and a
  41. non-symbolic label for each the two nodes. Each label is in form of 2-D
  42. dimension array (n_samples, n_features). Each function returns an
  43. number as the kernel value. Ignored when nodes are unlabeled.
  44. Return
  45. ------
  46. Kmatrix : Numpy matrix
  47. Kernel matrix, each element of which is the sp kernel between 2 praphs.
  48. """
  49. # pre-process
  50. Gn = args[0] if len(args) == 1 else [args[0], args[1]]
  51. weight = None
  52. if edge_weight is None:
  53. print('\n None edge weight specified. Set all weight to 1.\n')
  54. else:
  55. try:
  56. some_weight = list(
  57. nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
  58. if isinstance(some_weight, (float, int)):
  59. weight = edge_weight
  60. else:
  61. print(
  62. '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
  63. % edge_weight)
  64. except:
  65. print(
  66. '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
  67. % edge_weight)
  68. ds_attrs = get_dataset_attributes(
  69. Gn,
  70. attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
  71. node_label=node_label)
  72. ds_attrs['node_attr_dim'] = 0
  73. # remove graphs with no edges, as no sp can be found in their structures,
  74. # so the kernel between such a graph and itself will be zero.
  75. len_gn = len(Gn)
  76. Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
  77. idx = [G[0] for G in Gn]
  78. Gn = [G[1] for G in Gn]
  79. if len(Gn) != len_gn:
  80. print('\n %d graphs are removed as they don\'t contain edges.\n' %
  81. (len_gn - len(Gn)))
  82. start_time = time.time()
  83. pool = Pool(n_jobs)
  84. # get shortest path graphs of Gn
  85. getsp_partial = partial(wrapper_getSPGraph, weight)
  86. itr = zip(Gn, range(0, len(Gn)))
  87. if len(Gn) < 100 * n_jobs:
  88. # # use default chunksize as pool.map when iterable is less than 100
  89. # chunksize, extra = divmod(len(Gn), n_jobs * 4)
  90. # if extra:
  91. # chunksize += 1
  92. chunksize = int(len(Gn) / n_jobs) + 1
  93. else:
  94. chunksize = 100
  95. for i, g in tqdm(
  96. pool.imap_unordered(getsp_partial, itr, chunksize),
  97. desc='getting sp graphs', file=sys.stdout):
  98. Gn[i] = g
  99. pool.close()
  100. pool.join()
  101. Kmatrix = np.zeros((len(Gn), len(Gn)))
  102. # ---- use pool.imap_unordered to parallel and track progress. ----
  103. def init_worker(gn_toshare):
  104. global G_gn
  105. G_gn = gn_toshare
  106. do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
  107. parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  108. glbv=(Gn,), n_jobs=n_jobs)
  109. run_time = time.time() - start_time
  110. print(
  111. "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
  112. % (len(Gn), run_time))
  113. return Kmatrix, run_time, idx
  114. def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels):
  115. kernel = 0
  116. # compute shortest path matrices first, method borrowed from FCSP.
  117. vk_dict = {} # shortest path matrices dict
  118. if ds_attrs['node_labeled']:
  119. # node symb and non-synb labeled
  120. if ds_attrs['node_attr_dim'] > 0:
  121. kn = node_kernels['mix']
  122. for n1, n2 in product(
  123. g1.nodes(data=True), g2.nodes(data=True)):
  124. vk_dict[(n1[0], n2[0])] = kn(
  125. n1[1][node_label], n2[1][node_label],
  126. n1[1]['attributes'], n2[1]['attributes'])
  127. # node symb labeled
  128. else:
  129. kn = node_kernels['symb']
  130. for n1 in g1.nodes(data=True):
  131. for n2 in g2.nodes(data=True):
  132. vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
  133. n2[1][node_label])
  134. else:
  135. # node non-synb labeled
  136. if ds_attrs['node_attr_dim'] > 0:
  137. kn = node_kernels['nsymb']
  138. for n1 in g1.nodes(data=True):
  139. for n2 in g2.nodes(data=True):
  140. vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
  141. n2[1]['attributes'])
  142. # node unlabeled
  143. else:
  144. for e1, e2 in product(
  145. g1.edges(data=True), g2.edges(data=True)):
  146. if e1[2]['cost'] == e2[2]['cost']:
  147. kernel += 1
  148. return kernel
  149. # compute graph kernels
  150. if ds_attrs['is_directed']:
  151. for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
  152. if e1[2]['cost'] == e2[2]['cost']:
  153. nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1],
  154. e2[1])]
  155. kn1 = nk11 * nk22
  156. kernel += kn1
  157. else:
  158. for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
  159. if e1[2]['cost'] == e2[2]['cost']:
  160. # each edge walk is counted twice, starting from both its extreme nodes.
  161. nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(
  162. e1[0], e2[1])], vk_dict[(e1[1],
  163. e2[0])], vk_dict[(e1[1],
  164. e2[1])]
  165. kn1 = nk11 * nk22
  166. kn2 = nk12 * nk21
  167. kernel += kn1 + kn2
  168. return kernel
  169. def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr):
  170. i = itr[0]
  171. j = itr[1]
  172. return i, j, spkernel_do(G_gn[i], G_gn[j], ds_attrs, node_label, node_kernels)
  173. def wrapper_getSPGraph(weight, itr_item):
  174. g = itr_item[0]
  175. i = itr_item[1]
  176. return i, getSPGraph(g, edge_weight=weight)

A Python package for graph kernels, graph edit distances, and the graph pre-image problem.