You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

utils.py 15 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509
  1. import networkx as nx
  2. import numpy as np
  3. from copy import deepcopy
  4. from enum import Enum, unique
  5. #from itertools import product
  6. # from tqdm import tqdm
  7. def getSPLengths(G1):
  8. sp = nx.shortest_path(G1)
  9. distances = np.zeros((G1.number_of_nodes(), G1.number_of_nodes()))
  10. for i in sp.keys():
  11. for j in sp[i].keys():
  12. distances[i, j] = len(sp[i][j]) - 1
  13. return distances
  14. def getSPGraph(G, edge_weight=None):
  15. """Transform graph G to its corresponding shortest-paths graph.
  16. Parameters
  17. ----------
  18. G : NetworkX graph
  19. The graph to be tramsformed.
  20. edge_weight : string
  21. edge attribute corresponding to the edge weight.
  22. Return
  23. ------
  24. S : NetworkX graph
  25. The shortest-paths graph corresponding to G.
  26. Notes
  27. ------
  28. For an input graph G, its corresponding shortest-paths graph S contains the same set of nodes as G, while there exists an edge between all nodes in S which are connected by a walk in G. Every edge in S between two nodes is labeled by the shortest distance between these two nodes.
  29. References
  30. ----------
  31. .. [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. InData Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE.
  32. """
  33. return floydTransformation(G, edge_weight=edge_weight)
  34. def floydTransformation(G, edge_weight=None):
  35. """Transform graph G to its corresponding shortest-paths graph using Floyd-transformation.
  36. Parameters
  37. ----------
  38. G : NetworkX graph
  39. The graph to be tramsformed.
  40. edge_weight : string
  41. edge attribute corresponding to the edge weight. The default edge weight is bond_type.
  42. Return
  43. ------
  44. S : NetworkX graph
  45. The shortest-paths graph corresponding to G.
  46. References
  47. ----------
  48. .. [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. InData Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE.
  49. """
  50. spMatrix = nx.floyd_warshall_numpy(G, weight=edge_weight)
  51. S = nx.Graph()
  52. S.add_nodes_from(G.nodes(data=True))
  53. ns = list(G.nodes())
  54. for i in range(0, G.number_of_nodes()):
  55. for j in range(i + 1, G.number_of_nodes()):
  56. if spMatrix[i, j] != np.inf:
  57. S.add_edge(ns[i], ns[j], cost=spMatrix[i, j])
  58. return S
  59. def get_shortest_paths(G, weight, directed):
  60. """Get all shortest paths of a graph.
  61. Parameters
  62. ----------
  63. G : NetworkX graphs
  64. The graphs whose paths are calculated.
  65. weight : string/None
  66. edge attribute used as weight to calculate the shortest path.
  67. directed: boolean
  68. Whether graph is directed.
  69. Return
  70. ------
  71. sp : list of list
  72. List of shortest paths of the graph, where each path is represented by a list of nodes.
  73. """
  74. from itertools import combinations
  75. sp = []
  76. for n1, n2 in combinations(G.nodes(), 2):
  77. try:
  78. spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
  79. except nx.NetworkXNoPath: # nodes not connected
  80. pass
  81. else:
  82. sp += spltemp
  83. # each edge walk is counted twice, starting from both its extreme nodes.
  84. if not directed:
  85. sp += [sptemp[::-1] for sptemp in spltemp]
  86. # add single nodes as length 0 paths.
  87. sp += [[n] for n in G.nodes()]
  88. return sp
  89. def untotterTransformation(G, node_label, edge_label):
  90. """Transform graph G according to Mahé et al.'s work to filter out tottering patterns of marginalized kernel and tree pattern kernel.
  91. Parameters
  92. ----------
  93. G : NetworkX graph
  94. The graph to be tramsformed.
  95. node_label : string
  96. node attribute used as label. The default node label is 'atom'.
  97. edge_label : string
  98. edge attribute used as label. The default edge label is 'bond_type'.
  99. Return
  100. ------
  101. gt : NetworkX graph
  102. The transformed graph corresponding to G.
  103. References
  104. ----------
  105. .. [1] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and Jean-Philippe Vert. Extensions of marginalized graph kernels. In Proceedings of the twenty-first international conference on Machine learning, page 70. ACM, 2004.
  106. """
  107. # arrange all graphs in a list
  108. G = G.to_directed()
  109. gt = nx.Graph()
  110. gt.graph = G.graph
  111. gt.add_nodes_from(G.nodes(data=True))
  112. for edge in G.edges():
  113. gt.add_node(edge)
  114. gt.nodes[edge].update({node_label: G.nodes[edge[1]][node_label]})
  115. gt.add_edge(edge[0], edge)
  116. gt.edges[edge[0], edge].update({
  117. edge_label:
  118. G[edge[0]][edge[1]][edge_label]
  119. })
  120. for neighbor in G[edge[1]]:
  121. if neighbor != edge[0]:
  122. gt.add_edge(edge, (edge[1], neighbor))
  123. gt.edges[edge, (edge[1], neighbor)].update({
  124. edge_label:
  125. G[edge[1]][neighbor][edge_label]
  126. })
  127. # nx.draw_networkx(gt)
  128. # plt.show()
  129. # relabel nodes using consecutive integers for convenience of kernel calculation.
  130. gt = nx.convert_node_labels_to_integers(
  131. gt, first_label=0, label_attribute='label_orignal')
  132. return gt
  133. def direct_product(G1, G2, node_label, edge_label):
  134. """Return the direct/tensor product of directed graphs G1 and G2.
  135. Parameters
  136. ----------
  137. G1, G2 : NetworkX graph
  138. The original graphs.
  139. node_label : string
  140. node attribute used as label. The default node label is 'atom'.
  141. edge_label : string
  142. edge attribute used as label. The default edge label is 'bond_type'.
  143. Return
  144. ------
  145. gt : NetworkX graph
  146. The direct product graph of G1 and G2.
  147. Notes
  148. -----
  149. This method differs from networkx.tensor_product in that this method only adds nodes and edges in G1 and G2 that have the same labels to the direct product graph.
  150. References
  151. ----------
  152. .. [1] Thomas Gärtner, Peter Flach, and Stefan Wrobel. On graph kernels: Hardness results and efficient alternatives. Learning Theory and Kernel Machines, pages 129–143, 2003.
  153. """
  154. # arrange all graphs in a list
  155. from itertools import product
  156. # G = G.to_directed()
  157. gt = nx.DiGraph()
  158. # add nodes
  159. for u, v in product(G1, G2):
  160. if G1.nodes[u][node_label] == G2.nodes[v][node_label]:
  161. gt.add_node((u, v))
  162. gt.nodes[(u, v)].update({node_label: G1.nodes[u][node_label]})
  163. # add edges, faster for sparse graphs (no so many edges), which is the most case for now.
  164. for (u1, v1), (u2, v2) in product(G1.edges, G2.edges):
  165. if (u1, u2) in gt and (
  166. v1, v2
  167. ) in gt and G1.edges[u1, v1][edge_label] == G2.edges[u2,
  168. v2][edge_label]:
  169. gt.add_edge((u1, u2), (v1, v2))
  170. gt.edges[(u1, u2), (v1, v2)].update({
  171. edge_label:
  172. G1.edges[u1, v1][edge_label]
  173. })
  174. # # add edges, faster for dense graphs (a lot of edges, complete graph would be super).
  175. # for u, v in product(gt, gt):
  176. # if (u[0], v[0]) in G1.edges and (
  177. # u[1], v[1]
  178. # ) in G2.edges and G1.edges[u[0],
  179. # v[0]][edge_label] == G2.edges[u[1],
  180. # v[1]][edge_label]:
  181. # gt.add_edge((u[0], u[1]), (v[0], v[1]))
  182. # gt.edges[(u[0], u[1]), (v[0], v[1])].update({
  183. # edge_label:
  184. # G1.edges[u[0], v[0]][edge_label]
  185. # })
  186. # relabel nodes using consecutive integers for convenience of kernel calculation.
  187. # gt = nx.convert_node_labels_to_integers(
  188. # gt, first_label=0, label_attribute='label_orignal')
  189. return gt
  190. def graph_deepcopy(G):
  191. """Deep copy a graph, including deep copy of all nodes, edges and
  192. attributes of the graph, nodes and edges.
  193. Note
  194. ----
  195. It is the same as the NetworkX function graph.copy(), as far as I know.
  196. """
  197. # add graph attributes.
  198. labels = {}
  199. for k, v in G.graph.items():
  200. labels[k] = deepcopy(v)
  201. if G.is_directed():
  202. G_copy = nx.DiGraph(**labels)
  203. else:
  204. G_copy = nx.Graph(**labels)
  205. # add nodes
  206. for nd, attrs in G.nodes(data=True):
  207. labels = {}
  208. for k, v in attrs.items():
  209. labels[k] = deepcopy(v)
  210. G_copy.add_node(nd, **labels)
  211. # add edges.
  212. for nd1, nd2, attrs in G.edges(data=True):
  213. labels = {}
  214. for k, v in attrs.items():
  215. labels[k] = deepcopy(v)
  216. G_copy.add_edge(nd1, nd2, **labels)
  217. return G_copy
  218. def graph_isIdentical(G1, G2):
  219. """Check if two graphs are identical, including: same nodes, edges, node
  220. labels/attributes, edge labels/attributes.
  221. Notes
  222. -----
  223. 1. The type of graphs has to be the same.
  224. 2. Global/Graph attributes are neglected as they may contain names for graphs.
  225. """
  226. # check nodes.
  227. nlist1 = [n for n in G1.nodes(data=True)]
  228. nlist2 = [n for n in G2.nodes(data=True)]
  229. if not nlist1 == nlist2:
  230. return False
  231. # check edges.
  232. elist1 = [n for n in G1.edges(data=True)]
  233. elist2 = [n for n in G2.edges(data=True)]
  234. if not elist1 == elist2:
  235. return False
  236. # check graph attributes.
  237. return True
  238. def get_node_labels(Gn, node_label):
  239. """Get node labels of dataset Gn.
  240. """
  241. nl = set()
  242. for G in Gn:
  243. nl = nl | set(nx.get_node_attributes(G, node_label).values())
  244. return nl
  245. def get_edge_labels(Gn, edge_label):
  246. """Get edge labels of dataset Gn.
  247. """
  248. el = set()
  249. for G in Gn:
  250. el = el | set(nx.get_edge_attributes(G, edge_label).values())
  251. return el
  252. def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attrs=None, edge_attrs=None, ds_infos=None, kernel_options={}):
  253. if name == 'Marginalized':
  254. from gklearn.kernels import Marginalized
  255. graph_kernel = Marginalized(node_labels=node_labels,
  256. edge_labels=edge_labels,
  257. ds_infos=ds_infos,
  258. **kernel_options)
  259. elif name == 'ShortestPath':
  260. from gklearn.kernels import ShortestPath
  261. graph_kernel = ShortestPath(node_labels=node_labels,
  262. node_attrs=node_attrs,
  263. ds_infos=ds_infos,
  264. **kernel_options)
  265. elif name == 'StructuralSP':
  266. from gklearn.kernels import StructuralSP
  267. graph_kernel = StructuralSP(node_labels=node_labels,
  268. edge_labels=edge_labels,
  269. node_attrs=node_attrs,
  270. edge_attrs=edge_attrs,
  271. ds_infos=ds_infos,
  272. **kernel_options)
  273. elif name == 'PathUpToH':
  274. from gklearn.kernels import PathUpToH
  275. graph_kernel = PathUpToH(node_labels=node_labels,
  276. edge_labels=edge_labels,
  277. ds_infos=ds_infos,
  278. **kernel_options)
  279. elif name == 'Treelet':
  280. from gklearn.kernels import Treelet
  281. graph_kernel = Treelet(node_labels=node_labels,
  282. edge_labels=edge_labels,
  283. ds_infos=ds_infos,
  284. **kernel_options)
  285. elif name == 'WLSubtree':
  286. from gklearn.kernels import WLSubtree
  287. graph_kernel = WLSubtree(node_labels=node_labels,
  288. edge_labels=edge_labels,
  289. ds_infos=ds_infos,
  290. **kernel_options)
  291. elif name == 'WeisfeilerLehman':
  292. from gklearn.kernels import WeisfeilerLehman
  293. graph_kernel = WeisfeilerLehman(node_labels=node_labels,
  294. edge_labels=edge_labels,
  295. ds_infos=ds_infos,
  296. **kernel_options)
  297. else:
  298. raise Exception('The graph kernel given is not defined. Possible choices include: "StructuralSP", "ShortestPath", "PathUpToH", "Treelet", "WLSubtree", "WeisfeilerLehman".')
  299. return graph_kernel
  300. def compute_gram_matrices_by_class(ds_name, kernel_options, save_results=True, dir_save='', irrelevant_labels=None, edge_required=False):
  301. import os
  302. from gklearn.utils import Dataset, split_dataset_by_target
  303. # 1. get dataset.
  304. print('1. getting dataset...')
  305. dataset_all = Dataset()
  306. dataset_all.load_predefined_dataset(ds_name)
  307. dataset_all.trim_dataset(edge_required=edge_required)
  308. if not irrelevant_labels is None:
  309. dataset_all.remove_labels(**irrelevant_labels)
  310. # dataset_all.cut_graphs(range(0, 10))
  311. datasets = split_dataset_by_target(dataset_all)
  312. gram_matrix_unnorm_list = []
  313. run_time_list = []
  314. print('start generating preimage for each class of target...')
  315. for idx, dataset in enumerate(datasets):
  316. target = dataset.targets[0]
  317. print('\ntarget =', target, '\n')
  318. # 2. initialize graph kernel.
  319. print('2. initializing graph kernel and setting parameters...')
  320. graph_kernel = get_graph_kernel_by_name(kernel_options['name'],
  321. node_labels=dataset.node_labels,
  322. edge_labels=dataset.edge_labels,
  323. node_attrs=dataset.node_attrs,
  324. edge_attrs=dataset.edge_attrs,
  325. ds_infos=dataset.get_dataset_infos(keys=['directed']),
  326. kernel_options=kernel_options)
  327. # 3. compute gram matrix.
  328. print('3. computing gram matrix...')
  329. gram_matrix, run_time = graph_kernel.compute(dataset.graphs, **kernel_options)
  330. gram_matrix_unnorm = graph_kernel.gram_matrix_unnorm
  331. gram_matrix_unnorm_list.append(gram_matrix_unnorm)
  332. run_time_list.append(run_time)
  333. # 4. save results.
  334. print()
  335. print('4. saving results...')
  336. if save_results:
  337. if not os.path.exists(dir_save):
  338. os.makedirs(dir_save)
  339. np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm_list=gram_matrix_unnorm_list, run_time_list=run_time_list)
  340. print('\ncomplete.')
  341. def find_paths(G, source_node, length):
  342. """Find all paths with a certain length those start from a source node.
  343. A recursive depth first search is applied.
  344. Parameters
  345. ----------
  346. G : NetworkX graphs
  347. The graph in which paths are searched.
  348. source_node : integer
  349. The number of the node from where all paths start.
  350. length : integer
  351. The length of paths.
  352. Return
  353. ------
  354. path : list of list
  355. List of paths retrieved, where each path is represented by a list of nodes.
  356. """
  357. if length == 0:
  358. return [[source_node]]
  359. path = [[source_node] + path for neighbor in G[source_node] \
  360. for path in find_paths(G, neighbor, length - 1) if source_node not in path]
  361. return path
  362. def find_all_paths(G, length, is_directed):
  363. """Find all paths with a certain length in a graph. A recursive depth first
  364. search is applied.
  365. Parameters
  366. ----------
  367. G : NetworkX graphs
  368. The graph in which paths are searched.
  369. length : integer
  370. The length of paths.
  371. Return
  372. ------
  373. path : list of list
  374. List of paths retrieved, where each path is represented by a list of nodes.
  375. """
  376. all_paths = []
  377. for node in G:
  378. all_paths.extend(find_paths(G, node, length))
  379. if not is_directed:
  380. # For each path, two presentations are retrieved from its two extremities.
  381. # Remove one of them.
  382. all_paths_r = [path[::-1] for path in all_paths]
  383. for idx, path in enumerate(all_paths[:-1]):
  384. for path2 in all_paths_r[idx+1::]:
  385. if path == path2:
  386. all_paths[idx] = []
  387. break
  388. all_paths = list(filter(lambda a: a != [], all_paths))
  389. return all_paths
  390. def get_mlti_dim_node_attrs(G, attr_names):
  391. attributes = []
  392. for nd, attrs in G.nodes(data=True):
  393. attributes.append(tuple(attrs[aname] for aname in attr_names))
  394. return attributes
  395. def get_mlti_dim_edge_attrs(G, attr_names):
  396. attributes = []
  397. for ed, attrs in G.edges(data=True):
  398. attributes.append(tuple(attrs[aname] for aname in attr_names))
  399. return attributes
  400. @unique
  401. class SpecialLabel(Enum):
  402. """can be used to define special labels.
  403. """
  404. DUMMY = 1 # The dummy label.
  405. # DUMMY = auto # enum.auto does not exist in Python 3.5.
  406. def normalize_gram_matrix(gram_matrix):
  407. diag = gram_matrix.diagonal().copy()
  408. for i in range(len(gram_matrix)):
  409. for j in range(i, len(gram_matrix)):
  410. gram_matrix[i][j] /= np.sqrt(diag[i] * diag[j])
  411. gram_matrix[j][i] = gram_matrix[i][j]
  412. return gram_matrix
  413. def compute_distance_matrix(gram_matrix):
  414. dis_mat = np.empty((len(gram_matrix), len(gram_matrix)))
  415. for i in range(len(gram_matrix)):
  416. for j in range(i, len(gram_matrix)):
  417. dis = gram_matrix[i, i] + gram_matrix[j, j] - 2 * gram_matrix[i, j]
  418. if dis < 0:
  419. if dis > -1e-10:
  420. dis = 0
  421. else:
  422. raise ValueError('The distance is negative.')
  423. dis_mat[i, j] = np.sqrt(dis)
  424. dis_mat[j, i] = dis_mat[i, j]
  425. dis_max = np.max(np.max(dis_mat))
  426. dis_min = np.min(np.min(dis_mat[dis_mat != 0]))
  427. dis_mean = np.mean(np.mean(dis_mat))
  428. return dis_mat, dis_max, dis_min, dis_mean

A Python package for graph kernels, graph edit distances and graph pre-image problem.