You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

xp_simple_preimage.py 7.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Fri Jun 12 10:30:17 2020
  5. @author: ljia
  6. This script constructs simple preimages to test preimage methods and find bugs and shortcomings in them.
  7. """
  8. def xp_simple_preimage():
  9. import numpy as np
  10. """**1. Get dataset.**"""
  11. from gklearn.utils import Dataset, split_dataset_by_target
  12. # Predefined dataset name, use dataset "MAO".
  13. ds_name = 'MAO'
  14. # The node/edge labels that will not be used in the computation.
  15. irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']}
  16. # Initialize a Dataset.
  17. dataset_all = Dataset()
  18. # Load predefined dataset "MAO".
  19. dataset_all.load_predefined_dataset(ds_name)
  20. # Remove irrelevant labels.
  21. dataset_all.remove_labels(**irrelevant_labels)
  22. # Split the whole dataset according to the classification targets.
  23. datasets = split_dataset_by_target(dataset_all)
  24. # Get the first class of graphs, whose median preimage will be computed.
  25. dataset = datasets[0]
  26. len(dataset.graphs)
  27. """**2. Set parameters.**"""
  28. import multiprocessing
  29. # Parameters for MedianPreimageGenerator (our method).
  30. mpg_options = {'fit_method': 'k-graphs', # how to fit edit costs. "k-graphs" means use all graphs in median set when fitting.
  31. 'init_ecc': [4, 4, 2, 1, 1, 1], # initial edit costs.
  32. 'ds_name': ds_name, # name of the dataset.
  33. 'parallel': True, # whether the parallel scheme is to be used.
  34. 'time_limit_in_sec': 0, # maximum time limit to compute the preimage. If set to 0 then no limit.
  35. 'max_itrs': 10, # maximum iteration limit to optimize edit costs. If set to 0 then no limit.
  36. 'max_itrs_without_update': 3, # If the times that edit costs is not update is more than this number, then the optimization stops.
  37. 'epsilon_residual': 0.01, # In optimization, the residual is only considered changed if the change is bigger than this number.
  38. 'epsilon_ec': 0.1, # In optimization, the edit costs are only considered changed if the changes are bigger than this number.
  39. 'verbose': 2 # whether to print out results.
  40. }
  41. # Parameters for graph kernel computation.
  42. kernel_options = {'name': 'PathUpToH', # use path kernel up to length h.
  43. 'depth': 9,
  44. 'k_func': 'MinMax',
  45. 'compute_method': 'trie',
  46. 'parallel': 'imap_unordered', # or None
  47. 'n_jobs': multiprocessing.cpu_count(),
  48. 'normalize': True, # whether to use normalized Gram matrix to optimize edit costs.
  49. 'verbose': 2 # whether to print out results.
  50. }
  51. # Parameters for GED computation.
  52. ged_options = {'method': 'IPFP', # use IPFP huristic.
  53. 'initialization_method': 'RANDOM', # or 'NODE', etc.
  54. 'initial_solutions': 10, # when bigger than 1, then the method is considered mIPFP.
  55. 'edit_cost': 'CONSTANT', # use CONSTANT cost.
  56. 'attr_distance': 'euclidean', # the distance between non-symbolic node/edge labels is computed by euclidean distance.
  57. 'ratio_runs_from_initial_solutions': 1,
  58. 'threads': multiprocessing.cpu_count(), # parallel threads. Do not work if mpg_options['parallel'] = False.
  59. 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
  60. }
  61. # Parameters for MedianGraphEstimator (Boria's method).
  62. mge_options = {'init_type': 'MEDOID', # how to initial median (compute set-median). "MEDOID" is to use the graph with smallest SOD.
  63. 'random_inits': 10, # number of random initialization when 'init_type' = 'RANDOM'.
  64. 'time_limit': 600, # maximum time limit to compute the generalized median. If set to 0 then no limit.
  65. 'verbose': 2, # whether to print out results.
  66. 'refine': False # whether to refine the final SODs or not.
  67. }
  68. print('done.')
  69. """**3. Compute the Gram matrix and distance matrix.**"""
  70. from gklearn.utils.utils import get_graph_kernel_by_name
  71. # Get a graph kernel instance.
  72. graph_kernel = get_graph_kernel_by_name(kernel_options['name'],
  73. node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
  74. node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs,
  75. ds_infos=dataset.get_dataset_infos(keys=['directed']),
  76. kernel_options=kernel_options)
  77. # Compute Gram matrix.
  78. gram_matrix, run_time = graph_kernel.compute(dataset.graphs, **kernel_options)
  79. # Compute distance matrix.
  80. from gklearn.utils import compute_distance_matrix
  81. dis_mat, _, _, _ = compute_distance_matrix(gram_matrix)
  82. print('done.')
  83. """**4. Find the candidate graph.**"""
  84. from gklearn.preimage.utils import compute_k_dis
  85. # Number of the nearest neighbors.
  86. k_neighbors = 10
  87. # For each graph G in dataset, compute the distance between its image \Phi(G) and the mean of its neighbors' images.
  88. dis_min = np.inf # the minimum distance between possible \Phi(G) and the mean of its neighbors.
  89. for idx, G in enumerate(dataset.graphs):
  90. # Find the k nearest neighbors of G.
  91. dis_list = dis_mat[idx] # distance between \Phi(G) and image of each graphs.
  92. idx_sort = np.argsort(dis_list) # sort distances and get the sorted indices.
  93. idx_nearest = idx_sort[1:k_neighbors+1] # indices of the k-nearest neighbors.
  94. dis_k_nearest = [dis_list[i] for i in idx_nearest] # k-nearest distances, except the 0.
  95. G_k_nearest = [dataset.graphs[i] for i in idx_nearest] # k-nearest neighbors.
  96. # Compute the distance between \Phi(G) and the mean of its neighbors.
  97. dis_tmp = compute_k_dis(idx, # the index of G in Gram matrix.
  98. idx_nearest, # the indices of the neighbors
  99. [1 / k_neighbors] * k_neighbors, # coefficients for neighbors.
  100. gram_matrix,
  101. withterm3=False)
  102. # Check if the new distance is smallers.
  103. if dis_tmp < dis_min:
  104. dis_min = dis_tmp
  105. G_cand = G
  106. G_neighbors = G_k_nearest
  107. print('The minimum distance is', dis_min)
  108. """**5. Run median preimage generator.**"""
  109. from gklearn.preimage import MedianPreimageGenerator
  110. # Set the dataset as the k-nearest neighbors.
  111. dataset.load_graphs(G_neighbors)
  112. # Create median preimage generator instance.
  113. mpg = MedianPreimageGenerator()
  114. # Add dataset.
  115. mpg.dataset = dataset
  116. # Set parameters.
  117. mpg.set_options(**mpg_options.copy())
  118. mpg.kernel_options = kernel_options.copy()
  119. mpg.ged_options = ged_options.copy()
  120. mpg.mge_options = mge_options.copy()
  121. # Run.
  122. mpg.run()
  123. """**4. Get results.**"""
  124. # Get results.
  125. import pprint
  126. pp = pprint.PrettyPrinter(indent=4) # pretty print
  127. results = mpg.get_results()
  128. pp.pprint(results)
  129. draw_graph(mpg.set_median)
  130. draw_graph(mpg.gen_median)
  131. draw_graph(G_cand)
  132. # Draw generated graphs.
  133. def draw_graph(graph):
  134. import matplotlib.pyplot as plt
  135. import networkx as nx
  136. plt.figure()
  137. pos = nx.spring_layout(graph)
  138. nx.draw(graph, pos, node_size=500, labels=nx.get_node_attributes(graph, 'atom_symbol'), font_color='w', width=3, with_labels=True)
  139. plt.show()
  140. plt.clf()
  141. plt.close()
# Script entry point: run the preimage experiment when executed directly.
if __name__ == '__main__':
    xp_simple_preimage()

A Python package for graph kernels, graph edit distances and graph pre-image problem.