You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

median_preimage_generator_cml.py 21 kB


  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Tue Jun 16 16:04:46 2020
  5. @author: ljia
  6. """
  7. import numpy as np
  8. import time
  9. import random
  10. import multiprocessing
  11. import networkx as nx
  12. from gklearn.preimage import PreimageGenerator
  13. from gklearn.preimage.utils import compute_k_dis
  14. from gklearn.ged.env import GEDEnv
  15. from gklearn.ged.learning import CostMatricesLearner
  16. from gklearn.ged.median import MedianGraphEstimatorCML
  17. from gklearn.ged.median import constant_node_costs, mge_options_to_string
  18. from gklearn.utils.utils import get_graph_kernel_by_name
  19. from gklearn.ged.util import label_costs_to_matrix
  20. class MedianPreimageGeneratorCML(PreimageGenerator):
  21. """Generator median preimages by cost matrices learning using the pure Python version of GEDEnv. Works only for symbolic labeled graphs.
  22. """
  23. def __init__(self, dataset=None):
  24. PreimageGenerator.__init__(self, dataset=dataset)
  25. ### arguments to set.
  26. self._mge = None
  27. self._ged_options = {}
  28. self._mge_options = {}
  29. # self._fit_method = 'k-graphs'
  30. self._init_method = 'random'
  31. self._init_ecc = None
  32. self._parallel = True
  33. self._n_jobs = multiprocessing.cpu_count()
  34. self._ds_name = None
  35. # for cml.
  36. self._time_limit_in_sec = 0
  37. self._max_itrs = 100
  38. self._max_itrs_without_update = 3
  39. self._epsilon_residual = 0.01
  40. self._epsilon_ec = 0.1
  41. self._allow_zeros = True
  42. # self._triangle_rule = True
  43. ### values to compute.
  44. self._runtime_optimize_ec = None
  45. self._runtime_generate_preimage = None
  46. self._runtime_total = None
  47. self._set_median = None
  48. self._gen_median = None
  49. self._best_from_dataset = None
  50. self._sod_set_median = None
  51. self._sod_gen_median = None
  52. self._k_dis_set_median = None
  53. self._k_dis_gen_median = None
  54. self._k_dis_dataset = None
  55. self._node_label_costs = None
  56. self._edge_label_costs = None
  57. # for cml.
  58. self._itrs = 0
  59. self._converged = False
  60. self._num_updates_ecs = 0
  61. ### values that can be set or to be computed.
  62. self._edit_cost_constants = []
  63. self._gram_matrix_unnorm = None
  64. self._runtime_precompute_gm = None
  65. def set_options(self, **kwargs):
  66. self._kernel_options = kwargs.get('kernel_options', {})
  67. self._graph_kernel = kwargs.get('graph_kernel', None)
  68. self._verbose = kwargs.get('verbose', 2)
  69. self._ged_options = kwargs.get('ged_options', {})
  70. self._mge_options = kwargs.get('mge_options', {})
  71. # self._fit_method = kwargs.get('fit_method', 'k-graphs')
  72. self._init_method = kwargs.get('init_method', 'random')
  73. self._init_ecc = kwargs.get('init_ecc', None)
  74. self._edit_cost_constants = kwargs.get('edit_cost_constants', [])
  75. self._parallel = kwargs.get('parallel', True)
  76. self._n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count())
  77. self._ds_name = kwargs.get('ds_name', None)
  78. self._time_limit_in_sec = kwargs.get('time_limit_in_sec', 0)
  79. self._max_itrs = kwargs.get('max_itrs', 100)
  80. self._max_itrs_without_update = kwargs.get('max_itrs_without_update', 3)
  81. self._epsilon_residual = kwargs.get('epsilon_residual', 0.01)
  82. self._epsilon_ec = kwargs.get('epsilon_ec', 0.1)
  83. self._gram_matrix_unnorm = kwargs.get('gram_matrix_unnorm', None)
  84. self._runtime_precompute_gm = kwargs.get('runtime_precompute_gm', None)
  85. self._allow_zeros = kwargs.get('allow_zeros', True)
  86. # self._triangle_rule = kwargs.get('triangle_rule', True)
  87. def run(self):
  88. self._graph_kernel = get_graph_kernel_by_name(self._kernel_options['name'],
  89. node_labels=self._dataset.node_labels,
  90. edge_labels=self._dataset.edge_labels,
  91. node_attrs=self._dataset.node_attrs,
  92. edge_attrs=self._dataset.edge_attrs,
  93. ds_infos=self._dataset.get_dataset_infos(keys=['directed']),
  94. kernel_options=self._kernel_options)
  95. # record start time.
  96. start = time.time()
  97. # 1. precompute gram matrix.
  98. if self._gram_matrix_unnorm is None:
  99. gram_matrix, run_time = self._graph_kernel.compute(self._dataset.graphs, **self._kernel_options)
  100. self._gram_matrix_unnorm = self._graph_kernel.gram_matrix_unnorm
  101. end_precompute_gm = time.time()
  102. self._runtime_precompute_gm = end_precompute_gm - start
  103. else:
  104. if self._runtime_precompute_gm is None:
  105. raise Exception('Parameter "runtime_precompute_gm" must be given when using pre-computed Gram matrix.')
  106. self._graph_kernel.gram_matrix_unnorm = self._gram_matrix_unnorm
  107. if self._kernel_options['normalize']:
  108. self._graph_kernel.gram_matrix = self._graph_kernel.normalize_gm(np.copy(self._gram_matrix_unnorm))
  109. else:
  110. self._graph_kernel.gram_matrix = np.copy(self._gram_matrix_unnorm)
  111. end_precompute_gm = time.time()
  112. start -= self._runtime_precompute_gm
  113. # if self._fit_method != 'k-graphs' and self._fit_method != 'whole-dataset':
  114. # start = time.time()
  115. # self._runtime_precompute_gm = 0
  116. # end_precompute_gm = start
  117. # 2. optimize edit cost constants.
  118. self._optimize_edit_cost_vector()
  119. end_optimize_ec = time.time()
  120. self._runtime_optimize_ec = end_optimize_ec - end_precompute_gm
  121. # 3. compute set median and gen median using optimized edit costs.
  122. if self._verbose >= 2:
  123. print('\nstart computing set median and gen median using optimized edit costs...\n')
  124. self._gmg_bcu()
  125. end_generate_preimage = time.time()
  126. self._runtime_generate_preimage = end_generate_preimage - end_optimize_ec
  127. self._runtime_total = end_generate_preimage - start
  128. if self._verbose >= 2:
  129. print('medians computed.')
  130. print('SOD of the set median: ', self._sod_set_median)
  131. print('SOD of the generalized median: ', self._sod_gen_median)
  132. # 4. compute kernel distances to the true median.
  133. if self._verbose >= 2:
  134. print('\nstart computing distances to true median....\n')
  135. self._compute_distances_to_true_median()
  136. # 5. print out results.
  137. if self._verbose:
  138. print()
  139. print('================================================================================')
  140. print('Finished generation of preimages.')
  141. print('--------------------------------------------------------------------------------')
  142. print('The optimized edit costs:', self._edit_cost_constants)
  143. print('SOD of the set median:', self._sod_set_median)
  144. print('SOD of the generalized median:', self._sod_gen_median)
  145. print('Distance in kernel space for set median:', self._k_dis_set_median)
  146. print('Distance in kernel space for generalized median:', self._k_dis_gen_median)
  147. print('Minimum distance in kernel space for each graph in median set:', self._k_dis_dataset)
  148. print('Time to pre-compute Gram matrix:', self._runtime_precompute_gm)
  149. print('Time to optimize edit costs:', self._runtime_optimize_ec)
  150. print('Time to generate pre-images:', self._runtime_generate_preimage)
  151. print('Total time:', self._runtime_total)
  152. print('Total number of iterations for optimizing:', self._itrs)
  153. print('Total number of updating edit costs:', self._num_updates_ecs)
  154. print('Is optimization of edit costs converged:', self._converged)
  155. print('================================================================================')
  156. print()
  157. def get_results(self):
  158. results = {}
  159. results['edit_cost_constants'] = self._edit_cost_constants
  160. results['runtime_precompute_gm'] = self._runtime_precompute_gm
  161. results['runtime_optimize_ec'] = self._runtime_optimize_ec
  162. results['runtime_generate_preimage'] = self._runtime_generate_preimage
  163. results['runtime_total'] = self._runtime_total
  164. results['sod_set_median'] = self._sod_set_median
  165. results['sod_gen_median'] = self._sod_gen_median
  166. results['k_dis_set_median'] = self._k_dis_set_median
  167. results['k_dis_gen_median'] = self._k_dis_gen_median
  168. results['k_dis_dataset'] = self._k_dis_dataset
  169. results['itrs'] = self._itrs
  170. results['converged'] = self._converged
  171. results['num_updates_ecc'] = self._num_updates_ecs
  172. results['mge'] = {}
  173. results['mge']['num_decrease_order'] = self._mge.get_num_times_order_decreased()
  174. results['mge']['num_increase_order'] = self._mge.get_num_times_order_increased()
  175. results['mge']['num_converged_descents'] = self._mge.get_num_converged_descents()
  176. return results
  177. def _optimize_edit_cost_vector(self):
  178. """Learn edit cost vector.
  179. """
  180. # Initialize label costs randomly.
  181. if self._init_method == 'random':
  182. # Initialize label costs.
  183. self._initialize_label_costs()
  184. # Optimize edit cost matrices.
  185. self._optimize_ecm_by_kernel_distances()
  186. # Initialize all label costs with the same value.
  187. elif self._init_method == 'uniform': # random
  188. pass
  189. elif self._fit_method == 'random': # random
  190. if self._ged_options['edit_cost'] == 'LETTER':
  191. self._edit_cost_constants = random.sample(range(1, 1000), 3)
  192. self._edit_cost_constants = [item * 0.001 for item in self._edit_cost_constants]
  193. elif self._ged_options['edit_cost'] == 'LETTER2':
  194. random.seed(time.time())
  195. self._edit_cost_constants = random.sample(range(1, 1000), 5)
  196. self._edit_cost_constants = [item * 0.01 for item in self._edit_cost_constants]
  197. elif self._ged_options['edit_cost'] == 'NON_SYMBOLIC':
  198. self._edit_cost_constants = random.sample(range(1, 1000), 6)
  199. self._edit_cost_constants = [item * 0.01 for item in self._edit_cost_constants]
  200. if self._dataset.node_attrs == []:
  201. self._edit_cost_constants[2] = 0
  202. if self._dataset.edge_attrs == []:
  203. self._edit_cost_constants[5] = 0
  204. else:
  205. self._edit_cost_constants = random.sample(range(1, 1000), 6)
  206. self._edit_cost_constants = [item * 0.01 for item in self._edit_cost_constants]
  207. if self._verbose >= 2:
  208. print('edit cost constants used:', self._edit_cost_constants)
  209. elif self._fit_method == 'expert': # expert
  210. if self._init_ecc is None:
  211. if self._ged_options['edit_cost'] == 'LETTER':
  212. self._edit_cost_constants = [0.9, 1.7, 0.75]
  213. elif self._ged_options['edit_cost'] == 'LETTER2':
  214. self._edit_cost_constants = [0.675, 0.675, 0.75, 0.425, 0.425]
  215. else:
  216. self._edit_cost_constants = [3, 3, 1, 3, 3, 1]
  217. else:
  218. self._edit_cost_constants = self._init_ecc
  219. elif self._fit_method == 'k-graphs':
  220. if self._init_ecc is None:
  221. if self._ged_options['edit_cost'] == 'LETTER':
  222. self._init_ecc = [0.9, 1.7, 0.75]
  223. elif self._ged_options['edit_cost'] == 'LETTER2':
  224. self._init_ecc = [0.675, 0.675, 0.75, 0.425, 0.425]
  225. elif self._ged_options['edit_cost'] == 'NON_SYMBOLIC':
  226. self._init_ecc = [0, 0, 1, 1, 1, 0]
  227. if self._dataset.node_attrs == []:
  228. self._init_ecc[2] = 0
  229. if self._dataset.edge_attrs == []:
  230. self._init_ecc[5] = 0
  231. else:
  232. self._init_ecc = [3, 3, 1, 3, 3, 1]
  233. # optimize on the k-graph subset.
  234. self._optimize_ecm_by_kernel_distances()
  235. elif self._fit_method == 'whole-dataset':
  236. if self._init_ecc is None:
  237. if self._ged_options['edit_cost'] == 'LETTER':
  238. self._init_ecc = [0.9, 1.7, 0.75]
  239. elif self._ged_options['edit_cost'] == 'LETTER2':
  240. self._init_ecc = [0.675, 0.675, 0.75, 0.425, 0.425]
  241. else:
  242. self._init_ecc = [3, 3, 1, 3, 3, 1]
  243. # optimizeon the whole set.
  244. self._optimize_ecc_by_kernel_distances()
  245. elif self._fit_method == 'precomputed':
  246. pass
  247. def _initialize_label_costs(self):
  248. self._initialize_node_label_costs()
  249. self._initialize_edge_label_costs()
  250. def _initialize_node_label_costs(self):
  251. # Get list of node labels.
  252. nls = self._dataset.get_all_node_labels()
  253. # Generate random costs.
  254. nb_nl = int((len(nls) * (len(nls) - 1)) / 2 + 2 * len(nls))
  255. rand_costs = random.sample(range(1, 10 * nb_nl + 1), nb_nl)
  256. rand_costs /= np.max(rand_costs) # @todo: maybe not needed.
  257. self._node_label_costs = rand_costs
  258. def _initialize_edge_label_costs(self):
  259. # Get list of edge labels.
  260. els = self._dataset.get_all_edge_labels()
  261. # Generate random costs.
  262. nb_el = int((len(els) * (len(els) - 1)) / 2 + 2 * len(els))
  263. rand_costs = random.sample(range(1, 10 * nb_el + 1), nb_el)
  264. rand_costs /= np.max(rand_costs) # @todo: maybe not needed.
  265. self._edge_label_costs = rand_costs
  266. def _optimize_ecm_by_kernel_distances(self):
  267. # compute distances in feature space.
  268. dis_k_mat, _, _, _ = self._graph_kernel.compute_distance_matrix()
  269. dis_k_vec = []
  270. for i in range(len(dis_k_mat)):
  271. # for j in range(i, len(dis_k_mat)):
  272. for j in range(i + 1, len(dis_k_mat)):
  273. dis_k_vec.append(dis_k_mat[i, j])
  274. dis_k_vec = np.array(dis_k_vec)
  275. # Set GEDEnv options.
  276. # graphs = [self._clean_graph(g) for g in self._dataset.graphs]
  277. # self._edit_cost_constants = self._init_ecc
  278. options = self._ged_options.copy()
  279. options['edit_cost_constants'] = self._edit_cost_constants # @todo: not needed.
  280. options['node_labels'] = self._dataset.node_labels
  281. options['edge_labels'] = self._dataset.edge_labels
  282. # options['node_attrs'] = self._dataset.node_attrs
  283. # options['edge_attrs'] = self._dataset.edge_attrs
  284. options['node_label_costs'] = self._node_label_costs
  285. options['edge_label_costs'] = self._edge_label_costs
  286. # Learner cost matrices.
  287. # Initialize cost learner.
  288. cml = CostMatricesLearner(edit_cost='CONSTANT', triangle_rule=False, allow_zeros=True, parallel=self._parallel, verbose=self._verbose) # @todo
  289. cml.set_update_params(time_limit_in_sec=self._time_limit_in_sec, max_itrs=self._max_itrs, max_itrs_without_update=self._max_itrs_without_update, epsilon_residual=self._epsilon_residual, epsilon_ec=self._epsilon_ec)
  290. # Run cost learner.
  291. cml.update(dis_k_vec, self._dataset.graphs, options)
  292. # Get results.
  293. results = cml.get_results()
  294. self._converged = results['converged']
  295. self._itrs = results['itrs']
  296. self._num_updates_ecs = results['num_updates_ecs']
  297. cost_list = results['cost_list']
  298. self._node_label_costs = cost_list[-1][0:len(self._node_label_costs)]
  299. self._edge_label_costs = cost_list[-1][len(self._node_label_costs):]
  300. def _gmg_bcu(self):
  301. """
  302. The local search algorithm based on block coordinate update (BCU) for estimating a generalized median graph (GMG).
  303. Returns
  304. -------
  305. None.
  306. """
  307. # Set up the ged environment.
  308. ged_env = GEDEnv() # @todo: maybe create a ged_env as a private varible.
  309. # gedlibpy.restart_env()
  310. ged_env.set_edit_cost(self._ged_options['edit_cost'], edit_cost_constants=self._edit_cost_constants)
  311. graphs = [self._clean_graph(g) for g in self._dataset.graphs]
  312. for g in graphs:
  313. ged_env.add_nx_graph(g, '')
  314. graph_ids = ged_env.get_all_graph_ids()
  315. node_labels = ged_env.get_all_node_labels()
  316. edge_labels = ged_env.get_all_edge_labels()
  317. node_label_costs = label_costs_to_matrix(self._node_label_costs, len(node_labels))
  318. edge_label_costs = label_costs_to_matrix(self._edge_label_costs, len(edge_labels))
  319. ged_env.set_label_costs(node_label_costs, edge_label_costs)
  320. set_median_id = ged_env.add_graph('set_median')
  321. gen_median_id = ged_env.add_graph('gen_median')
  322. ged_env.init(init_type=self._ged_options['init_option'])
  323. # Set up the madian graph estimator.
  324. self._mge = MedianGraphEstimatorCML(ged_env, constant_node_costs(self._ged_options['edit_cost']))
  325. self._mge.set_refine_method(self._ged_options['method'], self._ged_options)
  326. options = self._mge_options.copy()
  327. if not 'seed' in options:
  328. options['seed'] = int(round(time.time() * 1000)) # @todo: may not work correctly for possible parallel usage.
  329. options['parallel'] = self._parallel
  330. # Select the GED algorithm.
  331. self._mge.set_options(mge_options_to_string(options))
  332. self._mge.set_label_names(node_labels=self._dataset.node_labels,
  333. edge_labels=self._dataset.edge_labels,
  334. node_attrs=self._dataset.node_attrs,
  335. edge_attrs=self._dataset.edge_attrs)
  336. ged_options = self._ged_options.copy()
  337. if self._parallel:
  338. ged_options['threads'] = 1
  339. self._mge.set_init_method(ged_options['method'], ged_options)
  340. self._mge.set_descent_method(ged_options['method'], ged_options)
  341. # Run the estimator.
  342. self._mge.run(graph_ids, set_median_id, gen_median_id)
  343. # Get SODs.
  344. self._sod_set_median = self._mge.get_sum_of_distances('initialized')
  345. self._sod_gen_median = self._mge.get_sum_of_distances('converged')
  346. # Get median graphs.
  347. self._set_median = ged_env.get_nx_graph(set_median_id)
  348. self._gen_median = ged_env.get_nx_graph(gen_median_id)
  349. def _compute_distances_to_true_median(self):
  350. # compute distance in kernel space for set median.
  351. kernels_to_sm, _ = self._graph_kernel.compute(self._set_median, self._dataset.graphs, **self._kernel_options)
  352. kernel_sm, _ = self._graph_kernel.compute(self._set_median, self._set_median, **self._kernel_options)
  353. if self._kernel_options['normalize']:
  354. kernels_to_sm = [kernels_to_sm[i] / np.sqrt(self._gram_matrix_unnorm[i, i] * kernel_sm) for i in range(len(kernels_to_sm))] # normalize
  355. kernel_sm = 1
  356. # @todo: not correct kernel value
  357. gram_with_sm = np.concatenate((np.array([kernels_to_sm]), np.copy(self._graph_kernel.gram_matrix)), axis=0)
  358. gram_with_sm = np.concatenate((np.array([[kernel_sm] + kernels_to_sm]).T, gram_with_sm), axis=1)
  359. self._k_dis_set_median = compute_k_dis(0, range(1, 1+len(self._dataset.graphs)),
  360. [1 / len(self._dataset.graphs)] * len(self._dataset.graphs),
  361. gram_with_sm, withterm3=False)
  362. # compute distance in kernel space for generalized median.
  363. kernels_to_gm, _ = self._graph_kernel.compute(self._gen_median, self._dataset.graphs, **self._kernel_options)
  364. kernel_gm, _ = self._graph_kernel.compute(self._gen_median, self._gen_median, **self._kernel_options)
  365. if self._kernel_options['normalize']:
  366. kernels_to_gm = [kernels_to_gm[i] / np.sqrt(self._gram_matrix_unnorm[i, i] * kernel_gm) for i in range(len(kernels_to_gm))] # normalize
  367. kernel_gm = 1
  368. gram_with_gm = np.concatenate((np.array([kernels_to_gm]), np.copy(self._graph_kernel.gram_matrix)), axis=0)
  369. gram_with_gm = np.concatenate((np.array([[kernel_gm] + kernels_to_gm]).T, gram_with_gm), axis=1)
  370. self._k_dis_gen_median = compute_k_dis(0, range(1, 1+len(self._dataset.graphs)),
  371. [1 / len(self._dataset.graphs)] * len(self._dataset.graphs),
  372. gram_with_gm, withterm3=False)
  373. # compute distance in kernel space for each graph in median set.
  374. k_dis_median_set = []
  375. for idx in range(len(self._dataset.graphs)):
  376. k_dis_median_set.append(compute_k_dis(idx+1, range(1, 1+len(self._dataset.graphs)),
  377. [1 / len(self._dataset.graphs)] * len(self._dataset.graphs),
  378. gram_with_gm, withterm3=False))
  379. idx_k_dis_median_set_min = np.argmin(k_dis_median_set)
  380. self._k_dis_dataset = k_dis_median_set[idx_k_dis_median_set_min]
  381. self._best_from_dataset = self._dataset.graphs[idx_k_dis_median_set_min].copy()
  382. if self._verbose >= 2:
  383. print()
  384. print('distance in kernel space for set median:', self._k_dis_set_median)
  385. print('distance in kernel space for generalized median:', self._k_dis_gen_median)
  386. print('minimum distance in kernel space for each graph in median set:', self._k_dis_dataset)
  387. print('distance in kernel space for each graph in median set:', k_dis_median_set)
  388. # def _clean_graph(self, G, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
  389. def _clean_graph(self, G): # @todo: this may not be needed when datafile is updated.
  390. """
  391. Cleans node and edge labels and attributes of the given graph.
  392. """
  393. G_new = nx.Graph(**G.graph)
  394. for nd, attrs in G.nodes(data=True):
  395. G_new.add_node(str(nd)) # @todo: should we keep this as str()?
  396. for l_name in self._dataset.node_labels:
  397. G_new.nodes[str(nd)][l_name] = str(attrs[l_name])
  398. for a_name in self._dataset.node_attrs:
  399. G_new.nodes[str(nd)][a_name] = str(attrs[a_name])
  400. for nd1, nd2, attrs in G.edges(data=True):
  401. G_new.add_edge(str(nd1), str(nd2))
  402. for l_name in self._dataset.edge_labels:
  403. G_new.edges[str(nd1), str(nd2)][l_name] = str(attrs[l_name])
  404. for a_name in self._dataset.edge_attrs:
  405. G_new.edges[str(nd1), str(nd2)][a_name] = str(attrs[a_name])
  406. return G_new
  407. @property
  408. def mge(self):
  409. return self._mge
  410. @property
  411. def ged_options(self):
  412. return self._ged_options
  413. @ged_options.setter
  414. def ged_options(self, value):
  415. self._ged_options = value
  416. @property
  417. def mge_options(self):
  418. return self._mge_options
  419. @mge_options.setter
  420. def mge_options(self, value):
  421. self._mge_options = value
  422. @property
  423. def fit_method(self):
  424. return self._fit_method
  425. @fit_method.setter
  426. def fit_method(self, value):
  427. self._fit_method = value
  428. @property
  429. def init_ecc(self):
  430. return self._init_ecc
  431. @init_ecc.setter
  432. def init_ecc(self, value):
  433. self._init_ecc = value
  434. @property
  435. def set_median(self):
  436. return self._set_median
  437. @property
  438. def gen_median(self):
  439. return self._gen_median
  440. @property
  441. def best_from_dataset(self):
  442. return self._best_from_dataset
  443. @property
  444. def gram_matrix_unnorm(self):
  445. return self._gram_matrix_unnorm
  446. @gram_matrix_unnorm.setter
  447. def gram_matrix_unnorm(self, value):
  448. self._gram_matrix_unnorm = value

A Python package for graph kernels, graph edit distances and graph pre-image problem.