
median_preimage_generator.py 35 kB

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 26 18:27:22 2020
@author: ljia
"""
import numpy as np
import time
import random
import multiprocessing
import networkx as nx
import cvxpy as cp
from gklearn.preimage import PreimageGenerator
from gklearn.preimage.utils import compute_k_dis
from gklearn.ged.util import compute_geds, ged_options_to_string
from gklearn.ged.median import MedianGraphEstimator
from gklearn.ged.median import constant_node_costs, mge_options_to_string
from gklearn.gedlib import librariesImport, gedlibpy
from gklearn.utils import Timer
# from gklearn.utils.dataset import Dataset


class MedianPreimageGenerator(PreimageGenerator):

    def __init__(self, dataset=None):
        PreimageGenerator.__init__(self, dataset=dataset)
        # arguments to set.
        self.__mge = None
        self.__ged_options = {}
        self.__mge_options = {}
        self.__fit_method = 'k-graphs'
        self.__init_ecc = None
        self.__parallel = True
        self.__n_jobs = multiprocessing.cpu_count()
        self.__ds_name = None
        self.__time_limit_in_sec = 0
        self.__max_itrs = 100
        self.__max_itrs_without_update = 3
        self.__epsilon_residual = 0.01
        self.__epsilon_ec = 0.1
        # values to compute.
        self.__runtime_optimize_ec = None
        self.__runtime_generate_preimage = None
        self.__runtime_total = None
        self.__set_median = None
        self.__gen_median = None
        self.__best_from_dataset = None
        self.__sod_set_median = None
        self.__sod_gen_median = None
        self.__k_dis_set_median = None
        self.__k_dis_gen_median = None
        self.__k_dis_dataset = None
        self.__itrs = 0
        self.__converged = False
        self.__num_updates_ecc = 0
        # values that can be set or computed.
        self.__edit_cost_constants = []
        self.__gram_matrix_unnorm = None
        self.__runtime_precompute_gm = None
    def set_options(self, **kwargs):
        self._kernel_options = kwargs.get('kernel_options', {})
        self._graph_kernel = kwargs.get('graph_kernel', None)
        self._verbose = kwargs.get('verbose', 2)
        self.__ged_options = kwargs.get('ged_options', {})
        self.__mge_options = kwargs.get('mge_options', {})
        self.__fit_method = kwargs.get('fit_method', 'k-graphs')
        self.__init_ecc = kwargs.get('init_ecc', None)
        self.__edit_cost_constants = kwargs.get('edit_cost_constants', [])
        self.__parallel = kwargs.get('parallel', True)
        self.__n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count())
        self.__ds_name = kwargs.get('ds_name', None)
        self.__time_limit_in_sec = kwargs.get('time_limit_in_sec', 0)
        self.__max_itrs = kwargs.get('max_itrs', 100)
        self.__max_itrs_without_update = kwargs.get('max_itrs_without_update', 3)
        self.__epsilon_residual = kwargs.get('epsilon_residual', 0.01)
        self.__epsilon_ec = kwargs.get('epsilon_ec', 0.1)
        self.__gram_matrix_unnorm = kwargs.get('gram_matrix_unnorm', None)
        self.__runtime_precompute_gm = kwargs.get('runtime_precompute_gm', None)
    def run(self):
        self.__set_graph_kernel_by_name()

        # record start time.
        start = time.time()

        # 1. precompute gram matrix.
        if self.__gram_matrix_unnorm is None:
            gram_matrix, run_time = self._graph_kernel.compute(self._dataset.graphs, **self._kernel_options)
            self.__gram_matrix_unnorm = self._graph_kernel.gram_matrix_unnorm
            end_precompute_gm = time.time()
            self.__runtime_precompute_gm = end_precompute_gm - start
        else:
            if self.__runtime_precompute_gm is None:
                raise Exception('Parameter "runtime_precompute_gm" must be given when using a pre-computed Gram matrix.')
            self._graph_kernel.gram_matrix_unnorm = self.__gram_matrix_unnorm
            if self._kernel_options['normalize']:
                self._graph_kernel.gram_matrix = self._graph_kernel.normalize_gm(np.copy(self.__gram_matrix_unnorm))
            else:
                self._graph_kernel.gram_matrix = np.copy(self.__gram_matrix_unnorm)
            end_precompute_gm = time.time()
            start -= self.__runtime_precompute_gm
        if self.__fit_method != 'k-graphs' and self.__fit_method != 'whole-dataset':
            start = time.time()
            self.__runtime_precompute_gm = 0
            end_precompute_gm = start

        # 2. optimize edit cost constants.
        self.__optimize_edit_cost_constants()
        end_optimize_ec = time.time()
        self.__runtime_optimize_ec = end_optimize_ec - end_precompute_gm

        # 3. compute set median and gen median using optimized edit costs.
        if self._verbose >= 2:
            print('\nstart computing set median and gen median using optimized edit costs...\n')
        # group_fnames = [Gn[g].graph['filename'] for g in group_min]
        self.__generate_preimage_iam()
        end_generate_preimage = time.time()
        self.__runtime_generate_preimage = end_generate_preimage - end_optimize_ec
        self.__runtime_total = end_generate_preimage - start
        if self._verbose >= 2:
            print('medians computed.')
            print('SOD of the set median: ', self.__sod_set_median)
            print('SOD of the generalized median: ', self.__sod_gen_median)

        # 4. compute kernel distances to the true median.
        if self._verbose >= 2:
            print('\nstart computing distances to true median....\n')
        # Gn_median = [Gn[g].copy() for g in group_min]
        self.__compute_distances_to_true_median()
        # dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min =
        # idx_dis_k_gi_min = group_min[idx_dis_k_gi_min]
        # print('index min dis_k_gi:', idx_dis_k_gi_min)
        # print('sod_sm:', sod_sm)
        # print('sod_gm:', sod_gm)

        # 5. print out results.
        if self._verbose:
            print()
            print('================================================================================')
            print('Finished generation of preimages.')
            print('--------------------------------------------------------------------------------')
            print('The optimized edit cost constants:', self.__edit_cost_constants)
            print('SOD of the set median:', self.__sod_set_median)
            print('SOD of the generalized median:', self.__sod_gen_median)
            print('Distance in kernel space for set median:', self.__k_dis_set_median)
            print('Distance in kernel space for generalized median:', self.__k_dis_gen_median)
            print('Minimum distance in kernel space for each graph in median set:', self.__k_dis_dataset)
            print('Time to pre-compute Gram matrix:', self.__runtime_precompute_gm)
            print('Time to optimize edit costs:', self.__runtime_optimize_ec)
            print('Time to generate pre-images:', self.__runtime_generate_preimage)
            print('Total time:', self.__runtime_total)
            print('Total number of iterations for optimizing:', self.__itrs)
            print('Total number of updates of edit costs:', self.__num_updates_ecc)
            print('Has the optimization of edit costs converged:', self.__converged)
            print('================================================================================')
            print()

        # collect return values.
        # return (sod_sm, sod_gm), \
        #        (dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min), \
        #        (time_fitting, time_generating)
    def get_results(self):
        results = {}
        results['edit_cost_constants'] = self.__edit_cost_constants
        results['runtime_precompute_gm'] = self.__runtime_precompute_gm
        results['runtime_optimize_ec'] = self.__runtime_optimize_ec
        results['runtime_generate_preimage'] = self.__runtime_generate_preimage
        results['runtime_total'] = self.__runtime_total
        results['sod_set_median'] = self.__sod_set_median
        results['sod_gen_median'] = self.__sod_gen_median
        results['k_dis_set_median'] = self.__k_dis_set_median
        results['k_dis_gen_median'] = self.__k_dis_gen_median
        results['k_dis_dataset'] = self.__k_dis_dataset
        results['itrs'] = self.__itrs
        results['converged'] = self.__converged
        results['num_updates_ecc'] = self.__num_updates_ecc
        return results
    def __optimize_edit_cost_constants(self):
        """Fit edit cost constants.
        """
        if self.__fit_method == 'random':  # random
            if self.__ged_options['edit_cost'] == 'LETTER':
                self.__edit_cost_constants = random.sample(range(1, 10), 3)
                self.__edit_cost_constants = [item * 0.1 for item in self.__edit_cost_constants]
            elif self.__ged_options['edit_cost'] == 'LETTER2':
                random.seed(time.time())
                self.__edit_cost_constants = random.sample(range(1, 10), 5)
                # self.__edit_cost_constants = [item * 0.1 for item in self.__edit_cost_constants]
            elif self.__ged_options['edit_cost'] == 'NON_SYMBOLIC':
                self.__edit_cost_constants = random.sample(range(1, 10), 6)
                if self._dataset.node_attrs == []:
                    self.__edit_cost_constants[2] = 0
                if self._dataset.edge_attrs == []:
                    self.__edit_cost_constants[5] = 0
            else:
                self.__edit_cost_constants = random.sample(range(1, 10), 6)
            if self._verbose >= 2:
                print('edit cost constants used:', self.__edit_cost_constants)
        elif self.__fit_method == 'expert':  # expert
            if self.__init_ecc is None:
                if self.__ged_options['edit_cost'] == 'LETTER':
                    self.__edit_cost_constants = [0.9, 1.7, 0.75]
                elif self.__ged_options['edit_cost'] == 'LETTER2':
                    self.__edit_cost_constants = [0.675, 0.675, 0.75, 0.425, 0.425]
                else:
                    self.__edit_cost_constants = [3, 3, 1, 3, 3, 1]
            else:
                self.__edit_cost_constants = self.__init_ecc
        elif self.__fit_method == 'k-graphs':
            if self.__init_ecc is None:
                if self.__ged_options['edit_cost'] == 'LETTER':
                    self.__init_ecc = [0.9, 1.7, 0.75]
                elif self.__ged_options['edit_cost'] == 'LETTER2':
                    self.__init_ecc = [0.675, 0.675, 0.75, 0.425, 0.425]
                elif self.__ged_options['edit_cost'] == 'NON_SYMBOLIC':
                    self.__init_ecc = [0, 0, 1, 1, 1, 0]
                    if self._dataset.node_attrs == []:
                        self.__init_ecc[2] = 0
                    if self._dataset.edge_attrs == []:
                        self.__init_ecc[5] = 0
                else:
                    self.__init_ecc = [3, 3, 1, 3, 3, 1]
            # optimize on the k-graph subset.
            self.__optimize_ecc_by_kernel_distances()
        elif self.__fit_method == 'whole-dataset':
            if self.__init_ecc is None:
                if self.__ged_options['edit_cost'] == 'LETTER':
                    self.__init_ecc = [0.9, 1.7, 0.75]
                elif self.__ged_options['edit_cost'] == 'LETTER2':
                    self.__init_ecc = [0.675, 0.675, 0.75, 0.425, 0.425]
                else:
                    self.__init_ecc = [3, 3, 1, 3, 3, 1]
            # optimize on the whole set.
            self.__optimize_ecc_by_kernel_distances()
        elif self.__fit_method == 'precomputed':
            pass
    def __optimize_ecc_by_kernel_distances(self):
        # compute distances in feature space.
        dis_k_mat, _, _, _ = self._graph_kernel.compute_distance_matrix()
        dis_k_vec = []
        for i in range(len(dis_k_mat)):
            # for j in range(i, len(dis_k_mat)):
            for j in range(i + 1, len(dis_k_mat)):
                dis_k_vec.append(dis_k_mat[i, j])
        dis_k_vec = np.array(dis_k_vec)

        # init ged.
        if self._verbose >= 2:
            print('\ninitial:')
        time0 = time.time()
        graphs = [self.__clean_graph(g) for g in self._dataset.graphs]
        self.__edit_cost_constants = self.__init_ecc
        options = self.__ged_options.copy()
        options['edit_cost_constants'] = self.__edit_cost_constants  # @todo
        options['node_labels'] = self._dataset.node_labels
        options['edge_labels'] = self._dataset.edge_labels
        options['node_attrs'] = self._dataset.node_attrs
        options['edge_attrs'] = self._dataset.edge_attrs
        ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, parallel=self.__parallel)
        residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))]
        time_list = [time.time() - time0]
        edit_cost_list = [self.__init_ecc]
        nb_cost_mat = np.array(n_edit_operations)
        nb_cost_mat_list = [nb_cost_mat]
        if self._verbose >= 2:
            print('Current edit cost constants:', self.__edit_cost_constants)
            print('Residual list:', residual_list)

        # run iteration from initial edit costs.
        self.__converged = False
        itrs_without_update = 0
        self.__itrs = 0
        self.__num_updates_ecc = 0
        timer = Timer(self.__time_limit_in_sec)
        while not self.__termination_criterion_met(self.__converged, timer, self.__itrs, itrs_without_update):
            if self._verbose >= 2:
                print('\niteration', self.__itrs + 1)
            time0 = time.time()
            # "fit" geds to distances in feature space by tuning edit costs using the Least Squares Method.
            # np.savez('results/xp_fit_method/fit_data_debug' + str(self.__itrs) + '.gm',
            #          nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec,
            #          n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init,
            #          ged_mat=ged_mat)
            self.__edit_cost_constants, _ = self.__update_ecc(nb_cost_mat, dis_k_vec)
            for i in range(len(self.__edit_cost_constants)):
                if -1e-9 <= self.__edit_cost_constants[i] <= 1e-9:
                    self.__edit_cost_constants[i] = 0
                if self.__edit_cost_constants[i] < 0:
                    raise ValueError('The edit cost is negative.')
            # for i in range(len(self.__edit_cost_constants)):
            #     if self.__edit_cost_constants[i] < 0:
            #         self.__edit_cost_constants[i] = 0

            # compute new GEDs and numbers of edit operations.
            options = self.__ged_options.copy()  # np.array([self.__edit_cost_constants[0], self.__edit_cost_constants[1], 0.75])
            options['edit_cost_constants'] = self.__edit_cost_constants  # @todo
            options['node_labels'] = self._dataset.node_labels
            options['edge_labels'] = self._dataset.edge_labels
            options['node_attrs'] = self._dataset.node_attrs
            options['edge_attrs'] = self._dataset.edge_attrs
            ged_vec, ged_mat, n_edit_operations = compute_geds(graphs, options=options, parallel=self.__parallel)
            residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec))))
            time_list.append(time.time() - time0)
            edit_cost_list.append(self.__edit_cost_constants)
            nb_cost_mat = np.array(n_edit_operations)
            nb_cost_mat_list.append(nb_cost_mat)

            # check convergence.
            ec_changed = False
            for i, cost in enumerate(self.__edit_cost_constants):
                if cost == 0:
                    if edit_cost_list[-2][i] > self.__epsilon_ec:
                        ec_changed = True
                        break
                elif abs(cost - edit_cost_list[-2][i]) / cost > self.__epsilon_ec:
                    ec_changed = True
                    break
                # if abs(cost - edit_cost_list[-2][i]) > self.__epsilon_ec:
                #     ec_changed = True
                #     break
            residual_changed = False
            if residual_list[-1] == 0:
                if residual_list[-2] > self.__epsilon_residual:
                    residual_changed = True
            elif abs(residual_list[-1] - residual_list[-2]) / residual_list[-1] > self.__epsilon_residual:
                residual_changed = True
            self.__converged = not (ec_changed or residual_changed)
            if self.__converged:
                itrs_without_update += 1
            else:
                itrs_without_update = 0
                self.__num_updates_ecc += 1

            # print current states.
            if self._verbose >= 2:
                print()
                print('-------------------------------------------------------------------------')
                print('States of iteration', self.__itrs + 1)
                print('-------------------------------------------------------------------------')
                # print('Time spent:', self.__runtime_optimize_ec)
                print('Total number of iterations for optimizing:', self.__itrs + 1)
                print('Total number of updates of edit costs:', self.__num_updates_ecc)
                print('Has the optimization of edit costs converged:', self.__converged)
                print('Did edit costs change:', ec_changed)
                print('Did residual change:', residual_changed)
                print('Iterations without update:', itrs_without_update)
                print('Current edit cost constants:', self.__edit_cost_constants)
                print('Residual list:', residual_list)
                print('-------------------------------------------------------------------------')

            self.__itrs += 1
    def __termination_criterion_met(self, converged, timer, itr, itrs_without_update):
        if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False):
            # if self.__state == AlgorithmState.TERMINATED:
            #     self.__state = AlgorithmState.INITIALIZED
            return True
        return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False)
    def __update_ecc(self, nb_cost_mat, dis_k_vec, rw_constraints='inequality'):
        # if self.__ds_name == 'Letter-high':
        if self.__ged_options['edit_cost'] == 'LETTER':
            pass
            # # method 1: set alpha automatically, just tune c_vir and c_eir by
            # # LMS using cvxpy.
            # alpha = 0.5
            # coeff = 100  # np.max(alpha * nb_cost_mat[:,4] / dis_k_vec)
            ## if np.count_nonzero(nb_cost_mat[:,4]) == 0:
            ##     alpha = 0.75
            ## else:
            ##     alpha = np.min([dis_k_vec / c_vs for c_vs in nb_cost_mat[:,4] if c_vs != 0])
            ##     alpha = alpha * 0.99
            # param_vir = alpha * (nb_cost_mat[:,0] + nb_cost_mat[:,1])
            # param_eir = (1 - alpha) * (nb_cost_mat[:,4] + nb_cost_mat[:,5])
            # nb_cost_mat_new = np.column_stack((param_vir, param_eir))
            # dis_new = coeff * dis_k_vec - alpha * nb_cost_mat[:,3]
            #
            # x = cp.Variable(nb_cost_mat_new.shape[1])
            # cost = cp.sum_squares(nb_cost_mat_new * x - dis_new)
            # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
            # prob = cp.Problem(cp.Minimize(cost), constraints)
            # prob.solve()
            # edit_costs_new = x.value
            # edit_costs_new = np.array([edit_costs_new[0], edit_costs_new[1], alpha])
            # residual = np.sqrt(prob.value)

            # # method 2: tune c_vir, c_eir and alpha by nonlinear programming by
            # # scipy.optimize.minimize.
            # w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1]
            # w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5]
            # w2 = nb_cost_mat[:,3]
            # w3 = dis_k_vec
            # func_min = lambda x: np.sum((w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \
            #                              + w2 * x[2] - w3 * x[3]) ** 2)
            # bounds = ((0, None), (0., None), (0.5, 0.5), (0, None))
            # res = minimize(func_min, [0.9, 1.7, 0.75, 10], bounds=bounds)
            # edit_costs_new = res.x[0:3]
            # residual = res.fun

            # method 3: tune c_vir, c_eir and alpha by nonlinear programming using cvxpy.

            # # method 4: tune c_vir, c_eir and alpha by QP function
            # # scipy.optimize.least_squares. An initial guess is required.
            # w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1]
            # w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5]
            # w2 = nb_cost_mat[:,3]
            # w3 = dis_k_vec
            # func = lambda x: (w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \
            #                   + w2 * x[2] - w3 * x[3]) ** 2
            # res = optimize.root(func, [0.9, 1.7, 0.75, 100])
            # edit_costs_new = res.x
            # residual = None
        elif self.__ged_options['edit_cost'] == 'LETTER2':
            # # 1. if c_vi != c_vr, c_ei != c_er.
            # nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
            # x = cp.Variable(nb_cost_mat_new.shape[1])
            # cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
            ## # 1.1 no constraints.
            ## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
            # # 1.2 c_vs <= c_vi + c_vr.
            # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
            #                np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
            ## # 2. if c_vi == c_vr, c_ei == c_er.
            ## nb_cost_mat_new = nb_cost_mat[:,[0,3,4]]
            ## nb_cost_mat_new[:,0] += nb_cost_mat[:,1]
            ## nb_cost_mat_new[:,2] += nb_cost_mat[:,5]
            ## x = cp.Variable(nb_cost_mat_new.shape[1])
            ## cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
            ## # 2.1 no constraints.
            ## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
            ### # 2.2 c_vs <= c_vi + c_vr.
            ### constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
            ###                np.array([2.0, -1.0, 0.0]).T@x >= 0.0]
            #
            # prob = cp.Problem(cp.Minimize(cost_fun), constraints)
            # prob.solve()
            # edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]]
            # edit_costs_new = np.array(edit_costs_new)
            # residual = np.sqrt(prob.value)
            if rw_constraints == 'inequality':
                # c_vs <= c_vi + c_vr.
                nb_cost_mat_new = nb_cost_mat[:, [0, 1, 3, 4, 5]]
                x = cp.Variable(nb_cost_mat_new.shape[1])
                cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
                constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
                               np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
                prob = cp.Problem(cp.Minimize(cost_fun), constraints)
                self.__execute_cvx(prob)
                edit_costs_new = x.value
                residual = np.sqrt(prob.value)
            elif rw_constraints == '2constraints':
                # c_vs <= c_vi + c_vr and c_vi == c_vr, c_ei == c_er.
                nb_cost_mat_new = nb_cost_mat[:, [0, 1, 3, 4, 5]]
                x = cp.Variable(nb_cost_mat_new.shape[1])
                cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
                constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
                               np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0,
                               np.array([1.0, -1.0, 0.0, 0.0, 0.0]).T@x == 0.0,
                               np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
                prob = cp.Problem(cp.Minimize(cost_fun), constraints)
                prob.solve()
                edit_costs_new = x.value
                residual = np.sqrt(prob.value)
            elif rw_constraints == 'no-constraint':
                # no constraint.
                nb_cost_mat_new = nb_cost_mat[:, [0, 1, 3, 4, 5]]
                x = cp.Variable(nb_cost_mat_new.shape[1])
                cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
                constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
                prob = cp.Problem(cp.Minimize(cost_fun), constraints)
                prob.solve()
                edit_costs_new = x.value
                residual = np.sqrt(prob.value)
            # elif method == 'inequality_modified':
            #     # c_vs <= c_vi + c_vr.
            #     nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
            #     x = cp.Variable(nb_cost_mat_new.shape[1])
            #     cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
            #     constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
            #                    np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
            #     prob = cp.Problem(cp.Minimize(cost_fun), constraints)
            #     prob.solve()
            #     # use the same costs for insertion and removal rather than the fitted costs.
            #     edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]]
            #     edit_costs_new = np.array(edit_costs_new)
            #     residual = np.sqrt(prob.value)
        elif self.__ged_options['edit_cost'] == 'NON_SYMBOLIC':
            is_n_attr = np.count_nonzero(nb_cost_mat[:, 2])
            is_e_attr = np.count_nonzero(nb_cost_mat[:, 5])
            if self.__ds_name == 'SYNTHETICnew':
                # nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]]
                nb_cost_mat_new = nb_cost_mat[:, [2, 3, 4]]
                x = cp.Variable(nb_cost_mat_new.shape[1])
                cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
                # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
                #                np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
                # constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])]]
                constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])],
                               np.array([0.0, 1.0, -1.0]).T@x == 0.0]
                prob = cp.Problem(cp.Minimize(cost_fun), constraints)
                prob.solve()
                # print(x.value)
                edit_costs_new = np.concatenate((np.array([0.0, 0.0]), x.value,
                                                 np.array([0.0])))
                residual = np.sqrt(prob.value)
            elif rw_constraints == 'inequality':
                # c_vs <= c_vi + c_vr.
                if is_n_attr and is_e_attr:
                    nb_cost_mat_new = nb_cost_mat[:, [0, 1, 2, 3, 4, 5]]
                    x = cp.Variable(nb_cost_mat_new.shape[1])
                    cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
                    constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
                                   np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
                                   np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
                    prob = cp.Problem(cp.Minimize(cost_fun), constraints)
                    self.__execute_cvx(prob)
                    edit_costs_new = x.value
                    residual = np.sqrt(prob.value)
                elif is_n_attr and not is_e_attr:
                    nb_cost_mat_new = nb_cost_mat[:, [0, 1, 2, 3, 4]]
                    x = cp.Variable(nb_cost_mat_new.shape[1])
                    cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
                    constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
                                   np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
                    prob = cp.Problem(cp.Minimize(cost_fun), constraints)
                    self.__execute_cvx(prob)
                    edit_costs_new = np.concatenate((x.value, np.array([0.0])))
                    residual = np.sqrt(prob.value)
                elif not is_n_attr and is_e_attr:
                    nb_cost_mat_new = nb_cost_mat[:, [0, 1, 3, 4, 5]]
                    x = cp.Variable(nb_cost_mat_new.shape[1])
                    cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
                    constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
                                   np.array([0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
                    prob = cp.Problem(cp.Minimize(cost_fun), constraints)
                    self.__execute_cvx(prob)
                    edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:]))
                    residual = np.sqrt(prob.value)
                else:
                    nb_cost_mat_new = nb_cost_mat[:, [0, 1, 3, 4]]
                    x = cp.Variable(nb_cost_mat_new.shape[1])
                    cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
                    constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
                    prob = cp.Problem(cp.Minimize(cost_fun), constraints)
                    self.__execute_cvx(prob)
                    edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]),
                                                     x.value[2:], np.array([0.0])))
                    residual = np.sqrt(prob.value)
        elif self.__ged_options['edit_cost'] == 'CONSTANT':  # @todo: node/edge may not be labeled.
            x = cp.Variable(nb_cost_mat.shape[1])
            cost_fun = cp.sum_squares(nb_cost_mat * x - dis_k_vec)
            constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])],
                           np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
                           np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
            prob = cp.Problem(cp.Minimize(cost_fun), constraints)
            self.__execute_cvx(prob)
            edit_costs_new = x.value
            residual = np.sqrt(prob.value)
        else:
            # # method 1: simple least square method.
            # edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
            #                                                  rcond=None)

            # # method 2: least square method with x_i >= 0.
            # edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec)

            # method 3: solve as a quadratic program with constraints.
            # P = np.dot(nb_cost_mat.T, nb_cost_mat)
            # q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat)
            # G = -1 * np.identity(nb_cost_mat.shape[1])
            # h = np.array([0 for i in range(nb_cost_mat.shape[1])])
            # A = np.array([1 for i in range(nb_cost_mat.shape[1])])
            # b = 1
            # x = cp.Variable(nb_cost_mat.shape[1])
            # prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q_T@x),
            #                   [G@x <= h])
            # prob.solve()
            # edit_costs_new = x.value
            # residual = prob.value - np.dot(dis_k_vec.T, dis_k_vec)

            # G = -1 * np.identity(nb_cost_mat.shape[1])
            # h = np.array([0 for i in range(nb_cost_mat.shape[1])])
            x = cp.Variable(nb_cost_mat.shape[1])
            cost_fun = cp.sum_squares(nb_cost_mat * x - dis_k_vec)
            constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])],
                           # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
                           np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
                           np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
            prob = cp.Problem(cp.Minimize(cost_fun), constraints)
            self.__execute_cvx(prob)
            edit_costs_new = x.value
            residual = np.sqrt(prob.value)
            # method 4:

        return edit_costs_new, residual
    def __execute_cvx(self, prob):
        try:
            prob.solve(verbose=(self._verbose >= 2))
        except MemoryError as error0:
            if self._verbose >= 2:
                print('\nUsing solver "OSQP" caused a memory error.')
                print('The original error message is\n', error0)
                print('solver status: ', prob.status)
                print('trying solver "CVXOPT" instead...\n')
            try:
                prob.solve(solver=cp.CVXOPT, verbose=(self._verbose >= 2))
            except Exception as error1:
                if self._verbose >= 2:
                    print('\nAn error occurred when using solver "CVXOPT".')
                    print('The original error message is\n', error1)
                    print('solver status: ', prob.status)
                    print('trying solver "MOSEK" instead. Note that this solver is commercial and a license is required.\n')
                prob.solve(solver=cp.MOSEK, verbose=(self._verbose >= 2))
            else:
                if self._verbose >= 2:
                    print('solver status: ', prob.status)
        else:
            if self._verbose >= 2:
                print('solver status: ', prob.status)
        if self._verbose >= 2:
            print()
    def __generate_preimage_iam(self):
        # Set up the GED environment.
        ged_env = gedlibpy.GEDEnv()  # @todo: maybe create a ged_env as a private variable.
        # gedlibpy.restart_env()
        ged_env.set_edit_cost(self.__ged_options['edit_cost'], edit_cost_constant=self.__edit_cost_constants)
        graphs = [self.__clean_graph(g) for g in self._dataset.graphs]
        for g in graphs:
            ged_env.add_nx_graph(g, '')
        graph_ids = ged_env.get_all_graph_ids()
        set_median_id = ged_env.add_graph('set_median')
        gen_median_id = ged_env.add_graph('gen_median')
        ged_env.init(init_option=self.__ged_options['init_option'])

        # Set up the median graph estimator.
        mge = MedianGraphEstimator(ged_env, constant_node_costs(self.__ged_options['edit_cost']))
        mge.set_refine_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options))
        options = self.__mge_options.copy()
        if 'seed' not in options:
            options['seed'] = int(round(time.time() * 1000))  # @todo: may not work correctly for possible parallel usage.

        # Select the GED algorithm.
        mge.set_options(mge_options_to_string(options))
        mge.set_label_names(node_labels=self._dataset.node_labels,
                            edge_labels=self._dataset.edge_labels,
                            node_attrs=self._dataset.node_attrs,
                            edge_attrs=self._dataset.edge_attrs)
        mge.set_init_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options))
        mge.set_descent_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options))

        # Run the estimator.
        mge.run(graph_ids, set_median_id, gen_median_id)

        # Get SODs.
        self.__sod_set_median = mge.get_sum_of_distances('initialized')
        self.__sod_gen_median = mge.get_sum_of_distances('converged')

        # Get median graphs.
        self.__set_median = ged_env.get_nx_graph(set_median_id)
        self.__gen_median = ged_env.get_nx_graph(gen_median_id)
    def __compute_distances_to_true_median(self):
        # compute distance in kernel space for set median.
        kernels_to_sm, _ = self._graph_kernel.compute(self.__set_median, self._dataset.graphs, **self._kernel_options)
        kernel_sm, _ = self._graph_kernel.compute(self.__set_median, self.__set_median, **self._kernel_options)
        kernels_to_sm = [kernels_to_sm[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_sm) for i in range(len(kernels_to_sm))]  # normalize
        # @todo: not correct kernel value
        gram_with_sm = np.concatenate((np.array([kernels_to_sm]), np.copy(self._graph_kernel.gram_matrix)), axis=0)
        gram_with_sm = np.concatenate((np.array([[1] + kernels_to_sm]).T, gram_with_sm), axis=1)
        self.__k_dis_set_median = compute_k_dis(0, range(1, 1 + len(self._dataset.graphs)),
                                                [1 / len(self._dataset.graphs)] * len(self._dataset.graphs),
                                                gram_with_sm, withterm3=False)
        # print(gen_median.nodes(data=True))
        # print(gen_median.edges(data=True))
        # print(set_median.nodes(data=True))
        # print(set_median.edges(data=True))

        # compute distance in kernel space for generalized median.
        kernels_to_gm, _ = self._graph_kernel.compute(self.__gen_median, self._dataset.graphs, **self._kernel_options)
        kernel_gm, _ = self._graph_kernel.compute(self.__gen_median, self.__gen_median, **self._kernel_options)
        kernels_to_gm = [kernels_to_gm[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_gm) for i in range(len(kernels_to_gm))]  # normalize
        gram_with_gm = np.concatenate((np.array([kernels_to_gm]), np.copy(self._graph_kernel.gram_matrix)), axis=0)
        gram_with_gm = np.concatenate((np.array([[1] + kernels_to_gm]).T, gram_with_gm), axis=1)
        self.__k_dis_gen_median = compute_k_dis(0, range(1, 1 + len(self._dataset.graphs)),
                                                [1 / len(self._dataset.graphs)] * len(self._dataset.graphs),
                                                gram_with_gm, withterm3=False)

        # compute distance in kernel space for each graph in median set.
        k_dis_median_set = []
        for idx in range(len(self._dataset.graphs)):
            k_dis_median_set.append(compute_k_dis(idx + 1, range(1, 1 + len(self._dataset.graphs)),
                                                  [1 / len(self._dataset.graphs)] * len(self._dataset.graphs),
                                                  gram_with_gm, withterm3=False))
        idx_k_dis_median_set_min = np.argmin(k_dis_median_set)
        self.__k_dis_dataset = k_dis_median_set[idx_k_dis_median_set_min]
        self.__best_from_dataset = self._dataset.graphs[idx_k_dis_median_set_min].copy()

        if self._verbose >= 2:
            print()
            print('distance in kernel space for set median:', self.__k_dis_set_median)
            print('distance in kernel space for generalized median:', self.__k_dis_gen_median)
            print('minimum distance in kernel space for each graph in median set:', self.__k_dis_dataset)
            print('distance in kernel space for each graph in median set:', k_dis_median_set)
    def __set_graph_kernel_by_name(self):
        if self._kernel_options['name'] == 'structuralspkernel':
            from gklearn.kernels import StructuralSP
            self._graph_kernel = StructuralSP(node_labels=self._dataset.node_labels,
                                              edge_labels=self._dataset.edge_labels,
                                              node_attrs=self._dataset.node_attrs,
                                              edge_attrs=self._dataset.edge_attrs,
                                              ds_infos=self._dataset.get_dataset_infos(keys=['directed']),
                                              **self._kernel_options)
    # def __clean_graph(self, G, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
    def __clean_graph(self, G):  # @todo: this may not be needed when datafile is updated.
        """
        Cleans node and edge labels and attributes of the given graph.
        """
        G_new = nx.Graph(**G.graph)
        for nd, attrs in G.nodes(data=True):
            G_new.add_node(str(nd))  # @todo: should we keep this as str()?
            for l_name in self._dataset.node_labels:
                G_new.nodes[str(nd)][l_name] = str(attrs[l_name])
            for a_name in self._dataset.node_attrs:
                G_new.nodes[str(nd)][a_name] = str(attrs[a_name])
        for nd1, nd2, attrs in G.edges(data=True):
            G_new.add_edge(str(nd1), str(nd2))
            for l_name in self._dataset.edge_labels:
                G_new.edges[str(nd1), str(nd2)][l_name] = str(attrs[l_name])
            for a_name in self._dataset.edge_attrs:
                G_new.edges[str(nd1), str(nd2)][a_name] = str(attrs[a_name])
        return G_new
    @property
    def mge(self):
        return self.__mge

    @property
    def ged_options(self):
        return self.__ged_options

    @ged_options.setter
    def ged_options(self, value):
        self.__ged_options = value

    @property
    def mge_options(self):
        return self.__mge_options

    @mge_options.setter
    def mge_options(self, value):
        self.__mge_options = value

    @property
    def fit_method(self):
        return self.__fit_method

    @fit_method.setter
    def fit_method(self, value):
        self.__fit_method = value

    @property
    def init_ecc(self):
        return self.__init_ecc

    @init_ecc.setter
    def init_ecc(self, value):
        self.__init_ecc = value

    @property
    def set_median(self):
        return self.__set_median

    @property
    def gen_median(self):
        return self.__gen_median

    @property
    def best_from_dataset(self):
        return self.__best_from_dataset

    @property
    def gram_matrix_unnorm(self):
        return self.__gram_matrix_unnorm

    @gram_matrix_unnorm.setter
    def gram_matrix_unnorm(self, value):
        self.__gram_matrix_unnorm = value

A Python package for graph kernels, graph edit distances, and the graph pre-image problem.
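
Below is a minimal usage sketch of MedianPreimageGenerator, not part of the repository file above. It assumes `dataset` is a gklearn dataset object of the kind PreimageGenerator expects (exposing graphs, node_labels, edge_labels, node_attrs and edge_attrs), that the class is importable from gklearn.preimage, and that the option values shown are illustrative placeholders rather than the exact settings used in the gklearn experiments.

# Hypothetical driver script; option values below are placeholders chosen for illustration.
import multiprocessing
from gklearn.preimage import MedianPreimageGenerator

# `dataset` is assumed to be loaded elsewhere (a gklearn Dataset-like object).
mpg = MedianPreimageGenerator(dataset=dataset)
mpg.set_options(
    fit_method='k-graphs',  # fit edit costs on the k-graph subset (see __optimize_edit_cost_constants)
    ged_options={'edit_cost': 'LETTER2',  # one of the edit-cost models handled in __update_ecc
                 'method': 'IPFP',
                 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'},
    mge_options={'init_type': 'MEDOID'},
    kernel_options={'name': 'structuralspkernel', 'normalize': True},
    parallel=True,
    n_jobs=multiprocessing.cpu_count(),
    max_itrs=100,
    verbose=2)
mpg.run()
results = mpg.get_results()
print('SOD of the generalized median:', results['sod_gen_median'])
print('Kernel distance of the generalized median:', results['k_dis_gen_median'])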