You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

median_preimage_generator_cml.py 50 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Tue Jun 16 16:04:46 2020
  5. @author: ljia
  6. """
  7. import numpy as np
  8. import time
  9. import random
  10. import multiprocessing
  11. import networkx as nx
  12. import cvxpy as cp
  13. import itertools
  14. from gklearn.preimage import PreimageGenerator
  15. from gklearn.preimage.utils import compute_k_dis
  16. from gklearn.ged.util import compute_geds_cml
  17. from gklearn.ged.env import GEDEnv
  18. from gklearn.ged.median import MedianGraphEstimatorPy
  19. from gklearn.ged.median import constant_node_costs, mge_options_to_string
  20. from gklearn.utils import Timer, SpecialLabel
  21. from gklearn.utils.utils import get_graph_kernel_by_name
  22. class MedianPreimageGeneratorCML(PreimageGenerator):
  23. """Generator median preimages by cost matrices learning using the pure Python version of GEDEnv. Works only for symbolic labeled graphs.
  24. """
    def __init__(self, dataset=None):
        """Create a median preimage generator with default settings.

        Parameters
        ----------
        dataset : optional
            Dataset forwarded to ``PreimageGenerator.__init__``; its graphs
            and label/attribute lists are read throughout this class.
        """
        PreimageGenerator.__init__(self, dataset=dataset)
        # arguments to set.
        self.__mge = None  # median graph estimator; read by get_results().
        self.__ged_options = {}  # options forwarded to GED computation.
        self.__mge_options = {}  # options forwarded to the median graph estimator.
        # self.__fit_method = 'k-graphs'
        self.__init_method = 'random'  # how edit costs are initialized (see __optimize_edit_cost_vector).
        self.__init_ecc = None  # user-supplied initial edit cost constants, if any.
        self.__parallel = True  # whether GEDs are computed in parallel.
        self.__n_jobs = multiprocessing.cpu_count()  # worker count for parallel GED computation.
        self.__ds_name = None  # dataset name; used for dataset-specific branches.
        self.__time_limit_in_sec = 0  # passed to Timer; 0 presumably means no limit — TODO confirm.
        self.__max_itrs = 100  # iteration cap for cost optimization (negative disables the cap).
        self.__max_itrs_without_update = 3  # stop after this many stagnant iterations (negative disables).
        self.__epsilon_residual = 0.01  # relative tolerance on residual change for convergence.
        self.__epsilon_ec = 0.1  # relative tolerance on edit cost change for convergence.
        self.__allow_zeros = True  # whether zero edit costs are allowed when fitting.
        # self.__triangle_rule = True
        # values to compute.
        self.__runtime_optimize_ec = None  # wall time spent optimizing edit costs.
        self.__runtime_generate_preimage = None  # wall time spent computing medians.
        self.__runtime_total = None  # total wall time of run().
        self.__set_median = None  # computed set median graph.
        self.__gen_median = None  # computed generalized median graph.
        self.__best_from_dataset = None  # dataset graph closest to the true median.
        self.__sod_set_median = None  # sum of distances of the set median.
        self.__sod_gen_median = None  # sum of distances of the generalized median.
        self.__k_dis_set_median = None  # kernel-space distance of the set median.
        self.__k_dis_gen_median = None  # kernel-space distance of the generalized median.
        self.__k_dis_dataset = None  # minimum kernel-space distance among dataset graphs.
        self.__itrs = 0  # iterations performed by the edit cost optimization.
        self.__converged = False  # whether the edit cost optimization converged.
        self.__num_updates_ecc = 0  # number of accepted edit cost updates.
        self.__node_label_costs = None  # random node label cost matrix (dummy label at index 0).
        self.__edge_label_costs = None  # random edge label cost matrix (dummy label at index 0).
        # values that can be set or to be computed.
        self.__edit_cost_constants = []  # fitted (or user-given) edit cost constants.
        self.__gram_matrix_unnorm = None  # unnormalized Gram matrix (may be supplied precomputed).
        self.__runtime_precompute_gm = None  # wall time used to precompute the Gram matrix.
  65. def set_options(self, **kwargs):
  66. self._kernel_options = kwargs.get('kernel_options', {})
  67. self._graph_kernel = kwargs.get('graph_kernel', None)
  68. self._verbose = kwargs.get('verbose', 2)
  69. self.__ged_options = kwargs.get('ged_options', {})
  70. self.__mge_options = kwargs.get('mge_options', {})
  71. # self.__fit_method = kwargs.get('fit_method', 'k-graphs')
  72. self.__init_method = kwargs.get('init_method', 'random')
  73. self.__init_ecc = kwargs.get('init_ecc', None)
  74. self.__edit_cost_constants = kwargs.get('edit_cost_constants', [])
  75. self.__parallel = kwargs.get('parallel', True)
  76. self.__n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count())
  77. self.__ds_name = kwargs.get('ds_name', None)
  78. self.__time_limit_in_sec = kwargs.get('time_limit_in_sec', 0)
  79. self.__max_itrs = kwargs.get('max_itrs', 100)
  80. self.__max_itrs_without_update = kwargs.get('max_itrs_without_update', 3)
  81. self.__epsilon_residual = kwargs.get('epsilon_residual', 0.01)
  82. self.__epsilon_ec = kwargs.get('epsilon_ec', 0.1)
  83. self.__gram_matrix_unnorm = kwargs.get('gram_matrix_unnorm', None)
  84. self.__runtime_precompute_gm = kwargs.get('runtime_precompute_gm', None)
  85. self.__allow_zeros = kwargs.get('allow_zeros', True)
  86. # self.__triangle_rule = kwargs.get('triangle_rule', True)
    def run(self):
        """Run the full preimage-generation pipeline.

        Steps: (1) compute or reuse the Gram matrix, (2) optimize the edit
        cost vector, (3) compute the set median and generalized median with
        the optimized costs, (4) compute kernel distances to the true
        median, (5) print a summary. Timings are recorded along the way.
        """
        self._graph_kernel = get_graph_kernel_by_name(self._kernel_options['name'],
                                                      node_labels=self._dataset.node_labels,
                                                      edge_labels=self._dataset.edge_labels,
                                                      node_attrs=self._dataset.node_attrs,
                                                      edge_attrs=self._dataset.edge_attrs,
                                                      ds_infos=self._dataset.get_dataset_infos(keys=['directed']),
                                                      kernel_options=self._kernel_options)
        # record start time.
        start = time.time()
        # 1. precompute gram matrix.
        if self.__gram_matrix_unnorm is None:
            gram_matrix, run_time = self._graph_kernel.compute(self._dataset.graphs, **self._kernel_options)
            self.__gram_matrix_unnorm = self._graph_kernel.gram_matrix_unnorm
            end_precompute_gm = time.time()
            self.__runtime_precompute_gm = end_precompute_gm - start
        else:
            # A precomputed Gram matrix was supplied; its computation time
            # must be supplied too so the total runtime stays meaningful.
            if self.__runtime_precompute_gm is None:
                raise Exception('Parameter "runtime_precompute_gm" must be given when using pre-computed Gram matrix.')
            self._graph_kernel.gram_matrix_unnorm = self.__gram_matrix_unnorm
            if self._kernel_options['normalize']:
                self._graph_kernel.gram_matrix = self._graph_kernel.normalize_gm(np.copy(self.__gram_matrix_unnorm))
            else:
                self._graph_kernel.gram_matrix = np.copy(self.__gram_matrix_unnorm)
            end_precompute_gm = time.time()
            # Shift the start time back so the reported total includes the
            # (externally measured) Gram matrix computation time.
            start -= self.__runtime_precompute_gm
        # if self.__fit_method != 'k-graphs' and self.__fit_method != 'whole-dataset':
        # 	start = time.time()
        # 	self.__runtime_precompute_gm = 0
        # 	end_precompute_gm = start
        # 2. optimize edit cost constants.
        self.__optimize_edit_cost_vector()
        end_optimize_ec = time.time()
        self.__runtime_optimize_ec = end_optimize_ec - end_precompute_gm
        # 3. compute set median and gen median using optimized edit costs.
        if self._verbose >= 2:
            print('\nstart computing set median and gen median using optimized edit costs...\n')
        self.__gmg_bcu()
        end_generate_preimage = time.time()
        self.__runtime_generate_preimage = end_generate_preimage - end_optimize_ec
        self.__runtime_total = end_generate_preimage - start
        if self._verbose >= 2:
            print('medians computed.')
            print('SOD of the set median: ', self.__sod_set_median)
            print('SOD of the generalized median: ', self.__sod_gen_median)
        # 4. compute kernel distances to the true median.
        if self._verbose >= 2:
            print('\nstart computing distances to true median....\n')
        self.__compute_distances_to_true_median()
        # 5. print out results.
        if self._verbose:
            print()
            print('================================================================================')
            print('Finished generation of preimages.')
            print('--------------------------------------------------------------------------------')
            print('The optimized edit cost constants:', self.__edit_cost_constants)
            print('SOD of the set median:', self.__sod_set_median)
            print('SOD of the generalized median:', self.__sod_gen_median)
            print('Distance in kernel space for set median:', self.__k_dis_set_median)
            print('Distance in kernel space for generalized median:', self.__k_dis_gen_median)
            print('Minimum distance in kernel space for each graph in median set:', self.__k_dis_dataset)
            print('Time to pre-compute Gram matrix:', self.__runtime_precompute_gm)
            print('Time to optimize edit costs:', self.__runtime_optimize_ec)
            print('Time to generate pre-images:', self.__runtime_generate_preimage)
            print('Total time:', self.__runtime_total)
            print('Total number of iterations for optimizing:', self.__itrs)
            print('Total number of updating edit costs:', self.__num_updates_ecc)
            print('Is optimization of edit costs converged:', self.__converged)
            print('================================================================================')
            print()
  157. def get_results(self):
  158. results = {}
  159. results['edit_cost_constants'] = self.__edit_cost_constants
  160. results['runtime_precompute_gm'] = self.__runtime_precompute_gm
  161. results['runtime_optimize_ec'] = self.__runtime_optimize_ec
  162. results['runtime_generate_preimage'] = self.__runtime_generate_preimage
  163. results['runtime_total'] = self.__runtime_total
  164. results['sod_set_median'] = self.__sod_set_median
  165. results['sod_gen_median'] = self.__sod_gen_median
  166. results['k_dis_set_median'] = self.__k_dis_set_median
  167. results['k_dis_gen_median'] = self.__k_dis_gen_median
  168. results['k_dis_dataset'] = self.__k_dis_dataset
  169. results['itrs'] = self.__itrs
  170. results['converged'] = self.__converged
  171. results['num_updates_ecc'] = self.__num_updates_ecc
  172. results['mge'] = {}
  173. results['mge']['num_decrease_order'] = self.__mge.get_num_times_order_decreased()
  174. results['mge']['num_increase_order'] = self.__mge.get_num_times_order_increased()
  175. results['mge']['num_converged_descents'] = self.__mge.get_num_converged_descents()
  176. return results
  177. def __optimize_edit_cost_vector(self):
  178. """Learn edit cost vector.
  179. """
  180. # Initialize label costs randomly.
  181. if self.__init_method == 'random':
  182. # Initialize label costs.
  183. self.__initialize_label_costs()
  184. # Optimize edit cost matrices.
  185. self.__optimize_ecm_by_kernel_distances()
  186. # Initialize all label costs with the same value.
  187. elif self.__init_method == 'uniform': # random
  188. pass
  189. elif self.__fit_method == 'random': # random
  190. if self.__ged_options['edit_cost'] == 'LETTER':
  191. self.__edit_cost_constants = random.sample(range(1, 1000), 3)
  192. self.__edit_cost_constants = [item * 0.001 for item in self.__edit_cost_constants]
  193. elif self.__ged_options['edit_cost'] == 'LETTER2':
  194. random.seed(time.time())
  195. self.__edit_cost_constants = random.sample(range(1, 1000), 5)
  196. self.__edit_cost_constants = [item * 0.01 for item in self.__edit_cost_constants]
  197. elif self.__ged_options['edit_cost'] == 'NON_SYMBOLIC':
  198. self.__edit_cost_constants = random.sample(range(1, 1000), 6)
  199. self.__edit_cost_constants = [item * 0.01 for item in self.__edit_cost_constants]
  200. if self._dataset.node_attrs == []:
  201. self.__edit_cost_constants[2] = 0
  202. if self._dataset.edge_attrs == []:
  203. self.__edit_cost_constants[5] = 0
  204. else:
  205. self.__edit_cost_constants = random.sample(range(1, 1000), 6)
  206. self.__edit_cost_constants = [item * 0.01 for item in self.__edit_cost_constants]
  207. if self._verbose >= 2:
  208. print('edit cost constants used:', self.__edit_cost_constants)
  209. elif self.__fit_method == 'expert': # expert
  210. if self.__init_ecc is None:
  211. if self.__ged_options['edit_cost'] == 'LETTER':
  212. self.__edit_cost_constants = [0.9, 1.7, 0.75]
  213. elif self.__ged_options['edit_cost'] == 'LETTER2':
  214. self.__edit_cost_constants = [0.675, 0.675, 0.75, 0.425, 0.425]
  215. else:
  216. self.__edit_cost_constants = [3, 3, 1, 3, 3, 1]
  217. else:
  218. self.__edit_cost_constants = self.__init_ecc
  219. elif self.__fit_method == 'k-graphs':
  220. if self.__init_ecc is None:
  221. if self.__ged_options['edit_cost'] == 'LETTER':
  222. self.__init_ecc = [0.9, 1.7, 0.75]
  223. elif self.__ged_options['edit_cost'] == 'LETTER2':
  224. self.__init_ecc = [0.675, 0.675, 0.75, 0.425, 0.425]
  225. elif self.__ged_options['edit_cost'] == 'NON_SYMBOLIC':
  226. self.__init_ecc = [0, 0, 1, 1, 1, 0]
  227. if self._dataset.node_attrs == []:
  228. self.__init_ecc[2] = 0
  229. if self._dataset.edge_attrs == []:
  230. self.__init_ecc[5] = 0
  231. else:
  232. self.__init_ecc = [3, 3, 1, 3, 3, 1]
  233. # optimize on the k-graph subset.
  234. self.__optimize_ecm_by_kernel_distances()
  235. elif self.__fit_method == 'whole-dataset':
  236. if self.__init_ecc is None:
  237. if self.__ged_options['edit_cost'] == 'LETTER':
  238. self.__init_ecc = [0.9, 1.7, 0.75]
  239. elif self.__ged_options['edit_cost'] == 'LETTER2':
  240. self.__init_ecc = [0.675, 0.675, 0.75, 0.425, 0.425]
  241. else:
  242. self.__init_ecc = [3, 3, 1, 3, 3, 1]
  243. # optimizeon the whole set.
  244. self.__optimize_ecc_by_kernel_distances()
  245. elif self.__fit_method == 'precomputed':
  246. pass
    def __initialize_label_costs(self):
        """Randomly initialize the node and edge label cost matrices.

        Node costs are drawn first, then edge costs; both consume the
        shared ``random`` module state in that order.
        """
        self.__initialize_node_label_costs()
        self.__initialize_edge_label_costs()
  250. def __initialize_node_label_costs(self):
  251. # Get list of node labels.
  252. nls = self._dataset.get_all_node_labels()
  253. # Generate random costs.
  254. nb_nl = int((len(nls) * (len(nls) - 1)) / 2 + 2 * len(nls))
  255. rand_costs = random.sample(range(1, 10 * nb_nl + 1), nb_nl)
  256. rand_costs /= np.max(rand_costs) # @todo: maybe not needed.
  257. self.__node_label_costs = np.zeros((len(nls) + 1, len(nls) + 1))
  258. # Initialize node label cost matrix, each row/column corresponds to a label, the first label is the dummy label. This is the same setting as in GEDData.
  259. i = 0
  260. # Costs of insertions.
  261. for row in range(1, len(nls) + 1):
  262. self.__node_label_costs[row, 0] = rand_costs[i]
  263. i += 1
  264. # Costs of deletions.
  265. for col in range(1, len(nls) + 1):
  266. self.__node_label_costs[0, col] = rand_costs[i]
  267. i += 1
  268. # Costs of substitutions.
  269. for row in range(1, len(nls) + 1):
  270. for col in range(row + 1, len(nls) + 1):
  271. self.__node_label_costs[row, col] = rand_costs[i]
  272. self.__node_label_costs[col, row] = rand_costs[i]
  273. i += 1
  274. # self.__node_label_costs = {}
  275. # for i, (nl1, nl2) in enumerate(itertools.combinations(nls, 2)):
  276. # self.__node_label_costs[(nl1, nl2)] = rand_costs[i]
  277. # # Add costs for deletion.
  278. # for j, nl in enumerate(nls):
  279. # self.__node_label_costs[(nl1, SpecialLabel.DUMMY)] = rand_costs[i + j]
  280. # # Add costs for insertion.
  281. # for k, nl in enumerate(nls):
  282. # self.__node_label_costs[(SpecialLabel.DUMMY, nl1)] = rand_costs[i + j + k]
  283. # # Add self costs.
  284. # for nl in nls:
  285. # self.__node_label_costs[(nl, nl)] = 0
  286. # self.__node_label_costs[(SpecialLabel.DUMMY, SpecialLabel.DUMMY)] = 0
  287. def __initialize_edge_label_costs(self):
  288. # Get list of edge labels.
  289. els = self._dataset.get_all_edge_labels()
  290. # Generate random costs.
  291. nb_el = int((len(els) * (len(els) - 1)) / 2 + 2 * len(els))
  292. rand_costs = random.sample(range(1, 10 * nb_el + 1), nb_el)
  293. rand_costs /= np.max(rand_costs) # @todo: maybe not needed.
  294. self.__edge_label_costs = np.zeros((len(els) + 1, len(els) + 1))
  295. # Initialize edge label cost matrix, each row/column corresponds to a label, the first label is the dummy label. This is the same setting as in GEDData.
  296. i = 0
  297. # Costs of insertions.
  298. for row in range(1, len(els) + 1):
  299. self.__edge_label_costs[row, 0] = rand_costs[i]
  300. i += 1
  301. # Costs of deletions.
  302. for col in range(1, len(els) + 1):
  303. self.__edge_label_costs[0, col] = rand_costs[i]
  304. i += 1
  305. # Costs of substitutions.
  306. for row in range(1, len(els) + 1):
  307. for col in range(row + 1, len(els) + 1):
  308. self.__edge_label_costs[row, col] = rand_costs[i]
  309. self.__edge_label_costs[col, row] = rand_costs[i]
  310. i += 1
    def __optimize_ecm_by_kernel_distances(self):
        """Fit edit costs so that GEDs approximate kernel distances.

        Alternates between (a) computing GEDs and edit operation counts with
        the current costs and (b) least-squares refitting the costs against
        the pairwise kernel distances via ``__update_ecc``, until converged
        or a termination criterion triggers. Updates
        ``self.__edit_cost_constants``, ``self.__itrs``, ``self.__converged``
        and ``self.__num_updates_ecc``.
        """
        # compute distances in feature space.
        dis_k_mat, _, _, _ = self._graph_kernel.compute_distance_matrix()
        # Flatten the strict upper triangle into a vector; presumably this
        # pair order matches the GED vector from compute_geds_cml — TODO confirm.
        dis_k_vec = []
        for i in range(len(dis_k_mat)):
            # for j in range(i, len(dis_k_mat)):
            for j in range(i + 1, len(dis_k_mat)):
                dis_k_vec.append(dis_k_mat[i, j])
        dis_k_vec = np.array(dis_k_vec)
        # init ged.
        if self._verbose >= 2:
            print('\ninitial:')
        time0 = time.time()
        # NOTE(review): __clean_graph is defined elsewhere in this class;
        # not visible here — confirm what it strips from each graph.
        graphs = [self.__clean_graph(g) for g in self._dataset.graphs]
        self.__edit_cost_constants = self.__init_ecc
        options = self.__ged_options.copy()
        options['edit_cost_constants'] = self.__edit_cost_constants # @todo
        options['node_labels'] = self._dataset.node_labels
        options['edge_labels'] = self._dataset.edge_labels
        options['node_attrs'] = self._dataset.node_attrs
        options['edge_attrs'] = self._dataset.edge_attrs
        options['node_label_costs'] = self.__node_label_costs
        options['edge_label_costs'] = self.__edge_label_costs
        ged_vec_init, ged_mat, n_edit_operations = compute_geds_cml(graphs, options=options, parallel=self.__parallel, verbose=(self._verbose > 1))
        # L2 norm of the gap between initial GEDs and kernel distances.
        residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))]
        time_list = [time.time() - time0]
        edit_cost_list = [self.__init_ecc]
        nb_cost_mat = np.array(n_edit_operations)
        nb_cost_mat_list = [nb_cost_mat]
        if self._verbose >= 2:
            print('Current edit cost constants:', self.__edit_cost_constants)
            print('Residual list:', residual_list)
        # run iteration from initial edit costs.
        self.__converged = False
        itrs_without_update = 0
        self.__itrs = 0
        self.__num_updates_ecc = 0
        timer = Timer(self.__time_limit_in_sec)
        while not self.__termination_criterion_met(self.__converged, timer, self.__itrs, itrs_without_update):
            if self._verbose >= 2:
                print('\niteration', self.__itrs + 1)
            time0 = time.time()
            # "fit" geds to distances in feature space by tuning edit costs
            # using the least squares method.
            # np.savez('results/xp_fit_method/fit_data_debug' + str(self.__itrs) + '.gm',
            # 		 nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec,
            # 		 n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init,
            # 		 ged_mat=ged_mat)
            self.__edit_cost_constants, _ = self.__update_ecc(nb_cost_mat, dis_k_vec)
            # Snap numerical noise to exactly zero; a genuinely negative
            # cost indicates a fitting failure.
            for i in range(len(self.__edit_cost_constants)):
                if -1e-9 <= self.__edit_cost_constants[i] <= 1e-9:
                    self.__edit_cost_constants[i] = 0
                if self.__edit_cost_constants[i] < 0:
                    raise ValueError('The edit cost is negative.')
            # for i in range(len(self.__edit_cost_constants)):
            # 	if self.__edit_cost_constants[i] < 0:
            # 		self.__edit_cost_constants[i] = 0
            # compute new GEDs and numbers of edit operations.
            options = self.__ged_options.copy()
            options['edit_cost_constants'] = self.__edit_cost_constants # @todo
            options['node_labels'] = self._dataset.node_labels
            options['edge_labels'] = self._dataset.edge_labels
            options['node_attrs'] = self._dataset.node_attrs
            options['edge_attrs'] = self._dataset.edge_attrs
            ged_vec, ged_mat, n_edit_operations = compute_geds_cml(graphs, options=options, parallel=self.__parallel, verbose=(self._verbose > 1))
            residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec))))
            time_list.append(time.time() - time0)
            edit_cost_list.append(self.__edit_cost_constants)
            nb_cost_mat = np.array(n_edit_operations)
            nb_cost_mat_list.append(nb_cost_mat)
            # check convergency: converged when neither any cost nor the
            # residual changed by more than its relative epsilon.
            ec_changed = False
            for i, cost in enumerate(self.__edit_cost_constants):
                if cost == 0:
                    # Relative change is undefined at 0; compare the previous
                    # value against the absolute threshold instead.
                    if edit_cost_list[-2][i] > self.__epsilon_ec:
                        ec_changed = True
                        break
                elif abs(cost - edit_cost_list[-2][i]) / cost > self.__epsilon_ec:
                    ec_changed = True
                    break
                # if abs(cost - edit_cost_list[-2][i]) > self.__epsilon_ec:
                # 	ec_changed = True
                # 	break
            residual_changed = False
            if residual_list[-1] == 0:
                if residual_list[-2] > self.__epsilon_residual:
                    residual_changed = True
            elif abs(residual_list[-1] - residual_list[-2]) / residual_list[-1] > self.__epsilon_residual:
                residual_changed = True
            self.__converged = not (ec_changed or residual_changed)
            if self.__converged:
                itrs_without_update += 1
            else:
                itrs_without_update = 0
                self.__num_updates_ecc += 1
            # print current states.
            if self._verbose >= 2:
                print()
                print('-------------------------------------------------------------------------')
                print('States of iteration', self.__itrs + 1)
                print('-------------------------------------------------------------------------')
                # print('Time spend:', self.__runtime_optimize_ec)
                print('Total number of iterations for optimizing:', self.__itrs + 1)
                print('Total number of updating edit costs:', self.__num_updates_ecc)
                print('Was optimization of edit costs converged:', self.__converged)
                print('Did edit costs change:', ec_changed)
                print('Did residual change:', residual_changed)
                print('Iterations without update:', itrs_without_update)
                print('Current edit cost constants:', self.__edit_cost_constants)
                print('Residual list:', residual_list)
                print('-------------------------------------------------------------------------')
            self.__itrs += 1
  422. def __termination_criterion_met(self, converged, timer, itr, itrs_without_update):
  423. if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False):
  424. # if self.__state == AlgorithmState.TERMINATED:
  425. # self.__state = AlgorithmState.INITIALIZED
  426. return True
  427. return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False)
  428. def __update_ecc(self, nb_cost_mat, dis_k_vec, rw_constraints='inequality'):
  429. # if self.__ds_name == 'Letter-high':
  430. if self.__ged_options['edit_cost'] == 'LETTER':
  431. raise Exception('Cannot compute for cost "LETTER".')
  432. pass
  433. # # method 1: set alpha automatically, just tune c_vir and c_eir by
  434. # # LMS using cvxpy.
  435. # alpha = 0.5
  436. # coeff = 100 # np.max(alpha * nb_cost_mat[:,4] / dis_k_vec)
  437. ## if np.count_nonzero(nb_cost_mat[:,4]) == 0:
  438. ## alpha = 0.75
  439. ## else:
  440. ## alpha = np.min([dis_k_vec / c_vs for c_vs in nb_cost_mat[:,4] if c_vs != 0])
  441. ## alpha = alpha * 0.99
  442. # param_vir = alpha * (nb_cost_mat[:,0] + nb_cost_mat[:,1])
  443. # param_eir = (1 - alpha) * (nb_cost_mat[:,4] + nb_cost_mat[:,5])
  444. # nb_cost_mat_new = np.column_stack((param_vir, param_eir))
  445. # dis_new = coeff * dis_k_vec - alpha * nb_cost_mat[:,3]
  446. #
  447. # x = cp.Variable(nb_cost_mat_new.shape[1])
  448. # cost = cp.sum_squares(nb_cost_mat_new * x - dis_new)
  449. # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
  450. # prob = cp.Problem(cp.Minimize(cost), constraints)
  451. # prob.solve()
  452. # edit_costs_new = x.value
  453. # edit_costs_new = np.array([edit_costs_new[0], edit_costs_new[1], alpha])
  454. # residual = np.sqrt(prob.value)
  455. # # method 2: tune c_vir, c_eir and alpha by nonlinear programming by
  456. # # scipy.optimize.minimize.
  457. # w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1]
  458. # w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5]
  459. # w2 = nb_cost_mat[:,3]
  460. # w3 = dis_k_vec
  461. # func_min = lambda x: np.sum((w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \
  462. # + w2 * x[2] - w3 * x[3]) ** 2)
  463. # bounds = ((0, None), (0., None), (0.5, 0.5), (0, None))
  464. # res = minimize(func_min, [0.9, 1.7, 0.75, 10], bounds=bounds)
  465. # edit_costs_new = res.x[0:3]
  466. # residual = res.fun
  467. # method 3: tune c_vir, c_eir and alpha by nonlinear programming using cvxpy.
  468. # # method 4: tune c_vir, c_eir and alpha by QP function
  469. # # scipy.optimize.least_squares. An initial guess is required.
  470. # w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1]
  471. # w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5]
  472. # w2 = nb_cost_mat[:,3]
  473. # w3 = dis_k_vec
  474. # func = lambda x: (w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \
  475. # + w2 * x[2] - w3 * x[3]) ** 2
  476. # res = optimize.root(func, [0.9, 1.7, 0.75, 100])
  477. # edit_costs_new = res.x
  478. # residual = None
  479. elif self.__ged_options['edit_cost'] == 'LETTER2':
  480. # # 1. if c_vi != c_vr, c_ei != c_er.
  481. # nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
  482. # x = cp.Variable(nb_cost_mat_new.shape[1])
  483. # cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  484. ## # 1.1 no constraints.
  485. ## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
  486. # # 1.2 c_vs <= c_vi + c_vr.
  487. # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
  488. # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
  489. ## # 2. if c_vi == c_vr, c_ei == c_er.
  490. ## nb_cost_mat_new = nb_cost_mat[:,[0,3,4]]
  491. ## nb_cost_mat_new[:,0] += nb_cost_mat[:,1]
  492. ## nb_cost_mat_new[:,2] += nb_cost_mat[:,5]
  493. ## x = cp.Variable(nb_cost_mat_new.shape[1])
  494. ## cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  495. ## # 2.1 no constraints.
  496. ## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
  497. ### # 2.2 c_vs <= c_vi + c_vr.
  498. ### constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
  499. ### np.array([2.0, -1.0, 0.0]).T@x >= 0.0]
  500. #
  501. # prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  502. # prob.solve()
  503. # edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]]
  504. # edit_costs_new = np.array(edit_costs_new)
  505. # residual = np.sqrt(prob.value)
  506. if not self.__triangle_rule and self.__allow_zeros:
  507. nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
  508. x = cp.Variable(nb_cost_mat_new.shape[1])
  509. cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  510. constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
  511. np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
  512. np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
  513. np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01,
  514. np.array([0.0, 0.0, 0.0, 0.0, 1.0]).T@x >= 0.01]
  515. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  516. self.__execute_cvx(prob)
  517. edit_costs_new = x.value
  518. residual = np.sqrt(prob.value)
  519. elif self.__triangle_rule and self.__allow_zeros:
  520. nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
  521. x = cp.Variable(nb_cost_mat_new.shape[1])
  522. cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  523. constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
  524. np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
  525. np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
  526. np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01,
  527. np.array([0.0, 0.0, 0.0, 0.0, 1.0]).T@x >= 0.01,
  528. np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
  529. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  530. self.__execute_cvx(prob)
  531. edit_costs_new = x.value
  532. residual = np.sqrt(prob.value)
  533. elif not self.__triangle_rule and not self.__allow_zeros:
  534. nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
  535. x = cp.Variable(nb_cost_mat_new.shape[1])
  536. cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  537. constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
  538. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  539. prob.solve()
  540. edit_costs_new = x.value
  541. residual = np.sqrt(prob.value)
  542. # elif method == 'inequality_modified':
  543. # # c_vs <= c_vi + c_vr.
  544. # nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
  545. # x = cp.Variable(nb_cost_mat_new.shape[1])
  546. # cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  547. # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
  548. # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
  549. # prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  550. # prob.solve()
  551. # # use same costs for insertion and removal rather than the fitted costs.
  552. # edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]]
  553. # edit_costs_new = np.array(edit_costs_new)
  554. # residual = np.sqrt(prob.value)
  555. elif self.__triangle_rule and not self.__allow_zeros:
  556. # c_vs <= c_vi + c_vr.
  557. nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
  558. x = cp.Variable(nb_cost_mat_new.shape[1])
  559. cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  560. constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
  561. np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
  562. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  563. self.__execute_cvx(prob)
  564. edit_costs_new = x.value
  565. residual = np.sqrt(prob.value)
  566. elif rw_constraints == '2constraints': # @todo: rearrange it later.
  567. # c_vs <= c_vi + c_vr and c_vi == c_vr, c_ei == c_er.
  568. nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
  569. x = cp.Variable(nb_cost_mat_new.shape[1])
  570. cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  571. constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
  572. np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0,
  573. np.array([1.0, -1.0, 0.0, 0.0, 0.0]).T@x == 0.0,
  574. np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
  575. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  576. prob.solve()
  577. edit_costs_new = x.value
  578. residual = np.sqrt(prob.value)
  579. elif self.__ged_options['edit_cost'] == 'NON_SYMBOLIC':
  580. is_n_attr = np.count_nonzero(nb_cost_mat[:,2])
  581. is_e_attr = np.count_nonzero(nb_cost_mat[:,5])
  582. if self.__ds_name == 'SYNTHETICnew': # @todo: rearrenge this later.
  583. # nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]]
  584. nb_cost_mat_new = nb_cost_mat[:,[2,3,4]]
  585. x = cp.Variable(nb_cost_mat_new.shape[1])
  586. cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  587. # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
  588. # np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
  589. # constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])]]
  590. constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])],
  591. np.array([0.0, 1.0, -1.0]).T@x == 0.0]
  592. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  593. prob.solve()
  594. # print(x.value)
  595. edit_costs_new = np.concatenate((np.array([0.0, 0.0]), x.value,
  596. np.array([0.0])))
  597. residual = np.sqrt(prob.value)
  598. elif not self.__triangle_rule and self.__allow_zeros:
  599. if is_n_attr and is_e_attr:
  600. nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]]
  601. x = cp.Variable(nb_cost_mat_new.shape[1])
  602. cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  603. constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
  604. np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
  605. np.array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
  606. np.array([0.0, 0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01,
  607. np.array([0.0, 0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01]
  608. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  609. self.__execute_cvx(prob)
  610. edit_costs_new = x.value
  611. residual = np.sqrt(prob.value)
  612. elif is_n_attr and not is_e_attr:
  613. nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]]
  614. x = cp.Variable(nb_cost_mat_new.shape[1])
  615. cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  616. constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
  617. np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
  618. np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
  619. np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01,
  620. np.array([0.0, 0.0, 0.0, 0.0, 1.0]).T@x >= 0.01]
  621. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  622. self.__execute_cvx(prob)
  623. edit_costs_new = np.concatenate((x.value, np.array([0.0])))
  624. residual = np.sqrt(prob.value)
  625. elif not is_n_attr and is_e_attr:
  626. nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
  627. x = cp.Variable(nb_cost_mat_new.shape[1])
  628. cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  629. constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
  630. np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
  631. np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
  632. np.array([0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01,
  633. np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01]
  634. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  635. self.__execute_cvx(prob)
  636. edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:]))
  637. residual = np.sqrt(prob.value)
  638. else:
  639. nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]]
  640. x = cp.Variable(nb_cost_mat_new.shape[1])
  641. cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  642. constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
  643. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  644. self.__execute_cvx(prob)
  645. edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]),
  646. x.value[2:], np.array([0.0])))
  647. residual = np.sqrt(prob.value)
  648. elif self.__triangle_rule and self.__allow_zeros:
  649. if is_n_attr and is_e_attr:
  650. nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]]
  651. x = cp.Variable(nb_cost_mat_new.shape[1])
  652. cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  653. constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
  654. np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
  655. np.array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
  656. np.array([0.0, 0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01,
  657. np.array([0.0, 0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01,
  658. np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
  659. np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
  660. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  661. self.__execute_cvx(prob)
  662. edit_costs_new = x.value
  663. residual = np.sqrt(prob.value)
  664. elif is_n_attr and not is_e_attr:
  665. nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]]
  666. x = cp.Variable(nb_cost_mat_new.shape[1])
  667. cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  668. constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
  669. np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
  670. np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
  671. np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01,
  672. np.array([0.0, 0.0, 0.0, 0.0, 1.0]).T@x >= 0.01,
  673. np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
  674. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  675. self.__execute_cvx(prob)
  676. edit_costs_new = np.concatenate((x.value, np.array([0.0])))
  677. residual = np.sqrt(prob.value)
  678. elif not is_n_attr and is_e_attr:
  679. nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
  680. x = cp.Variable(nb_cost_mat_new.shape[1])
  681. cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  682. constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
  683. np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
  684. np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
  685. np.array([0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01,
  686. np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01,
  687. np.array([0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
  688. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  689. self.__execute_cvx(prob)
  690. edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:]))
  691. residual = np.sqrt(prob.value)
  692. else:
  693. nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]]
  694. x = cp.Variable(nb_cost_mat_new.shape[1])
  695. cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  696. constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
  697. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  698. self.__execute_cvx(prob)
  699. edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]),
  700. x.value[2:], np.array([0.0])))
  701. residual = np.sqrt(prob.value)
  702. elif not self.__triangle_rule and not self.__allow_zeros:
  703. if is_n_attr and is_e_attr:
  704. nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]]
  705. x = cp.Variable(nb_cost_mat_new.shape[1])
  706. cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  707. constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
  708. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  709. self.__execute_cvx(prob)
  710. edit_costs_new = x.value
  711. residual = np.sqrt(prob.value)
  712. elif is_n_attr and not is_e_attr:
  713. nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]]
  714. x = cp.Variable(nb_cost_mat_new.shape[1])
  715. cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  716. constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
  717. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  718. self.__execute_cvx(prob)
  719. edit_costs_new = np.concatenate((x.value, np.array([0.0])))
  720. residual = np.sqrt(prob.value)
  721. elif not is_n_attr and is_e_attr:
  722. nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
  723. x = cp.Variable(nb_cost_mat_new.shape[1])
  724. cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  725. constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
  726. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  727. self.__execute_cvx(prob)
  728. edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:]))
  729. residual = np.sqrt(prob.value)
  730. else:
  731. nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]]
  732. x = cp.Variable(nb_cost_mat_new.shape[1])
  733. cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  734. constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
  735. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  736. self.__execute_cvx(prob)
  737. edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]),
  738. x.value[2:], np.array([0.0])))
  739. residual = np.sqrt(prob.value)
  740. elif self.__triangle_rule and not self.__allow_zeros:
  741. # c_vs <= c_vi + c_vr.
  742. if is_n_attr and is_e_attr:
  743. nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]]
  744. x = cp.Variable(nb_cost_mat_new.shape[1])
  745. cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  746. constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
  747. np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
  748. np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
  749. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  750. self.__execute_cvx(prob)
  751. edit_costs_new = x.value
  752. residual = np.sqrt(prob.value)
  753. elif is_n_attr and not is_e_attr:
  754. nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]]
  755. x = cp.Variable(nb_cost_mat_new.shape[1])
  756. cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  757. constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
  758. np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
  759. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  760. self.__execute_cvx(prob)
  761. edit_costs_new = np.concatenate((x.value, np.array([0.0])))
  762. residual = np.sqrt(prob.value)
  763. elif not is_n_attr and is_e_attr:
  764. nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
  765. x = cp.Variable(nb_cost_mat_new.shape[1])
  766. cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  767. constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
  768. np.array([0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
  769. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  770. self.__execute_cvx(prob)
  771. edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:]))
  772. residual = np.sqrt(prob.value)
  773. else:
  774. nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]]
  775. x = cp.Variable(nb_cost_mat_new.shape[1])
  776. cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
  777. constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
  778. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  779. self.__execute_cvx(prob)
  780. edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]),
  781. x.value[2:], np.array([0.0])))
  782. residual = np.sqrt(prob.value)
  783. elif self.__ged_options['edit_cost'] == 'CONSTANT': # @todo: node/edge may not labeled.
  784. if not self.__triangle_rule and self.__allow_zeros:
  785. x = cp.Variable(nb_cost_mat.shape[1])
  786. cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec)
  787. constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])],
  788. np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
  789. np.array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
  790. np.array([0.0, 0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01,
  791. np.array([0.0, 0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01]
  792. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  793. self.__execute_cvx(prob)
  794. edit_costs_new = x.value
  795. residual = np.sqrt(prob.value)
  796. elif self.__triangle_rule and self.__allow_zeros:
  797. x = cp.Variable(nb_cost_mat.shape[1])
  798. cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec)
  799. constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])],
  800. np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
  801. np.array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
  802. np.array([0.0, 0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01,
  803. np.array([0.0, 0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01,
  804. np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
  805. np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
  806. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  807. self.__execute_cvx(prob)
  808. edit_costs_new = x.value
  809. residual = np.sqrt(prob.value)
  810. elif not self.__triangle_rule and not self.__allow_zeros:
  811. x = cp.Variable(nb_cost_mat.shape[1])
  812. cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec)
  813. constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])]]
  814. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  815. self.__execute_cvx(prob)
  816. edit_costs_new = x.value
  817. residual = np.sqrt(prob.value)
  818. elif self.__triangle_rule and not self.__allow_zeros:
  819. x = cp.Variable(nb_cost_mat.shape[1])
  820. cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec)
  821. constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])],
  822. np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
  823. np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
  824. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  825. self.__execute_cvx(prob)
  826. edit_costs_new = x.value
  827. residual = np.sqrt(prob.value)
  828. else:
  829. raise Exception('The edit cost "', self.__ged_options['edit_cost'], '" is not supported for update progress.')
  830. # # method 1: simple least square method.
  831. # edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
  832. # rcond=None)
  833. # # method 2: least square method with x_i >= 0.
  834. # edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec)
  835. # method 3: solve as a quadratic program with constraints.
  836. # P = np.dot(nb_cost_mat.T, nb_cost_mat)
  837. # q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat)
  838. # G = -1 * np.identity(nb_cost_mat.shape[1])
  839. # h = np.array([0 for i in range(nb_cost_mat.shape[1])])
  840. # A = np.array([1 for i in range(nb_cost_mat.shape[1])])
  841. # b = 1
  842. # x = cp.Variable(nb_cost_mat.shape[1])
  843. # prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q_T@x),
  844. # [G@x <= h])
  845. # prob.solve()
  846. # edit_costs_new = x.value
  847. # residual = prob.value - np.dot(dis_k_vec.T, dis_k_vec)
  848. # G = -1 * np.identity(nb_cost_mat.shape[1])
  849. # h = np.array([0 for i in range(nb_cost_mat.shape[1])])
  850. x = cp.Variable(nb_cost_mat.shape[1])
  851. cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec)
  852. constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])],
  853. # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
  854. np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
  855. np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
  856. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  857. self.__execute_cvx(prob)
  858. edit_costs_new = x.value
  859. residual = np.sqrt(prob.value)
  860. # method 4:
  861. return edit_costs_new, residual
def __execute_cvx(self, prob):
    """Solve a cvxpy problem, falling back to alternative solvers on failure.

    The default solver is tried first; if it raises a ``MemoryError``
    (observed with the "OSQP" solver), "CVXOPT" is tried next, and if that
    also fails, "MOSEK" is tried as a last resort (commercial; a licence
    is required). Diagnostic messages are printed when
    ``self._verbose >= 2``.

    Parameters
    ----------
    prob : cvxpy.Problem
        The problem to solve. It is solved in place; the caller reads the
        results from ``prob.value`` and the problem's variables afterwards.
    """
    try:
        # First attempt with cvxpy's default solver choice.
        prob.solve(verbose=(self._verbose>=2))
    except MemoryError as error0:
        if self._verbose >= 2:
            print('\nUsing solver "OSQP" caused a memory error.')
            print('the original error message is\n', error0)
            print('solver status: ', prob.status)
            print('trying solver "CVXOPT" instead...\n')
        try:
            # Second attempt: CVXOPT.
            prob.solve(solver=cp.CVXOPT, verbose=(self._verbose>=2))
        except Exception as error1:
            if self._verbose >= 2:
                print('\nAn error occured when using solver "CVXOPT".')
                print('the original error message is\n', error1)
                print('solver status: ', prob.status)
                print('trying solver "MOSEK" instead. Notice this solver is commercial and a lisence is required.\n')
            # Last resort: MOSEK. Any exception here propagates to the caller.
            prob.solve(solver=cp.MOSEK, verbose=(self._verbose>=2))
        else:
            if self._verbose >= 2:
                print('solver status: ', prob.status)
    else:
        if self._verbose >= 2:
            print('solver status: ', prob.status)
    if self._verbose >= 2:
        print()
def __gmg_bcu(self):
    """
    The local search algorithm based on block coordinate update (BCU) for
    estimating a generalized median graph (GMG).

    Builds a fresh GED environment over the (cleaned) dataset graphs, runs
    the median graph estimator, and stores the resulting set median and
    generalized median graphs plus their sums of distances (SODs) on
    ``self``.

    Returns
    -------
    None.
    """
    # Set up the ged environment.
    ged_env = GEDEnv() # @todo: maybe create a ged_env as a private varible.
    # gedlibpy.restart_env()
    ged_env.set_edit_cost(self.__ged_options['edit_cost'], edit_cost_constants=self.__edit_cost_constants)
    # Normalize node/edge labels and attributes to strings before adding.
    graphs = [self.__clean_graph(g) for g in self._dataset.graphs]
    for g in graphs:
        ged_env.add_nx_graph(g, '')
    graph_ids = ged_env.get_all_graph_ids()
    # Placeholder graphs that will receive the computed medians.
    set_median_id = ged_env.add_graph('set_median')
    gen_median_id = ged_env.add_graph('gen_median')
    ged_env.init(init_type=self.__ged_options['init_option'])

    # Set up the median graph estimator.
    self.__mge = MedianGraphEstimatorPy(ged_env, constant_node_costs(self.__ged_options['edit_cost']))
    self.__mge.set_refine_method(self.__ged_options['method'], self.__ged_options)
    options = self.__mge_options.copy()
    if not 'seed' in options:
        # Millisecond clock as a default seed.
        options['seed'] = int(round(time.time() * 1000)) # @todo: may not work correctly for possible parallel usage.
    options['parallel'] = self.__parallel

    # Select the GED algorithm.
    self.__mge.set_options(mge_options_to_string(options))
    self.__mge.set_label_names(node_labels=self._dataset.node_labels,
                               edge_labels=self._dataset.edge_labels,
                               node_attrs=self._dataset.node_attrs,
                               edge_attrs=self._dataset.edge_attrs)
    ged_options = self.__ged_options.copy()
    if self.__parallel:
        # When the estimator itself runs in parallel, keep each GED call single-threaded.
        ged_options['threads'] = 1
    self.__mge.set_init_method(ged_options['method'], ged_options)
    self.__mge.set_descent_method(ged_options['method'], ged_options)

    # Run the estimator.
    self.__mge.run(graph_ids, set_median_id, gen_median_id)

    # Get SODs (sums of distances).
    self.__sod_set_median = self.__mge.get_sum_of_distances('initialized')
    self.__sod_gen_median = self.__mge.get_sum_of_distances('converged')

    # Get median graphs back as networkx graphs.
    self.__set_median = ged_env.get_nx_graph(set_median_id)
    self.__gen_median = ged_env.get_nx_graph(gen_median_id)
def __compute_distances_to_true_median(self):
    """Compute kernel-space distances for the medians and the dataset graphs.

    Computes the distance in kernel space of the set median, the generalized
    median, and each dataset graph to the (implicit) median of the dataset,
    storing the results in ``self.__k_dis_set_median``,
    ``self.__k_dis_gen_median`` and ``self.__k_dis_dataset``; the dataset
    graph with the minimum distance is kept as ``self.__best_from_dataset``.
    """
    # compute distance in kernel space for set median.
    kernels_to_sm, _ = self._graph_kernel.compute(self.__set_median, self._dataset.graphs, **self._kernel_options)
    kernel_sm, _ = self._graph_kernel.compute(self.__set_median, self.__set_median, **self._kernel_options)
    if self._kernel_options['normalize']:
        # Cosine-style normalization against the unnormalized Gram diagonal.
        kernels_to_sm = [kernels_to_sm[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_sm) for i in range(len(kernels_to_sm))] # normalize
        kernel_sm = 1
    # @todo: not correct kernel value
    # Augment the Gram matrix with a first row/column for the set median.
    gram_with_sm = np.concatenate((np.array([kernels_to_sm]), np.copy(self._graph_kernel.gram_matrix)), axis=0)
    gram_with_sm = np.concatenate((np.array([[kernel_sm] + kernels_to_sm]).T, gram_with_sm), axis=1)
    # Uniform weights over the dataset graphs (indices 1..n in the augmented Gram).
    self.__k_dis_set_median = compute_k_dis(0, range(1, 1+len(self._dataset.graphs)),
                                            [1 / len(self._dataset.graphs)] * len(self._dataset.graphs),
                                            gram_with_sm, withterm3=False)

    # compute distance in kernel space for generalized median.
    kernels_to_gm, _ = self._graph_kernel.compute(self.__gen_median, self._dataset.graphs, **self._kernel_options)
    kernel_gm, _ = self._graph_kernel.compute(self.__gen_median, self.__gen_median, **self._kernel_options)
    if self._kernel_options['normalize']:
        kernels_to_gm = [kernels_to_gm[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_gm) for i in range(len(kernels_to_gm))] # normalize
        kernel_gm = 1
    gram_with_gm = np.concatenate((np.array([kernels_to_gm]), np.copy(self._graph_kernel.gram_matrix)), axis=0)
    gram_with_gm = np.concatenate((np.array([[kernel_gm] + kernels_to_gm]).T, gram_with_gm), axis=1)
    self.__k_dis_gen_median = compute_k_dis(0, range(1, 1+len(self._dataset.graphs)),
                                            [1 / len(self._dataset.graphs)] * len(self._dataset.graphs),
                                            gram_with_gm, withterm3=False)

    # compute distance in kernel space for each graph in median set.
    k_dis_median_set = []
    for idx in range(len(self._dataset.graphs)):
        k_dis_median_set.append(compute_k_dis(idx+1, range(1, 1+len(self._dataset.graphs)),
                                              [1 / len(self._dataset.graphs)] * len(self._dataset.graphs),
                                              gram_with_gm, withterm3=False))
    idx_k_dis_median_set_min = np.argmin(k_dis_median_set)
    self.__k_dis_dataset = k_dis_median_set[idx_k_dis_median_set_min]
    # Keep a copy of the closest dataset graph.
    self.__best_from_dataset = self._dataset.graphs[idx_k_dis_median_set_min].copy()

    if self._verbose >= 2:
        print()
        print('distance in kernel space for set median:', self.__k_dis_set_median)
        print('distance in kernel space for generalized median:', self.__k_dis_gen_median)
        print('minimum distance in kernel space for each graph in median set:', self.__k_dis_dataset)
        print('distance in kernel space for each graph in median set:', k_dis_median_set)
  971. # def __clean_graph(self, G, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
  972. def __clean_graph(self, G): # @todo: this may not be needed when datafile is updated.
  973. """
  974. Cleans node and edge labels and attributes of the given graph.
  975. """
  976. G_new = nx.Graph(**G.graph)
  977. for nd, attrs in G.nodes(data=True):
  978. G_new.add_node(str(nd)) # @todo: should we keep this as str()?
  979. for l_name in self._dataset.node_labels:
  980. G_new.nodes[str(nd)][l_name] = str(attrs[l_name])
  981. for a_name in self._dataset.node_attrs:
  982. G_new.nodes[str(nd)][a_name] = str(attrs[a_name])
  983. for nd1, nd2, attrs in G.edges(data=True):
  984. G_new.add_edge(str(nd1), str(nd2))
  985. for l_name in self._dataset.edge_labels:
  986. G_new.edges[str(nd1), str(nd2)][l_name] = str(attrs[l_name])
  987. for a_name in self._dataset.edge_attrs:
  988. G_new.edges[str(nd1), str(nd2)][a_name] = str(attrs[a_name])
  989. return G_new
@property
def mge(self):
    """The median graph estimator built by the last run (read-only)."""
    return self.__mge

@property
def ged_options(self):
    """Options dict passed to the GED environment and methods."""
    return self.__ged_options

@ged_options.setter
def ged_options(self, value):
    self.__ged_options = value

@property
def mge_options(self):
    """Options dict passed to the median graph estimator."""
    return self.__mge_options

@mge_options.setter
def mge_options(self, value):
    self.__mge_options = value

@property
def fit_method(self):
    """Name of the method used to fit the edit costs."""
    return self.__fit_method

@fit_method.setter
def fit_method(self, value):
    self.__fit_method = value

@property
def init_ecc(self):
    """Initial edit cost constants."""
    return self.__init_ecc

@init_ecc.setter
def init_ecc(self, value):
    self.__init_ecc = value

@property
def set_median(self):
    """The computed set median graph (read-only)."""
    return self.__set_median

@property
def gen_median(self):
    """The computed generalized median graph (read-only)."""
    return self.__gen_median

@property
def best_from_dataset(self):
    """The dataset graph closest to the median in kernel space (read-only)."""
    return self.__best_from_dataset

@property
def gram_matrix_unnorm(self):
    """The unnormalized Gram matrix of the dataset."""
    return self.__gram_matrix_unnorm

@gram_matrix_unnorm.setter
def gram_matrix_unnorm(self, value):
    self.__gram_matrix_unnorm = value

A Python package for graph kernels, graph edit distances, and the graph pre-image problem.