
median_preimage_generator.py 29 kB

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 26 18:27:22 2020
@author: ljia
"""
import numpy as np
import time
import random
import multiprocessing
import networkx as nx
import cvxpy as cp
from gklearn.preimage import PreimageGenerator
from gklearn.preimage.utils import compute_k_dis
from gklearn.ged.util import compute_geds, ged_options_to_string
from gklearn.ged.median import MedianGraphEstimator
from gklearn.ged.median import constant_node_costs, mge_options_to_string
from gklearn.gedlib import librariesImport, gedlibpy
# from gklearn.utils.dataset import Dataset


class MedianPreimageGenerator(PreimageGenerator):

    def __init__(self, dataset=None):
        PreimageGenerator.__init__(self, dataset=dataset)
        # arguments to set.
        self.__mge = None
        self.__ged_options = {}
        self.__mge_options = {}
        self.__fit_method = 'k-graphs'
        self.__init_ecc = None
        self.__max_itrs = 100
        self.__parallel = True
        self.__n_jobs = multiprocessing.cpu_count()
        self.__ds_name = None
        # values to compute.
        self.__edit_cost_constants = []
        self.__runtime_precompute_gm = None
        self.__runtime_optimize_ec = None
        self.__runtime_generate_preimage = None
        self.__runtime_total = None
        self.__set_median = None
        self.__gen_median = None
        self.__sod_set_median = None
        self.__sod_gen_median = None
        self.__k_dis_set_median = None
        self.__k_dis_gen_median = None
        self.__k_dis_dataset = None

    def set_options(self, **kwargs):
        self._kernel_options = kwargs.get('kernel_options', {})
        self._graph_kernel = kwargs.get('graph_kernel', None)
        self._verbose = kwargs.get('verbose', 2)
        self.__ged_options = kwargs.get('ged_options', {})
        self.__mge_options = kwargs.get('mge_options', {})
        self.__fit_method = kwargs.get('fit_method', 'k-graphs')
        self.__init_ecc = kwargs.get('init_ecc', None)
        self.__edit_cost_constants = kwargs.get('edit_cost_constants', [])
        self.__max_itrs = kwargs.get('max_itrs', 100)
        self.__parallel = kwargs.get('parallel', True)
        self.__n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count())
        self.__ds_name = kwargs.get('ds_name', None)

    def run(self):
        self.__set_graph_kernel_by_name()
        # record start time.
        start = time.time()

        # 1. precompute gram matrix.
        gram_matrix, run_time = self.__graph_kernel.compute(self._dataset.graphs, **self._kernel_options)
        end_precompute_gm = time.time()
        self.__runtime_precompute_gm = end_precompute_gm - start

        # 2. optimize edit cost constants.
        # self.__optimize_edit_cost_constants(dataset=dataset, Gn=Gn, Kmatrix_median=Kmatrix_median)
        self.__optimize_edit_cost_constants()
        end_optimize_ec = time.time()
        self.__runtime_optimize_ec = end_optimize_ec - end_precompute_gm

        # 3. compute set median and gen median using optimized edit costs.
        if self._verbose >= 2:
            print('\nstart computing set median and gen median using optimized edit costs...\n')
        # group_fnames = [Gn[g].graph['filename'] for g in group_min]
        self.__generate_preimage_iam()
        end_generate_preimage = time.time()
        self.__runtime_generate_preimage = end_generate_preimage - end_optimize_ec
        self.__runtime_total = end_generate_preimage - start
        if self._verbose >= 2:
            print('medians computed.')
            print('SOD of the set median: ', self.__sod_set_median)
            print('SOD of the generalized median: ', self.__sod_gen_median)

        # 4. compute kernel distances to the true median.
        if self._verbose >= 2:
            print('\nstart computing distances to true median....\n')
        # Gn_median = [Gn[g].copy() for g in group_min]
        self.__compute_distances_to_true_median()
        # dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min =
        # idx_dis_k_gi_min = group_min[idx_dis_k_gi_min]
        # print('index min dis_k_gi:', idx_dis_k_gi_min)
        # print('sod_sm:', sod_sm)
        # print('sod_gm:', sod_gm)

        # 5. print out results.
        if self._verbose:
            print()
            print('================================================================================')
            print('The optimized edit cost constants: ', self.__edit_cost_constants)
            print('SOD of the set median: ', self.__sod_set_median)
            print('SOD of the generalized median: ', self.__sod_gen_median)
            print('Distance in kernel space for set median:', self.__k_dis_set_median)
            print('Distance in kernel space for generalized median:', self.__k_dis_gen_median)
            print('Minimum distance in kernel space for each graph in median set:', self.__k_dis_dataset)
            print('Time to pre-compute Gram matrix: ', self.__runtime_precompute_gm)
            print('Time to optimize edit costs: ', self.__runtime_optimize_ec)
            print('Time to generate pre-images: ', self.__runtime_generate_preimage)
            print('Total time: ', self.__runtime_total)
            print('================================================================================')

        # collect return values.
        # return (sod_sm, sod_gm), \
        #        (dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min), \
        #        (time_fitting, time_generating)

    # def __optimize_edit_cost_constants(self, dataset=None, Gn=None, Kmatrix_median=None):
    def __optimize_edit_cost_constants(self):
        """fit edit cost constants.
        """
        if self.__fit_method == 'random': # random
            if self.__ged_options['edit_cost'] == 'LETTER':
                self.__edit_cost_constants = random.sample(range(1, 10), 3)
                self.__edit_cost_constants = [item * 0.1 for item in self.__edit_cost_constants]
            elif self.__ged_options['edit_cost'] == 'LETTER2':
                random.seed(time.time())
                self.__edit_cost_constants = random.sample(range(1, 10), 5)
                # self.__edit_cost_constants = [item * 0.1 for item in self.__edit_cost_constants]
            elif self.__ged_options['edit_cost'] == 'NON_SYMBOLIC':
                self.__edit_cost_constants = random.sample(range(1, 10), 6)
                if self._dataset.node_attrs == []:
                    self.__edit_cost_constants[2] = 0
                if self._dataset.edge_attrs == []:
                    self.__edit_cost_constants[5] = 0
            else:
                self.__edit_cost_constants = random.sample(range(1, 10), 6)
            if self._verbose >= 2:
                print('edit cost constants used:', self.__edit_cost_constants)
        elif self.__fit_method == 'expert': # expert
            if self.__init_ecc is None:
                if self.__ged_options['edit_cost'] == 'LETTER':
                    self.__edit_cost_constants = [0.9, 1.7, 0.75]
                elif self.__ged_options['edit_cost'] == 'LETTER2':
                    self.__edit_cost_constants = [0.675, 0.675, 0.75, 0.425, 0.425]
                else:
                    self.__edit_cost_constants = [3, 3, 1, 3, 3, 1]
            else:
                self.__edit_cost_constants = self.__init_ecc
        elif self.__fit_method == 'k-graphs':
            if self.__init_ecc is None:
                if self.__ged_options['edit_cost'] == 'LETTER':
                    self.__init_ecc = [0.9, 1.7, 0.75]
                elif self.__ged_options['edit_cost'] == 'LETTER2':
                    self.__init_ecc = [0.675, 0.675, 0.75, 0.425, 0.425]
                elif self.__ged_options['edit_cost'] == 'NON_SYMBOLIC':
                    self.__init_ecc = [0, 0, 1, 1, 1, 0]
                    if self._dataset.node_attrs == []:
                        self.__init_ecc[2] = 0
                    if self._dataset.edge_attrs == []:
                        self.__init_ecc[5] = 0
                else:
                    self.__init_ecc = [3, 3, 1, 3, 3, 1]
            # optimize on the k-graph subset.
            self.__optimize_ecc_by_kernel_distances()
            # fit_GED_to_kernel_distance(Gn_median, dataset=dataset, Kmatrix=Kmatrix_median)
        elif self.__fit_method == 'whole-dataset':
            if self.__init_ecc is None:
                if self.__ged_options['edit_cost'] == 'LETTER':
                    self.__init_ecc = [0.9, 1.7, 0.75]
                elif self.__ged_options['edit_cost'] == 'LETTER2':
                    self.__init_ecc = [0.675, 0.675, 0.75, 0.425, 0.425]
                else:
                    self.__init_ecc = [3, 3, 1, 3, 3, 1]
            # optimize on the whole dataset.
            self.__optimize_ecc_by_kernel_distances()
            # fit_GED_to_kernel_distance(Gn, dataset=dataset)
        elif self.__fit_method == 'precomputed':
            pass

    def __optimize_ecc_by_kernel_distances(self):
        # def fit_GED_to_kernel_distance(Gn, Kmatrix=None, parallel=True):
        # compute distances in feature space.
        dis_k_mat, _, _, _ = self.__graph_kernel.compute_distance_matrix()
        dis_k_vec = []
        for i in range(len(dis_k_mat)):
            # for j in range(i, len(dis_k_mat)):
            for j in range(i + 1, len(dis_k_mat)):
                dis_k_vec.append(dis_k_mat[i, j])
        dis_k_vec = np.array(dis_k_vec)

        # init ged.
        if self._verbose >= 2:
            print('\ninitial:')
        time0 = time.time()
        graphs = [self.__clean_graph(g) for g in self._dataset.graphs]
        self.__edit_cost_constants = self.__init_ecc
        options = self.__ged_options.copy()
        options['edit_cost_constants'] = self.__edit_cost_constants # @todo
        ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, parallel=self.__parallel)
        residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))]
        time_list = [time.time() - time0]
        edit_cost_list = [self.__init_ecc]
        nb_cost_mat = np.array(n_edit_operations)
        nb_cost_mat_list = [nb_cost_mat]
        if self._verbose >= 2:
            print('edit_cost_constants:', self.__edit_cost_constants)
            print('residual_list:', residual_list)

        for itr in range(self.__max_itrs):
            if self._verbose >= 2:
                print('\niteration', itr)
            time0 = time.time()
            # "fit" geds to distances in feature space by tuning edit costs using
            # the Least Squares Method.
            np.savez('results/xp_fit_method/fit_data_debug' + str(itr) + '.gm',
                     nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec,
                     n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init,
                     ged_mat=ged_mat)
            self.__edit_cost_constants, residual = self.__update_ecc(nb_cost_mat, dis_k_vec)
            for i in range(len(self.__edit_cost_constants)):
                if -1e-9 <= self.__edit_cost_constants[i] <= 1e-9:
                    self.__edit_cost_constants[i] = 0
                if self.__edit_cost_constants[i] < 0:
                    raise ValueError('The edit cost is negative.')
            # for i in range(len(self.__edit_cost_constants)):
            #     if self.__edit_cost_constants[i] < 0:
            #         self.__edit_cost_constants[i] = 0

            # compute new GEDs and numbers of edit operations.
            options = self.__ged_options.copy() # np.array([self.__edit_cost_constants[0], self.__edit_cost_constants[1], 0.75])
            options['edit_cost_constants'] = self.__edit_cost_constants # @todo
            ged_vec, ged_mat, n_edit_operations = compute_geds(graphs, options=options, parallel=self.__parallel)
            residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec))))
            time_list.append(time.time() - time0)
            edit_cost_list.append(self.__edit_cost_constants)
            nb_cost_mat = np.array(n_edit_operations)
            nb_cost_mat_list.append(nb_cost_mat)
            if self._verbose >= 2:
                print('edit_cost_constants:', self.__edit_cost_constants)
                print('residual_list:', residual_list)

        # return residual_list, edit_cost_list, dis_k_mat, ged_mat, \
        #        time_list, nb_cost_mat_list

    def __update_ecc(self, nb_cost_mat, dis_k_vec, rw_constraints='inequality'):
        # if self.__ds_name == 'Letter-high':
        if self.__ged_options['edit_cost'] == 'LETTER':
            pass
            # # method 1: set alpha automatically, just tune c_vir and c_eir by
            # # LMS using cvxpy.
            # alpha = 0.5
            # coeff = 100 # np.max(alpha * nb_cost_mat[:,4] / dis_k_vec)
            ## if np.count_nonzero(nb_cost_mat[:,4]) == 0:
            ##     alpha = 0.75
            ## else:
            ##     alpha = np.min([dis_k_vec / c_vs for c_vs in nb_cost_mat[:,4] if c_vs != 0])
            ##     alpha = alpha * 0.99
            # param_vir = alpha * (nb_cost_mat[:,0] + nb_cost_mat[:,1])
            # param_eir = (1 - alpha) * (nb_cost_mat[:,4] + nb_cost_mat[:,5])
            # nb_cost_mat_new = np.column_stack((param_vir, param_eir))
            # dis_new = coeff * dis_k_vec - alpha * nb_cost_mat[:,3]
            #
            # x = cp.Variable(nb_cost_mat_new.shape[1])
            # cost = cp.sum_squares(nb_cost_mat_new * x - dis_new)
            # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
            # prob = cp.Problem(cp.Minimize(cost), constraints)
            # prob.solve()
            # edit_costs_new = x.value
            # edit_costs_new = np.array([edit_costs_new[0], edit_costs_new[1], alpha])
            # residual = np.sqrt(prob.value)

            # # method 2: tune c_vir, c_eir and alpha by nonlinear programming by
            # # scipy.optimize.minimize.
            # w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1]
            # w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5]
            # w2 = nb_cost_mat[:,3]
            # w3 = dis_k_vec
            # func_min = lambda x: np.sum((w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2])
            #                              + w2 * x[2] - w3 * x[3]) ** 2)
            # bounds = ((0, None), (0., None), (0.5, 0.5), (0, None))
            # res = minimize(func_min, [0.9, 1.7, 0.75, 10], bounds=bounds)
            # edit_costs_new = res.x[0:3]
            # residual = res.fun

            # method 3: tune c_vir, c_eir and alpha by nonlinear programming using cvxpy.

            # # method 4: tune c_vir, c_eir and alpha by QP function
            # # scipy.optimize.least_squares. An initial guess is required.
            # w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1]
            # w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5]
            # w2 = nb_cost_mat[:,3]
            # w3 = dis_k_vec
            # func = lambda x: (w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2])
            #                   + w2 * x[2] - w3 * x[3]) ** 2
            # res = optimize.root(func, [0.9, 1.7, 0.75, 100])
            # edit_costs_new = res.x
            # residual = None
        elif self.__ged_options['edit_cost'] == 'LETTER2':
            # # 1. if c_vi != c_vr, c_ei != c_er.
            # nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
            # x = cp.Variable(nb_cost_mat_new.shape[1])
            # cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
            ## # 1.1 no constraints.
            ## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
            # # 1.2 c_vs <= c_vi + c_vr.
            # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
            #                np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
            ## # 2. if c_vi == c_vr, c_ei == c_er.
            ## nb_cost_mat_new = nb_cost_mat[:,[0,3,4]]
            ## nb_cost_mat_new[:,0] += nb_cost_mat[:,1]
            ## nb_cost_mat_new[:,2] += nb_cost_mat[:,5]
            ## x = cp.Variable(nb_cost_mat_new.shape[1])
            ## cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
            ## # 2.1 no constraints.
            ## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
            ### # 2.2 c_vs <= c_vi + c_vr.
            ### constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
            ###                np.array([2.0, -1.0, 0.0]).T@x >= 0.0]
            #
            # prob = cp.Problem(cp.Minimize(cost_fun), constraints)
            # prob.solve()
            # edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]]
            # edit_costs_new = np.array(edit_costs_new)
            # residual = np.sqrt(prob.value)

            if rw_constraints == 'inequality':
                # c_vs <= c_vi + c_vr.
                nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
                x = cp.Variable(nb_cost_mat_new.shape[1])
                cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
                constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])],
                               np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
                prob = cp.Problem(cp.Minimize(cost_fun), constraints)
                try:
                    prob.solve(verbose=True)
                except MemoryError as error0:
                    if self._verbose >= 2:
                        print('\nUsing solver "OSQP" caused a memory error.')
                        print('the original error message is\n', error0)
                        print('solver status: ', prob.status)
                        print('trying solver "CVXOPT" instead...\n')
                    try:
                        prob.solve(solver=cp.CVXOPT, verbose=True)
                    except Exception as error1:
                        if self._verbose >= 2:
                            print('\nAn error occurred when using solver "CVXOPT".')
                            print('the original error message is\n', error1)
                            print('solver status: ', prob.status)
                            print('trying solver "MOSEK" instead. Note that this solver is commercial and a license is required.\n')
                        prob.solve(solver=cp.MOSEK, verbose=True)
                    else:
                        if self._verbose >= 2:
                            print('solver status: ', prob.status)
                else:
                    if self._verbose >= 2:
                        print('solver status: ', prob.status)
                if self._verbose >= 2:
                    print()
                edit_costs_new = x.value
                residual = np.sqrt(prob.value)
            elif rw_constraints == '2constraints':
                # c_vs <= c_vi + c_vr and c_vi == c_vr, c_ei == c_er.
                nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
                x = cp.Variable(nb_cost_mat_new.shape[1])
                cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
                constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
                               np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0,
                               np.array([1.0, -1.0, 0.0, 0.0, 0.0]).T@x == 0.0,
                               np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
                prob = cp.Problem(cp.Minimize(cost_fun), constraints)
                prob.solve()
                edit_costs_new = x.value
                residual = np.sqrt(prob.value)
            elif rw_constraints == 'no-constraint':
                # no constraint.
                nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
                x = cp.Variable(nb_cost_mat_new.shape[1])
                cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
                constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
                prob = cp.Problem(cp.Minimize(cost_fun), constraints)
                prob.solve()
                edit_costs_new = x.value
                residual = np.sqrt(prob.value)
            # elif method == 'inequality_modified':
            #     # c_vs <= c_vi + c_vr.
            #     nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
            #     x = cp.Variable(nb_cost_mat_new.shape[1])
            #     cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
            #     constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
            #                    np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
            #     prob = cp.Problem(cp.Minimize(cost_fun), constraints)
            #     prob.solve()
            #     # use same costs for insertion and removal rather than the fitted costs.
            #     edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]]
            #     edit_costs_new = np.array(edit_costs_new)
            #     residual = np.sqrt(prob.value)
        elif self.__ged_options['edit_cost'] == 'NON_SYMBOLIC':
            is_n_attr = np.count_nonzero(nb_cost_mat[:,2])
            is_e_attr = np.count_nonzero(nb_cost_mat[:,5])
            if self.__ds_name == 'SYNTHETICnew':
                # nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]]
                nb_cost_mat_new = nb_cost_mat[:,[2,3,4]]
                x = cp.Variable(nb_cost_mat_new.shape[1])
                cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
                # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
                #                np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
                # constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])]]
                constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])],
                               np.array([0.0, 1.0, -1.0]).T@x == 0.0]
                prob = cp.Problem(cp.Minimize(cost_fun), constraints)
                prob.solve()
                # print(x.value)
                edit_costs_new = np.concatenate((np.array([0.0, 0.0]), x.value,
                                                 np.array([0.0])))
                residual = np.sqrt(prob.value)
            elif rw_constraints == 'inequality':
                # c_vs <= c_vi + c_vr.
                if is_n_attr and is_e_attr:
                    nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]]
                    x = cp.Variable(nb_cost_mat_new.shape[1])
                    cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
                    constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
                                   np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
                                   np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
                    prob = cp.Problem(cp.Minimize(cost_fun), constraints)
                    prob.solve()
                    edit_costs_new = x.value
                    residual = np.sqrt(prob.value)
                elif is_n_attr and not is_e_attr:
                    nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]]
                    x = cp.Variable(nb_cost_mat_new.shape[1])
                    cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
                    constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])],
                                   np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
                    prob = cp.Problem(cp.Minimize(cost_fun), constraints)
                    prob.solve()
                    if self._verbose >= 2:
                        print(x.value)
                    edit_costs_new = np.concatenate((x.value, np.array([0.0])))
                    residual = np.sqrt(prob.value)
                elif not is_n_attr and is_e_attr:
                    nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
                    x = cp.Variable(nb_cost_mat_new.shape[1])
                    cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
                    constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
                                   np.array([0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
                    prob = cp.Problem(cp.Minimize(cost_fun), constraints)
                    prob.solve()
                    edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:]))
                    residual = np.sqrt(prob.value)
                else:
                    nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]]
                    x = cp.Variable(nb_cost_mat_new.shape[1])
                    cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
                    constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
                    prob = cp.Problem(cp.Minimize(cost_fun), constraints)
                    prob.solve()
                    edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]),
                                                     x.value[2:], np.array([0.0])))
                    residual = np.sqrt(prob.value)
        else:
            # # method 1: simple least square method.
            # edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
            #                                                  rcond=None)

            # # method 2: least square method with x_i >= 0.
            # edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec)

            # method 3: solve as a quadratic program with constraints.
            # P = np.dot(nb_cost_mat.T, nb_cost_mat)
            # q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat)
            # G = -1 * np.identity(nb_cost_mat.shape[1])
            # h = np.array([0 for i in range(nb_cost_mat.shape[1])])
            # A = np.array([1 for i in range(nb_cost_mat.shape[1])])
            # b = 1
            # x = cp.Variable(nb_cost_mat.shape[1])
            # prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q_T@x),
            #                   [G@x <= h])
            # prob.solve()
            # edit_costs_new = x.value
            # residual = prob.value - np.dot(dis_k_vec.T, dis_k_vec)

            # G = -1 * np.identity(nb_cost_mat.shape[1])
            # h = np.array([0 for i in range(nb_cost_mat.shape[1])])
            x = cp.Variable(nb_cost_mat.shape[1])
            cost_fun = cp.sum_squares(nb_cost_mat * x - dis_k_vec)
            constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])],
                           # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
                           np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
                           np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
            prob = cp.Problem(cp.Minimize(cost_fun), constraints)
            prob.solve()
            edit_costs_new = x.value
            residual = np.sqrt(prob.value)
            # method 4:

        return edit_costs_new, residual

    def __generate_preimage_iam(self):
        # Set up the ged environment.
        ged_env = gedlibpy.GEDEnv() # @todo: maybe create a ged_env as a private variable.
        # gedlibpy.restart_env()
        ged_env.set_edit_cost(self.__ged_options['edit_cost'], edit_cost_constant=self.__edit_cost_constants)
        graphs = [self.__clean_graph(g) for g in self._dataset.graphs]
        for g in graphs:
            ged_env.add_nx_graph(g, '')
        graph_ids = ged_env.get_all_graph_ids()
        set_median_id = ged_env.add_graph('set_median')
        gen_median_id = ged_env.add_graph('gen_median')
        ged_env.init(init_option=self.__ged_options['init_option'])

        # Set up the median graph estimator.
        mge = MedianGraphEstimator(ged_env, constant_node_costs(self.__ged_options['edit_cost']))
        self.__mge = mge # keep a reference so the estimator is reachable through the `mge` property.
        mge.set_refine_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options))
        options = self.__mge_options.copy()
        if 'seed' not in options:
            options['seed'] = int(round(time.time() * 1000)) # @todo: may not work correctly for possible parallel usage.

        # Select the GED algorithm.
        mge.set_options(mge_options_to_string(options))
        mge.set_init_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options))
        mge.set_descent_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options))

        # Run the estimator.
        mge.run(graph_ids, set_median_id, gen_median_id)

        # Get SODs.
        self.__sod_set_median = mge.get_sum_of_distances('initialized')
        self.__sod_gen_median = mge.get_sum_of_distances('converged')

        # Get median graphs.
        self.__set_median = ged_env.get_nx_graph(set_median_id)
        self.__gen_median = ged_env.get_nx_graph(gen_median_id)

    def __compute_distances_to_true_median(self):
        # compute distance in kernel space for set median.
        kernels_to_sm, _ = self.__graph_kernel.compute(self.__set_median, self._dataset.graphs, **self._kernel_options)
        kernel_sm, _ = self.__graph_kernel.compute(self.__set_median, self.__set_median, **self._kernel_options)
        kernels_to_sm = [kernels_to_sm[i] / np.sqrt(self.__graph_kernel.gram_matrix_unnorm[i, i] * kernel_sm) for i in range(len(kernels_to_sm))] # normalize
        # @todo: not correct kernel value
        gram_with_sm = np.concatenate((np.array([kernels_to_sm]), np.copy(self.__graph_kernel.gram_matrix)), axis=0)
        gram_with_sm = np.concatenate((np.array([[1] + kernels_to_sm]).T, gram_with_sm), axis=1)
        self.__k_dis_set_median = compute_k_dis(0, range(1, 1 + len(self._dataset.graphs)),
                                                [1 / len(self._dataset.graphs)] * len(self._dataset.graphs),
                                                gram_with_sm, withterm3=False)
        # print(gen_median.nodes(data=True))
        # print(gen_median.edges(data=True))
        # print(set_median.nodes(data=True))
        # print(set_median.edges(data=True))

        # compute distance in kernel space for generalized median.
        kernels_to_gm, _ = self.__graph_kernel.compute(self.__gen_median, self._dataset.graphs, **self._kernel_options)
        kernel_gm, _ = self.__graph_kernel.compute(self.__gen_median, self.__gen_median, **self._kernel_options)
        kernels_to_gm = [kernels_to_gm[i] / np.sqrt(self.__graph_kernel.gram_matrix_unnorm[i, i] * kernel_gm) for i in range(len(kernels_to_gm))] # normalize
        gram_with_gm = np.concatenate((np.array([kernels_to_gm]), np.copy(self.__graph_kernel.gram_matrix)), axis=0)
        gram_with_gm = np.concatenate((np.array([[1] + kernels_to_gm]).T, gram_with_gm), axis=1)
        self.__k_dis_gen_median = compute_k_dis(0, range(1, 1 + len(self._dataset.graphs)),
                                                [1 / len(self._dataset.graphs)] * len(self._dataset.graphs),
                                                gram_with_gm, withterm3=False)

        # compute distance in kernel space for each graph in median set.
        k_dis_median_set = []
        for idx in range(len(self._dataset.graphs)):
            k_dis_median_set.append(compute_k_dis(idx + 1, range(1, 1 + len(self._dataset.graphs)),
                                                  [1 / len(self._dataset.graphs)] * len(self._dataset.graphs),
                                                  gram_with_gm, withterm3=False))
        idx_k_dis_median_set_min = np.argmin(k_dis_median_set)
        self.__k_dis_dataset = k_dis_median_set[idx_k_dis_median_set_min]

        if self._verbose >= 2:
            print()
            print('distance in kernel space for set median:', self.__k_dis_set_median)
            print('distance in kernel space for generalized median:', self.__k_dis_gen_median)
            print('minimum distance in kernel space for each graph in median set:', self.__k_dis_dataset)
            print('distance in kernel space for each graph in median set:', k_dis_median_set)

        # return dis_k_sm, dis_k_gm, k_dis_median_set, dis_k_gi_min, idx_dis_k_gi_min

    def __set_graph_kernel_by_name(self):
        if self._kernel_options['name'] == 'structuralspkernel':
            from gklearn.kernels import StructuralSP
            self.__graph_kernel = StructuralSP(node_labels=self._dataset.node_labels,
                                               edge_labels=self._dataset.edge_labels,
                                               node_attrs=self._dataset.node_attrs,
                                               edge_attrs=self._dataset.edge_attrs,
                                               ds_infos=self._dataset.get_dataset_infos(keys=['directed']),
                                               **self._kernel_options)

    # def __clean_graph(self, G, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
    def __clean_graph(self, G):
        """
        Cleans node and edge labels and attributes of the given graph.
        """
        G_new = nx.Graph()
        for nd, attrs in G.nodes(data=True):
            G_new.add_node(str(nd)) # @todo: should we keep this as str()?
            for l_name in self._dataset.node_labels:
                G_new.nodes[str(nd)][l_name] = str(attrs[l_name])
            for a_name in self._dataset.node_attrs:
                G_new.nodes[str(nd)][a_name] = str(attrs[a_name])
        for nd1, nd2, attrs in G.edges(data=True):
            G_new.add_edge(str(nd1), str(nd2))
            for l_name in self._dataset.edge_labels:
                G_new.edges[str(nd1), str(nd2)][l_name] = str(attrs[l_name])
            for a_name in self._dataset.edge_attrs:
                G_new.edges[str(nd1), str(nd2)][a_name] = str(attrs[a_name])
        return G_new

    @property
    def mge(self):
        return self.__mge

    @property
    def ged_options(self):
        return self.__ged_options

    @ged_options.setter
    def ged_options(self, value):
        self.__ged_options = value

    @property
    def mge_options(self):
        return self.__mge_options

    @mge_options.setter
    def mge_options(self, value):
        self.__mge_options = value

    @property
    def fit_method(self):
        return self.__fit_method

    @fit_method.setter
    def fit_method(self, value):
        self.__fit_method = value

    @property
    def init_ecc(self):
        return self.__init_ecc

    @init_ecc.setter
    def init_ecc(self, value):
        self.__init_ecc = value
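
The core of __update_ecc is a constrained least-squares fit: each row of nb_cost_mat counts the edit operations applied between one pair of graphs, and the solver looks for a non-negative cost vector x such that the resulting GEDs reproduce the kernel-space distances dis_k_vec, with substitution costs bounded by the corresponding insertion-plus-removal costs. The standalone sketch below reproduces that formulation (the one used in the full NON_SYMBOLIC branch) on synthetic data; nb_cost_mat, true_costs and the noise level are made-up illustration values, and the modern cvxpy '@' operator replaces the deprecated '*' used in the file above.

import numpy as np
import cvxpy as cp

# Synthetic stand-ins for the quantities passed to __update_ecc:
# 10 graph pairs, 6 edit-operation counts per pair, and the kernel-space
# distances they should explain.
rng = np.random.default_rng(0)
nb_cost_mat = rng.integers(0, 5, size=(10, 6)).astype(float)
true_costs = np.array([0.6, 0.6, 0.8, 0.4, 0.4, 0.9])
dis_k_vec = nb_cost_mat @ true_costs + rng.normal(0.0, 0.05, size=10)

x = cp.Variable(nb_cost_mat.shape[1])                   # one cost per edit operation
cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec)  # fit GEDs to kernel distances
constraints = [x >= 0.01,                               # keep every cost strictly positive
               # node substitution <= node insertion + node removal
               np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]) @ x >= 0.0,
               # edge substitution <= edge insertion + edge removal
               np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]) @ x >= 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
prob.solve()
print('fitted edit costs:', x.value)
print('residual:', np.sqrt(prob.value))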

A Python package for graph kernels, graph edit distances and the graph pre-image problem.
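
A minimal driving script for the class above might look like the following sketch. The Dataset import path is taken from the commented import at the top of the file, and MedianPreimageGenerator is assumed to be exported from gklearn.preimage the same way PreimageGenerator is; the loader call, the GED method name 'IPFP', the 'init_option' string and the mge_options values are placeholders that may differ in your gklearn version, while the option keys themselves ('name', 'edit_cost', 'method', 'init_option', fit_method, max_itrs, n_jobs, verbose) are the ones the class actually reads.

import multiprocessing
from gklearn.utils.dataset import Dataset              # path from the commented import above
from gklearn.preimage import MedianPreimageGenerator

# Load a dataset; this loader call is a placeholder, adapt it to your setup.
dataset = Dataset()
dataset.load_predefined_dataset('Letter-high')          # hypothetical convenience loader

mpg = MedianPreimageGenerator(dataset=dataset)
mpg.set_options(
    kernel_options={'name': 'structuralspkernel'},      # kernel chosen in __set_graph_kernel_by_name
    ged_options={'edit_cost': 'LETTER2',                # read by __optimize_edit_cost_constants
                 'method': 'IPFP',                      # placeholder GED algorithm name
                 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'},  # placeholder init option
    mge_options={'init_type': 'MEDOID'},                # placeholder; serialized by mge_options_to_string
    fit_method='k-graphs',
    max_itrs=100,
    n_jobs=multiprocessing.cpu_count(),
    verbose=2)
mpg.run()  # prints the fitted edit costs, SODs, kernel distances and runtimes when verbose is on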