You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

fitDistance.py 8.1 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Wed Oct 16 14:20:06 2019
  5. @author: ljia
  6. """
  7. import numpy as np
  8. from tqdm import tqdm
  9. from itertools import combinations_with_replacement, combinations
  10. import multiprocessing
  11. from multiprocessing import Pool
  12. from functools import partial
  13. import time
  14. import random
  15. from scipy import optimize
  16. import cvxpy as cp
  17. import sys
  18. #sys.path.insert(0, "../")
  19. from ged import GED, get_nb_edit_operations
  20. from utils import kernel_distance_matrix
  def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, k=4,
                                 params_ged=None, init_costs=None, parallel=True):
      """Iteratively tune GED edit costs so that GEDs approximate kernel distances.

      The pairwise distances in kernel feature space are computed once. Then,
      for ``itr_max`` iterations, the edit costs are re-fitted from the current
      numbers of edit operations (see ``update_costs``) and the GEDs are
      recomputed with the new costs (see ``compute_geds``).

      Parameters
      ----------
      Gn : sequence of graphs
          Graphs between which distances are computed.
      node_label, edge_label, gkernel
          Forwarded unchanged to ``kernel_distance_matrix``.
      itr_max : int
          Number of fitting iterations. With 0 the initial costs are returned.
      k : int, optional
          Not used by this function; kept for interface compatibility.
      params_ged : dict, optional
          Keyword arguments forwarded to ``GED``. Defaults to
          ``{'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
          'stabilizer': None}``. The dict is copied, so the caller's object is
          not modified; the key ``'edit_cost_constant'`` is set on the copy.
      init_costs : list, optional
          Initial edit costs, ordered c_vi, c_vr, c_vs, c_ei, c_er, c_es
          (or parts of them). Defaults to ``[3, 3, 1, 3, 3, 1]``.
      parallel : bool, optional
          Forwarded to ``compute_geds``.

      Returns
      -------
      tuple
          ``(edit_costs_new, residual_list, edit_cost_list, dis_k_mat, ged_mat,
          time_list, nb_cost_mat_list)`` where the lists hold one entry for
          the initial state followed by one entry per iteration.

      Raises
      ------
      ValueError
          If a fitted edit cost is negative by more than 1e-9.
      """
      # Build per-call defaults instead of sharing mutable default arguments,
      # and copy caller-supplied containers so they are never mutated here.
      if params_ged is None:
          params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT',
                        'method': 'IPFP', 'stabilizer': None}
      else:
          params_ged = dict(params_ged)
      if init_costs is None:
          init_costs = [3, 3, 1, 3, 3, 1]
      else:
          init_costs = list(init_costs)

      # Compute distances in feature space and flatten the strict upper
      # triangle in row-major order (same order as the pairs in compute_geds).
      dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label,
                                                  gkernel=gkernel)
      n_graphs = len(dis_k_mat)
      dis_k_vec = np.array([dis_k_mat[i, j]
                            for i in range(n_graphs)
                            for j in range(i + 1, n_graphs)])

      def _residual(ged_vec):
          # Euclidean norm of the difference between GEDs and kernel distances.
          return np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec)))

      # Initial GEDs with the initial costs.
      print('\ninitial:')
      time0 = time.time()
      params_ged['edit_cost_constant'] = init_costs
      ged_vec_init, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
                                                              parallel=parallel)
      residual_list = [_residual(ged_vec_init)]
      time_list = [time.time() - time0]
      edit_cost_list = [init_costs]
      nb_cost_mat = np.array(n_edit_operations)
      nb_cost_mat_list = [nb_cost_mat]
      print('edit_costs:', init_costs)
      print('residual_list:', residual_list)

      # Ensures a defined return value when itr_max == 0.
      edit_costs_new = init_costs

      for itr in range(itr_max):
          print('\niteration', itr)
          time0 = time.time()
          # "Fit" GEDs to distances in feature space by tuning the edit costs
          # using the least squares method.
          edit_costs_new, _ = update_costs(nb_cost_mat, dis_k_vec)
          for idx, cost in enumerate(edit_costs_new):
              if cost < 0:
                  # Tolerate tiny negative values from numerical noise only.
                  if cost > -1e-9:
                      edit_costs_new[idx] = 0
                  else:
                      raise ValueError('The edit cost is negative.')

          # Compute new GEDs and numbers of edit operations.
          params_ged['edit_cost_constant'] = edit_costs_new
          ged_vec, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
                                                             parallel=parallel)
          residual_list.append(_residual(ged_vec))
          time_list.append(time.time() - time0)
          edit_cost_list.append(edit_costs_new)
          nb_cost_mat = np.array(n_edit_operations)
          nb_cost_mat_list.append(nb_cost_mat)
          print('edit_costs:', edit_costs_new)
          print('residual_list:', residual_list)

      return edit_costs_new, residual_list, edit_cost_list, dis_k_mat, ged_mat, \
          time_list, nb_cost_mat_list
  def _init_worker_ged(gn_toshare):
      """Pool initializer: store the graph list in the worker's global ``G_gn``.

      Defined at module level (not nested) so it can be pickled when the
      multiprocessing start method is not ``fork``.
      """
      global G_gn
      G_gn = gn_toshare


  def compute_geds(Gn, params_ged, parallel=False):
      """Compute GEDs and edit-operation counts for all unordered graph pairs.

      Parameters
      ----------
      Gn : sequence of graphs
          Graphs to compare; pairs ``(i, j)`` with ``i < j`` are evaluated.
      params_ged : dict
          Keyword arguments forwarded to ``GED``.
      parallel : bool, optional
          If True, distribute the pairs over a ``multiprocessing.Pool`` with
          one process per CPU; otherwise compute sequentially.

      Returns
      -------
      ged_vec : list
          Distances for the pairs in row-major upper-triangle order.
      ged_mat : numpy.ndarray
          Symmetric ``len(Gn) x len(Gn)`` matrix of the same distances
          (zero diagonal).
      n_edit_operations : list
          Result of ``get_nb_edit_operations`` for each pair, in the same
          order as ``ged_vec``.
      """
      n_graphs = len(Gn)
      ged_mat = np.zeros((n_graphs, n_graphs))
      if parallel:
          # Number of unordered pairs; the product is always even.
          len_itr = n_graphs * (n_graphs - 1) // 2
          ged_vec = [0] * len_itr
          n_edit_operations = [0] * len_itr
          itr = combinations(range(n_graphs), 2)
          n_jobs = multiprocessing.cpu_count()
          if len_itr < 100 * n_jobs:
              chunksize = len_itr // n_jobs + 1
          else:
              chunksize = 100
          do_partial = partial(_wrapper_compute_ged_parallel, params_ged)
          pool = Pool(processes=n_jobs, initializer=_init_worker_ged,
                      initargs=(Gn,))
          try:
              iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
                              desc='computing GEDs', file=sys.stdout)
              for i, j, dis, n_eo_tmp in iterator:
                  # Results arrive unordered: map pair (i, j), i < j, to its
                  # position in the row-major upper-triangle vector.
                  # (i + 1) * (i + 2) is always even, so // is exact.
                  idx_itr = n_graphs * i + j - (i + 1) * (i + 2) // 2
                  ged_vec[idx_itr] = dis
                  ged_mat[i][j] = dis
                  ged_mat[j][i] = dis
                  n_edit_operations[idx_itr] = n_eo_tmp
          except BaseException:
              # Do not leave worker processes running if anything fails.
              pool.terminate()
              pool.join()
              raise
          else:
              pool.close()
              pool.join()
      else:
          ged_vec = []
          n_edit_operations = []
          for i in tqdm(range(n_graphs), desc='computing GEDs', file=sys.stdout):
              for j in range(i + 1, n_graphs):
                  dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], **params_ged)
                  ged_vec.append(dis)
                  ged_mat[i][j] = dis
                  ged_mat[j][i] = dis
                  n_eo_tmp = get_nb_edit_operations(Gn[i], Gn[j],
                                                    pi_forward, pi_backward)
                  n_edit_operations.append(n_eo_tmp)

      return ged_vec, ged_mat, n_edit_operations
  def _wrapper_compute_ged_parallel(params_ged, itr):
      """Worker entry point: compute the GED for one index pair.

      ``itr`` holds the two indices of the graphs to compare; the graphs are
      looked up in the global ``G_gn``. Returns
      ``(i, j, distance, edit_operation_counts)`` so the caller can place the
      result regardless of the order in which results arrive.
      """
      first, second = itr[0], itr[1]
      result = _compute_ged_parallel(G_gn[first], G_gn[second], params_ged)
      distance, edit_ops = result
      return first, second, distance, edit_ops
  def _compute_ged_parallel(g1, g2, params_ged):
      """Return the GED between ``g1`` and ``g2`` and the edit-operation counts.

      ``params_ged`` is expanded into keyword arguments for ``GED``; the two
      mappings it returns are passed on to ``get_nb_edit_operations``.
      """
      distance, forward_map, backward_map = GED(g1, g2, **params_ged)
      edit_ops = get_nb_edit_operations(g1, g2, forward_map, backward_map)
      return distance, edit_ops
  def update_costs(nb_cost_mat, dis_k_vec):
      """Fit edit costs to the kernel distances by constrained least squares.

      Solves ``min_x || nb_cost_mat @ x - dis_k_vec ||^2`` with cvxpy, subject
      to:

      * every cost is at least 0.0001;
      * ``x[0] + x[1] - x[2] >= 0``;
      * ``x[3] + x[4] - x[5] >= 0``.

      The last two constraints are written for exactly 6 costs, so
      ``nb_cost_mat`` must have 6 columns.

      Parameters
      ----------
      nb_cost_mat : numpy.ndarray
          Matrix with one row per graph pair and one column per edit cost.
      dis_k_vec : numpy.ndarray
          Target distances, one per graph pair.

      Returns
      -------
      edit_costs_new : numpy.ndarray
          The fitted costs.
      residual : float
          Square root of the optimal objective value.

      Raises
      ------
      RuntimeError
          If the solver does not produce a solution.
      """
      n_costs = nb_cost_mat.shape[1]
      x = cp.Variable(n_costs)
      # Use '@' for the matrix-vector product: '*' as matrix multiplication
      # is deprecated in cvxpy.
      cost = cp.sum_squares(nb_cost_mat @ x - dis_k_vec)
      constraints = [
          x >= [0.0001 for _ in range(n_costs)],
          np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T @ x >= 0.0,
          np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T @ x >= 0.0,
      ]
      prob = cp.Problem(cp.Minimize(cost), constraints)
      prob.solve()
      if x.value is None:
          # Fail here with a clear message instead of handing None to callers.
          raise RuntimeError(
              'Edit cost optimization failed with solver status: %s' % prob.status)
      edit_costs_new = x.value
      residual = np.sqrt(prob.value)

      return edit_costs_new, residual
  # Script entry point: this module has no demo of its own, it only points to
  # the companion test script.
  if __name__ == "__main__":
      print("check test_fitDistance.py")

A Python package for graph kernels, graph edit distances and graph pre-image problem.