You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

fitDistance.py 8.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Wed Oct 16 14:20:06 2019
  5. @author: ljia
  6. """
  7. import numpy as np
  8. from tqdm import tqdm
  9. from itertools import combinations_with_replacement
  10. import multiprocessing
  11. from multiprocessing import Pool
  12. from functools import partial
  13. import time
  14. import random
  15. from scipy import optimize
  16. import cvxpy as cp
  17. import sys
  18. sys.path.insert(0, "../")
  19. from ged import GED, get_nb_edit_operations
  20. from utils import kernel_distance_matrix
  21. def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
  22. # c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them.
  23. random.seed(1)
  24. cost_rdm = random.sample(range(1, 10), 5)
  25. edit_costs = cost_rdm + [0]
  26. # edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
  27. # edit_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
  28. # edit_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0]
  29. idx_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
  30. # compute distances in feature space.
  31. dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, gkernel=gkernel)
  32. dis_k_vec = []
  33. for i in range(len(dis_k_mat)):
  34. for j in range(i, len(dis_k_mat)):
  35. dis_k_vec.append(dis_k_mat[i, j])
  36. dis_k_vec = np.array(dis_k_vec)
  37. residual_list = []
  38. edit_cost_list = []
  39. time_list = []
  40. for itr in range(itr_max):
  41. print('\niteration', itr)
  42. time0 = time.time()
  43. # compute GEDs and numbers of edit operations.
  44. edit_cost_constant = [i for i in edit_costs]
  45. edit_cost_list.append(edit_cost_constant)
  46. ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_cost_constant,
  47. idx_nonzeros, parallel=True)
  48. residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
  49. residual_list.append(residual)
  50. # "fit" geds to distances in feature space by tuning edit costs using the
  51. # Least Squares Method.
  52. nb_cost_mat = np.array(n_edit_operations).T
  53. edit_costs_new, residual = compute_better_costs(nb_cost_mat, dis_k_vec)
  54. print('pseudo residual:', residual)
  55. for i in range(len(edit_costs_new)):
  56. if edit_costs_new[i] < 0:
  57. if edit_costs_new[i] > -1e-9:
  58. edit_costs_new[i] = 0
  59. else:
  60. raise ValueError('The edit cost is negative.')
  61. for idx, item in enumerate(idx_nonzeros):
  62. edit_costs[item] = edit_costs_new[idx]
  63. time_list.append(time.time() - time0)
  64. print('edit_costs:', edit_costs)
  65. print('residual_list:', residual_list)
  66. edit_cost_list.append(edit_costs)
  67. ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_costs,
  68. idx_nonzeros, parallel=True)
  69. residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
  70. residual_list.append(residual)
  71. return edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list
  72. def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False):
  73. ged_mat = np.zeros((len(Gn), len(Gn)))
  74. if parallel:
  75. # print('parallel')
  76. len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
  77. ged_all = [0 for i in range(len_itr)]
  78. n_edit_operations = [[0 for i in range(len_itr)] for j in
  79. range(len(idx_nonzeros))]
  80. itr = combinations_with_replacement(range(0, len(Gn)), 2)
  81. n_jobs = multiprocessing.cpu_count()
  82. if len_itr < 100 * n_jobs:
  83. chunksize = int(len_itr / n_jobs) + 1
  84. else:
  85. chunksize = 100
  86. def init_worker(gn_toshare):
  87. global G_gn
  88. G_gn = gn_toshare
  89. do_partial = partial(_wrapper_compute_ged_parallel, edit_cost_constant,
  90. idx_nonzeros)
  91. pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,))
  92. iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
  93. desc='computing GEDs', file=sys.stdout)
  94. # iterator = pool.imap_unordered(do_partial, itr, chunksize)
  95. for i, j, dis, n_eo_tmp in iterator:
  96. idx_itr = int(len(Gn) * i + j - i * (i + 1) / 2)
  97. ged_all[idx_itr] = dis
  98. ged_mat[i][j] = dis
  99. ged_mat[j][i] = dis
  100. for idx, item in enumerate(idx_nonzeros):
  101. n_edit_operations[idx][idx_itr] = n_eo_tmp[item]
  102. # print('\n-------------------------------------------')
  103. # print(i, j, idx_itr, dis)
  104. pool.close()
  105. pool.join()
  106. else:
  107. ged_all = []
  108. n_edit_operations = [[] for i in range(len(idx_nonzeros))]
  109. for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
  110. # for i in range(len(Gn)):
  111. for j in range(i, len(Gn)):
  112. # time0 = time.time()
  113. dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], lib='gedlibpy',
  114. cost='CONSTANT', method='IPFP',
  115. edit_cost_constant=edit_cost_constant, stabilizer='min',
  116. repeat=50)
  117. # time1 = time.time() - time0
  118. # time0 = time.time()
  119. ged_all.append(dis)
  120. ged_mat[i][j] = dis
  121. ged_mat[j][i] = dis
  122. n_eo_tmp = get_nb_edit_operations(Gn[i], Gn[j], pi_forward, pi_backward)
  123. for idx, item in enumerate(idx_nonzeros):
  124. n_edit_operations[idx].append(n_eo_tmp[item])
  125. # time2 = time.time() - time0
  126. # print(time1, time2, time1 / time2)
  127. return ged_all, ged_mat, n_edit_operations
  128. def _wrapper_compute_ged_parallel(edit_cost_constant, idx_nonzeros, itr):
  129. i = itr[0]
  130. j = itr[1]
  131. dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], edit_cost_constant,
  132. idx_nonzeros)
  133. return i, j, dis, n_eo_tmp
  134. def _compute_ged_parallel(g1, g2, edit_cost_constant, idx_nonzeros):
  135. dis, pi_forward, pi_backward = GED(g1, g2, lib='gedlibpy',
  136. cost='CONSTANT', method='IPFP',
  137. edit_cost_constant=edit_cost_constant, stabilizer='min',
  138. repeat=50)
  139. n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward)
  140. return dis, n_eo_tmp
  141. def compute_better_costs(nb_cost_mat, dis_k_vec):
  142. # # method 1: simple least square method.
  143. # edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
  144. # rcond=None)
  145. # # method 2: least square method with x_i >= 0.
  146. # edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec)
  147. # method 3: solve as a quadratic program with constraints: x_i >= 0, sum(x) = 1.
  148. # P = np.dot(nb_cost_mat.T, nb_cost_mat)
  149. # q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat)
  150. # G = -1 * np.identity(nb_cost_mat.shape[1])
  151. # h = np.array([0 for i in range(nb_cost_mat.shape[1])])
  152. # A = np.array([1 for i in range(nb_cost_mat.shape[1])])
  153. # b = 1
  154. # x = cp.Variable(nb_cost_mat.shape[1])
  155. # prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q_T@x),
  156. # [G@x <= h])
  157. # prob.solve()
  158. # edit_costs_new = x.value
  159. # residual = prob.value - np.dot(dis_k_vec.T, dis_k_vec)
  160. # G = -1 * np.identity(nb_cost_mat.shape[1])
  161. # h = np.array([0 for i in range(nb_cost_mat.shape[1])])
  162. x = cp.Variable(nb_cost_mat.shape[1])
  163. cost = cp.sum_squares(nb_cost_mat * x - dis_k_vec)
  164. constraints = [x >= [0 for i in range(nb_cost_mat.shape[1])]]
  165. prob = cp.Problem(cp.Minimize(cost), constraints)
  166. prob.solve()
  167. edit_costs_new = x.value
  168. residual = np.sqrt(prob.value)
  169. # method 4:
  170. return edit_costs_new, residual
  171. if __name__ == '__main__':
  172. print('check test_fitDistance.py')

A Python package for graph kernels, graph edit distances, and the graph pre-image problem.