You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

fitDistance.py 9.5 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Wed Oct 16 14:20:06 2019
  5. @author: ljia
  6. """
  7. import numpy as np
  8. from tqdm import tqdm
  9. from itertools import combinations_with_replacement
  10. import multiprocessing
  11. from multiprocessing import Pool
  12. from functools import partial
  13. import time
  14. import random
  15. from scipy import optimize
  16. import cvxpy as cp
  17. import sys
  18. #sys.path.insert(0, "../")
  19. from ged import GED, get_nb_edit_operations
  20. from utils import kernel_distance_matrix
  21. def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max,
  22. fitkernel=None, gamma=1.0):
  23. # c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them.
  24. # random.seed(1)
  25. cost_rdm = random.sample(range(1, 10), 6)
  26. # edit_costs = cost_rdm + [0]
  27. edit_costs = cost_rdm
  28. # edit_costs = [i * 0.01 for i in cost_rdm] + [0]
  29. # edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
  30. # edit_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
  31. # edit_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0]
  32. idx_cost_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
  33. # compute distances in feature space.
  34. coef_dk = 1
  35. dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, gkernel=gkernel)
  36. dis_k_vec = []
  37. for i in range(len(dis_k_mat)):
  38. for j in range(i, len(dis_k_mat)):
  39. dis_k_vec.append(dis_k_mat[i, j])
  40. dis_k_vec = np.array(dis_k_vec)
  41. if fitkernel == None:
  42. dis_k_vec_ajusted = dis_k_vec
  43. elif fitkernel == 'gaussian':
  44. coef_dk = 1 / np.max(dis_k_vec)
  45. idx_dk_nonzeros = np.where(dis_k_vec != 0)[0]
  46. # remove 0's and constraint d_k between 0 and 1.
  47. dis_k_vec = dis_k_vec[idx_dk_nonzeros] * coef_dk
  48. dis_k_vec_ajusted = np.sqrt(-np.log(dis_k_vec) / gamma)
  49. residual_list = []
  50. edit_cost_list = []
  51. time_list = []
  52. nb_cost_mat_list = []
  53. for itr in range(itr_max):
  54. print('\niteration', itr)
  55. time0 = time.time()
  56. # compute GEDs and numbers of edit operations.
  57. edit_cost_constant = [i for i in edit_costs]
  58. edit_cost_list.append(edit_cost_constant)
  59. ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_cost_constant,
  60. idx_cost_nonzeros, parallel=True)
  61. if fitkernel == None:
  62. residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
  63. elif fitkernel == 'gaussian':
  64. ged_all = np.array(ged_all)[idx_dk_nonzeros]
  65. residual = np.sqrt(np.sum(np.square(
  66. np.exp(-gamma * ged_all ** 2) / coef_dk - dis_k_vec)))
  67. residual_list.append(residual)
  68. # "fit" geds to distances in feature space by tuning edit costs using the
  69. # Least Squares Method.
  70. nb_cost_mat = np.array(n_edit_operations).T
  71. if fitkernel == 'gaussian':
  72. nb_cost_mat = nb_cost_mat[idx_dk_nonzeros]
  73. nb_cost_mat_list.append(nb_cost_mat)
  74. edit_costs_new, residual = compute_better_costs(nb_cost_mat, dis_k_vec_ajusted)
  75. print('pseudo residual:', residual)
  76. for i in range(len(edit_costs_new)):
  77. if edit_costs_new[i] < 0:
  78. if edit_costs_new[i] > -1e-9:
  79. edit_costs_new[i] = 0
  80. else:
  81. raise ValueError('The edit cost is negative.')
  82. for idx, item in enumerate(idx_cost_nonzeros):
  83. edit_costs[item] = edit_costs_new[idx]
  84. time_list.append(time.time() - time0)
  85. print('edit_costs:', edit_costs)
  86. print('residual_list:', residual_list)
  87. print()
  88. edit_cost_list.append(edit_costs)
  89. ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_costs,
  90. idx_cost_nonzeros, parallel=True)
  91. if fitkernel == 0:
  92. residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
  93. elif fitkernel == 'gaussian':
  94. ged_all = np.array(ged_all)[idx_dk_nonzeros]
  95. residual = np.sqrt(np.sum(np.square(
  96. np.exp(-gamma * ged_all ** 2) / coef_dk - dis_k_vec)))
  97. residual_list.append(residual)
  98. nb_cost_mat_list.append(np.array(n_edit_operations).T)
  99. return edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, \
  100. time_list, nb_cost_mat_list, coef_dk
  101. def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False):
  102. ged_mat = np.zeros((len(Gn), len(Gn)))
  103. if parallel:
  104. # print('parallel')
  105. len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
  106. ged_all = [0 for i in range(len_itr)]
  107. n_edit_operations = [[0 for i in range(len_itr)] for j in
  108. range(len(idx_nonzeros))]
  109. itr = combinations_with_replacement(range(0, len(Gn)), 2)
  110. n_jobs = multiprocessing.cpu_count()
  111. if len_itr < 100 * n_jobs:
  112. chunksize = int(len_itr / n_jobs) + 1
  113. else:
  114. chunksize = 100
  115. def init_worker(gn_toshare):
  116. global G_gn
  117. G_gn = gn_toshare
  118. do_partial = partial(_wrapper_compute_ged_parallel, edit_cost_constant,
  119. idx_nonzeros)
  120. pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,))
  121. iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
  122. desc='computing GEDs', file=sys.stdout)
  123. # iterator = pool.imap_unordered(do_partial, itr, chunksize)
  124. for i, j, dis, n_eo_tmp in iterator:
  125. idx_itr = int(len(Gn) * i + j - i * (i + 1) / 2)
  126. ged_all[idx_itr] = dis
  127. ged_mat[i][j] = dis
  128. ged_mat[j][i] = dis
  129. for idx, item in enumerate(idx_nonzeros):
  130. n_edit_operations[idx][idx_itr] = n_eo_tmp[item]
  131. # print('\n-------------------------------------------')
  132. # print(i, j, idx_itr, dis)
  133. pool.close()
  134. pool.join()
  135. else:
  136. ged_all = []
  137. n_edit_operations = [[] for i in range(len(idx_nonzeros))]
  138. for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
  139. # for i in range(len(Gn)):
  140. for j in range(i, len(Gn)):
  141. # time0 = time.time()
  142. dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], lib='gedlibpy',
  143. cost='CONSTANT', method='IPFP',
  144. edit_cost_constant=edit_cost_constant, stabilizer='min',
  145. repeat=50)
  146. # time1 = time.time() - time0
  147. # time0 = time.time()
  148. ged_all.append(dis)
  149. ged_mat[i][j] = dis
  150. ged_mat[j][i] = dis
  151. n_eo_tmp = get_nb_edit_operations(Gn[i], Gn[j], pi_forward, pi_backward)
  152. for idx, item in enumerate(idx_nonzeros):
  153. n_edit_operations[idx].append(n_eo_tmp[item])
  154. # time2 = time.time() - time0
  155. # print(time1, time2, time1 / time2)
  156. return ged_all, ged_mat, n_edit_operations
  157. def _wrapper_compute_ged_parallel(edit_cost_constant, idx_nonzeros, itr):
  158. i = itr[0]
  159. j = itr[1]
  160. dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], edit_cost_constant,
  161. idx_nonzeros)
  162. return i, j, dis, n_eo_tmp
  163. def _compute_ged_parallel(g1, g2, edit_cost_constant, idx_nonzeros):
  164. dis, pi_forward, pi_backward = GED(g1, g2, lib='gedlibpy',
  165. cost='CONSTANT', method='IPFP',
  166. edit_cost_constant=edit_cost_constant, stabilizer='min',
  167. repeat=50)
  168. n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward)
  169. return dis, n_eo_tmp
  170. def compute_better_costs(nb_cost_mat, dis_k_vec):
  171. # # method 1: simple least square method.
  172. # edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
  173. # rcond=None)
  174. # # method 2: least square method with x_i >= 0.
  175. # edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec)
  176. # method 3: solve as a quadratic program with constraints: x_i >= 0, sum(x) = 1.
  177. # P = np.dot(nb_cost_mat.T, nb_cost_mat)
  178. # q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat)
  179. # G = -1 * np.identity(nb_cost_mat.shape[1])
  180. # h = np.array([0 for i in range(nb_cost_mat.shape[1])])
  181. # A = np.array([1 for i in range(nb_cost_mat.shape[1])])
  182. # b = 1
  183. # x = cp.Variable(nb_cost_mat.shape[1])
  184. # prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q_T@x),
  185. # [G@x <= h])
  186. # prob.solve()
  187. # edit_costs_new = x.value
  188. # residual = prob.value - np.dot(dis_k_vec.T, dis_k_vec)
  189. # G = -1 * np.identity(nb_cost_mat.shape[1])
  190. # h = np.array([0 for i in range(nb_cost_mat.shape[1])])
  191. x = cp.Variable(nb_cost_mat.shape[1])
  192. cost = cp.sum_squares(nb_cost_mat * x - dis_k_vec)
  193. constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])],
  194. # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
  195. np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
  196. np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
  197. prob = cp.Problem(cp.Minimize(cost), constraints)
  198. prob.solve()
  199. edit_costs_new = x.value
  200. residual = np.sqrt(prob.value)
  201. # method 4:
  202. return edit_costs_new, residual
  203. if __name__ == '__main__':
  204. print('check test_fitDistance.py')

A Python package for graph kernels, graph edit distances and graph pre-image problem.