You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

fitDistance.py 7.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Wed Oct 16 14:20:06 2019
  5. @author: ljia
  6. """
  7. import numpy as np
  8. from tqdm import tqdm
  9. from itertools import combinations_with_replacement
  10. import multiprocessing
  11. from multiprocessing import Pool
  12. from functools import partial
  13. import time
  14. import random
  15. from scipy import optimize
  16. import cvxpy as cp
  17. import sys
  18. sys.path.insert(0, "../")
  19. from pygraph.utils.graphfiles import loadDataset
  20. from ged import GED, get_nb_edit_operations
  21. from utils import kernel_distance_matrix
  22. def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
  23. # c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them.
  24. random.seed(1)
  25. cost_rdm = random.sample(range(1, 10), 5)
  26. edit_costs = cost_rdm + [0]
  27. # edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
  28. # edit_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
  29. # edit_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0]
  30. idx_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
  31. # compute distances in feature space.
  32. dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, gkernel=gkernel)
  33. dis_k_vec = []
  34. for i in range(len(dis_k_mat)):
  35. for j in range(i, len(dis_k_mat)):
  36. dis_k_vec.append(dis_k_mat[i, j])
  37. dis_k_vec = np.array(dis_k_vec)
  38. residual_list = []
  39. edit_cost_list = []
  40. for itr in range(itr_max):
  41. print('\niteration', itr)
  42. # compute GEDs and numbers of edit operations.
  43. edit_cost_constant = [i for i in edit_costs]
  44. edit_cost_list.append(edit_cost_constant)
  45. ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_cost_constant,
  46. idx_nonzeros, parallel=True)
  47. residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
  48. residual_list.append(residual)
  49. # "fit" geds to distances in feature space by tuning edit costs using the
  50. # Least Squares Method.
  51. nb_cost_mat = np.array(n_edit_operations).T
  52. edit_costs_new, residual = compute_better_costs(nb_cost_mat, dis_k_vec)
  53. print('pseudo residual:', residual)
  54. for i in range(len(edit_costs_new)):
  55. if edit_costs_new[i] < 0:
  56. if edit_costs_new[i] > -1e-9:
  57. edit_costs_new[i] = 0
  58. else:
  59. raise ValueError('The edit cost is negative.')
  60. for idx, item in enumerate(idx_nonzeros):
  61. edit_costs[item] = edit_costs_new[idx]
  62. print('edit_costs:', edit_costs)
  63. print('residual_list:', residual_list)
  64. return edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat
  65. def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False):
  66. ged_mat = np.zeros((len(Gn), len(Gn)))
  67. if parallel:
  68. # print('parallel')
  69. len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
  70. ged_all = [0 for i in range(len_itr)]
  71. n_edit_operations = [[0 for i in range(len_itr)] for j in
  72. range(len(idx_nonzeros))]
  73. itr = combinations_with_replacement(range(0, len(Gn)), 2)
  74. n_jobs = multiprocessing.cpu_count()
  75. if len_itr < 100 * n_jobs:
  76. chunksize = int(len_itr / n_jobs) + 1
  77. else:
  78. chunksize = 100
  79. def init_worker(gn_toshare):
  80. global G_gn
  81. G_gn = gn_toshare
  82. do_partial = partial(_wrapper_compute_ged_parallel, edit_cost_constant,
  83. idx_nonzeros)
  84. pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,))
  85. iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
  86. desc='computing GEDs', file=sys.stdout)
  87. # iterator = pool.imap_unordered(do_partial, itr, chunksize)
  88. for i, j, dis, n_eo_tmp in iterator:
  89. idx_itr = int(len(Gn) * i + j - i * (i + 1) / 2)
  90. ged_all[idx_itr] = dis
  91. ged_mat[i][j] = dis
  92. ged_mat[j][i] = dis
  93. for idx, item in enumerate(idx_nonzeros):
  94. n_edit_operations[idx][idx_itr] = n_eo_tmp[item]
  95. # print('\n-------------------------------------------')
  96. # print(i, j, idx_itr, dis)
  97. pool.close()
  98. pool.join()
  99. else:
  100. ged_all = []
  101. n_edit_operations = [[] for i in range(len(idx_nonzeros))]
  102. for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
  103. # for i in range(len(Gn)):
  104. for j in range(i, len(Gn)):
  105. # time0 = time.time()
  106. dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], lib='gedlibpy',
  107. cost='CONSTANT', method='IPFP',
  108. edit_cost_constant=edit_cost_constant, stabilizer='min',
  109. repeat=50)
  110. # time1 = time.time() - time0
  111. # time0 = time.time()
  112. ged_all.append(dis)
  113. ged_mat[i][j] = dis
  114. ged_mat[j][i] = dis
  115. n_eo_tmp = get_nb_edit_operations(Gn[i], Gn[j], pi_forward, pi_backward)
  116. for idx, item in enumerate(idx_nonzeros):
  117. n_edit_operations[idx].append(n_eo_tmp[item])
  118. # time2 = time.time() - time0
  119. # print(time1, time2, time1 / time2)
  120. return ged_all, ged_mat, n_edit_operations
  121. def _wrapper_compute_ged_parallel(edit_cost_constant, idx_nonzeros, itr):
  122. i = itr[0]
  123. j = itr[1]
  124. dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], edit_cost_constant,
  125. idx_nonzeros)
  126. return i, j, dis, n_eo_tmp
  127. def _compute_ged_parallel(g1, g2, edit_cost_constant, idx_nonzeros):
  128. dis, pi_forward, pi_backward = GED(g1, g2, lib='gedlibpy',
  129. cost='CONSTANT', method='IPFP',
  130. edit_cost_constant=edit_cost_constant, stabilizer='min',
  131. repeat=50)
  132. n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward)
  133. return dis, n_eo_tmp
  134. def compute_better_costs(nb_cost_mat, dis_k_vec):
  135. # # method 1: simple least square method.
  136. # edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
  137. # rcond=None)
  138. # # method 2: least square method with x_i >= 0.
  139. # edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec)
  140. # method 3: solve as a quadratic program with constraints: x_i >= 0, sum(x) = 1.
  141. P = np.dot(nb_cost_mat.T, nb_cost_mat)
  142. q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat)
  143. G = -1 * np.identity(nb_cost_mat.shape[1])
  144. h = np.array([0 for i in range(nb_cost_mat.shape[1])])
  145. A = np.array([1 for i in range(nb_cost_mat.shape[1])])
  146. b = 1
  147. x = cp.Variable(nb_cost_mat.shape[1])
  148. prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q_T@x),
  149. [G@x <= h])
  150. prob.solve()
  151. edit_costs_new = x.value
  152. residual = prob.value - np.dot(dis_k_vec.T, dis_k_vec)
  153. return edit_costs_new, residual
  154. if __name__ == '__main__':
  155. from utils import remove_edges
  156. ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
  157. 'extra_params': {}} # node/edge symb
  158. Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
  159. # Gn = Gn[0:10]
  160. remove_edges(Gn)
  161. gkernel = 'marginalizedkernel'
  162. itr_max = 10
  163. time0 = time.time()
  164. edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat = \
  165. fit_GED_to_kernel_distance(Gn, gkernel, itr_max)
  166. total_time = time.time() - time0
  167. print('total time:', total_time)

A Python package for graph kernels, graph edit distances and graph pre-image problem.