You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

test_fitDistance.py 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Thu Oct 24 11:50:56 2019
  5. @author: ljia
  6. """
  7. from matplotlib import pyplot as plt
  8. import numpy as np
  9. from tqdm import tqdm
  10. import sys
  11. sys.path.insert(0, "../")
  12. from pygraph.utils.graphfiles import loadDataset
  13. from utils import remove_edges
  14. from fitDistance import fit_GED_to_kernel_distance
  15. from utils import normalize_distance_matrix
  16. def test_anycosts():
  17. ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
  18. 'extra_params': {}} # node/edge symb
  19. Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
  20. # Gn = Gn[0:10]
  21. remove_edges(Gn)
  22. gkernel = 'marginalizedkernel'
  23. itr_max = 10
  24. edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
  25. nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, gkernel, itr_max)
  26. total_time = np.sum(time_list)
  27. print('\nedit_costs:', edit_costs)
  28. print('\nresidual_list:', residual_list)
  29. print('\nedit_cost_list:', edit_cost_list)
  30. print('\ndistance matrix in kernel space:', dis_k_mat)
  31. print('\nged matrix:', ged_mat)
  32. print('\ntotal time:', total_time)
  33. print('\nnb_cost_mat:', nb_cost_mat_list[-1])
  34. np.savez('results/fit_distance.any_costs.gm', edit_costs=edit_costs,
  35. residual_list=residual_list, edit_cost_list=edit_cost_list,
  36. dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  37. total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
  38. # # normalized distance matrices.
  39. # gmfile = np.load('results/fit_distance.any_costs.gm.npz')
  40. # edit_costs = gmfile['edit_costs']
  41. # residual_list = gmfile['residual_list']
  42. # edit_cost_list = gmfile['edit_cost_list']
  43. # dis_k_mat = gmfile['dis_k_mat']
  44. # ged_mat = gmfile['ged_mat']
  45. # total_time = gmfile['total_time']
  46. ## nb_cost_mat_list = gmfile['nb_cost_mat_list']
  47. norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
  48. plt.imshow(norm_dis_k_mat)
  49. plt.colorbar()
  50. plt.savefig('results/norm_dis_k_mat.any_costs' + '.eps', format='eps', dpi=300)
  51. # plt.savefig('results/norm_dis_k_mat.any_costs' + '.png', format='png')
  52. # plt.show()
  53. plt.clf()
  54. norm_ged_mat = normalize_distance_matrix(ged_mat)
  55. plt.imshow(norm_ged_mat)
  56. plt.colorbar()
  57. plt.savefig('results/norm_ged_mat.any_costs' + '.eps', format='eps', dpi=300)
  58. # plt.savefig('results/norm_ged_mat.any_costs' + '.png', format='png')
  59. # plt.show()
  60. plt.clf()
  61. norm_diff = norm_ged_mat - norm_dis_k_mat
  62. plt.imshow(norm_diff)
  63. plt.colorbar()
  64. plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.eps', format='eps', dpi=300)
  65. # plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.png', format='png')
  66. # plt.show()
  67. plt.clf()
  68. # draw_count_bar(norm_diff)
  69. def test_cs_leq_ci_plus_cr():
  70. """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er
  71. """
  72. ds = {'name': 'monoterpenoides',
  73. 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
  74. Gn, y_all = loadDataset(ds['dataset'])
  75. # Gn = Gn[0:10]
  76. gkernel = 'untilhpathkernel'
  77. node_label = 'atom'
  78. edge_label = 'bond_type'
  79. itr_max = 10
  80. edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
  81. nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
  82. gkernel, itr_max,
  83. fitkernel='gaussian')
  84. total_time = np.sum(time_list)
  85. print('\nedit_costs:', edit_costs)
  86. print('\nresidual_list:', residual_list)
  87. print('\nedit_cost_list:', edit_cost_list)
  88. print('\ndistance matrix in kernel space:', dis_k_mat)
  89. print('\nged matrix:', ged_mat)
  90. print('\ntotal time:', total_time)
  91. print('\nnb_cost_mat:', nb_cost_mat_list[-1])
  92. np.savez('results/fit_distance.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel.gm',
  93. edit_costs=edit_costs,
  94. residual_list=residual_list, edit_cost_list=edit_cost_list,
  95. dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  96. total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
  97. coef_dk=coef_dk)
  98. # ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
  99. # 'extra_params': {}} # node/edge symb
  100. # Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
  101. ## Gn = Gn[0:10]
  102. ## remove_edges(Gn)
  103. # gkernel = 'untilhpathkernel'
  104. # node_label = 'atom'
  105. # edge_label = 'bond_type'
  106. # itr_max = 10
  107. # edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
  108. # nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
  109. # gkernel, itr_max)
  110. # total_time = np.sum(time_list)
  111. # print('\nedit_costs:', edit_costs)
  112. # print('\nresidual_list:', residual_list)
  113. # print('\nedit_cost_list:', edit_cost_list)
  114. # print('\ndistance matrix in kernel space:', dis_k_mat)
  115. # print('\nged matrix:', ged_mat)
  116. # print('\ntotal time:', total_time)
  117. # print('\nnb_cost_mat:', nb_cost_mat_list[-1])
  118. # np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.mutag.elabeled.uhpkernel.gm',
  119. # edit_costs=edit_costs,
  120. # residual_list=residual_list, edit_cost_list=edit_cost_list,
  121. # dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  122. # total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk)
  123. # # normalized distance matrices.
  124. # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
  125. # edit_costs = gmfile['edit_costs']
  126. # residual_list = gmfile['residual_list']
  127. # edit_cost_list = gmfile['edit_cost_list']
  128. # dis_k_mat = gmfile['dis_k_mat']
  129. # ged_mat = gmfile['ged_mat']
  130. # total_time = gmfile['total_time']
  131. # nb_cost_mat_list = gmfile['nb_cost_mat_list']
  132. # coef_dk = gmfile['coef_dk']
  133. nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
  134. print(nb_consistent, nb_inconsistent, ratio_consistent)
  135. # dis_k_sub = pairwise_substitution(dis_k_mat)
  136. # ged_sub = pairwise_substitution(ged_mat)
  137. # np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.cost_leq_1en2.gm',
  138. # dis_k_sub=dis_k_sub, ged_sub=ged_sub)
  139. norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
  140. plt.imshow(norm_dis_k_mat)
  141. plt.colorbar()
  142. plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
  143. + '.eps', format='eps', dpi=300)
  144. plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
  145. + '.png', format='png')
  146. # plt.show()
  147. plt.clf()
  148. norm_ged_mat = normalize_distance_matrix(ged_mat)
  149. plt.imshow(norm_ged_mat)
  150. plt.colorbar()
  151. plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
  152. + '.eps', format='eps', dpi=300)
  153. plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
  154. + '.png', format='png')
  155. # plt.show()
  156. plt.clf()
  157. norm_diff = norm_ged_mat - norm_dis_k_mat
  158. plt.imshow(norm_diff)
  159. plt.colorbar()
  160. plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
  161. + '.eps', format='eps', dpi=300)
  162. plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
  163. + '.png', format='png')
  164. # plt.show()
  165. plt.clf()
  166. # draw_count_bar(norm_diff)
  167. def test_unfitted():
  168. """unfitted.
  169. """
  170. from fitDistance import compute_geds
  171. from utils import kernel_distance_matrix
  172. ds = {'name': 'monoterpenoides',
  173. 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
  174. Gn, y_all = loadDataset(ds['dataset'])
  175. # Gn = Gn[0:10]
  176. gkernel = 'untilhpathkernel'
  177. node_label = 'atom'
  178. edge_label = 'bond_type'
  179. # ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
  180. # 'extra_params': {}} # node/edge symb
  181. # Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
  182. ## Gn = Gn[0:10]
  183. ## remove_edges(Gn)
  184. # gkernel = 'marginalizedkernel'
  185. dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, gkernel=gkernel)
  186. ged_all, ged_mat, n_edit_operations = compute_geds(Gn, [3, 3, 1, 3, 3, 1],
  187. [0, 1, 2, 3, 4, 5], parallel=True)
  188. print('\ndistance matrix in kernel space:', dis_k_mat)
  189. print('\nged matrix:', ged_mat)
  190. # np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.gm', edit_costs=edit_costs,
  191. # residual_list=residual_list, edit_cost_list=edit_cost_list,
  192. # dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  193. # total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
  194. # normalized distance matrices.
  195. # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en3.gm.npz')
  196. # edit_costs = gmfile['edit_costs']
  197. # residual_list = gmfile['residual_list']
  198. # edit_cost_list = gmfile['edit_cost_list']
  199. # dis_k_mat = gmfile['dis_k_mat']
  200. # ged_mat = gmfile['ged_mat']
  201. # total_time = gmfile['total_time']
  202. # nb_cost_mat_list = gmfile['nb_cost_mat_list']
  203. nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
  204. print(nb_consistent, nb_inconsistent, ratio_consistent)
  205. norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
  206. plt.imshow(norm_dis_k_mat)
  207. plt.colorbar()
  208. plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
  209. plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.png', format='png')
  210. # plt.show()
  211. plt.clf()
  212. norm_ged_mat = normalize_distance_matrix(ged_mat)
  213. plt.imshow(norm_ged_mat)
  214. plt.colorbar()
  215. plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
  216. plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.png', format='png')
  217. # plt.show()
  218. plt.clf()
  219. norm_diff = norm_ged_mat - norm_dis_k_mat
  220. plt.imshow(norm_diff)
  221. plt.colorbar()
  222. plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
  223. plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.png', format='png')
  224. # plt.show()
  225. plt.clf()
  226. draw_count_bar(norm_diff)
  227. def pairwise_substitution_consistence(mat1, mat2):
  228. """
  229. """
  230. nb_consistent = 0
  231. nb_inconsistent = 0
  232. # the matrix is considered symmetric.
  233. upper_tri1 = mat1[np.triu_indices_from(mat1)]
  234. upper_tri2 = mat2[np.tril_indices_from(mat2)]
  235. for i in tqdm(range(len(upper_tri1)), desc='computing consistence', file=sys.stdout):
  236. for j in range(i, len(upper_tri1)):
  237. if np.sign(upper_tri1[i] - upper_tri1[j]) == np.sign(upper_tri2[i] - upper_tri2[j]):
  238. nb_consistent += 1
  239. else:
  240. nb_inconsistent += 1
  241. return nb_consistent, nb_inconsistent, nb_consistent / (nb_consistent + nb_inconsistent)
  242. def pairwise_substitution(mat):
  243. # the matrix is considered symmetric.
  244. upper_tri = mat[np.triu_indices_from(mat)]
  245. sub_list = []
  246. for i in tqdm(range(len(upper_tri)), desc='computing', file=sys.stdout):
  247. for j in range(i, len(upper_tri)):
  248. sub_list.append(upper_tri[i] - upper_tri[j])
  249. return sub_list
  250. def draw_count_bar(norm_diff):
  251. import pandas
  252. from collections import Counter, OrderedDict
  253. norm_diff_cnt = norm_diff.flatten()
  254. norm_diff_cnt = norm_diff_cnt * 10
  255. norm_diff_cnt = np.floor(norm_diff_cnt)
  256. norm_diff_cnt = Counter(norm_diff_cnt)
  257. norm_diff_cnt = OrderedDict(sorted(norm_diff_cnt.items()))
  258. df = pandas.DataFrame.from_dict(norm_diff_cnt, orient='index')
  259. df.plot(kind='bar')
if __name__ == '__main__':
    # Script entry point: exactly one experiment is enabled at a time.
    # Uncomment another call (and comment the active one) to switch.
#    test_anycosts()
    test_cs_leq_ci_plus_cr()
#    test_unfitted()

    # Manual check of pairwise_substitution on a small 3x3 matrix.
#    x = np.array([[1,2,3],[4,5,6],[7,8,9]])
#    xx = pairwise_substitution(x)

A Python package for graph kernels, graph edit distances and the graph pre-image problem.