You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

test_fitDistance.py 30 kB

5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Thu Oct 24 11:50:56 2019
  5. @author: ljia
  6. """
  7. from matplotlib import pyplot as plt
  8. import numpy as np
  9. from tqdm import tqdm
  10. import sys
  11. sys.path.insert(0, "../")
  12. from gklearn.utils.graphfiles import loadDataset
  13. from utils import remove_edges
  14. from fitDistance import fit_GED_to_kernel_distance
  15. from utils import normalize_distance_matrix
  16. def test_update_costs():
  17. from preimage.fitDistance import update_costs
  18. import cvxpy as cp
  19. ds = np.load('results/xp_fit_method/fit_data_debug4.gm.npz')
  20. nb_cost_mat = ds['nb_cost_mat']
  21. dis_k_vec = ds['dis_k_vec']
  22. n_edit_operations = ds['n_edit_operations']
  23. ged_vec_init = ds['ged_vec_init']
  24. ged_mat = ds['ged_mat']
  25. nb_cost_mat_new = nb_cost_mat[:,[2,3,4]]
  26. x = cp.Variable(nb_cost_mat_new.shape[1])
  27. cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
  28. # constraints = [x >= [0.000 for i in range(nb_cost_mat_new.shape[1])],
  29. # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
  30. # constraints = [x >= [0.000 for i in range(nb_cost_mat_new.shape[1])],
  31. # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0,
  32. # np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
  33. constraints = [x >= [0.00 for i in range(nb_cost_mat_new.shape[1])],
  34. np.array([0.0, 1.0, -1.0]).T@x == 0.0]
  35. # constraints = [x >= [0.00000 for i in range(nb_cost_mat_new.shape[1])]]
  36. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  37. prob.solve()
  38. print(x.value)
  39. edit_costs_new = np.concatenate((x.value, np.array([0.0])))
  40. residual = np.sqrt(prob.value)
  41. def median_paper_clcpc_python_best():
  42. """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
  43. python invoking the c++ code by bash command (with updated library).
  44. """
  45. # ds = {'name': 'monoterpenoides',
  46. # 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
  47. # _, y_all = loadDataset(ds['dataset'])
  48. gkernel = 'untilhpathkernel'
  49. node_label = 'atom'
  50. edge_label = 'bond_type'
  51. itr_max = 6
  52. algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
  53. params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
  54. 'algo_options': algo_options, 'stabilizer': None}
  55. y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
  56. repeats = 50
  57. collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
  58. graph_dir = collection_path + 'gxl/'
  59. fn_edit_costs_output = 'results/median_paper/edit_costs_output.python_init40.k10.txt'
  60. for y in y_all:
  61. for repeat in range(repeats):
  62. edit_costs_output_file = open(fn_edit_costs_output, 'a')
  63. collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
  64. Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
  65. edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
  66. nb_cost_mat_list = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
  67. gkernel, itr_max, params_ged=params_ged,
  68. parallel=True)
  69. total_time = np.sum(time_list)
  70. # print('\nedit_costs:', edit_costs)
  71. # print('\nresidual_list:', residual_list)
  72. # print('\nedit_cost_list:', edit_cost_list)
  73. # print('\ndistance matrix in kernel space:', dis_k_mat)
  74. # print('\nged matrix:', ged_mat)
  75. # print('\ntotal time:', total_time)
  76. # print('\nnb_cost_mat:', nb_cost_mat_list[-1])
  77. np.savez('results/median_paper/fit_distance.clcpc.python_init40.monot.elabeled.uhpkernel.y'
  78. + y + '.repeat' + str(repeat) + '.k10..gm',
  79. edit_costs=edit_costs,
  80. residual_list=residual_list, edit_cost_list=edit_cost_list,
  81. dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  82. total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
  83. for ec in edit_costs:
  84. edit_costs_output_file.write(str(ec) + ' ')
  85. edit_costs_output_file.write('\n')
  86. edit_costs_output_file.close()
  87. # # normalized distance matrices.
  88. # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
  89. # edit_costs = gmfile['edit_costs']
  90. # residual_list = gmfile['residual_list']
  91. # edit_cost_list = gmfile['edit_cost_list']
  92. # dis_k_mat = gmfile['dis_k_mat']
  93. # ged_mat = gmfile['ged_mat']
  94. # total_time = gmfile['total_time']
  95. # nb_cost_mat_list = gmfile['nb_cost_mat_list']
  96. nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
  97. print(nb_consistent, nb_inconsistent, ratio_consistent)
  98. # norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
  99. # plt.imshow(norm_dis_k_mat)
  100. # plt.colorbar()
  101. # plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
  102. # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
  103. # plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
  104. # + y + '.repeat' + str(repeat) + '.png', format='png')
  105. # # plt.show()
  106. # plt.clf()
  107. #
  108. # norm_ged_mat = normalize_distance_matrix(ged_mat)
  109. # plt.imshow(norm_ged_mat)
  110. # plt.colorbar()
  111. # plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
  112. # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
  113. # plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
  114. # + y + '.repeat' + str(repeat) + '.png', format='png')
  115. # # plt.show()
  116. # plt.clf()
  117. #
  118. # norm_diff = norm_ged_mat - norm_dis_k_mat
  119. # plt.imshow(norm_diff)
  120. # plt.colorbar()
  121. # plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y'
  122. # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
  123. # plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y'
  124. # + y + '.repeat' + str(repeat) + '.png', format='png')
  125. # # plt.show()
  126. # plt.clf()
  127. # # draw_count_bar(norm_diff)
  128. def median_paper_clcpc_python_bash_cpp():
  129. """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
  130. python invoking the c++ code by bash command (with updated library).
  131. """
  132. # ds = {'name': 'monoterpenoides',
  133. # 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
  134. # _, y_all = loadDataset(ds['dataset'])
  135. gkernel = 'untilhpathkernel'
  136. node_label = 'atom'
  137. edge_label = 'bond_type'
  138. itr_max = 20
  139. algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
  140. params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP',
  141. 'algo_options': algo_options}
  142. y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
  143. repeats = 50
  144. collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
  145. graph_dir = collection_path + 'gxl/'
  146. fn_edit_costs_output = 'results/median_paper/edit_costs_output.txt'
  147. for y in y_all:
  148. for repeat in range(repeats):
  149. edit_costs_output_file = open(fn_edit_costs_output, 'a')
  150. collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
  151. Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
  152. edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
  153. nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
  154. gkernel, itr_max, params_ged=params_ged,
  155. parallel=False)
  156. total_time = np.sum(time_list)
  157. # print('\nedit_costs:', edit_costs)
  158. # print('\nresidual_list:', residual_list)
  159. # print('\nedit_cost_list:', edit_cost_list)
  160. # print('\ndistance matrix in kernel space:', dis_k_mat)
  161. # print('\nged matrix:', ged_mat)
  162. # print('\ntotal time:', total_time)
  163. # print('\nnb_cost_mat:', nb_cost_mat_list[-1])
  164. np.savez('results/median_paper/fit_distance.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
  165. + y + '.repeat' + str(repeat) + '.gm',
  166. edit_costs=edit_costs,
  167. residual_list=residual_list, edit_cost_list=edit_cost_list,
  168. dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  169. total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
  170. coef_dk=coef_dk)
  171. for ec in edit_costs:
  172. edit_costs_output_file.write(str(ec) + ' ')
  173. edit_costs_output_file.write('\n')
  174. edit_costs_output_file.close()
  175. # # normalized distance matrices.
  176. # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
  177. # edit_costs = gmfile['edit_costs']
  178. # residual_list = gmfile['residual_list']
  179. # edit_cost_list = gmfile['edit_cost_list']
  180. # dis_k_mat = gmfile['dis_k_mat']
  181. # ged_mat = gmfile['ged_mat']
  182. # total_time = gmfile['total_time']
  183. # nb_cost_mat_list = gmfile['nb_cost_mat_list']
  184. # coef_dk = gmfile['coef_dk']
  185. nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
  186. print(nb_consistent, nb_inconsistent, ratio_consistent)
  187. # norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
  188. # plt.imshow(norm_dis_k_mat)
  189. # plt.colorbar()
  190. # plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
  191. # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
  192. # plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
  193. # + y + '.repeat' + str(repeat) + '.png', format='png')
  194. # # plt.show()
  195. # plt.clf()
  196. #
  197. # norm_ged_mat = normalize_distance_matrix(ged_mat)
  198. # plt.imshow(norm_ged_mat)
  199. # plt.colorbar()
  200. # plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
  201. # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
  202. # plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
  203. # + y + '.repeat' + str(repeat) + '.png', format='png')
  204. # # plt.show()
  205. # plt.clf()
  206. #
  207. # norm_diff = norm_ged_mat - norm_dis_k_mat
  208. # plt.imshow(norm_diff)
  209. # plt.colorbar()
  210. # plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
  211. # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
  212. # plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
  213. # + y + '.repeat' + str(repeat) + '.png', format='png')
  214. # # plt.show()
  215. # plt.clf()
  216. # # draw_count_bar(norm_diff)
  217. def test_cs_leq_ci_plus_cr_python_bash_cpp():
  218. """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
  219. python invoking the c++ code by bash command (with updated library).
  220. """
  221. ds = {'name': 'monoterpenoides',
  222. 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
  223. Gn, y_all = loadDataset(ds['dataset'])
  224. # Gn = Gn[0:10]
  225. gkernel = 'untilhpathkernel'
  226. node_label = 'atom'
  227. edge_label = 'bond_type'
  228. itr_max = 10
  229. algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
  230. params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP',
  231. 'algo_options': algo_options}
  232. edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
  233. nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
  234. gkernel, itr_max, params_ged=params_ged,
  235. parallel=False)
  236. total_time = np.sum(time_list)
  237. print('\nedit_costs:', edit_costs)
  238. print('\nresidual_list:', residual_list)
  239. print('\nedit_cost_list:', edit_cost_list)
  240. print('\ndistance matrix in kernel space:', dis_k_mat)
  241. print('\nged matrix:', ged_mat)
  242. print('\ntotal time:', total_time)
  243. print('\nnb_cost_mat:', nb_cost_mat_list[-1])
  244. np.savez('results/fit_distance.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel.gm',
  245. edit_costs=edit_costs,
  246. residual_list=residual_list, edit_cost_list=edit_cost_list,
  247. dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  248. total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
  249. coef_dk=coef_dk)
  250. # ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
  251. # 'extra_params': {}} # node/edge symb
  252. # Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
  253. ## Gn = Gn[0:10]
  254. ## remove_edges(Gn)
  255. # gkernel = 'untilhpathkernel'
  256. # node_label = 'atom'
  257. # edge_label = 'bond_type'
  258. # itr_max = 10
  259. # edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
  260. # nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
  261. # gkernel, itr_max)
  262. # total_time = np.sum(time_list)
  263. # print('\nedit_costs:', edit_costs)
  264. # print('\nresidual_list:', residual_list)
  265. # print('\nedit_cost_list:', edit_cost_list)
  266. # print('\ndistance matrix in kernel space:', dis_k_mat)
  267. # print('\nged matrix:', ged_mat)
  268. # print('\ntotal time:', total_time)
  269. # print('\nnb_cost_mat:', nb_cost_mat_list[-1])
  270. # np.savez('results/fit_distance.cs_leq_ci_plus_cr.mutag.elabeled.uhpkernel.gm',
  271. # edit_costs=edit_costs,
  272. # residual_list=residual_list, edit_cost_list=edit_cost_list,
  273. # dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  274. # total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk)
  275. # # normalized distance matrices.
  276. # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.monot.elabeled.uhpkernel.gm.npz')
  277. # edit_costs = gmfile['edit_costs']
  278. # residual_list = gmfile['residual_list']
  279. # edit_cost_list = gmfile['edit_cost_list']
  280. # dis_k_mat = gmfile['dis_k_mat']
  281. # ged_mat = gmfile['ged_mat']
  282. # total_time = gmfile['total_time']
  283. # nb_cost_mat_list = gmfile['nb_cost_mat_list']
  284. # coef_dk = gmfile['coef_dk']
  285. nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
  286. print(nb_consistent, nb_inconsistent, ratio_consistent)
  287. # dis_k_sub = pairwise_substitution(dis_k_mat)
  288. # ged_sub = pairwise_substitution(ged_mat)
  289. # np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.gm',
  290. # dis_k_sub=dis_k_sub, ged_sub=ged_sub)
  291. norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
  292. plt.imshow(norm_dis_k_mat)
  293. plt.colorbar()
  294. plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
  295. + '.eps', format='eps', dpi=300)
  296. plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
  297. + '.png', format='png')
  298. # plt.show()
  299. plt.clf()
  300. norm_ged_mat = normalize_distance_matrix(ged_mat)
  301. plt.imshow(norm_ged_mat)
  302. plt.colorbar()
  303. plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
  304. + '.eps', format='eps', dpi=300)
  305. plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
  306. + '.png', format='png')
  307. # plt.show()
  308. plt.clf()
  309. norm_diff = norm_ged_mat - norm_dis_k_mat
  310. plt.imshow(norm_diff)
  311. plt.colorbar()
  312. plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
  313. + '.eps', format='eps', dpi=300)
  314. plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
  315. + '.png', format='png')
  316. # plt.show()
  317. plt.clf()
  318. # draw_count_bar(norm_diff)
  319. def test_anycosts():
  320. ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
  321. 'extra_params': {}} # node/edge symb
  322. Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
  323. # Gn = Gn[0:10]
  324. remove_edges(Gn)
  325. gkernel = 'marginalizedkernel'
  326. itr_max = 10
  327. edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
  328. nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, gkernel, itr_max)
  329. total_time = np.sum(time_list)
  330. print('\nedit_costs:', edit_costs)
  331. print('\nresidual_list:', residual_list)
  332. print('\nedit_cost_list:', edit_cost_list)
  333. print('\ndistance matrix in kernel space:', dis_k_mat)
  334. print('\nged matrix:', ged_mat)
  335. print('\ntotal time:', total_time)
  336. print('\nnb_cost_mat:', nb_cost_mat_list[-1])
  337. np.savez('results/fit_distance.any_costs.gm', edit_costs=edit_costs,
  338. residual_list=residual_list, edit_cost_list=edit_cost_list,
  339. dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  340. total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
  341. # # normalized distance matrices.
  342. # gmfile = np.load('results/fit_distance.any_costs.gm.npz')
  343. # edit_costs = gmfile['edit_costs']
  344. # residual_list = gmfile['residual_list']
  345. # edit_cost_list = gmfile['edit_cost_list']
  346. # dis_k_mat = gmfile['dis_k_mat']
  347. # ged_mat = gmfile['ged_mat']
  348. # total_time = gmfile['total_time']
  349. ## nb_cost_mat_list = gmfile['nb_cost_mat_list']
  350. norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
  351. plt.imshow(norm_dis_k_mat)
  352. plt.colorbar()
  353. plt.savefig('results/norm_dis_k_mat.any_costs' + '.eps', format='eps', dpi=300)
  354. # plt.savefig('results/norm_dis_k_mat.any_costs' + '.png', format='png')
  355. # plt.show()
  356. plt.clf()
  357. norm_ged_mat = normalize_distance_matrix(ged_mat)
  358. plt.imshow(norm_ged_mat)
  359. plt.colorbar()
  360. plt.savefig('results/norm_ged_mat.any_costs' + '.eps', format='eps', dpi=300)
  361. # plt.savefig('results/norm_ged_mat.any_costs' + '.png', format='png')
  362. # plt.show()
  363. plt.clf()
  364. norm_diff = norm_ged_mat - norm_dis_k_mat
  365. plt.imshow(norm_diff)
  366. plt.colorbar()
  367. plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.eps', format='eps', dpi=300)
  368. # plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.png', format='png')
  369. # plt.show()
  370. plt.clf()
  371. # draw_count_bar(norm_diff)
  372. def test_cs_leq_ci_plus_cr():
  373. """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er
  374. """
  375. ds = {'name': 'monoterpenoides',
  376. 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
  377. Gn, y_all = loadDataset(ds['dataset'])
  378. # Gn = Gn[0:10]
  379. gkernel = 'untilhpathkernel'
  380. node_label = 'atom'
  381. edge_label = 'bond_type'
  382. itr_max = 10
  383. edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
  384. nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
  385. gkernel, itr_max,
  386. fitkernel='gaussian')
  387. total_time = np.sum(time_list)
  388. print('\nedit_costs:', edit_costs)
  389. print('\nresidual_list:', residual_list)
  390. print('\nedit_cost_list:', edit_cost_list)
  391. print('\ndistance matrix in kernel space:', dis_k_mat)
  392. print('\nged matrix:', ged_mat)
  393. print('\ntotal time:', total_time)
  394. print('\nnb_cost_mat:', nb_cost_mat_list[-1])
  395. np.savez('results/fit_distance.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel.gm',
  396. edit_costs=edit_costs,
  397. residual_list=residual_list, edit_cost_list=edit_cost_list,
  398. dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  399. total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
  400. coef_dk=coef_dk)
  401. # ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
  402. # 'extra_params': {}} # node/edge symb
  403. # Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
  404. ## Gn = Gn[0:10]
  405. ## remove_edges(Gn)
  406. # gkernel = 'untilhpathkernel'
  407. # node_label = 'atom'
  408. # edge_label = 'bond_type'
  409. # itr_max = 10
  410. # edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
  411. # nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
  412. # gkernel, itr_max)
  413. # total_time = np.sum(time_list)
  414. # print('\nedit_costs:', edit_costs)
  415. # print('\nresidual_list:', residual_list)
  416. # print('\nedit_cost_list:', edit_cost_list)
  417. # print('\ndistance matrix in kernel space:', dis_k_mat)
  418. # print('\nged matrix:', ged_mat)
  419. # print('\ntotal time:', total_time)
  420. # print('\nnb_cost_mat:', nb_cost_mat_list[-1])
  421. # np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.mutag.elabeled.uhpkernel.gm',
  422. # edit_costs=edit_costs,
  423. # residual_list=residual_list, edit_cost_list=edit_cost_list,
  424. # dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  425. # total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk)
  426. # # normalized distance matrices.
  427. # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
  428. # edit_costs = gmfile['edit_costs']
  429. # residual_list = gmfile['residual_list']
  430. # edit_cost_list = gmfile['edit_cost_list']
  431. # dis_k_mat = gmfile['dis_k_mat']
  432. # ged_mat = gmfile['ged_mat']
  433. # total_time = gmfile['total_time']
  434. # nb_cost_mat_list = gmfile['nb_cost_mat_list']
  435. # coef_dk = gmfile['coef_dk']
  436. nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
  437. print(nb_consistent, nb_inconsistent, ratio_consistent)
  438. # dis_k_sub = pairwise_substitution(dis_k_mat)
  439. # ged_sub = pairwise_substitution(ged_mat)
  440. # np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.cost_leq_1en2.gm',
  441. # dis_k_sub=dis_k_sub, ged_sub=ged_sub)
  442. norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
  443. plt.imshow(norm_dis_k_mat)
  444. plt.colorbar()
  445. plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
  446. + '.eps', format='eps', dpi=300)
  447. plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
  448. + '.png', format='png')
  449. # plt.show()
  450. plt.clf()
  451. norm_ged_mat = normalize_distance_matrix(ged_mat)
  452. plt.imshow(norm_ged_mat)
  453. plt.colorbar()
  454. plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
  455. + '.eps', format='eps', dpi=300)
  456. plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
  457. + '.png', format='png')
  458. # plt.show()
  459. plt.clf()
  460. norm_diff = norm_ged_mat - norm_dis_k_mat
  461. plt.imshow(norm_diff)
  462. plt.colorbar()
  463. plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
  464. + '.eps', format='eps', dpi=300)
  465. plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
  466. + '.png', format='png')
  467. # plt.show()
  468. plt.clf()
  469. # draw_count_bar(norm_diff)
  470. def test_unfitted():
  471. """unfitted.
  472. """
  473. from fitDistance import compute_geds
  474. from utils import kernel_distance_matrix
  475. ds = {'name': 'monoterpenoides',
  476. 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
  477. Gn, y_all = loadDataset(ds['dataset'])
  478. # Gn = Gn[0:10]
  479. gkernel = 'untilhpathkernel'
  480. node_label = 'atom'
  481. edge_label = 'bond_type'
  482. # ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
  483. # 'extra_params': {}} # node/edge symb
  484. # Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
  485. ## Gn = Gn[0:10]
  486. ## remove_edges(Gn)
  487. # gkernel = 'marginalizedkernel'
  488. dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, gkernel=gkernel)
  489. ged_all, ged_mat, n_edit_operations = compute_geds(Gn, [3, 3, 1, 3, 3, 1],
  490. [0, 1, 2, 3, 4, 5], parallel=True)
  491. print('\ndistance matrix in kernel space:', dis_k_mat)
  492. print('\nged matrix:', ged_mat)
  493. # np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.gm', edit_costs=edit_costs,
  494. # residual_list=residual_list, edit_cost_list=edit_cost_list,
  495. # dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  496. # total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
  497. # normalized distance matrices.
  498. # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en3.gm.npz')
  499. # edit_costs = gmfile['edit_costs']
  500. # residual_list = gmfile['residual_list']
  501. # edit_cost_list = gmfile['edit_cost_list']
  502. # dis_k_mat = gmfile['dis_k_mat']
  503. # ged_mat = gmfile['ged_mat']
  504. # total_time = gmfile['total_time']
  505. # nb_cost_mat_list = gmfile['nb_cost_mat_list']
  506. nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
  507. print(nb_consistent, nb_inconsistent, ratio_consistent)
  508. norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
  509. plt.imshow(norm_dis_k_mat)
  510. plt.colorbar()
  511. plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
  512. plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.png', format='png')
  513. # plt.show()
  514. plt.clf()
  515. norm_ged_mat = normalize_distance_matrix(ged_mat)
  516. plt.imshow(norm_ged_mat)
  517. plt.colorbar()
  518. plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
  519. plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.png', format='png')
  520. # plt.show()
  521. plt.clf()
  522. norm_diff = norm_ged_mat - norm_dis_k_mat
  523. plt.imshow(norm_diff)
  524. plt.colorbar()
  525. plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
  526. plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.png', format='png')
  527. # plt.show()
  528. plt.clf()
  529. draw_count_bar(norm_diff)
  530. def pairwise_substitution_consistence(mat1, mat2):
  531. """
  532. """
  533. nb_consistent = 0
  534. nb_inconsistent = 0
  535. # the matrix is considered symmetric.
  536. upper_tri1 = mat1[np.triu_indices_from(mat1)]
  537. upper_tri2 = mat2[np.tril_indices_from(mat2)]
  538. for i in tqdm(range(len(upper_tri1)), desc='computing consistence', file=sys.stdout):
  539. for j in range(i, len(upper_tri1)):
  540. if np.sign(upper_tri1[i] - upper_tri1[j]) == np.sign(upper_tri2[i] - upper_tri2[j]):
  541. nb_consistent += 1
  542. else:
  543. nb_inconsistent += 1
  544. return nb_consistent, nb_inconsistent, nb_consistent / (nb_consistent + nb_inconsistent)
  545. def pairwise_substitution(mat):
  546. # the matrix is considered symmetric.
  547. upper_tri = mat[np.triu_indices_from(mat)]
  548. sub_list = []
  549. for i in tqdm(range(len(upper_tri)), desc='computing', file=sys.stdout):
  550. for j in range(i, len(upper_tri)):
  551. sub_list.append(upper_tri[i] - upper_tri[j])
  552. return sub_list
  553. def draw_count_bar(norm_diff):
  554. import pandas
  555. from collections import Counter, OrderedDict
  556. norm_diff_cnt = norm_diff.flatten()
  557. norm_diff_cnt = norm_diff_cnt * 10
  558. norm_diff_cnt = np.floor(norm_diff_cnt)
  559. norm_diff_cnt = Counter(norm_diff_cnt)
  560. norm_diff_cnt = OrderedDict(sorted(norm_diff_cnt.items()))
  561. df = pandas.DataFrame.from_dict(norm_diff_cnt, orient='index')
  562. df.plot(kind='bar')
  563. if __name__ == '__main__':
  564. # test_anycosts()
  565. # test_cs_leq_ci_plus_cr()
  566. # test_unfitted()
  567. # test_cs_leq_ci_plus_cr_python_bash_cpp()
  568. # median_paper_clcpc_python_bash_cpp()
  569. # median_paper_clcpc_python_best()
  570. # x = np.array([[1,2,3],[4,5,6],[7,8,9]])
  571. # xx = pairwise_substitution(x)
  572. test_update_costs()

A Python package for graph kernels, graph edit distances and graph pre-image problem.