You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

test_fitDistance.py 28 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Thu Oct 24 11:50:56 2019
  5. @author: ljia
  6. """
  7. from matplotlib import pyplot as plt
  8. import numpy as np
  9. from tqdm import tqdm
  10. import sys
  11. sys.path.insert(0, "../")
  12. from pygraph.utils.graphfiles import loadDataset
  13. from utils import remove_edges
  14. from fitDistance import fit_GED_to_kernel_distance
  15. from utils import normalize_distance_matrix
  16. def median_paper_clcpc_python_best():
  17. """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
  18. python invoking the c++ code by bash command (with updated library).
  19. """
  20. # ds = {'name': 'monoterpenoides',
  21. # 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
  22. # _, y_all = loadDataset(ds['dataset'])
  23. gkernel = 'untilhpathkernel'
  24. node_label = 'atom'
  25. edge_label = 'bond_type'
  26. itr_max = 6
  27. algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
  28. params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
  29. 'algo_options': algo_options, 'stabilizer': None}
  30. y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
  31. repeats = 50
  32. collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
  33. graph_dir = collection_path + 'gxl/'
  34. fn_edit_costs_output = 'results/median_paper/edit_costs_output.python_init40.k10.txt'
  35. for y in y_all:
  36. for repeat in range(repeats):
  37. edit_costs_output_file = open(fn_edit_costs_output, 'a')
  38. collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
  39. Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
  40. edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
  41. nb_cost_mat_list = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
  42. gkernel, itr_max, params_ged=params_ged,
  43. parallel=True)
  44. total_time = np.sum(time_list)
  45. # print('\nedit_costs:', edit_costs)
  46. # print('\nresidual_list:', residual_list)
  47. # print('\nedit_cost_list:', edit_cost_list)
  48. # print('\ndistance matrix in kernel space:', dis_k_mat)
  49. # print('\nged matrix:', ged_mat)
  50. # print('\ntotal time:', total_time)
  51. # print('\nnb_cost_mat:', nb_cost_mat_list[-1])
  52. np.savez('results/median_paper/fit_distance.clcpc.python_init40.monot.elabeled.uhpkernel.y'
  53. + y + '.repeat' + str(repeat) + '.k10..gm',
  54. edit_costs=edit_costs,
  55. residual_list=residual_list, edit_cost_list=edit_cost_list,
  56. dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  57. total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
  58. for ec in edit_costs:
  59. edit_costs_output_file.write(str(ec) + ' ')
  60. edit_costs_output_file.write('\n')
  61. edit_costs_output_file.close()
  62. # # normalized distance matrices.
  63. # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
  64. # edit_costs = gmfile['edit_costs']
  65. # residual_list = gmfile['residual_list']
  66. # edit_cost_list = gmfile['edit_cost_list']
  67. # dis_k_mat = gmfile['dis_k_mat']
  68. # ged_mat = gmfile['ged_mat']
  69. # total_time = gmfile['total_time']
  70. # nb_cost_mat_list = gmfile['nb_cost_mat_list']
  71. nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
  72. print(nb_consistent, nb_inconsistent, ratio_consistent)
  73. # norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
  74. # plt.imshow(norm_dis_k_mat)
  75. # plt.colorbar()
  76. # plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
  77. # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
  78. # plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
  79. # + y + '.repeat' + str(repeat) + '.png', format='png')
  80. # # plt.show()
  81. # plt.clf()
  82. #
  83. # norm_ged_mat = normalize_distance_matrix(ged_mat)
  84. # plt.imshow(norm_ged_mat)
  85. # plt.colorbar()
  86. # plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
  87. # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
  88. # plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
  89. # + y + '.repeat' + str(repeat) + '.png', format='png')
  90. # # plt.show()
  91. # plt.clf()
  92. #
  93. # norm_diff = norm_ged_mat - norm_dis_k_mat
  94. # plt.imshow(norm_diff)
  95. # plt.colorbar()
  96. # plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y'
  97. # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
  98. # plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y'
  99. # + y + '.repeat' + str(repeat) + '.png', format='png')
  100. # # plt.show()
  101. # plt.clf()
  102. # # draw_count_bar(norm_diff)
  103. def median_paper_clcpc_python_bash_cpp():
  104. """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
  105. python invoking the c++ code by bash command (with updated library).
  106. """
  107. # ds = {'name': 'monoterpenoides',
  108. # 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
  109. # _, y_all = loadDataset(ds['dataset'])
  110. gkernel = 'untilhpathkernel'
  111. node_label = 'atom'
  112. edge_label = 'bond_type'
  113. itr_max = 20
  114. algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
  115. params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP',
  116. 'algo_options': algo_options}
  117. y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
  118. repeats = 50
  119. collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
  120. graph_dir = collection_path + 'gxl/'
  121. fn_edit_costs_output = 'results/median_paper/edit_costs_output.txt'
  122. for y in y_all:
  123. for repeat in range(repeats):
  124. edit_costs_output_file = open(fn_edit_costs_output, 'a')
  125. collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
  126. Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
  127. edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
  128. nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
  129. gkernel, itr_max, params_ged=params_ged,
  130. parallel=False)
  131. total_time = np.sum(time_list)
  132. # print('\nedit_costs:', edit_costs)
  133. # print('\nresidual_list:', residual_list)
  134. # print('\nedit_cost_list:', edit_cost_list)
  135. # print('\ndistance matrix in kernel space:', dis_k_mat)
  136. # print('\nged matrix:', ged_mat)
  137. # print('\ntotal time:', total_time)
  138. # print('\nnb_cost_mat:', nb_cost_mat_list[-1])
  139. np.savez('results/median_paper/fit_distance.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
  140. + y + '.repeat' + str(repeat) + '.gm',
  141. edit_costs=edit_costs,
  142. residual_list=residual_list, edit_cost_list=edit_cost_list,
  143. dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  144. total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
  145. coef_dk=coef_dk)
  146. for ec in edit_costs:
  147. edit_costs_output_file.write(str(ec) + ' ')
  148. edit_costs_output_file.write('\n')
  149. edit_costs_output_file.close()
  150. # # normalized distance matrices.
  151. # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
  152. # edit_costs = gmfile['edit_costs']
  153. # residual_list = gmfile['residual_list']
  154. # edit_cost_list = gmfile['edit_cost_list']
  155. # dis_k_mat = gmfile['dis_k_mat']
  156. # ged_mat = gmfile['ged_mat']
  157. # total_time = gmfile['total_time']
  158. # nb_cost_mat_list = gmfile['nb_cost_mat_list']
  159. # coef_dk = gmfile['coef_dk']
  160. nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
  161. print(nb_consistent, nb_inconsistent, ratio_consistent)
  162. # norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
  163. # plt.imshow(norm_dis_k_mat)
  164. # plt.colorbar()
  165. # plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
  166. # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
  167. # plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
  168. # + y + '.repeat' + str(repeat) + '.png', format='png')
  169. # # plt.show()
  170. # plt.clf()
  171. #
  172. # norm_ged_mat = normalize_distance_matrix(ged_mat)
  173. # plt.imshow(norm_ged_mat)
  174. # plt.colorbar()
  175. # plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
  176. # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
  177. # plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
  178. # + y + '.repeat' + str(repeat) + '.png', format='png')
  179. # # plt.show()
  180. # plt.clf()
  181. #
  182. # norm_diff = norm_ged_mat - norm_dis_k_mat
  183. # plt.imshow(norm_diff)
  184. # plt.colorbar()
  185. # plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
  186. # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
  187. # plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
  188. # + y + '.repeat' + str(repeat) + '.png', format='png')
  189. # # plt.show()
  190. # plt.clf()
  191. # # draw_count_bar(norm_diff)
  192. def test_cs_leq_ci_plus_cr_python_bash_cpp():
  193. """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
  194. python invoking the c++ code by bash command (with updated library).
  195. """
  196. ds = {'name': 'monoterpenoides',
  197. 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
  198. Gn, y_all = loadDataset(ds['dataset'])
  199. # Gn = Gn[0:10]
  200. gkernel = 'untilhpathkernel'
  201. node_label = 'atom'
  202. edge_label = 'bond_type'
  203. itr_max = 10
  204. algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
  205. params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP',
  206. 'algo_options': algo_options}
  207. edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
  208. nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
  209. gkernel, itr_max, params_ged=params_ged,
  210. parallel=False)
  211. total_time = np.sum(time_list)
  212. print('\nedit_costs:', edit_costs)
  213. print('\nresidual_list:', residual_list)
  214. print('\nedit_cost_list:', edit_cost_list)
  215. print('\ndistance matrix in kernel space:', dis_k_mat)
  216. print('\nged matrix:', ged_mat)
  217. print('\ntotal time:', total_time)
  218. print('\nnb_cost_mat:', nb_cost_mat_list[-1])
  219. np.savez('results/fit_distance.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel.gm',
  220. edit_costs=edit_costs,
  221. residual_list=residual_list, edit_cost_list=edit_cost_list,
  222. dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  223. total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
  224. coef_dk=coef_dk)
  225. # ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
  226. # 'extra_params': {}} # node/edge symb
  227. # Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
  228. ## Gn = Gn[0:10]
  229. ## remove_edges(Gn)
  230. # gkernel = 'untilhpathkernel'
  231. # node_label = 'atom'
  232. # edge_label = 'bond_type'
  233. # itr_max = 10
  234. # edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
  235. # nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
  236. # gkernel, itr_max)
  237. # total_time = np.sum(time_list)
  238. # print('\nedit_costs:', edit_costs)
  239. # print('\nresidual_list:', residual_list)
  240. # print('\nedit_cost_list:', edit_cost_list)
  241. # print('\ndistance matrix in kernel space:', dis_k_mat)
  242. # print('\nged matrix:', ged_mat)
  243. # print('\ntotal time:', total_time)
  244. # print('\nnb_cost_mat:', nb_cost_mat_list[-1])
  245. # np.savez('results/fit_distance.cs_leq_ci_plus_cr.mutag.elabeled.uhpkernel.gm',
  246. # edit_costs=edit_costs,
  247. # residual_list=residual_list, edit_cost_list=edit_cost_list,
  248. # dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  249. # total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk)
  250. # # normalized distance matrices.
  251. # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.monot.elabeled.uhpkernel.gm.npz')
  252. # edit_costs = gmfile['edit_costs']
  253. # residual_list = gmfile['residual_list']
  254. # edit_cost_list = gmfile['edit_cost_list']
  255. # dis_k_mat = gmfile['dis_k_mat']
  256. # ged_mat = gmfile['ged_mat']
  257. # total_time = gmfile['total_time']
  258. # nb_cost_mat_list = gmfile['nb_cost_mat_list']
  259. # coef_dk = gmfile['coef_dk']
  260. nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
  261. print(nb_consistent, nb_inconsistent, ratio_consistent)
  262. # dis_k_sub = pairwise_substitution(dis_k_mat)
  263. # ged_sub = pairwise_substitution(ged_mat)
  264. # np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.gm',
  265. # dis_k_sub=dis_k_sub, ged_sub=ged_sub)
  266. norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
  267. plt.imshow(norm_dis_k_mat)
  268. plt.colorbar()
  269. plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
  270. + '.eps', format='eps', dpi=300)
  271. plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
  272. + '.png', format='png')
  273. # plt.show()
  274. plt.clf()
  275. norm_ged_mat = normalize_distance_matrix(ged_mat)
  276. plt.imshow(norm_ged_mat)
  277. plt.colorbar()
  278. plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
  279. + '.eps', format='eps', dpi=300)
  280. plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
  281. + '.png', format='png')
  282. # plt.show()
  283. plt.clf()
  284. norm_diff = norm_ged_mat - norm_dis_k_mat
  285. plt.imshow(norm_diff)
  286. plt.colorbar()
  287. plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
  288. + '.eps', format='eps', dpi=300)
  289. plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
  290. + '.png', format='png')
  291. # plt.show()
  292. plt.clf()
  293. # draw_count_bar(norm_diff)
  294. def test_anycosts():
  295. ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
  296. 'extra_params': {}} # node/edge symb
  297. Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
  298. # Gn = Gn[0:10]
  299. remove_edges(Gn)
  300. gkernel = 'marginalizedkernel'
  301. itr_max = 10
  302. edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
  303. nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, gkernel, itr_max)
  304. total_time = np.sum(time_list)
  305. print('\nedit_costs:', edit_costs)
  306. print('\nresidual_list:', residual_list)
  307. print('\nedit_cost_list:', edit_cost_list)
  308. print('\ndistance matrix in kernel space:', dis_k_mat)
  309. print('\nged matrix:', ged_mat)
  310. print('\ntotal time:', total_time)
  311. print('\nnb_cost_mat:', nb_cost_mat_list[-1])
  312. np.savez('results/fit_distance.any_costs.gm', edit_costs=edit_costs,
  313. residual_list=residual_list, edit_cost_list=edit_cost_list,
  314. dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  315. total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
  316. # # normalized distance matrices.
  317. # gmfile = np.load('results/fit_distance.any_costs.gm.npz')
  318. # edit_costs = gmfile['edit_costs']
  319. # residual_list = gmfile['residual_list']
  320. # edit_cost_list = gmfile['edit_cost_list']
  321. # dis_k_mat = gmfile['dis_k_mat']
  322. # ged_mat = gmfile['ged_mat']
  323. # total_time = gmfile['total_time']
  324. ## nb_cost_mat_list = gmfile['nb_cost_mat_list']
  325. norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
  326. plt.imshow(norm_dis_k_mat)
  327. plt.colorbar()
  328. plt.savefig('results/norm_dis_k_mat.any_costs' + '.eps', format='eps', dpi=300)
  329. # plt.savefig('results/norm_dis_k_mat.any_costs' + '.png', format='png')
  330. # plt.show()
  331. plt.clf()
  332. norm_ged_mat = normalize_distance_matrix(ged_mat)
  333. plt.imshow(norm_ged_mat)
  334. plt.colorbar()
  335. plt.savefig('results/norm_ged_mat.any_costs' + '.eps', format='eps', dpi=300)
  336. # plt.savefig('results/norm_ged_mat.any_costs' + '.png', format='png')
  337. # plt.show()
  338. plt.clf()
  339. norm_diff = norm_ged_mat - norm_dis_k_mat
  340. plt.imshow(norm_diff)
  341. plt.colorbar()
  342. plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.eps', format='eps', dpi=300)
  343. # plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.png', format='png')
  344. # plt.show()
  345. plt.clf()
  346. # draw_count_bar(norm_diff)
  347. def test_cs_leq_ci_plus_cr():
  348. """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er
  349. """
  350. ds = {'name': 'monoterpenoides',
  351. 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
  352. Gn, y_all = loadDataset(ds['dataset'])
  353. # Gn = Gn[0:10]
  354. gkernel = 'untilhpathkernel'
  355. node_label = 'atom'
  356. edge_label = 'bond_type'
  357. itr_max = 10
  358. edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
  359. nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
  360. gkernel, itr_max,
  361. fitkernel='gaussian')
  362. total_time = np.sum(time_list)
  363. print('\nedit_costs:', edit_costs)
  364. print('\nresidual_list:', residual_list)
  365. print('\nedit_cost_list:', edit_cost_list)
  366. print('\ndistance matrix in kernel space:', dis_k_mat)
  367. print('\nged matrix:', ged_mat)
  368. print('\ntotal time:', total_time)
  369. print('\nnb_cost_mat:', nb_cost_mat_list[-1])
  370. np.savez('results/fit_distance.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel.gm',
  371. edit_costs=edit_costs,
  372. residual_list=residual_list, edit_cost_list=edit_cost_list,
  373. dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  374. total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
  375. coef_dk=coef_dk)
  376. # ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
  377. # 'extra_params': {}} # node/edge symb
  378. # Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
  379. ## Gn = Gn[0:10]
  380. ## remove_edges(Gn)
  381. # gkernel = 'untilhpathkernel'
  382. # node_label = 'atom'
  383. # edge_label = 'bond_type'
  384. # itr_max = 10
  385. # edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
  386. # nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
  387. # gkernel, itr_max)
  388. # total_time = np.sum(time_list)
  389. # print('\nedit_costs:', edit_costs)
  390. # print('\nresidual_list:', residual_list)
  391. # print('\nedit_cost_list:', edit_cost_list)
  392. # print('\ndistance matrix in kernel space:', dis_k_mat)
  393. # print('\nged matrix:', ged_mat)
  394. # print('\ntotal time:', total_time)
  395. # print('\nnb_cost_mat:', nb_cost_mat_list[-1])
  396. # np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.mutag.elabeled.uhpkernel.gm',
  397. # edit_costs=edit_costs,
  398. # residual_list=residual_list, edit_cost_list=edit_cost_list,
  399. # dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  400. # total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk)
  401. # # normalized distance matrices.
  402. # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
  403. # edit_costs = gmfile['edit_costs']
  404. # residual_list = gmfile['residual_list']
  405. # edit_cost_list = gmfile['edit_cost_list']
  406. # dis_k_mat = gmfile['dis_k_mat']
  407. # ged_mat = gmfile['ged_mat']
  408. # total_time = gmfile['total_time']
  409. # nb_cost_mat_list = gmfile['nb_cost_mat_list']
  410. # coef_dk = gmfile['coef_dk']
  411. nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
  412. print(nb_consistent, nb_inconsistent, ratio_consistent)
  413. # dis_k_sub = pairwise_substitution(dis_k_mat)
  414. # ged_sub = pairwise_substitution(ged_mat)
  415. # np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.cost_leq_1en2.gm',
  416. # dis_k_sub=dis_k_sub, ged_sub=ged_sub)
  417. norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
  418. plt.imshow(norm_dis_k_mat)
  419. plt.colorbar()
  420. plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
  421. + '.eps', format='eps', dpi=300)
  422. plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
  423. + '.png', format='png')
  424. # plt.show()
  425. plt.clf()
  426. norm_ged_mat = normalize_distance_matrix(ged_mat)
  427. plt.imshow(norm_ged_mat)
  428. plt.colorbar()
  429. plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
  430. + '.eps', format='eps', dpi=300)
  431. plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
  432. + '.png', format='png')
  433. # plt.show()
  434. plt.clf()
  435. norm_diff = norm_ged_mat - norm_dis_k_mat
  436. plt.imshow(norm_diff)
  437. plt.colorbar()
  438. plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
  439. + '.eps', format='eps', dpi=300)
  440. plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
  441. + '.png', format='png')
  442. # plt.show()
  443. plt.clf()
  444. # draw_count_bar(norm_diff)
  445. def test_unfitted():
  446. """unfitted.
  447. """
  448. from fitDistance import compute_geds
  449. from utils import kernel_distance_matrix
  450. ds = {'name': 'monoterpenoides',
  451. 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
  452. Gn, y_all = loadDataset(ds['dataset'])
  453. # Gn = Gn[0:10]
  454. gkernel = 'untilhpathkernel'
  455. node_label = 'atom'
  456. edge_label = 'bond_type'
  457. # ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
  458. # 'extra_params': {}} # node/edge symb
  459. # Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
  460. ## Gn = Gn[0:10]
  461. ## remove_edges(Gn)
  462. # gkernel = 'marginalizedkernel'
  463. dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, gkernel=gkernel)
  464. ged_all, ged_mat, n_edit_operations = compute_geds(Gn, [3, 3, 1, 3, 3, 1],
  465. [0, 1, 2, 3, 4, 5], parallel=True)
  466. print('\ndistance matrix in kernel space:', dis_k_mat)
  467. print('\nged matrix:', ged_mat)
  468. # np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.gm', edit_costs=edit_costs,
  469. # residual_list=residual_list, edit_cost_list=edit_cost_list,
  470. # dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  471. # total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
  472. # normalized distance matrices.
  473. # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en3.gm.npz')
  474. # edit_costs = gmfile['edit_costs']
  475. # residual_list = gmfile['residual_list']
  476. # edit_cost_list = gmfile['edit_cost_list']
  477. # dis_k_mat = gmfile['dis_k_mat']
  478. # ged_mat = gmfile['ged_mat']
  479. # total_time = gmfile['total_time']
  480. # nb_cost_mat_list = gmfile['nb_cost_mat_list']
  481. nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
  482. print(nb_consistent, nb_inconsistent, ratio_consistent)
  483. norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
  484. plt.imshow(norm_dis_k_mat)
  485. plt.colorbar()
  486. plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
  487. plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.png', format='png')
  488. # plt.show()
  489. plt.clf()
  490. norm_ged_mat = normalize_distance_matrix(ged_mat)
  491. plt.imshow(norm_ged_mat)
  492. plt.colorbar()
  493. plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
  494. plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.png', format='png')
  495. # plt.show()
  496. plt.clf()
  497. norm_diff = norm_ged_mat - norm_dis_k_mat
  498. plt.imshow(norm_diff)
  499. plt.colorbar()
  500. plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
  501. plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.png', format='png')
  502. # plt.show()
  503. plt.clf()
  504. draw_count_bar(norm_diff)
  505. def pairwise_substitution_consistence(mat1, mat2):
  506. """
  507. """
  508. nb_consistent = 0
  509. nb_inconsistent = 0
  510. # the matrix is considered symmetric.
  511. upper_tri1 = mat1[np.triu_indices_from(mat1)]
  512. upper_tri2 = mat2[np.tril_indices_from(mat2)]
  513. for i in tqdm(range(len(upper_tri1)), desc='computing consistence', file=sys.stdout):
  514. for j in range(i, len(upper_tri1)):
  515. if np.sign(upper_tri1[i] - upper_tri1[j]) == np.sign(upper_tri2[i] - upper_tri2[j]):
  516. nb_consistent += 1
  517. else:
  518. nb_inconsistent += 1
  519. return nb_consistent, nb_inconsistent, nb_consistent / (nb_consistent + nb_inconsistent)
  520. def pairwise_substitution(mat):
  521. # the matrix is considered symmetric.
  522. upper_tri = mat[np.triu_indices_from(mat)]
  523. sub_list = []
  524. for i in tqdm(range(len(upper_tri)), desc='computing', file=sys.stdout):
  525. for j in range(i, len(upper_tri)):
  526. sub_list.append(upper_tri[i] - upper_tri[j])
  527. return sub_list
  528. def draw_count_bar(norm_diff):
  529. import pandas
  530. from collections import Counter, OrderedDict
  531. norm_diff_cnt = norm_diff.flatten()
  532. norm_diff_cnt = norm_diff_cnt * 10
  533. norm_diff_cnt = np.floor(norm_diff_cnt)
  534. norm_diff_cnt = Counter(norm_diff_cnt)
  535. norm_diff_cnt = OrderedDict(sorted(norm_diff_cnt.items()))
  536. df = pandas.DataFrame.from_dict(norm_diff_cnt, orient='index')
  537. df.plot(kind='bar')
  538. if __name__ == '__main__':
  539. # test_anycosts()
  540. # test_cs_leq_ci_plus_cr()
  541. # test_unfitted()
  542. # test_cs_leq_ci_plus_cr_python_bash_cpp()
  543. # median_paper_clcpc_python_bash_cpp()
  544. median_paper_clcpc_python_best()
  545. # x = np.array([[1,2,3],[4,5,6],[7,8,9]])
  546. # xx = pairwise_substitution(x)

A Python package for graph kernels, graph edit distances, and the graph pre-image problem.