You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

test_fitDistance.py 30 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Thu Oct 24 11:50:56 2019
  5. @author: ljia
  6. """
import os
import sys

import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm

from gklearn.preimage.fitDistance import fit_GED_to_kernel_distance
from gklearn.preimage.utils import normalize_distance_matrix
from gklearn.preimage.utils import remove_edges
from gklearn.utils.graphfiles import loadDataset
  14. def test_update_costs():
  15. from preimage.fitDistance import update_costs
  16. import cvxpy as cp
  17. ds = np.load('results/xp_fit_method/fit_data_debug4.gm.npz')
  18. nb_cost_mat = ds['nb_cost_mat']
  19. dis_k_vec = ds['dis_k_vec']
  20. n_edit_operations = ds['n_edit_operations']
  21. ged_vec_init = ds['ged_vec_init']
  22. ged_mat = ds['ged_mat']
  23. nb_cost_mat_new = nb_cost_mat[:,[2,3,4]]
  24. x = cp.Variable(nb_cost_mat_new.shape[1])
  25. cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
  26. # constraints = [x >= [0.000 for i in range(nb_cost_mat_new.shape[1])],
  27. # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
  28. # constraints = [x >= [0.000 for i in range(nb_cost_mat_new.shape[1])],
  29. # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0,
  30. # np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
  31. constraints = [x >= [0.00 for i in range(nb_cost_mat_new.shape[1])],
  32. np.array([0.0, 1.0, -1.0]).T@x == 0.0]
  33. # constraints = [x >= [0.00000 for i in range(nb_cost_mat_new.shape[1])]]
  34. prob = cp.Problem(cp.Minimize(cost_fun), constraints)
  35. prob.solve()
  36. print(x.value)
  37. edit_costs_new = np.concatenate((x.value, np.array([0.0])))
  38. residual = np.sqrt(prob.value)
  39. def median_paper_clcpc_python_best():
  40. """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
  41. python invoking the c++ code by bash command (with updated library).
  42. """
  43. # ds = {'name': 'monoterpenoides',
  44. # 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
  45. # _, y_all = loadDataset(ds['dataset'])
  46. gkernel = 'untilhpathkernel'
  47. node_label = 'atom'
  48. edge_label = 'bond_type'
  49. itr_max = 6
  50. algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
  51. params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
  52. 'algo_options': algo_options, 'stabilizer': None}
  53. y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
  54. repeats = 50
  55. collection_path = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/'
  56. graph_dir = collection_path + 'gxl/'
  57. fn_edit_costs_output = 'results/median_paper/edit_costs_output.python_init40.k10.txt'
  58. for y in y_all:
  59. for repeat in range(repeats):
  60. edit_costs_output_file = open(fn_edit_costs_output, 'a')
  61. collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
  62. Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
  63. edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
  64. nb_cost_mat_list = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
  65. gkernel, itr_max, params_ged=params_ged,
  66. parallel=True)
  67. total_time = np.sum(time_list)
  68. # print('\nedit_costs:', edit_costs)
  69. # print('\nresidual_list:', residual_list)
  70. # print('\nedit_cost_list:', edit_cost_list)
  71. # print('\ndistance matrix in kernel space:', dis_k_mat)
  72. # print('\nged matrix:', ged_mat)
  73. # print('\ntotal time:', total_time)
  74. # print('\nnb_cost_mat:', nb_cost_mat_list[-1])
  75. np.savez('results/median_paper/fit_distance.clcpc.python_init40.monot.elabeled.uhpkernel.y'
  76. + y + '.repeat' + str(repeat) + '.k10..gm',
  77. edit_costs=edit_costs,
  78. residual_list=residual_list, edit_cost_list=edit_cost_list,
  79. dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  80. total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
  81. for ec in edit_costs:
  82. edit_costs_output_file.write(str(ec) + ' ')
  83. edit_costs_output_file.write('\n')
  84. edit_costs_output_file.close()
  85. # # normalized distance matrices.
  86. # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
  87. # edit_costs = gmfile['edit_costs']
  88. # residual_list = gmfile['residual_list']
  89. # edit_cost_list = gmfile['edit_cost_list']
  90. # dis_k_mat = gmfile['dis_k_mat']
  91. # ged_mat = gmfile['ged_mat']
  92. # total_time = gmfile['total_time']
  93. # nb_cost_mat_list = gmfile['nb_cost_mat_list']
  94. nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
  95. print(nb_consistent, nb_inconsistent, ratio_consistent)
  96. # norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
  97. # plt.imshow(norm_dis_k_mat)
  98. # plt.colorbar()
  99. # plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
  100. # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
  101. # plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
  102. # + y + '.repeat' + str(repeat) + '.png', format='png')
  103. # # plt.show()
  104. # plt.clf()
  105. #
  106. # norm_ged_mat = normalize_distance_matrix(ged_mat)
  107. # plt.imshow(norm_ged_mat)
  108. # plt.colorbar()
  109. # plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
  110. # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
  111. # plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
  112. # + y + '.repeat' + str(repeat) + '.png', format='png')
  113. # # plt.show()
  114. # plt.clf()
  115. #
  116. # norm_diff = norm_ged_mat - norm_dis_k_mat
  117. # plt.imshow(norm_diff)
  118. # plt.colorbar()
  119. # plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y'
  120. # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
  121. # plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y'
  122. # + y + '.repeat' + str(repeat) + '.png', format='png')
  123. # # plt.show()
  124. # plt.clf()
  125. # # draw_count_bar(norm_diff)
  126. def median_paper_clcpc_python_bash_cpp():
  127. """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
  128. python invoking the c++ code by bash command (with updated library).
  129. """
  130. # ds = {'name': 'monoterpenoides',
  131. # 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
  132. # _, y_all = loadDataset(ds['dataset'])
  133. gkernel = 'untilhpathkernel'
  134. node_label = 'atom'
  135. edge_label = 'bond_type'
  136. itr_max = 20
  137. algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
  138. params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP',
  139. 'algo_options': algo_options}
  140. y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
  141. repeats = 50
  142. collection_path = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/'
  143. graph_dir = collection_path + 'gxl/'
  144. fn_edit_costs_output = 'results/median_paper/edit_costs_output.txt'
  145. for y in y_all:
  146. for repeat in range(repeats):
  147. edit_costs_output_file = open(fn_edit_costs_output, 'a')
  148. collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
  149. Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
  150. edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
  151. nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
  152. gkernel, itr_max, params_ged=params_ged,
  153. parallel=False)
  154. total_time = np.sum(time_list)
  155. # print('\nedit_costs:', edit_costs)
  156. # print('\nresidual_list:', residual_list)
  157. # print('\nedit_cost_list:', edit_cost_list)
  158. # print('\ndistance matrix in kernel space:', dis_k_mat)
  159. # print('\nged matrix:', ged_mat)
  160. # print('\ntotal time:', total_time)
  161. # print('\nnb_cost_mat:', nb_cost_mat_list[-1])
  162. np.savez('results/median_paper/fit_distance.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
  163. + y + '.repeat' + str(repeat) + '.gm',
  164. edit_costs=edit_costs,
  165. residual_list=residual_list, edit_cost_list=edit_cost_list,
  166. dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  167. total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
  168. coef_dk=coef_dk)
  169. for ec in edit_costs:
  170. edit_costs_output_file.write(str(ec) + ' ')
  171. edit_costs_output_file.write('\n')
  172. edit_costs_output_file.close()
  173. # # normalized distance matrices.
  174. # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
  175. # edit_costs = gmfile['edit_costs']
  176. # residual_list = gmfile['residual_list']
  177. # edit_cost_list = gmfile['edit_cost_list']
  178. # dis_k_mat = gmfile['dis_k_mat']
  179. # ged_mat = gmfile['ged_mat']
  180. # total_time = gmfile['total_time']
  181. # nb_cost_mat_list = gmfile['nb_cost_mat_list']
  182. # coef_dk = gmfile['coef_dk']
  183. nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
  184. print(nb_consistent, nb_inconsistent, ratio_consistent)
  185. # norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
  186. # plt.imshow(norm_dis_k_mat)
  187. # plt.colorbar()
  188. # plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
  189. # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
  190. # plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
  191. # + y + '.repeat' + str(repeat) + '.png', format='png')
  192. # # plt.show()
  193. # plt.clf()
  194. #
  195. # norm_ged_mat = normalize_distance_matrix(ged_mat)
  196. # plt.imshow(norm_ged_mat)
  197. # plt.colorbar()
  198. # plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
  199. # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
  200. # plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
  201. # + y + '.repeat' + str(repeat) + '.png', format='png')
  202. # # plt.show()
  203. # plt.clf()
  204. #
  205. # norm_diff = norm_ged_mat - norm_dis_k_mat
  206. # plt.imshow(norm_diff)
  207. # plt.colorbar()
  208. # plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
  209. # + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
  210. # plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
  211. # + y + '.repeat' + str(repeat) + '.png', format='png')
  212. # # plt.show()
  213. # plt.clf()
  214. # # draw_count_bar(norm_diff)
  215. def test_cs_leq_ci_plus_cr_python_bash_cpp():
  216. """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
  217. python invoking the c++ code by bash command (with updated library).
  218. """
  219. ds = {'name': 'monoterpenoides',
  220. 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
  221. Gn, y_all = loadDataset(ds['dataset'])
  222. # Gn = Gn[0:10]
  223. gkernel = 'untilhpathkernel'
  224. node_label = 'atom'
  225. edge_label = 'bond_type'
  226. itr_max = 10
  227. algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
  228. params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP',
  229. 'algo_options': algo_options}
  230. edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
  231. nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
  232. gkernel, itr_max, params_ged=params_ged,
  233. parallel=False)
  234. total_time = np.sum(time_list)
  235. print('\nedit_costs:', edit_costs)
  236. print('\nresidual_list:', residual_list)
  237. print('\nedit_cost_list:', edit_cost_list)
  238. print('\ndistance matrix in kernel space:', dis_k_mat)
  239. print('\nged matrix:', ged_mat)
  240. print('\ntotal time:', total_time)
  241. print('\nnb_cost_mat:', nb_cost_mat_list[-1])
  242. np.savez('results/fit_distance.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel.gm',
  243. edit_costs=edit_costs,
  244. residual_list=residual_list, edit_cost_list=edit_cost_list,
  245. dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  246. total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
  247. coef_dk=coef_dk)
  248. # ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
  249. # 'extra_params': {}} # node/edge symb
  250. # Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
  251. ## Gn = Gn[0:10]
  252. ## remove_edges(Gn)
  253. # gkernel = 'untilhpathkernel'
  254. # node_label = 'atom'
  255. # edge_label = 'bond_type'
  256. # itr_max = 10
  257. # edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
  258. # nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
  259. # gkernel, itr_max)
  260. # total_time = np.sum(time_list)
  261. # print('\nedit_costs:', edit_costs)
  262. # print('\nresidual_list:', residual_list)
  263. # print('\nedit_cost_list:', edit_cost_list)
  264. # print('\ndistance matrix in kernel space:', dis_k_mat)
  265. # print('\nged matrix:', ged_mat)
  266. # print('\ntotal time:', total_time)
  267. # print('\nnb_cost_mat:', nb_cost_mat_list[-1])
  268. # np.savez('results/fit_distance.cs_leq_ci_plus_cr.mutag.elabeled.uhpkernel.gm',
  269. # edit_costs=edit_costs,
  270. # residual_list=residual_list, edit_cost_list=edit_cost_list,
  271. # dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  272. # total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk)
  273. # # normalized distance matrices.
  274. # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.monot.elabeled.uhpkernel.gm.npz')
  275. # edit_costs = gmfile['edit_costs']
  276. # residual_list = gmfile['residual_list']
  277. # edit_cost_list = gmfile['edit_cost_list']
  278. # dis_k_mat = gmfile['dis_k_mat']
  279. # ged_mat = gmfile['ged_mat']
  280. # total_time = gmfile['total_time']
  281. # nb_cost_mat_list = gmfile['nb_cost_mat_list']
  282. # coef_dk = gmfile['coef_dk']
  283. nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
  284. print(nb_consistent, nb_inconsistent, ratio_consistent)
  285. # dis_k_sub = pairwise_substitution(dis_k_mat)
  286. # ged_sub = pairwise_substitution(ged_mat)
  287. # np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.gm',
  288. # dis_k_sub=dis_k_sub, ged_sub=ged_sub)
  289. norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
  290. plt.imshow(norm_dis_k_mat)
  291. plt.colorbar()
  292. plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
  293. + '.eps', format='eps', dpi=300)
  294. plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
  295. + '.png', format='png')
  296. # plt.show()
  297. plt.clf()
  298. norm_ged_mat = normalize_distance_matrix(ged_mat)
  299. plt.imshow(norm_ged_mat)
  300. plt.colorbar()
  301. plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
  302. + '.eps', format='eps', dpi=300)
  303. plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
  304. + '.png', format='png')
  305. # plt.show()
  306. plt.clf()
  307. norm_diff = norm_ged_mat - norm_dis_k_mat
  308. plt.imshow(norm_diff)
  309. plt.colorbar()
  310. plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
  311. + '.eps', format='eps', dpi=300)
  312. plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
  313. + '.png', format='png')
  314. # plt.show()
  315. plt.clf()
  316. # draw_count_bar(norm_diff)
  317. def test_anycosts():
  318. ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
  319. 'extra_params': {}} # node/edge symb
  320. Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
  321. # Gn = Gn[0:10]
  322. remove_edges(Gn)
  323. gkernel = 'marginalizedkernel'
  324. itr_max = 10
  325. edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
  326. nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, gkernel, itr_max)
  327. total_time = np.sum(time_list)
  328. print('\nedit_costs:', edit_costs)
  329. print('\nresidual_list:', residual_list)
  330. print('\nedit_cost_list:', edit_cost_list)
  331. print('\ndistance matrix in kernel space:', dis_k_mat)
  332. print('\nged matrix:', ged_mat)
  333. print('\ntotal time:', total_time)
  334. print('\nnb_cost_mat:', nb_cost_mat_list[-1])
  335. np.savez('results/fit_distance.any_costs.gm', edit_costs=edit_costs,
  336. residual_list=residual_list, edit_cost_list=edit_cost_list,
  337. dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  338. total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
  339. # # normalized distance matrices.
  340. # gmfile = np.load('results/fit_distance.any_costs.gm.npz')
  341. # edit_costs = gmfile['edit_costs']
  342. # residual_list = gmfile['residual_list']
  343. # edit_cost_list = gmfile['edit_cost_list']
  344. # dis_k_mat = gmfile['dis_k_mat']
  345. # ged_mat = gmfile['ged_mat']
  346. # total_time = gmfile['total_time']
  347. ## nb_cost_mat_list = gmfile['nb_cost_mat_list']
  348. norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
  349. plt.imshow(norm_dis_k_mat)
  350. plt.colorbar()
  351. plt.savefig('results/norm_dis_k_mat.any_costs' + '.eps', format='eps', dpi=300)
  352. # plt.savefig('results/norm_dis_k_mat.any_costs' + '.png', format='png')
  353. # plt.show()
  354. plt.clf()
  355. norm_ged_mat = normalize_distance_matrix(ged_mat)
  356. plt.imshow(norm_ged_mat)
  357. plt.colorbar()
  358. plt.savefig('results/norm_ged_mat.any_costs' + '.eps', format='eps', dpi=300)
  359. # plt.savefig('results/norm_ged_mat.any_costs' + '.png', format='png')
  360. # plt.show()
  361. plt.clf()
  362. norm_diff = norm_ged_mat - norm_dis_k_mat
  363. plt.imshow(norm_diff)
  364. plt.colorbar()
  365. plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.eps', format='eps', dpi=300)
  366. # plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.png', format='png')
  367. # plt.show()
  368. plt.clf()
  369. # draw_count_bar(norm_diff)
  370. def test_cs_leq_ci_plus_cr():
  371. """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er
  372. """
  373. ds = {'name': 'monoterpenoides',
  374. 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
  375. Gn, y_all = loadDataset(ds['dataset'])
  376. # Gn = Gn[0:10]
  377. gkernel = 'untilhpathkernel'
  378. node_label = 'atom'
  379. edge_label = 'bond_type'
  380. itr_max = 10
  381. edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
  382. nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
  383. gkernel, itr_max,
  384. fitkernel='gaussian')
  385. total_time = np.sum(time_list)
  386. print('\nedit_costs:', edit_costs)
  387. print('\nresidual_list:', residual_list)
  388. print('\nedit_cost_list:', edit_cost_list)
  389. print('\ndistance matrix in kernel space:', dis_k_mat)
  390. print('\nged matrix:', ged_mat)
  391. print('\ntotal time:', total_time)
  392. print('\nnb_cost_mat:', nb_cost_mat_list[-1])
  393. np.savez('results/fit_distance.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel.gm',
  394. edit_costs=edit_costs,
  395. residual_list=residual_list, edit_cost_list=edit_cost_list,
  396. dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  397. total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
  398. coef_dk=coef_dk)
  399. # ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
  400. # 'extra_params': {}} # node/edge symb
  401. # Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
  402. ## Gn = Gn[0:10]
  403. ## remove_edges(Gn)
  404. # gkernel = 'untilhpathkernel'
  405. # node_label = 'atom'
  406. # edge_label = 'bond_type'
  407. # itr_max = 10
  408. # edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
  409. # nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
  410. # gkernel, itr_max)
  411. # total_time = np.sum(time_list)
  412. # print('\nedit_costs:', edit_costs)
  413. # print('\nresidual_list:', residual_list)
  414. # print('\nedit_cost_list:', edit_cost_list)
  415. # print('\ndistance matrix in kernel space:', dis_k_mat)
  416. # print('\nged matrix:', ged_mat)
  417. # print('\ntotal time:', total_time)
  418. # print('\nnb_cost_mat:', nb_cost_mat_list[-1])
  419. # np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.mutag.elabeled.uhpkernel.gm',
  420. # edit_costs=edit_costs,
  421. # residual_list=residual_list, edit_cost_list=edit_cost_list,
  422. # dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  423. # total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk)
  424. # # normalized distance matrices.
  425. # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
  426. # edit_costs = gmfile['edit_costs']
  427. # residual_list = gmfile['residual_list']
  428. # edit_cost_list = gmfile['edit_cost_list']
  429. # dis_k_mat = gmfile['dis_k_mat']
  430. # ged_mat = gmfile['ged_mat']
  431. # total_time = gmfile['total_time']
  432. # nb_cost_mat_list = gmfile['nb_cost_mat_list']
  433. # coef_dk = gmfile['coef_dk']
  434. nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
  435. print(nb_consistent, nb_inconsistent, ratio_consistent)
  436. # dis_k_sub = pairwise_substitution(dis_k_mat)
  437. # ged_sub = pairwise_substitution(ged_mat)
  438. # np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.cost_leq_1en2.gm',
  439. # dis_k_sub=dis_k_sub, ged_sub=ged_sub)
  440. norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
  441. plt.imshow(norm_dis_k_mat)
  442. plt.colorbar()
  443. plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
  444. + '.eps', format='eps', dpi=300)
  445. plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
  446. + '.png', format='png')
  447. # plt.show()
  448. plt.clf()
  449. norm_ged_mat = normalize_distance_matrix(ged_mat)
  450. plt.imshow(norm_ged_mat)
  451. plt.colorbar()
  452. plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
  453. + '.eps', format='eps', dpi=300)
  454. plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
  455. + '.png', format='png')
  456. # plt.show()
  457. plt.clf()
  458. norm_diff = norm_ged_mat - norm_dis_k_mat
  459. plt.imshow(norm_diff)
  460. plt.colorbar()
  461. plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
  462. + '.eps', format='eps', dpi=300)
  463. plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
  464. + '.png', format='png')
  465. # plt.show()
  466. plt.clf()
  467. # draw_count_bar(norm_diff)
  468. def test_unfitted():
  469. """unfitted.
  470. """
  471. from fitDistance import compute_geds
  472. from utils import kernel_distance_matrix
  473. ds = {'name': 'monoterpenoides',
  474. 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
  475. Gn, y_all = loadDataset(ds['dataset'])
  476. # Gn = Gn[0:10]
  477. gkernel = 'untilhpathkernel'
  478. node_label = 'atom'
  479. edge_label = 'bond_type'
  480. # ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
  481. # 'extra_params': {}} # node/edge symb
  482. # Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
  483. ## Gn = Gn[0:10]
  484. ## remove_edges(Gn)
  485. # gkernel = 'marginalizedkernel'
  486. dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, gkernel=gkernel)
  487. ged_all, ged_mat, n_edit_operations = compute_geds(Gn, [3, 3, 1, 3, 3, 1],
  488. [0, 1, 2, 3, 4, 5], parallel=True)
  489. print('\ndistance matrix in kernel space:', dis_k_mat)
  490. print('\nged matrix:', ged_mat)
  491. # np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.gm', edit_costs=edit_costs,
  492. # residual_list=residual_list, edit_cost_list=edit_cost_list,
  493. # dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
  494. # total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
  495. # normalized distance matrices.
  496. # gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en3.gm.npz')
  497. # edit_costs = gmfile['edit_costs']
  498. # residual_list = gmfile['residual_list']
  499. # edit_cost_list = gmfile['edit_cost_list']
  500. # dis_k_mat = gmfile['dis_k_mat']
  501. # ged_mat = gmfile['ged_mat']
  502. # total_time = gmfile['total_time']
  503. # nb_cost_mat_list = gmfile['nb_cost_mat_list']
  504. nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
  505. print(nb_consistent, nb_inconsistent, ratio_consistent)
  506. norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
  507. plt.imshow(norm_dis_k_mat)
  508. plt.colorbar()
  509. plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
  510. plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.png', format='png')
  511. # plt.show()
  512. plt.clf()
  513. norm_ged_mat = normalize_distance_matrix(ged_mat)
  514. plt.imshow(norm_ged_mat)
  515. plt.colorbar()
  516. plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
  517. plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.png', format='png')
  518. # plt.show()
  519. plt.clf()
  520. norm_diff = norm_ged_mat - norm_dis_k_mat
  521. plt.imshow(norm_diff)
  522. plt.colorbar()
  523. plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
  524. plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.png', format='png')
  525. # plt.show()
  526. plt.clf()
  527. draw_count_bar(norm_diff)
  528. def pairwise_substitution_consistence(mat1, mat2):
  529. """
  530. """
  531. nb_consistent = 0
  532. nb_inconsistent = 0
  533. # the matrix is considered symmetric.
  534. upper_tri1 = mat1[np.triu_indices_from(mat1)]
  535. upper_tri2 = mat2[np.tril_indices_from(mat2)]
  536. for i in tqdm(range(len(upper_tri1)), desc='computing consistence', file=sys.stdout):
  537. for j in range(i, len(upper_tri1)):
  538. if np.sign(upper_tri1[i] - upper_tri1[j]) == np.sign(upper_tri2[i] - upper_tri2[j]):
  539. nb_consistent += 1
  540. else:
  541. nb_inconsistent += 1
  542. return nb_consistent, nb_inconsistent, nb_consistent / (nb_consistent + nb_inconsistent)
  543. def pairwise_substitution(mat):
  544. # the matrix is considered symmetric.
  545. upper_tri = mat[np.triu_indices_from(mat)]
  546. sub_list = []
  547. for i in tqdm(range(len(upper_tri)), desc='computing', file=sys.stdout):
  548. for j in range(i, len(upper_tri)):
  549. sub_list.append(upper_tri[i] - upper_tri[j])
  550. return sub_list
  551. def draw_count_bar(norm_diff):
  552. import pandas
  553. from collections import Counter, OrderedDict
  554. norm_diff_cnt = norm_diff.flatten()
  555. norm_diff_cnt = norm_diff_cnt * 10
  556. norm_diff_cnt = np.floor(norm_diff_cnt)
  557. norm_diff_cnt = Counter(norm_diff_cnt)
  558. norm_diff_cnt = OrderedDict(sorted(norm_diff_cnt.items()))
  559. df = pandas.DataFrame.from_dict(norm_diff_cnt, orient='index')
  560. df.plot(kind='bar')
if __name__ == '__main__':
    # Script entry point: exactly one experiment is active at a time; the
    # others are kept commented out so they can be switched on by hand.
    # test_anycosts()
    # test_cs_leq_ci_plus_cr()
    # test_unfitted()
    # test_cs_leq_ci_plus_cr_python_bash_cpp()
    # median_paper_clcpc_python_bash_cpp()
    # median_paper_clcpc_python_best()

    # Quick manual check of pairwise_substitution on a small matrix.
    # x = np.array([[1,2,3],[4,5,6],[7,8,9]])
    # xx = pairwise_substitution(x)

    test_update_costs()

A Python package for graph kernels, graph edit distances and graph pre-image problem.