You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

visualization.py 28 kB

5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Thu Dec 19 17:16:23 2019
  5. @author: ljia
  6. """
  7. import numpy as np
  8. from sklearn.manifold import TSNE, Isomap
  9. import matplotlib.pyplot as plt
  10. from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes, mark_inset
  11. from tqdm import tqdm
  12. import sys
  13. sys.path.insert(0, "../")
  14. from gklearn.utils.graphfiles import loadDataset, loadGXL
  15. from utils import kernel_distance_matrix, compute_kernel, dis_gstar, get_same_item_indices
  16. def visualize_graph_dataset(dis_measure, visual_method, draw_figure,
  17. draw_params={}, dis_mat=None, Gn=None,
  18. median_set=None):
  19. def draw_zoomed_axes(Gn_embedded, ax):
  20. margin = 0.01
  21. if dis_measure == 'graph-kernel':
  22. index = -2
  23. elif dis_measure == 'ged':
  24. index = -1
  25. x1 = np.min(Gn_embedded[median_set + [index], 0]) - margin * np.max(Gn_embedded)
  26. x2 = np.max(Gn_embedded[median_set + [index], 0]) + margin * np.max(Gn_embedded)
  27. y1 = np.min(Gn_embedded[median_set + [index], 1]) - margin * np.max(Gn_embedded)
  28. y2 = np.max(Gn_embedded[median_set + [index], 1]) + margin * np.max(Gn_embedded)
  29. if (x1 < 0 and y1 < 0) or ((x1 > 0 and y1 > 0)):
  30. loc = 2
  31. else:
  32. loc = 3
  33. axins = zoomed_inset_axes(ax, 4, loc=loc) # zoom-factor: 2.5, location: upper-left
  34. draw_figure(axins, Gn_embedded, dis_measure=dis_measure,
  35. median_set=median_set, **draw_params)
  36. axins.set_xlim(x1, x2) # apply the x-limits
  37. axins.set_ylim(y1, y2) # apply the y-limits
  38. plt.yticks(visible=False)
  39. plt.xticks(visible=False)
  40. loc1 = 1 if loc == 2 else 3
  41. mark_inset(ax, axins, loc1=2, loc2=4, fc="none", ec="0.5")
  42. if dis_mat is None:
  43. if dis_measure == 'graph-kernel':
  44. gkernel = 'untilhpathkernel'
  45. node_label = 'atom'
  46. edge_label = 'bond_type'
  47. dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label,
  48. Kmatrix=None, gkernel=gkernel)
  49. elif dis_measure == 'ged':
  50. pass
  51. if visual_method == 'tsne':
  52. Gn_embedded = TSNE(n_components=2, metric='precomputed').fit_transform(dis_mat)
  53. elif visual_method == 'isomap':
  54. Gn_embedded = Isomap(n_components=2, metric='precomputed').fit_transform(dis_mat)
  55. print(Gn_embedded.shape)
  56. fig, ax = plt.subplots()
  57. draw_figure(plt, Gn_embedded, dis_measure=dis_measure, legend=True,
  58. median_set=median_set, **draw_params)
  59. # draw_zoomed_axes(Gn_embedded, ax)
  60. plt.show()
  61. plt.clf()
  62. return
  63. def draw_figure(ax, Gn_embedded, dis_measure=None, y_idx=None, legend=False,
  64. median_set=None):
  65. from matplotlib import colors as mcolors
  66. colors = list(dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS))
  67. # colors = ['#08306b', '#08519c', '#2171b5', '#4292c6', '#6baed6', '#9ecae1',
  68. # '#c6dbef', '#deebf7']
  69. # for i, values in enumerate(y_idx.values()):
  70. # for item in values:
  71. ## ax.scatter(Gn_embedded[item,0], Gn_embedded[item,1], c=colors[i]) # , c='b')
  72. # ax.scatter(Gn_embedded[item,0], Gn_embedded[item,1], c='b')
  73. # ax.scatter(Gn_embedded[:,0], Gn_embedded[:,1], c='b')
  74. h1 = ax.scatter(Gn_embedded[median_set, 0], Gn_embedded[median_set, 1], c='b')
  75. if dis_measure == 'graph-kernel':
  76. h2 = ax.scatter(Gn_embedded[-1, 0], Gn_embedded[-1, 1], c='darkorchid') # \psi
  77. h3 = ax.scatter(Gn_embedded[-2, 0], Gn_embedded[-2, 1], c='gold') # gen median
  78. h4 = ax.scatter(Gn_embedded[-3, 0], Gn_embedded[-3, 1], c='r') #c='g', marker='+') # set median
  79. elif dis_measure == 'ged':
  80. h3 = ax.scatter(Gn_embedded[-1, 0], Gn_embedded[-1, 1], c='gold') # gen median
  81. h4 = ax.scatter(Gn_embedded[-2, 0], Gn_embedded[-2, 1], c='r') #c='g', marker='+') # set median
  82. if legend:
  83. # fig.subplots_adjust(bottom=0.17)
  84. if dis_measure == 'graph-kernel':
  85. ax.legend([h1, h2, h3, h4],
  86. ['k closest graphs', 'true median', 'gen median', 'set median'])
  87. elif dis_measure == 'ged':
  88. ax.legend([h1, h3, h4], ['k closest graphs', 'gen median', 'set median'])
  89. # fig.legend(handles, labels, loc='lower center', ncol=2, frameon=False) # , ncol=5, labelspacing=0.1, handletextpad=0.4, columnspacing=0.6)
  90. # plt.savefig('symbolic_and_non_comparison_vertical_short.eps', format='eps', dpi=300, transparent=True,
  91. # bbox_inches='tight')
  92. # plt.show()
  93. ###############################################################################
  94. def visualize_distances_in_kernel():
  95. ds = {'name': 'monoterpenoides',
  96. 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
  97. Gn, y_all = loadDataset(ds['dataset'])
  98. # Gn = Gn[0:50]
  99. fname_medians = 'expert.treelet'
  100. # add set median.
  101. fname_sm = '/media/ljia/DATA/research-repo/codes/Linlin/graphkit-learn/preimage/results/test_k_closest_graphs/set_median.' + fname_medians + '.gxl'
  102. set_median = loadGXL(fname_sm)
  103. Gn.append(set_median)
  104. # add generalized median (estimated pre-image.)
  105. fname_gm = '/media/ljia/DATA/research-repo/codes/Linlin/graphkit-learn/preimage/results/test_k_closest_graphs/gen_median.' + fname_medians + '.gxl'
  106. gen_median = loadGXL(fname_gm)
  107. Gn.append(gen_median)
  108. # compute distance matrix
  109. median_set = [22, 29, 54, 74]
  110. gkernel = 'treeletkernel'
  111. node_label = 'atom'
  112. edge_label = 'bond_type'
  113. Gn_median_set = [Gn[i].copy() for i in median_set]
  114. Kmatrix_median = compute_kernel(Gn + Gn_median_set, gkernel, node_label,
  115. edge_label, True)
  116. Kmatrix = Kmatrix_median[0:len(Gn), 0:len(Gn)]
  117. dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label,
  118. Kmatrix=Kmatrix, gkernel=gkernel)
  119. print('average distances: ', np.mean(np.mean(dis_mat[0:len(Gn)-2, 0:len(Gn)-2])))
  120. print('min distances: ', np.min(np.min(dis_mat[0:len(Gn)-2, 0:len(Gn)-2])))
  121. print('max distances: ', np.max(np.max(dis_mat[0:len(Gn)-2, 0:len(Gn)-2])))
  122. # add distances for the image of exact median \psi.
  123. dis_k_median_list = []
  124. for idx, g in enumerate(Gn):
  125. dis_k_median_list.append(dis_gstar(idx, range(len(Gn), len(Gn) + len(Gn_median_set)),
  126. [1 / len(Gn_median_set)] * len(Gn_median_set),
  127. Kmatrix_median, withterm3=False))
  128. dis_mat_median = np.zeros((len(Gn) + 1, len(Gn) + 1))
  129. for i in range(len(Gn)):
  130. for j in range(i, len(Gn)):
  131. dis_mat_median[i, j] = dis_mat[i, j]
  132. dis_mat_median[j, i] = dis_mat_median[i, j]
  133. for i in range(len(Gn)):
  134. dis_mat_median[i, -1] = dis_k_median_list[i]
  135. dis_mat_median[-1, i] = dis_k_median_list[i]
  136. # get indices by classes.
  137. y_idx = get_same_item_indices(y_all)
  138. # visualization.
  139. # visualize_graph_dataset('graph-kernel', 'tsne', Gn)
  140. # visualize_graph_dataset('graph-kernel', 'tsne', draw_figure,
  141. # draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median)
  142. visualize_graph_dataset('graph-kernel', 'tsne', draw_figure,
  143. draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median,
  144. median_set=median_set)
  145. def visualize_distances_in_ged():
  146. from fitDistance import compute_geds
  147. from ged import GED
  148. ds = {'name': 'monoterpenoides',
  149. 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
  150. Gn, y_all = loadDataset(ds['dataset'])
  151. # Gn = Gn[0:50]
  152. # add set median.
  153. fname_medians = 'expert.treelet'
  154. fname_sm = '/media/ljia/DATA/research-repo/codes/Linlin/graphkit-learn/preimage/results/test_k_closest_graphs/set_median.' + fname_medians + '.gxl'
  155. set_median = loadGXL(fname_sm)
  156. Gn.append(set_median)
  157. # add generalized median (estimated pre-image.)
  158. fname_gm = '/media/ljia/DATA/research-repo/codes/Linlin/graphkit-learn/preimage/results/test_k_closest_graphs/gen_median.' + fname_medians + '.gxl'
  159. gen_median = loadGXL(fname_gm)
  160. Gn.append(gen_median)
  161. # compute/load ged matrix.
  162. # # compute.
  163. ## k = 4
  164. ## edit_costs = [0.16229209837639536, 0.06612870523413916, 0.04030113378793905, 0.20723547009415202, 0.3338607220394598, 0.27054392518077297]
  165. # edit_costs = [3, 3, 1, 3, 3, 1]
  166. ## edit_costs = [7, 3, 5, 9, 2, 6]
  167. # algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
  168. # params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
  169. # 'algo_options': algo_options, 'stabilizer': None,
  170. # 'edit_cost_constant': edit_costs}
  171. # _, ged_mat, _ = compute_geds(Gn, params_ged=params_ged, parallel=True)
  172. # np.savez('results/test_k_closest_graphs/ged_mat.' + fname_medians + '.with_medians.gm', ged_mat=ged_mat)
  173. # load from file.
  174. gmfile = np.load('results/test_k_closest_graphs/ged_mat.' + fname_medians + '.with_medians.gm.npz')
  175. ged_mat = gmfile['ged_mat']
  176. # # change medians.
  177. # edit_costs = [3, 3, 1, 3, 3, 1]
  178. # algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
  179. # params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
  180. # 'algo_options': algo_options, 'stabilizer': None,
  181. # 'edit_cost_constant': edit_costs}
  182. # for idx in tqdm(range(len(Gn) - 2), desc='computing GEDs', file=sys.stdout):
  183. # dis, _, _ = GED(Gn[idx], set_median, **params_ged)
  184. # ged_mat[idx, -2] = dis
  185. # ged_mat[-2, idx] = dis
  186. # dis, _, _ = GED(Gn[idx], gen_median, **params_ged)
  187. # ged_mat[idx, -1] = dis
  188. # ged_mat[-1, idx] = dis
  189. # np.savez('results/test_k_closest_graphs/ged_mat.' + fname_medians + '.with_medians.gm',
  190. # ged_mat=ged_mat)
  191. # get indices by classes.
  192. y_idx = get_same_item_indices(y_all)
  193. # visualization.
  194. median_set = [22, 29, 54, 74]
  195. visualize_graph_dataset('ged', 'tsne', draw_figure,
  196. draw_params={'y_idx': y_idx}, dis_mat=ged_mat,
  197. median_set=median_set)
  198. ###############################################################################
  199. def visualize_distances_in_kernel_monoterpenoides():
  200. ds = {'dataset': '../datasets/monoterpenoides/dataset_10+.ds',
  201. 'graph_dir': '/media/ljia/DATA/research-repo/codes/Linlin/graphkit-learn/datasets/monoterpenoides/'} # node/edge symb
  202. Gn_original, y_all = loadDataset(ds['dataset'])
  203. # Gn = Gn[0:50]
  204. # compute distance matrix
  205. # median_set = [22, 29, 54, 74]
  206. gkernel = 'treeletkernel'
  207. fit_method = 'expert'
  208. node_label = 'atom'
  209. edge_label = 'bond_type'
  210. ds_name = 'monoterpenoides'
  211. fname_medians = fit_method + '.' + gkernel
  212. dir_output = 'results/xp_monoterpenoides/'
  213. repeat = 0
  214. # get indices by classes.
  215. y_idx = get_same_item_indices(y_all)
  216. for i, (y, values) in enumerate(y_idx.items()):
  217. print('\ny =', y)
  218. k = len(values)
  219. Gn = [Gn_original[g].copy() for g in values]
  220. # add set median.
  221. fname_sm = dir_output + 'medians/' + str(int(y)) + '/set_median.k' + str(int(k)) \
  222. + '.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl'
  223. set_median = loadGXL(fname_sm)
  224. Gn.append(set_median)
  225. # add generalized median (estimated pre-image.)
  226. fname_gm = dir_output + 'medians/' + str(int(y)) + '/gen_median.k' + str(int(k)) \
  227. + '.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl'
  228. gen_median = loadGXL(fname_gm)
  229. Gn.append(gen_median)
  230. # compute distance matrix
  231. median_set = range(0, len(values))
  232. Gn_median_set = [Gn[i].copy() for i in median_set]
  233. Kmatrix_median = compute_kernel(Gn + Gn_median_set, gkernel, node_label,
  234. edge_label, False)
  235. Kmatrix = Kmatrix_median[0:len(Gn), 0:len(Gn)]
  236. dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label,
  237. Kmatrix=Kmatrix, gkernel=gkernel)
  238. print('average distances: ', np.mean(np.mean(dis_mat[0:len(Gn)-2, 0:len(Gn)-2])))
  239. print('min distances: ', np.min(np.min(dis_mat[0:len(Gn)-2, 0:len(Gn)-2])))
  240. print('max distances: ', np.max(np.max(dis_mat[0:len(Gn)-2, 0:len(Gn)-2])))
  241. # add distances for the image of exact median \psi.
  242. dis_k_median_list = []
  243. for idx, g in enumerate(Gn):
  244. dis_k_median_list.append(dis_gstar(idx, range(len(Gn), len(Gn) + len(Gn_median_set)),
  245. [1 / len(Gn_median_set)] * len(Gn_median_set),
  246. Kmatrix_median, withterm3=False))
  247. dis_mat_median = np.zeros((len(Gn) + 1, len(Gn) + 1))
  248. for i in range(len(Gn)):
  249. for j in range(i, len(Gn)):
  250. dis_mat_median[i, j] = dis_mat[i, j]
  251. dis_mat_median[j, i] = dis_mat_median[i, j]
  252. for i in range(len(Gn)):
  253. dis_mat_median[i, -1] = dis_k_median_list[i]
  254. dis_mat_median[-1, i] = dis_k_median_list[i]
  255. # visualization.
  256. # visualize_graph_dataset('graph-kernel', 'tsne', Gn)
  257. # visualize_graph_dataset('graph-kernel', 'tsne', draw_figure,
  258. # draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median)
  259. visualize_graph_dataset('graph-kernel', 'tsne', draw_figure,
  260. draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median,
  261. median_set=median_set)
  262. def visualize_distances_in_ged_monoterpenoides():
  263. from fitDistance import compute_geds
  264. from ged import GED
  265. ds = {'dataset': '../datasets/monoterpenoides/dataset_10+.ds',
  266. 'graph_dir': '/media/ljia/DATA/research-repo/codes/Linlin/graphkit-learn/datasets/monoterpenoides/'} # node/edge symb
  267. Gn_original, y_all = loadDataset(ds['dataset'])
  268. # Gn = Gn[0:50]
  269. # compute distance matrix
  270. # median_set = [22, 29, 54, 74]
  271. gkernel = 'treeletkernel'
  272. fit_method = 'expert'
  273. ds_name = 'monoterpenoides'
  274. fname_medians = fit_method + '.' + gkernel
  275. dir_output = 'results/xp_monoterpenoides/'
  276. repeat = 0
  277. # edit_costs = [0.16229209837639536, 0.06612870523413916, 0.04030113378793905, 0.20723547009415202, 0.3338607220394598, 0.27054392518077297]
  278. edit_costs = [3, 3, 1, 3, 3, 1]
  279. # edit_costs = [7, 3, 5, 9, 2, 6]
  280. # get indices by classes.
  281. y_idx = get_same_item_indices(y_all)
  282. for i, (y, values) in enumerate(y_idx.items()):
  283. print('\ny =', y)
  284. k = len(values)
  285. Gn = [Gn_original[g].copy() for g in values]
  286. # add set median.
  287. fname_sm = dir_output + 'medians/' + str(int(y)) + '/set_median.k' + str(int(k)) \
  288. + '.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl'
  289. set_median = loadGXL(fname_sm)
  290. Gn.append(set_median)
  291. # add generalized median (estimated pre-image.)
  292. fname_gm = dir_output + 'medians/' + str(int(y)) + '/gen_median.k' + str(int(k)) \
  293. + '.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl'
  294. gen_median = loadGXL(fname_gm)
  295. Gn.append(gen_median)
  296. # compute/load ged matrix.
  297. # compute.
  298. algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
  299. params_ged = {'dataset': ds_name, 'lib': 'gedlibpy', 'cost': 'CONSTANT',
  300. 'method': 'IPFP', 'algo_options': algo_options,
  301. 'stabilizer': None, 'edit_cost_constant': edit_costs}
  302. _, ged_mat, _ = compute_geds(Gn, params_ged=params_ged, parallel=True)
  303. np.savez(dir_output + 'ged_mat.' + fname_medians + '.y' + str(int(y)) \
  304. + '.with_medians.gm', ged_mat=ged_mat)
  305. # # load from file.
  306. # gmfile = np.load('dir_output + 'ged_mat.' + fname_medians + '.y' + str(int(y)) + '.with_medians.gm.npz')
  307. # ged_mat = gmfile['ged_mat']
  308. # # change medians.
  309. # algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
  310. # params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
  311. # 'algo_options': algo_options, 'stabilizer': None,
  312. # 'edit_cost_constant': edit_costs}
  313. # for idx in tqdm(range(len(Gn) - 2), desc='computing GEDs', file=sys.stdout):
  314. # dis, _, _ = GED(Gn[idx], set_median, **params_ged)
  315. # ged_mat[idx, -2] = dis
  316. # ged_mat[-2, idx] = dis
  317. # dis, _, _ = GED(Gn[idx], gen_median, **params_ged)
  318. # ged_mat[idx, -1] = dis
  319. # ged_mat[-1, idx] = dis
  320. # np.savez(dir_output + 'ged_mat.' + fname_medians + '.y' + str(int(y)) + '.with_medians.gm',
  321. # ged_mat=ged_mat)
  322. # visualization.
  323. median_set = range(0, len(values))
  324. visualize_graph_dataset('ged', 'tsne', draw_figure,
  325. draw_params={'y_idx': y_idx}, dis_mat=ged_mat,
  326. median_set=median_set)
  327. ###############################################################################
  328. def visualize_distances_in_kernel_letter_h():
  329. ds = {'dataset': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/collections/Letter.xml',
  330. 'graph_dir': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/datasets/Letter/HIGH/'} # node/edge symb
  331. Gn_original, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir'])
  332. # Gn = Gn[0:50]
  333. # compute distance matrix
  334. # median_set = [22, 29, 54, 74]
  335. gkernel = 'structuralspkernel'
  336. fit_method = 'expert'
  337. node_label = None
  338. edge_label = None
  339. ds_name = 'letter-h'
  340. fname_medians = fit_method + '.' + gkernel
  341. dir_output = 'results/xp_letter_h/'
  342. k = 150
  343. repeat = 0
  344. # get indices by classes.
  345. y_idx = get_same_item_indices(y_all)
  346. for i, (y, values) in enumerate(y_idx.items()):
  347. print('\ny =', y)
  348. Gn = [Gn_original[g].copy() for g in values]
  349. # add set median.
  350. fname_sm = dir_output + 'medians/' + y + '/set_median.k' + str(int(k)) \
  351. + '.y' + y + '.repeat' + str(repeat) + '.gxl'
  352. set_median = loadGXL(fname_sm)
  353. Gn.append(set_median)
  354. # add generalized median (estimated pre-image.)
  355. fname_gm = dir_output + 'medians/' + y + '/gen_median.k' + str(int(k)) \
  356. + '.y' + y + '.repeat' + str(repeat) + '.gxl'
  357. gen_median = loadGXL(fname_gm)
  358. Gn.append(gen_median)
  359. # compute distance matrix
  360. median_set = range(0, len(values))
  361. Gn_median_set = [Gn[i].copy() for i in median_set]
  362. Kmatrix_median = compute_kernel(Gn + Gn_median_set, gkernel, node_label,
  363. edge_label, False)
  364. Kmatrix = Kmatrix_median[0:len(Gn), 0:len(Gn)]
  365. dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label,
  366. Kmatrix=Kmatrix, gkernel=gkernel)
  367. print('average distances: ', np.mean(np.mean(dis_mat[0:len(Gn)-2, 0:len(Gn)-2])))
  368. print('min distances: ', np.min(np.min(dis_mat[0:len(Gn)-2, 0:len(Gn)-2])))
  369. print('max distances: ', np.max(np.max(dis_mat[0:len(Gn)-2, 0:len(Gn)-2])))
  370. # add distances for the image of exact median \psi.
  371. dis_k_median_list = []
  372. for idx, g in enumerate(Gn):
  373. dis_k_median_list.append(dis_gstar(idx, range(len(Gn), len(Gn) + len(Gn_median_set)),
  374. [1 / len(Gn_median_set)] * len(Gn_median_set),
  375. Kmatrix_median, withterm3=False))
  376. dis_mat_median = np.zeros((len(Gn) + 1, len(Gn) + 1))
  377. for i in range(len(Gn)):
  378. for j in range(i, len(Gn)):
  379. dis_mat_median[i, j] = dis_mat[i, j]
  380. dis_mat_median[j, i] = dis_mat_median[i, j]
  381. for i in range(len(Gn)):
  382. dis_mat_median[i, -1] = dis_k_median_list[i]
  383. dis_mat_median[-1, i] = dis_k_median_list[i]
  384. # visualization.
  385. # visualize_graph_dataset('graph-kernel', 'tsne', Gn)
  386. # visualize_graph_dataset('graph-kernel', 'tsne', draw_figure,
  387. # draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median)
  388. visualize_graph_dataset('graph-kernel', 'tsne', draw_figure,
  389. draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median,
  390. median_set=median_set)
  391. def visualize_distances_in_ged_letter_h():
  392. from fitDistance import compute_geds
  393. from preimage.test_k_closest_graphs import reform_attributes
  394. ds = {'dataset': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/collections/Letter.xml',
  395. 'graph_dir': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/datasets/Letter/HIGH/'} # node/edge symb
  396. Gn_original, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir'])
  397. # Gn = Gn[0:50]
  398. # compute distance matrix
  399. # median_set = [22, 29, 54, 74]
  400. gkernel = 'structuralspkernel'
  401. fit_method = 'expert'
  402. ds_name = 'letter-h'
  403. fname_medians = fit_method + '.' + gkernel
  404. dir_output = 'results/xp_letter_h/'
  405. k = 150
  406. repeat = 0
  407. # edit_costs = [0.16229209837639536, 0.06612870523413916, 0.04030113378793905, 0.20723547009415202, 0.3338607220394598, 0.27054392518077297]
  408. edit_costs = [3, 3, 1, 3, 3, 1]
  409. # edit_costs = [7, 3, 5, 9, 2, 6]
  410. # get indices by classes.
  411. y_idx = get_same_item_indices(y_all)
  412. for i, (y, values) in enumerate(y_idx.items()):
  413. print('\ny =', y)
  414. Gn = [Gn_original[g].copy() for g in values]
  415. # add set median.
  416. fname_sm = dir_output + 'medians/' + y + '/set_median.k' + str(int(k)) \
  417. + '.y' + y + '.repeat' + str(repeat) + '.gxl'
  418. set_median = loadGXL(fname_sm)
  419. Gn.append(set_median)
  420. # add generalized median (estimated pre-image.)
  421. fname_gm = dir_output + 'medians/' + y + '/gen_median.k' + str(int(k)) \
  422. + '.y' + y + '.repeat' + str(repeat) + '.gxl'
  423. gen_median = loadGXL(fname_gm)
  424. Gn.append(gen_median)
  425. # compute/load ged matrix.
  426. # compute.
  427. algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
  428. params_ged = {'dataset': 'Letter', 'lib': 'gedlibpy', 'cost': 'CONSTANT',
  429. 'method': 'IPFP', 'algo_options': algo_options,
  430. 'stabilizer': None, 'edit_cost_constant': edit_costs}
  431. for g in Gn:
  432. reform_attributes(g)
  433. _, ged_mat, _ = compute_geds(Gn, params_ged=params_ged, parallel=True)
  434. np.savez(dir_output + 'ged_mat.' + fname_medians + '.y' + y + '.with_medians.gm', ged_mat=ged_mat)
  435. # # load from file.
  436. # gmfile = np.load('dir_output + 'ged_mat.' + fname_medians + '.y' + y + '.with_medians.gm.npz')
  437. # ged_mat = gmfile['ged_mat']
  438. # # change medians.
  439. # algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
  440. # params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
  441. # 'algo_options': algo_options, 'stabilizer': None,
  442. # 'edit_cost_constant': edit_costs}
  443. # for idx in tqdm(range(len(Gn) - 2), desc='computing GEDs', file=sys.stdout):
  444. # dis, _, _ = GED(Gn[idx], set_median, **params_ged)
  445. # ged_mat[idx, -2] = dis
  446. # ged_mat[-2, idx] = dis
  447. # dis, _, _ = GED(Gn[idx], gen_median, **params_ged)
  448. # ged_mat[idx, -1] = dis
  449. # ged_mat[-1, idx] = dis
  450. # np.savez(dir_output + 'ged_mat.' + fname_medians + '.y' + y + '.with_medians.gm',
  451. # ged_mat=ged_mat)
  452. # visualization.
  453. median_set = range(0, len(values))
  454. visualize_graph_dataset('ged', 'tsne', draw_figure,
  455. draw_params={'y_idx': y_idx}, dis_mat=ged_mat,
  456. median_set=median_set)
if __name__ == '__main__':
    # Script entry point: exactly one experiment is run; the alternatives are
    # kept commented out so they can be switched on by hand.
    visualize_distances_in_kernel_letter_h()
#    visualize_distances_in_ged_letter_h()
#    visualize_distances_in_kernel_monoterpenoides()
#    visualize_distances_in_kernel_monoterpenoides()
#    NOTE(review): the line above repeats the kernel variant; presumably
#    visualize_distances_in_ged_monoterpenoides() was intended - confirm.
#    visualize_distances_in_kernel()
#    visualize_distances_in_ged()
  464. #def draw_figure_dis_k(ax, Gn_embedded, y_idx=None, legend=False):
  465. # from matplotlib import colors as mcolors
  466. # colors = list(dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS))
  467. ## colors = ['#08306b', '#08519c', '#2171b5', '#4292c6', '#6baed6', '#9ecae1',
  468. ## '#c6dbef', '#deebf7']
  469. # for i, values in enumerate(y_idx.values()):
  470. # for item in values:
  471. ## ax.scatter(Gn_embedded[item,0], Gn_embedded[item,1], c=colors[i]) # , c='b')
  472. # ax.scatter(Gn_embedded[item,0], Gn_embedded[item,1], c='b')
  473. # h1 = ax.scatter(Gn_embedded[[12, 13, 22, 29], 0], Gn_embedded[[12, 13, 22, 29], 1], c='r')
  474. # h2 = ax.scatter(Gn_embedded[-1, 0], Gn_embedded[-1, 1], c='darkorchid') # \psi
  475. # h3 = ax.scatter(Gn_embedded[-2, 0], Gn_embedded[-2, 1], c='gold') # gen median
  476. # h4 = ax.scatter(Gn_embedded[-3, 0], Gn_embedded[-3, 1], c='r', marker='+') # set median
  477. # if legend:
  478. ## fig.subplots_adjust(bottom=0.17)
  479. # ax.legend([h1, h2, h3, h4], ['k clostest graphs', 'true median', 'gen median', 'set median'])
  480. ## fig.legend(handles, labels, loc='lower center', ncol=2, frameon=False) # , ncol=5, labelspacing=0.1, handletextpad=0.4, columnspacing=0.6)
  481. ## plt.savefig('symbolic_and_non_comparison_vertical_short.eps', format='eps', dpi=300, transparent=True,
  482. ## bbox_inches='tight')
  483. ## plt.show()
  484. #def draw_figure_ged(ax, Gn_embedded, y_idx=None, legend=False):
  485. # from matplotlib import colors as mcolors
  486. # colors = list(dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS))
  487. ## colors = ['#08306b', '#08519c', '#2171b5', '#4292c6', '#6baed6', '#9ecae1',
  488. ## '#c6dbef', '#deebf7']
  489. # for i, values in enumerate(y_idx.values()):
  490. # for item in values:
  491. ## ax.scatter(Gn_embedded[item,0], Gn_embedded[item,1], c=colors[i]) # , c='b')
  492. # ax.scatter(Gn_embedded[item,0], Gn_embedded[item,1], c='b')
  493. # h1 = ax.scatter(Gn_embedded[[12, 13, 22, 29], 0], Gn_embedded[[12, 13, 22, 29], 1], c='r')
  494. ## h2 = ax.scatter(Gn_embedded[-1, 0], Gn_embedded[-1, 1], c='darkorchid') # \psi
  495. # h3 = ax.scatter(Gn_embedded[-1, 0], Gn_embedded[-1, 1], c='gold') # gen median
  496. # h4 = ax.scatter(Gn_embedded[-2, 0], Gn_embedded[-2, 1], c='r', marker='+') # set median
  497. # if legend:
  498. ## fig.subplots_adjust(bottom=0.17)
  499. # ax.legend([h1, h3, h4], ['k clostest graphs', 'gen median', 'set median'])
  500. ## fig.legend(handles, labels, loc='lower center', ncol=2, frameon=False) # , ncol=5, labelspacing=0.1, handletextpad=0.4, columnspacing=0.6)
  501. ## plt.savefig('symbolic_and_non_comparison_vertical_short.eps', format='eps', dpi=300, transparent=True,
  502. ## bbox_inches='tight')
  503. ## plt.show()

A Python package for graph kernels, graph edit distances and the graph pre-image problem.