
utils.py 18 kB

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 17 19:05:07 2019
Useful functions.
@author: ljia
"""
#import networkx as nx
import multiprocessing
import numpy as np
from gklearn.kernels.marginalizedKernel import marginalizedkernel
from gklearn.kernels.untilHPathKernel import untilhpathkernel
from gklearn.kernels.spKernel import spkernel
import functools
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct, polynomialkernel
from gklearn.kernels.structuralspKernel import structuralspkernel
from gklearn.kernels.treeletKernel import treeletkernel
from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel
from gklearn.utils import Dataset
import csv
import matplotlib.pyplot as plt
import networkx as nx


def generate_median_preimage_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=True, save_medians=True, plot_medians=True, dir_save=''):
    from gklearn.preimage import MedianPreimageGenerator
    from gklearn.utils import split_dataset_by_target
    from gklearn.utils.graphfiles import saveGXL

    # 1. get dataset.
    print('getting dataset...')
    dataset_all = Dataset()
    dataset_all.load_predefined_dataset(ds_name)
    datasets = split_dataset_by_target(dataset_all)
    # dataset.cut_graphs(range(0, 10))

    if save_results:
        # create result files.
        print('creating output files...')
        fn_output_detail, fn_output_summary = __init_output_file(ds_name, kernel_options['name'], mpg_options['fit_method'], dir_save)

    sod_sm_list = []
    sod_gm_list = []
    dis_k_sm_list = []
    dis_k_gm_list = []
    dis_k_gi_min_list = []
    time_precompute_gm_list = []
    time_optimize_ec_list = []
    time_generate_list = []
    time_total_list = []
    itrs_list = []
    converged_list = []
    num_updates_ecc_list = []
    nb_sod_sm2gm = [0, 0, 0]
    nb_dis_k_sm2gm = [0, 0, 0]
    nb_dis_k_gi2sm = [0, 0, 0]
    nb_dis_k_gi2gm = [0, 0, 0]
    # repeats_better_sod_sm2gm = []
    # repeats_better_dis_k_sm2gm = []
    # repeats_better_dis_k_gi2sm = []
    # repeats_better_dis_k_gi2gm = []

    print('start generating preimage for each class of target...')
    for dataset in datasets:
        print('\ntarget =', dataset.targets[0], '\n')
        num_graphs = len(dataset.graphs)
        if num_graphs < 2:
            print('\nnumber of graphs = ', num_graphs, ', skip.\n')
            continue

        # 2. set parameters.
        print('1. initializing mpg and setting parameters...')
        mpg = MedianPreimageGenerator()
        mpg.dataset = dataset
        mpg.set_options(**mpg_options.copy())
        mpg.kernel_options = kernel_options.copy()
        mpg.ged_options = ged_options.copy()
        mpg.mge_options = mge_options.copy()

        # 3. compute median preimage.
        print('2. computing median preimage...')
        mpg.run()
        results = mpg.get_results()

        # write result detail.
        if save_results:
            print('writing results to files...')
            sod_sm2gm = get_relations(np.sign(results['sod_gen_median'] - results['sod_set_median']))
            dis_k_sm2gm = get_relations(np.sign(results['k_dis_gen_median'] - results['k_dis_set_median']))
            dis_k_gi2sm = get_relations(np.sign(results['k_dis_set_median'] - results['k_dis_dataset']))
            dis_k_gi2gm = get_relations(np.sign(results['k_dis_gen_median'] - results['k_dis_dataset']))

            f_detail = open(dir_save + fn_output_detail, 'a')
            csv.writer(f_detail).writerow([ds_name, kernel_options['name'],
                      ged_options['edit_cost'], ged_options['method'],
                      ged_options['attr_distance'], mpg_options['fit_method'],
                      num_graphs, dataset.targets[0], 1,
                      results['sod_set_median'], results['sod_gen_median'],
                      results['k_dis_set_median'], results['k_dis_gen_median'],
                      results['k_dis_dataset'], sod_sm2gm, dis_k_sm2gm,
                      dis_k_gi2sm, dis_k_gi2gm, results['edit_cost_constants'],
                      results['runtime_precompute_gm'], results['runtime_optimize_ec'],
                      results['runtime_generate_preimage'], results['runtime_total'],
                      results['itrs'], results['converged'],
                      results['num_updates_ecc']])
            f_detail.close()

            # compute result summary.
            sod_sm_list.append(results['sod_set_median'])
            sod_gm_list.append(results['sod_gen_median'])
            dis_k_sm_list.append(results['k_dis_set_median'])
            dis_k_gm_list.append(results['k_dis_gen_median'])
            dis_k_gi_min_list.append(results['k_dis_dataset'])
            time_precompute_gm_list.append(results['runtime_precompute_gm'])
            time_optimize_ec_list.append(results['runtime_optimize_ec'])
            time_generate_list.append(results['runtime_generate_preimage'])
            time_total_list.append(results['runtime_total'])
            itrs_list.append(results['itrs'])
            converged_list.append(results['converged'])
            num_updates_ecc_list.append(results['num_updates_ecc'])
            # # SOD SM -> GM
            if results['sod_set_median'] > results['sod_gen_median']:
                nb_sod_sm2gm[0] += 1
                # repeats_better_sod_sm2gm.append(1)
            elif results['sod_set_median'] == results['sod_gen_median']:
                nb_sod_sm2gm[1] += 1
            elif results['sod_set_median'] < results['sod_gen_median']:
                nb_sod_sm2gm[2] += 1
            # # dis_k SM -> GM
            if results['k_dis_set_median'] > results['k_dis_gen_median']:
                nb_dis_k_sm2gm[0] += 1
                # repeats_better_dis_k_sm2gm.append(1)
            elif results['k_dis_set_median'] == results['k_dis_gen_median']:
                nb_dis_k_sm2gm[1] += 1
            elif results['k_dis_set_median'] < results['k_dis_gen_median']:
                nb_dis_k_sm2gm[2] += 1
            # # dis_k gi -> SM
            if results['k_dis_dataset'] > results['k_dis_set_median']:
                nb_dis_k_gi2sm[0] += 1
                # repeats_better_dis_k_gi2sm.append(1)
            elif results['k_dis_dataset'] == results['k_dis_set_median']:
                nb_dis_k_gi2sm[1] += 1
            elif results['k_dis_dataset'] < results['k_dis_set_median']:
                nb_dis_k_gi2sm[2] += 1
            # # dis_k gi -> GM
            if results['k_dis_dataset'] > results['k_dis_gen_median']:
                nb_dis_k_gi2gm[0] += 1
                # repeats_better_dis_k_gi2gm.append(1)
            elif results['k_dis_dataset'] == results['k_dis_gen_median']:
                nb_dis_k_gi2gm[1] += 1
            elif results['k_dis_dataset'] < results['k_dis_gen_median']:
                nb_dis_k_gi2gm[2] += 1

            # write result summary for each class.
            f_summary = open(dir_save + fn_output_summary, 'a')
            csv.writer(f_summary).writerow([ds_name, kernel_options['name'],
                      ged_options['edit_cost'], ged_options['method'],
                      ged_options['attr_distance'], mpg_options['fit_method'],
                      num_graphs, dataset.targets[0],
                      results['sod_set_median'], results['sod_gen_median'],
                      results['k_dis_set_median'], results['k_dis_gen_median'],
                      results['k_dis_dataset'], sod_sm2gm, dis_k_sm2gm,
                      dis_k_gi2sm, dis_k_gi2gm,
                      results['runtime_precompute_gm'], results['runtime_optimize_ec'],
                      results['runtime_generate_preimage'], results['runtime_total'],
                      results['itrs'], results['converged'],
                      results['num_updates_ecc'], nb_sod_sm2gm,
                      nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm])
            f_summary.close()

        # save median graphs.
        if save_medians:
            fn_pre_sm = dir_save + 'medians/set_median.' + mpg_options['fit_method'] + '.k' + str(num_graphs) + '.y' + str(dataset.targets[0]) + '.repeat' + str(1)
            saveGXL(mpg.set_median, fn_pre_sm + '.gxl', method='default',
                    node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
                    node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
            fn_pre_gm = dir_save + 'medians/gen_median.' + mpg_options['fit_method'] + '.k' + str(num_graphs) + '.y' + str(dataset.targets[0]) + '.repeat' + str(1)
            saveGXL(mpg.gen_median, fn_pre_gm + '.gxl', method='default',
                    node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
                    node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
            fn_best_dataset = dir_save + 'medians/g_best_dataset.' + mpg_options['fit_method'] + '.k' + str(num_graphs) + '.y' + str(dataset.targets[0]) + '.repeat' + str(1)
            saveGXL(mpg.best_from_dataset, fn_best_dataset + '.gxl', method='default',
                    node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
                    node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)

        # plot median graphs.
        if plot_medians and save_medians:
            if ds_name == 'Letter-high' or ds_name == 'Letter-med' or ds_name == 'Letter-low':
                draw_Letter_graph(mpg.set_median, fn_pre_sm)
                draw_Letter_graph(mpg.gen_median, fn_pre_gm)
                draw_Letter_graph(mpg.best_from_dataset, fn_best_dataset)

    # write result summary over all classes.
    if save_results:
        sod_sm_mean = np.mean(sod_sm_list)
        sod_gm_mean = np.mean(sod_gm_list)
        dis_k_sm_mean = np.mean(dis_k_sm_list)
        dis_k_gm_mean = np.mean(dis_k_gm_list)
        dis_k_gi_min_mean = np.mean(dis_k_gi_min_list)
        time_precompute_gm_mean = np.mean(time_precompute_gm_list)
        time_optimize_ec_mean = np.mean(time_optimize_ec_list)
        time_generate_mean = np.mean(time_generate_list)
        time_total_mean = np.mean(time_total_list)
        itrs_mean = np.mean(itrs_list)
        num_converged = np.sum(converged_list)
        num_updates_ecc_mean = np.mean(num_updates_ecc_list)
        sod_sm2gm_mean = get_relations(np.sign(sod_gm_mean - sod_sm_mean))
        dis_k_sm2gm_mean = get_relations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
        dis_k_gi2sm_mean = get_relations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
        dis_k_gi2gm_mean = get_relations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))

        f_summary = open(dir_save + fn_output_summary, 'a')
        csv.writer(f_summary).writerow([ds_name, kernel_options['name'],
                  ged_options['edit_cost'], ged_options['method'],
                  ged_options['attr_distance'], mpg_options['fit_method'],
                  num_graphs, 'all',
                  sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
                  dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
                  dis_k_gi2sm_mean, dis_k_gi2gm_mean,
                  time_precompute_gm_mean, time_optimize_ec_mean,
                  time_generate_mean, time_total_mean, itrs_mean,
                  num_converged, num_updates_ecc_mean])
        f_summary.close()

    print('\ncomplete.')


def __init_output_file(ds_name, gkernel, fit_method, dir_output):
    # fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
    fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv'
    f_detail = open(dir_output + fn_output_detail, 'a')
    csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'edit cost',
              'GED method', 'attr distance', 'fit method', 'k',
              'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
              'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
              'dis_k gi -> GM', 'edit cost constants', 'time precompute gm',
              'time optimize ec', 'time generate preimage', 'time total',
              'itrs', 'converged', 'num updates ecc'])
    f_detail.close()

    # fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
    fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.csv'
    f_summary = open(dir_output + fn_output_summary, 'a')
    csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'edit cost',
              'GED method', 'attr distance', 'fit method', 'k',
              'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
              'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
              'dis_k gi -> GM', 'time precompute gm', 'time optimize ec',
              'time generate preimage', 'time total', 'itrs', 'num converged',
              'num updates ecc', '# SOD SM -> GM', '# dis_k SM -> GM',
              '# dis_k gi -> SM', '# dis_k gi -> GM'])
              # 'repeats better SOD SM -> GM',
              # 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
              # 'repeats better dis_k gi -> GM'])
    f_summary.close()

    return fn_output_detail, fn_output_summary


def get_relations(sign):
    if sign == -1:
        return 'better'
    elif sign == 0:
        return 'same'
    elif sign == 1:
        return 'worse'


# Draw the current median graph (Letter datasets, whose nodes carry 'x'/'y' coordinates).
def draw_Letter_graph(graph, file_prefix):
    plt.figure()
    pos = {}
    for n in graph.nodes:
        pos[n] = np.array([float(graph.nodes[n]['x']), float(graph.nodes[n]['y'])])
    nx.draw_networkx(graph, pos)
    plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
    # plt.show()
    plt.clf()


def remove_edges(Gn):
    for G in Gn:
        for _, _, attrs in G.edges(data=True):
            attrs.clear()


def dis_gstar(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
    """Distance in kernel space between graph idx_g and the weighted mean (with
    weights alpha) of the graphs indexed by idx_gi:
    sqrt(k(g, g) - 2 * sum_i alpha_i * k(g, g_i) + sum_{i,j} alpha_i * alpha_j * k(g_i, g_j)).
    The last term can be passed in as term3 when it has been precomputed.
    """
    term1 = Kmatrix[idx_g, idx_g]
    term2 = 0
    for i, a in enumerate(alpha):
        term2 += a * Kmatrix[idx_g, idx_gi[i]]
    term2 *= 2
    if withterm3 == False:
        for i1, a1 in enumerate(alpha):
            for i2, a2 in enumerate(alpha):
                term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
    return np.sqrt(term1 - term2 + term3)


def compute_k_dis(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
    # Same computation as dis_gstar above.
    term1 = Kmatrix[idx_g, idx_g]
    term2 = 0
    for i, a in enumerate(alpha):
        term2 += a * Kmatrix[idx_g, idx_gi[i]]
    term2 *= 2
    if withterm3 == False:
        for i1, a1 in enumerate(alpha):
            for i2, a2 in enumerate(alpha):
                term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
    return np.sqrt(term1 - term2 + term3)


def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose, parallel='imap_unordered'):
    if graph_kernel == 'marginalizedkernel':
        Kmatrix, _ = marginalizedkernel(Gn, node_label=node_label, edge_label=edge_label,
                                        p_quit=0.03, n_iteration=10, remove_totters=False,
                                        n_jobs=multiprocessing.cpu_count(), verbose=verbose)
    elif graph_kernel == 'untilhpathkernel':
        Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label,
                                      depth=7, k_func='MinMax', compute_method='trie',
                                      parallel=parallel,
                                      n_jobs=multiprocessing.cpu_count(), verbose=verbose)
    elif graph_kernel == 'spkernel':
        mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
        Kmatrix = np.empty((len(Gn), len(Gn)))
        # Kmatrix[:] = np.nan
        Kmatrix, _, idx = spkernel(Gn, node_label=node_label, node_kernels=
                                   {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel},
                                   n_jobs=multiprocessing.cpu_count(), verbose=verbose)
        # for i, row in enumerate(idx):
        #     for j, col in enumerate(idx):
        #         Kmatrix[row, col] = Kmatrix_tmp[i, j]
    elif graph_kernel == 'structuralspkernel':
        mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
        sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
        Kmatrix, _ = structuralspkernel(Gn, node_label=node_label,
                                        edge_label=edge_label, node_kernels=sub_kernels,
                                        edge_kernels=sub_kernels,
                                        parallel=parallel, n_jobs=multiprocessing.cpu_count(),
                                        verbose=verbose)
    elif graph_kernel == 'treeletkernel':
        pkernel = functools.partial(polynomialkernel, d=2, c=1e5)
        # pkernel = functools.partial(gaussiankernel, gamma=1e-6)
        mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
        Kmatrix, _ = treeletkernel(Gn, node_label=node_label, edge_label=edge_label,
                                   sub_kernel=pkernel, parallel=parallel,
                                   n_jobs=multiprocessing.cpu_count(), verbose=verbose)
    elif graph_kernel == 'weisfeilerlehmankernel':
        Kmatrix, _ = weisfeilerlehmankernel(Gn, node_label=node_label, edge_label=edge_label,
                                            height=4, base_kernel='subtree', parallel=None,
                                            n_jobs=multiprocessing.cpu_count(), verbose=verbose)
    else:
        raise ValueError('Unknown graph kernel: %s.' % graph_kernel)

    # normalization
    Kmatrix_diag = Kmatrix.diagonal().copy()
    for i in range(len(Kmatrix)):
        for j in range(i, len(Kmatrix)):
            Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
            Kmatrix[j][i] = Kmatrix[i][j]
    return Kmatrix


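# Example (not part of the original module; kernel name and labels are illustrative):
# computing a normalized Gram matrix for a small list of networkx graphs with
# compute_kernel(), following the same call pattern used by kernel_distance_matrix().
#
#     import networkx as nx
#     g1 = nx.path_graph(3)
#     g2 = nx.cycle_graph(4)
#     for g in (g1, g2):
#         nx.set_node_attributes(g, 'C', name='atom')  # symbolic node label
#     K = compute_kernel([g1, g2], 'untilhpathkernel',
#                        node_label='atom', edge_label=None, verbose=False)
#     # K is symmetric with a unit diagonal after the normalization step above.

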
def gram2distances(Kmatrix):
    dmatrix = np.zeros((len(Kmatrix), len(Kmatrix)))
    for i1 in range(len(Kmatrix)):
        for i2 in range(len(Kmatrix)):
            dmatrix[i1, i2] = Kmatrix[i1, i1] + Kmatrix[i2, i2] - 2 * Kmatrix[i1, i2]
    dmatrix = np.sqrt(dmatrix)
    return dmatrix


def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None,
                           gkernel=None, verbose=True):
    dis_mat = np.empty((len(Gn), len(Gn)))
    if Kmatrix is None:
        Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, verbose)
    for i in range(len(Gn)):
        for j in range(i, len(Gn)):
            dis = Kmatrix[i, i] + Kmatrix[j, j] - 2 * Kmatrix[i, j]
            if dis < 0:
                if dis > -1e-10:
                    dis = 0
                else:
                    raise ValueError('The distance is negative.')
            dis_mat[i, j] = np.sqrt(dis)
            dis_mat[j, i] = dis_mat[i, j]
    dis_max = np.max(np.max(dis_mat))
    dis_min = np.min(np.min(dis_mat[dis_mat != 0]))
    dis_mean = np.mean(np.mean(dis_mat))
    return dis_mat, dis_max, dis_min, dis_mean


def get_same_item_indices(ls):
    """Get the indices of the same items in a list. Return a dict keyed by items.
    """
    idx_dict = {}
    for idx, item in enumerate(ls):
        if item in idx_dict:
            idx_dict[item].append(idx)
        else:
            idx_dict[item] = [idx]
    return idx_dict


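# Example (illustrative, not in the original module):
#     get_same_item_indices(['A', 'B', 'A', 'C'])  ->  {'A': [0, 2], 'B': [1], 'C': [3]}
# This is the usual way to split a target list into per-class index sets.

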
def k_nearest_neighbors_to_median_in_kernel_space(Gn, Kmatrix=None, gkernel=None,
                                                  node_label=None, edge_label=None):
    # Note: the original body referenced several undefined names (idx_gi, Gn_init,
    # dis_all, tqdm, sys); the fixes below are a best guess at the intent: compute
    # the kernel-space distance between every graph in Gn and the equally weighted
    # median of Gn, and return those distances.
    import sys
    from tqdm import tqdm
    dis_k_all = []  # distance between g_star and each graph.
    alpha = [1 / len(Gn)] * len(Gn)
    if Kmatrix is None:
        Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True)
    idx_gi = list(range(len(Gn)))  # all graphs contribute to the median.
    term3 = 0
    for i1, a1 in enumerate(alpha):
        for i2, a2 in enumerate(alpha):
            term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
    for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
        dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
        dis_k_all.append(dtemp)
    return dis_k_all


def normalize_distance_matrix(D):
    max_value = np.amax(D)
    min_value = np.amin(D)
    return (D - min_value) / (max_value - min_value)

A Python package for graph kernels, graph edit distances and the graph pre-image problem.
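
A minimal calling sketch for `generate_median_preimage_by_class` from this file, assuming it lives at `gklearn/preimage/utils.py`. Only the option keys that this file itself reads are shown; real runs pass additional options through to `MedianPreimageGenerator`, and the concrete values below are illustrative rather than prescribed by the source.

# Hypothetical driver script (illustrative values only).
from gklearn.preimage.utils import generate_median_preimage_by_class

mpg_options = {'fit_method': 'k-graphs'}           # how edit costs are fitted
kernel_options = {'name': 'structuralspkernel'}    # graph kernel used for distances
ged_options = {'edit_cost': 'LETTER2',             # GED edit cost model
               'method': 'IPFP',                   # GED estimation method
               'attr_distance': 'euclidean'}       # distance between node attributes
mge_options = {}                                   # options for the median graph estimator

generate_median_preimage_by_class('Letter-high', mpg_options, kernel_options,
                                  ged_options, mge_options,
                                  save_results=True, save_medians=True,
                                  plot_medians=True, dir_save='outputs/')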