|
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- Created on Thu Jan 16 11:03:11 2020
-
- @author: ljia
- """
-
- import numpy as np
- import random
- import csv
- from shutil import copyfile
- import networkx as nx
- import matplotlib.pyplot as plt
-
- from gklearn.utils.graphfiles import loadDataset, loadGXL, saveGXL
- from gklearn.preimage.test_k_closest_graphs import median_on_k_closest_graphs, reform_attributes
- from gklearn.preimage.utils import get_same_item_indices
- from gklearn.preimage.find_best_k import getRelations
-
- def xp_monoterpenoides():
- import os
-
- ds = {'dataset': '../../datasets/monoterpenoides/dataset_10+.ds',
- 'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '../../datasets/monoterpenoides/'} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'])
- # ds = {'name': 'Letter-high',
- # 'dataset': '../datasets/Letter-high/Letter-high_A.txt'} # node/edge symb
- # Gn, y_all = loadDataset(ds['dataset'])
- # Gn = Gn[0:50]
- gkernel = 'treeletkernel'
- node_label = 'atom'
- edge_label = 'bond_type'
- ds_name = 'monoterpenoides'
- dir_output = 'results/xp_monoterpenoides/'
-
- repeats = 1
- # k_list = range(2, 11)
- k_list = [0]
- fit_method = 'k-graphs'
- # get indices by classes.
- y_idx = get_same_item_indices(y_all)
-
- # create result files.
- fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
- f_detail = open(dir_output + fn_output_detail, 'a')
- csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k',
- 'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
- 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
- 'dis_k gi -> GM', 'median set'])
- f_detail.close()
- fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
- f_summary = open(dir_output + fn_output_summary, 'a')
- csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k',
- 'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
- 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
- 'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM',
- '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM',
- 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
- 'repeats better dis_k gi -> GM'])
- f_summary.close()
-
- random.seed(1)
- rdn_seed_list = random.sample(range(0, repeats * 100), repeats)
-
- for k in k_list:
- print('\n--------- k =', k, '----------')
-
- sod_sm_mean_list = []
- sod_gm_mean_list = []
- dis_k_sm_mean_list = []
- dis_k_gm_mean_list = []
- dis_k_gi_min_mean_list = []
- # nb_sod_sm2gm = [0, 0, 0]
- # nb_dis_k_sm2gm = [0, 0, 0]
- # nb_dis_k_gi2sm = [0, 0, 0]
- # nb_dis_k_gi2gm = [0, 0, 0]
- # repeats_better_sod_sm2gm = []
- # repeats_better_dis_k_sm2gm = []
- # repeats_better_dis_k_gi2sm = []
- # repeats_better_dis_k_gi2gm = []
-
- for i, (y, values) in enumerate(y_idx.items()):
- print('\ny =', y)
- # y = 'I'
- # values = y_idx[y]
-
- k = len(values)
- # k = kkk
-
- sod_sm_list = []
- sod_gm_list = []
- dis_k_sm_list = []
- dis_k_gm_list = []
- dis_k_gi_min_list = []
- nb_sod_sm2gm = [0, 0, 0]
- nb_dis_k_sm2gm = [0, 0, 0]
- nb_dis_k_gi2sm = [0, 0, 0]
- nb_dis_k_gi2gm = [0, 0, 0]
- repeats_better_sod_sm2gm = []
- repeats_better_dis_k_sm2gm = []
- repeats_better_dis_k_gi2sm = []
- repeats_better_dis_k_gi2gm = []
-
- for repeat in range(repeats):
- print('\nrepeat =', repeat)
- random.seed(rdn_seed_list[repeat])
- median_set_idx_idx = random.sample(range(0, len(values)), k)
- median_set_idx = [values[idx] for idx in median_set_idx_idx]
- print('median set: ', median_set_idx)
- Gn_median = [Gn[g] for g in values]
-
- sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min \
- = median_on_k_closest_graphs(Gn_median, node_label, edge_label,
- gkernel, k, fit_method=fit_method, graph_dir=ds['graph_dir'],
- edit_costs=None, group_min=median_set_idx_idx,
- dataset=ds_name, parallel=False)
-
- # write result detail.
- sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
- dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
- dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
- dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
- f_detail = open(dir_output + fn_output_detail, 'a')
- csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k,
- y, repeat,
- sod_sm, sod_gm, dis_k_sm, dis_k_gm,
- dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
- dis_k_gi2gm, median_set_idx])
- f_detail.close()
-
- # compute result summary.
- sod_sm_list.append(sod_sm)
- sod_gm_list.append(sod_gm)
- dis_k_sm_list.append(dis_k_sm)
- dis_k_gm_list.append(dis_k_gm)
- dis_k_gi_min_list.append(dis_k_gi_min)
- # # SOD SM -> GM
- if sod_sm > sod_gm:
- nb_sod_sm2gm[0] += 1
- repeats_better_sod_sm2gm.append(repeat)
- elif sod_sm == sod_gm:
- nb_sod_sm2gm[1] += 1
- elif sod_sm < sod_gm:
- nb_sod_sm2gm[2] += 1
- # # dis_k SM -> GM
- if dis_k_sm > dis_k_gm:
- nb_dis_k_sm2gm[0] += 1
- repeats_better_dis_k_sm2gm.append(repeat)
- elif dis_k_sm == dis_k_gm:
- nb_dis_k_sm2gm[1] += 1
- elif dis_k_sm < dis_k_gm:
- nb_dis_k_sm2gm[2] += 1
- # # dis_k gi -> SM
- if dis_k_gi_min > dis_k_sm:
- nb_dis_k_gi2sm[0] += 1
- repeats_better_dis_k_gi2sm.append(repeat)
- elif dis_k_gi_min == dis_k_sm:
- nb_dis_k_gi2sm[1] += 1
- elif dis_k_gi_min < dis_k_sm:
- nb_dis_k_gi2sm[2] += 1
- # # dis_k gi -> GM
- if dis_k_gi_min > dis_k_gm:
- nb_dis_k_gi2gm[0] += 1
- repeats_better_dis_k_gi2gm.append(repeat)
- elif dis_k_gi_min == dis_k_gm:
- nb_dis_k_gi2gm[1] += 1
- elif dis_k_gi_min < dis_k_gm:
- nb_dis_k_gi2gm[2] += 1
-
- # save median graphs.
- fname_sm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/set_median.gxl'
- fn_pre_sm_new = dir_output + 'medians/set_median.' + fit_method \
- + '.k' + str(int(k)) + '.y' + str(int(y)) + '.repeat' + str(repeat)
- copyfile(fname_sm, fn_pre_sm_new + '.gxl')
- fname_gm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/gen_median.gxl'
- fn_pre_gm_new = dir_output + 'medians/gen_median.' + fit_method \
- + '.k' + str(int(k)) + '.y' + str(int(y)) + '.repeat' + str(repeat)
- copyfile(fname_gm, fn_pre_gm_new + '.gxl')
- G_best_kernel = Gn_median[idx_dis_k_gi_min].copy()
- # reform_attributes(G_best_kernel)
- fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.' + fit_method \
- + '.k' + str(int(k)) + '.y' + str(int(y)) + '.repeat' + str(repeat)
- saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='gedlib')
-
- # # plot median graphs.
- # set_median = loadGXL(fn_pre_sm_new + '.gxl')
- # gen_median = loadGXL(fn_pre_gm_new + '.gxl')
- # draw_Letter_graph(set_median, fn_pre_sm_new)
- # draw_Letter_graph(gen_median, fn_pre_gm_new)
- # draw_Letter_graph(G_best_kernel, fn_pre_g_best_kernel)
-
- # write result summary for each letter.
- sod_sm_mean_list.append(np.mean(sod_sm_list))
- sod_gm_mean_list.append(np.mean(sod_gm_list))
- dis_k_sm_mean_list.append(np.mean(dis_k_sm_list))
- dis_k_gm_mean_list.append(np.mean(dis_k_gm_list))
- dis_k_gi_min_mean_list.append(np.mean(dis_k_gi_min_list))
- sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean_list[-1] - sod_sm_mean_list[-1]))
- dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1]))
- dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
- dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
- f_summary = open(dir_output + fn_output_summary, 'a')
- csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, y,
- sod_sm_mean_list[-1], sod_gm_mean_list[-1],
- dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1],
- dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean,
- dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm,
- nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
- repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
- repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])
- f_summary.close()
-
-
- # write result summary for each letter.
- sod_sm_mean = np.mean(sod_sm_mean_list)
- sod_gm_mean = np.mean(sod_gm_mean_list)
- dis_k_sm_mean = np.mean(dis_k_sm_mean_list)
- dis_k_gm_mean = np.mean(dis_k_gm_mean_list)
- dis_k_gi_min_mean = np.mean(dis_k_gi_min_list)
- sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
- dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
- dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
- dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
- f_summary = open(dir_output + fn_output_summary, 'a')
- csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, 'all',
- sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
- dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
- dis_k_gi2sm_mean, dis_k_gi2gm_mean])
- f_summary.close()
-
-
- print('\ncomplete.')
-
-
- #Dessin median courrant
- def draw_Letter_graph(graph, file_prefix):
- plt.figure()
- pos = {}
- for n in graph.nodes:
- pos[n] = np.array([float(graph.node[n]['x']),float(graph.node[n]['y'])])
- nx.draw_networkx(graph, pos)
- plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
- # plt.show()
- plt.clf()
-
-
- if __name__ == "__main__":
- xp_monoterpenoides()
|