From fd25350bd1dd40d6493ffeb1f207a097d1bb8752 Mon Sep 17 00:00:00 2001
From: jajupmochi
Date: Wed, 15 Jan 2020 17:01:41 +0100
Subject: [PATCH] update preimage.

---
 preimage/find_best_k.py           | 172 +++++++++++++++++++
 preimage/test_k_closest_graphs.py | 336 ++++++++++++++++++++++++++++++++++++++
 preimage/xp_letter_h.py           | 246 ++++++++++++++++++++++++++++
 3 files changed, 754 insertions(+)
 create mode 100644 preimage/find_best_k.py
 create mode 100644 preimage/test_k_closest_graphs.py
 create mode 100644 preimage/xp_letter_h.py

diff --git a/preimage/find_best_k.py b/preimage/find_best_k.py
new file mode 100644
index 0000000..ed1ef44
--- /dev/null
+++ b/preimage/find_best_k.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Jan 9 11:54:32 2020
+
+@author: ljia
+"""
+import numpy as np
+import random
+import csv
+
+import sys
+sys.path.insert(0, "../")
+from pygraph.utils.graphfiles import loadDataset
+from preimage.test_k_closest_graphs import median_on_k_closest_graphs
+
+def find_best_k():
+    ds = {'name': 'monoterpenoides',
+          'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}  # node/edge symb
+    Gn, y_all = loadDataset(ds['dataset'])
+#    Gn = Gn[0:50]
+    gkernel = 'treeletkernel'
+    node_label = 'atom'
+    edge_label = 'bond_type'
+    ds_name = 'mono'
+    dir_output = 'results/test_find_best_k/'
+
+    repeats = 50
+    k_list = range(2, 11)
+    fit_method = 'k-graphs'
+    # fitted on the whole dataset - treelet - mono
+    edit_costs = [0.1268873773592978, 0.004084633224249829, 0.0897581955378986, 0.15328856114451297, 0.3109956881625734, 0.0]
+
+    # create result files.
+    fn_output_detail = 'results_detail.' + fit_method + '.csv'
+    f_detail = open(dir_output + fn_output_detail, 'a')
+    csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k',
+        'repeat', 'median set', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
+        'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
+        'dis_k gi -> GM'])
+    f_detail.close()
+    fn_output_summary = 'results_summary.csv'
+    f_summary = open(dir_output + fn_output_summary, 'a')
+    csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k',
+        'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
+        'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
+        'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM',
+        '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM',
+        'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
+        'repeats better dis_k gi -> GM'])
+    f_summary.close()
+
+    random.seed(1)
+    rdn_seed_list = random.sample(range(0, repeats * 100), repeats)
+
+    for k in k_list:
+        print('\n--------- k =', k, '----------')
+
+        sod_sm_list = []
+        sod_gm_list = []
+        dis_k_sm_list = []
+        dis_k_gm_list = []
+        dis_k_gi_min_list = []
+        nb_sod_sm2gm = [0, 0, 0]
+        nb_dis_k_sm2gm = [0, 0, 0]
+        nb_dis_k_gi2sm = [0, 0, 0]
+        nb_dis_k_gi2gm = [0, 0, 0]
+        repeats_better_sod_sm2gm = []
+        repeats_better_dis_k_sm2gm = []
+        repeats_better_dis_k_gi2sm = []
+        repeats_better_dis_k_gi2gm = []
+
+
+        for repeat in range(repeats):
+            print('\nrepeat =', repeat)
+            random.seed(rdn_seed_list[repeat])
+            median_set_idx = random.sample(range(0, len(Gn)), k)
+            print('median set: ', median_set_idx)
+
+            sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, _ \
+                = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k,
+                                             fit_method='k-graphs',
+                                             edit_costs=edit_costs,
+                                             group_min=median_set_idx,
+                                             parallel=False)
+
+            # write result detail.
+            sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
+            dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
+            dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
+            dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
+            f_detail = open(dir_output + fn_output_detail, 'a')
+            csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k, repeat,
+                median_set_idx, sod_sm, sod_gm, dis_k_sm, dis_k_gm,
+                dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
+                dis_k_gi2gm])
+            f_detail.close()
+
+            # compute result summary.
+            sod_sm_list.append(sod_sm)
+            sod_gm_list.append(sod_gm)
+            dis_k_sm_list.append(dis_k_sm)
+            dis_k_gm_list.append(dis_k_gm)
+            dis_k_gi_min_list.append(dis_k_gi_min)
+            # # SOD SM -> GM
+            if sod_sm > sod_gm:
+                nb_sod_sm2gm[0] += 1
+                repeats_better_sod_sm2gm.append(repeat)
+            elif sod_sm == sod_gm:
+                nb_sod_sm2gm[1] += 1
+            elif sod_sm < sod_gm:
+                nb_sod_sm2gm[2] += 1
+            # # dis_k SM -> GM
+            if dis_k_sm > dis_k_gm:
+                nb_dis_k_sm2gm[0] += 1
+                repeats_better_dis_k_sm2gm.append(repeat)
+            elif dis_k_sm == dis_k_gm:
+                nb_dis_k_sm2gm[1] += 1
+            elif dis_k_sm < dis_k_gm:
+                nb_dis_k_sm2gm[2] += 1
+            # # dis_k gi -> SM
+            if dis_k_gi_min > dis_k_sm:
+                nb_dis_k_gi2sm[0] += 1
+                repeats_better_dis_k_gi2sm.append(repeat)
+            elif dis_k_gi_min == dis_k_sm:
+                nb_dis_k_gi2sm[1] += 1
+            elif dis_k_gi_min < dis_k_sm:
+                nb_dis_k_gi2sm[2] += 1
+            # # dis_k gi -> GM
+            if dis_k_gi_min > dis_k_gm:
+                nb_dis_k_gi2gm[0] += 1
+                repeats_better_dis_k_gi2gm.append(repeat)
+            elif dis_k_gi_min == dis_k_gm:
+                nb_dis_k_gi2gm[1] += 1
+            elif dis_k_gi_min < dis_k_gm:
+                nb_dis_k_gi2gm[2] += 1
+
+        # write result summary.
+        sod_sm_mean = np.mean(sod_sm_list)
+        sod_gm_mean = np.mean(sod_gm_list)
+        dis_k_sm_mean = np.mean(dis_k_sm_list)
+        dis_k_gm_mean = np.mean(dis_k_gm_list)
+        dis_k_gi_min_mean = np.mean(dis_k_gi_min_list)
+        sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
+        dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
+        dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
+        dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
+        f_summary = open(dir_output + fn_output_summary, 'a')
+        csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k,
+            sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
+            dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
+            dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm,
+            nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
+            repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
+            repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])
+        f_summary.close()
+
+    print('\ncomplete.')
+    return
+
+
+def getRelations(sign):
+    if sign == -1:
+        return 'better'
+    elif sign == 0:
+        return 'same'
+    elif sign == 1:
+        return 'worse'
+
+
+if __name__ == '__main__':
+    find_best_k()
\ No newline at end of file
diff --git a/preimage/test_k_closest_graphs.py b/preimage/test_k_closest_graphs.py
new file mode 100644
index 0000000..8d7d27a
--- /dev/null
+++ b/preimage/test_k_closest_graphs.py
@@ -0,0 +1,336 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Dec 16 11:53:54 2019
+
+@author: ljia
+"""
+import numpy as np
+import math
+import networkx as nx
+import matplotlib.pyplot as plt
+import time
+import random
+from tqdm import tqdm
+from itertools import combinations, islice
+import multiprocessing
+from multiprocessing import Pool
+from functools import partial
+
+#import os
+import sys
+sys.path.insert(0, "../")
+from pygraph.utils.graphfiles import loadDataset, loadGXL
+#from pygraph.utils.logger2file import *
+from iam import iam_upgraded, iam_bash
+from utils import compute_kernel, dis_gstar, kernel_distance_matrix
+from fitDistance import fit_GED_to_kernel_distance
+#from ged import ged_median
+
+
+def median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, fit_method,
+                               graph_dir='/media/ljia/DATA/research-repo/codes/Linlin/py-graph/datasets/monoterpenoides/',
+                               edit_costs=None, group_min=None, dataset='monoterpenoides',
+                               parallel=True):
+
+#    # compute distances in kernel space.
+#    dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label,
+#                                              Kmatrix=None, gkernel=gkernel)
+#    # ged.
+#    gmfile = np.load('results/test_k_closest_graphs/ged_mat.fit_on_whole_dataset.with_medians.gm.npz')
+#    ged_mat = gmfile['ged_mat']
+#    dis_mat = ged_mat[0:len(Gn), 0:len(Gn)]
+
+#    # choose k closest graphs
+#    time0 = time.time()
+#    sod_ks_min, group_min = get_closest_k_graphs(dis_mat, k, parallel)
+#    time_spent = time.time() - time0
+#    print('closest graphs:', sod_ks_min, group_min)
+#    print('time spent:', time_spent)
+#    group_min = (12, 13, 22, 29) # closest w.r.t path kernel
+#    group_min = (77, 85, 160, 171) # closest w.r.t ged
+#    group_min = (0,1,2,3,4,5,6,7,8,9,10,11) # closest w.r.t treelet kernel
+
+    Gn_median = [Gn[g].copy() for g in group_min]
+
+
+    # fit edit costs.
+    if fit_method == 'random':  # random
+        edit_cost_constant = random.sample(range(1, 10), 6)
+        print('edit costs used:', edit_cost_constant)
+    elif fit_method == 'expert':  # expert
+        edit_cost_constant = [3, 3, 1, 3, 3, 1]
+    elif fit_method == 'k-graphs':
+        itr_max = 6
+        algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
+        params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
+                      'algo_options': algo_options, 'stabilizer': None}
+        # fit on the k-graph subset.
+        edit_cost_constant, _, _, _, _, _, _ = fit_GED_to_kernel_distance(Gn_median,
+            node_label, edge_label, gkernel, itr_max, params_ged=params_ged, parallel=True)
+    elif fit_method == 'whole-dataset':
+        itr_max = 6
+        algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
+        params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
+                      'algo_options': algo_options, 'stabilizer': None}
+        # fit on the whole dataset.
+        edit_cost_constant, _, _, _, _, _, _ = fit_GED_to_kernel_distance(Gn,
+            node_label, edge_label, gkernel, itr_max, params_ged=params_ged, parallel=True)
+    elif fit_method == 'precomputed':
+        edit_cost_constant = edit_costs
+
+
+    # compute set median and gen median using IAM (C++ through bash).
+    group_fnames = [Gn[g].graph['filename'] for g in group_min]
+    sod_sm, sod_gm, fname_sm, fname_gm = iam_bash(group_fnames, edit_cost_constant,
+                                                  graph_dir=graph_dir, dataset=dataset)
+
+
+    # compute distances in kernel space.
+    Gn_median = [Gn[g].copy() for g in group_min]
+    set_median = loadGXL(fname_sm)
+    gen_median = loadGXL(fname_gm)
+    if dataset == 'Letter':
+        for g in Gn_median:
+            reform_attributes(g)
+        reform_attributes(set_median)
+        reform_attributes(gen_median)
+
+    # compute distance in kernel space for set median.
+    Kmatrix_sm = compute_kernel([set_median] + Gn_median, gkernel,
+                                None if dataset == 'Letter' else 'chem',
+                                None if dataset == 'Letter' else 'valence',
+                                False)
+    dis_k_sm = dis_gstar(0, range(1, 1+len(Gn_median)),
+                         [1 / len(Gn_median)] * len(Gn_median), Kmatrix_sm, withterm3=False)
+
+    # compute distance in kernel space for generalized median.
+    Kmatrix_gm = compute_kernel([gen_median] + Gn_median, gkernel,
+                                None if dataset == 'Letter' else 'chem',
+                                None if dataset == 'Letter' else 'valence',
+                                False)
+    dis_k_gm = dis_gstar(0, range(1, 1+len(Gn_median)),
+                         [1 / len(Gn_median)] * len(Gn_median), Kmatrix_gm, withterm3=False)
+
+    # compute distance in kernel space for each graph in median set.
+    dis_k_gi = []
+    for idx in range(len(Gn_median)):
+        dis_k_gi.append(dis_gstar(idx+1, range(1, 1+len(Gn_median)),
+                        [1 / len(Gn_median)] * len(Gn_median), Kmatrix_gm, withterm3=False))
+
+    print('sod_sm:', sod_sm)
+    print('sod_gm:', sod_gm)
+    print('dis_k_sm:', dis_k_sm)
+    print('dis_k_gm:', dis_k_gm)
+    print('dis_k_gi:', dis_k_gi)
+    idx_dis_k_gi_min = np.argmin(dis_k_gi)
+    dis_k_gi_min = dis_k_gi[idx_dis_k_gi_min]
+    print('index min dis_k_gi:', group_min[idx_dis_k_gi_min])
+    print('min dis_k_gi:', dis_k_gi_min)
+
+    return sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, group_min[idx_dis_k_gi_min]
+
+
+def reform_attributes(G):
+    for node in G.nodes:
+        G.nodes[node]['attributes'] = [G.nodes[node]['x'], G.nodes[node]['y']]
+
+
+def get_closest_k_graphs(dis_mat, k, parallel):
+    k_graph_groups = combinations(range(0, len(dis_mat)), k)
+    sod_ks_min = np.inf
+    if parallel:
+        len_combination = get_combination_length(len(dis_mat), k)
+        len_itr_max = int(len_combination if len_combination < 1e7 else 1e7)
+#        pos_cur = 0
+        graph_groups_slices = split_iterable(k_graph_groups, len_itr_max, len_combination)
+        for graph_groups_cur in graph_groups_slices:
+#        while True:
+#            graph_groups_cur = islice(k_graph_groups, pos_cur, pos_cur + len_itr_max)
+            graph_groups_cur_list = list(graph_groups_cur)
+            print('current position:', graph_groups_cur_list[0])
+            len_itr_cur = len(graph_groups_cur_list)
+#            if len_itr_cur < len_itr_max:
+#                break
+
+            itr = zip(graph_groups_cur_list, range(0, len_itr_cur))
+            sod_k_list = np.empty(len_itr_cur)
+            graphs_list = [None] * len_itr_cur
+            n_jobs = multiprocessing.cpu_count()
+            chunksize = int(len_itr_max / n_jobs + 1)
+            n_jobs = multiprocessing.cpu_count()
+            def init_worker(dis_mat_toshare):
+                global G_dis_mat
+                G_dis_mat = dis_mat_toshare
+            pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(dis_mat,))
+#            iterator = tqdm(pool.imap_unordered(_get_closest_k_graphs_parallel,
+#                                                itr, chunksize),
+#                            desc='Choosing k closest graphs', file=sys.stdout)
+            iterator = pool.imap_unordered(_get_closest_k_graphs_parallel, itr, chunksize)
+            for graphs, i, sod_ks in iterator:
+                sod_k_list[i] = sod_ks
+                graphs_list[i] = graphs
+            pool.close()
+            pool.join()
+
+            arg_min = np.argmin(sod_k_list)
+            sod_ks_cur = sod_k_list[arg_min]
+            group_cur = graphs_list[arg_min]
+            if sod_ks_cur < sod_ks_min:
+                sod_ks_min = sod_ks_cur
+                group_min = group_cur
+                print('get closer graphs:', sod_ks_min, group_min)
+    else:
+        for items in tqdm(k_graph_groups, desc='Choosing k closest graphs', file=sys.stdout):
+            # if items[0] != itmp:
+            #     itmp = items[0]
+            #     print(items)
+            k_graph_pairs = combinations(items, 2)
+            sod_ks = 0
+            for i1, i2 in k_graph_pairs:
+                sod_ks += dis_mat[i1, i2]
+            if sod_ks < sod_ks_min:
+                sod_ks_min = sod_ks
+                group_min = items
+                print('get closer graphs:', sod_ks_min, group_min)
+
+    return sod_ks_min, group_min
+
+
+def _get_closest_k_graphs_parallel(itr):
+    k_graph_pairs = combinations(itr[0], 2)
+    sod_ks = 0
+    for i1, i2 in k_graph_pairs:
+        sod_ks += G_dis_mat[i1, i2]
+
+    return itr[0], itr[1], sod_ks
+
+
+def split_iterable(iterable, n, len_iter):
+    it = iter(iterable)
+    for i in range(0, len_iter, n):
+        piece = islice(it, n)
+        yield piece
+
+
+def get_combination_length(n, k):
+    len_combination = 1
+    for i in range(n, n - k, -1):
+        len_combination *= i
+    return int(len_combination / math.factorial(k))
+
+
+###############################################################################
+
+def test_k_closest_graphs():
+    ds = {'name': 'monoterpenoides',
+          'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}  # node/edge symb
+    Gn, y_all = loadDataset(ds['dataset'])
+#    Gn = Gn[0:50]
+#    gkernel = 'untilhpathkernel'
+#    gkernel = 'weisfeilerlehmankernel'
+    gkernel = 'treeletkernel'
+    node_label = 'atom'
+    edge_label = 'bond_type'
+
+    k = 5
+    edit_costs = [0.16229209837639536, 0.06612870523413916, 0.04030113378793905, 0.20723547009415202, 0.3338607220394598, 0.27054392518077297]
+
+#    sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \
+#        = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k,
+#                                     'precomputed', edit_costs=edit_costs,
+##                                     'k-graphs',
+#                                     parallel=False)
+#
+#    sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \
+#        = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k,
+#                                     'expert', parallel=False)
+
+    sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, _ \
+        = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k,
+                                     'expert', parallel=False)
+    return
+
+
+def test_k_closest_graphs_with_cv():
+    gkernel = 'untilhpathkernel'
+    node_label = 'atom'
+    edge_label = 'bond_type'
+
+    k = 4
+
+    y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
+    repeats = 50
+    collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
+    graph_dir = collection_path + 'gxl/'
+
+    sod_sm_list = []
+    sod_gm_list = []
+    dis_k_sm_list = []
+    dis_k_gm_list = []
+    dis_k_gi_min_list = []
+    for y in y_all:
+        print('\n-------------------------------------------------------')
+        print('class of y:', y)
+
+        sod_sm_list.append([])
+        sod_gm_list.append([])
+        dis_k_sm_list.append([])
+        dis_k_gm_list.append([])
+        dis_k_gi_min_list.append([])
+
+        for repeat in range(repeats):
+            print('\nrepeat ', repeat)
+            collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
+            Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
+            sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, _ \
+                = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel,
+                                             k, 'whole-dataset', graph_dir=graph_dir,
+                                             parallel=False)
+
+            sod_sm_list[-1].append(sod_sm)
+            sod_gm_list[-1].append(sod_gm)
+            dis_k_sm_list[-1].append(dis_k_sm)
+            dis_k_gm_list[-1].append(dis_k_gm)
+            dis_k_gi_min_list[-1].append(dis_k_gi_min)
+
+        print('\nsods of the set median for this class:', sod_sm_list[-1])
+        print('\nsods of the gen median for this class:', sod_gm_list[-1])
+        print('\ndistances in kernel space of set median for this class:',
+              dis_k_sm_list[-1])
+        print('\ndistances in kernel space of gen median for this class:',
+              dis_k_gm_list[-1])
+        print('\ndistances in kernel space of min graph for this class:',
+              dis_k_gi_min_list[-1])
+
+        sod_sm_list[-1] = np.mean(sod_sm_list[-1])
+        sod_gm_list[-1] = np.mean(sod_gm_list[-1])
+        dis_k_sm_list[-1] = np.mean(dis_k_sm_list[-1])
+        dis_k_gm_list[-1] = np.mean(dis_k_gm_list[-1])
+        dis_k_gi_min_list[-1] = np.mean(dis_k_gi_min_list[-1])
+
+    print()
+    print('\nmean sods of the set median for each class:', sod_sm_list)
+    print('\nmean sods of the gen median for each class:', sod_gm_list)
+    print('\nmean distances in kernel space of set median for each class:',
+          dis_k_sm_list)
+    print('\nmean distances in kernel space of gen median for each class:',
+          dis_k_gm_list)
+    print('\nmean distances in kernel space of min graph for each class:',
+          dis_k_gi_min_list)
+
+    print('\nmean sods of the set median of all:', np.mean(sod_sm_list))
+    print('\nmean sods of the gen median of all:', np.mean(sod_gm_list))
+    print('\nmean distances in kernel space of set median of all:',
+          np.mean(dis_k_sm_list))
+    print('\nmean distances in kernel space of gen median of all:',
+          np.mean(dis_k_gm_list))
+    print('\nmean distances in kernel space of min graph of all:',
+          np.mean(dis_k_gi_min_list))
+
+    return
+
+
+if __name__ == '__main__':
+    test_k_closest_graphs()
+#    test_k_closest_graphs_with_cv()
\ No newline at end of file
diff --git a/preimage/xp_letter_h.py b/preimage/xp_letter_h.py
new file mode 100644
index 0000000..71496e4
--- /dev/null
+++ b/preimage/xp_letter_h.py
@@ -0,0 +1,246 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Jan 14 15:39:29 2020
+
+@author: ljia
+"""
+import numpy as np
+import random
+import csv
+from shutil import copyfile
+import networkx as nx
+import matplotlib.pyplot as plt
+
+import sys
+sys.path.insert(0, "../")
+from pygraph.utils.graphfiles import loadDataset, loadGXL, saveGXL
+from preimage.test_k_closest_graphs import median_on_k_closest_graphs, reform_attributes
+from preimage.utils import get_same_item_indices
+from preimage.find_best_k import getRelations
+
+def xp_letter_h():
+    ds = {'name': 'Letter-high',
+          'dataset': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/collections/Letter.xml',
+          'graph_dir': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/datasets/Letter/HIGH/'}  # node/edge symb
+    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir'])
+#    ds = {'name': 'Letter-high',
+#          'dataset': '../datasets/Letter-high/Letter-high_A.txt'}  # node/edge symb
+#    Gn, y_all = loadDataset(ds['dataset'])
+#    Gn = Gn[0:50]
+    gkernel = 'structuralspkernel'
+    node_label = None
+    edge_label = None
+    ds_name = 'letter-h'
+    dir_output = 'results/xp_letter_h/'
+
+    repeats = 1
+#    k_list = range(2, 11)
+    k_list = [150]
+    fit_method = 'precomputed'
+    # get indices by classes.
+    y_idx = get_same_item_indices(y_all)
+
+    # create result files.
+    fn_output_detail = 'results_detail.' + fit_method + '.csv'
+    f_detail = open(dir_output + fn_output_detail, 'a')
+    csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k',
+        'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
+        'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
+        'dis_k gi -> GM', 'median set'])
+    f_detail.close()
+    fn_output_summary = 'results_summary.csv'
+    f_summary = open(dir_output + fn_output_summary, 'a')
+    csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k',
+        'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
+        'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
+        'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM',
+        '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM',
+        'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
+        'repeats better dis_k gi -> GM'])
+    f_summary.close()
+
+    random.seed(1)
+    rdn_seed_list = random.sample(range(0, repeats * 100), repeats)
+
+    for k in k_list:
+        print('\n--------- k =', k, '----------')
+
+        sod_sm_mean_list = []
+        sod_gm_mean_list = []
+        dis_k_sm_mean_list = []
+        dis_k_gm_mean_list = []
+        dis_k_gi_min_mean_list = []
+#        nb_sod_sm2gm = [0, 0, 0]
+#        nb_dis_k_sm2gm = [0, 0, 0]
+#        nb_dis_k_gi2sm = [0, 0, 0]
+#        nb_dis_k_gi2gm = [0, 0, 0]
+#        repeats_better_sod_sm2gm = []
+#        repeats_better_dis_k_sm2gm = []
+#        repeats_better_dis_k_gi2sm = []
+#        repeats_better_dis_k_gi2gm = []
+
+        for i, (y, values) in enumerate(y_idx.items()):
+            print('\ny =', y)
+#            y = 'I'
+#            values = y_idx[y]
+
+#            k = len(values)
+#            k = kkk
+
+            sod_sm_list = []
+            sod_gm_list = []
+            dis_k_sm_list = []
+            dis_k_gm_list = []
+            dis_k_gi_min_list = []
+            nb_sod_sm2gm = [0, 0, 0]
+            nb_dis_k_sm2gm = [0, 0, 0]
+            nb_dis_k_gi2sm = [0, 0, 0]
+            nb_dis_k_gi2gm = [0, 0, 0]
+            repeats_better_sod_sm2gm = []
+            repeats_better_dis_k_sm2gm = []
+            repeats_better_dis_k_gi2sm = []
+            repeats_better_dis_k_gi2gm = []
+
+            for repeat in range(repeats):
+                print('\nrepeat =', repeat)
+                random.seed(rdn_seed_list[repeat])
+                median_set_idx_idx = random.sample(range(0, len(values)), k)
+                median_set_idx = [values[idx] for idx in median_set_idx_idx]
+                print('median set: ', median_set_idx)
+                Gn_median = [Gn[g] for g in values]
+
+                sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min \
+                    = median_on_k_closest_graphs(Gn_median, node_label, edge_label,
+                        gkernel, k, fit_method=fit_method, graph_dir=ds['graph_dir'],
+                        edit_costs=None, group_min=median_set_idx_idx,
+                        dataset='Letter', parallel=False)
+
+                # write result detail.
+                sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
+                dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
+                dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
+                dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
+                f_detail = open(dir_output + fn_output_detail, 'a')
+                csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k,
+                    y, repeat,
+                    sod_sm, sod_gm, dis_k_sm, dis_k_gm,
+                    dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
+                    dis_k_gi2gm, median_set_idx])
+                f_detail.close()
+
+                # compute result summary.
+                sod_sm_list.append(sod_sm)
+                sod_gm_list.append(sod_gm)
+                dis_k_sm_list.append(dis_k_sm)
+                dis_k_gm_list.append(dis_k_gm)
+                dis_k_gi_min_list.append(dis_k_gi_min)
+                # # SOD SM -> GM
+                if sod_sm > sod_gm:
+                    nb_sod_sm2gm[0] += 1
+                    repeats_better_sod_sm2gm.append(repeat)
+                elif sod_sm == sod_gm:
+                    nb_sod_sm2gm[1] += 1
+                elif sod_sm < sod_gm:
+                    nb_sod_sm2gm[2] += 1
+                # # dis_k SM -> GM
+                if dis_k_sm > dis_k_gm:
+                    nb_dis_k_sm2gm[0] += 1
+                    repeats_better_dis_k_sm2gm.append(repeat)
+                elif dis_k_sm == dis_k_gm:
+                    nb_dis_k_sm2gm[1] += 1
+                elif dis_k_sm < dis_k_gm:
+                    nb_dis_k_sm2gm[2] += 1
+                # # dis_k gi -> SM
+                if dis_k_gi_min > dis_k_sm:
+                    nb_dis_k_gi2sm[0] += 1
+                    repeats_better_dis_k_gi2sm.append(repeat)
+                elif dis_k_gi_min == dis_k_sm:
+                    nb_dis_k_gi2sm[1] += 1
+                elif dis_k_gi_min < dis_k_sm:
+                    nb_dis_k_gi2sm[2] += 1
+                # # dis_k gi -> GM
+                if dis_k_gi_min > dis_k_gm:
+                    nb_dis_k_gi2gm[0] += 1
+                    repeats_better_dis_k_gi2gm.append(repeat)
+                elif dis_k_gi_min == dis_k_gm:
+                    nb_dis_k_gi2gm[1] += 1
+                elif dis_k_gi_min < dis_k_gm:
+                    nb_dis_k_gi2gm[2] += 1
+
+                # save median graphs.
+                fname_sm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/set_median.gxl'
+                fn_pre_sm_new = dir_output + 'medians/set_median.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
+                copyfile(fname_sm, fn_pre_sm_new + '.gxl')
+                fname_gm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/gen_median.gxl'
+                fn_pre_gm_new = dir_output + 'medians/gen_median.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
+                copyfile(fname_gm, fn_pre_gm_new + '.gxl')
+                G_best_kernel = Gn_median[idx_dis_k_gi_min].copy()
+                reform_attributes(G_best_kernel)
+                fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
+                saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='gedlib-letter')
+
+                # plot median graphs.
+                set_median = loadGXL(fn_pre_sm_new + '.gxl')
+                gen_median = loadGXL(fn_pre_gm_new + '.gxl')
+                draw_Letter_graph(set_median, fn_pre_sm_new)
+                draw_Letter_graph(gen_median, fn_pre_gm_new)
+                draw_Letter_graph(G_best_kernel, fn_pre_g_best_kernel)
+
+            # write result summary for each letter.
+            sod_sm_mean_list.append(np.mean(sod_sm_list))
+            sod_gm_mean_list.append(np.mean(sod_gm_list))
+            dis_k_sm_mean_list.append(np.mean(dis_k_sm_list))
+            dis_k_gm_mean_list.append(np.mean(dis_k_gm_list))
+            dis_k_gi_min_mean_list.append(np.mean(dis_k_gi_min_list))
+            sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean_list[-1] - sod_sm_mean_list[-1]))
+            dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1]))
+            dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
+            dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
+            f_summary = open(dir_output + fn_output_summary, 'a')
+            csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, y,
+                sod_sm_mean_list[-1], sod_gm_mean_list[-1],
+                dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1],
+                dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean,
+                dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm,
+                nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
+                repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
+                repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])
+            f_summary.close()
+
+
+        # write result summary over all letters for this k.
+        sod_sm_mean = np.mean(sod_sm_mean_list)
+        sod_gm_mean = np.mean(sod_gm_mean_list)
+        dis_k_sm_mean = np.mean(dis_k_sm_mean_list)
+        dis_k_gm_mean = np.mean(dis_k_gm_mean_list)
+        dis_k_gi_min_mean = np.mean(dis_k_gi_min_mean_list)
+        sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
+        dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
+        dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
+        dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
+        f_summary = open(dir_output + fn_output_summary, 'a')
+        csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, 'all',
+            sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
+            dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
+            dis_k_gi2sm_mean, dis_k_gi2gm_mean])
+        f_summary.close()
+
+
+    print('\ncomplete.')
+
+
+# draw the current median graph.
+def draw_Letter_graph(graph, file_prefix):
+    plt.figure()
+    pos = {}
+    for n in graph.nodes:
+        pos[n] = np.array([float(graph.nodes[n]['x']), float(graph.nodes[n]['y'])])
+    nx.draw_networkx(graph, pos)
+    plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
+#    plt.show()
+    plt.clf()
+
+
+if __name__ == "__main__":
+    xp_letter_h()
\ No newline at end of file
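
A minimal driver sketch for the new find_best_k entry point, not part of the patch above. It assumes the script is run from the preimage/ directory so that the relative paths in find_best_k.py resolve; the run_find_best_k.py file name and the os.makedirs call are illustrative assumptions, added only because find_best_k() appends to CSV files under results/test_find_best_k/ and open(..., 'a') does not create missing directories.

# run_find_best_k.py -- hypothetical driver, not added by this commit.
# Assumes it is executed from the preimage/ directory so that the relative
# paths '../datasets/monoterpenoides/...' and 'results/test_find_best_k/'
# used inside find_best_k.py resolve.
import os

from find_best_k import find_best_k

# find_best_k() opens its CSV outputs in append mode and does not create the
# output directory itself, so make sure it exists first.
os.makedirs('results/test_find_best_k/', exist_ok=True)

# Loops over k = 2..10 with 50 randomly sampled median sets per k, appending
# one row per repeat to results_detail.k-graphs.csv and one summary row per k
# to results_summary.csv.
find_best_k()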