@@ -0,0 +1,172 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Thu Jan 9 11:54:32 2020 | |||
@author: ljia | |||
""" | |||
import numpy as np | |||
import random | |||
import csv | |||
import sys | |||
sys.path.insert(0, "../") | |||
from pygraph.utils.graphfiles import loadDataset | |||
from preimage.test_k_closest_graphs import median_on_k_closest_graphs | |||
def find_best_k():
    """Compare set and generalized medians on random k-graph subsets for several k.

    For every ``k`` in ``k_list`` and every repeat, ``k`` graph indices are
    sampled from the monoterpenoides dataset and passed to
    ``median_on_k_closest_graphs`` as the median set. One row per repeat is
    appended to ``results_detail.<fit_method>.csv`` and one row per ``k``
    (means, better/same/worse counts, and the repeats where the first quantity
    was larger) is appended to ``results_summary.csv``, both under
    ``dir_output``. Header rows are appended on every run.
    """
    ds = {'name': 'monoterpenoides',
          'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}  # node/edge symb
    Gn, _ = loadDataset(ds['dataset'])

    gkernel = 'treeletkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    ds_name = 'mono'
    dir_output = 'results/test_find_best_k/'

    repeats = 50
    k_list = range(2, 11)
    fit_method = 'k-graphs'
    # fitted on the whole dataset - treelet - mono
    # NOTE(review): in median_on_k_closest_graphs only the 'precomputed'
    # branch appears to read `edit_costs`; with 'k-graphs' the costs are
    # refitted on the median set -- confirm this is intended.
    edit_costs = [0.1268873773592978, 0.004084633224249829, 0.0897581955378986,
                  0.15328856114451297, 0.3109956881625734, 0.0]

    # create result files.
    # newline='' is what the csv module expects for files handed to a writer.
    fn_output_detail = 'results_detail.' + fit_method + '.csv'
    with open(dir_output + fn_output_detail, 'a', newline='') as f_detail:
        csv.writer(f_detail).writerow([
            'dataset', 'graph kernel', 'fit method', 'k',
            'repeat', 'median set', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
            'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
            'dis_k gi -> GM'])

    fn_output_summary = 'results_summary.csv'
    with open(dir_output + fn_output_summary, 'a', newline='') as f_summary:
        csv.writer(f_summary).writerow([
            'dataset', 'graph kernel', 'fit method', 'k',
            'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
            'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
            'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM',
            '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM',
            'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
            'repeats better dis_k gi -> GM'])

    # One seed per repeat, so the same repeat index draws from the same
    # random state for every k.
    random.seed(1)
    rdn_seed_list = random.sample(range(0, repeats * 100), repeats)

    for k in k_list:
        print('\n--------- k =', k, '----------')

        sod_sm_list = []
        sod_gm_list = []
        dis_k_sm_list = []
        dis_k_gm_list = []
        dis_k_gi_min_list = []
        # Each counter is [first > second, first == second, first < second].
        nb_sod_sm2gm = [0, 0, 0]
        nb_dis_k_sm2gm = [0, 0, 0]
        nb_dis_k_gi2sm = [0, 0, 0]
        nb_dis_k_gi2gm = [0, 0, 0]
        repeats_better_sod_sm2gm = []
        repeats_better_dis_k_sm2gm = []
        repeats_better_dis_k_gi2sm = []
        repeats_better_dis_k_gi2gm = []

        for repeat in range(repeats):
            print('\nrepeat =', repeat)
            random.seed(rdn_seed_list[repeat])
            median_set_idx = random.sample(range(0, len(Gn)), k)
            print('median set: ', median_set_idx)

            # median_on_k_closest_graphs returns seven values; the per-graph
            # distance list and the index of the best graph are not used here.
            sod_sm, sod_gm, dis_k_sm, dis_k_gm, _, dis_k_gi_min, _ \
                = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k,
                                             fit_method=fit_method,
                                             edit_costs=edit_costs,
                                             group_min=median_set_idx,
                                             parallel=False)

            # write result detail.
            sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
            dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
            dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
            dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
            with open(dir_output + fn_output_detail, 'a', newline='') as f_detail:
                csv.writer(f_detail).writerow([
                    ds_name, gkernel, fit_method, k, repeat,
                    median_set_idx, sod_sm, sod_gm, dis_k_sm, dis_k_gm,
                    dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
                    dis_k_gi2gm])

            # compute result summary.
            sod_sm_list.append(sod_sm)
            sod_gm_list.append(sod_gm)
            dis_k_sm_list.append(dis_k_sm)
            dis_k_gm_list.append(dis_k_gm)
            dis_k_gi_min_list.append(dis_k_gi_min)

            # (first, second, counter, repeats where first > second)
            comparisons = (
                (sod_sm, sod_gm, nb_sod_sm2gm, repeats_better_sod_sm2gm),
                (dis_k_sm, dis_k_gm, nb_dis_k_sm2gm, repeats_better_dis_k_sm2gm),
                (dis_k_gi_min, dis_k_sm, nb_dis_k_gi2sm, repeats_better_dis_k_gi2sm),
                (dis_k_gi_min, dis_k_gm, nb_dis_k_gi2gm, repeats_better_dis_k_gi2gm),
            )
            for first, second, counter, better_repeats in comparisons:
                if first > second:
                    counter[0] += 1
                    better_repeats.append(repeat)
                elif first == second:
                    counter[1] += 1
                elif first < second:
                    counter[2] += 1

        # write result summary.
        sod_sm_mean = np.mean(sod_sm_list)
        sod_gm_mean = np.mean(sod_gm_list)
        dis_k_sm_mean = np.mean(dis_k_sm_list)
        dis_k_gm_mean = np.mean(dis_k_gm_list)
        dis_k_gi_min_mean = np.mean(dis_k_gi_min_list)
        sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
        dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
        dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
        dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
        with open(dir_output + fn_output_summary, 'a', newline='') as f_summary:
            csv.writer(f_summary).writerow([
                ds_name, gkernel, fit_method, k,
                sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
                dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
                dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm,
                nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
                repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
                repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])

    print('\ncomplete.')
def getRelations(sign):
    """Map the sign of a difference to a verbal relation.

    Returns 'better' for -1, 'same' for 0 and 'worse' for 1. Any other value
    (for example NaN) matches nothing and yields None.
    """
    for value, relation in ((-1, 'better'), (0, 'same'), (1, 'worse')):
        if sign == value:
            return relation
    return None
# Run the k-selection experiment only when this file is executed as a script.
if __name__ == '__main__':
    find_best_k()
@@ -0,0 +1,336 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Mon Dec 16 11:53:54 2019 | |||
@author: ljia | |||
""" | |||
import numpy as np | |||
import math | |||
import networkx as nx | |||
import matplotlib.pyplot as plt | |||
import time | |||
import random | |||
from tqdm import tqdm | |||
from itertools import combinations, islice | |||
import multiprocessing | |||
from multiprocessing import Pool | |||
from functools import partial | |||
#import os | |||
import sys | |||
sys.path.insert(0, "../") | |||
from pygraph.utils.graphfiles import loadDataset, loadGXL | |||
#from pygraph.utils.logger2file import * | |||
from iam import iam_upgraded, iam_bash | |||
from utils import compute_kernel, dis_gstar, kernel_distance_matrix | |||
from fitDistance import fit_GED_to_kernel_distance | |||
#from ged import ged_median | |||
def median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, fit_method,
                               graph_dir='/media/ljia/DATA/research-repo/codes/Linlin/py-graph/datasets/monoterpenoides/',
                               edit_costs=None, group_min=None, dataset='monoterpenoides',
                               parallel=True):
    """Compute set and generalized medians of a graph subset and score them.

    The graphs ``Gn[g] for g in group_min`` form the median set. Edit costs are
    chosen according to ``fit_method``, the medians are computed by
    ``iam_bash``, and the resulting graphs are compared with the median set in
    kernel space through ``dis_gstar``.

    Parameters
    ----------
    Gn : sequence of graphs; each needs ``.copy()`` and ``.graph['filename']``.
    node_label, edge_label : forwarded to ``fit_GED_to_kernel_distance`` only.
    gkernel : kernel name forwarded to the fitting and kernel helpers.
    k : not read by the current implementation.
    fit_method : one of 'random', 'expert', 'k-graphs', 'whole-dataset',
        'precomputed'.
    graph_dir, dataset : forwarded to ``iam_bash``; ``dataset == 'Letter'``
        also switches on ``reform_attributes`` and unlabeled kernels.
    edit_costs : used as-is when ``fit_method == 'precomputed'``.
    group_min : indices into ``Gn`` of the median set (required).
    parallel : not read by the current implementation; the fitting helper is
        always called with ``parallel=True``.

    Returns
    -------
    tuple
        ``(sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min,
        group_min[idx_dis_k_gi_min])``.

    Raises
    ------
    TypeError
        If ``group_min`` is None.
    ValueError
        If ``fit_method`` is not one of the supported names.
    """
    if group_min is None:
        # The automatic closest-k search is disabled, so the caller must
        # choose the median set; fail with an explicit message.
        raise TypeError('group_min is required: pass the indices of the '
                        'graphs in Gn that form the median set.')

    Gn_median = [Gn[g].copy() for g in group_min]

    # fit edit costs.
    if fit_method == 'random':  # random
        edit_cost_constant = random.sample(range(1, 10), 6)
        print('edit costs used:', edit_cost_constant)
    elif fit_method == 'expert':  # expert
        edit_cost_constant = [3, 3, 1, 3, 3, 1]
    elif fit_method in ('k-graphs', 'whole-dataset'):
        itr_max = 6
        algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
        params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
                      'algo_options': algo_options, 'stabilizer': None}
        # 'k-graphs' fits on the median set only, 'whole-dataset' on all of Gn.
        graphs_to_fit = Gn_median if fit_method == 'k-graphs' else Gn
        edit_cost_constant, _, _, _, _, _, _ = fit_GED_to_kernel_distance(
            graphs_to_fit, node_label, edge_label, gkernel, itr_max,
            params_ged=params_ged, parallel=True)
    elif fit_method == 'precomputed':
        edit_cost_constant = edit_costs
    else:
        raise ValueError('unknown fit_method: %r' % (fit_method,))

    # compute set median and gen median using IAM (C++ through bash).
    group_fnames = [Gn[g].graph['filename'] for g in group_min]
    sod_sm, sod_gm, fname_sm, fname_gm = iam_bash(group_fnames, edit_cost_constant,
                                                  graph_dir=graph_dir, dataset=dataset)

    # compute distances in kernel space.
    # Fresh copies are taken again, as in the original flow, so the kernels
    # see graphs untouched by the fitting step above.
    Gn_median = [Gn[g].copy() for g in group_min]
    set_median = loadGXL(fname_sm)
    gen_median = loadGXL(fname_gm)
    is_letter = dataset == 'Letter'
    if is_letter:
        for g in Gn_median:
            reform_attributes(g)
        reform_attributes(set_median)
        reform_attributes(gen_median)

    # Label names handed to compute_kernel: none for Letter, otherwise the
    # fixed names 'chem' / 'valence'.
    kernel_node_label = None if is_letter else 'chem'
    kernel_edge_label = None if is_letter else 'valence'
    nb_median = len(Gn_median)

    def _dis_to_median_set(idx, Kmatrix):
        # Row/column 0 of Kmatrix is the candidate median; 1..nb_median are
        # the median-set graphs, weighted uniformly.
        return dis_gstar(idx, range(1, 1 + nb_median),
                         [1 / nb_median] * nb_median, Kmatrix, withterm3=False)

    # compute distance in kernel space for set median.
    Kmatrix_sm = compute_kernel([set_median] + Gn_median, gkernel,
                                kernel_node_label, kernel_edge_label, False)
    dis_k_sm = _dis_to_median_set(0, Kmatrix_sm)

    # compute distance in kernel space for generalized median.
    Kmatrix_gm = compute_kernel([gen_median] + Gn_median, gkernel,
                                kernel_node_label, kernel_edge_label, False)
    dis_k_gm = _dis_to_median_set(0, Kmatrix_gm)

    # compute distance in kernel space for each graph in median set.
    dis_k_gi = [_dis_to_median_set(idx + 1, Kmatrix_gm) for idx in range(nb_median)]

    print('sod_sm:', sod_sm)
    print('sod_gm:', sod_gm)
    print('dis_k_sm:', dis_k_sm)
    print('dis_k_gm:', dis_k_gm)
    print('dis_k_gi:', dis_k_gi)
    idx_dis_k_gi_min = np.argmin(dis_k_gi)
    dis_k_gi_min = dis_k_gi[idx_dis_k_gi_min]
    print('index min dis_k_gi:', group_min[idx_dis_k_gi_min])
    print('min dis_k_gi:', dis_k_gi_min)

    return sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, group_min[idx_dis_k_gi_min]
def reform_attributes(G):
    """Store each node's 'x' and 'y' values as the list ``[x, y]`` under 'attributes'.

    The node attribute dictionaries of ``G`` are modified in place.
    """
    for node in G.nodes:
        node_data = G.nodes[node]
        node_data['attributes'] = [node_data['x'], node_data['y']]
def get_closest_k_graphs(dis_mat, k, parallel):
    """Find the k indices whose pairwise distances have the smallest sum.

    Every k-combination of ``range(len(dis_mat))`` is scored by the sum of
    ``dis_mat[i1, i2]`` over its index pairs.

    Parameters
    ----------
    dis_mat : distance matrix, indexed as ``dis_mat[i1, i2]``
        (assumes a square 2-D numpy array -- TODO confirm).
    k : size of the groups to examine.
    parallel : if true, combinations are processed in slices of at most 1e7
        groups by a multiprocessing pool; otherwise a single loop is used.

    Returns
    -------
    tuple
        ``(sod_ks_min, group_min)``. When no combination exists (for example
        ``k > len(dis_mat)``) this is ``(np.inf, None)``.
    """
    k_graph_groups = combinations(range(0, len(dis_mat)), k)
    sod_ks_min = np.inf
    group_min = None  # stays None when there is nothing to compare
    if parallel:
        len_combination = get_combination_length(len(dis_mat), k)
        len_itr_max = int(len_combination if len_combination < 1e7 else 1e7)
        n_jobs = multiprocessing.cpu_count()
        chunksize = int(len_itr_max / n_jobs + 1)

        def init_worker(dis_mat_toshare):
            # Publish the matrix as a module global in each worker so that
            # _get_closest_k_graphs_parallel can read it.
            global G_dis_mat
            G_dis_mat = dis_mat_toshare

        # With zero combinations the slice size would be 0, which range()
        # rejects, so skip the slicing entirely in that case.
        graph_groups_slices = (
            split_iterable(k_graph_groups, len_itr_max, len_combination)
            if len_combination > 0 else ())
        for graph_groups_cur in graph_groups_slices:
            graph_groups_cur_list = list(graph_groups_cur)
            if not graph_groups_cur_list:
                # The slice count comes from an estimate; ignore empty slices.
                continue
            print('current position:', graph_groups_cur_list[0])
            len_itr_cur = len(graph_groups_cur_list)
            itr = zip(graph_groups_cur_list, range(0, len_itr_cur))
            sod_k_list = np.empty(len_itr_cur)
            graphs_list = [None] * len_itr_cur
            # The context manager terminates the workers if an error escapes;
            # on the normal path the pool is closed and joined as before.
            with Pool(processes=n_jobs, initializer=init_worker,
                      initargs=(dis_mat,)) as pool:
                iterator = pool.imap_unordered(_get_closest_k_graphs_parallel,
                                               itr, chunksize)
                for graphs, i, sod_ks in iterator:
                    sod_k_list[i] = sod_ks
                    graphs_list[i] = graphs
                pool.close()
                pool.join()

            arg_min = np.argmin(sod_k_list)
            sod_ks_cur = sod_k_list[arg_min]
            group_cur = graphs_list[arg_min]
            if sod_ks_cur < sod_ks_min:
                sod_ks_min = sod_ks_cur
                group_min = group_cur
                print('get closer graphs:', sod_ks_min, group_min)
    else:
        for items in tqdm(k_graph_groups, desc='Choosing k closest graphs', file=sys.stdout):
            sod_ks = 0
            for i1, i2 in combinations(items, 2):
                sod_ks += dis_mat[i1, i2]
            if sod_ks < sod_ks_min:
                sod_ks_min = sod_ks
                group_min = items
                print('get closer graphs:', sod_ks_min, group_min)

    return sod_ks_min, group_min
def _get_closest_k_graphs_parallel(itr):
    """Score one group of graph indices inside a pool worker.

    ``itr`` is a pair ``(group, position)``. The score is the sum of
    ``G_dis_mat[i1, i2]`` over all index pairs of ``group``, where
    ``G_dis_mat`` is the module global set by the pool initializer.
    Returns ``(group, position, score)``.
    """
    group, position = itr[0], itr[1]
    sod_ks = sum(G_dis_mat[i1, i2] for i1, i2 in combinations(group, 2))
    return group, position, sod_ks
def split_iterable(iterable, n, len_iter):
    """Yield consecutive lazy slices of at most ``n`` items from ``iterable``.

    One slice is produced for every step of ``range(0, len_iter, n)``. All
    slices draw from the same underlying iterator, so each must be consumed
    before the next; slices requested after exhaustion are empty.
    """
    source = iter(iterable)
    yield from (islice(source, n) for _ in range(0, len_iter, n))
def get_combination_length(n, k):
    """Return the number of k-combinations of n items, i.e. C(n, k).

    The falling product ``n * (n - 1) * ... * (n - k + 1)`` is divided by
    ``k!`` with integer division. A product of k consecutive integers is
    always a multiple of ``k!``, so the result is exact for arbitrarily large
    values, whereas float division loses precision above 2**53.
    """
    len_combination = 1
    for i in range(n, n - k, -1):
        len_combination *= i
    return len_combination // math.factorial(k)
############################################################################### | |||
def test_k_closest_graphs():
    """Run ``median_on_k_closest_graphs`` once on the monoterpenoides dataset.

    Uses the treelet kernel, ``k = 5`` and the 'expert' edit costs; results
    are only printed by the callee.
    """
    ds = {'name': 'monoterpenoides',
          'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}  # node/edge symb
    Gn, _ = loadDataset(ds['dataset'])
    # Other kernels tried here: 'untilhpathkernel', 'weisfeilerlehmankernel'.
    gkernel = 'treeletkernel'
    node_label = 'atom'
    edge_label = 'bond_type'

    k = 5
    # Costs for the 'precomputed' variant of this experiment; unused by the
    # 'expert' run below but kept for reference.
    edit_costs = [0.16229209837639536, 0.06612870523413916, 0.04030113378793905,
                  0.20723547009415202, 0.3338607220394598, 0.27054392518077297]

    # NOTE(review): median_on_k_closest_graphs no longer searches for the
    # closest graphs itself and needs an explicit median set. The first k
    # graphs are used as a placeholder -- confirm the intended selection.
    group_min = list(range(k))

    # The callee returns seven values; the last is the index of the median-set
    # graph closest in kernel space.
    sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min \
        = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k,
                                     'expert', group_min=group_min,
                                     parallel=False)
    return
def test_k_closest_graphs_with_cv():
    """Run the median experiment over pre-generated collections per class.

    For each class label in ``y_all`` and each of ``repeats`` collection
    files, the medians are computed with edit costs fitted on the whole
    collection ('whole-dataset'). Per-class lists, per-class means and
    overall means are printed.
    """
    gkernel = 'untilhpathkernel'
    node_label = 'atom'
    edge_label = 'bond_type'

    k = 4

    y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
    repeats = 50
    collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
    graph_dir = collection_path + 'gxl/'

    # NOTE(review): median_on_k_closest_graphs needs an explicit median set.
    # The first k graphs of each collection are used as a placeholder --
    # confirm the intended selection.
    group_min = list(range(k))

    # One entry per class; each entry starts as the per-repeat list and is
    # replaced by its mean once the class is finished.
    sod_sm_list = []
    sod_gm_list = []
    dis_k_sm_list = []
    dis_k_gm_list = []
    dis_k_gi_min_list = []
    for y in y_all:
        print('\n-------------------------------------------------------')
        print('class of y:', y)

        sod_sm_list.append([])
        sod_gm_list.append([])
        dis_k_sm_list.append([])
        dis_k_gm_list.append([])
        dis_k_gi_min_list.append([])

        for repeat in range(repeats):
            print('\nrepeat ', repeat)
            collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
            Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
            # The callee returns seven values; only four scalars and the
            # minimum per-graph distance are recorded here.
            sod_sm, sod_gm, dis_k_sm, dis_k_gm, _, dis_k_gi_min, _ \
                = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel,
                                             k, 'whole-dataset', graph_dir=graph_dir,
                                             group_min=group_min,
                                             parallel=False)

            sod_sm_list[-1].append(sod_sm)
            sod_gm_list[-1].append(sod_gm)
            dis_k_sm_list[-1].append(dis_k_sm)
            dis_k_gm_list[-1].append(dis_k_gm)
            dis_k_gi_min_list[-1].append(dis_k_gi_min)

        print('\nsods of the set median for this class:', sod_sm_list[-1])
        print('\nsods of the gen median for this class:', sod_gm_list[-1])
        print('\ndistances in kernel space of set median for this class:',
              dis_k_sm_list[-1])
        print('\ndistances in kernel space of gen median for this class:',
              dis_k_gm_list[-1])
        print('\ndistances in kernel space of min graph for this class:',
              dis_k_gi_min_list[-1])

        sod_sm_list[-1] = np.mean(sod_sm_list[-1])
        sod_gm_list[-1] = np.mean(sod_gm_list[-1])
        dis_k_sm_list[-1] = np.mean(dis_k_sm_list[-1])
        dis_k_gm_list[-1] = np.mean(dis_k_gm_list[-1])
        dis_k_gi_min_list[-1] = np.mean(dis_k_gi_min_list[-1])

    print()
    print('\nmean sods of the set median for each class:', sod_sm_list)
    print('\nmean sods of the gen median for each class:', sod_gm_list)
    print('\nmean distance in kernel space of set median for each class:',
          dis_k_sm_list)
    print('\nmean distances in kernel space of gen median for each class:',
          dis_k_gm_list)
    print('\nmean distances in kernel space of min graph for each class:',
          dis_k_gi_min_list)

    print('\nmean sods of the set median of all:', np.mean(sod_sm_list))
    print('\nmean sods of the gen median of all:', np.mean(sod_gm_list))
    print('\nmean distances in kernel space of set median of all:',
          np.mean(dis_k_sm_list))
    print('\nmean distances in kernel space of gen median of all:',
          np.mean(dis_k_gm_list))
    print('\nmean distances in kernel space of min graph of all:',
          np.mean(dis_k_gi_min_list))

    return
# Run the single-shot experiment when executed as a script; the
# cross-validation variant is kept below as a disabled alternative.
if __name__ == '__main__':
    test_k_closest_graphs()
#    test_k_closest_graphs_with_cv()
@@ -0,0 +1,246 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Tue Jan 14 15:39:29 2020 | |||
@author: ljia | |||
""" | |||
import numpy as np | |||
import random | |||
import csv | |||
from shutil import copyfile | |||
import networkx as nx | |||
import matplotlib.pyplot as plt | |||
import sys | |||
sys.path.insert(0, "../") | |||
from pygraph.utils.graphfiles import loadDataset, loadGXL, saveGXL | |||
from preimage.test_k_closest_graphs import median_on_k_closest_graphs, reform_attributes | |||
from preimage.utils import get_same_item_indices | |||
from preimage.find_best_k import getRelations | |||
def xp_letter_h():
    """Run the median-graph experiment on the Letter dataset, class by class.

    For every ``k`` in ``k_list``, every class found in the dataset labels and
    every repeat, ``k`` graphs of the class are sampled as the median set and
    passed to ``median_on_k_closest_graphs`` with ``dataset='Letter'``. The
    function

    * appends one row per repeat to ``results_detail.<fit_method>.csv``,
    * copies the set/generalized median GXL files produced by the callee into
      ``<dir_output>/medians/``, saves the median-set graph closest in kernel
      space next to them, and draws all three,
    * appends one summary row per class and one 'all' row per ``k`` to
      ``results_summary.csv``.

    Header rows are appended on every run.
    """
    ds = {'name': 'Letter-high',
          'dataset': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/collections/Letter.xml',
          'graph_dir': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/datasets/Letter/HIGH/'}  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir'])

    gkernel = 'structuralspkernel'
    node_label = None
    edge_label = None
    ds_name = 'letter-h'
    dir_output = 'results/xp_letter_h/'

    repeats = 1
    # k_list = range(2, 11) was used for smaller median sets.
    k_list = [150]
    fit_method = 'precomputed'
    # get indices by classes.
    y_idx = get_same_item_indices(y_all)

    # create result files.
    # newline='' is what the csv module expects for files handed to a writer.
    fn_output_detail = 'results_detail.' + fit_method + '.csv'
    with open(dir_output + fn_output_detail, 'a', newline='') as f_detail:
        csv.writer(f_detail).writerow([
            'dataset', 'graph kernel', 'fit method', 'k',
            'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
            'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
            'dis_k gi -> GM', 'median set'])

    fn_output_summary = 'results_summary.csv'
    with open(dir_output + fn_output_summary, 'a', newline='') as f_summary:
        csv.writer(f_summary).writerow([
            'dataset', 'graph kernel', 'fit method', 'k',
            'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
            'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
            'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM',
            '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM',
            'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
            'repeats better dis_k gi -> GM'])

    # One seed per repeat, so a given repeat index draws from the same random
    # state for every k and class.
    random.seed(1)
    rdn_seed_list = random.sample(range(0, repeats * 100), repeats)

    # Files written by the callee's median computation; copied after each run.
    fname_sm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/set_median.gxl'
    fname_gm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/gen_median.gxl'

    for k in k_list:
        print('\n--------- k =', k, '----------')

        # Per-class means, one entry per class.
        sod_sm_mean_list = []
        sod_gm_mean_list = []
        dis_k_sm_mean_list = []
        dis_k_gm_mean_list = []
        dis_k_gi_min_mean_list = []

        for y, values in y_idx.items():
            print('\ny =', y)

            # Graphs of this class; median-set indices refer to this list.
            Gn_median = [Gn[g] for g in values]

            sod_sm_list = []
            sod_gm_list = []
            dis_k_sm_list = []
            dis_k_gm_list = []
            dis_k_gi_min_list = []
            # Each counter is [first > second, first == second, first < second].
            nb_sod_sm2gm = [0, 0, 0]
            nb_dis_k_sm2gm = [0, 0, 0]
            nb_dis_k_gi2sm = [0, 0, 0]
            nb_dis_k_gi2gm = [0, 0, 0]
            repeats_better_sod_sm2gm = []
            repeats_better_dis_k_sm2gm = []
            repeats_better_dis_k_gi2sm = []
            repeats_better_dis_k_gi2gm = []

            for repeat in range(repeats):
                print('\nrepeat =', repeat)
                random.seed(rdn_seed_list[repeat])
                # random.sample raises ValueError if the class has fewer than
                # k graphs.
                median_set_idx_idx = random.sample(range(0, len(values)), k)
                median_set_idx = [values[idx] for idx in median_set_idx_idx]
                print('median set: ', median_set_idx)
                sod_sm, sod_gm, dis_k_sm, dis_k_gm, _, dis_k_gi_min, idx_dis_k_gi_min \
                    = median_on_k_closest_graphs(Gn_median, node_label, edge_label,
                                                 gkernel, k, fit_method=fit_method,
                                                 graph_dir=ds['graph_dir'],
                                                 edit_costs=None,
                                                 group_min=median_set_idx_idx,
                                                 dataset='Letter', parallel=False)

                # write result detail.
                sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
                dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
                dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
                dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
                with open(dir_output + fn_output_detail, 'a', newline='') as f_detail:
                    csv.writer(f_detail).writerow([
                        ds_name, gkernel, fit_method, k,
                        y, repeat,
                        sod_sm, sod_gm, dis_k_sm, dis_k_gm,
                        dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
                        dis_k_gi2gm, median_set_idx])

                # compute result summary.
                sod_sm_list.append(sod_sm)
                sod_gm_list.append(sod_gm)
                dis_k_sm_list.append(dis_k_sm)
                dis_k_gm_list.append(dis_k_gm)
                dis_k_gi_min_list.append(dis_k_gi_min)

                # (first, second, counter, repeats where first > second)
                comparisons = (
                    (sod_sm, sod_gm, nb_sod_sm2gm, repeats_better_sod_sm2gm),
                    (dis_k_sm, dis_k_gm, nb_dis_k_sm2gm, repeats_better_dis_k_sm2gm),
                    (dis_k_gi_min, dis_k_sm, nb_dis_k_gi2sm, repeats_better_dis_k_gi2sm),
                    (dis_k_gi_min, dis_k_gm, nb_dis_k_gi2gm, repeats_better_dis_k_gi2gm),
                )
                for first, second, counter, better_repeats in comparisons:
                    if first > second:
                        counter[0] += 1
                        better_repeats.append(repeat)
                    elif first == second:
                        counter[1] += 1
                    elif first < second:
                        counter[2] += 1

                # save median graphs.
                fn_suffix = '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
                fn_pre_sm_new = dir_output + 'medians/set_median' + fn_suffix
                copyfile(fname_sm, fn_pre_sm_new + '.gxl')
                fn_pre_gm_new = dir_output + 'medians/gen_median' + fn_suffix
                copyfile(fname_gm, fn_pre_gm_new + '.gxl')
                G_best_kernel = Gn_median[idx_dis_k_gi_min].copy()
                reform_attributes(G_best_kernel)
                fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel' + fn_suffix
                saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='gedlib-letter')

                # plot median graphs.
                set_median = loadGXL(fn_pre_sm_new + '.gxl')
                gen_median = loadGXL(fn_pre_gm_new + '.gxl')
                draw_Letter_graph(set_median, fn_pre_sm_new)
                draw_Letter_graph(gen_median, fn_pre_gm_new)
                draw_Letter_graph(G_best_kernel, fn_pre_g_best_kernel)

            # write result summary for each letter.
            sod_sm_mean_list.append(np.mean(sod_sm_list))
            sod_gm_mean_list.append(np.mean(sod_gm_list))
            dis_k_sm_mean_list.append(np.mean(dis_k_sm_list))
            dis_k_gm_mean_list.append(np.mean(dis_k_gm_list))
            dis_k_gi_min_mean_list.append(np.mean(dis_k_gi_min_list))
            sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean_list[-1] - sod_sm_mean_list[-1]))
            dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1]))
            dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
            dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
            with open(dir_output + fn_output_summary, 'a', newline='') as f_summary:
                csv.writer(f_summary).writerow([
                    ds_name, gkernel, fit_method, k, y,
                    sod_sm_mean_list[-1], sod_gm_mean_list[-1],
                    dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1],
                    dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean,
                    dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm,
                    nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
                    repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
                    repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])

        # write result summary over all letters (mean of the class means).
        sod_sm_mean = np.mean(sod_sm_mean_list)
        sod_gm_mean = np.mean(sod_gm_mean_list)
        dis_k_sm_mean = np.mean(dis_k_sm_mean_list)
        dis_k_gm_mean = np.mean(dis_k_gm_mean_list)
        # Average the per-class means like the other columns, not the
        # per-repeat list of the last class.
        dis_k_gi_min_mean = np.mean(dis_k_gi_min_mean_list)
        sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
        dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
        dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
        dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
        with open(dir_output + fn_output_summary, 'a', newline='') as f_summary:
            csv.writer(f_summary).writerow([
                ds_name, gkernel, fit_method, k, 'all',
                sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
                dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
                dis_k_gi2sm_mean, dis_k_gi2gm_mean])

    print('\ncomplete.')
# Draw the current median graph.
def draw_Letter_graph(graph, file_prefix):
    """Draw ``graph`` at its node coordinates and save it as ``<file_prefix>.eps``.

    Each node is placed at the position given by its 'x' and 'y' attributes,
    converted to float. The figure created here is closed afterwards so that
    repeated calls do not accumulate open figures.
    """
    fig = plt.figure()
    # graph.nodes[...] is used instead of graph.node[...], which newer
    # networkx releases no longer provide.
    pos = {n: np.array([float(graph.nodes[n]['x']), float(graph.nodes[n]['y'])])
           for n in graph.nodes}
    nx.draw_networkx(graph, pos)
    plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
    plt.close(fig)
# Run the Letter experiment only when this file is executed as a script.
if __name__ == "__main__":
    xp_letter_h()