From 75e42f18380ca4ec51863ef042b501672de3f25b Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Mon, 13 Apr 2020 17:43:33 +0200 Subject: [PATCH] rearrange gklearn/preimage directory. --- gklearn/preimage/find_best_k.py | 170 ---- gklearn/preimage/fitDistance.py | 430 --------- gklearn/preimage/ged.py | 467 ---------- gklearn/preimage/iam.py | 775 ----------------- gklearn/preimage/knn.py | 114 --- gklearn/preimage/libs.py | 6 - gklearn/preimage/median.py | 218 ----- gklearn/preimage/median_benoit.py | 201 ----- gklearn/preimage/median_linlin.py | 215 ----- gklearn/preimage/pathfrequency.py | 201 ----- gklearn/preimage/preimage_iam.py | 705 --------------- gklearn/preimage/preimage_random.py | 309 ------- gklearn/preimage/python_code.py | 122 --- gklearn/preimage/test.py | 83 -- gklearn/preimage/test_fitDistance.py | 648 -------------- gklearn/preimage/test_ged.py | 520 ----------- gklearn/preimage/test_iam.py | 964 --------------------- gklearn/preimage/test_k_closest_graphs.py | 462 ---------- gklearn/preimage/test_median_preimage_generator.py | 69 -- gklearn/preimage/test_others.py | 686 --------------- gklearn/preimage/test_preimage_iam.py | 620 ------------- gklearn/preimage/test_preimage_mix.py | 539 ------------ gklearn/preimage/test_preimage_random.py | 398 --------- gklearn/preimage/xp_fit_method.py | 935 -------------------- gklearn/preimage/xp_letter_h.py | 476 ---------- gklearn/preimage/xp_monoterpenoides.py | 249 ------ 26 files changed, 10582 deletions(-) delete mode 100644 gklearn/preimage/find_best_k.py delete mode 100644 gklearn/preimage/fitDistance.py delete mode 100644 gklearn/preimage/ged.py delete mode 100644 gklearn/preimage/iam.py delete mode 100644 gklearn/preimage/knn.py delete mode 100644 gklearn/preimage/libs.py delete mode 100644 gklearn/preimage/median.py delete mode 100644 gklearn/preimage/median_benoit.py delete mode 100644 gklearn/preimage/median_linlin.py delete mode 100644 gklearn/preimage/pathfrequency.py delete mode 100644 gklearn/preimage/preimage_iam.py delete mode 100644 gklearn/preimage/preimage_random.py delete mode 100644 gklearn/preimage/python_code.py delete mode 100644 gklearn/preimage/test.py delete mode 100644 gklearn/preimage/test_fitDistance.py delete mode 100644 gklearn/preimage/test_ged.py delete mode 100644 gklearn/preimage/test_iam.py delete mode 100644 gklearn/preimage/test_k_closest_graphs.py delete mode 100644 gklearn/preimage/test_median_preimage_generator.py delete mode 100644 gklearn/preimage/test_others.py delete mode 100644 gklearn/preimage/test_preimage_iam.py delete mode 100644 gklearn/preimage/test_preimage_mix.py delete mode 100644 gklearn/preimage/test_preimage_random.py delete mode 100644 gklearn/preimage/xp_fit_method.py delete mode 100644 gklearn/preimage/xp_letter_h.py delete mode 100644 gklearn/preimage/xp_monoterpenoides.py diff --git a/gklearn/preimage/find_best_k.py b/gklearn/preimage/find_best_k.py deleted file mode 100644 index df38d32..0000000 --- a/gklearn/preimage/find_best_k.py +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Thu Jan 9 11:54:32 2020 - -@author: ljia -""" -import numpy as np -import random -import csv - -from gklearn.utils.graphfiles import loadDataset -from gklearn.preimage.test_k_closest_graphs import median_on_k_closest_graphs - -def find_best_k(): - ds = {'name': 'monoterpenoides', - 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb - Gn, y_all = loadDataset(ds['dataset']) -# Gn = Gn[0:50] - gkernel = 'treeletkernel' - 
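-    # Overview: sweep the size k of the median set. For each k, repeatedly
-    # sample k graphs at random, compute their set median (SM) and generalized
-    # median (GM) under the pre-fitted edit costs below, and record how SM and
-    # GM compare in terms of SOD (sum of GEDs) and kernel distance (dis_k).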
node_label = 'atom' - edge_label = 'bond_type' - ds_name = 'mono' - dir_output = 'results/test_find_best_k/' - - repeats = 50 - k_list = range(2, 11) - fit_method = 'k-graphs' - # fitted on the whole dataset - treelet - mono - edit_costs = [0.1268873773592978, 0.004084633224249829, 0.0897581955378986, 0.15328856114451297, 0.3109956881625734, 0.0] - - # create result files. - fn_output_detail = 'results_detail.' + fit_method + '.csv' - f_detail = open(dir_output + fn_output_detail, 'a') - csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k', - 'repeat', 'median set', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', - 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', - 'dis_k gi -> GM']) - f_detail.close() - fn_output_summary = 'results_summary.csv' - f_summary = open(dir_output + fn_output_summary, 'a') - csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k', - 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', - 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', - 'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM', - '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM', - 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM', - 'repeats better dis_k gi -> GM']) - f_summary.close() - - random.seed(1) - rdn_seed_list = random.sample(range(0, repeats * 100), repeats) - - for k in k_list: - print('\n--------- k =', k, '----------') - - sod_sm_list = [] - sod_gm_list = [] - dis_k_sm_list = [] - dis_k_gm_list = [] - dis_k_gi_min_list = [] - nb_sod_sm2gm = [0, 0, 0] - nb_dis_k_sm2gm = [0, 0, 0] - nb_dis_k_gi2sm = [0, 0, 0] - nb_dis_k_gi2gm = [0, 0, 0] - repeats_better_sod_sm2gm = [] - repeats_better_dis_k_sm2gm = [] - repeats_better_dis_k_gi2sm = [] - repeats_better_dis_k_gi2gm = [] - - - for repeat in range(repeats): - print('\nrepeat =', repeat) - random.seed(rdn_seed_list[repeat]) - median_set_idx = random.sample(range(0, len(Gn)), k) - print('median set: ', median_set_idx) - - sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \ - = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, - fit_method='k-graphs', - edit_costs=edit_costs, - group_min=median_set_idx, - parallel=False) - - # write result detail. - sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm)) - dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm)) - dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min)) - dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min)) - f_detail = open(dir_output + fn_output_detail, 'a') - csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k, repeat, - median_set_idx, sod_sm, sod_gm, dis_k_sm, dis_k_gm, - dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm, - dis_k_gi2gm]) - f_detail.close() - - # compute result summary. 
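-            # Each nb_* triple counts, over the repeats, how often the first
-            # quantity is greater than, equal to, or smaller than the second;
-            # this matches the 'better'/'same'/'worse' relations produced by
-            # getRelations() below.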
- sod_sm_list.append(sod_sm) - sod_gm_list.append(sod_gm) - dis_k_sm_list.append(dis_k_sm) - dis_k_gm_list.append(dis_k_gm) - dis_k_gi_min_list.append(dis_k_gi_min) - # # SOD SM -> GM - if sod_sm > sod_gm: - nb_sod_sm2gm[0] += 1 - repeats_better_sod_sm2gm.append(repeat) - elif sod_sm == sod_gm: - nb_sod_sm2gm[1] += 1 - elif sod_sm < sod_gm: - nb_sod_sm2gm[2] += 1 - # # dis_k SM -> GM - if dis_k_sm > dis_k_gm: - nb_dis_k_sm2gm[0] += 1 - repeats_better_dis_k_sm2gm.append(repeat) - elif dis_k_sm == dis_k_gm: - nb_dis_k_sm2gm[1] += 1 - elif dis_k_sm < dis_k_gm: - nb_dis_k_sm2gm[2] += 1 - # # dis_k gi -> SM - if dis_k_gi_min > dis_k_sm: - nb_dis_k_gi2sm[0] += 1 - repeats_better_dis_k_gi2sm.append(repeat) - elif dis_k_gi_min == dis_k_sm: - nb_dis_k_gi2sm[1] += 1 - elif dis_k_gi_min < dis_k_sm: - nb_dis_k_gi2sm[2] += 1 - # # dis_k gi -> GM - if dis_k_gi_min > dis_k_gm: - nb_dis_k_gi2gm[0] += 1 - repeats_better_dis_k_gi2gm.append(repeat) - elif dis_k_gi_min == dis_k_gm: - nb_dis_k_gi2gm[1] += 1 - elif dis_k_gi_min < dis_k_gm: - nb_dis_k_gi2gm[2] += 1 - - # write result summary. - sod_sm_mean = np.mean(sod_sm_list) - sod_gm_mean = np.mean(sod_gm_list) - dis_k_sm_mean = np.mean(dis_k_sm_list) - dis_k_gm_mean = np.mean(dis_k_gm_list) - dis_k_gi_min_mean = np.mean(dis_k_gi_min_list) - sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean)) - dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean)) - dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean)) - dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean)) - f_summary = open(dir_output + fn_output_summary, 'a') - csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, - sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean, - dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean, - dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm, - nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm, - repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm, - repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm]) - f_summary.close() - - print('\ncomplete.') - return - - -def getRelations(sign): - if sign == -1: - return 'better' - elif sign == 0: - return 'same' - elif sign == 1: - return 'worse' - - -if __name__ == '__main__': - find_best_k() \ No newline at end of file diff --git a/gklearn/preimage/fitDistance.py b/gklearn/preimage/fitDistance.py deleted file mode 100644 index 234f7fc..0000000 --- a/gklearn/preimage/fitDistance.py +++ /dev/null @@ -1,430 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Wed Oct 16 14:20:06 2019 - -@author: ljia -""" -import numpy as np -from tqdm import tqdm -from itertools import combinations_with_replacement, combinations -import multiprocessing -from multiprocessing import Pool -from functools import partial -import time -import random -import sys - -from scipy import optimize -from scipy.optimize import minimize -import cvxpy as cp - -from gklearn.preimage.ged import GED, get_nb_edit_operations, get_nb_edit_operations_letter, get_nb_edit_operations_nonsymbolic -from gklearn.preimage.utils import kernel_distance_matrix - -def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, - params_ged={'lib': 'gedlibpy', 'cost': 'CONSTANT', - 'method': 'IPFP', 'stabilizer': None}, - init_costs=[3, 3, 1, 3, 3, 1], - dataset='monoterpenoides', Kmatrix=None, - parallel=True): -# dataset = dataset.lower() - - # c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them. 
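-    # The fitting step below solves a least-squares problem: stack the per-pair
-    # counts of edit operations into a matrix N (one row per graph pair, one
-    # column per operation) and find nonnegative costs w minimizing
-    # ||N w - dis_k_vec||_2. A minimal stand-alone sketch of that step, assuming
-    # nb_cost_mat and dis_k_vec as built below:
-#    from scipy import optimize
-#    w, rnorm = optimize.nnls(nb_cost_mat, dis_k_vec)  # nonnegative least squares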
-# random.seed(1) -# cost_rdm = random.sample(range(1, 10), 6) -# init_costs = cost_rdm + [0] -# init_costs = cost_rdm -# init_costs = [3, 3, 1, 3, 3, 1] -# init_costs = [i * 0.01 for i in cost_rdm] + [0] -# init_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0] -# init_costs = [0, 0, 0.9544, 0.026, 0.0196, 0] -# init_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0] -# idx_cost_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0] - - # compute distances in feature space. - dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, - Kmatrix=Kmatrix, gkernel=gkernel) - dis_k_vec = [] - for i in range(len(dis_k_mat)): -# for j in range(i, len(dis_k_mat)): - for j in range(i + 1, len(dis_k_mat)): - dis_k_vec.append(dis_k_mat[i, j]) - dis_k_vec = np.array(dis_k_vec) - - # init ged. - print('\ninitial:') - time0 = time.time() - params_ged['dataset'] = dataset - params_ged['edit_cost_constant'] = init_costs - ged_vec_init, ged_mat, n_edit_operations = compute_geds(Gn, params_ged, - parallel=parallel) - residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))] - time_list = [time.time() - time0] - edit_cost_list = [init_costs] - nb_cost_mat = np.array(n_edit_operations) - nb_cost_mat_list = [nb_cost_mat] - print('edit_costs:', init_costs) - print('residual_list:', residual_list) - - for itr in range(itr_max): - print('\niteration', itr) - time0 = time.time() - # "fit" geds to distances in feature space by tuning edit costs using the - # Least Squares Method. - np.savez('results/xp_fit_method/fit_data_debug' + str(itr) + '.gm', - nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec, - n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init, - ged_mat=ged_mat) - edit_costs_new, residual = update_costs(nb_cost_mat, dis_k_vec, - dataset=dataset, cost=params_ged['cost']) - for i in range(len(edit_costs_new)): - if -1e-9 <= edit_costs_new[i] <= 1e-9: - edit_costs_new[i] = 0 - if edit_costs_new[i] < 0: - raise ValueError('The edit cost is negative.') -# for i in range(len(edit_costs_new)): -# if edit_costs_new[i] < 0: -# edit_costs_new[i] = 0 - - # compute new GEDs and numbers of edit operations. 
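-        # Refitting the costs changes the optimal edit paths, which in turn
-        # changes the operation counts in nb_cost_mat; hence GEDs and counts
-        # are recomputed under the new costs on every iteration.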
- params_ged['edit_cost_constant'] = edit_costs_new # np.array([edit_costs_new[0], edit_costs_new[1], 0.75]) - ged_vec, ged_mat, n_edit_operations = compute_geds(Gn, params_ged, - parallel=parallel) - residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec)))) - time_list.append(time.time() - time0) - edit_cost_list.append(edit_costs_new) - nb_cost_mat = np.array(n_edit_operations) - nb_cost_mat_list.append(nb_cost_mat) - print('edit_costs:', edit_costs_new) - print('residual_list:', residual_list) - - return edit_costs_new, residual_list, edit_cost_list, dis_k_mat, ged_mat, \ - time_list, nb_cost_mat_list - - -def compute_geds(Gn, params_ged, parallel=False): - edit_cost_name = params_ged['cost'] - if edit_cost_name == 'LETTER' or edit_cost_name == 'LETTER2': - get_nb_eo = get_nb_edit_operations_letter - elif edit_cost_name == 'NON_SYMBOLIC': - get_nb_eo = get_nb_edit_operations_nonsymbolic - else: - get_nb_eo = get_nb_edit_operations - ged_mat = np.zeros((len(Gn), len(Gn))) - if parallel: -# print('parallel') -# len_itr = int(len(Gn) * (len(Gn) + 1) / 2) - len_itr = int(len(Gn) * (len(Gn) - 1) / 2) - ged_vec = [0 for i in range(len_itr)] - n_edit_operations = [0 for i in range(len_itr)] -# itr = combinations_with_replacement(range(0, len(Gn)), 2) - itr = combinations(range(0, len(Gn)), 2) - n_jobs = multiprocessing.cpu_count() - if len_itr < 100 * n_jobs: - chunksize = int(len_itr / n_jobs) + 1 - else: - chunksize = 100 - def init_worker(gn_toshare): - global G_gn - G_gn = gn_toshare - do_partial = partial(_wrapper_compute_ged_parallel, params_ged, get_nb_eo) - pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,)) - iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize), - desc='computing GEDs', file=sys.stdout) -# iterator = pool.imap_unordered(do_partial, itr, chunksize) - for i, j, dis, n_eo_tmp in iterator: - idx_itr = int(len(Gn) * i + j - (i + 1) * (i + 2) / 2) - ged_vec[idx_itr] = dis - ged_mat[i][j] = dis - ged_mat[j][i] = dis - n_edit_operations[idx_itr] = n_eo_tmp -# print('\n-------------------------------------------') -# print(i, j, idx_itr, dis) - pool.close() - pool.join() - - else: - ged_vec = [] - n_edit_operations = [] - for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout): -# for i in range(len(Gn)): - for j in range(i + 1, len(Gn)): - dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], **params_ged) - ged_vec.append(dis) - ged_mat[i][j] = dis - ged_mat[j][i] = dis - n_eo_tmp = get_nb_eo(Gn[i], Gn[j], pi_forward, pi_backward) - n_edit_operations.append(n_eo_tmp) - - return ged_vec, ged_mat, n_edit_operations - - -def _wrapper_compute_ged_parallel(params_ged, get_nb_eo, itr): - i = itr[0] - j = itr[1] - dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], params_ged, get_nb_eo) - return i, j, dis, n_eo_tmp - - -def _compute_ged_parallel(g1, g2, params_ged, get_nb_eo): - dis, pi_forward, pi_backward = GED(g1, g2, **params_ged) - n_eo_tmp = get_nb_eo(g1, g2, pi_forward, pi_backward) # [0,0,0,0,0,0] - return dis, n_eo_tmp - - -def update_costs(nb_cost_mat, dis_k_vec, dataset='monoterpenoides', - cost='CONSTANT', rw_constraints='inequality'): -# if dataset == 'Letter-high': - if cost == 'LETTER': - pass -# # method 1: set alpha automatically, just tune c_vir and c_eir by -# # LMS using cvxpy. 
-# alpha = 0.5 -# coeff = 100 # np.max(alpha * nb_cost_mat[:,4] / dis_k_vec) -## if np.count_nonzero(nb_cost_mat[:,4]) == 0: -## alpha = 0.75 -## else: -## alpha = np.min([dis_k_vec / c_vs for c_vs in nb_cost_mat[:,4] if c_vs != 0]) -## alpha = alpha * 0.99 -# param_vir = alpha * (nb_cost_mat[:,0] + nb_cost_mat[:,1]) -# param_eir = (1 - alpha) * (nb_cost_mat[:,4] + nb_cost_mat[:,5]) -# nb_cost_mat_new = np.column_stack((param_vir, param_eir)) -# dis_new = coeff * dis_k_vec - alpha * nb_cost_mat[:,3] -# -# x = cp.Variable(nb_cost_mat_new.shape[1]) -# cost = cp.sum_squares(nb_cost_mat_new * x - dis_new) -# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] -# prob = cp.Problem(cp.Minimize(cost), constraints) -# prob.solve() -# edit_costs_new = x.value -# edit_costs_new = np.array([edit_costs_new[0], edit_costs_new[1], alpha]) -# residual = np.sqrt(prob.value) - -# # method 2: tune c_vir, c_eir and alpha by nonlinear programming by -# # scipy.optimize.minimize. -# w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1] -# w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5] -# w2 = nb_cost_mat[:,3] -# w3 = dis_k_vec -# func_min = lambda x: np.sum((w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \ -# + w2 * x[2] - w3 * x[3]) ** 2) -# bounds = ((0, None), (0., None), (0.5, 0.5), (0, None)) -# res = minimize(func_min, [0.9, 1.7, 0.75, 10], bounds=bounds) -# edit_costs_new = res.x[0:3] -# residual = res.fun - - # method 3: tune c_vir, c_eir and alpha by nonlinear programming using cvxpy. - - -# # method 4: tune c_vir, c_eir and alpha by QP function -# # scipy.optimize.least_squares. An initial guess is required. -# w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1] -# w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5] -# w2 = nb_cost_mat[:,3] -# w3 = dis_k_vec -# func = lambda x: (w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \ -# + w2 * x[2] - w3 * x[3]) ** 2 -# res = optimize.root(func, [0.9, 1.7, 0.75, 100]) -# edit_costs_new = res.x -# residual = None - elif cost == 'LETTER2': -# # 1. if c_vi != c_vr, c_ei != c_er. -# nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] -# x = cp.Variable(nb_cost_mat_new.shape[1]) -# cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) -## # 1.1 no constraints. -## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] -# # 1.2 c_vs <= c_vi + c_vr. -# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], -# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] -## # 2. if c_vi == c_vr, c_ei == c_er. -## nb_cost_mat_new = nb_cost_mat[:,[0,3,4]] -## nb_cost_mat_new[:,0] += nb_cost_mat[:,1] -## nb_cost_mat_new[:,2] += nb_cost_mat[:,5] -## x = cp.Variable(nb_cost_mat_new.shape[1]) -## cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) -## # 2.1 no constraints. -## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] -### # 2.2 c_vs <= c_vi + c_vr. -### constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], -### np.array([2.0, -1.0, 0.0]).T@x >= 0.0] -# -# prob = cp.Problem(cp.Minimize(cost_fun), constraints) -# prob.solve() -# edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]] -# edit_costs_new = np.array(edit_costs_new) -# residual = np.sqrt(prob.value) - if rw_constraints == 'inequality': - # c_vs <= c_vi + c_vr. 
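-            # Rationale: if the substitution cost exceeded removal plus
-            # insertion, an optimal edit path would never substitute, so the
-            # constraint keeps the fitted costs metric-like.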
-            nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
-            x = cp.Variable(nb_cost_mat_new.shape[1])
-            cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
-            constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])],
-                           np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
-            prob = cp.Problem(cp.Minimize(cost_fun), constraints)
-            try:
-                prob.solve(verbose=True)
-            except MemoryError as error0:
-                print('\nUsing solver "OSQP" caused a memory error.')
-                print('the original error message is\n', error0)
-                print('solver status: ', prob.status)
-                print('trying solver "CVXOPT" instead...\n')
-                try:
-                    prob.solve(solver=cp.CVXOPT, verbose=True)
-                except Exception as error1:
-                    print('\nAn error occurred when using solver "CVXOPT".')
-                    print('the original error message is\n', error1)
-                    print('solver status: ', prob.status)
-                    print('trying solver "MOSEK" instead. Note that this solver is commercial and a license is required.\n')
-                    prob.solve(solver=cp.MOSEK, verbose=True)
-                else:
-                    print('solver status: ', prob.status)
-            else:
-                print('solver status: ', prob.status)
-            print()
-            edit_costs_new = x.value
-            residual = np.sqrt(prob.value)
-        elif rw_constraints == '2constraints':
-            # c_vs <= c_vi + c_vr and c_vi == c_vr, c_ei == c_er.
-            nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
-            x = cp.Variable(nb_cost_mat_new.shape[1])
-            cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
-            constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
-                           np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0,
-                           np.array([1.0, -1.0, 0.0, 0.0, 0.0]).T@x == 0.0,
-                           np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
-            prob = cp.Problem(cp.Minimize(cost_fun), constraints)
-            prob.solve()
-            edit_costs_new = x.value
-            residual = np.sqrt(prob.value)
-        elif rw_constraints == 'no-constraint':
-            # no constraint.
-            nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
-            x = cp.Variable(nb_cost_mat_new.shape[1])
-            cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
-            constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
-            prob = cp.Problem(cp.Minimize(cost_fun), constraints)
-            prob.solve()
-            edit_costs_new = x.value
-            residual = np.sqrt(prob.value)
-#        elif method == 'inequality_modified':
-#            # c_vs <= c_vi + c_vr.
-#            nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
-#            x = cp.Variable(nb_cost_mat_new.shape[1])
-#            cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
-#            constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
-#                           np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
-#            prob = cp.Problem(cp.Minimize(cost_fun), constraints)
-#            prob.solve()
-#            # use same costs for insertion and removal rather than the fitted costs.
-# edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]] -# edit_costs_new = np.array(edit_costs_new) -# residual = np.sqrt(prob.value) - elif cost == 'NON_SYMBOLIC': - is_n_attr = np.count_nonzero(nb_cost_mat[:,2]) - is_e_attr = np.count_nonzero(nb_cost_mat[:,5]) - - if dataset == 'SYNTHETICnew': -# nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]] - nb_cost_mat_new = nb_cost_mat[:,[2,3,4]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) -# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], -# np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0] -# constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])]] - constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])], - np.array([0.0, 1.0, -1.0]).T@x == 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - prob.solve() -# print(x.value) - edit_costs_new = np.concatenate((np.array([0.0, 0.0]), x.value, - np.array([0.0]))) - residual = np.sqrt(prob.value) - - elif rw_constraints == 'inequality': - # c_vs <= c_vi + c_vr. - if is_n_attr and is_e_attr: - nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) - constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], - np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, - np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - prob.solve() - edit_costs_new = x.value - residual = np.sqrt(prob.value) - elif is_n_attr and not is_e_attr: - nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) - constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])], - np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - prob.solve() - print(x.value) - edit_costs_new = np.concatenate((x.value, np.array([0.0]))) - residual = np.sqrt(prob.value) - elif not is_n_attr and is_e_attr: - nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) - constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], - np.array([0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - prob.solve() - edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:])) - residual = np.sqrt(prob.value) - else: - nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) - constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - prob.solve() - edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), - x.value[2:], np.array([0.0]))) - residual = np.sqrt(prob.value) - else: -# # method 1: simple least square method. -# edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec, -# rcond=None) - -# # method 2: least square method with x_i >= 0. -# edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec) - - # method 3: solve as a quadratic program with constraints. 
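-        # Equivalence used below: ||N w - d||^2 = w^T (N^T N) w - 2 d^T N w + d^T d,
-        # so the least-squares fit can be posed as a QP with P = N^T N and
-        # q^T = -2 d^T N under the nonnegativity constraints G w <= h.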
-# P = np.dot(nb_cost_mat.T, nb_cost_mat) -# q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat) -# G = -1 * np.identity(nb_cost_mat.shape[1]) -# h = np.array([0 for i in range(nb_cost_mat.shape[1])]) -# A = np.array([1 for i in range(nb_cost_mat.shape[1])]) -# b = 1 -# x = cp.Variable(nb_cost_mat.shape[1]) -# prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q_T@x), -# [G@x <= h]) -# prob.solve() -# edit_costs_new = x.value -# residual = prob.value - np.dot(dis_k_vec.T, dis_k_vec) - -# G = -1 * np.identity(nb_cost_mat.shape[1]) -# h = np.array([0 for i in range(nb_cost_mat.shape[1])]) - x = cp.Variable(nb_cost_mat.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat * x - dis_k_vec) - constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])], - # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] - np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, - np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - prob.solve() - edit_costs_new = x.value - residual = np.sqrt(prob.value) - - # method 4: - - return edit_costs_new, residual - - -if __name__ == '__main__': - print('check test_fitDistance.py') \ No newline at end of file diff --git a/gklearn/preimage/ged.py b/gklearn/preimage/ged.py deleted file mode 100644 index a66baaf..0000000 --- a/gklearn/preimage/ged.py +++ /dev/null @@ -1,467 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Thu Oct 17 18:44:59 2019 - -@author: ljia -""" -import numpy as np -import networkx as nx -from tqdm import tqdm -import sys -import multiprocessing -from multiprocessing import Pool -from functools import partial - -#from gedlibpy_linlin import librariesImport, gedlibpy -from gklearn.gedlib import librariesImport, gedlibpy - -def GED(g1, g2, dataset='monoterpenoides', lib='gedlibpy', cost='CHEM_1', method='IPFP', - edit_cost_constant=[], algo_options='', stabilizer='min', repeat=50): - """ - Compute GED for 2 graphs. - """ - -# dataset = dataset.lower() - - if lib == 'gedlibpy': - gedlibpy.restart_env() - gedlibpy.add_nx_graph(convertGraph(g1, cost), "") - gedlibpy.add_nx_graph(convertGraph(g2, cost), "") - - listID = gedlibpy.get_all_graph_ids() - gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant) - gedlibpy.init() - gedlibpy.set_method(method, algo_options) - gedlibpy.init_method() - - g = listID[0] - h = listID[1] - if stabilizer is None: - gedlibpy.run_method(g, h) - pi_forward = gedlibpy.get_forward_map(g, h) - pi_backward = gedlibpy.get_backward_map(g, h) - upper = gedlibpy.get_upper_bound(g, h) - lower = gedlibpy.get_lower_bound(g, h) - elif stabilizer == 'mean': - # @todo: to be finished... 
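-            # The 'stabilizer' options rerun the (randomized) GED method `repeat`
-            # times and aggregate the resulting upper bounds: 'mean' and 'median'
-            # average them, while 'min' and 'max' keep an extreme run together
-            # with its node maps ('gaussian' is left unfinished).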
- upper_list = [np.inf] * repeat - for itr in range(repeat): - gedlibpy.run_method(g, h) - upper_list[itr] = gedlibpy.get_upper_bound(g, h) - pi_forward = gedlibpy.get_forward_map(g, h) - pi_backward = gedlibpy.get_backward_map(g, h) - lower = gedlibpy.get_lower_bound(g, h) - upper = np.mean(upper_list) - elif stabilizer == 'median': - if repeat % 2 == 0: - repeat += 1 - upper_list = [np.inf] * repeat - pi_forward_list = [0] * repeat - pi_backward_list = [0] * repeat - for itr in range(repeat): - gedlibpy.run_method(g, h) - upper_list[itr] = gedlibpy.get_upper_bound(g, h) - pi_forward_list[itr] = gedlibpy.get_forward_map(g, h) - pi_backward_list[itr] = gedlibpy.get_backward_map(g, h) - lower = gedlibpy.get_lower_bound(g, h) - upper = np.median(upper_list) - idx_median = upper_list.index(upper) - pi_forward = pi_forward_list[idx_median] - pi_backward = pi_backward_list[idx_median] - elif stabilizer == 'min': - upper = np.inf - for itr in range(repeat): - gedlibpy.run_method(g, h) - upper_tmp = gedlibpy.get_upper_bound(g, h) - if upper_tmp < upper: - upper = upper_tmp - pi_forward = gedlibpy.get_forward_map(g, h) - pi_backward = gedlibpy.get_backward_map(g, h) - lower = gedlibpy.get_lower_bound(g, h) - if upper == 0: - break - elif stabilizer == 'max': - upper = 0 - for itr in range(repeat): - gedlibpy.run_method(g, h) - upper_tmp = gedlibpy.get_upper_bound(g, h) - if upper_tmp > upper: - upper = upper_tmp - pi_forward = gedlibpy.get_forward_map(g, h) - pi_backward = gedlibpy.get_backward_map(g, h) - lower = gedlibpy.get_lower_bound(g, h) - elif stabilizer == 'gaussian': - pass - - dis = upper - - elif lib == 'gedlib-bash': - import time - import random - import os - from gklearn.utils.graphfiles import saveDataset - - tmp_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/' - if not os.path.exists(tmp_dir): - os.makedirs(tmp_dir) - fn_collection = tmp_dir + 'collection.' 
+ str(time.time()) + str(random.randint(0, 1e9)) - xparams = {'method': 'gedlib', 'graph_dir': fn_collection} - saveDataset([g1, g2], ['dummy', 'dummy'], gformat='gxl', group='xml', - filename=fn_collection, xparams=xparams) - - command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n' - command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n' - command += 'export LD_LIBRARY_PATH\n' - command += 'cd \'' + os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/bin\'\n' - command += './ged_for_python_bash monoterpenoides ' + fn_collection \ - + ' \'' + algo_options + '\' ' - for ec in edit_cost_constant: - command += str(ec) + ' ' -# output = os.system(command) - stream = os.popen(command) - output = stream.readlines() -# print(output) - - dis = float(output[0].strip()) - runtime = float(output[1].strip()) - size_forward = int(output[2].strip()) - pi_forward = [int(item.strip()) for item in output[3:3+size_forward]] - pi_backward = [int(item.strip()) for item in output[3+size_forward:]] - -# print(dis) -# print(runtime) -# print(size_forward) -# print(pi_forward) -# print(pi_backward) - - - # make the map label correct (label remove map as np.inf) - nodes1 = [n for n in g1.nodes()] - nodes2 = [n for n in g2.nodes()] - nb1 = nx.number_of_nodes(g1) - nb2 = nx.number_of_nodes(g2) - pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward] - pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward] -# print(pi_forward) - - - return dis, pi_forward, pi_backward - - -def convertGraph(G, cost): - """Convert a graph to the proper NetworkX format that can be - recognized by library gedlibpy. - """ - G_new = nx.Graph() - if cost == 'LETTER' or cost == 'LETTER2': - for nd, attrs in G.nodes(data=True): - G_new.add_node(str(nd), x=str(attrs['attributes'][0]), - y=str(attrs['attributes'][1])) - for nd1, nd2, attrs in G.edges(data=True): - G_new.add_edge(str(nd1), str(nd2)) - elif cost == 'NON_SYMBOLIC': - for nd, attrs in G.nodes(data=True): - G_new.add_node(str(nd)) - for a_name in G.graph['node_attrs']: - G_new.nodes[str(nd)][a_name] = str(attrs[a_name]) - for nd1, nd2, attrs in G.edges(data=True): - G_new.add_edge(str(nd1), str(nd2)) - for a_name in G.graph['edge_attrs']: - G_new.edges[str(nd1), str(nd2)][a_name] = str(attrs[a_name]) - else: - for nd, attrs in G.nodes(data=True): - G_new.add_node(str(nd), chem=attrs['atom']) - for nd1, nd2, attrs in G.edges(data=True): - G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) -# G_new.add_edge(str(nd1), str(nd2)) - - return G_new - - -def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP', - edit_cost_constant=[], stabilizer='min', repeat=50): - """ - Compute GEDs for a group of graphs. - """ - if lib == 'gedlibpy': - def convertGraph(G): - """Convert a graph to the proper NetworkX format that can be - recognized by library gedlibpy. 
- """ - G_new = nx.Graph() - for nd, attrs in G.nodes(data=True): - G_new.add_node(str(nd), chem=attrs['atom']) - for nd1, nd2, attrs in G.edges(data=True): -# G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) - G_new.add_edge(str(nd1), str(nd2)) - - return G_new - - gedlibpy.restart_env() - gedlibpy.add_nx_graph(convertGraph(g1), "") - gedlibpy.add_nx_graph(convertGraph(g2), "") - - listID = gedlibpy.get_all_graph_ids() - gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant) - gedlibpy.init() - gedlibpy.set_method(method, "") - gedlibpy.init_method() - - g = listID[0] - h = listID[1] - if stabilizer is None: - gedlibpy.run_method(g, h) - pi_forward = gedlibpy.get_forward_map(g, h) - pi_backward = gedlibpy.get_backward_map(g, h) - upper = gedlibpy.get_upper_bound(g, h) - lower = gedlibpy.get_lower_bound(g, h) - elif stabilizer == 'min': - upper = np.inf - for itr in range(repeat): - gedlibpy.run_method(g, h) - upper_tmp = gedlibpy.get_upper_bound(g, h) - if upper_tmp < upper: - upper = upper_tmp - pi_forward = gedlibpy.get_forward_map(g, h) - pi_backward = gedlibpy.get_backward_map(g, h) - lower = gedlibpy.get_lower_bound(g, h) - if upper == 0: - break - - dis = upper - - # make the map label correct (label remove map as np.inf) - nodes1 = [n for n in g1.nodes()] - nodes2 = [n for n in g2.nodes()] - nb1 = nx.number_of_nodes(g1) - nb2 = nx.number_of_nodes(g2) - pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward] - pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward] - - return dis, pi_forward, pi_backward - - -def ged_median(Gn, Gn_median, verbose=False, params_ged={'lib': 'gedlibpy', - 'cost': 'CHEM_1', 'method': 'IPFP', 'edit_cost_constant': [], - 'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1', - 'stabilizer': None}, parallel=False): - if parallel: - len_itr = int(len(Gn)) - pi_forward_list = [[] for i in range(len_itr)] - dis_list = [0 for i in range(len_itr)] - - itr = range(0, len_itr) - n_jobs = multiprocessing.cpu_count() - if len_itr < 100 * n_jobs: - chunksize = int(len_itr / n_jobs) + 1 - else: - chunksize = 100 - def init_worker(gn_toshare, gn_median_toshare): - global G_gn, G_gn_median - G_gn = gn_toshare - G_gn_median = gn_median_toshare - do_partial = partial(_compute_ged_median, params_ged) - pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn, Gn_median)) - if verbose: - iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize), - desc='computing GEDs', file=sys.stdout) - else: - iterator = pool.imap_unordered(do_partial, itr, chunksize) - for i, dis_sum, pi_forward in iterator: - pi_forward_list[i] = pi_forward - dis_list[i] = dis_sum -# print('\n-------------------------------------------') -# print(i, j, idx_itr, dis) - pool.close() - pool.join() - - else: - dis_list = [] - pi_forward_list = [] - for idx, G in tqdm(enumerate(Gn), desc='computing median distances', - file=sys.stdout) if verbose else enumerate(Gn): - dis_sum = 0 - pi_forward_list.append([]) - for G_p in Gn_median: - dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p, - **params_ged) - pi_forward_list[idx].append(pi_tmp_forward) - dis_sum += dis_tmp - dis_list.append(dis_sum) - - return dis_list, pi_forward_list - - -def _compute_ged_median(params_ged, itr): -# print(itr) - dis_sum = 0 - pi_forward = [] - for G_p in G_gn_median: - dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G_gn[itr], G_p, - **params_ged) - pi_forward.append(pi_tmp_forward) - dis_sum += dis_tmp - - return 
itr, dis_sum, pi_forward - - -def get_nb_edit_operations(g1, g2, forward_map, backward_map): - """Compute the number of each edit operations. - """ - n_vi = 0 - n_vr = 0 - n_vs = 0 - n_ei = 0 - n_er = 0 - n_es = 0 - - nodes1 = [n for n in g1.nodes()] - for i, map_i in enumerate(forward_map): - if map_i == np.inf: - n_vr += 1 - elif g1.node[nodes1[i]]['atom'] != g2.node[map_i]['atom']: - n_vs += 1 - for map_i in backward_map: - if map_i == np.inf: - n_vi += 1 - -# idx_nodes1 = range(0, len(node1)) - - edges1 = [e for e in g1.edges()] - nb_edges2_cnted = 0 - for n1, n2 in edges1: - idx1 = nodes1.index(n1) - idx2 = nodes1.index(n2) - # one of the nodes is removed, thus the edge is removed. - if forward_map[idx1] == np.inf or forward_map[idx2] == np.inf: - n_er += 1 - # corresponding edge is in g2. - elif (forward_map[idx1], forward_map[idx2]) in g2.edges(): - nb_edges2_cnted += 1 - # edge labels are different. - if g2.edges[((forward_map[idx1], forward_map[idx2]))]['bond_type'] \ - != g1.edges[(n1, n2)]['bond_type']: - n_es += 1 - elif (forward_map[idx2], forward_map[idx1]) in g2.edges(): - nb_edges2_cnted += 1 - # edge labels are different. - if g2.edges[((forward_map[idx2], forward_map[idx1]))]['bond_type'] \ - != g1.edges[(n1, n2)]['bond_type']: - n_es += 1 - # corresponding nodes are in g2, however the edge is removed. - else: - n_er += 1 - n_ei = nx.number_of_edges(g2) - nb_edges2_cnted - - return n_vi, n_vr, n_vs, n_ei, n_er, n_es - - -def get_nb_edit_operations_letter(g1, g2, forward_map, backward_map): - """Compute the number of each edit operations. - """ - n_vi = 0 - n_vr = 0 - n_vs = 0 - sod_vs = 0 - n_ei = 0 - n_er = 0 - - nodes1 = [n for n in g1.nodes()] - for i, map_i in enumerate(forward_map): - if map_i == np.inf: - n_vr += 1 - else: - n_vs += 1 - diff_x = float(g1.nodes[nodes1[i]]['x']) - float(g2.nodes[map_i]['x']) - diff_y = float(g1.nodes[nodes1[i]]['y']) - float(g2.nodes[map_i]['y']) - sod_vs += np.sqrt(np.square(diff_x) + np.square(diff_y)) - for map_i in backward_map: - if map_i == np.inf: - n_vi += 1 - -# idx_nodes1 = range(0, len(node1)) - - edges1 = [e for e in g1.edges()] - nb_edges2_cnted = 0 - for n1, n2 in edges1: - idx1 = nodes1.index(n1) - idx2 = nodes1.index(n2) - # one of the nodes is removed, thus the edge is removed. - if forward_map[idx1] == np.inf or forward_map[idx2] == np.inf: - n_er += 1 - # corresponding edge is in g2. Edge label is not considered. - elif (forward_map[idx1], forward_map[idx2]) in g2.edges() or \ - (forward_map[idx2], forward_map[idx1]) in g2.edges(): - nb_edges2_cnted += 1 - # corresponding nodes are in g2, however the edge is removed. - else: - n_er += 1 - n_ei = nx.number_of_edges(g2) - nb_edges2_cnted - - return n_vi, n_vr, n_vs, sod_vs, n_ei, n_er - - -def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map): - """Compute the number of each edit operations. 
- """ - n_vi = 0 - n_vr = 0 - n_vs = 0 - sod_vs = 0 - n_ei = 0 - n_er = 0 - n_es = 0 - sod_es = 0 - - nodes1 = [n for n in g1.nodes()] - for i, map_i in enumerate(forward_map): - if map_i == np.inf: - n_vr += 1 - else: - n_vs += 1 - sum_squares = 0 - for a_name in g1.graph['node_attrs']: - diff = float(g1.nodes[nodes1[i]][a_name]) - float(g2.nodes[map_i][a_name]) - sum_squares += np.square(diff) - sod_vs += np.sqrt(sum_squares) - for map_i in backward_map: - if map_i == np.inf: - n_vi += 1 - -# idx_nodes1 = range(0, len(node1)) - - edges1 = [e for e in g1.edges()] - for n1, n2 in edges1: - idx1 = nodes1.index(n1) - idx2 = nodes1.index(n2) - n1_g2 = forward_map[idx1] - n2_g2 = forward_map[idx2] - # one of the nodes is removed, thus the edge is removed. - if n1_g2 == np.inf or n2_g2 == np.inf: - n_er += 1 - # corresponding edge is in g2. - elif (n1_g2, n2_g2) in g2.edges(): - n_es += 1 - sum_squares = 0 - for a_name in g1.graph['edge_attrs']: - diff = float(g1.edges[n1, n2][a_name]) - float(g2.nodes[n1_g2, n2_g2][a_name]) - sum_squares += np.square(diff) - sod_es += np.sqrt(sum_squares) - elif (n2_g2, n1_g2) in g2.edges(): - n_es += 1 - sum_squares = 0 - for a_name in g1.graph['edge_attrs']: - diff = float(g1.edges[n2, n1][a_name]) - float(g2.nodes[n2_g2, n1_g2][a_name]) - sum_squares += np.square(diff) - sod_es += np.sqrt(sum_squares) - # corresponding nodes are in g2, however the edge is removed. - else: - n_er += 1 - n_ei = nx.number_of_edges(g2) - n_es - - return n_vi, n_vr, sod_vs, n_ei, n_er, sod_es - - -if __name__ == '__main__': - print('check test_ged.py') \ No newline at end of file diff --git a/gklearn/preimage/iam.py b/gklearn/preimage/iam.py deleted file mode 100644 index f3e2165..0000000 --- a/gklearn/preimage/iam.py +++ /dev/null @@ -1,775 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Fri Apr 26 11:49:12 2019 - -Iterative alternate minimizations using GED. -@author: ljia -""" -import numpy as np -import random -import networkx as nx -from tqdm import tqdm - -from gklearn.utils.graphdataset import get_dataset_attributes -from gklearn.utils.utils import graph_isIdentical, get_node_labels, get_edge_labels -from gklearn.preimage.ged import GED, ged_median - - -def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, - epsilon=0.001, node_label='atom', edge_label='bond_type', - connected=False, removeNodes=True, allBestInit=False, allBestNodes=False, - allBestEdges=False, allBestOutput=False, - params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP', - 'edit_cost_constant': [], 'stabilizer': None, - 'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'}): - """See my name, then you know what I do. - """ -# Gn_median = Gn_median[0:10] -# Gn_median = [nx.convert_node_labels_to_integers(g) for g in Gn_median] - node_ir = np.inf # corresponding to the node remove and insertion. - label_r = 'thanksdanny' # the label for node remove. # @todo: make this label unrepeatable. - ds_attrs = get_dataset_attributes(Gn_median + Gn_candidate, - attr_names=['edge_labeled', 'node_attr_dim', 'edge_attr_dim'], - edge_label=edge_label) - node_label_set = get_node_labels(Gn_median, node_label) - edge_label_set = get_edge_labels(Gn_median, edge_label) - - - def generate_graph(G, pi_p_forward): - G_new_list = [G.copy()] # all "best" graphs generated in this iteration. -# nx.draw_networkx(G) -# import matplotlib.pyplot as plt -# plt.show() -# print(pi_p_forward) - - # update vertex labels. 
- # pre-compute h_i0 for each label. -# for label in get_node_labels(Gn, node_label): -# print(label) -# for nd in G.nodes(data=True): -# pass - if not ds_attrs['node_attr_dim']: # labels are symbolic - for ndi, (nd, _) in enumerate(G.nodes(data=True)): - h_i0_list = [] - label_list = [] - for label in node_label_set: - h_i0 = 0 - for idx, g in enumerate(Gn_median): - pi_i = pi_p_forward[idx][ndi] - if pi_i != node_ir and g.nodes[pi_i][node_label] == label: - h_i0 += 1 - h_i0_list.append(h_i0) - label_list.append(label) - # case when the node is to be removed. - if removeNodes: - h_i0_remove = 0 # @todo: maybe this can be added to the node_label_set above. - for idx, g in enumerate(Gn_median): - pi_i = pi_p_forward[idx][ndi] - if pi_i == node_ir: - h_i0_remove += 1 - h_i0_list.append(h_i0_remove) - label_list.append(label_r) - # get the best labels. - idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist() - if allBestNodes: # choose all best graphs. - nlabel_best = [label_list[idx] for idx in idx_max] - # generate "best" graphs with regard to "best" node labels. - G_new_list_nd = [] - for g in G_new_list: # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now. - for nl in nlabel_best: - g_tmp = g.copy() - if nl == label_r: - g_tmp.remove_node(nd) - else: - g_tmp.nodes[nd][node_label] = nl - G_new_list_nd.append(g_tmp) - # nx.draw_networkx(g_tmp) - # import matplotlib.pyplot as plt - # plt.show() - # print(g_tmp.nodes(data=True)) - # print(g_tmp.edges(data=True)) - G_new_list = [ggg.copy() for ggg in G_new_list_nd] - else: - # choose one of the best randomly. - idx_rdm = random.randint(0, len(idx_max) - 1) - best_label = label_list[idx_max[idx_rdm]] - h_i0_max = h_i0_list[idx_max[idx_rdm]] - - g_new = G_new_list[0] - if best_label == label_r: - g_new.remove_node(nd) - else: - g_new.nodes[nd][node_label] = best_label - G_new_list = [g_new] - else: # labels are non-symbolic - for ndi, (nd, _) in enumerate(G.nodes(data=True)): - Si_norm = 0 - phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])]) - for idx, g in enumerate(Gn_median): - pi_i = pi_p_forward[idx][ndi] - if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0? - Si_norm += 1 - phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']]) - phi_i_bar /= Si_norm - G_new_list[0].nodes[nd]['attributes'] = phi_i_bar - -# for g in G_new_list: -# import matplotlib.pyplot as plt -# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True) -# plt.show() -# print(g.nodes(data=True)) -# print(g.edges(data=True)) - - # update edge labels and adjacency matrix. - if ds_attrs['edge_labeled']: - G_new_list_edge = [] - for g_new in G_new_list: - nd_list = [n for n in g_new.nodes()] - g_tmp_list = [g_new.copy()] - for nd1i in range(nx.number_of_nodes(g_new)): - nd1 = nd_list[nd1i]# @todo: not just edges, but all pairs of nodes - for nd2i in range(nd1i + 1, nx.number_of_nodes(g_new)): - nd2 = nd_list[nd2i] -# for nd1, nd2, _ in g_new.edges(data=True): - h_ij0_list = [] - label_list = [] - for label in edge_label_set: - h_ij0 = 0 - for idx, g in enumerate(Gn_median): - pi_i = pi_p_forward[idx][nd1i] - pi_j = pi_p_forward[idx][nd2i] - h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and - g.has_edge(pi_i, pi_j) and - g.edges[pi_i, pi_j][edge_label] == label) - h_ij0 += h_ij0_p - h_ij0_list.append(h_ij0) - label_list.append(label) - - # get the best labels. 
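-                        # An edge with the winning label is kept only if its
-                        # support h_ij0 exceeds the cost-derived threshold
-                        # len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es),
-                        # as checked below; otherwise the edge is removed.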
- idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist() - if allBestEdges: # choose all best graphs. - elabel_best = [label_list[idx] for idx in idx_max] - h_ij0_max = [h_ij0_list[idx] for idx in idx_max] - # generate "best" graphs with regard to "best" node labels. - G_new_list_ed = [] - for g_tmp in g_tmp_list: # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now. - for idxl, el in enumerate(elabel_best): - g_tmp_copy = g_tmp.copy() - # check whether a_ij is 0 or 1. - sij_norm = 0 - for idx, g in enumerate(Gn_median): - pi_i = pi_p_forward[idx][nd1i] - pi_j = pi_p_forward[idx][nd2i] - if g.has_node(pi_i) and g.has_node(pi_j) and \ - g.has_edge(pi_i, pi_j): - sij_norm += 1 - if h_ij0_max[idxl] > len(Gn_median) * c_er / c_es + \ - sij_norm * (1 - (c_er + c_ei) / c_es): - if not g_tmp_copy.has_edge(nd1, nd2): - g_tmp_copy.add_edge(nd1, nd2) - g_tmp_copy.edges[nd1, nd2][edge_label] = elabel_best[idxl] - else: - if g_tmp_copy.has_edge(nd1, nd2): - g_tmp_copy.remove_edge(nd1, nd2) - G_new_list_ed.append(g_tmp_copy) - g_tmp_list = [ggg.copy() for ggg in G_new_list_ed] - else: # choose one of the best randomly. - idx_rdm = random.randint(0, len(idx_max) - 1) - best_label = label_list[idx_max[idx_rdm]] - h_ij0_max = h_ij0_list[idx_max[idx_rdm]] - - # check whether a_ij is 0 or 1. - sij_norm = 0 - for idx, g in enumerate(Gn_median): - pi_i = pi_p_forward[idx][nd1i] - pi_j = pi_p_forward[idx][nd2i] - if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): - sij_norm += 1 - if h_ij0_max > len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es): - if not g_new.has_edge(nd1, nd2): - g_new.add_edge(nd1, nd2) - g_new.edges[nd1, nd2][edge_label] = best_label - else: -# elif h_ij0_max < len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es): - if g_new.has_edge(nd1, nd2): - g_new.remove_edge(nd1, nd2) - g_tmp_list = [g_new] - G_new_list_edge += g_tmp_list - G_new_list = [ggg.copy() for ggg in G_new_list_edge] - - - else: # if edges are unlabeled - # @todo: is this even right? G or g_tmp? check if the new one is right - # @todo: works only for undirected graphs. - - for g_tmp in G_new_list: - nd_list = [n for n in g_tmp.nodes()] - for nd1i in range(nx.number_of_nodes(g_tmp)): - nd1 = nd_list[nd1i] - for nd2i in range(nd1i + 1, nx.number_of_nodes(g_tmp)): - nd2 = nd_list[nd2i] - sij_norm = 0 - for idx, g in enumerate(Gn_median): - pi_i = pi_p_forward[idx][nd1i] - pi_j = pi_p_forward[idx][nd2i] - if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): - sij_norm += 1 - if sij_norm > len(Gn_median) * c_er / (c_er + c_ei): - # @todo: should we consider if nd1 and nd2 in g_tmp? - # or just add the edge anyway? - if g_tmp.has_node(nd1) and g_tmp.has_node(nd2) \ - and not g_tmp.has_edge(nd1, nd2): - g_tmp.add_edge(nd1, nd2) - else: # @todo: which to use? -# elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei): - if g_tmp.has_edge(nd1, nd2): - g_tmp.remove_edge(nd1, nd2) - # do not change anything when equal. - -# for i, g in enumerate(G_new_list): -# import matplotlib.pyplot as plt -# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True) -## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG") -# plt.show() -# print(g.nodes(data=True)) -# print(g.edges(data=True)) - -# # find the best graph generated in this iteration and update pi_p. - # @todo: should we update all graphs generated or just the best ones? 
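-        # ged_median() returns, for each generated graph, the sum of GEDs (SOD)
-        # to all graphs in Gn_median; only the graphs attaining the minimum are
-        # kept by best_median_graphs().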
- dis_list, pi_forward_list = ged_median(G_new_list, Gn_median, - params_ged=params_ged) - # @todo: should we remove the identical and connectivity check? - # Don't know which is faster. - if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0: - G_new_list, idx_list = remove_duplicates(G_new_list) - pi_forward_list = [pi_forward_list[idx] for idx in idx_list] - dis_list = [dis_list[idx] for idx in idx_list] -# if connected == True: -# G_new_list, idx_list = remove_disconnected(G_new_list) -# pi_forward_list = [pi_forward_list[idx] for idx in idx_list] -# idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist() -# dis_min = dis_list[idx_min_tmp_list[0]] -# pi_forward_list = [pi_forward_list[idx] for idx in idx_min_list] -# G_new_list = [G_new_list[idx] for idx in idx_min_list] - -# for g in G_new_list: -# import matplotlib.pyplot as plt -# nx.draw_networkx(g) -# plt.show() -# print(g.nodes(data=True)) -# print(g.edges(data=True)) - - return G_new_list, pi_forward_list, dis_list - - - def best_median_graphs(Gn_candidate, pi_all_forward, dis_all): - idx_min_list = np.argwhere(dis_all == np.min(dis_all)).flatten().tolist() - dis_min = dis_all[idx_min_list[0]] - pi_forward_min_list = [pi_all_forward[idx] for idx in idx_min_list] - G_min_list = [Gn_candidate[idx] for idx in idx_min_list] - return G_min_list, pi_forward_min_list, dis_min - - - def iteration_proc(G, pi_p_forward, cur_sod): - G_list = [G] - pi_forward_list = [pi_p_forward] - old_sod = cur_sod * 2 - sod_list = [cur_sod] - dis_list = [cur_sod] - # iterations. - itr = 0 - # @todo: what if difference == 0? -# while itr < ite_max and (np.abs(old_sod - cur_sod) > epsilon or -# np.abs(old_sod - cur_sod) == 0): - while itr < ite_max and np.abs(old_sod - cur_sod) > epsilon: -# while itr < ite_max: -# for itr in range(0, 5): # the convergence condition? - print('itr_iam is', itr) - G_new_list = [] - pi_forward_new_list = [] - dis_new_list = [] - for idx, g in enumerate(G_list): -# label_set = get_node_labels(Gn_median + [g], node_label) - G_tmp_list, pi_forward_tmp_list, dis_tmp_list = generate_graph( - g, pi_forward_list[idx]) - G_new_list += G_tmp_list - pi_forward_new_list += pi_forward_tmp_list - dis_new_list += dis_tmp_list - # @todo: need to remove duplicates here? - G_list = [ggg.copy() for ggg in G_new_list] - pi_forward_list = [pitem.copy() for pitem in pi_forward_new_list] - dis_list = dis_new_list[:] - - old_sod = cur_sod - cur_sod = np.min(dis_list) - sod_list.append(cur_sod) - - itr += 1 - - # @todo: do we return all graphs or the best ones? - # get the best ones of the generated graphs. - G_list, pi_forward_list, dis_min = best_median_graphs( - G_list, pi_forward_list, dis_list) - - if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0: - G_list, idx_list = remove_duplicates(G_list) - pi_forward_list = [pi_forward_list[idx] for idx in idx_list] -# dis_list = [dis_list[idx] for idx in idx_list] - -# import matplotlib.pyplot as plt -# for g in G_list: -# nx.draw_networkx(g) -# plt.show() -# print(g.nodes(data=True)) -# print(g.edges(data=True)) - - print('\nsods:', sod_list, '\n') - - return G_list, pi_forward_list, dis_min, sod_list - - - def remove_duplicates(Gn): - """Remove duplicate graphs from list. 
- """ - Gn_new = [] - idx_list = [] - for idx, g in enumerate(Gn): - dupl = False - for g_new in Gn_new: - if graph_isIdentical(g_new, g): - dupl = True - break - if not dupl: - Gn_new.append(g) - idx_list.append(idx) - return Gn_new, idx_list - - - def remove_disconnected(Gn): - """Remove disconnected graphs from list. - """ - Gn_new = [] - idx_list = [] - for idx, g in enumerate(Gn): - if nx.is_connected(g): - Gn_new.append(g) - idx_list.append(idx) - return Gn_new, idx_list - - - ########################################################################### - - # phase 1: initilize. - # compute set-median. - dis_min = np.inf - dis_list, pi_forward_all = ged_median(Gn_candidate, Gn_median, - params_ged=params_ged, parallel=True) - print('finish computing GEDs.') - # find all smallest distances. - if allBestInit: # try all best init graphs. - idx_min_list = range(len(dis_list)) - dis_min = dis_list - else: - idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist() - dis_min = [dis_list[idx_min_list[0]]] * len(idx_min_list) - idx_min_rdm = random.randint(0, len(idx_min_list) - 1) - idx_min_list = [idx_min_list[idx_min_rdm]] - sod_set_median = np.min(dis_min) - - - # phase 2: iteration. - G_list = [] - dis_list = [] - pi_forward_list = [] - G_set_median_list = [] -# sod_list = [] - for idx_tmp, idx_min in enumerate(idx_min_list): -# print('idx_min is', idx_min) - G = Gn_candidate[idx_min].copy() - G_set_median_list.append(G.copy()) - # list of edit operations. - pi_p_forward = pi_forward_all[idx_min] -# pi_p_backward = pi_all_backward[idx_min] - Gi_list, pi_i_forward_list, dis_i_min, sod_list = iteration_proc(G, - pi_p_forward, dis_min[idx_tmp]) - G_list += Gi_list - dis_list += [dis_i_min] * len(Gi_list) - pi_forward_list += pi_i_forward_list - - - if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0: - G_list, idx_list = remove_duplicates(G_list) - dis_list = [dis_list[idx] for idx in idx_list] - pi_forward_list = [pi_forward_list[idx] for idx in idx_list] - if connected == True: - G_list_con, idx_list = remove_disconnected(G_list) - # if there is no connected graphs at all, then remain the disconnected ones. - if len(G_list_con) > 0: # @todo: ?????????????????????????? - G_list = G_list_con - dis_list = [dis_list[idx] for idx in idx_list] - pi_forward_list = [pi_forward_list[idx] for idx in idx_list] - -# import matplotlib.pyplot as plt -# for g in G_list: -# nx.draw_networkx(g) -# plt.show() -# print(g.nodes(data=True)) -# print(g.edges(data=True)) - - # get the best median graphs - G_gen_median_list, pi_forward_min_list, sod_gen_median = best_median_graphs( - G_list, pi_forward_list, dis_list) -# for g in G_gen_median_list: -# nx.draw_networkx(g) -# plt.show() -# print(g.nodes(data=True)) -# print(g.edges(data=True)) - - if not allBestOutput: - # randomly choose one graph. - idx_rdm = random.randint(0, len(G_gen_median_list) - 1) - G_gen_median_list = [G_gen_median_list[idx_rdm]] - - return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median - - -def iam_bash(Gn_names, edit_cost_constant, cost='CONSTANT', initial_solutions=1, - dataset='monoterpenoides', - graph_dir=''): - """Compute the iam by c++ implementation (gedlib) through bash. - """ - import os - import time - - def createCollectionFile(Gn_names, y, filename): - """Create collection file. 
- """ - dirname_ds = os.path.dirname(filename) - if dirname_ds != '': - dirname_ds += '/' - if not os.path.exists(dirname_ds) : - os.makedirs(dirname_ds) - - with open(filename + '.xml', 'w') as fgroup: - fgroup.write("") - fgroup.write("\n") - fgroup.write("\n") - for idx, fname in enumerate(Gn_names): - fgroup.write("\n\t") - fgroup.write("\n") - fgroup.close() - - tmp_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/' - fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, 1e9)) - createCollectionFile(Gn_names, ['dummy'] * len(Gn_names), fn_collection) -# fn_collection = tmp_dir + 'collection_for_debug' -# graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/gxl' - -# if dataset == 'Letter-high' or dataset == 'Fingerprint': -# dataset = 'letter' - command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/Linlin/gedlib\'\n' - command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n' - command += 'export LD_LIBRARY_PATH\n' - command += 'cd \'' + os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/bin\'\n' - command += './iam_for_python_bash ' + dataset + ' ' + fn_collection \ - + ' \'' + graph_dir + '\' ' + ' ' + cost + ' ' + str(initial_solutions) + ' ' - if edit_cost_constant is None: - command += 'None' - else: - for ec in edit_cost_constant: - command += str(ec) + ' ' -# output = os.system(command) - stream = os.popen(command) - - output = stream.readlines() -# print(output) - sod_sm = float(output[0].strip()) - sod_gm = float(output[1].strip()) - - fname_sm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/set_median.gxl' - fname_gm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/gen_median.gxl' - - return sod_sm, sod_gm, fname_sm, fname_gm - - - -############################################################################### -# Old implementations. - -def iam(Gn, c_ei=3, c_er=3, c_es=1, node_label='atom', edge_label='bond_type', - connected=True): - """See my name, then you know what I do. - """ -# Gn = Gn[0:10] - Gn = [nx.convert_node_labels_to_integers(g) for g in Gn] - - # phase 1: initilize. - # compute set-median. - dis_min = np.inf - pi_p = [] - pi_all = [] - for idx1, G_p in enumerate(Gn): - dist_sum = 0 - pi_all.append([]) - for idx2, G_p_prime in enumerate(Gn): - dist_tmp, pi_tmp, _ = GED(G_p, G_p_prime) - pi_all[idx1].append(pi_tmp) - dist_sum += dist_tmp - if dist_sum < dis_min: - dis_min = dist_sum - G = G_p.copy() - idx_min = idx1 - # list of edit operations. - pi_p = pi_all[idx_min] - - # phase 2: iteration. - ds_attrs = get_dataset_attributes(Gn, attr_names=['edge_labeled', 'node_attr_dim'], - edge_label=edge_label) - for itr in range(0, 10): # @todo: the convergence condition? - G_new = G.copy() - # update vertex labels. - # pre-compute h_i0 for each label. -# for label in get_node_labels(Gn, node_label): -# print(label) -# for nd in G.nodes(data=True): -# pass - if not ds_attrs['node_attr_dim']: # labels are symbolic - for nd, _ in G.nodes(data=True): - h_i0_list = [] - label_list = [] - for label in get_node_labels(Gn, node_label): - h_i0 = 0 - for idx, g in enumerate(Gn): - pi_i = pi_p[idx][nd] - if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label: - h_i0 += 1 - h_i0_list.append(h_i0) - label_list.append(label) - # choose one of the best randomly. 
- idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist() - idx_rdm = random.randint(0, len(idx_max) - 1) - G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]] - else: # labels are non-symbolic - for nd, _ in G.nodes(data=True): - Si_norm = 0 - phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])]) - for idx, g in enumerate(Gn): - pi_i = pi_p[idx][nd] - if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0? - Si_norm += 1 - phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']]) - phi_i_bar /= Si_norm - G_new.nodes[nd]['attributes'] = phi_i_bar - - # update edge labels and adjacency matrix. - if ds_attrs['edge_labeled']: - for nd1, nd2, _ in G.edges(data=True): - h_ij0_list = [] - label_list = [] - for label in get_edge_labels(Gn, edge_label): - h_ij0 = 0 - for idx, g in enumerate(Gn): - pi_i = pi_p[idx][nd1] - pi_j = pi_p[idx][nd2] - h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and - g.has_edge(pi_i, pi_j) and - g.edges[pi_i, pi_j][edge_label] == label) - h_ij0 += h_ij0_p - h_ij0_list.append(h_ij0) - label_list.append(label) - # choose one of the best randomly. - idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist() - h_ij0_max = h_ij0_list[idx_max[0]] - idx_rdm = random.randint(0, len(idx_max) - 1) - best_label = label_list[idx_max[idx_rdm]] - - # check whether a_ij is 0 or 1. - sij_norm = 0 - for idx, g in enumerate(Gn): - pi_i = pi_p[idx][nd1] - pi_j = pi_p[idx][nd2] - if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): - sij_norm += 1 - if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es): - if not G_new.has_edge(nd1, nd2): - G_new.add_edge(nd1, nd2) - G_new.edges[nd1, nd2][edge_label] = best_label - else: - if G_new.has_edge(nd1, nd2): - G_new.remove_edge(nd1, nd2) - else: # if edges are unlabeled - for nd1, nd2, _ in G.edges(data=True): - sij_norm = 0 - for idx, g in enumerate(Gn): - pi_i = pi_p[idx][nd1] - pi_j = pi_p[idx][nd2] - if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): - sij_norm += 1 - if sij_norm > len(Gn) * c_er / (c_er + c_ei): - if not G_new.has_edge(nd1, nd2): - G_new.add_edge(nd1, nd2) - else: - if G_new.has_edge(nd1, nd2): - G_new.remove_edge(nd1, nd2) - - G = G_new.copy() - - # update pi_p - pi_p = [] - for idx1, G_p in enumerate(Gn): - dist_tmp, pi_tmp, _ = GED(G, G_p) - pi_p.append(pi_tmp) - - return G - -# --------------------------- These are tests --------------------------------# - -def test_iam_with_more_graphs_as_init(Gn, G_candidate, c_ei=3, c_er=3, c_es=1, - node_label='atom', edge_label='bond_type'): - """See my name, then you know what I do. - """ -# Gn = Gn[0:10] - Gn = [nx.convert_node_labels_to_integers(g) for g in Gn] - - # phase 1: initilize. - # compute set-median. - dis_min = np.inf -# pi_p = [] - pi_all_forward = [] - pi_all_backward = [] - for idx1, G_p in tqdm(enumerate(G_candidate), desc='computing GEDs', file=sys.stdout): - dist_sum = 0 - pi_all_forward.append([]) - pi_all_backward.append([]) - for idx2, G_p_prime in enumerate(Gn): - dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G_p, G_p_prime) - pi_all_forward[idx1].append(pi_tmp_forward) - pi_all_backward[idx1].append(pi_tmp_backward) - dist_sum += dist_tmp - if dist_sum <= dis_min: - dis_min = dist_sum - G = G_p.copy() - idx_min = idx1 - # list of edit operations. - pi_p_forward = pi_all_forward[idx_min] - pi_p_backward = pi_all_backward[idx_min] - - # phase 2: iteration. 
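An aside before the update steps that follow: the vertex-label rule used both in iam above and in the test variant below is a majority vote over the node mappings, with random tie-breaking. A toy, self-contained sketch under that reading; majority_label and pi_p are hypothetical names introduced here for illustration:

import random
from collections import Counter
import networkx as nx

def majority_label(nd, Gn, pi_p, node_label='atom'):
    # Count the labels carried by the images of node nd across all graphs.
    votes = Counter()
    for idx, g in enumerate(Gn):
        pi_i = pi_p[idx][nd]  # image of nd in graph g under the GED mapping
        if g.has_node(pi_i):
            votes[g.nodes[pi_i][node_label]] += 1
    best = max(votes.values())
    return random.choice([lb for lb, c in votes.items() if c == best])

g1 = nx.Graph(); g1.add_node(0, atom='C')
g2 = nx.Graph(); g2.add_node(0, atom='C')
g3 = nx.Graph(); g3.add_node(0, atom='N')
print(majority_label(0, [g1, g2, g3], [{0: 0}, {0: 0}, {0: 0}]))  # 'C'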
- ds_attrs = get_dataset_attributes(Gn + [G], attr_names=['edge_labeled', 'node_attr_dim'], - edge_label=edge_label) - label_set = get_node_labels(Gn + [G], node_label) - for itr in range(0, 10): # @todo: the convergence condition? - G_new = G.copy() - # update vertex labels. - # pre-compute h_i0 for each label. -# for label in get_node_labels(Gn, node_label): -# print(label) -# for nd in G.nodes(data=True): -# pass - if not ds_attrs['node_attr_dim']: # labels are symbolic - for nd in G.nodes(): - h_i0_list = [] - label_list = [] - for label in label_set: - h_i0 = 0 - for idx, g in enumerate(Gn): - pi_i = pi_p_forward[idx][nd] - if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label: - h_i0 += 1 - h_i0_list.append(h_i0) - label_list.append(label) - # choose one of the best randomly. - idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist() - idx_rdm = random.randint(0, len(idx_max) - 1) - G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]] - else: # labels are non-symbolic - for nd in G.nodes(): - Si_norm = 0 - phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])]) - for idx, g in enumerate(Gn): - pi_i = pi_p_forward[idx][nd] - if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0? - Si_norm += 1 - phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']]) - phi_i_bar /= Si_norm - G_new.nodes[nd]['attributes'] = phi_i_bar - - # update edge labels and adjacency matrix. - if ds_attrs['edge_labeled']: - for nd1, nd2, _ in G.edges(data=True): - h_ij0_list = [] - label_list = [] - for label in get_edge_labels(Gn, edge_label): - h_ij0 = 0 - for idx, g in enumerate(Gn): - pi_i = pi_p_forward[idx][nd1] - pi_j = pi_p_forward[idx][nd2] - h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and - g.has_edge(pi_i, pi_j) and - g.edges[pi_i, pi_j][edge_label] == label) - h_ij0 += h_ij0_p - h_ij0_list.append(h_ij0) - label_list.append(label) - # choose one of the best randomly. - idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist() - h_ij0_max = h_ij0_list[idx_max[0]] - idx_rdm = random.randint(0, len(idx_max) - 1) - best_label = label_list[idx_max[idx_rdm]] - - # check whether a_ij is 0 or 1. - sij_norm = 0 - for idx, g in enumerate(Gn): - pi_i = pi_p_forward[idx][nd1] - pi_j = pi_p_forward[idx][nd2] - if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): - sij_norm += 1 - if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es): - if not G_new.has_edge(nd1, nd2): - G_new.add_edge(nd1, nd2) - G_new.edges[nd1, nd2][edge_label] = best_label - else: - if G_new.has_edge(nd1, nd2): - G_new.remove_edge(nd1, nd2) - else: # if edges are unlabeled - # @todo: works only for undirected graphs. - for nd1 in range(nx.number_of_nodes(G)): - for nd2 in range(nd1 + 1, nx.number_of_nodes(G)): - sij_norm = 0 - for idx, g in enumerate(Gn): - pi_i = pi_p_forward[idx][nd1] - pi_j = pi_p_forward[idx][nd2] - if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): - sij_norm += 1 - if sij_norm > len(Gn) * c_er / (c_er + c_ei): - if not G_new.has_edge(nd1, nd2): - G_new.add_edge(nd1, nd2) - elif sij_norm < len(Gn) * c_er / (c_er + c_ei): - if G_new.has_edge(nd1, nd2): - G_new.remove_edge(nd1, nd2) - # do not change anything when equal. 
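To make the two edge rules above concrete with the default costs c_ei = c_er = 3, c_es = 1 and N = len(Gn) = 10 graphs: the unlabeled rule keeps an edge exactly when more than N * c_er / (c_er + c_ei) = 5 graphs realize it, while the labeled rule additionally trades substitution against insertion plus deletion. A quick numeric check (the numbers are illustrative, not taken from the original experiments):

c_ei, c_er, c_es = 3, 3, 1
N = 10

# Unlabeled case: keep the edge iff more than half of the graphs realize it.
sij_norm = 6
print(sij_norm > N * c_er / (c_er + c_ei))  # True -> add/keep the edge

# Labeled case: h_ij0_max graphs realize the edge with the best label.
h_ij0_max, sij_norm = 4, 6
threshold = N * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es)
print(threshold)              # 30 + 6 * (1 - 6) = 0.0
print(h_ij0_max > threshold)  # True -> keep the edge, labelled best_label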
-
-        G = G_new.copy()
-
-    # update pi_p
-    pi_p_forward = []
-    for G_p in Gn:
-        dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p)
-        pi_p_forward.append(pi_tmp_forward)
-
-    return G
-
-
-###############################################################################
-
-if __name__ == '__main__':
-    from gklearn.utils.graphfiles import loadDataset
-    ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
-          'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb
-#    ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
-#          'extra_params': {}} # node nsymb
-#    ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
-#          'extra_params': {}}
-    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-
-    iam(Gn)
\ No newline at end of file
diff --git a/gklearn/preimage/knn.py b/gklearn/preimage/knn.py
deleted file mode 100644
index c179287..0000000
--- a/gklearn/preimage/knn.py
+++ /dev/null
@@ -1,114 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Fri Jan 10 13:22:04 2020

-@author: ljia
-"""
-import sys
-import numpy as np
-#import matplotlib.pyplot as plt
-from tqdm import tqdm
-import random
-#import csv
-from shutil import copyfile
-import os

-from gklearn.preimage.iam import iam_bash
-from gklearn.utils.graphfiles import loadDataset, loadGXL
-from gklearn.preimage.ged import GED
-from gklearn.preimage.utils import get_same_item_indices

-def test_knn():
-    ds = {'name': 'monoterpenoides',
-          'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
-    Gn, y_all = loadDataset(ds['dataset'])
-#    Gn = Gn[0:50]
-#    gkernel = 'treeletkernel'
-#    node_label = 'atom'
-#    edge_label = 'bond_type'
-#    ds_name = 'mono'
-    dir_output = 'results/knn/'
-    graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/../../datasets/monoterpenoides/'
-
-    k_nn = 1
-    percent = 0.1
-    repeats = 50
-    edit_cost_constant = [3, 3, 1, 3, 3, 1]
-
-    # get indices by classes.
-    y_idx = get_same_item_indices(y_all)
-    accuracy_sm_list = []
-    accuracy_gm_list = []
-    for repeat in range(0, repeats):
-        print('\n---------------------------------')
-        print('repeat =', repeat)
-        sod_sm_list = []
-        sod_gm_list = []
-
-        random.seed(repeat)
-        set_median_list = []
-        gen_median_list = []
-        train_y_set = []
-        for y, values in y_idx.items():
-            print('\ny =', y)
-            size_median_set = int(len(values) * percent)
-            median_set_idx = random.sample(values, size_median_set)
-            print('median set: ', median_set_idx)
-
-            # compute set median and gen median using IAM (C++ through bash).
-#            Gn_median = [Gn[idx] for idx in median_set_idx]
-            group_fnames = [Gn[g].graph['filename'] for g in median_set_idx]
-            sod_sm, sod_gm, fname_sm, fname_gm = iam_bash(group_fnames, edit_cost_constant,
-                                                          graph_dir=graph_dir)
-            print('sod_sm, sod_gm:', sod_sm, sod_gm)
-            sod_sm_list.append(sod_sm)
-            sod_gm_list.append(sod_gm)
-            fname_sm_new = dir_output + 'medians/set_median.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl'
-            copyfile(fname_sm, fname_sm_new)
-            fname_gm_new = dir_output + 'medians/gen_median.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl'
-            copyfile(fname_gm, fname_gm_new)
-            set_median_list.append(loadGXL(fname_sm_new))
-            gen_median_list.append(loadGXL(fname_gm_new))
-            train_y_set.append(int(y))
-
-        print(sod_sm, sod_gm)
-
-        # do 1-nn.
-        test_y_set = [int(y) for y in y_all]
-        accuracy_sm = knn(set_median_list, train_y_set, Gn, test_y_set, k=k_nn, distance='ged')
-        accuracy_gm = knn(gen_median_list, train_y_set, Gn, test_y_set, k=k_nn, distance='ged')
-        accuracy_sm_list.append(accuracy_sm)
-        accuracy_gm_list.append(accuracy_gm)
-        print('current accuracy sm and gm:', accuracy_sm, accuracy_gm)
-
-    # output
-    accuracy_sm_mean = np.mean(accuracy_sm_list)
-    accuracy_gm_mean = np.mean(accuracy_gm_list)
-    print('\ntotal average accuracy sm and gm:', accuracy_sm_mean, accuracy_gm_mean)
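test_knn above relies on get_same_item_indices to group graph indices by class before sampling each median set. A minimal sketch of that grouping plus the 10%-per-class draw, under the assumption that get_same_item_indices simply maps each label to the indices carrying it (the max(1, ...) guard for tiny classes is an addition here; the original uses int(len(values) * percent) directly):

import random
from collections import defaultdict

def group_indices_by_label(y_all):
    # Assumed behaviour of get_same_item_indices: label -> list of indices.
    y_idx = defaultdict(list)
    for idx, y in enumerate(y_all):
        y_idx[y].append(idx)
    return dict(y_idx)

y_all = [0, 0, 1, 0, 1, 1, 0, 1, 0, 0]
y_idx = group_indices_by_label(y_all)
random.seed(0)
percent = 0.1
median_sets = {y: random.sample(v, max(1, int(len(v) * percent)))
               for y, v in y_idx.items()}
print(median_sets)  # one sampled index per class on this toy input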
-
-
-def knn(train_set, train_y_set, test_set, test_y_set, k=1, distance='ged'):
-    if k == 1 and distance == 'ged':
-        algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
-        params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
-                      'algo_options': algo_options, 'stabilizer': None}
-        accuracy = 0
-        for idx_test, g_test in tqdm(enumerate(test_set), desc='computing 1-nn',
-                                     file=sys.stdout):
-            dis = np.inf
-            for idx_train, g_train in enumerate(train_set):
-                dis_cur, _, _ = GED(g_test, g_train, **params_ged)
-                if dis_cur < dis:
-                    dis = dis_cur
-                    test_y_cur = train_y_set[idx_train]
-            if test_y_cur == test_y_set[idx_test]:
-                accuracy += 1
-        accuracy = accuracy / len(test_set)
-    else:
-        raise ValueError('only k = 1 with a GED distance is implemented.')
-
-    return accuracy
-
-
-
-if __name__ == '__main__':
-    test_knn()
\ No newline at end of file
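The knn function above hard-codes k = 1 with a GED distance. The same loop with the metric factored out, as a self-contained sketch; one_nn_accuracy is a hypothetical helper, and plain absolute difference on numbers stands in for GED here:

import numpy as np

def one_nn_accuracy(train_set, train_y, test_set, test_y, distance):
    # 1-nearest-neighbour classification accuracy with a pluggable distance.
    correct = 0
    for x, y_true in zip(test_set, test_y):
        dists = [distance(x, t) for t in train_set]
        y_pred = train_y[int(np.argmin(dists))]
        correct += int(y_pred == y_true)
    return correct / len(test_set)

print(one_nn_accuracy([0.0, 10.0], ['a', 'b'],
                      [1.0, 9.0, 4.0], ['a', 'b', 'a'],
                      lambda u, v: abs(u - v)))  # 1.0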
diff --git a/gklearn/preimage/libs.py b/gklearn/preimage/libs.py
deleted file mode 100644
index 76005c6..0000000
--- a/gklearn/preimage/libs.py
+++ /dev/null
@@ -1,6 +0,0 @@
-import sys
-import pathlib

-# insert gedlibpy library.
-sys.path.insert(0, "../../../")
-from gedlibpy import librariesImport, gedlibpy
diff --git a/gklearn/preimage/median.py b/gklearn/preimage/median.py
deleted file mode 100644
index 1c5bb0f..0000000
--- a/gklearn/preimage/median.py
+++ /dev/null
@@ -1,218 +0,0 @@
-import sys
-sys.path.insert(0, "../")
-#import pathlib
-import numpy as np
-import networkx as nx
-import time

-from gedlibpy import librariesImport, gedlibpy
-#import script
-sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/")
-import gklearn
-from gklearn.utils.graphfiles import loadDataset

-def replace_graph_in_env(script, graph, old_id, label='median'):
-    """
-    Replace a graph in the gedlib environment script.
-
-    If old_id is -1, add a new graph to the environment.
-    """
-    if(old_id > -1):
-        script.PyClearGraph(old_id)
-    new_id = script.PyAddGraph(label)
-    for i in graph.nodes():
-        script.PyAddNode(new_id,str(i),graph.node[i]) # !! strings are required by gedlib
-    for e in graph.edges:
-        script.PyAddEdge(new_id, str(e[0]),str(e[1]), {})
-    script.PyInitEnv()
-    script.PySetMethod("IPFP", "")
-    script.PyInitMethod()
-
-    return new_id
-
-# Draw the current median.
-def draw_Letter_graph(graph, savepath=''):
-    import numpy as np
-    import networkx as nx
-    import matplotlib.pyplot as plt
-    plt.figure()
-    pos = {}
-    for n in graph.nodes:
-        pos[n] = np.array([float(graph.node[n]['attributes'][0]),
-                           float(graph.node[n]['attributes'][1])])
-    nx.draw_networkx(graph, pos)
-    if savepath != '':
-        plt.savefig(savepath + str(time.time()) + '.eps', format='eps', dpi=300)
-    plt.show()
-    plt.clf()
-
-# Compute new mappings between the median and every graph.
-def update_mappings(script,median_id,listID):
-    med_distances = {}
-    med_mappings = {}
-    sod = 0
-    for i in range(0,len(listID)):
-        script.PyRunMethod(median_id,listID[i])
-        med_distances[i] = script.PyGetUpperBound(median_id,listID[i])
-        med_mappings[i] = script.PyGetForwardMap(median_id,listID[i])
-        sod += med_distances[i]
-    return med_distances, med_mappings, sod
-
-def calcul_Sij(all_mappings, all_graphs,i,j):
-    s_ij = 0
-    for k in range(0,len(all_mappings)):
-        cur_graph = all_graphs[k]
-        cur_mapping = all_mappings[k]
-        size_graph = cur_graph.order()
-        if ((cur_mapping[i] < size_graph) and
-            (cur_mapping[j] < size_graph) and
-            (cur_graph.has_edge(cur_mapping[i], cur_mapping[j]) == True)):
-            s_ij += 1
-
-    return s_ij
-
-# def update_median_nodes_L1(median,listIdSet,median_id,dataset, mappings):
-#     from scipy.stats.mstats import gmean
-
-#     for i in median.nodes():
-#         for k in listIdSet:
-#             vectors = [] #np.zeros((len(listIdSet),2))
-#             if(k != median_id):
-#                 phi_i = mappings[k][i]
-#                 if(phi_i < dataset[k].order()):
-#                     vectors.append([float(dataset[k].node[phi_i]['x']),float(dataset[k].node[phi_i]['y'])])

-#         new_labels = gmean(vectors)
-#         median.node[i]['x'] = str(new_labels[0])
-#         median.node[i]['y'] = str(new_labels[1])
-#     return median

-def update_median_nodes(median,dataset,mappings):
-    # update node attributes.
-    for i in median.nodes():
-        nb_sub = 0
-        mean_label = {'x' : 0, 'y' : 0}
-        for k in range(0,len(mappings)):
-            phi_i = mappings[k][i]
-            if ( phi_i < dataset[k].order() ):
-                nb_sub += 1
-                mean_label['x'] += 0.75*float(dataset[k].node[phi_i]['x'])
-                mean_label['y'] += 0.75*float(dataset[k].node[phi_i]['y'])
-        median.node[i]['x'] = str((1/0.75)*(mean_label['x']/nb_sub))
-        median.node[i]['y'] = str((1/0.75)*(mean_label['y']/nb_sub))
-    return median
-
-def update_median_edges(dataset, mappings, median, cei=0.425,cer=0.425):
-    # for Letter-high, ceir = 1.7, alpha = 0.75
-    size_dataset = len(dataset)
-    ratio_cei_cer = cer/(cei + cer)
-    threshold = size_dataset*ratio_cei_cer
-    order_graph_median = median.order()
-    for i in range(0,order_graph_median):
-        for j in range(i+1,order_graph_median):
-            s_ij = calcul_Sij(mappings,dataset,i,j)
-            if(s_ij > threshold):
-                median.add_edge(i,j)
-            else:
-                if(median.has_edge(i,j)):
-                    median.remove_edge(i,j)
-    return median
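update_median_edges above mirrors the IAM edge rule: with cei = cer = 0.425 the threshold N * cer / (cei + cer) reduces to N / 2, i.e. a strict majority vote over the mappings counted by calcul_Sij. A tiny self-contained check of that counting and threshold, with toy dict mappings standing in for gedlib forward maps:

import networkx as nx

def count_realized(mappings, graphs, i, j):
    # How many graphs realize the edge (i, j) through their node mapping.
    s_ij = 0
    for mapping, g in zip(mappings, graphs):
        if mapping[i] < g.order() and mapping[j] < g.order() \
                and g.has_edge(mapping[i], mapping[j]):
            s_ij += 1
    return s_ij

graphs = [nx.path_graph(2), nx.path_graph(2), nx.empty_graph(2)]
mappings = [{0: 0, 1: 1}] * 3
s_ij = count_realized(mappings, graphs, 0, 1)
cei = cer = 0.425
print(s_ij, s_ij > len(graphs) * cer / (cei + cer))  # 2 True (2 > 1.5)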
-
-
-
-def compute_median(script, listID, dataset,verbose=False):
-    """Compute a graph median of a dataset according to an environment.
-
-    Parameters
-
-    script : a gedlib-initialized environment
-    listID (list): a list of graph IDs in script; encodes the dataset
-    dataset (list): corresponding graphs in networkX format. We assume that graph
-    listID[i] corresponds to dataset[i]
-
-    Returns:
-    A networkX graph, which is the median, with corresponding sod
-    """
-    print(len(listID))
-    median_set_index, median_set_sod = compute_median_set(script, listID)
-    print(median_set_index)
-    print(median_set_sod)
-    sods = []
-    # Add the median to the environment.
-    set_median = dataset[median_set_index].copy()
-    median = dataset[median_set_index].copy()
-    cur_med_id = replace_graph_in_env(script,median,-1)
-    med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID)
-    sods.append(cur_sod)
-    if(verbose):
-        print(cur_sod)
-    ite_max = 50
-    old_sod = cur_sod * 2
-    ite = 0
-    epsilon = 0.001
-
-    while((ite < ite_max) and (np.abs(old_sod - cur_sod) > epsilon)):
-        old_sod = cur_sod # remember the previous SOD for the convergence test.
-        median = update_median_nodes(median,dataset, med_mappings)
-        median = update_median_edges(dataset,med_mappings,median)
-
-        cur_med_id = replace_graph_in_env(script,median,cur_med_id)
-        med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID)
-
-        sods.append(cur_sod)
-        if(verbose):
-            print(cur_sod)
-        ite += 1
-    return median, cur_sod, sods, set_median
-
-
-def compute_median_set(script,listID):
-    'Returns the index in listID corresponding to the set median, and its SOD.'
-    # Compute the median set.
-    N = len(listID)
-    map_id_to_index = {}
-    map_index_to_id = {}
-    for i in range(0,len(listID)):
-        map_id_to_index[listID[i]] = i
-        map_index_to_id[i] = listID[i]
-
-    distances = np.zeros((N,N))
-    for i in listID:
-        for j in listID:
-            script.PyRunMethod(i,j)
-            distances[map_id_to_index[i],map_id_to_index[j]] = script.PyGetUpperBound(i,j)
-
-    median_set_index = np.argmin(np.sum(distances,0))
-    sod = np.min(np.sum(distances,0))
-
-    return median_set_index, sod
-
-if __name__ == "__main__":
-    # Load the dataset.
-    gedlibpy.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml')
-    gedlibpy.PySetEditCost("LETTER")
-    gedlibpy.PyInitEnv()
-    gedlibpy.PySetMethod("IPFP", "")
-    gedlibpy.PyInitMethod()
-
-    dataset,my_y = gklearn.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl")
-
-    listID = gedlibpy.PyGetAllGraphIds()
-    median, cur_sod, sods, set_median = compute_median(gedlibpy,listID,dataset,verbose=True)
-
-    print(cur_sod)
-    draw_Letter_graph(median)
-
-
-#if __name__ == '__main__':
-#    # test draw_Letter_graph
-#    ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
-#          'extra_params': {}} # node nsymb
-#    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-#    print(y_all)
-#    for g in Gn:
-#        draw_Letter_graph(g)
\ No newline at end of file
diff --git a/gklearn/preimage/median_benoit.py b/gklearn/preimage/median_benoit.py
deleted file mode 100644
index 6712196..0000000
--- a/gklearn/preimage/median_benoit.py
+++ /dev/null
@@ -1,201 +0,0 @@
-import sys
-import pathlib
-import numpy as np
-import networkx as nx

-import librariesImport
-import script
-sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/")
-import gklearn

-def replace_graph_in_env(script, graph, old_id, label='median'):
-    """
-    Replace a graph in the gedlib environment script.
-
-    If old_id is -1, add a new graph to the environment.
-    """
-    if(old_id > -1):
-        script.PyClearGraph(old_id)
-    new_id = script.PyAddGraph(label)
-    for i in graph.nodes():
-        script.PyAddNode(new_id,str(i),graph.node[i]) # !!
strings are required bt gedlib - for e in graph.edges: - script.PyAddEdge(new_id, str(e[0]),str(e[1]), {}) - script.PyInitEnv() - script.PySetMethod("IPFP", "") - script.PyInitMethod() - - return new_id - -#Dessin median courrant -def draw_Letter_graph(graph): - import numpy as np - import networkx as nx - import matplotlib.pyplot as plt - plt.figure() - pos = {} - for n in graph.nodes: - pos[n] = np.array([float(graph.node[n]['x']),float(graph.node[n]['y'])]) - nx.draw_networkx(graph,pos) - plt.show() - -#compute new mappings -def update_mappings(script,median_id,listID): - med_distances = {} - med_mappings = {} - sod = 0 - for i in range(0,len(listID)): - script.PyRunMethod(median_id,listID[i]) - med_distances[i] = script.PyGetUpperBound(median_id,listID[i]) - med_mappings[i] = script.PyGetForwardMap(median_id,listID[i]) - sod += med_distances[i] - return med_distances, med_mappings, sod - -def calcul_Sij(all_mappings, all_graphs,i,j): - s_ij = 0 - for k in range(0,len(all_mappings)): - cur_graph = all_graphs[k] - cur_mapping = all_mappings[k] - size_graph = cur_graph.order() - if ((cur_mapping[i] < size_graph) and - (cur_mapping[j] < size_graph) and - (cur_graph.has_edge(cur_mapping[i], cur_mapping[j]) == True)): - s_ij += 1 - - return s_ij - -# def update_median_nodes_L1(median,listIdSet,median_id,dataset, mappings): -# from scipy.stats.mstats import gmean - -# for i in median.nodes(): -# for k in listIdSet: -# vectors = [] #np.zeros((len(listIdSet),2)) -# if(k != median_id): -# phi_i = mappings[k][i] -# if(phi_i < dataset[k].order()): -# vectors.append([float(dataset[k].node[phi_i]['x']),float(dataset[k].node[phi_i]['y'])]) - -# new_labels = gmean(vectors) -# median.node[i]['x'] = str(new_labels[0]) -# median.node[i]['y'] = str(new_labels[1]) -# return median - -def update_median_nodes(median,dataset,mappings): - #update node attributes - for i in median.nodes(): - nb_sub=0 - mean_label = {'x' : 0, 'y' : 0} - for k in range(0,len(mappings)): - phi_i = mappings[k][i] - if ( phi_i < dataset[k].order() ): - nb_sub += 1 - mean_label['x'] += 0.75*float(dataset[k].node[phi_i]['x']) - mean_label['y'] += 0.75*float(dataset[k].node[phi_i]['y']) - median.node[i]['x'] = str((1/0.75)*(mean_label['x']/nb_sub)) - median.node[i]['y'] = str((1/0.75)*(mean_label['y']/nb_sub)) - return median - -def update_median_edges(dataset, mappings, median, cei=0.425,cer=0.425): -#for letter high, ceir = 1.7, alpha = 0.75 - size_dataset = len(dataset) - ratio_cei_cer = cer/(cei + cer) - threshold = size_dataset*ratio_cei_cer - order_graph_median = median.order() - for i in range(0,order_graph_median): - for j in range(i+1,order_graph_median): - s_ij = calcul_Sij(mappings,dataset,i,j) - if(s_ij > threshold): - median.add_edge(i,j) - else: - if(median.has_edge(i,j)): - median.remove_edge(i,j) - return median - - - -def compute_median(script, listID, dataset,verbose=False): - """Compute a graph median of a dataset according to an environment - - Parameters - - script : An gedlib initialized environnement - listID (list): a list of ID in script: encodes the dataset - dataset (list): corresponding graphs in networkX format. 
We assume that graph - listID[i] corresponds to dataset[i] - - Returns: - A networkX graph, which is the median, with corresponding sod - """ - print(len(listID)) - median_set_index, median_set_sod = compute_median_set(script, listID) - print(median_set_index) - print(median_set_sod) - sods = [] - #Ajout median dans environnement - set_median = dataset[median_set_index].copy() - median = dataset[median_set_index].copy() - cur_med_id = replace_graph_in_env(script,median,-1) - med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID) - sods.append(cur_sod) - if(verbose): - print(cur_sod) - ite_max = 50 - old_sod = cur_sod * 2 - ite = 0 - epsilon = 0.001 - - best_median - while((ite < ite_max) and (np.abs(old_sod - cur_sod) > epsilon )): - median = update_median_nodes(median,dataset, med_mappings) - median = update_median_edges(dataset,med_mappings,median) - - cur_med_id = replace_graph_in_env(script,median,cur_med_id) - med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID) - - - sods.append(cur_sod) - if(verbose): - print(cur_sod) - ite += 1 - return median, cur_sod, sods, set_median - - draw_Letter_graph(median) - - -def compute_median_set(script,listID): - 'Returns the id in listID corresponding to median set' - #Calcul median set - N=len(listID) - map_id_to_index = {} - map_index_to_id = {} - for i in range(0,len(listID)): - map_id_to_index[listID[i]] = i - map_index_to_id[i] = listID[i] - - distances = np.zeros((N,N)) - for i in listID: - for j in listID: - script.PyRunMethod(i,j) - distances[map_id_to_index[i],map_id_to_index[j]] = script.PyGetUpperBound(i,j) - - median_set_index = np.argmin(np.sum(distances,0)) - sod = np.min(np.sum(distances,0)) - - return median_set_index, sod - -if __name__ == "__main__": - #Chargement du dataset - script.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml') - script.PySetEditCost("LETTER") - script.PyInitEnv() - script.PySetMethod("IPFP", "") - script.PyInitMethod() - - dataset,my_y = gklearn.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl") - - listID = script.PyGetAllGraphIds() - median, sod = compute_median(script,listID,dataset,verbose=True) - - print(sod) - draw_Letter_graph(median) diff --git a/gklearn/preimage/median_linlin.py b/gklearn/preimage/median_linlin.py deleted file mode 100644 index 6139558..0000000 --- a/gklearn/preimage/median_linlin.py +++ /dev/null @@ -1,215 +0,0 @@ -import sys -import pathlib -import numpy as np -import networkx as nx - -from gedlibpy import librariesImport, gedlibpy -sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/") -import gklearn - -def replace_graph_in_env(script, graph, old_id, label='median'): - """ - Replace a graph in script - - If old_id is -1, add a new graph to the environnemt - - """ - if(old_id > -1): - script.PyClearGraph(old_id) - new_id = script.PyAddGraph(label) - for i in graph.nodes(): - script.PyAddNode(new_id,str(i),graph.node[i]) # !! 
strings are required bt gedlib - for e in graph.edges: - script.PyAddEdge(new_id, str(e[0]),str(e[1]), {}) - script.PyInitEnv() - script.PySetMethod("IPFP", "") - script.PyInitMethod() - - return new_id - -#Dessin median courrant -def draw_Letter_graph(graph): - import numpy as np - import networkx as nx - import matplotlib.pyplot as plt - plt.figure() - pos = {} - for n in graph.nodes: - pos[n] = np.array([float(graph.node[n]['x']),float(graph.node[n]['y'])]) - nx.draw_networkx(graph,pos) - plt.show() - -#compute new mappings -def update_mappings(script,median_id,listID): - med_distances = {} - med_mappings = {} - sod = 0 - for i in range(0,len(listID)): - script.PyRunMethod(median_id,listID[i]) - med_distances[i] = script.PyGetUpperBound(median_id,listID[i]) - med_mappings[i] = script.PyGetForwardMap(median_id,listID[i]) - sod += med_distances[i] - return med_distances, med_mappings, sod - -def calcul_Sij(all_mappings, all_graphs,i,j): - s_ij = 0 - for k in range(0,len(all_mappings)): - cur_graph = all_graphs[k] - cur_mapping = all_mappings[k] - size_graph = cur_graph.order() - if ((cur_mapping[i] < size_graph) and - (cur_mapping[j] < size_graph) and - (cur_graph.has_edge(cur_mapping[i], cur_mapping[j]) == True)): - s_ij += 1 - - return s_ij - -# def update_median_nodes_L1(median,listIdSet,median_id,dataset, mappings): -# from scipy.stats.mstats import gmean - -# for i in median.nodes(): -# for k in listIdSet: -# vectors = [] #np.zeros((len(listIdSet),2)) -# if(k != median_id): -# phi_i = mappings[k][i] -# if(phi_i < dataset[k].order()): -# vectors.append([float(dataset[k].node[phi_i]['x']),float(dataset[k].node[phi_i]['y'])]) - -# new_labels = gmean(vectors) -# median.node[i]['x'] = str(new_labels[0]) -# median.node[i]['y'] = str(new_labels[1]) -# return median - -def update_median_nodes(median,dataset,mappings): - #update node attributes - for i in median.nodes(): - nb_sub=0 - mean_label = {'x' : 0, 'y' : 0} - for k in range(0,len(mappings)): - phi_i = mappings[k][i] - if ( phi_i < dataset[k].order() ): - nb_sub += 1 - mean_label['x'] += 0.75*float(dataset[k].node[phi_i]['x']) - mean_label['y'] += 0.75*float(dataset[k].node[phi_i]['y']) - median.node[i]['x'] = str((1/0.75)*(mean_label['x']/nb_sub)) - median.node[i]['y'] = str((1/0.75)*(mean_label['y']/nb_sub)) - return median - -def update_median_edges(dataset, mappings, median, cei=0.425,cer=0.425): -#for letter high, ceir = 1.7, alpha = 0.75 - size_dataset = len(dataset) - ratio_cei_cer = cer/(cei + cer) - threshold = size_dataset*ratio_cei_cer - order_graph_median = median.order() - for i in range(0,order_graph_median): - for j in range(i+1,order_graph_median): - s_ij = calcul_Sij(mappings,dataset,i,j) - if(s_ij > threshold): - median.add_edge(i,j) - else: - if(median.has_edge(i,j)): - median.remove_edge(i,j) - return median - - - -def compute_median(script, listID, dataset,verbose=False): - """Compute a graph median of a dataset according to an environment - - Parameters - - script : An gedlib initialized environnement - listID (list): a list of ID in script: encodes the dataset - dataset (list): corresponding graphs in networkX format. 
We assume that graph - listID[i] corresponds to dataset[i] - - Returns: - A networkX graph, which is the median, with corresponding sod - """ - print(len(listID)) - median_set_index, median_set_sod = compute_median_set(script, listID) - print(median_set_index) - print(median_set_sod) - sods = [] - #Ajout median dans environnement - set_median = dataset[median_set_index].copy() - median = dataset[median_set_index].copy() - cur_med_id = replace_graph_in_env(script,median,-1) - med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID) - sods.append(cur_sod) - if(verbose): - print(cur_sod) - ite_max = 50 - old_sod = cur_sod * 2 - ite = 0 - epsilon = 0.001 - - best_median - while((ite < ite_max) and (np.abs(old_sod - cur_sod) > epsilon )): - median = update_median_nodes(median,dataset, med_mappings) - median = update_median_edges(dataset,med_mappings,median) - - cur_med_id = replace_graph_in_env(script,median,cur_med_id) - med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID) - - - sods.append(cur_sod) - if(verbose): - print(cur_sod) - ite += 1 - return median, cur_sod, sods, set_median - - draw_Letter_graph(median) - - -def compute_median_set(script,listID): - 'Returns the id in listID corresponding to median set' - #Calcul median set - N=len(listID) - map_id_to_index = {} - map_index_to_id = {} - for i in range(0,len(listID)): - map_id_to_index[listID[i]] = i - map_index_to_id[i] = listID[i] - - distances = np.zeros((N,N)) - for i in listID: - for j in listID: - script.PyRunMethod(i,j) - distances[map_id_to_index[i],map_id_to_index[j]] = script.PyGetUpperBound(i,j) - - median_set_index = np.argmin(np.sum(distances,0)) - sod = np.min(np.sum(distances,0)) - - return median_set_index, sod - -def _convertGraph(G): - """Convert a graph to the proper NetworkX format that can be - recognized by library gedlibpy. - """ - G_new = nx.Graph() - for nd, attrs in G.nodes(data=True): - G_new.add_node(str(nd), chem=attrs['atom']) -# G_new.add_node(str(nd), x=str(attrs['attributes'][0]), -# y=str(attrs['attributes'][1])) - for nd1, nd2, attrs in G.edges(data=True): - G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) -# G_new.add_edge(str(nd1), str(nd2)) - - return G_new - -if __name__ == "__main__": - #Chargement du dataset - gedlibpy.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml') - gedlibpy.PySetEditCost("LETTER") - gedlibpy.PyInitEnv() - gedlibpy.PySetMethod("IPFP", "") - gedlibpy.PyInitMethod() - - dataset,my_y = gklearn.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl") - - listID = gedlibpy.PyGetAllGraphIds() - median, sod = compute_median(gedlibpy,listID,dataset,verbose=True) - - print(sod) - draw_Letter_graph(median) diff --git a/gklearn/preimage/pathfrequency.py b/gklearn/preimage/pathfrequency.py deleted file mode 100644 index 3bca1bc..0000000 --- a/gklearn/preimage/pathfrequency.py +++ /dev/null @@ -1,201 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Wed Mar 20 10:12:15 2019 - -inferring a graph grom path frequency. 
-@author: ljia -""" -#import numpy as np -import networkx as nx -from scipy.spatial.distance import hamming -import itertools - -def SISF(K, v): - if output: - return output - else: - return 'no solution' - - -def SISF_M(K, v): - return output - - -def GIPF_tree(v_obj, K=1, alphabet=[0, 1]): - if K == 1: - n_graph = v_obj[0] + v_obj[1] - D_T, father_idx = getDynamicTable(n_graph, alphabet) - - # get the vector the closest to v_obj. - if v_obj not in D_T: - print('no exact solution') - dis_lim = 1 / len(v_obj) # the possible shortest distance. - dis_min = 1.0 # minimum proportional distance - v_min = v_obj - for vc in D_T: - if vc[0] + vc[1] == n_graph: -# print(vc) - dis = hamming(vc, v_obj) - if dis < dis_min: - dis_min = dis - v_min = vc - if dis_min <= dis_lim: - break - v_obj = v_min - - # obtain required graph by traceback procedure. - return getObjectGraph(v_obj, D_T, father_idx, alphabet), v_obj - -def GIPF_M(K, v): - return G - - -def getDynamicTable(n_graph, alphabet=[0, 1]): - # init. When only one node exists. - D_T = {(1, 0, 0, 0, 0, 0): 1, (0, 1, 0, 0, 0, 0): 1, (0, 0, 1, 0, 0, 0): 0, - (0, 0, 0, 1, 0, 0): 0, (0, 0, 0, 0, 1, 0): 0, (0, 0, 0, 0, 0, 1): 0,} - D_T = [(1, 0, 0, 0, 0, 0), (0, 1, 0, 0, 0, 0)] - father_idx = [-1, -1] # index of each vector's father - # add possible vectors. - for idx, v in enumerate(D_T): - if v[0] + v[1] < n_graph: - D_T.append((v[0] + 1, v[1], v[2] + 2, v[3], v[4], v[5])) - D_T.append((v[0] + 1, v[1], v[2], v[3] + 1, v[4] + 1, v[5])) - D_T.append((v[0], v[1] + 1, v[2], v[3] + 1, v[4] + 1, v[5])) - D_T.append((v[0], v[1] + 1, v[2], v[3], v[4], v[5] + 2)) - father_idx += [idx, idx, idx, idx] - -# D_T = itertools.chain([(1, 0, 0, 0, 0, 0)], [(0, 1, 0, 0, 0, 0)]) -# father_idx = itertools.chain([-1], [-1]) # index of each vector's father -# # add possible vectors. -# for idx, v in enumerate(D_T): -# if v[0] + v[1] < n_graph: -# D_T = itertools.chain(D_T, [(v[0] + 1, v[1], v[2] + 2, v[3], v[4], v[5])]) -# D_T = itertools.chain(D_T, [(v[0] + 1, v[1], v[2], v[3] + 1, v[4] + 1, v[5])]) -# D_T = itertools.chain(D_T, [(v[0], v[1] + 1, v[2], v[3] + 1, v[4] + 1, v[5])]) -# D_T = itertools.chain(D_T, [(v[0], v[1] + 1, v[2], v[3], v[4], v[5] + 2)]) -# father_idx = itertools.chain(father_idx, [idx, idx, idx, idx]) - return D_T, father_idx - - -def getObjectGraph(v_obj, D_T, father_idx, alphabet=[0, 1]): - g_obj = nx.Graph() - - # do vector traceback. - v_tb = [list(v_obj)] # traceback vectors. - v_tb_idx = [D_T.index(v_obj)] # indices of traceback vectors. - while v_tb_idx[-1] > 1: - idx_pre = father_idx[v_tb_idx[-1]] - v_tb_idx.append(idx_pre) - v_tb.append(list(D_T[idx_pre])) - v_tb = v_tb[::-1] # reverse -# v_tb_idx = v_tb_idx[::-1] - - # construct tree. - v_c = v_tb[0] # current vector. 
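An aside on the 6-component vectors traced back here; this is an inferred reading of the table updates in getDynamicTable above, not something stated in the original: with alphabet [0, 1], v = (#nodes labelled 0, #nodes labelled 1, #paths 0-0, #paths 0-1, #paths 1-0, #paths 1-1), where each edge contributes a length-1 path in both directions, hence the "+ 2" on v[2] or v[5] when both endpoints share a label. A self-contained consistency check under that reading:

import networkx as nx

def path_frequency_vector(g, alphabet=(0, 1)):
    # Count node labels and ordered length-1 label pairs of a labelled graph.
    v = [0, 0, 0, 0, 0, 0]
    for _, attrs in g.nodes(data=True):
        v[alphabet.index(attrs['node_label'])] += 1
    for u, w in g.edges():
        lu = alphabet.index(g.nodes[u]['node_label'])
        lw = alphabet.index(g.nodes[w]['node_label'])
        v[2 + 2 * lu + lw] += 1
        v[2 + 2 * lw + lu] += 1
    return tuple(v)

g = nx.Graph()
g.add_node(0, node_label=0); g.add_node(1, node_label=0); g.add_node(2, node_label=1)
g.add_edge(0, 1); g.add_edge(0, 2)
print(path_frequency_vector(g))  # (2, 1, 2, 1, 1, 0)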
- if v_c[0] == 1: - g_obj.add_node(0, node_label=alphabet[0]) - else: - g_obj.add_node(0, node_label=alphabet[1]) - for vct in v_tb[1:]: - if vct[0] - v_c[0] == 1: - if vct[2] - v_c[2] == 2: # transfer 1 - label1 = alphabet[0] - label2 = alphabet[0] - else: # transfer 2 - label1 = alphabet[1] - label2 = alphabet[0] - else: - if vct[3] - v_c[3] == 1: # transfer 3 - label1 = alphabet[0] - label2 = alphabet[1] - else: # transfer 4 - label1 = alphabet[1] - label2 = alphabet[1] - for nd, attr in g_obj.nodes(data=True): - if attr['node_label'] == label1: - nb_node = nx.number_of_nodes(g_obj) - g_obj.add_node(nb_node, node_label=label2) - g_obj.add_edge(nd, nb_node) - break - v_c = vct - return g_obj - - -import random -def hierarchy_pos(G, root=None, width=1., vert_gap = 0.2, vert_loc = 0, xcenter = 0.5): - - ''' - From Joel's answer at https://stackoverflow.com/a/29597209/2966723. - Licensed under Creative Commons Attribution-Share Alike - - If the graph is a tree this will return the positions to plot this in a - hierarchical layout. - - G: the graph (must be a tree) - - root: the root node of current branch - - if the tree is directed and this is not given, - the root will be found and used - - if the tree is directed and this is given, then - the positions will be just for the descendants of this node. - - if the tree is undirected and not given, - then a random choice will be used. - - width: horizontal space allocated for this branch - avoids overlap with other branches - - vert_gap: gap between levels of hierarchy - - vert_loc: vertical location of root - - xcenter: horizontal location of root - ''' - if not nx.is_tree(G): - raise TypeError('cannot use hierarchy_pos on a graph that is not a tree') - - if root is None: - if isinstance(G, nx.DiGraph): - root = next(iter(nx.topological_sort(G))) #allows back compatibility with nx version 1.11 - else: - root = random.choice(list(G.nodes)) - - def _hierarchy_pos(G, root, width=1., vert_gap = 0.2, vert_loc = 0, xcenter = 0.5, pos = None, parent = None): - ''' - see hierarchy_pos docstring for most arguments - - pos: a dict saying where all nodes go if they have been assigned - parent: parent of this branch. 
-        only affects it if non-directed
-
-        '''
-
-        if pos is None:
-            pos = {root:(xcenter,vert_loc)}
-        else:
-            pos[root] = (xcenter, vert_loc)
-        children = list(G.neighbors(root))
-        if not isinstance(G, nx.DiGraph) and parent is not None:
-            children.remove(parent)
-        if len(children)!=0:
-            dx = width/len(children)
-            nextx = xcenter - width/2 - dx/2
-            for child in children:
-                nextx += dx
-                pos = _hierarchy_pos(G,child, width = dx, vert_gap = vert_gap,
-                                     vert_loc = vert_loc-vert_gap, xcenter=nextx,
-                                     pos=pos, parent = root)
-        return pos
-
-    return _hierarchy_pos(G, root, width, vert_gap, vert_loc, xcenter)
-
-
-if __name__ == '__main__':
-    v_obj = (6, 4, 10, 3, 3, 2)
-#    v_obj = (6, 5, 10, 3, 3, 2)
-    tree_obj, v_obj = GIPF_tree(v_obj)
-    print('One closest vector is', v_obj)
-    # plot
-    pos = hierarchy_pos(tree_obj, 0)
-    node_labels = nx.get_node_attributes(tree_obj, 'node_label')
-    nx.draw(tree_obj, pos=pos, labels=node_labels, with_labels=True)
\ No newline at end of file
diff --git a/gklearn/preimage/preimage_iam.py b/gklearn/preimage/preimage_iam.py
deleted file mode 100644
index bf79d0e..0000000
--- a/gklearn/preimage/preimage_iam.py
+++ /dev/null
@@ -1,705 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Tue Apr 30 17:07:43 2019

-A graph pre-image method combining the iterative pre-image method of reference [1]
-and the iterative alternate minimization (IAM) of reference [2].
-@author: ljia
-@references:
-    [1] Gökhan H Bakir, Alexander Zien, and Koji Tsuda. Learning to find graph
-    pre-images. In Joint Pattern Recognition Symposium, pages 253-261. Springer, 2004.
-    [2] Generalized median graph via iterative alternate minimization.
-"""
-import sys
-import numpy as np
-from tqdm import tqdm
-import networkx as nx
-import matplotlib.pyplot as plt
-import random

-from iam import iam_upgraded
-from utils import dis_gstar, compute_kernel


-def preimage_iam(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max,
-                 gkernel, epsilon=0.001, InitIAMWithAllDk=False,
-                 params_iam={'c_ei': 1, 'c_er': 1, 'c_es': 1,
-                             'ite_max': 50, 'epsilon': 0.001,
-                             'removeNodes': True, 'connected': False},
-                 params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP',
-                             'edit_cost_constant': [], 'stabilizer': 'min',
-                             'repeat': 50}):
-    """This function constructs graph pre-image by the iterative pre-image
-    framework in reference [1], algorithm 1, where the step of generating new
-    graphs randomly is replaced by the IAM algorithm in reference [2].
-
-    notes
-    -----
-    Every time a set of n better graphs is acquired, their distances in kernel space are
-    compared with the k nearest ones, and the k nearest distances from the k+n
-    distances will be used as the new ones.
-    """
-    # compute k nearest neighbors of phi in DN.
-    dis_all = [] # distance between g_star and each graph.
-    term3 = 0
-    for i1, a1 in enumerate(alpha):
-        for i2, a2 in enumerate(alpha):
-            term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
-    for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout):
-        dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
-        dis_all.append(dtemp)
-
-    # sort
-    sort_idx = np.argsort(dis_all)
-    dis_k = [dis_all[idis] for idis in sort_idx[0:k]] # the k shortest distances
-    nb_best = len(np.argwhere(dis_k == dis_k[0]).flatten().tolist())
-    ghat_list = [Gn_init[idx].copy() for idx in sort_idx[0:nb_best]] # the nearest neighbors of phi in DN
-    if dis_k[0] == 0: # the exact pre-image.
- print('The exact pre-image is found from the input dataset.') - return 0, ghat_list, 0, 0 - dhat = dis_k[0] # the nearest distance -# for g in ghat_list: -# draw_Letter_graph(g) -# nx.draw_networkx(g) -# plt.show() -# print(g.nodes(data=True)) -# print(g.edges(data=True)) - Gk = [Gn_init[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors -# for gi in Gk: -# nx.draw(gi, labels=nx.get_node_attributes(gi, 'atom'), with_labels=True) -## nx.draw_networkx(gi) -# plt.show() -## draw_Letter_graph(g) -# print(gi.nodes(data=True)) -# print(gi.edges(data=True)) - -# i = 1 - r = 0 - itr_total = 0 - dis_of_each_itr = [dhat] - found = False - nb_updated = 0 - nb_updated_k = 0 - while r < r_max:# and not found: # @todo: if not found?# and np.abs(old_dis - cur_dis) > epsilon: - print('\n-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-') - print('Current preimage iteration =', r) - print('Total preimage iteration =', itr_total, '\n') - found = False - - Gn_nearest_median = [g.copy() for g in Gk] - if InitIAMWithAllDk: # each graph in D_k is used to initialize IAM. - ghat_new_list = [] - for g_tmp in Gk: - Gn_nearest_init = [g_tmp.copy()] - ghat_new_list_tmp, _, _ = iam_upgraded(Gn_nearest_median, - Gn_nearest_init, params_ged=params_ged, **params_iam) - ghat_new_list += ghat_new_list_tmp - else: # only the best graph in D_k is used to initialize IAM. - Gn_nearest_init = [g.copy() for g in Gk] - ghat_new_list, _, _ = iam_upgraded(Gn_nearest_median, Gn_nearest_init, - params_ged=params_ged, **params_iam) - -# for g in g_tmp_list: -# nx.draw_networkx(g) -# plt.show() -# draw_Letter_graph(g) -# print(g.nodes(data=True)) -# print(g.edges(data=True)) - - # compute distance between \psi and the new generated graphs. - knew = compute_kernel(ghat_new_list + Gn_median, gkernel, False) - dhat_new_list = [] - for idx, g_tmp in enumerate(ghat_new_list): - # @todo: the term3 below could use the one at the beginning of the function. - dhat_new_list.append(dis_gstar(idx, range(len(ghat_new_list), - len(ghat_new_list) + len(Gn_median) + 1), - alpha, knew, withterm3=False)) - - for idx_g, ghat_new in enumerate(ghat_new_list): - dhat_new = dhat_new_list[idx_g] - - # if the new distance is smaller than the max of D_k. - if dhat_new < dis_k[-1] and np.abs(dhat_new - dis_k[-1]) >= epsilon: - # check if the new distance is the same as one in D_k. - is_duplicate = False - for dis_tmp in dis_k[1:-1]: - if np.abs(dhat_new - dis_tmp) < epsilon: - is_duplicate = True - print('IAM: duplicate k nearest graph generated.') - break - if not is_duplicate: - if np.abs(dhat_new - dhat) < epsilon: - print('IAM: I am equal!') -# dhat = dhat_new -# ghat_list = [ghat_new.copy()] - else: - print('IAM: we got better k nearest neighbors!') - nb_updated_k += 1 - print('the k nearest neighbors are updated', - nb_updated_k, 'times.') - - dis_k = [dhat_new] + dis_k[0:k-1] # add the new nearest distance. - Gk = [ghat_new.copy()] + Gk[0:k-1] # add the corresponding graph. - sort_idx = np.argsort(dis_k) - dis_k = [dis_k[idx] for idx in sort_idx[0:k]] # the new k nearest distances. 
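The D_k bookkeeping above (prepend the new candidate, argsort, truncate) amounts to maintaining a bounded sorted list of distances with their graphs kept in lockstep. A standalone sketch of the same update; update_k_nearest is an illustrative name, not part of the module:

import numpy as np

def update_k_nearest(dis_k, Gk, d_new, g_new, k):
    # Insert the new candidate, re-sort, keep the k smallest distances
    # together with their graphs (mirrors the prepend/argsort/truncate above).
    dis_k = [d_new] + dis_k
    Gk = [g_new] + Gk
    order = np.argsort(dis_k)[:k]
    return [dis_k[i] for i in order], [Gk[i] for i in order]

dis_k, Gk = update_k_nearest([0.3, 0.7, 0.9], ['a', 'b', 'c'], 0.5, 'd', 3)
print(dis_k, Gk)  # [0.3, 0.5, 0.7] ['a', 'd', 'b']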
- Gk = [Gk[idx] for idx in sort_idx[0:k]] - if dhat_new < dhat: - print('IAM: I have smaller distance!') - print(str(dhat) + '->' + str(dhat_new)) - dhat = dhat_new - ghat_list = [Gk[0].copy()] - r = 0 - nb_updated += 1 - - print('the graph is updated', nb_updated, 'times.') - nx.draw(Gk[0], labels=nx.get_node_attributes(Gk[0], 'atom'), - with_labels=True) - ## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG") - plt.show() - - found = True - if not found: - r += 1 - - dis_of_each_itr.append(dhat) - itr_total += 1 - print('\nthe k shortest distances are', dis_k) - print('the shortest distances for previous iterations are', dis_of_each_itr) - - print('\n\nthe graph is updated', nb_updated, 'times.') - print('\nthe k nearest neighbors are updated', nb_updated_k, 'times.') - print('distances in kernel space:', dis_of_each_itr, '\n') - - return dhat, ghat_list, dis_of_each_itr[-1], nb_updated, nb_updated_k - - - - -def preimage_iam_random_mix(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max, - l_max, gkernel, epsilon=0.001, - InitIAMWithAllDk=False, InitRandomWithAllDk=True, - params_iam={'c_ei': 1, 'c_er': 1, 'c_es': 1, - 'ite_max': 50, 'epsilon': 0.001, - 'removeNodes': True, 'connected': False}, - params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', - 'method': 'IPFP', 'edit_cost_constant': [], - 'stabilizer': 'min', 'repeat': 50}): - """This function constructs graph pre-image by the iterative pre-image - framework in reference [1], algorithm 1, where new graphs are generated - randomly and by the IAM algorithm in reference [2]. - - notes - ----- - Every time a set of n better graphs is acquired, their distances in kernel space are - compared with the k nearest ones, and the k nearest distances from the k+n - distances will be used as the new ones. - """ - Gn_init = [nx.convert_node_labels_to_integers(g) for g in Gn_init] - # compute k nearest neighbors of phi in DN. - dis_all = [] # distance between g_star and each graph. - term3 = 0 - for i1, a1 in enumerate(alpha): - for i2, a2 in enumerate(alpha): - term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]] - for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout): - dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3) - dis_all.append(dtemp) - - # sort - sort_idx = np.argsort(dis_all) - dis_k = [dis_all[idis] for idis in sort_idx[0:k]] # the k shortest distances - nb_best = len(np.argwhere(dis_k == dis_k[0]).flatten().tolist()) - ghat_list = [Gn_init[idx].copy() for idx in sort_idx[0:nb_best]] # the nearest neighbors of psi in DN - if dis_k[0] == 0: # the exact pre-image. 
- print('The exact pre-image is found from the input dataset.') - return 0, ghat_list, 0, 0 - dhat = dis_k[0] # the nearest distance -# for g in ghat_list: -# draw_Letter_graph(g) -# nx.draw_networkx(g) -# plt.show() -# print(g.nodes(data=True)) -# print(g.edges(data=True)) - Gk = [Gn_init[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors -# for gi in Gk: -# nx.draw(gi, labels=nx.get_node_attributes(gi, 'atom'), with_labels=True) -## nx.draw_networkx(gi) -# plt.show() -## draw_Letter_graph(g) -# print(gi.nodes(data=True)) -# print(gi.edges(data=True)) - - r = 0 - itr_total = 0 - dis_of_each_itr = [dhat] - nb_updated_iam = 0 - nb_updated_k_iam = 0 - nb_updated_random = 0 - nb_updated_k_random = 0 -# is_iam_duplicate = False - while r < r_max: # and not found: # @todo: if not found?# and np.abs(old_dis - cur_dis) > epsilon: - print('\n-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-') - print('Current preimage iteration =', r) - print('Total preimage iteration =', itr_total, '\n') - found_iam = False - - Gn_nearest_median = [g.copy() for g in Gk] - if InitIAMWithAllDk: # each graph in D_k is used to initialize IAM. - ghat_new_list = [] - for g_tmp in Gk: - Gn_nearest_init = [g_tmp.copy()] - ghat_new_list_tmp, _ = iam_upgraded(Gn_nearest_median, - Gn_nearest_init, params_ged=params_ged, **params_iam) - ghat_new_list += ghat_new_list_tmp - else: # only the best graph in D_k is used to initialize IAM. - Gn_nearest_init = [g.copy() for g in Gk] - ghat_new_list, _ = iam_upgraded(Gn_nearest_median, Gn_nearest_init, - params_ged=params_ged, **params_iam) - -# for g in g_tmp_list: -# nx.draw_networkx(g) -# plt.show() -# draw_Letter_graph(g) -# print(g.nodes(data=True)) -# print(g.edges(data=True)) - - # compute distance between \psi and the new generated graphs. - knew = compute_kernel(ghat_new_list + Gn_median, gkernel, False) - dhat_new_list = [] - - for idx, g_tmp in enumerate(ghat_new_list): - # @todo: the term3 below could use the one at the beginning of the function. - dhat_new_list.append(dis_gstar(idx, range(len(ghat_new_list), - len(ghat_new_list) + len(Gn_median) + 1), - alpha, knew, withterm3=False)) - - # find the new k nearest graphs. - for idx_g, ghat_new in enumerate(ghat_new_list): - dhat_new = dhat_new_list[idx_g] - - # if the new distance is smaller than the max of D_k. - if dhat_new < dis_k[-1] and np.abs(dhat_new - dis_k[-1]) >= epsilon: - # check if the new distance is the same as one in D_k. - is_duplicate = False - for dis_tmp in dis_k[1:-1]: - if np.abs(dhat_new - dis_tmp) < epsilon: - is_duplicate = True - print('IAM: duplicate k nearest graph generated.') - break - if not is_duplicate: - if np.abs(dhat_new - dhat) < epsilon: - print('IAM: I am equal!') -# dhat = dhat_new -# ghat_list = [ghat_new.copy()] - else: - print('IAM: we got better k nearest neighbors!') - nb_updated_k_iam += 1 - print('the k nearest neighbors are updated', - nb_updated_k_iam, 'times.') - - dis_k = [dhat_new] + dis_k[0:k-1] # add the new nearest distance. - Gk = [ghat_new.copy()] + Gk[0:k-1] # add the corresponding graph. - sort_idx = np.argsort(dis_k) - dis_k = [dis_k[idx] for idx in sort_idx[0:k]] # the new k nearest distances. 
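For reference, the distance these functions minimize: with the target point g* = sum_i alpha_i * phi(g_i) in kernel space, d(g, g*)^2 = k(g, g) - 2 * sum_i alpha_i * k(g, g_i) + sum_i sum_j alpha_i * alpha_j * k(g_i, g_j), the last double sum being the precomputed term3 above. A minimal NumPy sketch under that reading; dis_gstar_sketch is a stand-in, not the library's dis_gstar:

import numpy as np

def dis_gstar_sketch(idx_g, idx_gi, alpha, K, term3=None):
    # Distance in kernel space between graph idx_g and sum_i alpha[i]*phi(g_i);
    # K is the full Gram matrix over all graphs involved.
    alpha = np.asarray(alpha, dtype=float)
    if term3 is None:
        term3 = alpha @ K[np.ix_(idx_gi, idx_gi)] @ alpha
    term1 = K[idx_g, idx_g]
    term2 = 2 * alpha @ K[idx_g, idx_gi]
    return np.sqrt(max(term1 - term2 + term3, 0.0))

K = np.array([[1.0, 0.5, 0.2],
              [0.5, 1.0, 0.4],
              [0.2, 0.4, 1.0]])
print(dis_gstar_sketch(0, [1, 2], [0.5, 0.5], K))  # 1.0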
- Gk = [Gk[idx] for idx in sort_idx[0:k]] - if dhat_new < dhat: - print('IAM: I have smaller distance!') - print(str(dhat) + '->' + str(dhat_new)) - dhat = dhat_new - ghat_list = [Gk[0].copy()] - r = 0 - nb_updated_iam += 1 - - print('the graph is updated by IAM', nb_updated_iam, - 'times.') - nx.draw(Gk[0], labels=nx.get_node_attributes(Gk[0], 'atom'), - with_labels=True) - ## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG") - plt.show() - - found_iam = True - - # when new distance is not smaller than the max of D_k, use random generation. - if not found_iam: - print('Distance not better, switching to random generation now.') - print(str(dhat) + '->' + str(dhat_new)) - - if InitRandomWithAllDk: # use all k nearest graphs as the initials. - init_list = [g_init.copy() for g_init in Gk] - else: # use just the nearest graph as the initial. - init_list = [Gk[0].copy()] - - # number of edges to be changed. - if len(init_list) == 1: - # @todo what if the log is negetive? how to choose alpha (scalar)? seems fdgs is always 1. - # fdgs = dhat_new - fdgs = nb_updated_random + 1 - if fdgs < 1: - fdgs = 1 - fdgs = int(np.ceil(np.log(fdgs))) - if fdgs < 1: - fdgs += 1 - # fdgs = nb_updated_random + 1 # @todo: - fdgs_list = [fdgs] - else: - # @todo what if the log is negetive? how to choose alpha (scalar)? - fdgs_list = np.array(dis_k[:]) - if np.min(fdgs_list) < 1: - fdgs_list /= dis_k[0] - fdgs_list = [int(item) for item in np.ceil(np.log(fdgs_list))] - if np.min(fdgs_list) < 1: - fdgs_list = np.array(fdgs_list) + 1 - - l = 0 - found_random = False - while l < l_max and not found_random: - for idx_g, g_tmp in enumerate(init_list): - # add and delete edges. - ghat_new = nx.convert_node_labels_to_integers(g_tmp.copy()) - # @todo: should we use just half of the adjacency matrix for undirected graphs? - nb_vpairs = nx.number_of_nodes(ghat_new) * (nx.number_of_nodes(ghat_new) - 1) - np.random.seed() - # which edges to change. - # @todo: what if fdgs is bigger than nb_vpairs? - idx_change = random.sample(range(nb_vpairs), fdgs_list[idx_g] if - fdgs_list[idx_g] < nb_vpairs else nb_vpairs) -# idx_change = np.random.randint(0, nx.number_of_nodes(gs) * -# (nx.number_of_nodes(gs) - 1), fdgs) - for item in idx_change: - node1 = int(item / (nx.number_of_nodes(ghat_new) - 1)) - node2 = (item - node1 * (nx.number_of_nodes(ghat_new) - 1)) - if node2 >= node1: # skip the self pair. - node2 += 1 - # @todo: is the randomness correct? - if not ghat_new.has_edge(node1, node2): - ghat_new.add_edge(node1, node2) - # nx.draw_networkx(gs) - # plt.show() - # nx.draw_networkx(ghat_new) - # plt.show() - else: - ghat_new.remove_edge(node1, node2) - # nx.draw_networkx(gs) - # plt.show() - # nx.draw_networkx(ghat_new) - # plt.show() - # nx.draw_networkx(ghat_new) - # plt.show() - - # compute distance between \psi and the new generated graph. - knew = compute_kernel([ghat_new] + Gn_median, gkernel, verbose=False) - dhat_new = dis_gstar(0, range(1, len(Gn_median) + 1), - alpha, knew, withterm3=False) - # @todo: the new distance is smaller or also equal? - if dhat_new < dis_k[-1] and np.abs(dhat_new - dis_k[-1]) >= epsilon: - # check if the new distance is the same as one in D_k. 
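On the random perturbation above: each integer in idx_change encodes one of the n * (n - 1) ordered non-self vertex pairs via node1 = item // (n - 1) and node2 = item - node1 * (n - 1), with node2 bumped by one when node2 >= node1 so the diagonal is skipped. A quick self-contained check that this mapping is a bijection onto the off-diagonal pairs:

n = 4
pairs = set()
for item in range(n * (n - 1)):
    node1 = item // (n - 1)
    node2 = item - node1 * (n - 1)
    if node2 >= node1:  # skip the self pair, exactly as in the code above
        node2 += 1
    pairs.add((node1, node2))
print(len(pairs) == n * (n - 1))                     # True
print(all(u != v and 0 <= v < n for u, v in pairs))  # True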
- is_duplicate = False - for dis_tmp in dis_k[1:-1]: - if np.abs(dhat_new - dis_tmp) < epsilon: - is_duplicate = True - print('Random: duplicate k nearest graph generated.') - break - if not is_duplicate: - if np.abs(dhat_new - dhat) < epsilon: - print('Random: I am equal!') - # dhat = dhat_new - # ghat_list = [ghat_new.copy()] - else: - print('Random: we got better k nearest neighbors!') - print('l =', str(l)) - nb_updated_k_random += 1 - print('the k nearest neighbors are updated by random generation', - nb_updated_k_random, 'times.') - - dis_k = [dhat_new] + dis_k # add the new nearest distances. - Gk = [ghat_new.copy()] + Gk # add the corresponding graphs. - sort_idx = np.argsort(dis_k) - dis_k = [dis_k[idx] for idx in sort_idx[0:k]] # the new k nearest distances. - Gk = [Gk[idx] for idx in sort_idx[0:k]] - if dhat_new < dhat: - print('\nRandom: I am smaller!') - print('l =', str(l)) - print(dhat, '->', dhat_new) - dhat = dhat_new - ghat_list = [ghat_new.copy()] - r = 0 - nb_updated_random += 1 - - print('the graph is updated by random generation', - nb_updated_random, 'times.') - - nx.draw(ghat_new, labels=nx.get_node_attributes(ghat_new, 'atom'), - with_labels=True) - ## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG") - plt.show() - found_random = True - break - l += 1 - if not found_random: # l == l_max: - r += 1 - - dis_of_each_itr.append(dhat) - itr_total += 1 - print('\nthe k shortest distances are', dis_k) - print('the shortest distances for previous iterations are', dis_of_each_itr) - - print('\n\nthe graph is updated by IAM', nb_updated_iam, 'times, and by random generation', - nb_updated_random, 'times.') - print('\nthe k nearest neighbors are updated by IAM', nb_updated_k_iam, - 'times, and by random generation', nb_updated_k_random, 'times.') - print('distances in kernel space:', dis_of_each_itr, '\n') - - return dhat, ghat_list, dis_of_each_itr[-1], \ - nb_updated_iam, nb_updated_random, nb_updated_k_iam, nb_updated_k_random - - -############################################################################### -# Old implementations. - -#def gk_iam(Gn, alpha): -# """This function constructs graph pre-image by the iterative pre-image -# framework in reference [1], algorithm 1, where the step of generating new -# graphs randomly is replaced by the IAM algorithm in reference [2]. -# -# notes -# ----- -# Every time a better graph is acquired, the older one is replaced by it. -# """ -# pass -# # compute k nearest neighbors of phi in DN. -# dis_list = [] # distance between g_star and each graph. -# for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout): -# dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) * -# k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha * -# (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha * -# k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2]) -# dis_list.append(dtemp) -# -# # sort -# sort_idx = np.argsort(dis_list) -# dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] -# g0hat = Gn[sort_idx[0]] # the nearest neighbor of phi in DN -# if dis_gs[0] == 0: # the exact pre-image. 
-# print('The exact pre-image is found from the input dataset.') -# return 0, g0hat -# dhat = dis_gs[0] # the nearest distance -# Gk = [Gn[ig] for ig in sort_idx[0:k]] # the k nearest neighbors -# gihat_list = [] -# -## i = 1 -# r = 1 -# while r < r_max: -# print('r =', r) -## found = False -# Gs_nearest = Gk + gihat_list -# g_tmp = iam(Gs_nearest) -# -# # compute distance between \psi and the new generated graph. -# knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None, -# p_quit=lmbda, n_iteration=20, remove_totters=False, -# n_jobs=multiprocessing.cpu_count(), verbose=False) -# dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) * -# knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha * -# (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha * -# k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2]) -# if dnew <= dhat: # the new distance is smaller -# print('I am smaller!') -# dhat = dnew -# g_new = g_tmp.copy() # found better graph. -# gihat_list = [g_new] -# dis_gs.append(dhat) -# r = 0 -# else: -# r += 1 -# -# ghat = ([g0hat] if len(gihat_list) == 0 else gihat_list) -# -# return dhat, ghat - - -#def gk_iam_nearest(Gn, alpha, idx_gi, Kmatrix, k, r_max): -# """This function constructs graph pre-image by the iterative pre-image -# framework in reference [1], algorithm 1, where the step of generating new -# graphs randomly is replaced by the IAM algorithm in reference [2]. -# -# notes -# ----- -# Every time a better graph is acquired, its distance in kernel space is -# compared with the k nearest ones, and the k nearest distances from the k+1 -# distances will be used as the new ones. -# """ -# # compute k nearest neighbors of phi in DN. -# dis_list = [] # distance between g_star and each graph. -# for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout): -# dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix) -## dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) * -## k_g2_list[ig]) + (alpha * alpha * k_list[0] + alpha * -## (1 - alpha) * k_g2_list[0] + (1 - alpha) * alpha * -## k_g1_list[6] + (1 - alpha) * (1 - alpha) * k_list[6]) -# dis_list.append(dtemp) -# -# # sort -# sort_idx = np.argsort(dis_list) -# dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] # the k shortest distances -# g0hat = Gn[sort_idx[0]] # the nearest neighbor of phi in DN -# if dis_gs[0] == 0: # the exact pre-image. -# print('The exact pre-image is found from the input dataset.') -# return 0, g0hat -# dhat = dis_gs[0] # the nearest distance -# ghat = g0hat.copy() -# Gk = [Gn[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors -# for gi in Gk: -# nx.draw_networkx(gi) -# plt.show() -# print(gi.nodes(data=True)) -# print(gi.edges(data=True)) -# Gs_nearest = Gk.copy() -## gihat_list = [] -# -## i = 1 -# r = 1 -# while r < r_max: -# print('r =', r) -## found = False -## Gs_nearest = Gk + gihat_list -## g_tmp = iam(Gs_nearest) -# g_tmp = test_iam_with_more_graphs_as_init(Gs_nearest, Gs_nearest, c_ei=1, c_er=1, c_es=1) -# nx.draw_networkx(g_tmp) -# plt.show() -# print(g_tmp.nodes(data=True)) -# print(g_tmp.edges(data=True)) -# -# # compute distance between \psi and the new generated graph. 
-# gi_list = [Gn[i] for i in idx_gi] -# knew = compute_kernel([g_tmp] + gi_list, 'untilhpathkernel', False) -# dnew = dis_gstar(0, range(1, len(gi_list) + 1), alpha, knew) -# -## dnew = knew[0, 0] - 2 * (alpha[0] * knew[0, 1] + alpha[1] * -## knew[0, 2]) + (alpha[0] * alpha[0] * k_list[0] + alpha[0] * -## alpha[1] * k_g2_list[0] + alpha[1] * alpha[0] * -## k_g1_list[1] + alpha[1] * alpha[1] * k_list[1]) -# if dnew <= dhat and g_tmp != ghat: # the new distance is smaller -# print('I am smaller!') -# print(str(dhat) + '->' + str(dnew)) -## nx.draw_networkx(ghat) -## plt.show() -## print('->') -## nx.draw_networkx(g_tmp) -## plt.show() -# -# dhat = dnew -# g_new = g_tmp.copy() # found better graph. -# ghat = g_tmp.copy() -# dis_gs.append(dhat) # add the new nearest distance. -# Gs_nearest.append(g_new) # add the corresponding graph. -# sort_idx = np.argsort(dis_gs) -# dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]] # the new k nearest distances. -# Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]] -# r = 0 -# else: -# r += 1 -# -# return dhat, ghat - - -#def gk_iam_nearest_multi(Gn, alpha, idx_gi, Kmatrix, k, r_max): -# """This function constructs graph pre-image by the iterative pre-image -# framework in reference [1], algorithm 1, where the step of generating new -# graphs randomly is replaced by the IAM algorithm in reference [2]. -# -# notes -# ----- -# Every time a set of n better graphs is acquired, their distances in kernel space are -# compared with the k nearest ones, and the k nearest distances from the k+n -# distances will be used as the new ones. -# """ -# Gn_median = [Gn[idx].copy() for idx in idx_gi] -# # compute k nearest neighbors of phi in DN. -# dis_list = [] # distance between g_star and each graph. -# for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout): -# dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix) -## dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) * -## k_g2_list[ig]) + (alpha * alpha * k_list[0] + alpha * -## (1 - alpha) * k_g2_list[0] + (1 - alpha) * alpha * -## k_g1_list[6] + (1 - alpha) * (1 - alpha) * k_list[6]) -# dis_list.append(dtemp) -# -# # sort -# sort_idx = np.argsort(dis_list) -# dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] # the k shortest distances -# nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist()) -# g0hat_list = [Gn[idx] for idx in sort_idx[0:nb_best]] # the nearest neighbors of phi in DN -# if dis_gs[0] == 0: # the exact pre-image. -# print('The exact pre-image is found from the input dataset.') -# return 0, g0hat_list -# dhat = dis_gs[0] # the nearest distance -# ghat_list = [g.copy() for g in g0hat_list] -# for g in ghat_list: -# nx.draw_networkx(g) -# plt.show() -# print(g.nodes(data=True)) -# print(g.edges(data=True)) -# Gk = [Gn[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors -# for gi in Gk: -# nx.draw_networkx(gi) -# plt.show() -# print(gi.nodes(data=True)) -# print(gi.edges(data=True)) -# Gs_nearest = Gk.copy() -## gihat_list = [] -# -## i = 1 -# r = 1 -# while r < r_max: -# print('r =', r) -## found = False -## Gs_nearest = Gk + gihat_list -## g_tmp = iam(Gs_nearest) -# g_tmp_list = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations( -# Gn_median, Gs_nearest, c_ei=1, c_er=1, c_es=1) -# for g in g_tmp_list: -# nx.draw_networkx(g) -# plt.show() -# print(g.nodes(data=True)) -# print(g.edges(data=True)) -# -# # compute distance between \psi and the new generated graphs. 
-# gi_list = [Gn[i] for i in idx_gi] -# knew = compute_kernel(g_tmp_list + gi_list, 'marginalizedkernel', False) -# dnew_list = [] -# for idx, g_tmp in enumerate(g_tmp_list): -# dnew_list.append(dis_gstar(idx, range(len(g_tmp_list), -# len(g_tmp_list) + len(gi_list) + 1), alpha, knew)) -# -## dnew = knew[0, 0] - 2 * (alpha[0] * knew[0, 1] + alpha[1] * -## knew[0, 2]) + (alpha[0] * alpha[0] * k_list[0] + alpha[0] * -## alpha[1] * k_g2_list[0] + alpha[1] * alpha[0] * -## k_g1_list[1] + alpha[1] * alpha[1] * k_list[1]) -# -# # find the new k nearest graphs. -# dis_gs = dnew_list + dis_gs # add the new nearest distances. -# Gs_nearest = [g.copy() for g in g_tmp_list] + Gs_nearest # add the corresponding graphs. -# sort_idx = np.argsort(dis_gs) -# if len([i for i in sort_idx[0:k] if i < len(dnew_list)]) > 0: -# print('We got better k nearest neighbors! Hurray!') -# dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]] # the new k nearest distances. -# print(dis_gs[-1]) -# Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]] -# nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist()) -# if len([i for i in sort_idx[0:nb_best] if i < len(dnew_list)]) > 0: -# print('I have smaller or equal distance!') -# dhat = dis_gs[0] -# print(str(dhat) + '->' + str(dhat)) -# idx_best_list = np.argwhere(dnew_list == dhat).flatten().tolist() -# ghat_list = [g_tmp_list[idx].copy() for idx in idx_best_list] -# for g in ghat_list: -# nx.draw_networkx(g) -# plt.show() -# print(g.nodes(data=True)) -# print(g.edges(data=True)) -# r = 0 -# else: -# r += 1 -# -# return dhat, ghat_list \ No newline at end of file diff --git a/gklearn/preimage/preimage_random.py b/gklearn/preimage/preimage_random.py deleted file mode 100644 index e5f74cd..0000000 --- a/gklearn/preimage/preimage_random.py +++ /dev/null @@ -1,309 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Wed Mar 6 16:03:11 2019 - -pre-image -@author: ljia -""" - -import sys -import numpy as np -import random -from tqdm import tqdm -import networkx as nx -import matplotlib.pyplot as plt - -from gklearn.preimage.utils import compute_kernel, dis_gstar - - -def preimage_random(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max, l, gkernel): - Gn_init = [nx.convert_node_labels_to_integers(g) for g in Gn_init] - - # compute k nearest neighbors of phi in DN. - dis_list = [] # distance between g_star and each graph. - term3 = 0 - for i1, a1 in enumerate(alpha): - for i2, a2 in enumerate(alpha): - term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]] - for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout): - dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3) - dis_list.append(dtemp) -# print(np.max(dis_list)) -# print(np.min(dis_list)) -# print(np.min([item for item in dis_list if item != 0])) -# print(np.mean(dis_list)) - - # sort - sort_idx = np.argsort(dis_list) - dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] # the k shortest distances - nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist()) - g0hat_list = [Gn_init[idx] for idx in sort_idx[0:nb_best]] # the nearest neighbors of phi in DN - if dis_gs[0] == 0: # the exact pre-image. 
-        print('The exact pre-image is found from the input dataset.')
-        return 0, g0hat_list[0], 0
-    dhat = dis_gs[0] # the nearest distance
-#    ghat_list = [g.copy() for g in g0hat_list]
-#    for g in ghat_list:
-#        draw_Letter_graph(g)
-#        nx.draw_networkx(g)
-#        plt.show()
-#        print(g.nodes(data=True))
-#        print(g.edges(data=True))
-    Gk = [Gn_init[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors
-#    for gi in Gk:
-##        nx.draw_networkx(gi)
-##        plt.show()
-#        draw_Letter_graph(g)
-#        print(gi.nodes(data=True))
-#        print(gi.edges(data=True))
-    Gs_nearest = [g.copy() for g in Gk]
-    gihat_list = []
-    dihat_list = []
-
-#    i = 1
-    r = 0
-#    sod_list = [dhat]
-#    found = False
-    dis_of_each_itr = [dhat]
-    nb_updated = 0
-    g_best = []
-    while r < r_max:
-        print('\nr =', r)
-        print('itr for gk =', nb_updated, '\n')
-        found = False
-        dis_bests = dis_gs + dihat_list
-        # @todo: what if the log is negative? how to choose alpha (scalar)?
-        fdgs_list = np.array(dis_bests)
-        if np.min(fdgs_list) < 1:
-            fdgs_list /= np.min(dis_bests)
-        fdgs_list = [int(item) for item in np.ceil(np.log(fdgs_list))]
-        if np.min(fdgs_list) < 1:
-            fdgs_list = np.array(fdgs_list) + 1
-
-        for ig, gs in enumerate(Gs_nearest + gihat_list):
-#            nx.draw_networkx(gs)
-#            plt.show()
-            for trial in range(0, l):
-#            for trial in tqdm(range(0, l), desc='l loops', file=sys.stdout):
-                # add and delete edges.
-                gtemp = gs.copy()
-                np.random.seed()
-                # which edges to change.
-                # @todo: should we use just half of the adjacency matrix for undirected graphs?
-                nb_vpairs = nx.number_of_nodes(gs) * (nx.number_of_nodes(gs) - 1)
-                # @todo: what if fdgs is bigger than nb_vpairs?
-                idx_change = random.sample(range(nb_vpairs), fdgs_list[ig] if
-                                           fdgs_list[ig] < nb_vpairs else nb_vpairs)
-#                idx_change = np.random.randint(0, nx.number_of_nodes(gs) *
-#                                               (nx.number_of_nodes(gs) - 1), fdgs)
-                for item in idx_change:
-                    node1 = int(item / (nx.number_of_nodes(gs) - 1))
-                    node2 = (item - node1 * (nx.number_of_nodes(gs) - 1))
-                    if node2 >= node1: # skip the self pair.
-                        node2 += 1
-                    # @todo: is the randomness correct?
-                    if not gtemp.has_edge(node1, node2):
-                        gtemp.add_edge(node1, node2)
-#                        nx.draw_networkx(gs)
-#                        plt.show()
-#                        nx.draw_networkx(gtemp)
-#                        plt.show()
-                    else:
-                        gtemp.remove_edge(node1, node2)
-#                        nx.draw_networkx(gs)
-#                        plt.show()
-#                        nx.draw_networkx(gtemp)
-#                        plt.show()
-#                nx.draw_networkx(gtemp)
-#                plt.show()
-
-                # compute distance between \psi and the new generated graph.
-#                knew = marginalizedkernel([gtemp, g1, g2], node_label='atom', edge_label=None,
-#                                          p_quit=lmbda, n_iteration=20, remove_totters=False,
-#                                          n_jobs=multiprocessing.cpu_count(), verbose=False)
-                knew = compute_kernel([gtemp] + Gn_median, gkernel, verbose=False)
-                dnew = dis_gstar(0, range(1, len(Gn_median) + 1), alpha, knew,
-                                 withterm3=False)
-                if dnew <= dhat: # @todo: the new distance is smaller or also equal?
-                    if dnew < dhat:
-                        print('\nI am smaller!')
-                        print('ig =', str(ig), ', l =', str(trial))
-                        print(dhat, '->', dnew)
-                        nb_updated += 1
-                    elif dnew == dhat:
-                        print('I am equal!')
-#                    nx.draw_networkx(gtemp)
-#                    plt.show()
-#                    print(gtemp.nodes(data=True))
-#                    print(gtemp.edges(data=True))
-                    dhat = dnew
-                    gnew = gtemp.copy()
-                    found = True # found better graph.
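-        # A self-contained sketch of the two ideas used above, kept inactive
-        # like the other commented examples in this file: the number of edge
-        # toggles is derived from the log of a kernel distance, and a linear
-        # index over the n * (n - 1) ordered vertex pairs of an n-node graph
-        # is decoded into a pair (node1, node2), skipping the diagonal.
-#        import math
-#        def decode_vertex_pair(item, n):
-#            node1 = item // (n - 1)
-#            node2 = item - node1 * (n - 1)
-#            if node2 >= node1: # skip the self pair.
-#                node2 += 1
-#            return node1, node2
-#        print(decode_vertex_pair(7, 5)) # -> (1, 4)
-#        print(max(1, int(math.ceil(math.log(max(2.5, 1.0)))))) # nb. of toggles for distance 2.5 -> 1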
-        if found:
-            r = 0
-            gihat_list = [gnew]
-            dihat_list = [dhat]
-        else:
-            r += 1
-
-        dis_of_each_itr.append(dhat)
-        print('the shortest distances for previous iterations are', dis_of_each_itr)
-#    dis_best.append(dhat)
-    g_best = (g0hat_list[0] if len(gihat_list) == 0 else gihat_list[0])
-    print('distances in kernel space:', dis_of_each_itr, '\n')
-
-    return dhat, g_best, nb_updated
-#    return 0, 0, 0
-
-
-if __name__ == '__main__':
-    from gklearn.utils.graphfiles import loadDataset
-
-#    ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
-#          'extra_params': {}} # node/edge symb
-    ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
-          'extra_params': {}} # node nsymb
-#    ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
-#          'extra_params': {}}
-#    ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
-#          'extra_params': {}} # node symb
-
-    DN, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-    #DN = DN[0:10]
-
-    lmbda = 0.03 # termination probability
-    r_max = 3 # 10 # iteration limit.
-    l = 500
-    alpha_range = np.linspace(0.5, 0.5, 1)
-    #alpha_range = np.linspace(0.1, 0.9, 9)
-    k = 10 # 5 # k nearest neighbors
-
-    # randomly select two molecules
-    #np.random.seed(1)
-    #idx1, idx2 = np.random.randint(0, len(DN), 2)
-    #g1 = DN[idx1]
-    #g2 = DN[idx2]
-    idx1 = 0
-    idx2 = 6
-    g1 = DN[idx1]
-    g2 = DN[idx2]
-
-    # compute
-    k_list = [] # kernel between each graph and itself.
-    k_g1_list = [] # kernel between each graph and g1
-    k_g2_list = [] # kernel between each graph and g2
-    for ig, g in tqdm(enumerate(DN), desc='computing self kernels', file=sys.stdout):
-        # ktemp = marginalizedkernel([g, g1, g2], node_label='atom', edge_label=None,
-        #                            p_quit=lmbda, n_iteration=20, remove_totters=False,
-        #                            n_jobs=multiprocessing.cpu_count(), verbose=False)
-        ktemp = compute_kernel([g, g1, g2], 'untilhpathkernel', verbose=False)
-        k_list.append(ktemp[0, 0])
-        k_g1_list.append(ktemp[0, 1])
-        k_g2_list.append(ktemp[0, 2])
-
-    g_best = []
-    dis_best = []
-    # for each alpha
-    for alpha in alpha_range:
-        print('alpha =', alpha)
-        # compute k nearest neighbors of phi in DN.
-        dis_list = [] # distance between g_star and each graph.
-        for ig, g in tqdm(enumerate(DN), desc='computing distances', file=sys.stdout):
-            dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
-                    k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
-                    (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
-                    k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
-            dis_list.append(np.sqrt(dtemp))
-
-        # sort
-        sort_idx = np.argsort(dis_list)
-        dis_gs = [dis_list[idis] for idis in sort_idx[0:k]]
-        g0hat = DN[sort_idx[0]] # the nearest neighbor of phi in DN
-        if dis_gs[0] == 0: # the exact pre-image.
-            print('The exact pre-image is found from the input dataset.')
-            g_pimg = g0hat
-            break
-        dhat = dis_gs[0] # the nearest distance
-        Dk = [DN[ig] for ig in sort_idx[0:k]] # the k nearest neighbors
-        gihat_list = []
-
-        i = 1
-        r = 1
-        while r < r_max:
-            print('r =', r)
-            found = False
-            for ig, gs in enumerate(Dk + gihat_list):
-                # nx.draw_networkx(gs)
-                # plt.show()
-                # @todo: what if the log is negative?
-                fdgs = int(np.abs(np.ceil(np.log(alpha * dis_gs[ig]))))
-                for trial in tqdm(range(0, l), desc='l loop', file=sys.stdout):
-                    # add and delete edges.
-                    gtemp = gs.copy()
-                    np.random.seed()
-                    # which edges to change.
-                    # @todo: should we use just half of the adjacency matrix for undirected graphs?
-                    nb_vpairs = nx.number_of_nodes(gs) * (nx.number_of_nodes(gs) - 1)
-                    # @todo: what if fdgs is bigger than nb_vpairs?
-                    idx_change = random.sample(range(nb_vpairs), fdgs if fdgs < nb_vpairs else nb_vpairs)
-                    # idx_change = np.random.randint(0, nx.number_of_nodes(gs) *
-                    #                                (nx.number_of_nodes(gs) - 1), fdgs)
-                    for item in idx_change:
-                        node1 = int(item / (nx.number_of_nodes(gs) - 1))
-                        node2 = (item - node1 * (nx.number_of_nodes(gs) - 1))
-                        if node2 >= node1: # skip the self pair.
-                            node2 += 1
-                        # @todo: is the randomness correct?
-                        if not gtemp.has_edge(node1, node2):
-                            # @todo: how to update the bond_type? 0 or 1?
-                            gtemp.add_edges_from([(node1, node2, {'bond_type': 1})])
-                            # nx.draw_networkx(gs)
-                            # plt.show()
-                            # nx.draw_networkx(gtemp)
-                            # plt.show()
-                        else:
-                            gtemp.remove_edge(node1, node2)
-                            # nx.draw_networkx(gs)
-                            # plt.show()
-                            # nx.draw_networkx(gtemp)
-                            # plt.show()
-                    # nx.draw_networkx(gtemp)
-                    # plt.show()
-
-                    # compute distance between phi and the new generated graph.
-                    # knew = marginalizedkernel([gtemp, g1, g2], node_label='atom', edge_label=None,
-                    #                           p_quit=lmbda, n_iteration=20, remove_totters=False,
-                    #                           n_jobs=multiprocessing.cpu_count(), verbose=False)
-                    knew = compute_kernel([gtemp, g1, g2], 'untilhpathkernel', verbose=False)
-                    dnew = np.sqrt(knew[0, 0] - 2 * (alpha * knew[0, 1] + (1 - alpha) *
-                                   knew[0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
-                                   (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
-                                   k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2]))
-                    if dnew < dhat: # @todo: the new distance is smaller or also equal?
-                        print('I am smaller!')
-                        print(dhat, '->', dnew)
-                        nx.draw_networkx(gtemp)
-                        plt.show()
-                        print(gtemp.nodes(data=True))
-                        print(gtemp.edges(data=True))
-                        dhat = dnew
-                        gnew = gtemp.copy()
-                        found = True # found better graph.
-                        r = 0
-                    elif dnew == dhat:
-                        print('I am equal!')
-            if found:
-                gihat_list = [gnew]
-                dis_gs.append(dhat)
-            else:
-                r += 1
-        dis_best.append(dhat)
-        g_best += ([g0hat] if len(gihat_list) == 0 else gihat_list)
-
-
-    for idx, item in enumerate(alpha_range):
-        print('when alpha is', item, 'the shortest distance is', dis_best[idx])
-        print('the corresponding pre-image is')
-        nx.draw_networkx(g_best[idx])
-        plt.show()
\ No newline at end of file
diff --git a/gklearn/preimage/python_code.py b/gklearn/preimage/python_code.py
deleted file mode 100644
index 3772526..0000000
--- a/gklearn/preimage/python_code.py
+++ /dev/null
@@ -1,122 +0,0 @@
-    elif opt_name == 'random-inits':
-        try:
-            num_random_inits_ = int(opt_val)
-            desired_num_random_inits_ = num_random_inits_
-
-        except ValueError:
-            raise Error('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits ]"')
-
-        if num_random_inits_ <= 0:
-            raise Error('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits ]"')
-
-    elif opt_name == 'randomness':
-        if opt_val == 'PSEUDO':
-            use_real_randomness_ = False
-
-        elif opt_val == 'REAL':
-            use_real_randomness_ = True
-
-        else:
-            raise Error('Invalid argument "' + opt_val + '" for option randomness. Usage: options = "[--randomness REAL|PSEUDO] [...]"')
-
-    elif opt_name == 'stdout':
-        if opt_val == '0':
-            print_to_stdout_ = 0
-
-        elif opt_val == '1':
-            print_to_stdout_ = 1
-
-        elif opt_val == '2':
-            print_to_stdout_ = 2
-
-        else:
-            raise Error('Invalid argument "' + opt_val + '" for option stdout.
Usage: options = "[--stdout 0|1|2] [...]"') - - } - elif opt_name == 'refine': - if opt_val == 'TRUE': - refine_ = True - - elif opt_val == 'FALSE': - refine_ = False - - else: - raise Error('Invalid argument "' + opt_val + '" for option refine. Usage: options = "[--refine TRUE|FALSE] [...]"') - - } - elif opt_name == 'time-limit': - try: - time_limit_in_sec_ = std::stod(opt_val) - - except: - raise Error('Invalid argument "' + opt_val + '" for option time-limit. Usage: options = "[--time-limit ] [...]') - - } - elif opt_name == 'max-itrs': - try: - max_itrs_ = std::stoi(opt_val) - - except: - raise Error('Invalid argument "' + opt_val + '" for option max-itrs. Usage: options = "[--max-itrs ] [...]') - - } - elif opt_name == 'max-itrs-without-update': - try: - max_itrs_without_update_ = std::stoi(opt_val) - - except: - raise Error('Invalid argument "' + opt_val + '" for option max-itrs-without-update. Usage: options = "[--max-itrs-without-update ] [...]') - - } - elif opt_name == 'seed': - try: - seed_ = std::stoul(opt_val) - - except: - raise Error('Invalid argument "' + opt_val + '" for option seed. Usage: options = "[--seed ] [...]') - - } - elif opt_name == 'epsilon': - try: - epsilon_ = std::stod(opt_val) - - except: - raise Error('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon ] [...]') - - if epsilon_ <= 0: - raise Error('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon ] [...]') - - } - elif opt_name == 'inits-increase-order': - try: - num_inits_increase_order_ = std::stoul(opt_val) - - except: - raise Error('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order ]"') - - if num_inits_increase_order_ <= 0: - raise Error('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order ]"') - - } - elif opt_name == 'init-type-increase-order': - init_type_increase_order_ = opt_val - if opt_val != 'CLUSTERS' and opt_val != 'K-MEANS++': - raise Exception(std::string('Invalid argument ') + opt_val + ' for option init-type-increase-order. Usage: options = "[--init-type-increase-order CLUSTERS|K-MEANS++] [...]"') - - } - elif opt_name == 'max-itrs-increase-order': - try: - max_itrs_increase_order_ = std::stoi(opt_val) - - except: - raise Error('Invalid argument "' + opt_val + '" for option max-itrs-increase-order. Usage: options = "[--max-itrs-increase-order ] [...]') - - } - else: - std::string valid_options('[--init-type ] [--random-inits ] [--randomness ] [--seed ] [--stdout ] ') - valid_options += '[--time-limit ] [--max-itrs ] [--epsilon ] ' - valid_options += '[--inits-increase-order ] [--init-type-increase-order ] [--max-itrs-increase-order ]' - raise Error(std::string('Invalid option "') + opt_name + '". 
Usage: options = "' + valid_options + '"') - diff --git a/gklearn/preimage/test.py b/gklearn/preimage/test.py deleted file mode 100644 index 4110a6f..0000000 --- a/gklearn/preimage/test.py +++ /dev/null @@ -1,83 +0,0 @@ -#export LD_LIBRARY_PATH=.:/export/home/lambertn/Documents/gedlibpy/lib/fann/:/export/home/lambertn/Documents/gedlibpy/lib/libsvm.3.22:/export/home/lambertn/Documents/gedlibpy/lib/nomad - -#Pour que "import script" trouve les librairies qu'a besoin GedLib -#Equivalent à définir la variable d'environnement LD_LIBRARY_PATH sur un bash -import gedlibpy.librariesImport -from gedlibpy import gedlibpy -import networkx as nx - - -def init() : - print("List of Edit Cost Options : ") - for i in gedlibpy.list_of_edit_cost_options : - print (i) - print("") - - print("List of Method Options : ") - for j in gedlibpy.list_of_method_options : - print (j) - print("") - - print("List of Init Options : ") - for k in gedlibpy.list_of_init_options : - print (k) - print("") - -def test(): - - gedlibpy.load_GXL_graphs('include/gedlib-master/data/datasets/Mutagenicity/data/', 'collections/MUTA_10.xml') - listID = gedlibpy.get_all_graph_ids() - gedlibpy.set_edit_cost("CHEM_1") - gedlibpy.init() - gedlibpy.set_method("IPFP", "") - gedlibpy.init_method() - g = listID[0] - h = listID[1] - gedlibpy.run_method(g, h) - print("Node Map : ", gedlibpy.get_node_map(g,h)) - print("Forward map : " , gedlibpy.get_forward_map(g, h), ", Backward map : ", gedlibpy.get_backward_map(g, h)) - print("Assignment Matrix : ") - print(gedlibpy.get_assignment_matrix(g, h)) - print ("Upper Bound = " + str(gedlibpy.get_upper_bound(g,h)) + ", Lower Bound = " + str(gedlibpy.get_lower_bound(g, h)) + ", Runtime = " + str(gedlibpy.get_runtime(g, h))) - - -def convertGraph(G): - G_new = nx.Graph() - for nd, attrs in G.nodes(data=True): - G_new.add_node(str(nd), chem=attrs['atom']) - for nd1, nd2, attrs in G.edges(data=True): - G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) - - return G_new - - -def testNxGrapĥ(): - from gklearn.utils.graphfiles import loadDataset - ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', - 'extra_params': {}} # node/edge symb - Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) - - gedlibpy.restart_env() - for graph in Gn: - g_new = convertGraph(graph) - gedlibpy.add_nx_graph(g_new, "") - - listID = gedlibpy.get_all_graph_ids() - gedlibpy.set_edit_cost("CHEM_1") - gedlibpy.init() - gedlibpy.set_method("IPFP", "") - gedlibpy.init_method() - - print(listID) - g = listID[0] - h = listID[1] - - gedlibpy.run_method(g, h) - - print("Node Map : ", gedlibpy.get_node_map(g, h)) - print("Forward map : " , gedlibpy.get_forward_map(g, h), ", Backward map : ", gedlibpy.get_backward_map(g, h)) - print ("Upper Bound = " + str(gedlibpy.get_upper_bound(g, h)) + ", Lower Bound = " + str(gedlibpy.get_lower_bound(g, h)) + ", Runtime = " + str(gedlibpy.get_runtime(g, h))) - -#test() -init() -#testNxGrapĥ() diff --git a/gklearn/preimage/test_fitDistance.py b/gklearn/preimage/test_fitDistance.py deleted file mode 100644 index 2945a24..0000000 --- a/gklearn/preimage/test_fitDistance.py +++ /dev/null @@ -1,648 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Thu Oct 24 11:50:56 2019 - -@author: ljia -""" -from matplotlib import pyplot as plt -import numpy as np -from tqdm import tqdm - -from gklearn.utils.graphfiles import loadDataset -from gklearn.preimage.utils import remove_edges -from gklearn.preimage.fitDistance import fit_GED_to_kernel_distance 
-from gklearn.preimage.utils import normalize_distance_matrix
-import os
-import sys
-
-
-def test_update_costs():
-    from gklearn.preimage.fitDistance import update_costs
-    import cvxpy as cp
-
-    ds = np.load('results/xp_fit_method/fit_data_debug4.gm.npz')
-    nb_cost_mat = ds['nb_cost_mat']
-    dis_k_vec = ds['dis_k_vec']
-    n_edit_operations = ds['n_edit_operations']
-    ged_vec_init = ds['ged_vec_init']
-    ged_mat = ds['ged_mat']
-
-    nb_cost_mat_new = nb_cost_mat[:, [2, 3, 4]]
-    x = cp.Variable(nb_cost_mat_new.shape[1])
-    cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
-#    constraints = [x >= [0.000 for i in range(nb_cost_mat_new.shape[1])],
-#                   np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
-#    constraints = [x >= [0.000 for i in range(nb_cost_mat_new.shape[1])],
-#                   np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0,
-#                   np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
-    constraints = [x >= [0.00 for i in range(nb_cost_mat_new.shape[1])],
-                   np.array([0.0, 1.0, -1.0]).T@x == 0.0]
-#    constraints = [x >= [0.00000 for i in range(nb_cost_mat_new.shape[1])]]
-    prob = cp.Problem(cp.Minimize(cost_fun), constraints)
-    prob.solve()
-    print(x.value)
-    edit_costs_new = np.concatenate((x.value, np.array([0.0])))
-    residual = np.sqrt(prob.value)
-
-
-def median_paper_clcpc_python_best():
-    """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er, with GED computation
-    performed via the gedlibpy module (with updated library).
-    """
-#    ds = {'name': 'monoterpenoides',
-#          'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
-#    _, y_all = loadDataset(ds['dataset'])
-    gkernel = 'untilhpathkernel'
-    node_label = 'atom'
-    edge_label = 'bond_type'
-    itr_max = 6
-    algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
-    params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
-                  'algo_options': algo_options, 'stabilizer': None}
-
-    y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
-    repeats = 50
-    collection_path = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/'
-    graph_dir = collection_path + 'gxl/'
-
-    fn_edit_costs_output = 'results/median_paper/edit_costs_output.python_init40.k10.txt'
-
-    for y in y_all:
-        for repeat in range(repeats):
-            edit_costs_output_file = open(fn_edit_costs_output, 'a')
-            collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
-            Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
-            edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
-                nb_cost_mat_list = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
-                                                              gkernel, itr_max, params_ged=params_ged,
-                                                              parallel=True)
-            total_time = np.sum(time_list)
-#            print('\nedit_costs:', edit_costs)
-#            print('\nresidual_list:', residual_list)
-#            print('\nedit_cost_list:', edit_cost_list)
-#            print('\ndistance matrix in kernel space:', dis_k_mat)
-#            print('\nged matrix:', ged_mat)
-#            print('\ntotal time:', total_time)
-#            print('\nnb_cost_mat:', nb_cost_mat_list[-1])
-            np.savez('results/median_paper/fit_distance.clcpc.python_init40.monot.elabeled.uhpkernel.y'
-                     + y + '.repeat' + str(repeat) + '.k10..gm',
-                     edit_costs=edit_costs,
-                     residual_list=residual_list, edit_cost_list=edit_cost_list,
-                     dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
-                     total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
-
-            for ec in edit_costs:
-                edit_costs_output_file.write(str(ec) + ' ')
-            edit_costs_output_file.write('\n')
-            edit_costs_output_file.close()
-
-
-#    # normalized distance matrices.
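-    # An inactive, self-contained sketch (synthetic data, not the original fit)
-    # of the constraint named in the docstring above, c_vs <= c_vi + c_vr and
-    # c_es <= c_ei + c_er, over the cost vector [c_vi, c_vr, c_vs, c_ei, c_er, c_es]:
-#    import cvxpy as cp
-#    nb_ops = np.random.rand(100, 6) # synthetic counts of the 6 edit operations.
-#    d_target = np.random.rand(100) # synthetic target distances.
-#    x = cp.Variable(6)
-#    constraints = [x >= 0.0,
-#                   np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]) @ x >= 0.0, # c_vi + c_vr - c_vs >= 0
-#                   np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]) @ x >= 0.0] # c_ei + c_er - c_es >= 0
-#    prob = cp.Problem(cp.Minimize(cp.sum_squares(nb_ops @ x - d_target)), constraints)
-#    prob.solve()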
-# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz') -# edit_costs = gmfile['edit_costs'] -# residual_list = gmfile['residual_list'] -# edit_cost_list = gmfile['edit_cost_list'] -# dis_k_mat = gmfile['dis_k_mat'] -# ged_mat = gmfile['ged_mat'] -# total_time = gmfile['total_time'] -# nb_cost_mat_list = gmfile['nb_cost_mat_list'] - - nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat) - print(nb_consistent, nb_inconsistent, ratio_consistent) - -# norm_dis_k_mat = normalize_distance_matrix(dis_k_mat) -# plt.imshow(norm_dis_k_mat) -# plt.colorbar() -# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y' -# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300) -# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y' -# + y + '.repeat' + str(repeat) + '.png', format='png') -# # plt.show() -# plt.clf() -# -# norm_ged_mat = normalize_distance_matrix(ged_mat) -# plt.imshow(norm_ged_mat) -# plt.colorbar() -# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y' -# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300) -# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y' -# + y + '.repeat' + str(repeat) + '.png', format='png') -# # plt.show() -# plt.clf() -# -# norm_diff = norm_ged_mat - norm_dis_k_mat -# plt.imshow(norm_diff) -# plt.colorbar() -# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y' -# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300) -# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y' -# + y + '.repeat' + str(repeat) + '.png', format='png') -# # plt.show() -# plt.clf() -# # draw_count_bar(norm_diff) - - -def median_paper_clcpc_python_bash_cpp(): - """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with - python invoking the c++ code by bash command (with updated library). 
- """ -# ds = {'name': 'monoterpenoides', -# 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb -# _, y_all = loadDataset(ds['dataset']) - gkernel = 'untilhpathkernel' - node_label = 'atom' - edge_label = 'bond_type' - itr_max = 20 - algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5' - params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP', - 'algo_options': algo_options} - - y_all = ['3', '1', '4', '6', '7', '8', '9', '2'] - repeats = 50 - collection_path = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/' - graph_dir = collection_path + 'gxl/' - - fn_edit_costs_output = 'results/median_paper/edit_costs_output.txt' - - for y in y_all: - for repeat in range(repeats): - edit_costs_output_file = open(fn_edit_costs_output, 'a') - collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml' - Gn, _ = loadDataset(collection_file, extra_params=graph_dir) - edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \ - nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label, - gkernel, itr_max, params_ged=params_ged, - parallel=False) - total_time = np.sum(time_list) -# print('\nedit_costs:', edit_costs) -# print('\nresidual_list:', residual_list) -# print('\nedit_cost_list:', edit_cost_list) -# print('\ndistance matrix in kernel space:', dis_k_mat) -# print('\nged matrix:', ged_mat) -# print('\ntotal time:', total_time) -# print('\nnb_cost_mat:', nb_cost_mat_list[-1]) - np.savez('results/median_paper/fit_distance.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' - + y + '.repeat' + str(repeat) + '.gm', - edit_costs=edit_costs, - residual_list=residual_list, edit_cost_list=edit_cost_list, - dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, - total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, - coef_dk=coef_dk) - - for ec in edit_costs: - edit_costs_output_file.write(str(ec) + ' ') - edit_costs_output_file.write('\n') - edit_costs_output_file.close() - - -# # normalized distance matrices. 
-# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz') -# edit_costs = gmfile['edit_costs'] -# residual_list = gmfile['residual_list'] -# edit_cost_list = gmfile['edit_cost_list'] -# dis_k_mat = gmfile['dis_k_mat'] -# ged_mat = gmfile['ged_mat'] -# total_time = gmfile['total_time'] -# nb_cost_mat_list = gmfile['nb_cost_mat_list'] -# coef_dk = gmfile['coef_dk'] - - nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat) - print(nb_consistent, nb_inconsistent, ratio_consistent) - -# norm_dis_k_mat = normalize_distance_matrix(dis_k_mat) -# plt.imshow(norm_dis_k_mat) -# plt.colorbar() -# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' -# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300) -# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' -# + y + '.repeat' + str(repeat) + '.png', format='png') -# # plt.show() -# plt.clf() -# -# norm_ged_mat = normalize_distance_matrix(ged_mat) -# plt.imshow(norm_ged_mat) -# plt.colorbar() -# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' -# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300) -# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' -# + y + '.repeat' + str(repeat) + '.png', format='png') -# # plt.show() -# plt.clf() -# -# norm_diff = norm_ged_mat - norm_dis_k_mat -# plt.imshow(norm_diff) -# plt.colorbar() -# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' -# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300) -# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' -# + y + '.repeat' + str(repeat) + '.png', format='png') -# # plt.show() -# plt.clf() -# # draw_count_bar(norm_diff) - - - - - -def test_cs_leq_ci_plus_cr_python_bash_cpp(): - """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with - python invoking the c++ code by bash command (with updated library). 
- """ - ds = {'name': 'monoterpenoides', - 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb - Gn, y_all = loadDataset(ds['dataset']) -# Gn = Gn[0:10] - gkernel = 'untilhpathkernel' - node_label = 'atom' - edge_label = 'bond_type' - itr_max = 10 - algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5' - params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP', - 'algo_options': algo_options} - edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \ - nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label, - gkernel, itr_max, params_ged=params_ged, - parallel=False) - total_time = np.sum(time_list) - print('\nedit_costs:', edit_costs) - print('\nresidual_list:', residual_list) - print('\nedit_cost_list:', edit_cost_list) - print('\ndistance matrix in kernel space:', dis_k_mat) - print('\nged matrix:', ged_mat) - print('\ntotal time:', total_time) - print('\nnb_cost_mat:', nb_cost_mat_list[-1]) - np.savez('results/fit_distance.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel.gm', - edit_costs=edit_costs, - residual_list=residual_list, edit_cost_list=edit_cost_list, - dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, - total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, - coef_dk=coef_dk) - -# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', -# 'extra_params': {}} # node/edge symb -# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) -## Gn = Gn[0:10] -## remove_edges(Gn) -# gkernel = 'untilhpathkernel' -# node_label = 'atom' -# edge_label = 'bond_type' -# itr_max = 10 -# edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \ -# nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label, -# gkernel, itr_max) -# total_time = np.sum(time_list) -# print('\nedit_costs:', edit_costs) -# print('\nresidual_list:', residual_list) -# print('\nedit_cost_list:', edit_cost_list) -# print('\ndistance matrix in kernel space:', dis_k_mat) -# print('\nged matrix:', ged_mat) -# print('\ntotal time:', total_time) -# print('\nnb_cost_mat:', nb_cost_mat_list[-1]) -# np.savez('results/fit_distance.cs_leq_ci_plus_cr.mutag.elabeled.uhpkernel.gm', -# edit_costs=edit_costs, -# residual_list=residual_list, edit_cost_list=edit_cost_list, -# dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, -# total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk) - - -# # normalized distance matrices. 
-# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.monot.elabeled.uhpkernel.gm.npz') -# edit_costs = gmfile['edit_costs'] -# residual_list = gmfile['residual_list'] -# edit_cost_list = gmfile['edit_cost_list'] -# dis_k_mat = gmfile['dis_k_mat'] -# ged_mat = gmfile['ged_mat'] -# total_time = gmfile['total_time'] -# nb_cost_mat_list = gmfile['nb_cost_mat_list'] -# coef_dk = gmfile['coef_dk'] - - nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat) - print(nb_consistent, nb_inconsistent, ratio_consistent) - -# dis_k_sub = pairwise_substitution(dis_k_mat) -# ged_sub = pairwise_substitution(ged_mat) -# np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.gm', -# dis_k_sub=dis_k_sub, ged_sub=ged_sub) - - - norm_dis_k_mat = normalize_distance_matrix(dis_k_mat) - plt.imshow(norm_dis_k_mat) - plt.colorbar() - plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel' - + '.eps', format='eps', dpi=300) - plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel' - + '.png', format='png') -# plt.show() - plt.clf() - - norm_ged_mat = normalize_distance_matrix(ged_mat) - plt.imshow(norm_ged_mat) - plt.colorbar() - plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel' - + '.eps', format='eps', dpi=300) - plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel' - + '.png', format='png') -# plt.show() - plt.clf() - - norm_diff = norm_ged_mat - norm_dis_k_mat - plt.imshow(norm_diff) - plt.colorbar() - plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel' - + '.eps', format='eps', dpi=300) - plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel' - + '.png', format='png') -# plt.show() - plt.clf() -# draw_count_bar(norm_diff) - - -def test_anycosts(): - ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', - 'extra_params': {}} # node/edge symb - Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) -# Gn = Gn[0:10] - remove_edges(Gn) - gkernel = 'marginalizedkernel' - itr_max = 10 - edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \ - nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, gkernel, itr_max) - total_time = np.sum(time_list) - print('\nedit_costs:', edit_costs) - print('\nresidual_list:', residual_list) - print('\nedit_cost_list:', edit_cost_list) - print('\ndistance matrix in kernel space:', dis_k_mat) - print('\nged matrix:', ged_mat) - print('\ntotal time:', total_time) - print('\nnb_cost_mat:', nb_cost_mat_list[-1]) - np.savez('results/fit_distance.any_costs.gm', edit_costs=edit_costs, - residual_list=residual_list, edit_cost_list=edit_cost_list, - dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, - total_time=total_time, nb_cost_mat_list=nb_cost_mat_list) - -# # normalized distance matrices. 
-# gmfile = np.load('results/fit_distance.any_costs.gm.npz') -# edit_costs = gmfile['edit_costs'] -# residual_list = gmfile['residual_list'] -# edit_cost_list = gmfile['edit_cost_list'] -# dis_k_mat = gmfile['dis_k_mat'] -# ged_mat = gmfile['ged_mat'] -# total_time = gmfile['total_time'] -## nb_cost_mat_list = gmfile['nb_cost_mat_list'] - - norm_dis_k_mat = normalize_distance_matrix(dis_k_mat) - plt.imshow(norm_dis_k_mat) - plt.colorbar() - plt.savefig('results/norm_dis_k_mat.any_costs' + '.eps', format='eps', dpi=300) -# plt.savefig('results/norm_dis_k_mat.any_costs' + '.png', format='png') -# plt.show() - plt.clf() - - norm_ged_mat = normalize_distance_matrix(ged_mat) - plt.imshow(norm_ged_mat) - plt.colorbar() - plt.savefig('results/norm_ged_mat.any_costs' + '.eps', format='eps', dpi=300) -# plt.savefig('results/norm_ged_mat.any_costs' + '.png', format='png') -# plt.show() - plt.clf() - - norm_diff = norm_ged_mat - norm_dis_k_mat - plt.imshow(norm_diff) - plt.colorbar() - plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.eps', format='eps', dpi=300) -# plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.png', format='png') -# plt.show() - plt.clf() -# draw_count_bar(norm_diff) - - -def test_cs_leq_ci_plus_cr(): - """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er - """ - ds = {'name': 'monoterpenoides', - 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb - Gn, y_all = loadDataset(ds['dataset']) -# Gn = Gn[0:10] - gkernel = 'untilhpathkernel' - node_label = 'atom' - edge_label = 'bond_type' - itr_max = 10 - edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \ - nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label, - gkernel, itr_max, - fitkernel='gaussian') - total_time = np.sum(time_list) - print('\nedit_costs:', edit_costs) - print('\nresidual_list:', residual_list) - print('\nedit_cost_list:', edit_cost_list) - print('\ndistance matrix in kernel space:', dis_k_mat) - print('\nged matrix:', ged_mat) - print('\ntotal time:', total_time) - print('\nnb_cost_mat:', nb_cost_mat_list[-1]) - np.savez('results/fit_distance.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel.gm', - edit_costs=edit_costs, - residual_list=residual_list, edit_cost_list=edit_cost_list, - dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, - total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, - coef_dk=coef_dk) - -# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', -# 'extra_params': {}} # node/edge symb -# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) -## Gn = Gn[0:10] -## remove_edges(Gn) -# gkernel = 'untilhpathkernel' -# node_label = 'atom' -# edge_label = 'bond_type' -# itr_max = 10 -# edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \ -# nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label, -# gkernel, itr_max) -# total_time = np.sum(time_list) -# print('\nedit_costs:', edit_costs) -# print('\nresidual_list:', residual_list) -# print('\nedit_cost_list:', edit_cost_list) -# print('\ndistance matrix in kernel space:', dis_k_mat) -# print('\nged matrix:', ged_mat) -# print('\ntotal time:', total_time) -# print('\nnb_cost_mat:', nb_cost_mat_list[-1]) -# np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.mutag.elabeled.uhpkernel.gm', -# edit_costs=edit_costs, -# residual_list=residual_list, edit_cost_list=edit_cost_list, -# dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, -# 
total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk) - - -# # normalized distance matrices. -# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz') -# edit_costs = gmfile['edit_costs'] -# residual_list = gmfile['residual_list'] -# edit_cost_list = gmfile['edit_cost_list'] -# dis_k_mat = gmfile['dis_k_mat'] -# ged_mat = gmfile['ged_mat'] -# total_time = gmfile['total_time'] -# nb_cost_mat_list = gmfile['nb_cost_mat_list'] -# coef_dk = gmfile['coef_dk'] - - nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat) - print(nb_consistent, nb_inconsistent, ratio_consistent) - -# dis_k_sub = pairwise_substitution(dis_k_mat) -# ged_sub = pairwise_substitution(ged_mat) -# np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.cost_leq_1en2.gm', -# dis_k_sub=dis_k_sub, ged_sub=ged_sub) - - - norm_dis_k_mat = normalize_distance_matrix(dis_k_mat) - plt.imshow(norm_dis_k_mat) - plt.colorbar() - plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel' - + '.eps', format='eps', dpi=300) - plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel' - + '.png', format='png') -# plt.show() - plt.clf() - - norm_ged_mat = normalize_distance_matrix(ged_mat) - plt.imshow(norm_ged_mat) - plt.colorbar() - plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel' - + '.eps', format='eps', dpi=300) - plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel' - + '.png', format='png') -# plt.show() - plt.clf() - - norm_diff = norm_ged_mat - norm_dis_k_mat - plt.imshow(norm_diff) - plt.colorbar() - plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel' - + '.eps', format='eps', dpi=300) - plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel' - + '.png', format='png') -# plt.show() - plt.clf() -# draw_count_bar(norm_diff) - - -def test_unfitted(): - """unfitted. - """ - from fitDistance import compute_geds - from utils import kernel_distance_matrix - ds = {'name': 'monoterpenoides', - 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb - Gn, y_all = loadDataset(ds['dataset']) -# Gn = Gn[0:10] - gkernel = 'untilhpathkernel' - node_label = 'atom' - edge_label = 'bond_type' - - -# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', -# 'extra_params': {}} # node/edge symb -# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) -## Gn = Gn[0:10] -## remove_edges(Gn) -# gkernel = 'marginalizedkernel' - - dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, gkernel=gkernel) - ged_all, ged_mat, n_edit_operations = compute_geds(Gn, [3, 3, 1, 3, 3, 1], - [0, 1, 2, 3, 4, 5], parallel=True) - print('\ndistance matrix in kernel space:', dis_k_mat) - print('\nged matrix:', ged_mat) -# np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.gm', edit_costs=edit_costs, -# residual_list=residual_list, edit_cost_list=edit_cost_list, -# dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, -# total_time=total_time, nb_cost_mat_list=nb_cost_mat_list) - - # normalized distance matrices. 
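-    # An inactive, self-contained illustration of the consistence measure used
-    # below: for two symmetric distance matrices, compare the relative order of
-    # every pair of upper-triangle entries and report the fraction of agreements.
-#    m1 = np.array([[0., 1., 2.], [1., 0., 3.], [2., 3., 0.]])
-#    m2 = np.array([[0., 2., 1.], [2., 0., 4.], [1., 4., 0.]])
-#    print(pairwise_substitution_consistence(m1, m2)) # -> (20, 1, 0.952...)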
-#    gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en3.gm.npz')
-#    edit_costs = gmfile['edit_costs']
-#    residual_list = gmfile['residual_list']
-#    edit_cost_list = gmfile['edit_cost_list']
-#    dis_k_mat = gmfile['dis_k_mat']
-#    ged_mat = gmfile['ged_mat']
-#    total_time = gmfile['total_time']
-#    nb_cost_mat_list = gmfile['nb_cost_mat_list']
-
-    nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
-    print(nb_consistent, nb_inconsistent, ratio_consistent)
-
-    norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
-    plt.imshow(norm_dis_k_mat)
-    plt.colorbar()
-    plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
-    plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.png', format='png')
-#    plt.show()
-    plt.clf()
-
-    norm_ged_mat = normalize_distance_matrix(ged_mat)
-    plt.imshow(norm_ged_mat)
-    plt.colorbar()
-    plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
-    plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.png', format='png')
-#    plt.show()
-    plt.clf()
-
-    norm_diff = norm_ged_mat - norm_dis_k_mat
-    plt.imshow(norm_diff)
-    plt.colorbar()
-    plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
-    plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.png', format='png')
-#    plt.show()
-    plt.clf()
-    draw_count_bar(norm_diff)
-
-
-def pairwise_substitution_consistence(mat1, mat2):
-    """Count the pairs of upper-triangle entries whose relative order agrees
-    between two symmetric distance matrices, and return the counts of
-    consistent and inconsistent pairs and the ratio of consistent ones.
-    """
-    nb_consistent = 0
-    nb_inconsistent = 0
-    # the matrices are considered symmetric.
-    upper_tri1 = mat1[np.triu_indices_from(mat1)]
-    upper_tri2 = mat2[np.triu_indices_from(mat2)] # use the upper triangle of both matrices so entries stay aligned.
-    for i in tqdm(range(len(upper_tri1)), desc='computing consistence', file=sys.stdout):
-        for j in range(i, len(upper_tri1)):
-            if np.sign(upper_tri1[i] - upper_tri1[j]) == np.sign(upper_tri2[i] - upper_tri2[j]):
-                nb_consistent += 1
-            else:
-                nb_inconsistent += 1
-    return nb_consistent, nb_inconsistent, nb_consistent / (nb_consistent + nb_inconsistent)
-
-
-def pairwise_substitution(mat):
-    # the matrix is considered symmetric.
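-    # np.triu_indices_from enumerates the upper triangle row by row (for a
-    # 3x3 matrix: (0, 0), (0, 1), (0, 2), (1, 1), (1, 2), (2, 2)), so
-    # extracting with triu from every matrix keeps entries of the same
-    # (i, j) pair aligned across matrices.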
- upper_tri = mat[np.triu_indices_from(mat)] - sub_list = [] - for i in tqdm(range(len(upper_tri)), desc='computing', file=sys.stdout): - for j in range(i, len(upper_tri)): - sub_list.append(upper_tri[i] - upper_tri[j]) - return sub_list - - -def draw_count_bar(norm_diff): - import pandas - from collections import Counter, OrderedDict - norm_diff_cnt = norm_diff.flatten() - norm_diff_cnt = norm_diff_cnt * 10 - norm_diff_cnt = np.floor(norm_diff_cnt) - norm_diff_cnt = Counter(norm_diff_cnt) - norm_diff_cnt = OrderedDict(sorted(norm_diff_cnt.items())) - df = pandas.DataFrame.from_dict(norm_diff_cnt, orient='index') - df.plot(kind='bar') - - -if __name__ == '__main__': -# test_anycosts() -# test_cs_leq_ci_plus_cr() -# test_unfitted() - -# test_cs_leq_ci_plus_cr_python_bash_cpp() -# median_paper_clcpc_python_bash_cpp() -# median_paper_clcpc_python_best() - -# x = np.array([[1,2,3],[4,5,6],[7,8,9]]) -# xx = pairwise_substitution(x) - - test_update_costs() \ No newline at end of file diff --git a/gklearn/preimage/test_ged.py b/gklearn/preimage/test_ged.py deleted file mode 100644 index 74e18a0..0000000 --- a/gklearn/preimage/test_ged.py +++ /dev/null @@ -1,520 +0,0 @@ -#export LD_LIBRARY_PATH=.:/export/home/lambertn/Documents/gedlibpy/lib/fann/:/export/home/lambertn/Documents/gedlibpy/lib/libsvm.3.22:/export/home/lambertn/Documents/gedlibpy/lib/nomad - -#Pour que "import script" trouve les librairies qu'a besoin GedLib -#Equivalent à définir la variable d'environnement LD_LIBRARY_PATH sur un bash -#import gedlibpy_linlin.librariesImport -#from gedlibpy_linlin import gedlibpy -from libs import * -import networkx as nx -import numpy as np -from tqdm import tqdm -import sys - - -def test_NON_SYMBOLIC_cost(): - """Test edit cost LETTER2. - """ - from gklearn.preimage.ged import GED, get_nb_edit_operations_nonsymbolic, get_nb_edit_operations_letter - from gklearn.preimage.test_k_closest_graphs import reform_attributes - from gklearn.utils.graphfiles import loadDataset - - dataset = '../../datasets/Letter-high/Letter-high_A.txt' - Gn, y_all = loadDataset(dataset) - - g1 = Gn[200] - g2 = Gn[1780] - reform_attributes(g1) - reform_attributes(g2) - - c_vi = 0.675 - c_vr = 0.675 - c_vs = 0.75 - c_ei = 0.425 - c_er = 0.425 - c_es = 0 - - edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] - dis, pi_forward, pi_backward = GED(g1, g2, lib='gedlibpy', - cost='NON_SYMBOLIC', method='IPFP', edit_cost_constant=edit_cost_constant, - algo_options='', stabilizer=None) - n_vi, n_vr, sod_vs, n_ei, n_er, sod_es = get_nb_edit_operations_nonsymbolic(g1, g2, - pi_forward, pi_backward) - - print('# of operations:', n_vi, n_vr, sod_vs, n_ei, n_er, sod_es) - print('c_vi, c_vr, c_vs, c_ei, c_er:', c_vi, c_vr, c_vs, c_ei, c_er, c_es) - cost_computed = c_vi * n_vi + c_vr * n_vr + c_vs * sod_vs \ - + c_ei * n_ei + c_er * n_er + c_es * sod_es - print('dis (cost computed by GED):', dis) - print('cost computed by # of operations and edit cost constants:', cost_computed) - - -def test_LETTER2_cost(): - """Test edit cost LETTER2. 
- """ - from gklearn.preimage.ged import GED, get_nb_edit_operations_letter - from gklearn.preimage.test_k_closest_graphs import reform_attributes - from gklearn.utils.graphfiles import loadDataset - - ds = {'dataset': 'cpp_ext/data/collections/Letter.xml', - 'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'} # node/edge symb - Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir']) - - g1 = Gn[200] - g2 = Gn[1780] - reform_attributes(g1) - reform_attributes(g2) - - c_vi = 0.675 - c_vr = 0.675 - c_vs = 0.75 - c_ei = 0.425 - c_er = 0.425 - - edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er] - dis, pi_forward, pi_backward = GED(g1, g2, dataset='letter', lib='gedlibpy', - cost='LETTER2', method='IPFP', edit_cost_constant=edit_cost_constant, - algo_options='', stabilizer=None) - n_vi, n_vr, n_vs, sod_vs, n_ei, n_er = get_nb_edit_operations_letter(g1, g2, - pi_forward, pi_backward) - - print('# of operations:', n_vi, n_vr, n_vs, sod_vs, n_ei, n_er) - print('c_vi, c_vr, c_vs, c_ei, c_er:', c_vi, c_vr, c_vs, c_ei, c_er) - cost_computed = c_vi * n_vi + c_vr * n_vr + c_vs * sod_vs \ - + c_ei * n_ei + c_er * n_er - print('dis (cost computed by GED):', dis) - print('cost computed by # of operations and edit cost constants:', cost_computed) - - - -def test_get_nb_edit_operations_letter(): - """Test whether function preimage.ged.get_nb_edit_operations_letter returns - correct numbers of edit operations. The distance/cost computed by GED - should be the same as the cost computed by number of operations and edit - cost constants. - """ - from gklearn.preimage.ged import GED, get_nb_edit_operations_letter - from gklearn.preimage.test_k_closest_graphs import reform_attributes - from gklearn.utils.graphfiles import loadDataset - - ds = {'dataset': 'cpp_ext/data/collections/Letter.xml', - 'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'} # node/edge symb - Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir']) - - g1 = Gn[200] - g2 = Gn[1780] - reform_attributes(g1) - reform_attributes(g2) - - c_vir = 0.9 - c_eir = 1.7 - alpha = 0.75 - - edit_cost_constant = [c_vir, c_eir, alpha] - dis, pi_forward, pi_backward = GED(g1, g2, dataset='letter', lib='gedlibpy', - cost='LETTER', method='IPFP', edit_cost_constant=edit_cost_constant, - algo_options='', stabilizer=None) - n_vi, n_vr, n_vs, c_vs, n_ei, n_er = get_nb_edit_operations_letter(g1, g2, - pi_forward, pi_backward) - - print('# of operations and costs:', n_vi, n_vr, n_vs, c_vs, n_ei, n_er) - print('c_vir, c_eir, alpha:', c_vir, c_eir, alpha) - cost_computed = alpha * c_vir * (n_vi + n_vr) \ - + alpha * c_vs \ - + (1 - alpha) * c_eir * (n_ei + n_er) - print('dis (cost computed by GED):', dis) - print('cost computed by # of operations and edit cost constants:', cost_computed) - - -def test_get_nb_edit_operations(): - """Test whether function preimage.ged.get_nb_edit_operations returns correct - numbers of edit operations. The distance/cost computed by GED should be the - same as the cost computed by number of operations and edit cost constants. 
- """ - from gklearn.preimage.ged import GED, get_nb_edit_operations - from gklearn.utils.graphfiles import loadDataset - import os - - ds = {'dataset': '../../datasets/monoterpenoides/dataset_10+.ds', - 'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '../../datasets/monoterpenoides/'} # node/edge symb - Gn, y_all = loadDataset(ds['dataset']) - - g1 = Gn[20] - g2 = Gn[108] - - c_vi = 3 - c_vr = 3 - c_vs = 1 - c_ei = 3 - c_er = 3 - c_es = 1 - - edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] - dis, pi_forward, pi_backward = GED(g1, g2, dataset='monoterpenoides', lib='gedlibpy', - cost='CONSTANT', method='IPFP', edit_cost_constant=edit_cost_constant, - algo_options='', stabilizer=None) - n_vi, n_vr, n_vs, n_ei, n_er, n_es = get_nb_edit_operations(g1, g2, - pi_forward, pi_backward) - - print('# of operations and costs:', n_vi, n_vr, n_vs, n_ei, n_er, n_es) - print('edit costs:', c_vi, c_vr, c_vs, c_ei, c_er, c_es) - cost_computed = n_vi * c_vi + n_vr * c_vr + n_vs * c_vs \ - + n_ei * c_ei + n_er * c_er + n_es * c_es - print('dis (cost computed by GED):', dis) - print('cost computed by # of operations and edit cost constants:', cost_computed) - - -def test_ged_python_bash_cpp(): - """Test ged computation with python invoking the c++ code by bash command (with updated library). - """ - from gklearn.utils.graphfiles import loadDataset - from gklearn.preimage.ged import GED - - data_dir_prefix = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/' -# collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/monoterpenoides.xml' - collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/monoterpenoides_3_20.xml' - graph_dir = data_dir_prefix +'generated_datsets/monoterpenoides/gxl/' - - Gn, y = loadDataset(collection_file, extra_params=graph_dir) - - algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' - - for repeat in range(0, 3): - # Generate the result file. - ged_filename = data_dir_prefix + 'output/test_ged/ged_mat_python_bash_' + str(repeat) + '_init40.3_20.txt' -# runtime_filename = data_dir_prefix + 'output/test_ged/runtime_mat_python_min_' + str(repeat) + '.txt' - - ged_file = open(ged_filename, 'a') -# runtime_file = open(runtime_filename, 'a') - - ged_mat = np.empty((len(Gn), len(Gn))) -# runtime_mat = np.empty((len(Gn), len(Gn))) - - for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout): - for j in range(len(Gn)): - print(i, j) - g1 = Gn[i] - g2 = Gn[j] - upper_bound, _, _ = GED(g1, g2, lib='gedlib-bash', cost='CONSTANT', - method='IPFP', - edit_cost_constant=[3.0, 3.0, 1.0, 3.0, 3.0, 1.0], - algo_options=algo_options) -# runtime = gedlibpy.get_runtime(g1, g2) - ged_mat[i][j] = upper_bound -# runtime_mat[i][j] = runtime - - # Write to files. - ged_file.write(str(int(upper_bound)) + ' ') -# runtime_file.write(str(runtime) + ' ') - - ged_file.write('\n') -# runtime_file.write('\n') - - ged_file.close() -# runtime_file.close() - - print('ged_mat') - print(ged_mat) -# print('runtime_mat:') -# print(runtime_mat) - - return - - - -def test_ged_best_settings_updated(): - """Test ged computation with best settings the same as in the C++ code (with updated library). 
- """ - - data_dir_prefix = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/' - collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/monoterpenoides.xml' -# collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/monoterpenoides_3_20.xml' - - graph_dir = data_dir_prefix +'generated_datsets/monoterpenoides/gxl/' - - algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' - - for repeat in range(0, 3): - # Generate the result file. - ged_filename = data_dir_prefix + 'output/test_ged/ged_mat_python_updated_' + str(repeat) + '_init40.txt' - runtime_filename = data_dir_prefix + 'output/test_ged/runtime_mat_python_updated_' + str(repeat) + '_init40.txt' - - gedlibpy.restart_env() - gedlibpy.load_GXL_graphs(graph_dir, collection_file) - listID = gedlibpy.get_all_graph_ids() - gedlibpy.set_edit_cost('CONSTANT', [3.0, 3.0, 1.0, 3.0, 3.0, 1.0]) - gedlibpy.init() - gedlibpy.set_method("IPFP", algo_options) - gedlibpy.init_method() - - ged_mat = np.empty((len(listID), len(listID))) - runtime_mat = np.empty((len(listID), len(listID))) - - for i in tqdm(range(len(listID)), desc='computing GEDs', file=sys.stdout): - ged_file = open(ged_filename, 'a') - runtime_file = open(runtime_filename, 'a') - - for j in range(len(listID)): - g1 = listID[i] - g2 = listID[j] - gedlibpy.run_method(g1, g2) - upper_bound = gedlibpy.get_upper_bound(g1, g2) - runtime = gedlibpy.get_runtime(g1, g2) - ged_mat[i][j] = upper_bound - runtime_mat[i][j] = runtime - - # Write to files. - ged_file.write(str(int(upper_bound)) + ' ') - runtime_file.write(str(runtime) + ' ') - - ged_file.write('\n') - runtime_file.write('\n') - - ged_file.close() - runtime_file.close() - - print('ged_mat') - print(ged_mat) - print('runtime_mat:') - print(runtime_mat) - - return - - -def test_ged_best_settings(): - """Test ged computation with best settings the same as in the C++ code. - """ - - data_dir_prefix = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/' - collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/monoterpenoides.xml' - graph_dir = data_dir_prefix +'generated_datsets/monoterpenoides/gxl/' - - algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5' - - for repeat in range(0, 3): - # Generate the result file. - ged_filename = data_dir_prefix + 'output/test_ged/ged_mat_python_best_settings_' + str(repeat) + '.txt' - runtime_filename = data_dir_prefix + 'output/test_ged/runtime_mat_python_best_settings_' + str(repeat) + '.txt' - - ged_file = open(ged_filename, 'a') - runtime_file = open(runtime_filename, 'a') - - gedlibpy.restart_env() - gedlibpy.load_GXL_graphs(graph_dir, collection_file) - listID = gedlibpy.get_all_graph_ids() - gedlibpy.set_edit_cost('CONSTANT', [3.0, 3.0, 1.0, 3.0, 3.0, 1.0]) - gedlibpy.init() - gedlibpy.set_method("IPFP", algo_options) - gedlibpy.init_method() - - ged_mat = np.empty((len(listID), len(listID))) - runtime_mat = np.empty((len(listID), len(listID))) - - for i in tqdm(range(len(listID)), desc='computing GEDs', file=sys.stdout): - for j in range(len(listID)): - g1 = listID[i] - g2 = listID[j] - gedlibpy.run_method(g1, g2) - upper_bound = gedlibpy.get_upper_bound(g1, g2) - runtime = gedlibpy.get_runtime(g1, g2) - ged_mat[i][j] = upper_bound - runtime_mat[i][j] = runtime - - # Write to files. 
- ged_file.write(str(int(upper_bound)) + ' ') - runtime_file.write(str(runtime) + ' ') - - ged_file.write('\n') - runtime_file.write('\n') - - ged_file.close() - runtime_file.close() - - print('ged_mat') - print(ged_mat) - print('runtime_mat:') - print(runtime_mat) - - return - - - -def test_ged_default(): - """Test ged computation with default settings. - """ - - data_dir_prefix = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/' - collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/monoterpenoides.xml' - graph_dir = data_dir_prefix +'generated_datsets/monoterpenoides/gxl/' - - for repeat in range(3): - # Generate the result file. - ged_filename = data_dir_prefix + 'output/test_ged/ged_mat_python_default_' + str(repeat) + '.txt' - runtime_filename = data_dir_prefix + 'output/test_ged/runtime_mat_python_default_' + str(repeat) + '.txt' - - ged_file = open(ged_filename, 'a') - runtime_file = open(runtime_filename, 'a') - - gedlibpy.restart_env() - gedlibpy.load_GXL_graphs(graph_dir, collection_file) - listID = gedlibpy.get_all_graph_ids() - gedlibpy.set_edit_cost('CONSTANT', [3.0, 3.0, 1.0, 3.0, 3.0, 1.0]) - gedlibpy.init() - gedlibpy.set_method("IPFP", "") - gedlibpy.init_method() - - ged_mat = np.empty((len(listID), len(listID))) - runtime_mat = np.empty((len(listID), len(listID))) - - for i in tqdm(range(len(listID)), desc='computing GEDs', file=sys.stdout): - for j in range(len(listID)): - g1 = listID[i] - g2 = listID[j] - gedlibpy.run_method(g1, g2) - upper_bound = gedlibpy.get_upper_bound(g1, g2) - runtime = gedlibpy.get_runtime(g1, g2) - ged_mat[i][j] = upper_bound - runtime_mat[i][j] = runtime - - # Write to files. - ged_file.write(str(int(upper_bound)) + ' ') - runtime_file.write(str(runtime) + ' ') - - ged_file.write('\n') - runtime_file.write('\n') - - ged_file.close() - runtime_file.close() - - print('ged_mat') - print(ged_mat) - print('runtime_mat:') - print(runtime_mat) - - return - - -def test_ged_min(): - """Test ged computation with the "min" stabilizer. - """ - from gklearn.utils.graphfiles import loadDataset - from gklearn.preimage.ged import GED - - data_dir_prefix = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/' - collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/monoterpenoides.xml' - graph_dir = data_dir_prefix +'generated_datsets/monoterpenoides/gxl/' - - Gn, y = loadDataset(collection_file, extra_params=graph_dir) - -# algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5' - - for repeat in range(0, 3): - # Generate the result file. - ged_filename = data_dir_prefix + 'output/test_ged/ged_mat_python_min_' + str(repeat) + '.txt' -# runtime_filename = data_dir_prefix + 'output/test_ged/runtime_mat_python_min_' + str(repeat) + '.txt' - - ged_file = open(ged_filename, 'a') -# runtime_file = open(runtime_filename, 'a') - - ged_mat = np.empty((len(Gn), len(Gn))) -# runtime_mat = np.empty((len(Gn), len(Gn))) - - for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout): - for j in range(len(Gn)): - g1 = Gn[i] - g2 = Gn[j] - upper_bound, _, _ = GED(g1, g2, lib='gedlibpy', cost='CONSTANT', - method='IPFP', - edit_cost_constant=[3.0, 3.0, 1.0, 3.0, 3.0, 1.0], - stabilizer='min', repeat=10) -# runtime = gedlibpy.get_runtime(g1, g2) - ged_mat[i][j] = upper_bound -# runtime_mat[i][j] = runtime - - # Write to files. 
- ged_file.write(str(int(upper_bound)) + ' ') -# runtime_file.write(str(runtime) + ' ') - - ged_file.write('\n') -# runtime_file.write('\n') - - ged_file.close() -# runtime_file.close() - - print('ged_mat') - print(ged_mat) -# print('runtime_mat:') -# print(runtime_mat) - - return - - -def init() : - print("List of Edit Cost Options : ") - for i in gedlibpy.list_of_edit_cost_options : - print (i) - print("") - - print("List of Method Options : ") - for j in gedlibpy.list_of_method_options : - print (j) - print("") - - print("List of Init Options : ") - for k in gedlibpy.list_of_init_options : - print (k) - print("") - - - - -def convertGraph(G): - G_new = nx.Graph() - for nd, attrs in G.nodes(data=True): - G_new.add_node(str(nd), chem=attrs['atom']) - for nd1, nd2, attrs in G.edges(data=True): - G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) - - return G_new - - -def testNxGrapĥ(): - from gklearn.utils.graphfiles import loadDataset - ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', - 'extra_params': {}} # node/edge symb - Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) - - gedlibpy.restart_env() - for graph in Gn: - g_new = convertGraph(graph) - gedlibpy.add_nx_graph(g_new, "") - - listID = gedlibpy.get_all_graph_ids() - gedlibpy.set_edit_cost("CHEM_1") - gedlibpy.init() - gedlibpy.set_method("IPFP", "") - gedlibpy.init_method() - - print(listID) - g = listID[0] - h = listID[1] - - gedlibpy.run_method(g, h) - - print("Node Map : ", gedlibpy.get_node_map(g, h)) - print("Forward map : " , gedlibpy.get_forward_map(g, h), ", Backward map : ", gedlibpy.get_backward_map(g, h)) - print ("Upper Bound = " + str(gedlibpy.get_upper_bound(g, h)) + ", Lower Bound = " + str(gedlibpy.get_lower_bound(g, h)) + ", Runtime = " + str(gedlibpy.get_runtime(g, h))) - -if __name__ == '__main__': -# test_ged_default() -# test_ged_min() -# test_ged_best_settings() -# test_ged_best_settings_updated() -# test_ged_python_bash_cpp() -# test_get_nb_edit_operations() -# test_get_nb_edit_operations_letter() -# test_LETTER2_cost() - test_NON_SYMBOLIC_cost() - - - #init() - #testNxGrapĥ() diff --git a/gklearn/preimage/test_iam.py b/gklearn/preimage/test_iam.py deleted file mode 100644 index 5897f50..0000000 --- a/gklearn/preimage/test_iam.py +++ /dev/null @@ -1,964 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Thu Sep 5 15:59:00 2019 - -@author: ljia -""" - -import numpy as np -import networkx as nx -import matplotlib.pyplot as plt -import time -import random -#from tqdm import tqdm - -from gklearn.utils.graphfiles import loadDataset -#from gklearn.utils.logger2file import * -from gklearn.preimage.iam import iam_upgraded -from gklearn.preimage.utils import remove_edges, compute_kernel, get_same_item_indices, dis_gstar -#from gklearn.preimage.ged import ged_median - - -def test_iam_monoterpenoides_with_init40(): - gkernel = 'untilhpathkernel' - node_label = 'atom' - edge_label = 'bond_type' - # unfitted edit costs. 
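These six constants follow the CONSTANT-model ordering used throughout this file -- node insertion, removal and substitution, then the same three operations for edges -- and [3, 3, 1, 3, 3, 1] is also the expert default that fit_edit_cost_constants later in this patch falls back to. They are packed below as

    edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]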
- c_vi = 3 - c_vr = 3 - c_vs = 1 - c_ei = 3 - c_er = 3 - c_es = 1 - ite_max_iam = 50 - epsilon_iam = 0.0001 - removeNodes = False - connected_iam = False - # parameters for IAM function -# ged_cost = 'CONSTANT' - ged_cost = 'CONSTANT' - ged_method = 'IPFP' - edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] - ged_stabilizer = None -# ged_repeat = 50 - algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' - params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, - 'edit_cost_constant': edit_cost_constant, - 'algo_options': algo_options, - 'stabilizer': ged_stabilizer} - - - collection_path = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/' - graph_dir = collection_path + 'gxl/' - y_all = ['3', '1', '4', '6', '7', '8', '9', '2'] - repeats = 50 - - # classify graphs according to classes. - time_list = [] - dis_ks_min_list = [] - dis_ks_set_median_list = [] - sod_gs_list = [] - g_best = [] - sod_set_median_list = [] - sod_list_list = [] - for y in y_all: - print('\n-------------------------------------------------------') - print('class of y:', y) - - time_list.append([]) - dis_ks_min_list.append([]) - dis_ks_set_median_list.append([]) - sod_gs_list.append([]) - g_best.append([]) - sod_set_median_list.append([]) - - for repeat in range(repeats): - # load median set. - collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml' - Gn_median, _ = loadDataset(collection_file, extra_params=graph_dir) - Gn_candidate = [g.copy() for g in Gn_median] - - time0 = time.time() - G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \ - = iam_upgraded(Gn_median, - Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, - epsilon=epsilon_iam, node_label=node_label, edge_label=edge_label, - connected=connected_iam, removeNodes=removeNodes, - params_ged=params_ged) - time_total = time.time() - time0 - print('\ntime: ', time_total) - time_list[-1].append(time_total) - g_best[-1].append(G_gen_median_list[0]) - sod_set_median_list[-1].append(sod_set_median) - print('\nsmallest sod of the set median:', sod_set_median) - sod_gs_list[-1].append(sod_gen_median) - print('\nsmallest sod in graph space:', sod_gen_median) - sod_list_list.append(sod_list) - -# # show the best graph and save it to file. 
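For reference, the unpacking above is the full contract of iam_upgraded as called in the newer tests of this file: the generalized-median candidates with their SOD, the SOD trace across iterations, then the set-median candidates with their SOD:

    G_gen_median_list, sod_gen_median, sod_list, \
        G_set_median_list, sod_set_median = iam_upgraded(...)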
-# print('one of the possible corresponding pre-images is') -# nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'), -# with_labels=True) -## plt.show() -# # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) + -## plt.savefig('results/iam/paper_compare/monoter_y' + str(y_class) + -## '_repeat' + str(repeat) + '_' + str(time.time()) + -## '.png', format="PNG") -# plt.clf() -# # print(G_gen_median_list[0].nodes(data=True)) -# # print(G_gen_median_list[0].edges(data=True)) - - print('\nsods of the set median for this class:', sod_set_median_list[-1]) - print('\nsods in graph space for this class:', sod_gs_list[-1]) -# print('\ndistance in kernel space of set median for this class:', -# dis_ks_set_median_list[-1]) -# print('\nsmallest distances in kernel space for this class:', -# dis_ks_min_list[-1]) - print('\ntimes for this class:', time_list[-1]) - - sod_set_median_list[-1] = np.mean(sod_set_median_list[-1]) - sod_gs_list[-1] = np.mean(sod_gs_list[-1]) -# dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1]) -# dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1]) - time_list[-1] = np.mean(time_list[-1]) - - print() - print('\nmean sods of the set median for each class:', sod_set_median_list) - print('\nmean sods in graph space for each class:', sod_gs_list) -# print('\ndistances in kernel space of set median for each class:', -# dis_ks_set_median_list) -# print('\nmean smallest distances in kernel space for each class:', -# dis_ks_min_list) - print('\nmean times for each class:', time_list) - - print('\nmean sods of the set median of all:', np.mean(sod_set_median_list)) - print('\nmean sods in graph space of all:', np.mean(sod_gs_list)) -# print('\nmean distances in kernel space of set median of all:', -# np.mean(dis_ks_set_median_list)) -# print('\nmean smallest distances in kernel space of all:', -# np.mean(dis_ks_min_list)) - print('\nmean times of all:', np.mean(time_list)) - - - - -def test_iam_monoterpenoides(): - ds = {'name': 'monoterpenoides', - 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb - Gn, y_all = loadDataset(ds['dataset']) -# Gn = Gn[0:50] - gkernel = 'untilhpathkernel' - node_label = 'atom' - edge_label = 'bond_type' - - # parameters for GED function from the IAM paper. - # fitted edit costs (Gaussian). - c_vi = 0.03620133402089074 - c_vr = 0.0417574590207099 - c_vs = 0.009992282328587499 - c_ei = 0.08293120042342755 - c_er = 0.09512220476358019 - c_es = 0.09222529696841467 -# # fitted edit costs (linear combinations). -# c_vi = 0.1749684054238749 -# c_vr = 0.0734054228711457 -# c_vs = 0.05017781726016715 -# c_ei = 0.1869431164806936 -# c_er = 0.32055856948274 -# c_es = 0.2569469379247611 -# # unfitted edit costs. -# c_vi = 3 -# c_vr = 3 -# c_vs = 1 -# c_ei = 3 -# c_er = 3 -# c_es = 1 - ite_max_iam = 50 - epsilon_iam = 0.001 - removeNodes = False - connected_iam = False - # parameters for IAM function -# ged_cost = 'CONSTANT' - ged_cost = 'CONSTANT' - ged_method = 'IPFP' - edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] -# edit_cost_constant = [] - ged_stabilizer = 'min' - ged_repeat = 50 - params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, - 'edit_cost_constant': edit_cost_constant, - 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} - - # classify graphs according to letters. 
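(The "letters" in the comment above are really the monoterpenoides class labels; the wording survives from the Letter-dataset variant of this test.) get_same_item_indices groups graph indices by target value; a minimal pure-Python equivalent of what the per-class loop below relies on -- a sketch, not gklearn's implementation:

    def group_indices_by_class(y_all):
        idx_dict = {}
        for i, y in enumerate(y_all):
            idx_dict.setdefault(y, []).append(i)
        return idx_dict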
- time_list = [] - dis_ks_min_list = [] - dis_ks_set_median_list = [] - sod_gs_list = [] - g_best = [] - sod_set_median_list = [] - sod_list_list = [] - idx_dict = get_same_item_indices(y_all) - for y_class in idx_dict: - print('\n-------------------------------------------------------') - print('class of y:', y_class) - Gn_class = [Gn[i].copy() for i in idx_dict[y_class]] - - time_list.append([]) - dis_ks_min_list.append([]) - dis_ks_set_median_list.append([]) - sod_gs_list.append([]) - g_best.append([]) - sod_set_median_list.append([]) - - for repeat in range(50): - idx_rdm = random.sample(range(len(Gn_class)), 10) - print('graphs chosen:', idx_rdm) - Gn_median = [Gn_class[idx].copy() for idx in idx_rdm] - Gn_candidate = [g.copy() for g in Gn_median] - - alpha_range = [1 / len(Gn_median)] * len(Gn_median) - time0 = time.time() - G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \ - = iam_upgraded(Gn_median, - Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, - epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes, - params_ged=params_ged) - time_total = time.time() - time0 - print('\ntime: ', time_total) - time_list[-1].append(time_total) - g_best[-1].append(G_gen_median_list[0]) - sod_set_median_list[-1].append(sod_set_median) - print('\nsmallest sod of the set median:', sod_set_median) - sod_gs_list[-1].append(sod_gen_median) - print('\nsmallest sod in graph space:', sod_gen_median) - sod_list_list.append(sod_list) - - # show the best graph and save it to file. - print('one of the possible corresponding pre-images is') - nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'), - with_labels=True) -# plt.show() - # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) + -# plt.savefig('results/iam/paper_compare/monoter_y' + str(y_class) + -# '_repeat' + str(repeat) + '_' + str(time.time()) + -# '.png', format="PNG") - plt.clf() - # print(G_gen_median_list[0].nodes(data=True)) - # print(G_gen_median_list[0].edges(data=True)) - - - # compute distance between \psi and the set median graph. - knew_set_median = compute_kernel(G_set_median_list + Gn_median, - gkernel, node_label, edge_label, False) - dhat_new_set_median_list = [] - for idx, g_tmp in enumerate(G_set_median_list): - # @todo: the term3 below could use the one at the beginning of the function. - dhat_new_set_median_list.append(dis_gstar(idx, range(len(G_set_median_list), - len(G_set_median_list) + len(Gn_median) + 1), - alpha_range, knew_set_median, withterm3=False)) - - print('\ndistance in kernel space of set median: ', dhat_new_set_median_list[0]) - dis_ks_set_median_list[-1].append(dhat_new_set_median_list[0]) - - - # compute distance between \psi and the new generated graphs. - knew = compute_kernel(G_gen_median_list + Gn_median, gkernel, node_label, - edge_label, False) - dhat_new_list = [] - for idx, g_tmp in enumerate(G_gen_median_list): - # @todo: the term3 below could use the one at the beginning of the function. 
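The term3 of the todo above is the constant sum_{i,j} alpha_i*alpha_j*k(g_i, g_j) part of the squared kernel-space distance between a candidate and the weighted mean psi = sum_i alpha_i*phi(g_i); dis_gstar with withterm3=False drops it, which is harmless when candidates are only ranked against the same median set. A self-contained sketch of the full distance, assuming a Gram matrix K over candidates and median graphs (illustrative, not gklearn's exact implementation):

    import numpy as np

    def dis_gstar_sketch(idx, median_idx, alpha, K, withterm3=True):
        # squared distance ||phi(g_idx) - sum_i alpha_i * phi(g_i)||^2
        term1 = K[idx, idx]
        term2 = 2.0 * sum(a * K[idx, j] for a, j in zip(alpha, median_idx))
        term3 = sum(ai * aj * K[i, j]
                    for ai, i in zip(alpha, median_idx)
                    for aj, j in zip(alpha, median_idx)) if withterm3 else 0.0
        return np.sqrt(max(term1 - term2 + term3, 0.0))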
- dhat_new_list.append(dis_gstar(idx, range(len(G_gen_median_list), - len(G_gen_median_list) + len(Gn_median) + 1), - alpha_range, knew, withterm3=False)) - - print('\nsmallest distance in kernel space: ', dhat_new_list[0]) - dis_ks_min_list[-1].append(dhat_new_list[0]) - - - print('\nsods of the set median for this class:', sod_set_median_list[-1]) - print('\nsods in graph space for this class:', sod_gs_list[-1]) - print('\ndistance in kernel space of set median for this class:', - dis_ks_set_median_list[-1]) - print('\nsmallest distances in kernel space for this class:', - dis_ks_min_list[-1]) - print('\ntimes for this class:', time_list[-1]) - - sod_set_median_list[-1] = np.mean(sod_set_median_list[-1]) - sod_gs_list[-1] = np.mean(sod_gs_list[-1]) - dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1]) - dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1]) - time_list[-1] = np.mean(time_list[-1]) - - print() - print('\nmean sods of the set median for each class:', sod_set_median_list) - print('\nmean sods in graph space for each class:', sod_gs_list) - print('\ndistances in kernel space of set median for each class:', - dis_ks_set_median_list) - print('\nmean smallest distances in kernel space for each class:', - dis_ks_min_list) - print('\nmean times for each class:', time_list) - - print('\nmean sods of the set median of all:', np.mean(sod_set_median_list)) - print('\nmean sods in graph space of all:', np.mean(sod_gs_list)) - print('\nmean distances in kernel space of set median of all:', - np.mean(dis_ks_set_median_list)) - print('\nmean smallest distances in kernel space of all:', - np.mean(dis_ks_min_list)) - print('\nmean times of all:', np.mean(time_list)) - - nb_better_sods = 0 - nb_worse_sods = 0 - nb_same_sods = 0 - for sods in sod_list_list: - if sods[0] > sods[-1]: - nb_better_sods += 1 - elif sods[0] < sods[-1]: - nb_worse_sods += 1 - else: - nb_same_sods += 1 - print('\n In', str(len(sod_list_list)), 'sod lists,', str(nb_better_sods), - 'are getting better,', str(nb_worse_sods), 'are getting worse,', - str(nb_same_sods), 'are not changed; ', str(nb_better_sods / len(sod_list_list)), - 'sods are improved.') - - -def test_iam_mutag(): - ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', - 'extra_params': {}} # node/edge symb - Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) -# Gn = Gn[0:50] - gkernel = 'untilhpathkernel' - node_label = 'atom' - edge_label = 'bond_type' - - # parameters for GED function from the IAM paper. - # fitted edit costs. - c_vi = 0.03523843108436513 - c_vr = 0.03347339739350128 - c_vs = 0.06871290673612238 - c_ei = 0.08591999846720685 - c_er = 0.07962086440894103 - c_es = 0.08596855855478233 - # unfitted edit costs. -# c_vi = 3 -# c_vr = 3 -# c_vs = 1 -# c_ei = 3 -# c_er = 3 -# c_es = 1 - ite_max_iam = 50 - epsilon_iam = 0.001 - removeNodes = False - connected_iam = False - # parameters for IAM function -# ged_cost = 'CONSTANT' - ged_cost = 'CONSTANT' - ged_method = 'IPFP' - edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] -# edit_cost_constant = [] - ged_stabilizer = 'min' - ged_repeat = 50 - params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, - 'edit_cost_constant': edit_cost_constant, - 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} - - # classify graphs according to letters. 
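(As in test_iam_monoterpenoides above, the "letters" here are MUTAG's class labels; the comment is inherited from the Letter variant of this test.)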
- time_list = [] - dis_ks_min_list = [] - dis_ks_set_median_list = [] - sod_gs_list = [] - g_best = [] - sod_set_median_list = [] - sod_list_list = [] - idx_dict = get_same_item_indices(y_all) - for y_class in idx_dict: - print('\n-------------------------------------------------------') - print('class of y:', y_class) - Gn_class = [Gn[i].copy() for i in idx_dict[y_class]] - - time_list.append([]) - dis_ks_min_list.append([]) - dis_ks_set_median_list.append([]) - sod_gs_list.append([]) - g_best.append([]) - sod_set_median_list.append([]) - - for repeat in range(50): - idx_rdm = random.sample(range(len(Gn_class)), 10) - print('graphs chosen:', idx_rdm) - Gn_median = [Gn_class[idx].copy() for idx in idx_rdm] - Gn_candidate = [g.copy() for g in Gn_median] - - alpha_range = [1 / len(Gn_median)] * len(Gn_median) - time0 = time.time() - G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \ - = iam_upgraded(Gn_median, - Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, - epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes, - params_ged=params_ged) - time_total = time.time() - time0 - print('\ntime: ', time_total) - time_list[-1].append(time_total) - g_best[-1].append(G_gen_median_list[0]) - sod_set_median_list[-1].append(sod_set_median) - print('\nsmallest sod of the set median:', sod_set_median) - sod_gs_list[-1].append(sod_gen_median) - print('\nsmallest sod in graph space:', sod_gen_median) - sod_list_list.append(sod_list) - - # show the best graph and save it to file. - print('one of the possible corresponding pre-images is') - nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'), - with_labels=True) -# plt.show() - # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) + -# plt.savefig('results/iam/paper_compare/mutag_y' + str(y_class) + -# '_repeat' + str(repeat) + '_' + str(time.time()) + -# '.png', format="PNG") - plt.clf() - # print(G_gen_median_list[0].nodes(data=True)) - # print(G_gen_median_list[0].edges(data=True)) - - - # compute distance between \psi and the set median graph. - knew_set_median = compute_kernel(G_set_median_list + Gn_median, - gkernel, node_label, edge_label, False) - dhat_new_set_median_list = [] - for idx, g_tmp in enumerate(G_set_median_list): - # @todo: the term3 below could use the one at the beginning of the function. - dhat_new_set_median_list.append(dis_gstar(idx, range(len(G_set_median_list), - len(G_set_median_list) + len(Gn_median) + 1), - alpha_range, knew_set_median, withterm3=False)) - - print('\ndistance in kernel space of set median: ', dhat_new_set_median_list[0]) - dis_ks_set_median_list[-1].append(dhat_new_set_median_list[0]) - - - # compute distance between \psi and the new generated graphs. - knew = compute_kernel(G_gen_median_list + Gn_median, gkernel, node_label, - edge_label, False) - dhat_new_list = [] - for idx, g_tmp in enumerate(G_gen_median_list): - # @todo: the term3 below could use the one at the beginning of the function. 
- dhat_new_list.append(dis_gstar(idx, range(len(G_gen_median_list), - len(G_gen_median_list) + len(Gn_median) + 1), - alpha_range, knew, withterm3=False)) - - print('\nsmallest distance in kernel space: ', dhat_new_list[0]) - dis_ks_min_list[-1].append(dhat_new_list[0]) - - - print('\nsods of the set median for this class:', sod_set_median_list[-1]) - print('\nsods in graph space for this class:', sod_gs_list[-1]) - print('\ndistance in kernel space of set median for this class:', - dis_ks_set_median_list[-1]) - print('\nsmallest distances in kernel space for this class:', - dis_ks_min_list[-1]) - print('\ntimes for this class:', time_list[-1]) - - sod_set_median_list[-1] = np.mean(sod_set_median_list[-1]) - sod_gs_list[-1] = np.mean(sod_gs_list[-1]) - dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1]) - dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1]) - time_list[-1] = np.mean(time_list[-1]) - - print() - print('\nmean sods of the set median for each class:', sod_set_median_list) - print('\nmean sods in graph space for each class:', sod_gs_list) - print('\ndistances in kernel space of set median for each class:', - dis_ks_set_median_list) - print('\nmean smallest distances in kernel space for each class:', - dis_ks_min_list) - print('\nmean times for each class:', time_list) - - print('\nmean sods of the set median of all:', np.mean(sod_set_median_list)) - print('\nmean sods in graph space of all:', np.mean(sod_gs_list)) - print('\nmean distances in kernel space of set median of all:', - np.mean(dis_ks_set_median_list)) - print('\nmean smallest distances in kernel space of all:', - np.mean(dis_ks_min_list)) - print('\nmean times of all:', np.mean(time_list)) - - nb_better_sods = 0 - nb_worse_sods = 0 - nb_same_sods = 0 - for sods in sod_list_list: - if sods[0] > sods[-1]: - nb_better_sods += 1 - elif sods[0] < sods[-1]: - nb_worse_sods += 1 - else: - nb_same_sods += 1 - print('\n In', str(len(sod_list_list)), 'sod lists,', str(nb_better_sods), - 'are getting better,', str(nb_worse_sods), 'are getting worse,', - str(nb_same_sods), 'are not changed; ', str(nb_better_sods / len(sod_list_list)), - 'sods are improved.') - - -############################################################################### -# tests on different numbers of median-sets. 
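The experiments below, like the ones above, score a candidate median by its sum of distances (SOD) over the median set -- GED in graph space, dis_gstar in kernel space. A minimal sketch with a pluggable pairwise distance, just to fix the notation:

    def sod(candidate, median_set, dist):
        # Sum of distances from `candidate` to every graph in `median_set`.
        return sum(dist(candidate, g) for g in median_set)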
-
-def test_iam_median_nb():
-
- ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
- 'extra_params': {}} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-# Gn = Gn[0:50]
- remove_edges(Gn)
- gkernel = 'marginalizedkernel'
-
- lmbda = 0.03 # termination probability
-# # parameters for GED function
-# c_vi = 0.037
-# c_vr = 0.038
-# c_vs = 0.075
-# c_ei = 0.001
-# c_er = 0.001
-# c_es = 0.0
-# ite_max_iam = 50
-# epsilon_iam = 0.001
-# removeNodes = False
-# connected_iam = False
-# # parameters for IAM function
-# ged_cost = 'CONSTANT'
-# ged_method = 'IPFP'
-# edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
-# ged_stabilizer = 'min'
-# ged_repeat = 50
-# params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
-# 'edit_cost_constant': edit_cost_constant,
-# 'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
-
- # parameters for GED function
- c_vi = 4
- c_vr = 4
- c_vs = 2
- c_ei = 1
- c_er = 1
- c_es = 1
- ite_max_iam = 50
- epsilon_iam = 0.001
- removeNodes = False
- connected_iam = False
- # parameters for IAM function
- ged_cost = 'CHEM_1'
- ged_method = 'IPFP'
- edit_cost_constant = []
- ged_stabilizer = 'min'
- ged_repeat = 50
- params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
- 'edit_cost_constant': edit_cost_constant,
- 'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
-
- # find all the graphs classified into positive group 1.
- idx_dict = get_same_item_indices(y_all)
- Gn = [Gn[i] for i in idx_dict[1]]
-
- # number of graphs; we want to compute the median of these graphs.
-# nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
- nb_median_range = [len(Gn)]
-
-# # compute Gram matrix.
-# time0 = time.time()
-# km = compute_kernel(Gn, gkernel, True)
-# time_km = time.time() - time0
-# # write Gram matrix to file.
-# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
-
- time_list = []
- dis_ks_min_list = []
- sod_gs_list = []
-# sod_gs_min_list = []
-# nb_updated_list = []
-# nb_updated_k_list = []
- g_best = []
- for nb_median in nb_median_range:
- print('\n-------------------------------------------------------')
- print('number of median graphs =', nb_median)
- random.seed(1)
- idx_rdm = random.sample(range(len(Gn)), nb_median)
- print('graphs chosen:', idx_rdm)
- Gn_median = [Gn[idx].copy() for idx in idx_rdm]
- Gn_candidate = [g.copy() for g in Gn]
-
-# for g in Gn_median:
-# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
-## plt.savefig("results/preimage_mix/mutag.png", format="PNG")
-# plt.show()
-# plt.clf()
-
- ###################################################################
-# gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
-# km_tmp = gmfile['gm']
-# time_km = gmfile['gmtime']
-# # modify mixed gram matrix.
-# km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median)) -# for i in range(len(Gn)): -# for j in range(i, len(Gn)): -# km[i, j] = km_tmp[i, j] -# km[j, i] = km[i, j] -# for i in range(len(Gn)): -# for j, idx in enumerate(idx_rdm): -# km[i, len(Gn) + j] = km[i, idx] -# km[len(Gn) + j, i] = km[i, idx] -# for i, idx1 in enumerate(idx_rdm): -# for j, idx2 in enumerate(idx_rdm): -# km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2] - - ################################################################### - alpha_range = [1 / nb_median] * nb_median - time0 = time.time() - ghat_new_list, sod_min = iam_upgraded(Gn_median, Gn_candidate, - c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, - epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes, - params_ged=params_ged) - - time_total = time.time() - time0 - print('\ntime: ', time_total) - time_list.append(time_total) - - # compute distance between \psi and the new generated graphs. - knew = compute_kernel(ghat_new_list + Gn_median, gkernel, False) - dhat_new_list = [] - for idx, g_tmp in enumerate(ghat_new_list): - # @todo: the term3 below could use the one at the beginning of the function. - dhat_new_list.append(dis_gstar(idx, range(len(ghat_new_list), - len(ghat_new_list) + len(Gn_median) + 1), - alpha_range, knew, withterm3=False)) - - print('\nsmallest distance in kernel space: ', dhat_new_list[0]) - dis_ks_min_list.append(dhat_new_list[0]) - g_best.append(ghat_new_list[0]) - - # show the best graph and save it to file. -# print('the shortest distance is', dhat) - print('one of the possible corresponding pre-images is') - nx.draw(ghat_new_list[0], labels=nx.get_node_attributes(ghat_new_list[0], 'atom'), - with_labels=True) - plt.show() -# plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) + - plt.savefig('results/iam/mutag_median_unfit2.nb' + str(nb_median) + - '.png', format="PNG") - plt.clf() -# print(ghat_list[0].nodes(data=True)) -# print(ghat_list[0].edges(data=True)) - - sod_gs_list.append(sod_min) -# sod_gs_min_list.append(np.min(sod_min)) - print('\nsmallest sod in graph space: ', sod_min) - - print('\nsods in graph space: ', sod_gs_list) -# print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list) - print('\nsmallest distance in kernel space for each set of median graphs: ', - dis_ks_min_list) -# print('\nnumber of updates of the best graph for each set of median graphs by IAM: ', -# nb_updated_list) -# print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ', -# nb_updated_k_list) - print('\ntimes:', time_list) - - -def test_iam_letter_h(): - from median import draw_Letter_graph - ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt', - 'extra_params': {}} # node nsymb -# ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt', -# 'extra_params': {}} # node nsymb -# Gn = Gn[0:50] - Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) - gkernel = 'structuralspkernel' - - # parameters for GED function from the IAM paper. 
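Two hedged caveats for this test: with ged_cost = 'LETTER', gedlibpy expects the triple

    edit_cost_constant = [c_vir, c_eir, alpha]  # expert triple elsewhere in this patch: [0.9, 1.7, 0.75]

rather than six constants, so passing edit_cost_constant = [] below presumably defers to the library defaults, and the six c_* values above only feed iam_upgraded's c_ei/c_er/c_es arguments. Also, if iam_upgraded still returns the five values unpacked in the tests above, the three-value unpacking in this function (and the two-value one in test_iam_median_nb) would need the extra outputs.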
- c_vi = 3 - c_vr = 3 - c_vs = 1 - c_ei = 3 - c_er = 3 - c_es = 1 - ite_max_iam = 50 - epsilon_iam = 0.001 - removeNodes = False - connected_iam = False - # parameters for IAM function -# ged_cost = 'CONSTANT' - ged_cost = 'LETTER' - ged_method = 'IPFP' -# edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] - edit_cost_constant = [] - ged_stabilizer = 'min' - ged_repeat = 50 - params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, - 'edit_cost_constant': edit_cost_constant, - 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} - - # classify graphs according to letters. - time_list = [] - dis_ks_min_list = [] - sod_gs_list = [] - g_best = [] - sod_set_median_list = [] - idx_dict = get_same_item_indices(y_all) - for letter in idx_dict: - print('\n-------------------------------------------------------') - print('letter', letter) - Gn_let = [Gn[i].copy() for i in idx_dict[letter]] - - time_list.append([]) - dis_ks_min_list.append([]) - sod_gs_list.append([]) - g_best.append([]) - sod_set_median_list.append([]) - - for repeat in range(50): - idx_rdm = random.sample(range(len(Gn_let)), 50) - print('graphs chosen:', idx_rdm) - Gn_median = [Gn_let[idx].copy() for idx in idx_rdm] - Gn_candidate = [g.copy() for g in Gn_median] - - alpha_range = [1 / len(Gn_median)] * len(Gn_median) - time0 = time.time() - ghat_new_list, sod_min, sod_set_median = iam_upgraded(Gn_median, - Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, - epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes, - params_ged=params_ged) - time_total = time.time() - time0 - print('\ntime: ', time_total) - time_list[-1].append(time_total) - g_best[-1].append(ghat_new_list[0]) - sod_set_median_list[-1].append(sod_set_median) - print('\nsmallest sod of the set median:', sod_set_median) - sod_gs_list[-1].append(sod_min) - print('\nsmallest sod in graph space:', sod_min) - - # show the best graph and save it to file. - print('one of the possible corresponding pre-images is') - draw_Letter_graph(ghat_new_list[0], savepath='results/iam/paper_compare/') - - # compute distance between \psi and the new generated graphs. - knew = compute_kernel(ghat_new_list + Gn_median, gkernel, False) - dhat_new_list = [] - for idx, g_tmp in enumerate(ghat_new_list): - # @todo: the term3 below could use the one at the beginning of the function. 
- dhat_new_list.append(dis_gstar(idx, range(len(ghat_new_list),
- len(ghat_new_list) + len(Gn_median) + 1),
- alpha_range, knew, withterm3=False))
-
- print('\nsmallest distance in kernel space: ', dhat_new_list[0])
- dis_ks_min_list[-1].append(dhat_new_list[0])
-
- print('\nsods of the set median for this letter:', sod_set_median_list[-1])
- print('\nsods in graph space for this letter:', sod_gs_list[-1])
- print('\nsmallest distances in kernel space for this letter:',
- dis_ks_min_list[-1])
- print('\ntimes for this letter:', time_list[-1])
-
- sod_set_median_list[-1] = np.mean(sod_set_median_list[-1])
- sod_gs_list[-1] = np.mean(sod_gs_list[-1])
- dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1])
- time_list[-1] = np.mean(time_list[-1])
-
- print('\nmean sods of the set median for each letter:', sod_set_median_list)
- print('\nmean sods in graph space for each letter:', sod_gs_list)
- print('\nmean smallest distances in kernel space for each letter:',
- dis_ks_min_list)
- print('\nmean times for each letter:', time_list)
-
- print('\nmean sods of the set median of all:', np.mean(sod_set_median_list))
- print('\nmean sods in graph space of all:', np.mean(sod_gs_list))
- print('\nmean smallest distances in kernel space of all:',
- np.mean(dis_ks_min_list))
- print('\nmean times of all:', np.mean(time_list))
-
-
-
-
-
-
-
-
-
-def test_iam_fitdistance():
-
- ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
- 'extra_params': {}} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-# Gn = Gn[0:50]
-# remove_edges(Gn)
- gkernel = 'marginalizedkernel'
- node_label = 'atom'
- edge_label = 'bond_type'
-
-# lmbda = 0.03 # termination probability
-# # parameters for GED function
-# c_vi = 0.037
-# c_vr = 0.038
-# c_vs = 0.075
-# c_ei = 0.001
-# c_er = 0.001
-# c_es = 0.0
-# ite_max_iam = 50
-# epsilon_iam = 0.001
-# removeNodes = False
-# connected_iam = False
-# # parameters for IAM function
-# ged_cost = 'CONSTANT'
-# ged_method = 'IPFP'
-# edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
-# ged_stabilizer = 'min'
-# ged_repeat = 50
-# params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
-# 'edit_cost_constant': edit_cost_constant,
-# 'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
-
- # parameters for GED function
- c_vi = 4
- c_vr = 4
- c_vs = 2
- c_ei = 1
- c_er = 1
- c_es = 1
- ite_max_iam = 50
- epsilon_iam = 0.001
- removeNodes = False
- connected_iam = False
- # parameters for IAM function
- ged_cost = 'CHEM_1'
- ged_method = 'IPFP'
- edit_cost_constant = []
- ged_stabilizer = 'min'
- ged_repeat = 50
- params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
- 'edit_cost_constant': edit_cost_constant,
- 'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
-
- # find all the graphs classified into positive group 1.
- idx_dict = get_same_item_indices(y_all)
- Gn = [Gn[i] for i in idx_dict[1]]
-
- # number of graphs; we want to compute the median of these graphs.
-# nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
- nb_median_range = [10]
-
-# # compute Gram matrix.
-# time0 = time.time()
-# km = compute_kernel(Gn, gkernel, True)
-# time_km = time.time() - time0
-# # write Gram matrix to file.
-# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km) - - time_list = [] - dis_ks_min_list = [] - dis_ks_gen_median_list = [] - sod_gs_list = [] -# sod_gs_min_list = [] -# nb_updated_list = [] -# nb_updated_k_list = [] - g_best = [] - for nb_median in nb_median_range: - print('\n-------------------------------------------------------') - print('number of median graphs =', nb_median) - random.seed(1) - idx_rdm = random.sample(range(len(Gn)), nb_median) - print('graphs chosen:', idx_rdm) - Gn_median = [Gn[idx].copy() for idx in idx_rdm] - Gn_candidate = [g.copy() for g in Gn_median] - -# for g in Gn_median: -# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True) -## plt.savefig("results/preimage_mix/mutag.png", format="PNG") -# plt.show() -# plt.clf() - - ################################################################### -# gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz') -# km_tmp = gmfile['gm'] -# time_km = gmfile['gmtime'] -# # modify mixed gram matrix. -# km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median)) -# for i in range(len(Gn)): -# for j in range(i, len(Gn)): -# km[i, j] = km_tmp[i, j] -# km[j, i] = km[i, j] -# for i in range(len(Gn)): -# for j, idx in enumerate(idx_rdm): -# km[i, len(Gn) + j] = km[i, idx] -# km[len(Gn) + j, i] = km[i, idx] -# for i, idx1 in enumerate(idx_rdm): -# for j, idx2 in enumerate(idx_rdm): -# km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2] - - ################################################################### - alpha_range = [1 / nb_median] * nb_median - time0 = time.time() - G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \ - = iam_upgraded(Gn_median, Gn_candidate, - c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, - epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes, - params_ged=params_ged) - - time_total = time.time() - time0 - print('\ntime: ', time_total) - time_list.append(time_total) - - # compute distance between \psi and the new generated graphs. - knew = compute_kernel(G_gen_median_list + Gn_median, gkernel, node_label, - edge_label, False) - dhat_new_list = [] - for idx, g_tmp in enumerate(G_gen_median_list): - # @todo: the term3 below could use the one at the beginning of the function. - dhat_new_list.append(dis_gstar(idx, range(len(G_gen_median_list), - len(G_gen_median_list) + len(Gn_median) + 1), - alpha_range, knew, withterm3=False)) - - print('\nsmallest distance in kernel space: ', dhat_new_list[0]) - dis_ks_min_list.append(dhat_new_list[0]) - g_best.append(G_gen_median_list[0]) - - # show the best graph and save it to file. 
-# print('the shortest distance is', dhat) - print('one of the possible corresponding pre-images is') - nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'), - with_labels=True) - plt.show() -# plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) + -# plt.savefig('results/iam/mutag_median_unfit2.nb' + str(nb_median) + -# '.png', format="PNG") - plt.clf() -# print(ghat_list[0].nodes(data=True)) -# print(ghat_list[0].edges(data=True)) - - sod_gs_list.append(sod_gen_median) -# sod_gs_min_list.append(np.min(sod_gen_median)) - print('\nsmallest sod in graph space: ', sod_gen_median) - print('\nsmallest sod of set median in graph space: ', sod_set_median) - - print('\nsods in graph space: ', sod_gs_list) -# print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list) - print('\nsmallest distance in kernel space for each set of median graphs: ', - dis_ks_min_list) -# print('\nnumber of updates of the best graph for each set of median graphs by IAM: ', -# nb_updated_list) -# print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ', -# nb_updated_k_list) - print('\ntimes:', time_list) - - - - - -############################################################################### - - -if __name__ == '__main__': -############################################################################### -# tests on different numbers of median-sets. -# test_iam_median_nb() -# test_iam_letter_h() -# test_iam_monoterpenoides() -# test_iam_mutag() - -# test_iam_fitdistance() -# print("test log") - - test_iam_monoterpenoides_with_init40() diff --git a/gklearn/preimage/test_k_closest_graphs.py b/gklearn/preimage/test_k_closest_graphs.py deleted file mode 100644 index 56971c7..0000000 --- a/gklearn/preimage/test_k_closest_graphs.py +++ /dev/null @@ -1,462 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Mon Dec 16 11:53:54 2019 - -@author: ljia -""" -import numpy as np -import math -import networkx as nx -import matplotlib.pyplot as plt -import time -import random -from tqdm import tqdm -from itertools import combinations, islice -import multiprocessing -from multiprocessing import Pool -from functools import partial - -from gklearn.utils.graphfiles import loadDataset, loadGXL -#from gklearn.utils.logger2file import * -from gklearn.preimage.iam import iam_upgraded, iam_bash -from gklearn.preimage.utils import compute_kernel, dis_gstar, kernel_distance_matrix -from gklearn.preimage.fitDistance import fit_GED_to_kernel_distance -#from gklearn.preimage.ged import ged_median - - -def fit_edit_cost_constants(fit_method, edit_cost_name, - edit_cost_constants=None, initial_solutions=1, - Gn_median=None, node_label=None, edge_label=None, - gkernel=None, dataset=None, init_ecc=None, - Gn=None, Kmatrix_median=None): - """fit edit cost constants. 
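    Supported fit_method values, as dispatched below: 'random' (sample
    constants uniformly, zeroing attribute terms when the dataset carries no
    node/edge attributes), 'expert' (fixed hand-tuned constants, e.g.
    [3, 3, 1, 3, 3, 1] for symbolic costs), 'k-graphs' (optimize the
    constants on the k median graphs via fit_GED_to_kernel_distance),
    'whole-dataset' (the same optimization over all of Gn), and
    'precomputed' (keep edit_cost_constants as passed in).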
- """ - if fit_method == 'random': # random - if edit_cost_name == 'LETTER': - edit_cost_constants = random.sample(range(1, 10), 3) - edit_cost_constants = [item * 0.1 for item in edit_cost_constants] - elif edit_cost_name == 'LETTER2': - random.seed(time.time()) - edit_cost_constants = random.sample(range(1, 10), 5) -# edit_cost_constants = [item * 0.1 for item in edit_cost_constants] - elif edit_cost_name == 'NON_SYMBOLIC': - edit_cost_constants = random.sample(range(1, 10), 6) - if Gn_median[0].graph['node_attrs'] == []: - edit_cost_constants[2] = 0 - if Gn_median[0].graph['edge_attrs'] == []: - edit_cost_constants[5] = 0 - else: - edit_cost_constants = random.sample(range(1, 10), 6) - print('edit cost constants used:', edit_cost_constants) - elif fit_method == 'expert': # expert - if init_ecc is None: - if edit_cost_name == 'LETTER': - edit_cost_constants = [0.9, 1.7, 0.75] - elif edit_cost_name == 'LETTER2': - edit_cost_constants = [0.675, 0.675, 0.75, 0.425, 0.425] - else: - edit_cost_constants = [3, 3, 1, 3, 3, 1] - else: - edit_cost_constants = init_ecc - elif fit_method == 'k-graphs': - itr_max = 6 - if init_ecc is None: - if edit_cost_name == 'LETTER': - init_costs = [0.9, 1.7, 0.75] - elif edit_cost_name == 'LETTER2': - init_costs = [0.675, 0.675, 0.75, 0.425, 0.425] - elif edit_cost_name == 'NON_SYMBOLIC': - init_costs = [0, 0, 1, 1, 1, 0] - if Gn_median[0].graph['node_attrs'] == []: - init_costs[2] = 0 - if Gn_median[0].graph['edge_attrs'] == []: - init_costs[5] = 0 - else: - init_costs = [3, 3, 1, 3, 3, 1] - else: - init_costs = init_ecc - algo_options = '--threads 1 --initial-solutions ' \ - + str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1' - params_ged = {'lib': 'gedlibpy', 'cost': edit_cost_name, 'method': 'IPFP', - 'algo_options': algo_options, 'stabilizer': None} - # fit on k-graph subset - edit_cost_constants, _, _, _, _, _, _ = fit_GED_to_kernel_distance(Gn_median, - node_label, edge_label, gkernel, itr_max, params_ged=params_ged, - init_costs=init_costs, dataset=dataset, Kmatrix=Kmatrix_median, - parallel=True) - elif fit_method == 'whole-dataset': - itr_max = 6 - if init_ecc is None: - if edit_cost_name == 'LETTER': - init_costs = [0.9, 1.7, 0.75] - elif edit_cost_name == 'LETTER2': - init_costs = [0.675, 0.675, 0.75, 0.425, 0.425] - else: - init_costs = [3, 3, 1, 3, 3, 1] - else: - init_costs = init_ecc - algo_options = '--threads 1 --initial-solutions ' \ - + str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1' - params_ged = {'lib': 'gedlibpy', 'cost': edit_cost_name, 'method': 'IPFP', - 'algo_options': algo_options, 'stabilizer': None} - # fit on all subset - edit_cost_constants, _, _, _, _, _, _ = fit_GED_to_kernel_distance(Gn, - node_label, edge_label, gkernel, itr_max, params_ged=params_ged, - init_costs=init_costs, dataset=dataset, parallel=True) - elif fit_method == 'precomputed': - pass - - return edit_cost_constants - - -def compute_distances_to_true_median(Gn_median, fname_sm, fname_gm, - gkernel, edit_cost_name, - Kmatrix_median=None): - # reform graphs. 
- set_median = loadGXL(fname_sm) - gen_median = loadGXL(fname_gm) -# print(gen_median.nodes(data=True)) -# print(gen_median.edges(data=True)) - if edit_cost_name == 'LETTER' or edit_cost_name == 'LETTER2' or edit_cost_name == 'NON_SYMBOLIC': -# dataset == 'Fingerprint': -# for g in Gn_median: -# reform_attributes(g) - reform_attributes(set_median, Gn_median[0].graph['node_attrs'], - Gn_median[0].graph['edge_attrs']) - reform_attributes(gen_median, Gn_median[0].graph['node_attrs'], - Gn_median[0].graph['edge_attrs']) - - if edit_cost_name == 'LETTER' or edit_cost_name == 'LETTER2' or edit_cost_name == 'NON_SYMBOLIC': - node_label = None - edge_label = None - else: - node_label = 'chem' - edge_label = 'valence' - - # compute Gram matrix for median set. - if Kmatrix_median is None: - Kmatrix_median = compute_kernel(Gn_median, gkernel, node_label, edge_label, False) - - # compute distance in kernel space for set median. - kernel_sm = [] - for G_median in Gn_median: - km_tmp = compute_kernel([set_median, G_median], gkernel, node_label, edge_label, False) - kernel_sm.append(km_tmp[0, 1]) - Kmatrix_sm = np.concatenate((np.array([kernel_sm]), np.copy(Kmatrix_median)), axis=0) - Kmatrix_sm = np.concatenate((np.array([[km_tmp[0, 0]] + kernel_sm]).T, Kmatrix_sm), axis=1) -# Kmatrix_sm = compute_kernel([set_median] + Gn_median, gkernel, -# node_label, edge_label, False) - dis_k_sm = dis_gstar(0, range(1, 1+len(Gn_median)), - [1 / len(Gn_median)] * len(Gn_median), Kmatrix_sm, withterm3=False) -# print(gen_median.nodes(data=True)) -# print(gen_median.edges(data=True)) -# print(set_median.nodes(data=True)) -# print(set_median.edges(data=True)) - - # compute distance in kernel space for generalized median. - kernel_gm = [] - for G_median in Gn_median: - km_tmp = compute_kernel([gen_median, G_median], gkernel, node_label, edge_label, False) - kernel_gm.append(km_tmp[0, 1]) - Kmatrix_gm = np.concatenate((np.array([kernel_gm]), np.copy(Kmatrix_median)), axis=0) - Kmatrix_gm = np.concatenate((np.array([[km_tmp[0, 0]] + kernel_gm]).T, Kmatrix_gm), axis=1) -# Kmatrix_gm = compute_kernel([gen_median] + Gn_median, gkernel, -# node_label, edge_label, False) - dis_k_gm = dis_gstar(0, range(1, 1+len(Gn_median)), - [1 / len(Gn_median)] * len(Gn_median), Kmatrix_gm, withterm3=False) - - # compute distance in kernel space for each graph in median set. - dis_k_gi = [] - for idx in range(len(Gn_median)): - dis_k_gi.append(dis_gstar(idx+1, range(1, 1+len(Gn_median)), - [1 / len(Gn_median)] * len(Gn_median), Kmatrix_gm, withterm3=False)) - - print('dis_k_sm:', dis_k_sm) - print('dis_k_gm:', dis_k_gm) - print('dis_k_gi:', dis_k_gi) - idx_dis_k_gi_min = np.argmin(dis_k_gi) - dis_k_gi_min = dis_k_gi[idx_dis_k_gi_min] - print('min dis_k_gi:', dis_k_gi_min) - - return dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min - - -def median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, fit_method, - graph_dir=None, initial_solutions=1, - edit_cost_constants=None, group_min=None, - dataset=None, edit_cost_name=None, init_ecc=None, - Kmatrix=None, parallel=True): -# dataset = dataset.lower() - -# # compute distances in kernel space. -# dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, -# Kmatrix=None, gkernel=gkernel) -# # ged. 
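compute_distances_to_true_median above avoids recomputing a full Gram matrix per median candidate: it evaluates the candidate's kernel row against the median set and prepends it to the cached Kmatrix_median. That concatenation step, isolated as a sketch (the helper name is illustrative):

    import numpy as np

    def augment_gram(K_median, k_row, k_self):
        # K_median: (n, n) Gram matrix of the median set;
        # k_row:    kernels k(new, g_i) against the n set graphs;
        # k_self:   kernel k(new, new).
        K = np.concatenate((np.array([k_row]), K_median), axis=0)
        K = np.concatenate((np.array([[k_self] + list(k_row)]).T, K), axis=1)
        return K  # (n+1, n+1), with the new graph at index 0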
-# gmfile = np.load('results/test_k_closest_graphs/ged_mat.fit_on_whole_dataset.with_medians.gm.npz') -# ged_mat = gmfile['ged_mat'] -# dis_mat = ged_mat[0:len(Gn), 0:len(Gn)] - -# # choose k closest graphs -# time0 = time.time() -# sod_ks_min, group_min = get_closest_k_graphs(dis_mat, k, parallel) -# time_spent = time.time() - time0 -# print('closest graphs:', sod_ks_min, group_min) -# print('time spent:', time_spent) -# group_min = (12, 13, 22, 29) # closest w.r.t path kernel -# group_min = (77, 85, 160, 171) # closest w.r.t ged -# group_min = (0,1,2,3,4,5,6,7,8,9,10,11) # closest w.r.t treelet kernel - Gn_median = [Gn[g].copy() for g in group_min] - if Kmatrix is not None: - Kmatrix_median = np.copy(Kmatrix[group_min,:]) - Kmatrix_median = Kmatrix_median[:,group_min] - else: - Kmatrix_median = None - - - # 1. fit edit cost constants. - time0 = time.time() - edit_cost_constants = fit_edit_cost_constants(fit_method, edit_cost_name, - edit_cost_constants=edit_cost_constants, initial_solutions=initial_solutions, - Gn_median=Gn_median, node_label=node_label, edge_label=edge_label, - gkernel=gkernel, dataset=dataset, init_ecc=init_ecc, - Gn=Gn, Kmatrix_median=Kmatrix_median) - time_fitting = time.time() - time0 - - - # 2. compute set median and gen median using IAM (C++ through bash). - print('\nstart computing set median and gen median using IAM (C++ through bash)...\n') - group_fnames = [Gn[g].graph['filename'] for g in group_min] - time0 = time.time() - sod_sm, sod_gm, fname_sm, fname_gm = iam_bash(group_fnames, edit_cost_constants, - cost=edit_cost_name, initial_solutions=initial_solutions, - graph_dir=graph_dir, dataset=dataset) - time_generating = time.time() - time0 - print('\nmedians computed.\n') - - - # 3. compute distances to the true median. - print('\nstart computing distances to true median....\n') - Gn_median = [Gn[g].copy() for g in group_min] - dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min = \ - compute_distances_to_true_median(Gn_median, fname_sm, fname_gm, - gkernel, edit_cost_name, - Kmatrix_median=Kmatrix_median) - idx_dis_k_gi_min = group_min[idx_dis_k_gi_min] - print('index min dis_k_gi:', idx_dis_k_gi_min) - print('sod_sm:', sod_sm) - print('sod_gm:', sod_gm) - - # collect return values. 
- return (sod_sm, sod_gm), \ - (dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min), \ - (time_fitting, time_generating) - - -def reform_attributes(G, na_names=[], ea_names=[]): - if not na_names == []: - for node in G.nodes: - G.nodes[node]['attributes'] = [G.node[node][a_name] for a_name in na_names] - if not ea_names == []: - for edge in G.edges: - G.edges[edge]['attributes'] = [G.edge[edge][a_name] for a_name in ea_names] - - -def get_closest_k_graphs(dis_mat, k, parallel): - k_graph_groups = combinations(range(0, len(dis_mat)), k) - sod_ks_min = np.inf - if parallel: - len_combination = get_combination_length(len(dis_mat), k) - len_itr_max = int(len_combination if len_combination < 1e7 else 1e7) -# pos_cur = 0 - graph_groups_slices = split_iterable(k_graph_groups, len_itr_max, len_combination) - for graph_groups_cur in graph_groups_slices: -# while True: -# graph_groups_cur = islice(k_graph_groups, pos_cur, pos_cur + len_itr_max) - graph_groups_cur_list = list(graph_groups_cur) - print('current position:', graph_groups_cur_list[0]) - len_itr_cur = len(graph_groups_cur_list) -# if len_itr_cur < len_itr_max: -# break - - itr = zip(graph_groups_cur_list, range(0, len_itr_cur)) - sod_k_list = np.empty(len_itr_cur) - graphs_list = [None] * len_itr_cur - n_jobs = multiprocessing.cpu_count() - chunksize = int(len_itr_max / n_jobs + 1) - n_jobs = multiprocessing.cpu_count() - def init_worker(dis_mat_toshare): - global G_dis_mat - G_dis_mat = dis_mat_toshare - pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(dis_mat,)) -# iterator = tqdm(pool.imap_unordered(_get_closest_k_graphs_parallel, -# itr, chunksize), -# desc='Choosing k closest graphs', file=sys.stdout) - iterator = pool.imap_unordered(_get_closest_k_graphs_parallel, itr, chunksize) - for graphs, i, sod_ks in iterator: - sod_k_list[i] = sod_ks - graphs_list[i] = graphs - pool.close() - pool.join() - - arg_min = np.argmin(sod_k_list) - sod_ks_cur = sod_k_list[arg_min] - group_cur = graphs_list[arg_min] - if sod_ks_cur < sod_ks_min: - sod_ks_min = sod_ks_cur - group_min = group_cur - print('get closer graphs:', sod_ks_min, group_min) - else: - for items in tqdm(k_graph_groups, desc='Choosing k closest graphs', file=sys.stdout): - # if items[0] != itmp: - # itmp = items[0] - # print(items) - k_graph_pairs = combinations(items, 2) - sod_ks = 0 - for i1, i2 in k_graph_pairs: - sod_ks += dis_mat[i1, i2] - if sod_ks < sod_ks_min: - sod_ks_min = sod_ks - group_min = items - print('get closer graphs:', sod_ks_min, group_min) - - return sod_ks_min, group_min - - -def _get_closest_k_graphs_parallel(itr): - k_graph_pairs = combinations(itr[0], 2) - sod_ks = 0 - for i1, i2 in k_graph_pairs: - sod_ks += G_dis_mat[i1, i2] - - return itr[0], itr[1], sod_ks - - -def split_iterable(iterable, n, len_iter): - it = iter(iterable) - for i in range(0, len_iter, n): - piece = islice(it, n) - yield piece - - -def get_combination_length(n, k): - len_combination = 1 - for i in range(n, n - k, -1): - len_combination *= i - return int(len_combination / math.factorial(k)) - - -############################################################################### - -def test_k_closest_graphs(): - ds = {'name': 'monoterpenoides', - 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb - Gn, y_all = loadDataset(ds['dataset']) -# Gn = Gn[0:50] -# gkernel = 'untilhpathkernel' -# gkernel = 'weisfeilerlehmankernel' - gkernel = 'treeletkernel' - node_label = 'atom' - edge_label = 'bond_type' - - k = 5 - edit_costs = 
[0.16229209837639536, 0.06612870523413916, 0.04030113378793905, 0.20723547009415202, 0.3338607220394598, 0.27054392518077297] - -# sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \ -# = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, -# 'precomputed', edit_costs=edit_costs, -## 'k-graphs', -# parallel=False) -# -# sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \ -# = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, -# 'expert', parallel=False) - - sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \ - = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, - 'expert', parallel=False) - return - - -def test_k_closest_graphs_with_cv(): - gkernel = 'untilhpathkernel' - node_label = 'atom' - edge_label = 'bond_type' - - k = 4 - - y_all = ['3', '1', '4', '6', '7', '8', '9', '2'] - repeats = 50 - collection_path = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/' - graph_dir = collection_path + 'gxl/' - - sod_sm_list = [] - sod_gm_list = [] - dis_k_sm_list = [] - dis_k_gm_list = [] - dis_k_gi_min_list = [] - for y in y_all: - print('\n-------------------------------------------------------') - print('class of y:', y) - - sod_sm_list.append([]) - sod_gm_list.append([]) - dis_k_sm_list.append([]) - dis_k_gm_list.append([]) - dis_k_gi_min_list.append([]) - - for repeat in range(repeats): - print('\nrepeat ', repeat) - collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml' - Gn, _ = loadDataset(collection_file, extra_params=graph_dir) - sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \ - = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, - k, 'whole-dataset', graph_dir=graph_dir, - parallel=False) - - sod_sm_list[-1].append(sod_sm) - sod_gm_list[-1].append(sod_gm) - dis_k_sm_list[-1].append(dis_k_sm) - dis_k_gm_list[-1].append(dis_k_gm) - dis_k_gi_min_list[-1].append(dis_k_gi_min) - - print('\nsods of the set median for this class:', sod_sm_list[-1]) - print('\nsods of the gen median for this class:', sod_gm_list[-1]) - print('\ndistances in kernel space of set median for this class:', - dis_k_sm_list[-1]) - print('\ndistances in kernel space of gen median for this class:', - dis_k_gm_list[-1]) - print('\ndistances in kernel space of min graph for this class:', - dis_k_gi_min_list[-1]) - - sod_sm_list[-1] = np.mean(sod_sm_list[-1]) - sod_gm_list[-1] = np.mean(sod_gm_list[-1]) - dis_k_sm_list[-1] = np.mean(dis_k_sm_list[-1]) - dis_k_gm_list[-1] = np.mean(dis_k_gm_list[-1]) - dis_k_gi_min_list[-1] = np.mean(dis_k_gi_min_list[-1]) - - print() - print('\nmean sods of the set median for each class:', sod_sm_list) - print('\nmean sods of the gen median for each class:', sod_gm_list) - print('\nmean distance in kernel space of set median for each class:', - dis_k_sm_list) - print('\nmean distances in kernel space of gen median for each class:', - dis_k_gm_list) - print('\nmean distances in kernel space of min graph for each class:', - dis_k_gi_min_list) - - print('\nmean sods of the set median of all:', np.mean(sod_sm_list)) - print('\nmean sods of the gen median of all:', np.mean(sod_gm_list)) - print('\nmean distances in kernel space of set median of all:', - np.mean(dis_k_sm_list)) - print('\nmean distances in kernel space of gen median of all:', - np.mean(dis_k_gm_list)) - print('\nmean distances in kernel space of min graph of all:', - np.mean(dis_k_gi_min_list)) - - return - - -if __name__ == '__main__': - 
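(A note on the helpers above: get_closest_k_graphs enumerates all C(n, k) subsets and keeps the one minimizing the sum of pairwise distances, so it is only feasible for modest n and k; get_combination_length computes exactly C(n, k) = n!/(k!(n-k)!), i.e. math.comb(n, k) on Python >= 3.8. Note also that reform_attributes indexes G.node[...] and G.edge[...], which NetworkX 2.x spells G.nodes[...] and G.edges[...].)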
test_k_closest_graphs() -# test_k_closest_graphs_with_cv() \ No newline at end of file diff --git a/gklearn/preimage/test_median_preimage_generator.py b/gklearn/preimage/test_median_preimage_generator.py deleted file mode 100644 index 2f458af..0000000 --- a/gklearn/preimage/test_median_preimage_generator.py +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Fri Mar 27 17:30:55 2020 - -@author: ljia -""" -import multiprocessing -import functools -from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct -from gklearn.preimage import MedianPreimageGenerator -from gklearn.utils import Dataset - - -def test_median_preimage_generator(): - - # 1. set parameters. - print('1. setting parameters...') - ds_name = 'Letter-high' - mpg = MedianPreimageGenerator() - mpg_options = {'fit_method': 'k-graphs', - 'init_ecc': [3, 3, 1, 3, 3], - 'ds_name': 'Letter-high', - 'parallel': True, - 'time_limit_in_sec': 0, - 'max_itrs': 100, - 'max_itrs_without_update': 3, - 'epsilon_ratio': 0.01, - 'verbose': 2} - mpg.set_options(**mpg_options) - mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) - sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} - mpg.kernel_options = {'name': 'structuralspkernel', - 'edge_weight': None, - 'node_kernels': sub_kernels, - 'edge_kernels': sub_kernels, - 'compute_method': 'naive', - 'parallel': 'imap_unordered', -# 'parallel': None, - 'n_jobs': multiprocessing.cpu_count(), - 'normalize': True, - 'verbose': 2} - mpg.ged_options = {'method': 'IPFP', - 'initial_solutions': 40, - 'edit_cost': 'LETTER2', - 'attr_distance': 'euclidean', - 'ratio_runs_from_initial_solutions': 1, - 'threads': multiprocessing.cpu_count(), - 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'} - mpg.mge_options = {'init_type': 'MEDOID', - 'random_inits': 10, - 'time_limit': 600, - 'verbose': 2, - 'refine': False} - - - # 2. get dataset. - print('2. getting dataset...') - mpg.dataset = Dataset() - mpg.dataset.load_predefined_dataset(ds_name) - mpg.dataset.cut_graphs(range(0, 10)) - - # 3. compute median preimage. - print('3. computing median preimage...') - mpg.run() - - -if __name__ == '__main__': - test_median_preimage_generator() \ No newline at end of file diff --git a/gklearn/preimage/test_others.py b/gklearn/preimage/test_others.py deleted file mode 100644 index a277a17..0000000 --- a/gklearn/preimage/test_others.py +++ /dev/null @@ -1,686 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Thu Jul 4 12:20:16 2019 - -@author: ljia -""" -import numpy as np -import networkx as nx -import matplotlib.pyplot as plt -import time -from tqdm import tqdm - -from gklearn.utils.graphfiles import loadDataset -from gklearn.preimage.median import draw_Letter_graph -from gklearn.preimage.ged import GED, ged_median -from gklearn.preimage.utils import get_same_item_indices, compute_kernel, gram2distances, \ - dis_gstar, remove_edges - - -# --------------------------- These are tests --------------------------------# - -def test_who_is_the_closest_in_kernel_space(Gn): - idx_gi = [0, 6] - g1 = Gn[idx_gi[0]] - g2 = Gn[idx_gi[1]] - # create the "median" graph. 
- gnew = g2.copy() - gnew.remove_node(0) - nx.draw_networkx(gnew) - plt.show() - print(gnew.nodes(data=True)) - Gn = [gnew] + Gn - - # compute gram matrix - Kmatrix = compute_kernel(Gn, 'untilhpathkernel', True) - # the distance matrix - dmatrix = gram2distances(Kmatrix) - print(np.sort(dmatrix[idx_gi[0] + 1])) - print(np.argsort(dmatrix[idx_gi[0] + 1])) - print(np.sort(dmatrix[idx_gi[1] + 1])) - print(np.argsort(dmatrix[idx_gi[1] + 1])) - # for all g in Gn, compute (d(g1, g) + d(g2, g)) / 2 - dis_median = [(dmatrix[i, idx_gi[0] + 1] + dmatrix[i, idx_gi[1] + 1]) / 2 for i in range(len(Gn))] - print(np.sort(dis_median)) - print(np.argsort(dis_median)) - return - - -def test_who_is_the_closest_in_GED_space(Gn): - idx_gi = [0, 6] - g1 = Gn[idx_gi[0]] - g2 = Gn[idx_gi[1]] - # create the "median" graph. - gnew = g2.copy() - gnew.remove_node(0) - nx.draw_networkx(gnew) - plt.show() - print(gnew.nodes(data=True)) - Gn = [gnew] + Gn - - # compute GEDs - ged_matrix = np.zeros((len(Gn), len(Gn))) - for i1 in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout): - for i2 in range(len(Gn)): - dis, _, _ = GED(Gn[i1], Gn[i2], lib='gedlib') - ged_matrix[i1, i2] = dis - print(np.sort(ged_matrix[idx_gi[0] + 1])) - print(np.argsort(ged_matrix[idx_gi[0] + 1])) - print(np.sort(ged_matrix[idx_gi[1] + 1])) - print(np.argsort(ged_matrix[idx_gi[1] + 1])) - # for all g in Gn, compute (GED(g1, g) + GED(g2, g)) / 2 - dis_median = [(ged_matrix[i, idx_gi[0] + 1] + ged_matrix[i, idx_gi[1] + 1]) / 2 for i in range(len(Gn))] - print(np.sort(dis_median)) - print(np.argsort(dis_median)) - return - - -def test_will_IAM_give_the_median_graph_we_wanted(Gn): - idx_gi = [0, 6] - g1 = Gn[idx_gi[0]].copy() - g2 = Gn[idx_gi[1]].copy() -# del Gn[idx_gi[0]] -# del Gn[idx_gi[1] - 1] - g_median = test_iam_with_more_graphs_as_init([g1, g2], [g1, g2], c_ei=1, c_er=1, c_es=1) -# g_median = test_iam_with_more_graphs_as_init(Gn, Gn, c_ei=1, c_er=1, c_es=1) - nx.draw_networkx(g_median) - plt.show() - print(g_median.nodes(data=True)) - print(g_median.edges(data=True)) - - -def test_new_IAM_allGraph_deleteNodes(Gn): - idx_gi = [0, 6] -# g1 = Gn[idx_gi[0]].copy() -# g2 = Gn[idx_gi[1]].copy() - -# g1 = nx.Graph(name='haha') -# g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})]) -# g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'})]) -# g2 = nx.Graph(name='hahaha') -# g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'}), -# (3, {'atom': 'O'}), (4, {'atom': 'C'})]) -# g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}), -# (2, 3, {'bond_type': '1'}), (3, 4, {'bond_type': '1'})]) - - g1 = nx.Graph(name='haha') - g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'C'}), - (3, {'atom': 'S'}), (4, {'atom': 'S'})]) - g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}), - (2, 3, {'bond_type': '1'}), (2, 4, {'bond_type': '1'})]) - g2 = nx.Graph(name='hahaha') - g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'C'}), - (3, {'atom': 'O'}), (4, {'atom': 'O'})]) - g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}), - (2, 3, {'bond_type': '1'}), (2, 4, {'bond_type': '1'})]) - -# g2 = g1.copy() -# g2.add_nodes_from([(3, {'atom': 'O'})]) -# g2.add_nodes_from([(4, {'atom': 'C'})]) -# g2.add_edges_from([(1, 3, {'bond_type': '1'})]) -# g2.add_edges_from([(3, 4, {'bond_type': '1'})]) - -# del Gn[idx_gi[0]] -# del Gn[idx_gi[1] - 1] - - nx.draw_networkx(g1) - plt.show() - 
print(g1.nodes(data=True)) - print(g1.edges(data=True)) - nx.draw_networkx(g2) - plt.show() - print(g2.nodes(data=True)) - print(g2.edges(data=True)) - - g_median = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations([g1, g2], [g1, g2], c_ei=1, c_er=1, c_es=1) -# g_median = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(Gn, Gn, c_ei=1, c_er=1, c_es=1) - nx.draw_networkx(g_median) - plt.show() - print(g_median.nodes(data=True)) - print(g_median.edges(data=True)) - - -def test_the_simple_two(Gn, gkernel): - from gk_iam import gk_iam_nearest_multi - lmbda = 0.03 # termination probalility - r_max = 10 # recursions - l = 500 - alpha_range = np.linspace(0.5, 0.5, 1) - k = 2 # k nearest neighbors - - # randomly select two molecules - np.random.seed(1) - idx_gi = [0, 6] # np.random.randint(0, len(Gn), 2) - g1 = Gn[idx_gi[0]] - g2 = Gn[idx_gi[1]] - Gn_mix = [g.copy() for g in Gn] - Gn_mix.append(g1.copy()) - Gn_mix.append(g2.copy()) - -# g_tmp = iam([g1, g2]) -# nx.draw_networkx(g_tmp) -# plt.show() - - # compute -# k_list = [] # kernel between each graph and itself. -# k_g1_list = [] # kernel between each graph and g1 -# k_g2_list = [] # kernel between each graph and g2 -# for ig, g in tqdm(enumerate(Gn), desc='computing self kernels', file=sys.stdout): -# ktemp = compute_kernel([g, g1, g2], 'marginalizedkernel', False) -# k_list.append(ktemp[0][0, 0]) -# k_g1_list.append(ktemp[0][0, 1]) -# k_g2_list.append(ktemp[0][0, 2]) - - km = compute_kernel(Gn_mix, gkernel, True) -# k_list = np.diag(km) # kernel between each graph and itself. -# k_g1_list = km[idx_gi[0]] # kernel between each graph and g1 -# k_g2_list = km[idx_gi[1]] # kernel between each graph and g2 - - g_best = [] - dis_best = [] - # for each alpha - for alpha in alpha_range: - print('alpha =', alpha) - dhat, ghat_list = gk_iam_nearest_multi(Gn, [g1, g2], [alpha, 1 - alpha], - range(len(Gn), len(Gn) + 2), km, - k, r_max,gkernel) - dis_best.append(dhat) - g_best.append(ghat_list) - - for idx, item in enumerate(alpha_range): - print('when alpha is', item, 'the shortest distance is', dis_best[idx]) - print('the corresponding pre-images are') - for g in g_best[idx]: - nx.draw_networkx(g) - plt.show() - print(g.nodes(data=True)) - print(g.edges(data=True)) - - -def test_remove_bests(Gn, gkernel): - from gk_iam import gk_iam_nearest_multi - lmbda = 0.03 # termination probalility - r_max = 10 # recursions - l = 500 - alpha_range = np.linspace(0.5, 0.5, 1) - k = 20 # k nearest neighbors - - # randomly select two molecules - np.random.seed(1) - idx_gi = [0, 6] # np.random.randint(0, len(Gn), 2) - g1 = Gn[idx_gi[0]] - g2 = Gn[idx_gi[1]] - # remove the best 2 graphs. 
- del Gn[idx_gi[0]] - del Gn[idx_gi[1] - 1] -# del Gn[8] - - Gn_mix = [g.copy() for g in Gn] - Gn_mix.append(g1.copy()) - Gn_mix.append(g2.copy()) - - - # compute - km = compute_kernel(Gn_mix, gkernel, True) - g_best = [] - dis_best = [] - # for each alpha - for alpha in alpha_range: - print('alpha =', alpha) - dhat, ghat_list = gk_iam_nearest_multi(Gn, [g1, g2], [alpha, 1 - alpha], - range(len(Gn), len(Gn) + 2), km, - k, r_max, gkernel) - dis_best.append(dhat) - g_best.append(ghat_list) - - for idx, item in enumerate(alpha_range): - print('when alpha is', item, 'the shortest distance is', dis_best[idx]) - print('the corresponding pre-images are') - for g in g_best[idx]: - draw_Letter_graph(g) -# nx.draw_networkx(g) -# plt.show() - print(g.nodes(data=True)) - print(g.edges(data=True)) - - -############################################################################### -# Tests on dataset Letter-H. - -def test_gkiam_letter_h(): - from gk_iam import gk_iam_nearest_multi - ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt', - 'extra_params': {}} # node nsymb -# ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt', -# 'extra_params': {}} # node nsymb - Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) - gkernel = 'structuralspkernel' - - lmbda = 0.03 # termination probalility - r_max = 3 # recursions -# alpha_range = np.linspace(0.5, 0.5, 1) - k = 10 # k nearest neighbors - - # classify graphs according to letters. - idx_dict = get_same_item_indices(y_all) - time_list = [] - sod_ks_min_list = [] - sod_gs_list = [] - sod_gs_min_list = [] - nb_updated_list = [] - for letter in idx_dict: - print('\n-------------------------------------------------------\n') - Gn_let = [Gn[i].copy() for i in idx_dict[letter]] - Gn_mix = Gn_let + [g.copy() for g in Gn_let] - - alpha_range = np.linspace(1 / len(Gn_let), 1 / len(Gn_let), 1) - - # compute - time0 = time.time() - km = compute_kernel(Gn_mix, gkernel, True) - g_best = [] - dis_best = [] - # for each alpha - for alpha in alpha_range: - print('alpha =', alpha) - dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn_let, - Gn_let, [alpha] * len(Gn_let), range(len(Gn_let), len(Gn_mix)), - km, k, r_max, gkernel, c_ei=1.7, c_er=1.7, c_es=1.7, - ged_cost='LETTER', ged_method='IPFP', saveGXL='gedlib-letter') - dis_best.append(dhat) - g_best.append(ghat_list) - time_list.append(time.time() - time0) - - # show best graphs and save them to file. - for idx, item in enumerate(alpha_range): - print('when alpha is', item, 'the shortest distance is', dis_best[idx]) - print('the corresponding pre-images are') - for g in g_best[idx]: - draw_Letter_graph(g, savepath='results/gk_iam/') -# nx.draw_networkx(g) -# plt.show() - print(g.nodes(data=True)) - print(g.edges(data=True)) - - # compute the corresponding sod in graph space. (alpha range not considered.) 
- sod_tmp, _ = ged_median(g_best[0], Gn_let, ged_cost='LETTER', - ged_method='IPFP', saveGXL='gedlib-letter') - sod_gs_list.append(sod_tmp) - sod_gs_min_list.append(np.min(sod_tmp)) - sod_ks_min_list.append(sod_ks) - nb_updated_list.append(nb_updated) - - - print('\nsods in graph space: ', sod_gs_list) - print('\nsmallest sod in graph space for each letter: ', sod_gs_min_list) - print('\nsmallest sod in kernel space for each letter: ', sod_ks_min_list) - print('\nnumber of updates for each letter: ', nb_updated_list) - print('\ntimes:', time_list) - -#def compute_letter_median_by_average(Gn): -# return g_median - - -def test_iam_letter_h(): - from iam import test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations - ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt', - 'extra_params': {}} # node nsymb -# ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt', -# 'extra_params': {}} # node nsymb - Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) - - lmbda = 0.03 # termination probalility -# alpha_range = np.linspace(0.5, 0.5, 1) - - # classify graphs according to letters. - idx_dict = get_same_item_indices(y_all) - time_list = [] - sod_list = [] - sod_min_list = [] - for letter in idx_dict: - Gn_let = [Gn[i].copy() for i in idx_dict[letter]] - - alpha_range = np.linspace(1 / len(Gn_let), 1 / len(Gn_let), 1) - - # compute - g_best = [] - dis_best = [] - time0 = time.time() - # for each alpha - for alpha in alpha_range: - print('alpha =', alpha) - ghat_list, dhat = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations( - Gn_let, Gn_let, c_ei=1.7, c_er=1.7, c_es=1.7, - ged_cost='LETTER', ged_method='IPFP', saveGXL='gedlib-letter') - dis_best.append(dhat) - g_best.append(ghat_list) - time_list.append(time.time() - time0) - - # show best graphs and save them to file. - for idx, item in enumerate(alpha_range): - print('when alpha is', item, 'the shortest distance is', dis_best[idx]) - print('the corresponding pre-images are') - for g in g_best[idx]: - draw_Letter_graph(g, savepath='results/iam/') -# nx.draw_networkx(g) -# plt.show() - print(g.nodes(data=True)) - print(g.edges(data=True)) - - # compute the corresponding sod in kernel space. (alpha range not considered.) 
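dis_gstar, used in the loop below, presumably evaluates the standard kernel-space distance between a graph and the weighted mean of the median set; withterm3=False drops the third, candidate-independent term, which is harmless when candidates are only ranked. A minimal sketch of that distance, assuming km is a Gram matrix indexed over both candidates and median graphs and alphas are the mixture weights:

    import numpy as np

    def dis_to_weighted_mean(idx_g, idx_medians, alphas, km, withterm3=True):
        # d(g, g_bar)^2 = k(g, g) - 2 * sum_i alpha_i * k(g, g_i)
        #                 + sum_i sum_j alpha_i * alpha_j * k(g_i, g_j)
        term1 = km[idx_g, idx_g]
        term2 = 2 * sum(a * km[idx_g, j] for a, j in zip(alphas, idx_medians))
        term3 = sum(ai * aj * km[i, j]
                    for ai, i in zip(alphas, idx_medians)
                    for aj, j in zip(alphas, idx_medians)) if withterm3 else 0
        return np.sqrt(max(term1 - term2 + term3, 0))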
- gkernel = 'structuralspkernel' - sod_tmp = [] - Gn_mix = g_best[0] + Gn_let - km = compute_kernel(Gn_mix, gkernel, True) - for ig, g in tqdm(enumerate(g_best[0]), desc='computing kernel sod', file=sys.stdout): - dtemp = dis_gstar(ig, range(len(g_best[0]), len(Gn_mix)), - [alpha_range[0]] * len(Gn_let), km, withterm3=False) - sod_tmp.append(dtemp) - sod_list.append(sod_tmp) - sod_min_list.append(np.min(sod_tmp)) - - - print('\nsods in kernel space: ', sod_list) - print('\nsmallest sod in kernel space for each letter: ', sod_min_list) - print('\ntimes:', time_list) - - -def test_random_preimage_letter_h(): - from preimage_random import preimage_random - ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt', - 'extra_params': {}} # node nsymb -# ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt', -# 'extra_params': {}} # node nsymb - # ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', -# 'extra_params': {}} # node/edge symb -# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds', -# 'extra_params': {}} -# ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', -# 'extra_params': {}} # node symb - Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) - gkernel = 'structuralspkernel' - -# lmbda = 0.03 # termination probalility - r_max = 3 # 10 # recursions - l = 500 -# alpha_range = np.linspace(0.5, 0.5, 1) - #alpha_range = np.linspace(0.1, 0.9, 9) - k = 10 # 5 # k nearest neighbors - - # classify graphs according to letters. - idx_dict = get_same_item_indices(y_all) - time_list = [] - sod_list = [] - sod_min_list = [] - for letter in idx_dict: - print('\n-------------------------------------------------------\n') - Gn_let = [Gn[i].copy() for i in idx_dict[letter]] - Gn_mix = Gn_let + [g.copy() for g in Gn_let] - - alpha_range = np.linspace(1 / len(Gn_let), 1 / len(Gn_let), 1) - - # compute - time0 = time.time() - km = compute_kernel(Gn_mix, gkernel, True) - g_best = [] - dis_best = [] - # for each alpha - for alpha in alpha_range: - print('alpha =', alpha) - dhat, ghat_list = preimage_random(Gn_let, Gn_let, [alpha] * len(Gn_let), - range(len(Gn_let), len(Gn_mix)), km, - k, r_max, gkernel, c_ei=1.7, - c_er=1.7, c_es=1.7) - dis_best.append(dhat) - g_best.append(ghat_list) - time_list.append(time.time() - time0) - - # show best graphs and save them to file. - for idx, item in enumerate(alpha_range): - print('when alpha is', item, 'the shortest distance is', dis_best[idx]) - print('the corresponding pre-images are') - for g in g_best[idx]: - draw_Letter_graph(g, savepath='results/gk_iam/') -# nx.draw_networkx(g) -# plt.show() - print(g.nodes(data=True)) - print(g.edges(data=True)) - - # compute the corresponding sod in graph space. (alpha range not considered.) 
- sod_tmp, _ = ged_median(g_best[0], Gn_let) - sod_list.append(sod_tmp) - sod_min_list.append(np.min(sod_tmp)) - - - print('\nsods in graph space: ', sod_list) - print('\nsmallest sod in graph space for each letter: ', sod_min_list) - print('\ntimes:', time_list) - - - - - - - -def test_gkiam_mutag(): - from gk_iam import gk_iam_nearest_multi - ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt', - 'extra_params': {}} # node nsymb -# ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt', -# 'extra_params': {}} # node nsymb - Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) - gkernel = 'structuralspkernel' - - lmbda = 0.03 # termination probalility - r_max = 3 # recursions -# alpha_range = np.linspace(0.5, 0.5, 1) - k = 20 # k nearest neighbors - - # classify graphs according to letters. - idx_dict = get_same_item_indices(y_all) - time_list = [] - sod_ks_min_list = [] - sod_gs_list = [] - sod_gs_min_list = [] - nb_updated_list = [] - for letter in idx_dict: - print('\n-------------------------------------------------------\n') - Gn_let = [Gn[i].copy() for i in idx_dict[letter]] - Gn_mix = Gn_let + [g.copy() for g in Gn_let] - - alpha_range = np.linspace(1 / len(Gn_let), 1 / len(Gn_let), 1) - - # compute - time0 = time.time() - km = compute_kernel(Gn_mix, gkernel, True) - g_best = [] - dis_best = [] - # for each alpha - for alpha in alpha_range: - print('alpha =', alpha) - dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn_let, Gn_let, [alpha] * len(Gn_let), - range(len(Gn_let), len(Gn_mix)), km, - k, r_max, gkernel, c_ei=1.7, - c_er=1.7, c_es=1.7) - dis_best.append(dhat) - g_best.append(ghat_list) - time_list.append(time.time() - time0) - - # show best graphs and save them to file. - for idx, item in enumerate(alpha_range): - print('when alpha is', item, 'the shortest distance is', dis_best[idx]) - print('the corresponding pre-images are') - for g in g_best[idx]: - draw_Letter_graph(g, savepath='results/gk_iam/') -# nx.draw_networkx(g) -# plt.show() - print(g.nodes(data=True)) - print(g.edges(data=True)) - - # compute the corresponding sod in graph space. (alpha range not considered.) - sod_tmp, _ = ged_median(g_best[0], Gn_let) - sod_gs_list.append(sod_tmp) - sod_gs_min_list.append(np.min(sod_tmp)) - sod_ks_min_list.append(sod_ks) - nb_updated_list.append(nb_updated) - - - print('\nsods in graph space: ', sod_gs_list) - print('\nsmallest sod in graph space for each letter: ', sod_gs_min_list) - print('\nsmallest sod in kernel space for each letter: ', sod_ks_min_list) - print('\nnumber of updates for each letter: ', nb_updated_list) - print('\ntimes:', time_list) - - -############################################################################### -# Re-test. - -def retest_the_simple_two(): - from gk_iam import gk_iam_nearest_multi - - # The two simple graphs. 
-# g1 = nx.Graph(name='haha') -# g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})]) -# g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'})]) -# g2 = nx.Graph(name='hahaha') -# g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'}), -# (3, {'atom': 'O'}), (4, {'atom': 'C'})]) -# g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}), -# (2, 3, {'bond_type': '1'}), (3, 4, {'bond_type': '1'})]) - - g1 = nx.Graph(name='haha') - g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'C'}), - (3, {'atom': 'S'}), (4, {'atom': 'S'})]) - g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}), - (2, 3, {'bond_type': '1'}), (2, 4, {'bond_type': '1'})]) - g2 = nx.Graph(name='hahaha') - g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'C'}), - (3, {'atom': 'O'}), (4, {'atom': 'O'})]) - g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}), - (2, 3, {'bond_type': '1'}), (2, 4, {'bond_type': '1'})]) - -# # randomly select two molecules -# np.random.seed(1) -# idx_gi = [0, 6] # np.random.randint(0, len(Gn), 2) -# g1 = Gn[idx_gi[0]] -# g2 = Gn[idx_gi[1]] -# Gn_mix = [g.copy() for g in Gn] -# Gn_mix.append(g1.copy()) -# Gn_mix.append(g2.copy()) - - Gn = [g1.copy(), g2.copy()] - remove_edges(Gn) - gkernel = 'marginalizedkernel' - - lmbda = 0.03 # termination probalility - r_max = 10 # recursions -# l = 500 - alpha_range = np.linspace(0.5, 0.5, 1) - k = 2 # k nearest neighbors - epsilon = 1e-6 - ged_cost='CHEM_1' - ged_method='IPFP' - saveGXL='gedlib' - c_ei=1 - c_er=1 - c_es=1 - - Gn_mix = Gn + [g1.copy(), g2.copy()] - - # compute - time0 = time.time() - km = compute_kernel(Gn_mix, gkernel, True) - time_km = time.time() - time0 - - time_list = [] - sod_ks_min_list = [] - sod_gs_list = [] - sod_gs_min_list = [] - nb_updated_list = [] - g_best = [] - # for each alpha - for alpha in alpha_range: - print('\n-------------------------------------------------------\n') - print('alpha =', alpha) - time0 = time.time() - dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn, [g1, g2], - [alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max, - gkernel, c_ei=c_ei, c_er=c_er, c_es=c_es, epsilon=epsilon, - ged_cost=ged_cost, ged_method=ged_method, saveGXL=saveGXL) - time_total = time.time() - time0 + time_km - print('time: ', time_total) - time_list.append(time_total) - sod_ks_min_list.append(dhat) - g_best.append(ghat_list) - nb_updated_list.append(nb_updated) - - # show best graphs and save them to file. - for idx, item in enumerate(alpha_range): - print('when alpha is', item, 'the shortest distance is', sod_ks_min_list[idx]) - print('one of the possible corresponding pre-images is') - nx.draw(g_best[idx][0], labels=nx.get_node_attributes(g_best[idx][0], 'atom'), - with_labels=True) - plt.savefig('results/gk_iam/mutag_alpha' + str(item) + '.png', format="PNG") - plt.show() - print(g_best[idx][0].nodes(data=True)) - print(g_best[idx][0].edges(data=True)) - -# for g in g_best[idx]: -# draw_Letter_graph(g, savepath='results/gk_iam/') -## nx.draw_networkx(g) -## plt.show() -# print(g.nodes(data=True)) -# print(g.edges(data=True)) - - # compute the corresponding sod in graph space. 
- for idx, item in enumerate(alpha_range): - sod_tmp, _ = ged_median(g_best[0], [g1, g2], ged_cost=ged_cost, - ged_method=ged_method, saveGXL=saveGXL) - sod_gs_list.append(sod_tmp) - sod_gs_min_list.append(np.min(sod_tmp)) - - print('\nsods in graph space: ', sod_gs_list) - print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list) - print('\nsmallest sod in kernel space for each alpha: ', sod_ks_min_list) - print('\nnumber of updates for each alpha: ', nb_updated_list) - print('\ntimes:', time_list) - - - -if __name__ == '__main__': -# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', -# 'extra_params': {}} # node/edge symb -# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt', -# 'extra_params': {}} # node nsymb -# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds', -# 'extra_params': {}} -# ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', -# 'extra_params': {}} # node symb -# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) -# Gn = Gn[0:20] - -# import networkx.algorithms.isomorphism as iso -# G1 = nx.MultiDiGraph() -# G2 = nx.MultiDiGraph() -# G1.add_nodes_from([1,2,3], fill='red') -# G2.add_nodes_from([10,20,30,40], fill='red') -# nx.add_path(G1, [1,2,3,4], weight=3, linewidth=2.5) -# nx.add_path(G2, [10,20,30,40], weight=3) -# nm = iso.categorical_node_match('fill', 'red') -# print(nx.is_isomorphic(G1, G2, node_match=nm)) -# -# test_new_IAM_allGraph_deleteNodes(Gn) -# test_will_IAM_give_the_median_graph_we_wanted(Gn) -# test_who_is_the_closest_in_GED_space(Gn) -# test_who_is_the_closest_in_kernel_space(Gn) - -# test_the_simple_two(Gn, 'untilhpathkernel') -# test_remove_bests(Gn, 'untilhpathkernel') -# test_gkiam_letter_h() -# test_iam_letter_h() -# test_random_preimage_letter_h - -############################################################################### -# retests. - retest_the_simple_two() \ No newline at end of file diff --git a/gklearn/preimage/test_preimage_iam.py b/gklearn/preimage/test_preimage_iam.py deleted file mode 100644 index 9b05dd9..0000000 --- a/gklearn/preimage/test_preimage_iam.py +++ /dev/null @@ -1,620 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Thu Sep 5 15:59:00 2019 - -@author: ljia -""" - -import numpy as np -import networkx as nx -import matplotlib.pyplot as plt -import time -import random -#from tqdm import tqdm - -from gklearn.utils.graphfiles import loadDataset -from gklearn.preimage.utils import remove_edges, compute_kernel, get_same_item_indices -from gklearn.preimage.ged import ged_median - -from gklearn.preimage.preimage_iam import preimage_iam - - -############################################################################### -# tests on different values on grid of median-sets and k. - -def test_preimage_iam_grid_k_median_nb(): - ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', - 'extra_params': {}} # node/edge symb - Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) -# Gn = Gn[0:50] - remove_edges(Gn) - gkernel = 'marginalizedkernel' - - lmbda = 0.03 # termination probalility - r_max = 5 # iteration limit for pre-image. 
-# alpha_range = np.linspace(0.5, 0.5, 1)
-# k = 5 # k nearest neighbors
- epsilon = 1e-6
- InitIAMWithAllDk = True
- # parameters for GED function
- ged_cost='CHEM_1'
- ged_method='IPFP'
- saveGXL='gedlib'
- # parameters for IAM function
- c_ei=1
- c_er=1
- c_es=1
- ite_max_iam = 50
- epsilon_iam = 0.001
- removeNodes = True
- connected_iam = False
-
- # number of graphs; we want to compute the median of these graphs.
- nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
- # number of nearest neighbors.
- k_range = [5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 100]
-
- # find all the graphs classified into positive group 1.
- idx_dict = get_same_item_indices(y_all)
- Gn = [Gn[i] for i in idx_dict[1]]
-
-# # compute Gram matrix.
-# time0 = time.time()
-# km = compute_kernel(Gn, gkernel, True)
-# time_km = time.time() - time0
-# # write Gram matrix to file.
-# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
-
-
- time_list = []
- dis_ks_min_list = []
- sod_gs_list = []
- sod_gs_min_list = []
- nb_updated_list = []
- nb_updated_k_list = []
- g_best = []
- for idx_nb, nb_median in enumerate(nb_median_range):
- print('\n-------------------------------------------------------')
- print('number of median graphs =', nb_median)
- random.seed(1)
- idx_rdm = random.sample(range(len(Gn)), nb_median)
- print('graphs chosen:', idx_rdm)
- Gn_median = [Gn[idx].copy() for idx in idx_rdm]
-
-# for g in Gn_median:
-# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
-## plt.savefig("results/preimage_mix/mutag.png", format="PNG")
-# plt.show()
-# plt.clf()
-
- ###################################################################
- gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
- km_tmp = gmfile['gm']
- time_km = gmfile['gmtime']
- # modify mixed gram matrix. 
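The loops that follow embed the cached Gram matrix km_tmp (computed over the n dataset graphs) into an (n + nb_median) x (n + nb_median) matrix whose last rows and columns repeat the entries of the chosen median graphs, so no kernel value is recomputed. A vectorized sketch of the same construction, assuming km_tmp is symmetric (as a Gram matrix is) and idx_rdm holds the indices of the median graphs:

    import numpy as np

    km = np.block([
        [km_tmp, km_tmp[:, idx_rdm]],
        [km_tmp[idx_rdm, :], km_tmp[np.ix_(idx_rdm, idx_rdm)]],
    ])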
- km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median)) - for i in range(len(Gn)): - for j in range(i, len(Gn)): - km[i, j] = km_tmp[i, j] - km[j, i] = km[i, j] - for i in range(len(Gn)): - for j, idx in enumerate(idx_rdm): - km[i, len(Gn) + j] = km[i, idx] - km[len(Gn) + j, i] = km[i, idx] - for i, idx1 in enumerate(idx_rdm): - for j, idx2 in enumerate(idx_rdm): - km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2] - - ################################################################### - alpha_range = [1 / nb_median] * nb_median - - time_list.append([]) - dis_ks_min_list.append([]) - sod_gs_list.append([]) - sod_gs_min_list.append([]) - nb_updated_list.append([]) - nb_updated_k_list.append([]) - g_best.append([]) - - for k in k_range: - print('\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n') - print('k =', k) - time0 = time.time() - dhat, ghat_list, dis_of_each_itr, nb_updated, nb_updated_k = \ - preimage_iam(Gn, Gn_median, - alpha_range, range(len(Gn), len(Gn) + nb_median), km, k, r_max, - gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk, - params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es, - 'ite_max': ite_max_iam, 'epsilon': epsilon_iam, - 'removeNodes': removeNodes, 'connected': connected_iam}, - params_ged={'ged_cost': ged_cost, 'ged_method': ged_method, - 'saveGXL': saveGXL}) - - time_total = time.time() - time0 + time_km - print('time: ', time_total) - time_list[idx_nb].append(time_total) - print('\nsmallest distance in kernel space: ', dhat) - dis_ks_min_list[idx_nb].append(dhat) - g_best[idx_nb].append(ghat_list) - print('\nnumber of updates of the best graph by IAM: ', nb_updated) - nb_updated_list[idx_nb].append(nb_updated) - print('\nnumber of updates of k nearest graphs by IAM: ', nb_updated_k) - nb_updated_k_list[idx_nb].append(nb_updated_k) - - # show the best graph and save it to file. - print('the shortest distance is', dhat) - print('one of the possible corresponding pre-images is') - nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'), - with_labels=True) - plt.savefig('results/preimage_iam/mutag_median_nb' + str(nb_median) + - '_k' + str(k) + '.png', format="PNG") - # plt.show() - plt.clf() - # print(ghat_list[0].nodes(data=True)) - # print(ghat_list[0].edges(data=True)) - - # compute the corresponding sod in graph space. - sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, ged_cost=ged_cost, - ged_method=ged_method, saveGXL=saveGXL) - sod_gs_list[idx_nb].append(sod_tmp) - sod_gs_min_list[idx_nb].append(np.min(sod_tmp)) - print('\nsmallest sod in graph space: ', np.min(sod_tmp)) - - print('\nsods in graph space: ', sod_gs_list) - print('\nsmallest sod in graph space for each set of median graphs and k: ', - sod_gs_min_list) - print('\nsmallest distance in kernel space for each set of median graphs and k: ', - dis_ks_min_list) - print('\nnumber of updates of the best graph for each set of median graphs and k by IAM: ', - nb_updated_list) - print('\nnumber of updates of k nearest graphs for each set of median graphs and k by IAM: ', - nb_updated_k_list) - print('\ntimes:', time_list) - - - - - - -############################################################################### -# tests on different numbers of median-sets. 
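The function below switches from the 'CHEM_1' preset to the 'CONSTANT' GED cost model, parameterized by six constants; judging from the variable names, these are the costs of vertex insertion, removal and substitution and of edge insertion, removal and substitution. A hypothetical helper, for illustration only, showing how such a model prices an edit path from the counts of each elementary operation:

    def constant_edit_path_cost(n_vi, n_vr, n_vs, n_ei, n_er, n_es,
                                c_vi=4, c_vr=4, c_vs=2, c_ei=1, c_er=1, c_es=1):
        # Total cost is a weighted count of the six elementary edit operations.
        return (n_vi * c_vi + n_vr * c_vr + n_vs * c_vs
                + n_ei * c_ei + n_er * c_er + n_es * c_es)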
-
-def test_preimage_iam_median_nb():
- ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
- 'extra_params': {}} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-# Gn = Gn[0:50]
- remove_edges(Gn)
- gkernel = 'marginalizedkernel'
-
- lmbda = 0.03 # termination probability
- r_max = 3 # iteration limit for pre-image.
-# alpha_range = np.linspace(0.5, 0.5, 1)
- k = 5 # k nearest neighbors
- epsilon = 1e-6
- InitIAMWithAllDk = True
- # parameters for IAM function
-# c_vi = 0.037
-# c_vr = 0.038
-# c_vs = 0.075
-# c_ei = 0.001
-# c_er = 0.001
-# c_es = 0.0
- c_vi = 4
- c_vr = 4
- c_vs = 2
- c_ei = 1
- c_er = 1
- c_es = 1
- ite_max_iam = 50
- epsilon_iam = 0.001
- removeNodes = True
- connected_iam = False
- # parameters for GED function
-# ged_cost='CHEM_1'
- ged_cost = 'CONSTANT'
- ged_method = 'IPFP'
- edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
- ged_stabilizer = 'min'
- ged_repeat = 50
- params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
- 'edit_cost_constant': edit_cost_constant,
- 'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
-
- # number of graphs; we want to compute the median of these graphs.
-# nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
- nb_median_range = [2]
-
- # find all the graphs classified into positive group 1.
- idx_dict = get_same_item_indices(y_all)
- Gn = [Gn[i] for i in idx_dict[1]]
-
-# # compute Gram matrix.
-# time0 = time.time()
-# km = compute_kernel(Gn, gkernel, True)
-# time_km = time.time() - time0
-# # write Gram matrix to file.
-# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
-
-
- time_list = []
- dis_ks_min_list = []
- sod_gs_list = []
- sod_gs_min_list = []
- nb_updated_list = []
- nb_updated_k_list = []
- g_best = []
- for nb_median in nb_median_range:
- print('\n-------------------------------------------------------')
- print('number of median graphs =', nb_median)
- random.seed(1)
- idx_rdm = random.sample(range(len(Gn)), nb_median)
- print('graphs chosen:', idx_rdm)
- Gn_median = [Gn[idx].copy() for idx in idx_rdm]
-
-# for g in Gn_median:
-# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
-## plt.savefig("results/preimage_mix/mutag.png", format="PNG")
-# plt.show()
-# plt.clf()
-
- ###################################################################
- gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
- km_tmp = gmfile['gm']
- time_km = gmfile['gmtime']
- # modify mixed gram matrix. 
- km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median)) - for i in range(len(Gn)): - for j in range(i, len(Gn)): - km[i, j] = km_tmp[i, j] - km[j, i] = km[i, j] - for i in range(len(Gn)): - for j, idx in enumerate(idx_rdm): - km[i, len(Gn) + j] = km[i, idx] - km[len(Gn) + j, i] = km[i, idx] - for i, idx1 in enumerate(idx_rdm): - for j, idx2 in enumerate(idx_rdm): - km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2] - - ################################################################### - alpha_range = [1 / nb_median] * nb_median - time0 = time.time() - dhat, ghat_list, dis_of_each_itr, nb_updated, nb_updated_k = \ - preimage_iam(Gn, Gn_median, - alpha_range, range(len(Gn), len(Gn) + nb_median), km, k, r_max, - gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk, - params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es, - 'ite_max': ite_max_iam, 'epsilon': epsilon_iam, - 'removeNodes': removeNodes, 'connected': connected_iam}, - params_ged=params_ged) - - time_total = time.time() - time0 + time_km - print('\ntime: ', time_total) - time_list.append(time_total) - print('\nsmallest distance in kernel space: ', dhat) - dis_ks_min_list.append(dhat) - g_best.append(ghat_list) - print('\nnumber of updates of the best graph: ', nb_updated) - nb_updated_list.append(nb_updated) - print('\nnumber of updates of k nearest graphs: ', nb_updated_k) - nb_updated_k_list.append(nb_updated_k) - - # show the best graph and save it to file. - print('the shortest distance is', dhat) - print('one of the possible corresponding pre-images is') - nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'), - with_labels=True) - plt.show() -# plt.savefig('results/preimage_iam/mutag_median_cs.001_nb' + str(nb_median) + -# '.png', format="PNG") - plt.clf() -# print(ghat_list[0].nodes(data=True)) -# print(ghat_list[0].edges(data=True)) - - # compute the corresponding sod in graph space. - sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, params_ged=params_ged) - sod_gs_list.append(sod_tmp) - sod_gs_min_list.append(np.min(sod_tmp)) - print('\nsmallest sod in graph space: ', np.min(sod_tmp)) - - print('\nsods in graph space: ', sod_gs_list) - print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list) - print('\nsmallest distance in kernel space for each set of median graphs: ', - dis_ks_min_list) - print('\nnumber of updates of the best graph for each set of median graphs by IAM: ', - nb_updated_list) - print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ', - nb_updated_k_list) - print('\ntimes:', time_list) - - - - - - -############################################################################### -# test on the combination of the two randomly chosen graphs. (the same as in the -# random pre-image paper.) - -def test_gkiam_2combination_all_pairs(): - ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', - 'extra_params': {}} # node/edge symb - Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) -# Gn = Gn[0:50] - remove_edges(Gn) - gkernel = 'marginalizedkernel' - - lmbda = 0.03 # termination probalility - r_max = 10 # iteration limit for pre-image. 
- alpha_range = np.linspace(0.5, 0.5, 1) - k = 5 # k nearest neighbors - epsilon = 1e-6 - InitIAMWithAllDk = False - # parameters for GED function - ged_cost='CHEM_1' - ged_method='IPFP' - saveGXL='gedlib' - # parameters for IAM function - c_ei=1 - c_er=1 - c_es=1 - ite_max_iam = 50 - epsilon_iam = 0.001 - removeNodes = True - connected_iam = False - - nb_update_mat = np.full((len(Gn), len(Gn)), np.inf) - # test on each pair of graphs. -# for idx1 in range(len(Gn) - 1, -1, -1): -# for idx2 in range(idx1, -1, -1): - for idx1 in range(187, 188): - for idx2 in range(167, 168): - g1 = Gn[idx1].copy() - g2 = Gn[idx2].copy() - # Gn[10] = [] - # Gn[10] = [] - - nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True) - plt.savefig("results/gk_iam/all_pairs/mutag187.png", format="PNG") - plt.show() - plt.clf() - nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True) - plt.savefig("results/gk_iam/all_pairs/mutag167.png", format="PNG") - plt.show() - plt.clf() - - ################################################################### -# Gn_mix = [g.copy() for g in Gn] -# Gn_mix.append(g1.copy()) -# Gn_mix.append(g2.copy()) -# -# # compute -# time0 = time.time() -# km = compute_kernel(Gn_mix, gkernel, True) -# time_km = time.time() - time0 -# -# # write Gram matrix to file and read it. -# np.savez('results/gram_matrix_uhpath_itr7_pq0.8.gm', gm=km, gmtime=time_km) - - ################################################################### - gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03.gm.npz') - km = gmfile['gm'] - time_km = gmfile['gmtime'] - # modify mixed gram matrix. - for i in range(len(Gn)): - km[i, len(Gn)] = km[i, idx1] - km[i, len(Gn) + 1] = km[i, idx2] - km[len(Gn), i] = km[i, idx1] - km[len(Gn) + 1, i] = km[i, idx2] - km[len(Gn), len(Gn)] = km[idx1, idx1] - km[len(Gn), len(Gn) + 1] = km[idx1, idx2] - km[len(Gn) + 1, len(Gn)] = km[idx2, idx1] - km[len(Gn) + 1, len(Gn) + 1] = km[idx2, idx2] - - ################################################################### -# # use only the two graphs in median set as candidates. -# Gn = [g1.copy(), g2.copy()] -# Gn_mix = Gn + [g1.copy(), g2.copy()] -# # compute -# time0 = time.time() -# km = compute_kernel(Gn_mix, gkernel, True) -# time_km = time.time() - time0 - - - time_list = [] - dis_ks_min_list = [] - sod_gs_list = [] - sod_gs_min_list = [] - nb_updated_list = [] - nb_updated_k_list = [] - g_best = [] - # for each alpha - for alpha in alpha_range: - print('\n-------------------------------------------------------\n') - print('alpha =', alpha) - time0 = time.time() - dhat, ghat_list, sod_ks, nb_updated, nb_updated_k = \ - preimage_iam(Gn, [g1, g2], - [alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max, - gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk, - params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es, - 'ite_max': ite_max_iam, 'epsilon': epsilon_iam, - 'removeNodes': removeNodes, 'connected': connected_iam}, - params_ged={'ged_cost': ged_cost, 'ged_method': ged_method, - 'saveGXL': saveGXL}) - time_total = time.time() - time0 + time_km - print('time: ', time_total) - time_list.append(time_total) - dis_ks_min_list.append(dhat) - g_best.append(ghat_list) - nb_updated_list.append(nb_updated) - nb_updated_k_list.append(nb_updated_k) - - # show best graphs and save them to file. 
- for idx, item in enumerate(alpha_range): - print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx]) - print('one of the possible corresponding pre-images is') - nx.draw(g_best[idx][0], labels=nx.get_node_attributes(g_best[idx][0], 'atom'), - with_labels=True) - plt.savefig('results/gk_iam/mutag' + str(idx1) + '_' + str(idx2) - + '_alpha' + str(item) + '.png', format="PNG") -# plt.show() - plt.clf() -# print(g_best[idx][0].nodes(data=True)) -# print(g_best[idx][0].edges(data=True)) - - # for g in g_best[idx]: - # draw_Letter_graph(g, savepath='results/gk_iam/') - ## nx.draw_networkx(g) - ## plt.show() - # print(g.nodes(data=True)) - # print(g.edges(data=True)) - - # compute the corresponding sod in graph space. - for idx, item in enumerate(alpha_range): - sod_tmp, _ = ged_median([g_best[0]], [g1, g2], ged_cost=ged_cost, - ged_method=ged_method, saveGXL=saveGXL) - sod_gs_list.append(sod_tmp) - sod_gs_min_list.append(np.min(sod_tmp)) - - print('\nsods in graph space: ', sod_gs_list) - print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list) - print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list) - print('\nnumber of updates of the best graph for each alpha: ', - nb_updated_list) - print('\nnumber of updates of the k nearest graphs for each alpha: ', - nb_updated_k_list) - print('\ntimes:', time_list) - nb_update_mat[idx1, idx2] = nb_updated_list[0] - - str_fw = 'graphs %d and %d: %d.\n' % (idx1, idx2, nb_updated_list[0]) - with open('results/gk_iam/all_pairs/nb_updates.txt', 'r+') as file: - content = file.read() - file.seek(0, 0) - file.write(str_fw + content) - - - -def test_gkiam_2combination(): - from gk_iam import gk_iam_nearest_multi - ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', - 'extra_params': {}} # node/edge symb - Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) -# Gn = Gn[0:50] - remove_edges(Gn) - gkernel = 'marginalizedkernel' - - lmbda = 0.03 # termination probalility - r_max = 10 # iteration limit for pre-image. - alpha_range = np.linspace(0.5, 0.5, 1) - k = 20 # k nearest neighbors - epsilon = 1e-6 - ged_cost='CHEM_1' - ged_method='IPFP' - saveGXL='gedlib' - c_ei=1 - c_er=1 - c_es=1 - - # randomly select two molecules - np.random.seed(1) - idx_gi = [10, 11] # np.random.randint(0, len(Gn), 2) - g1 = Gn[idx_gi[0]].copy() - g2 = Gn[idx_gi[1]].copy() -# Gn[10] = [] -# Gn[10] = [] - -# nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True) -# plt.savefig("results/random_preimage/mutag10.png", format="PNG") -# plt.show() -# nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True) -# plt.savefig("results/random_preimage/mutag11.png", format="PNG") -# plt.show() - - Gn_mix = [g.copy() for g in Gn] - Gn_mix.append(g1.copy()) - Gn_mix.append(g2.copy()) - - # compute -# time0 = time.time() -# km = compute_kernel(Gn_mix, gkernel, True) -# time_km = time.time() - time0 - - # write Gram matrix to file and read it. 
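The commented np.savez call and the active np.load below cache the expensive Gram matrix on disk between runs. Note that np.savez appends '.npz' to a file name that lacks it, which is why the matrix is saved as 'results/gram_matrix.gm' but loaded back as 'results/gram_matrix.gm.npz'. A self-contained sketch of the same round trip (file name illustrative):

    import time
    import numpy as np

    t0 = time.time()
    gm = np.random.rand(3, 3)  # stand-in for an expensive kernel computation
    np.savez('gram_matrix_demo.gm', gm=gm, gmtime=time.time() - t0)  # writes .gm.npz

    # later runs reload instead of recomputing.
    data = np.load('gram_matrix_demo.gm.npz')
    gm, gmtime = data['gm'], float(data['gmtime'])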
-# np.savez('results/gram_matrix.gm', gm=km, gmtime=time_km) - gmfile = np.load('results/gram_matrix.gm.npz') - km = gmfile['gm'] - time_km = gmfile['gmtime'] - - time_list = [] - dis_ks_min_list = [] - sod_gs_list = [] - sod_gs_min_list = [] - nb_updated_list = [] - g_best = [] - # for each alpha - for alpha in alpha_range: - print('\n-------------------------------------------------------\n') - print('alpha =', alpha) - time0 = time.time() - dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn, [g1, g2], - [alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max, - gkernel, c_ei=c_ei, c_er=c_er, c_es=c_es, epsilon=epsilon, - ged_cost=ged_cost, ged_method=ged_method, saveGXL=saveGXL) - time_total = time.time() - time0 + time_km - print('time: ', time_total) - time_list.append(time_total) - dis_ks_min_list.append(dhat) - g_best.append(ghat_list) - nb_updated_list.append(nb_updated) - - # show best graphs and save them to file. - for idx, item in enumerate(alpha_range): - print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx]) - print('one of the possible corresponding pre-images is') - nx.draw(g_best[idx][0], labels=nx.get_node_attributes(g_best[idx][0], 'atom'), - with_labels=True) - plt.savefig('results/gk_iam/mutag_alpha' + str(item) + '.png', format="PNG") - plt.show() - print(g_best[idx][0].nodes(data=True)) - print(g_best[idx][0].edges(data=True)) - -# for g in g_best[idx]: -# draw_Letter_graph(g, savepath='results/gk_iam/') -## nx.draw_networkx(g) -## plt.show() -# print(g.nodes(data=True)) -# print(g.edges(data=True)) - - # compute the corresponding sod in graph space. - for idx, item in enumerate(alpha_range): - sod_tmp, _ = ged_median([g_best[0]], [g1, g2], ged_cost=ged_cost, - ged_method=ged_method, saveGXL=saveGXL) - sod_gs_list.append(sod_tmp) - sod_gs_min_list.append(np.min(sod_tmp)) - - print('\nsods in graph space: ', sod_gs_list) - print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list) - print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list) - print('\nnumber of updates for each alpha: ', nb_updated_list) - print('\ntimes:', time_list) - - -############################################################################### - - -if __name__ == '__main__': -############################################################################### -# test on the combination of the two randomly chosen graphs. (the same as in the -# random pre-image paper.) -# test_gkiam_2combination() -# test_gkiam_2combination_all_pairs() - -############################################################################### -# tests on different numbers of median-sets. - test_preimage_iam_median_nb() - -############################################################################### -# tests on different values on grid of median-sets and k. 
-# test_preimage_iam_grid_k_median_nb() \ No newline at end of file diff --git a/gklearn/preimage/test_preimage_mix.py b/gklearn/preimage/test_preimage_mix.py deleted file mode 100644 index 888de86..0000000 --- a/gklearn/preimage/test_preimage_mix.py +++ /dev/null @@ -1,539 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Thu Sep 5 15:59:00 2019 - -@author: ljia -""" - -import numpy as np -import networkx as nx -import matplotlib.pyplot as plt -import time -import random -#from tqdm import tqdm - -from gklearn.utils.graphfiles import loadDataset -from gklearn.preimage.ged import ged_median -from gklearn.preimage.utils import compute_kernel, get_same_item_indices, remove_edges -from gklearn.preimage.preimage_iam import preimage_iam_random_mix - -############################################################################### -# tests on different values on grid of median-sets and k. - -def test_preimage_mix_grid_k_median_nb(): - ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', - 'extra_params': {}} # node/edge symb - Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) -# Gn = Gn[0:50] - remove_edges(Gn) - gkernel = 'marginalizedkernel' - - lmbda = 0.03 # termination probalility - r_max = 5 # iteration limit for pre-image. - l_max = 500 # update limit for random generation -# alpha_range = np.linspace(0.5, 0.5, 1) -# k = 5 # k nearest neighbors - epsilon = 1e-6 - InitIAMWithAllDk = True - InitRandomWithAllDk = True - # parameters for GED function - ged_cost='CHEM_1' - ged_method='IPFP' - saveGXL='gedlib' - # parameters for IAM function - c_ei=1 - c_er=1 - c_es=1 - ite_max_iam = 50 - epsilon_iam = 0.001 - removeNodes = True - connected_iam = False - - # number of graphs; we what to compute the median of these graphs. - nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100] - # number of nearest neighbors. - k_range = [5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 100] - - # find out all the graphs classified to positive group 1. - idx_dict = get_same_item_indices(y_all) - Gn = [Gn[i] for i in idx_dict[1]] - -# # compute Gram matrix. -# time0 = time.time() -# km = compute_kernel(Gn, gkernel, True) -# time_km = time.time() - time0 -# # write Gram matrix to file. -# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km) - - - time_list = [] - dis_ks_min_list = [] - sod_gs_list = [] - sod_gs_min_list = [] - nb_updated_list_iam = [] - nb_updated_list_random = [] - nb_updated_k_list_iam = [] - nb_updated_k_list_random = [] - g_best = [] - for idx_nb, nb_median in enumerate(nb_median_range): - print('\n-------------------------------------------------------') - print('number of median graphs =', nb_median) - random.seed(1) - idx_rdm = random.sample(range(len(Gn)), nb_median) - print('graphs chosen:', idx_rdm) - Gn_median = [Gn[idx].copy() for idx in idx_rdm] - -# for g in Gn_median: -# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True) -## plt.savefig("results/preimage_mix/mutag.png", format="PNG") -# plt.show() -# plt.clf() - - ################################################################### - gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz') - km_tmp = gmfile['gm'] - time_km = gmfile['gmtime'] - # modify mixed gram matrix. 
- km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median)) - for i in range(len(Gn)): - for j in range(i, len(Gn)): - km[i, j] = km_tmp[i, j] - km[j, i] = km[i, j] - for i in range(len(Gn)): - for j, idx in enumerate(idx_rdm): - km[i, len(Gn) + j] = km[i, idx] - km[len(Gn) + j, i] = km[i, idx] - for i, idx1 in enumerate(idx_rdm): - for j, idx2 in enumerate(idx_rdm): - km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2] - - ################################################################### - alpha_range = [1 / nb_median] * nb_median - - time_list.append([]) - dis_ks_min_list.append([]) - sod_gs_list.append([]) - sod_gs_min_list.append([]) - nb_updated_list_iam.append([]) - nb_updated_list_random.append([]) - nb_updated_k_list_iam.append([]) - nb_updated_k_list_random.append([]) - g_best.append([]) - - for k in k_range: - print('\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n') - print('k =', k) - time0 = time.time() - dhat, ghat_list, dis_of_each_itr, nb_updated_iam, nb_updated_random, \ - nb_updated_k_iam, nb_updated_k_random = \ - preimage_iam_random_mix(Gn, Gn_median, - alpha_range, range(len(Gn), len(Gn) + nb_median), km, k, r_max, - l_max, gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk, - InitRandomWithAllDk=InitRandomWithAllDk, - params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es, - 'ite_max': ite_max_iam, 'epsilon': epsilon_iam, - 'removeNodes': removeNodes, 'connected': connected_iam}, - params_ged={'ged_cost': ged_cost, 'ged_method': ged_method, - 'saveGXL': saveGXL}) - - time_total = time.time() - time0 + time_km - print('time: ', time_total) - time_list[idx_nb].append(time_total) - print('\nsmallest distance in kernel space: ', dhat) - dis_ks_min_list[idx_nb].append(dhat) - g_best[idx_nb].append(ghat_list) - print('\nnumber of updates of the best graph by IAM: ', nb_updated_iam) - nb_updated_list_iam[idx_nb].append(nb_updated_iam) - print('\nnumber of updates of the best graph by random generation: ', - nb_updated_random) - nb_updated_list_random[idx_nb].append(nb_updated_random) - print('\nnumber of updates of k nearest graphs by IAM: ', nb_updated_k_iam) - nb_updated_k_list_iam[idx_nb].append(nb_updated_k_iam) - print('\nnumber of updates of k nearest graphs by random generation: ', - nb_updated_k_random) - nb_updated_k_list_random[idx_nb].append(nb_updated_k_random) - - # show the best graph and save it to file. - print('the shortest distance is', dhat) - print('one of the possible corresponding pre-images is') - nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'), - with_labels=True) - plt.savefig('results/preimage_mix/mutag_median_nb' + str(nb_median) + - '_k' + str(k) + '.png', format="PNG") - # plt.show() - plt.clf() - # print(ghat_list[0].nodes(data=True)) - # print(ghat_list[0].edges(data=True)) - - # compute the corresponding sod in graph space. 
- sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, ged_cost=ged_cost, - ged_method=ged_method, saveGXL=saveGXL) - sod_gs_list[idx_nb].append(sod_tmp) - sod_gs_min_list[idx_nb].append(np.min(sod_tmp)) - print('\nsmallest sod in graph space: ', np.min(sod_tmp)) - - print('\nsods in graph space: ', sod_gs_list) - print('\nsmallest sod in graph space for each set of median graphs and k: ', - sod_gs_min_list) - print('\nsmallest distance in kernel space for each set of median graphs and k: ', - dis_ks_min_list) - print('\nnumber of updates of the best graph for each set of median graphs and k by IAM: ', - nb_updated_list_iam) - print('\nnumber of updates of the best graph for each set of median graphs and k by random generation: ', - nb_updated_list_random) - print('\nnumber of updates of k nearest graphs for each set of median graphs and k by IAM: ', - nb_updated_k_list_iam) - print('\nnumber of updates of k nearest graphs for each set of median graphs and k by random generation: ', - nb_updated_k_list_random) - print('\ntimes:', time_list) - - - - -############################################################################### -# tests on different numbers of median-sets. - -def test_preimage_mix_median_nb(): - ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', - 'extra_params': {}} # node/edge symb - Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) -# Gn = Gn[0:50] - remove_edges(Gn) - gkernel = 'marginalizedkernel' - - lmbda = 0.03 # termination probalility - r_max = 5 # iteration limit for pre-image. - l_max = 500 # update limit for random generation -# alpha_range = np.linspace(0.5, 0.5, 1) - k = 5 # k nearest neighbors - epsilon = 1e-6 - InitIAMWithAllDk = True - InitRandomWithAllDk = True - # parameters for GED function - ged_cost='CHEM_1' - ged_method='IPFP' - saveGXL='gedlib' - # parameters for IAM function - c_ei=1 - c_er=1 - c_es=1 - ite_max_iam = 50 - epsilon_iam = 0.001 - removeNodes = True - connected_iam = False - - # number of graphs; we what to compute the median of these graphs. - nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100] - - # find out all the graphs classified to positive group 1. - idx_dict = get_same_item_indices(y_all) - Gn = [Gn[i] for i in idx_dict[1]] - -# # compute Gram matrix. -# time0 = time.time() -# km = compute_kernel(Gn, gkernel, True) -# time_km = time.time() - time0 -# # write Gram matrix to file. -# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km) - - - time_list = [] - dis_ks_min_list = [] - sod_gs_list = [] - sod_gs_min_list = [] - nb_updated_list_iam = [] - nb_updated_list_random = [] - nb_updated_k_list_iam = [] - nb_updated_k_list_random = [] - g_best = [] - for nb_median in nb_median_range: - print('\n-------------------------------------------------------') - print('number of median graphs =', nb_median) - random.seed(1) - idx_rdm = random.sample(range(len(Gn)), nb_median) - print('graphs chosen:', idx_rdm) - Gn_median = [Gn[idx].copy() for idx in idx_rdm] - -# for g in Gn_median: -# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True) -## plt.savefig("results/preimage_mix/mutag.png", format="PNG") -# plt.show() -# plt.clf() - - ################################################################### - gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz') - km_tmp = gmfile['gm'] - time_km = gmfile['gmtime'] - # modify mixed gram matrix. 
- km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median)) - for i in range(len(Gn)): - for j in range(i, len(Gn)): - km[i, j] = km_tmp[i, j] - km[j, i] = km[i, j] - for i in range(len(Gn)): - for j, idx in enumerate(idx_rdm): - km[i, len(Gn) + j] = km[i, idx] - km[len(Gn) + j, i] = km[i, idx] - for i, idx1 in enumerate(idx_rdm): - for j, idx2 in enumerate(idx_rdm): - km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2] - - ################################################################### - alpha_range = [1 / nb_median] * nb_median - time0 = time.time() - dhat, ghat_list, dis_of_each_itr, nb_updated_iam, nb_updated_random, \ - nb_updated_k_iam, nb_updated_k_random = \ - preimage_iam_random_mix(Gn, Gn_median, - alpha_range, range(len(Gn), len(Gn) + nb_median), km, k, r_max, - l_max, gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk, - InitRandomWithAllDk=InitRandomWithAllDk, - params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es, - 'ite_max': ite_max_iam, 'epsilon': epsilon_iam, - 'removeNodes': removeNodes, 'connected': connected_iam}, - params_ged={'ged_cost': ged_cost, 'ged_method': ged_method, - 'saveGXL': saveGXL}) - - time_total = time.time() - time0 + time_km - print('time: ', time_total) - time_list.append(time_total) - print('\nsmallest distance in kernel space: ', dhat) - dis_ks_min_list.append(dhat) - g_best.append(ghat_list) - print('\nnumber of updates of the best graph by IAM: ', nb_updated_iam) - nb_updated_list_iam.append(nb_updated_iam) - print('\nnumber of updates of the best graph by random generation: ', - nb_updated_random) - nb_updated_list_random.append(nb_updated_random) - print('\nnumber of updates of k nearest graphs by IAM: ', nb_updated_k_iam) - nb_updated_k_list_iam.append(nb_updated_k_iam) - print('\nnumber of updates of k nearest graphs by random generation: ', - nb_updated_k_random) - nb_updated_k_list_random.append(nb_updated_k_random) - - # show the best graph and save it to file. - print('the shortest distance is', dhat) - print('one of the possible corresponding pre-images is') - nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'), - with_labels=True) - plt.savefig('results/preimage_mix/mutag_median_nb' + str(nb_median) + - '.png', format="PNG") -# plt.show() - plt.clf() -# print(ghat_list[0].nodes(data=True)) -# print(ghat_list[0].edges(data=True)) - - # compute the corresponding sod in graph space. - sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, ged_cost=ged_cost, - ged_method=ged_method, saveGXL=saveGXL) - sod_gs_list.append(sod_tmp) - sod_gs_min_list.append(np.min(sod_tmp)) - print('\nsmallest sod in graph space: ', np.min(sod_tmp)) - - print('\nsods in graph space: ', sod_gs_list) - print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list) - print('\nsmallest distance in kernel space for each set of median graphs: ', - dis_ks_min_list) - print('\nnumber of updates of the best graph for each set of median graphs by IAM: ', - nb_updated_list_iam) - print('\nnumber of updates of the best graph for each set of median graphs by random generation: ', - nb_updated_list_random) - print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ', - nb_updated_k_list_iam) - print('\nnumber of updates of k nearest graphs for each set of median graphs by random generation: ', - nb_updated_k_list_random) - print('\ntimes:', time_list) - - - -############################################################################### -# test on the combination of the two randomly chosen graphs. 
(the same as in the
-# random pre-image paper.)
-
-def test_preimage_mix_2combination_all_pairs():
- ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
- 'extra_params': {}} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-# Gn = Gn[0:50]
- remove_edges(Gn)
- gkernel = 'marginalizedkernel'
-
- lmbda = 0.03 # termination probability
- r_max = 10 # iteration limit for pre-image.
- l_max = 500 # update limit for random generation
- alpha_range = np.linspace(0.5, 0.5, 1)
- k = 5 # k nearest neighbors
- epsilon = 1e-6
- InitIAMWithAllDk = True
- InitRandomWithAllDk = True
- # parameters for GED function
- ged_cost='CHEM_1'
- ged_method='IPFP'
- saveGXL='gedlib'
- # parameters for IAM function
- c_ei=1
- c_er=1
- c_es=1
- ite_max_iam = 50
- epsilon_iam = 0.001
- removeNodes = True
- connected_iam = False
-
- nb_update_mat_iam = np.full((len(Gn), len(Gn)), np.inf)
- nb_update_mat_random = np.full((len(Gn), len(Gn)), np.inf)
- # test on each pair of graphs.
-# for idx1 in range(len(Gn) - 1, -1, -1):
-# for idx2 in range(idx1, -1, -1):
- for idx1 in range(187, 188):
- for idx2 in range(167, 168):
- g1 = Gn[idx1].copy()
- g2 = Gn[idx2].copy()
- # Gn[10] = []
- # Gn[10] = []
-
- nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True)
- plt.savefig("results/preimage_mix/mutag187.png", format="PNG")
- plt.show()
- plt.clf()
- nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True)
- plt.savefig("results/preimage_mix/mutag167.png", format="PNG")
- plt.show()
- plt.clf()
-
- ###################################################################
-# Gn_mix = [g.copy() for g in Gn]
-# Gn_mix.append(g1.copy())
-# Gn_mix.append(g2.copy())
-#
-# # compute
-# time0 = time.time()
-# km = compute_kernel(Gn_mix, gkernel, True)
-# time_km = time.time() - time0
-#
-# # write Gram matrix to file and read it.
-# np.savez('results/gram_matrix_uhpath_itr7_pq0.8.gm', gm=km, gmtime=time_km)
-
- ###################################################################
- gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03.gm.npz')
- km = gmfile['gm']
- time_km = gmfile['gmtime']
- # modify mixed gram matrix.
- for i in range(len(Gn)):
- km[i, len(Gn)] = km[i, idx1]
- km[i, len(Gn) + 1] = km[i, idx2]
- km[len(Gn), i] = km[i, idx1]
- km[len(Gn) + 1, i] = km[i, idx2]
- km[len(Gn), len(Gn)] = km[idx1, idx1]
- km[len(Gn), len(Gn) + 1] = km[idx1, idx2]
- km[len(Gn) + 1, len(Gn)] = km[idx2, idx1]
- km[len(Gn) + 1, len(Gn) + 1] = km[idx2, idx2]
-
- ###################################################################
-# # use only the two graphs in median set as candidates.
-# Gn = [g1.copy(), g2.copy()] -# Gn_mix = Gn + [g1.copy(), g2.copy()] -# # compute -# time0 = time.time() -# km = compute_kernel(Gn_mix, gkernel, True) -# time_km = time.time() - time0 - - - time_list = [] - dis_ks_min_list = [] - sod_gs_list = [] - sod_gs_min_list = [] - nb_updated_list_iam = [] - nb_updated_list_random = [] - nb_updated_k_list_iam = [] - nb_updated_k_list_random = [] - g_best = [] - # for each alpha - for alpha in alpha_range: - print('\n-------------------------------------------------------\n') - print('alpha =', alpha) - time0 = time.time() - dhat, ghat_list, dis_of_each_itr, nb_updated_iam, nb_updated_random, \ - nb_updated_k_iam, nb_updated_k_random = \ - preimage_iam_random_mix(Gn, [g1, g2], - [alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max, - l_max, gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk, - InitRandomWithAllDk=InitRandomWithAllDk, - params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es, - 'ite_max': ite_max_iam, 'epsilon': epsilon_iam, - 'removeNodes': removeNodes, 'connected': connected_iam}, - params_ged={'ged_cost': ged_cost, 'ged_method': ged_method, - 'saveGXL': saveGXL}) - time_total = time.time() - time0 + time_km - print('time: ', time_total) - time_list.append(time_total) - dis_ks_min_list.append(dhat) - g_best.append(ghat_list) - nb_updated_list_iam.append(nb_updated_iam) - nb_updated_list_random.append(nb_updated_random) - nb_updated_k_list_iam.append(nb_updated_k_iam) - nb_updated_k_list_random.append(nb_updated_k_random) - - # show best graphs and save them to file. - for idx, item in enumerate(alpha_range): - print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx]) - print('one of the possible corresponding pre-images is') - nx.draw(g_best[idx][0], labels=nx.get_node_attributes(g_best[idx][0], 'atom'), - with_labels=True) - plt.savefig('results/preimage_mix/mutag' + str(idx1) + '_' + str(idx2) - + '_alpha' + str(item) + '.png', format="PNG") -# plt.show() - plt.clf() -# print(g_best[idx][0].nodes(data=True)) -# print(g_best[idx][0].edges(data=True)) - - # for g in g_best[idx]: - # draw_Letter_graph(g, savepath='results/gk_iam/') - ## nx.draw_networkx(g) - ## plt.show() - # print(g.nodes(data=True)) - # print(g.edges(data=True)) - - # compute the corresponding sod in graph space. 
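- # (the SOD of a candidate pre-image is the sum of its graph edit distances
- # to all graphs of the median set; it measures quality in graph space,
- # whereas dhat above measures quality in kernel space.)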
- for idx, item in enumerate(alpha_range):
- sod_tmp, _ = ged_median([g_best[idx][0]], [g1, g2], ged_cost=ged_cost,
- ged_method=ged_method, saveGXL=saveGXL)
- sod_gs_list.append(sod_tmp)
- sod_gs_min_list.append(np.min(sod_tmp))
-
- print('\nsods in graph space: ', sod_gs_list)
- print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list)
- print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list)
- print('\nnumber of updates of the best graph for each alpha by IAM: ', nb_updated_list_iam)
- print('\nnumber of updates of the best graph for each alpha by random generation: ',
- nb_updated_list_random)
- print('\nnumber of updates of k nearest graphs for each alpha by IAM: ',
- nb_updated_k_list_iam)
- print('\nnumber of updates of k nearest graphs for each alpha by random generation: ',
- nb_updated_k_list_random)
- print('\ntimes:', time_list)
- nb_update_mat_iam[idx1, idx2] = nb_updated_list_iam[0]
- nb_update_mat_random[idx1, idx2] = nb_updated_list_random[0]
-
- str_fw = 'graphs %d and %d: %d times by IAM, %d times by random generation.\n' \
- % (idx1, idx2, nb_updated_list_iam[0], nb_updated_list_random[0])
- with open('results/preimage_mix/nb_updates.txt', 'r+') as file:
- content = file.read()
- file.seek(0, 0)
- file.write(str_fw + content)
-
-###############################################################################
-
-
-if __name__ == '__main__':
-###############################################################################
-# test on the combination of the two randomly chosen graphs. (the same as in the
-# random pre-image paper.)
-# test_preimage_mix_2combination_all_pairs()
-
-###############################################################################
-# tests on different numbers of median-sets.
-# test_preimage_mix_median_nb()
-
-###############################################################################
-# tests on different values on grid of median-sets and k.
- test_preimage_mix_grid_k_median_nb()
\ No newline at end of file
diff --git a/gklearn/preimage/test_preimage_random.py b/gklearn/preimage/test_preimage_random.py
deleted file mode 100644
index bb77d2f..0000000
--- a/gklearn/preimage/test_preimage_random.py
+++ /dev/null
@@ -1,398 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Thu Sep 5 15:59:00 2019
-
-@author: ljia
-"""
-
-import numpy as np
-import networkx as nx
-import matplotlib.pyplot as plt
-import time
-import random
-#from tqdm import tqdm
-
-from gklearn.utils.graphfiles import loadDataset
-from gklearn.preimage.preimage_random import preimage_random
-from gklearn.preimage.ged import ged_median
-from gklearn.preimage.utils import compute_kernel, get_same_item_indices, remove_edges
-
-
-###############################################################################
-# tests on different values on grid of median-sets and k.
-
-def test_preimage_random_grid_k_median_nb():
- ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
- 'extra_params': {}} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-# Gn = Gn[0:50]
- remove_edges(Gn)
- gkernel = 'marginalizedkernel'
-
- lmbda = 0.03 # termination probability
- r_max = 5 # iteration limit for pre-image.
- l = 500 # update limit for random generation
-# alpha_range = np.linspace(0.5, 0.5, 1)
-# k = 5 # k nearest neighbors
- # parameters for GED function
- ged_cost='CHEM_1'
- ged_method='IPFP'
- saveGXL='gedlib'
-
- # number of graphs; we want to compute the median of these graphs.
- nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100] - # number of nearest neighbors. - k_range = [5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 100] - - # find out all the graphs classified to positive group 1. - idx_dict = get_same_item_indices(y_all) - Gn = [Gn[i] for i in idx_dict[1]] - -# # compute Gram matrix. -# time0 = time.time() -# km = compute_kernel(Gn, gkernel, True) -# time_km = time.time() - time0 -# # write Gram matrix to file. -# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km) - - - time_list = [] - dis_ks_min_list = [] - sod_gs_list = [] - sod_gs_min_list = [] - nb_updated_list = [] - g_best = [] - for idx_nb, nb_median in enumerate(nb_median_range): - print('\n-------------------------------------------------------') - print('number of median graphs =', nb_median) - random.seed(1) - idx_rdm = random.sample(range(len(Gn)), nb_median) - print('graphs chosen:', idx_rdm) - Gn_median = [Gn[idx].copy() for idx in idx_rdm] - -# for g in Gn_median: -# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True) -## plt.savefig("results/preimage_mix/mutag.png", format="PNG") -# plt.show() -# plt.clf() - - ################################################################### - gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz') - km_tmp = gmfile['gm'] - time_km = gmfile['gmtime'] - # modify mixed gram matrix. - km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median)) - for i in range(len(Gn)): - for j in range(i, len(Gn)): - km[i, j] = km_tmp[i, j] - km[j, i] = km[i, j] - for i in range(len(Gn)): - for j, idx in enumerate(idx_rdm): - km[i, len(Gn) + j] = km[i, idx] - km[len(Gn) + j, i] = km[i, idx] - for i, idx1 in enumerate(idx_rdm): - for j, idx2 in enumerate(idx_rdm): - km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2] - - ################################################################### - alpha_range = [1 / nb_median] * nb_median - - time_list.append([]) - dis_ks_min_list.append([]) - sod_gs_list.append([]) - sod_gs_min_list.append([]) - nb_updated_list.append([]) - g_best.append([]) - - for k in k_range: - print('\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n') - print('k =', k) - time0 = time.time() - dhat, ghat, nb_updated = preimage_random(Gn, Gn_median, alpha_range, - range(len(Gn), len(Gn) + nb_median), km, k, r_max, l, gkernel) - - time_total = time.time() - time0 + time_km - print('time: ', time_total) - time_list[idx_nb].append(time_total) - print('\nsmallest distance in kernel space: ', dhat) - dis_ks_min_list[idx_nb].append(dhat) - g_best[idx_nb].append(ghat) - print('\nnumber of updates of the best graph: ', nb_updated) - nb_updated_list[idx_nb].append(nb_updated) - - # show the best graph and save it to file. - print('the shortest distance is', dhat) - print('one of the possible corresponding pre-images is') - nx.draw(ghat, labels=nx.get_node_attributes(ghat, 'atom'), - with_labels=True) - plt.savefig('results/preimage_random/mutag_median_nb' + str(nb_median) + - '_k' + str(k) + '.png', format="PNG") - # plt.show() - plt.clf() - # print(ghat_list[0].nodes(data=True)) - # print(ghat_list[0].edges(data=True)) - - # compute the corresponding sod in graph space. 
- sod_tmp, _ = ged_median([ghat], Gn_median, ged_cost=ged_cost,
- ged_method=ged_method, saveGXL=saveGXL)
- sod_gs_list[idx_nb].append(sod_tmp)
- sod_gs_min_list[idx_nb].append(np.min(sod_tmp))
- print('\nsmallest sod in graph space: ', np.min(sod_tmp))
-
- print('\nsods in graph space: ', sod_gs_list)
- print('\nsmallest sod in graph space for each set of median graphs and k: ',
- sod_gs_min_list)
- print('\nsmallest distance in kernel space for each set of median graphs and k: ',
- dis_ks_min_list)
- print('\nnumber of updates of the best graph for each set of median graphs and k by IAM: ',
- nb_updated_list)
- print('\ntimes:', time_list)
-
-
-
-
-###############################################################################
-# tests on different numbers of median-sets.
-
-def test_preimage_random_median_nb():
- ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
- 'extra_params': {}} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-# Gn = Gn[0:50]
- remove_edges(Gn)
- gkernel = 'marginalizedkernel'
-
- lmbda = 0.03 # termination probability
- r_max = 5 # iteration limit for pre-image.
- l = 500 # update limit for random generation
-# alpha_range = np.linspace(0.5, 0.5, 1)
- k = 5 # k nearest neighbors
- # parameters for GED function
- ged_cost='CHEM_1'
- ged_method='IPFP'
- saveGXL='gedlib'
-
- # number of graphs; we want to compute the median of these graphs.
- nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
-
- # find out all the graphs classified to positive group 1.
- idx_dict = get_same_item_indices(y_all)
- Gn = [Gn[i] for i in idx_dict[1]]
-
-# # compute Gram matrix.
-# time0 = time.time()
-# km = compute_kernel(Gn, gkernel, True)
-# time_km = time.time() - time0
-# # write Gram matrix to file.
-# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
-
-
- time_list = []
- dis_ks_min_list = []
- sod_gs_list = []
- sod_gs_min_list = []
- nb_updated_list = []
- g_best = []
- for nb_median in nb_median_range:
- print('\n-------------------------------------------------------')
- print('number of median graphs =', nb_median)
- random.seed(1)
- idx_rdm = random.sample(range(len(Gn)), nb_median)
- print('graphs chosen:', idx_rdm)
- Gn_median = [Gn[idx].copy() for idx in idx_rdm]
-
-# for g in Gn_median:
-# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
-## plt.savefig("results/preimage_mix/mutag.png", format="PNG")
-# plt.show()
-# plt.clf()
-
- ###################################################################
- gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
- km_tmp = gmfile['gm']
- time_km = gmfile['gmtime']
- # modify mixed gram matrix.
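- # The extended matrix has a block structure: the top-left len(Gn) x len(Gn)
- # block is the original Gram matrix, and the rows/columns appended for the
- # median graphs duplicate the kernel values of the dataset graphs they were
- # copied from, so no kernel value needs to be recomputed. A compact sketch
- # of the same construction (assuming the loops below are equivalent):
- # km[:len(Gn), :len(Gn)] = km_tmp
- # km[:len(Gn), len(Gn):] = km_tmp[:, idx_rdm]
- # km[len(Gn):, :len(Gn)] = km_tmp[idx_rdm, :]
- # km[len(Gn):, len(Gn):] = km_tmp[np.ix_(idx_rdm, idx_rdm)]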
- km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
- for i in range(len(Gn)):
- for j in range(i, len(Gn)):
- km[i, j] = km_tmp[i, j]
- km[j, i] = km[i, j]
- for i in range(len(Gn)):
- for j, idx in enumerate(idx_rdm):
- km[i, len(Gn) + j] = km[i, idx]
- km[len(Gn) + j, i] = km[i, idx]
- for i, idx1 in enumerate(idx_rdm):
- for j, idx2 in enumerate(idx_rdm):
- km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]
-
- ###################################################################
- alpha_range = [1 / nb_median] * nb_median
- time0 = time.time()
- dhat, ghat, nb_updated = preimage_random(Gn, Gn_median, alpha_range,
- range(len(Gn), len(Gn) + nb_median), km, k, r_max, l, gkernel)
-
- time_total = time.time() - time0 + time_km
- print('time: ', time_total)
- time_list.append(time_total)
- print('\nsmallest distance in kernel space: ', dhat)
- dis_ks_min_list.append(dhat)
- g_best.append(ghat)
- print('\nnumber of updates of the best graph: ', nb_updated)
- nb_updated_list.append(nb_updated)
-
- # show the best graph and save it to file.
- print('the shortest distance is', dhat)
- print('one of the possible corresponding pre-images is')
- nx.draw(ghat, labels=nx.get_node_attributes(ghat, 'atom'),
- with_labels=True)
- plt.savefig('results/preimage_random/mutag_median_nb' + str(nb_median) +
- '.png', format="PNG")
-# plt.show()
- plt.clf()
-# print(ghat_list[0].nodes(data=True))
-# print(ghat_list[0].edges(data=True))
-
- # compute the corresponding sod in graph space.
- sod_tmp, _ = ged_median([ghat], Gn_median, ged_cost=ged_cost,
- ged_method=ged_method, saveGXL=saveGXL)
- sod_gs_list.append(sod_tmp)
- sod_gs_min_list.append(np.min(sod_tmp))
- print('\nsmallest sod in graph space: ', np.min(sod_tmp))
-
- print('\nsods in graph space: ', sod_gs_list)
- print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list)
- print('\nsmallest distance in kernel space for each set of median graphs: ',
- dis_ks_min_list)
- print('\nnumber of updates of the best graph for each set of median graphs: ',
- nb_updated_list)
- print('\ntimes:', time_list)
-
-
-
-###############################################################################
-# test on the combination of the two randomly chosen graphs. (the same as in the
-# random pre-image paper.)
-
-def test_random_preimage_2combination():
- ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
- 'extra_params': {}} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-# Gn = Gn[0:12]
- remove_edges(Gn)
- gkernel = 'marginalizedkernel'
-
-# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, gkernel=gkernel)
-# print(dis_max, dis_min, dis_mean)
-
- lmbda = 0.03 # termination probability
- r_max = 10 # iteration limit for pre-image.
- l = 500 - alpha_range = np.linspace(0, 1, 11) - k = 5 # k nearest neighbors - - # randomly select two molecules - np.random.seed(1) - idx_gi = [187, 167] # np.random.randint(0, len(Gn), 2) - g1 = Gn[idx_gi[0]].copy() - g2 = Gn[idx_gi[1]].copy() - -# nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True) -# plt.savefig("results/random_preimage/mutag10.png", format="PNG") -# plt.show() -# nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True) -# plt.savefig("results/random_preimage/mutag11.png", format="PNG") -# plt.show() - - ###################################################################### -# Gn_mix = [g.copy() for g in Gn] -# Gn_mix.append(g1.copy()) -# Gn_mix.append(g2.copy()) -# -## g_tmp = iam([g1, g2]) -## nx.draw_networkx(g_tmp) -## plt.show() -# -# # compute -# time0 = time.time() -# km = compute_kernel(Gn_mix, gkernel, True) -# time_km = time.time() - time0 - - ################################################################### - idx1 = idx_gi[0] - idx2 = idx_gi[1] - gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03.gm.npz') - km = gmfile['gm'] - time_km = gmfile['gmtime'] - # modify mixed gram matrix. - for i in range(len(Gn)): - km[i, len(Gn)] = km[i, idx1] - km[i, len(Gn) + 1] = km[i, idx2] - km[len(Gn), i] = km[i, idx1] - km[len(Gn) + 1, i] = km[i, idx2] - km[len(Gn), len(Gn)] = km[idx1, idx1] - km[len(Gn), len(Gn) + 1] = km[idx1, idx2] - km[len(Gn) + 1, len(Gn)] = km[idx2, idx1] - km[len(Gn) + 1, len(Gn) + 1] = km[idx2, idx2] - - ################################################################### - - time_list = [] - nb_updated_list = [] - g_best = [] - dis_ks_min_list = [] - # for each alpha - for alpha in alpha_range: - print('\n-------------------------------------------------------\n') - print('alpha =', alpha) - time0 = time.time() - dhat, ghat, nb_updated = preimage_random(Gn, [g1, g2], [alpha, 1 - alpha], - range(len(Gn), len(Gn) + 2), km, - k, r_max, l, gkernel) - time_total = time.time() - time0 + time_km - print('time: ', time_total) - time_list.append(time_total) - dis_ks_min_list.append(dhat) - g_best.append(ghat) - nb_updated_list.append(nb_updated) - - # show best graphs and save them to file. - for idx, item in enumerate(alpha_range): - print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx]) - print('one of the possible corresponding pre-images is') - nx.draw(g_best[idx], labels=nx.get_node_attributes(g_best[idx], 'atom'), - with_labels=True) - plt.show() - plt.savefig('results/random_preimage/mutag_alpha' + str(item) + '.png', format="PNG") - plt.clf() - print(g_best[idx].nodes(data=True)) - print(g_best[idx].edges(data=True)) - -# # compute the corresponding sod in graph space. (alpha range not considered.) -# sod_tmp, _ = median_distance(g_best[0], Gn_let) -# sod_gs_list.append(sod_tmp) -# sod_gs_min_list.append(np.min(sod_tmp)) -# sod_ks_min_list.append(sod_ks) -# nb_updated_list.append(nb_updated) - -# print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list) - print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list) - print('\nnumber of updates for each alpha: ', nb_updated_list) - print('\ntimes:', time_list) - -############################################################################### - - -if __name__ == '__main__': -############################################################################### -# test on the combination of the two randomly chosen graphs. (the same as in the -# random pre-image paper.) 
-# test_random_preimage_2combination() - -############################################################################### -# tests all algorithms on different numbers of median-sets. - test_preimage_random_median_nb() - -############################################################################### -# tests all algorithms on different values on grid of median-sets and k. -# test_preimage_random_grid_k_median_nb() \ No newline at end of file diff --git a/gklearn/preimage/xp_fit_method.py b/gklearn/preimage/xp_fit_method.py deleted file mode 100644 index ead2786..0000000 --- a/gklearn/preimage/xp_fit_method.py +++ /dev/null @@ -1,935 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Tue Jan 14 15:39:29 2020 - -@author: ljia -""" -import numpy as np -import random -import csv -from shutil import copyfile -import networkx as nx -import matplotlib.pyplot as plt -import os -import time - -from gklearn.utils.graphfiles import loadDataset, loadGXL, saveGXL -from gklearn.preimage.test_k_closest_graphs import median_on_k_closest_graphs, reform_attributes -from gklearn.preimage.utils import get_same_item_indices, kernel_distance_matrix, compute_kernel -from gklearn.preimage.find_best_k import getRelations - - -def get_dataset(ds_name): - if ds_name == 'Letter-high': # node non-symb - dataset = 'cpp_ext/data/collections/Letter.xml' - graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/' - Gn, y_all = loadDataset(dataset, extra_params=graph_dir) - for G in Gn: - reform_attributes(G, na_names=['x', 'y']) - G.graph['node_labels'] = [] - G.graph['edge_labels'] = [] - G.graph['node_attrs'] = ['x', 'y'] - G.graph['edge_attrs'] = [] - elif ds_name == 'Letter-med': # node non-symb - dataset = 'cpp_ext/data/collections/Letter.xml' - graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/MED/' - Gn, y_all = loadDataset(dataset, extra_params=graph_dir) - for G in Gn: - reform_attributes(G, na_names=['x', 'y']) - G.graph['node_labels'] = [] - G.graph['edge_labels'] = [] - G.graph['node_attrs'] = ['x', 'y'] - G.graph['edge_attrs'] = [] - elif ds_name == 'Letter-low': # node non-symb - dataset = 'cpp_ext/data/collections/Letter.xml' - graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/LOW/' - Gn, y_all = loadDataset(dataset, extra_params=graph_dir) - for G in Gn: - reform_attributes(G, na_names=['x', 'y']) - G.graph['node_labels'] = [] - G.graph['edge_labels'] = [] - G.graph['node_attrs'] = ['x', 'y'] - G.graph['edge_attrs'] = [] - elif ds_name == 'Fingerprint': -# dataset = 'cpp_ext/data/collections/Fingerprint.xml' -# graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/Fingerprint/node_attrs/' -# Gn, y_all = loadDataset(dataset, extra_params=graph_dir) -# for G in Gn: -# reform_attributes(G) - dataset = '../../datasets/Fingerprint/Fingerprint_A.txt' - graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/Fingerprint/node_attrs/' - Gn, y_all = loadDataset(dataset) - elif ds_name == 'SYNTHETIC': - pass - elif ds_name == 'SYNTHETICnew': - dataset = '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt' - graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/SYNTHETICnew' -# dataset = '../../datasets/Letter-high/Letter-high_A.txt' -# graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/' - Gn, y_all = loadDataset(dataset) - elif ds_name == 'Synthie': 
- pass - elif ds_name == 'COIL-DEL': - dataset = '../../datasets/COIL-DEL/COIL-DEL_A.txt' - graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/COIL-DEL/' - Gn, y_all = loadDataset(dataset) - elif ds_name == 'COIL-RAG': - pass - elif ds_name == 'COLORS-3': - pass - elif ds_name == 'FRANKENSTEIN': - pass - - return Gn, y_all, graph_dir - - -def init_output_file(ds_name, gkernel, fit_method, dir_output): -# fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' - fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv' - f_detail = open(dir_output + fn_output_detail, 'a') - csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'edit cost', - 'GED method', 'attr distance', 'fit method', 'k', - 'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', - 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', - 'dis_k gi -> GM', 'fitting time', 'generating time', 'total time', - 'median set']) - f_detail.close() - -# fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' - fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.csv' - f_summary = open(dir_output + fn_output_summary, 'a') - csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'edit cost', - 'GED method', 'attr distance', 'fit method', 'k', - 'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', - 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', - 'dis_k gi -> GM', 'fitting time', 'generating time', 'total time', - '# SOD SM -> GM', '# dis_k SM -> GM', - '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM', - 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM', - 'repeats better dis_k gi -> GM']) - f_summary.close() - - return fn_output_detail, fn_output_summary - - -def xp_fit_method_for_non_symbolic(parameters, save_results=True, initial_solutions=1, - Gn_data=None, k_dis_data=None, Kmatrix=None, - is_separate=False): - - # 1. set parameters. - print('1. setting parameters...') - ds_name = parameters['ds_name'] - gkernel = parameters['gkernel'] - edit_cost_name = parameters['edit_cost_name'] - ged_method = parameters['ged_method'] - attr_distance = parameters['attr_distance'] - fit_method = parameters['fit_method'] - init_ecc = parameters['init_ecc'] - - node_label = None - edge_label = None - dir_output = 'results/xp_fit_method/' - - - # 2. get dataset. - print('2. getting dataset...') - if Gn_data is None: - Gn, y_all, graph_dir = get_dataset(ds_name) - else: - Gn = Gn_data[0] - y_all = Gn_data[1] - graph_dir = Gn_data[2] - - - # 3. compute kernel distance matrix. - print('3. computing kernel distance matrix...') - if k_dis_data is None: - dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, - None, Kmatrix=Kmatrix, gkernel=gkernel) - else: -# dis_mat = k_dis_data[0] -# dis_max = k_dis_data[1] -# dis_min = k_dis_data[2] -# dis_mean = k_dis_data[3] -# print('pair distances - dis_max, dis_min, dis_mean:', dis_max, dis_min, dis_mean) - pass - - - if save_results: - # create result files. - print('creating output files...') - fn_output_detail, fn_output_summary = init_output_file(ds_name, gkernel, - fit_method, dir_output) - - - # start repeats. - repeats = 1 -# k_list = range(2, 11) - k_list = [0] - # get indices by classes. 
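-    # (get_same_item_indices maps each target value y to the list of indices
-    # of the graphs carrying that target, so medians are generated and
-    # evaluated per class.)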
- y_idx = get_same_item_indices(y_all) - random.seed(1) - rdn_seed_list = random.sample(range(0, repeats * 100), repeats) - - for k in k_list: -# print('\n--------- k =', k, '----------') - - sod_sm_mean_list = [] - sod_gm_mean_list = [] - dis_k_sm_mean_list = [] - dis_k_gm_mean_list = [] - dis_k_gi_min_mean_list = [] - time_fitting_mean_list = [] - time_generating_mean_list = [] - time_total_mean_list = [] - - # 3. start generating and computing over targets. - print('4. starting generating and computing over targets......') - for i, (y, values) in enumerate(y_idx.items()): -# y = 'I' -# values = y_idx[y] -# values = values[0:10] - print('\ny =', y) -# if y.strip() == 'A': -# continue - - k = len(values) - print('\n--------- k =', k, '----------') - - if k < 2: - print('\nk = ', k, ', skip.\n') - continue - - sod_sm_list = [] - sod_gm_list = [] - dis_k_sm_list = [] - dis_k_gm_list = [] - dis_k_gi_min_list = [] - time_fitting_list = [] - time_generating_list = [] - time_total_list = [] - nb_sod_sm2gm = [0, 0, 0] - nb_dis_k_sm2gm = [0, 0, 0] - nb_dis_k_gi2sm = [0, 0, 0] - nb_dis_k_gi2gm = [0, 0, 0] - repeats_better_sod_sm2gm = [] - repeats_better_dis_k_sm2gm = [] - repeats_better_dis_k_gi2sm = [] - repeats_better_dis_k_gi2gm = [] - - # get Gram matrix for this part of data. - if Kmatrix is not None: - if is_separate: - Kmatrix_sub = Kmatrix[i].copy() - else: - Kmatrix_sub = Kmatrix[values,:] - Kmatrix_sub = Kmatrix_sub[:,values] - else: - Kmatrix_sub = None - - for repeat in range(repeats): - print('\nrepeat =', repeat) - random.seed(rdn_seed_list[repeat]) - median_set_idx_idx = random.sample(range(0, len(values)), k) - median_set_idx = [values[idx] for idx in median_set_idx_idx] - print('median set: ', median_set_idx) - Gn_median = [Gn[g] for g in values] -# from notebooks.utils.plot_all_graphs import draw_Fingerprint_graph -# for Gn in Gn_median: -# draw_Fingerprint_graph(Gn, save=None) - - # GENERATING & COMPUTING!! - res_sods, res_dis_ks, res_times = median_on_k_closest_graphs(Gn_median, - node_label, edge_label, - gkernel, k, fit_method=fit_method, graph_dir=graph_dir, - edit_cost_constants=None, group_min=median_set_idx_idx, - dataset=ds_name, initial_solutions=initial_solutions, - edit_cost_name=edit_cost_name, init_ecc=init_ecc, - Kmatrix=Kmatrix_sub, parallel=False) - sod_sm = res_sods[0] - sod_gm = res_sods[1] - dis_k_sm = res_dis_ks[0] - dis_k_gm = res_dis_ks[1] - dis_k_gi = res_dis_ks[2] - dis_k_gi_min = res_dis_ks[3] - idx_dis_k_gi_min = res_dis_ks[4] - time_fitting = res_times[0] - time_generating = res_times[1] - - # write result detail. - sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm)) - dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm)) - dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min)) - dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min)) - if save_results: - f_detail = open(dir_output + fn_output_detail, 'a') - csv.writer(f_detail).writerow([ds_name, gkernel, - edit_cost_name, ged_method, attr_distance, - fit_method, k, y, repeat, - sod_sm, sod_gm, dis_k_sm, dis_k_gm, - dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm, - dis_k_gi2gm, time_fitting, time_generating, - time_fitting + time_generating, median_set_idx]) - f_detail.close() - - # compute result summary. 
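-                # (each nb_* counter below is a 3-element list recording in how
-                # many repeats the first quantity was greater than, equal to, or
-                # smaller than the second, e.g.
-                # nb_sod_sm2gm = [#(SOD SM > SOD GM), #(equal), #(SOD SM < SOD GM)].)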
- sod_sm_list.append(sod_sm) - sod_gm_list.append(sod_gm) - dis_k_sm_list.append(dis_k_sm) - dis_k_gm_list.append(dis_k_gm) - dis_k_gi_min_list.append(dis_k_gi_min) - time_fitting_list.append(time_fitting) - time_generating_list.append(time_generating) - time_total_list.append(time_fitting + time_generating) - # # SOD SM -> GM - if sod_sm > sod_gm: - nb_sod_sm2gm[0] += 1 - repeats_better_sod_sm2gm.append(repeat) - elif sod_sm == sod_gm: - nb_sod_sm2gm[1] += 1 - elif sod_sm < sod_gm: - nb_sod_sm2gm[2] += 1 - # # dis_k SM -> GM - if dis_k_sm > dis_k_gm: - nb_dis_k_sm2gm[0] += 1 - repeats_better_dis_k_sm2gm.append(repeat) - elif dis_k_sm == dis_k_gm: - nb_dis_k_sm2gm[1] += 1 - elif dis_k_sm < dis_k_gm: - nb_dis_k_sm2gm[2] += 1 - # # dis_k gi -> SM - if dis_k_gi_min > dis_k_sm: - nb_dis_k_gi2sm[0] += 1 - repeats_better_dis_k_gi2sm.append(repeat) - elif dis_k_gi_min == dis_k_sm: - nb_dis_k_gi2sm[1] += 1 - elif dis_k_gi_min < dis_k_sm: - nb_dis_k_gi2sm[2] += 1 - # # dis_k gi -> GM - if dis_k_gi_min > dis_k_gm: - nb_dis_k_gi2gm[0] += 1 - repeats_better_dis_k_gi2gm.append(repeat) - elif dis_k_gi_min == dis_k_gm: - nb_dis_k_gi2gm[1] += 1 - elif dis_k_gi_min < dis_k_gm: - nb_dis_k_gi2gm[2] += 1 - - # save median graphs. - fname_sm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/set_median.gxl' - fn_pre_sm_new = dir_output + 'medians/set_median.' + fit_method \ - + '.k' + str(int(k)) + '.y' + str(y) + '.repeat' + str(repeat) - copyfile(fname_sm, fn_pre_sm_new + '.gxl') - fname_gm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/gen_median.gxl' - fn_pre_gm_new = dir_output + 'medians/gen_median.' + fit_method \ - + '.k' + str(int(k)) + '.y' + str(y) + '.repeat' + str(repeat) - copyfile(fname_gm, fn_pre_gm_new + '.gxl') - G_best_kernel = Gn_median[idx_dis_k_gi_min].copy() -# reform_attributes(G_best_kernel) - fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.' + fit_method \ - + '.k' + str(int(k)) + '.y' + str(y) + '.repeat' + str(repeat) - saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='default') - - # plot median graphs. - if ds_name == 'Letter-high' or ds_name == 'Letter-med' or ds_name == 'Letter-low': - set_median = loadGXL(fn_pre_sm_new + '.gxl') - gen_median = loadGXL(fn_pre_gm_new + '.gxl') - draw_Letter_graph(set_median, fn_pre_sm_new) - draw_Letter_graph(gen_median, fn_pre_gm_new) - draw_Letter_graph(G_best_kernel, fn_pre_g_best_kernel) - - # write result summary for each letter. 
- sod_sm_mean_list.append(np.mean(sod_sm_list))
- sod_gm_mean_list.append(np.mean(sod_gm_list))
- dis_k_sm_mean_list.append(np.mean(dis_k_sm_list))
- dis_k_gm_mean_list.append(np.mean(dis_k_gm_list))
- dis_k_gi_min_mean_list.append(np.mean(dis_k_gi_min_list))
- time_fitting_mean_list.append(np.mean(time_fitting_list))
- time_generating_mean_list.append(np.mean(time_generating_list))
- time_total_mean_list.append(np.mean(time_total_list))
- sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean_list[-1] - sod_sm_mean_list[-1]))
- dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1]))
- dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
- dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
- if save_results:
- f_summary = open(dir_output + fn_output_summary, 'a')
- csv.writer(f_summary).writerow([ds_name, gkernel,
- edit_cost_name, ged_method, attr_distance,
- fit_method, k, y,
- sod_sm_mean_list[-1], sod_gm_mean_list[-1],
- dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1],
- dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean,
- dis_k_gi2sm_mean, dis_k_gi2gm_mean,
- time_fitting_mean_list[-1], time_generating_mean_list[-1],
- time_total_mean_list[-1], nb_sod_sm2gm,
- nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
- repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
- repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])
- f_summary.close()
-
-
- # write overall result summary (means over all classes).
- sod_sm_mean = np.mean(sod_sm_mean_list)
- sod_gm_mean = np.mean(sod_gm_mean_list)
- dis_k_sm_mean = np.mean(dis_k_sm_mean_list)
- dis_k_gm_mean = np.mean(dis_k_gm_mean_list)
- dis_k_gi_min_mean = np.mean(dis_k_gi_min_mean_list)
- time_fitting_mean = np.mean(time_fitting_mean_list)
- time_generating_mean = np.mean(time_generating_mean_list)
- time_total_mean = np.mean(time_total_mean_list)
- sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
- dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
- dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
- dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
- if save_results:
- f_summary = open(dir_output + fn_output_summary, 'a')
- csv.writer(f_summary).writerow([ds_name, gkernel,
- edit_cost_name, ged_method, attr_distance,
- fit_method, k, 'all',
- sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
- dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
- dis_k_gi2sm_mean, dis_k_gi2gm_mean,
- time_fitting_mean, time_generating_mean, time_total_mean])
- f_summary.close()
-
- print('\ncomplete.')
-
-
-# Draw the current median graph.
-def draw_Letter_graph(graph, file_prefix):
- plt.figure()
- pos = {}
- for n in graph.nodes:
- pos[n] = np.array([float(graph.node[n]['x']),float(graph.node[n]['y'])])
- nx.draw_networkx(graph, pos)
- plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
-# plt.show()
- plt.clf()
-
-
-def compute_gm_for_each_class(Gn, y_all, gkernel, parallel='imap_unordered', is_separate=True):
-
- if is_separate:
- print('the Gram matrix is computed for each class.')
- y_idx = get_same_item_indices(y_all)
- Kmatrix = []
- run_time = []
- k_dis_data = []
- for i, (y, values) in enumerate(y_idx.items()):
- print('The ', str(i), ' class:')
- Gn_i = [Gn[val] for val in values]
- time0 = time.time()
- Kmatrix.append(compute_kernel(Gn_i, gkernel, None, None, True, parallel=parallel))
- run_time.append(time.time() - time0)
-
k_dis_data.append(kernel_distance_matrix(Gn_i, None, None, - Kmatrix=Kmatrix[i], gkernel=gkernel, verbose=True)) - np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm', - Kmatrix=Kmatrix, run_time=run_time, is_separate=is_separate) - dis_max = np.max([item[1] for item in k_dis_data]) - dis_min = np.min([item[2] for item in k_dis_data]) - dis_mean = np.mean([item[3] for item in k_dis_data]) - print('pair distances - dis_max, dis_min, dis_mean:', dis_max, dis_min, - dis_mean) - - else: - time0 = time.time() - Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel=parallel) - run_time = time.time() - time0 - np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm', - Kmatrix=Kmatrix, run_time=run_time, is_separate=is_separate) - k_dis_data = kernel_distance_matrix(Gn, None, None, - Kmatrix=Kmatrix, gkernel=gkernel, verbose=True) - print('the Gram matrix is computed for the whole dataset.') - print('pair distances - dis_max, dis_min, dis_mean:', k_dis_data[1], - k_dis_data[2], k_dis_data[3]) - - print('\nTime to compute Gram matrix for the whole dataset: ', run_time) -# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean] - return Kmatrix, run_time, k_dis_data - - -if __name__ == "__main__": -# #### xp 1: Letter-high, spkernel. -# # load dataset. -# print('getting dataset and computing kernel distance matrix first...') -# ds_name = 'Letter-high' -# gkernel = 'spkernel' -# Gn, y_all, graph_dir = get_dataset(ds_name) -# # remove graphs without edges. -# Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0] -# idx = [G[0] for G in Gn] -# Gn = [G[1] for G in Gn] -# y_all = [y_all[i] for i in idx] -## Gn = Gn[0:50] -## y_all = y_all[0:50] -# # compute pair distances. -# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None, -# Kmatrix=None, gkernel=gkernel, verbose=True) -## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0 -# # fitting and computing. -# fit_methods = ['random', 'expert', 'k-graphs'] -# for fit_method in fit_methods: -# print('\n-------------------------------------') -# print('fit method:', fit_method) -# parameters = {'ds_name': ds_name, -# 'gkernel': gkernel, -# 'edit_cost_name': 'LETTER2', -# 'ged_method': 'mIPFP', -# 'attr_distance': 'euclidean', -# 'fit_method': fit_method} -# xp_fit_method_for_non_symbolic(parameters, save_results=True, -# initial_solutions=40, -# Gn_data = [Gn, y_all, graph_dir], -# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean]) - - -# #### xp 2: Letter-high, sspkernel. -# # load dataset. -# print('getting dataset and computing kernel distance matrix first...') -# ds_name = 'Letter-high' -# gkernel = 'structuralspkernel' -# Gn, y_all, graph_dir = get_dataset(ds_name) -## Gn = Gn[0:50] -## y_all = y_all[0:50] -# # compute pair distances. -# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None, -# Kmatrix=None, gkernel=gkernel, verbose=True) -## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0 -# # fitting and computing. 
-# fit_methods = ['random', 'expert', 'k-graphs'] -# for fit_method in fit_methods: -# print('\n-------------------------------------') -# print('fit method:', fit_method) -# parameters = {'ds_name': ds_name, -# 'gkernel': gkernel, -# 'edit_cost_name': 'LETTER2', -# 'ged_method': 'mIPFP', -# 'attr_distance': 'euclidean', -# 'fit_method': fit_method} -# print('parameters: ', parameters) -# xp_fit_method_for_non_symbolic(parameters, save_results=True, -# initial_solutions=40, -# Gn_data = [Gn, y_all, graph_dir], -# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean]) - - -# #### xp 3: SYNTHETICnew, sspkernel, using NON_SYMBOLIC. -# gmfile = np.load('results/xp_fit_method/Kmatrix.SYNTHETICnew.structuralspkernel.gm.npz') -# Kmatrix = gmfile['Kmatrix'] -# run_time = gmfile['run_time'] -# # normalization -# Kmatrix_diag = Kmatrix.diagonal().copy() -# for i in range(len(Kmatrix)): -# for j in range(i, len(Kmatrix)): -# Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) -# Kmatrix[j][i] = Kmatrix[i][j] -## np.savez('results/xp_fit_method/Kmatrix.SYNTHETICnew.spkernel.gm', -## Kmatrix=Kmatrix, run_time=run_time) -# # load dataset. -# print('getting dataset and computing kernel distance matrix first...') -# ds_name = 'SYNTHETICnew' -# gkernel = 'structuralspkernel' -# Gn, y_all, graph_dir = get_dataset(ds_name) -# # remove graphs without nodes and edges. -# Gn = [(idx, G) for idx, G in enumerate(Gn) if (nx.number_of_nodes(G) != 0 -# and nx.number_of_edges(G) != 0)] -# idx = [G[0] for G in Gn] -# Gn = [G[1] for G in Gn] -# y_all = [y_all[i] for i in idx] -## Gn = Gn[0:10] -## y_all = y_all[0:10] -# for G in Gn: -# G.graph['filename'] = 'graph' + str(G.graph['name']) + '.gxl' -# # compute pair distances. -# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None, -# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True) -## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0 -# # fitting and computing. -# fit_methods = ['k-graphs', 'random', 'random', 'random'] -# for fit_method in fit_methods: -# print('\n-------------------------------------') -# print('fit method:', fit_method) -# parameters = {'ds_name': ds_name, -# 'gkernel': gkernel, -# 'edit_cost_name': 'NON_SYMBOLIC', -# 'ged_method': 'mIPFP', -# 'attr_distance': 'euclidean', -# 'fit_method': fit_method} -# xp_fit_method_for_non_symbolic(parameters, save_results=True, -# initial_solutions=1, -# Gn_data = [Gn, y_all, graph_dir], -# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean], -# Kmatrix=Kmatrix) - - -# ### xp 4: SYNTHETICnew, spkernel, using NON_SYMBOLIC. -# gmfile = np.load('results/xp_fit_method/Kmatrix.SYNTHETICnew.spkernel.gm.npz') -# Kmatrix = gmfile['Kmatrix'] -# # normalization -# Kmatrix_diag = Kmatrix.diagonal().copy() -# for i in range(len(Kmatrix)): -# for j in range(i, len(Kmatrix)): -# Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) -# Kmatrix[j][i] = Kmatrix[i][j] -# run_time = 21821.35 -# np.savez('results/xp_fit_method/Kmatrix.SYNTHETICnew.spkernel.gm', -# Kmatrix=Kmatrix, run_time=run_time) -# -# # load dataset. -# print('getting dataset and computing kernel distance matrix first...') -# ds_name = 'SYNTHETICnew' -# gkernel = 'spkernel' -# Gn, y_all, graph_dir = get_dataset(ds_name) -## # remove graphs without nodes and edges. 
-## Gn = [(idx, G) for idx, G in enumerate(Gn) if (nx.number_of_node(G) != 0 -## and nx.number_of_edges(G) != 0)] -## idx = [G[0] for G in Gn] -## Gn = [G[1] for G in Gn] -## y_all = [y_all[i] for i in idx] -## Gn = Gn[0:5] -## y_all = y_all[0:5] -# for G in Gn: -# G.graph['filename'] = 'graph' + str(G.graph['name']) + '.gxl' -# -# # compute/read Gram matrix and pair distances. -## Kmatrix = compute_kernel(Gn, gkernel, None, None, True) -## np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm', -## Kmatrix=Kmatrix) -# gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz') -# Kmatrix = gmfile['Kmatrix'] -# run_time = gmfile['run_time'] -## Kmatrix = Kmatrix[[0,1,2,3,4],:] -## Kmatrix = Kmatrix[:,[0,1,2,3,4]] -# print('\nTime to compute Gram matrix for the whole dataset: ', run_time) -# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None, -# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True) -## Kmatrix = np.zeros((len(Gn), len(Gn))) -## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0 -# -# # fitting and computing. -# fit_methods = ['k-graphs', 'random', 'random', 'random'] -# for fit_method in fit_methods: -# print('\n-------------------------------------') -# print('fit method:', fit_method) -# parameters = {'ds_name': ds_name, -# 'gkernel': gkernel, -# 'edit_cost_name': 'NON_SYMBOLIC', -# 'ged_method': 'mIPFP', -# 'attr_distance': 'euclidean', -# 'fit_method': fit_method} -# xp_fit_method_for_non_symbolic(parameters, save_results=True, -# initial_solutions=1, -# Gn_data=[Gn, y_all, graph_dir], -# k_dis_data=[dis_mat, dis_max, dis_min, dis_mean], -# Kmatrix=Kmatrix) - - -# #### xp 5: Fingerprint, sspkernel, using LETTER2, only node attrs. -# # load dataset. -# print('getting dataset and computing kernel distance matrix first...') -# ds_name = 'Fingerprint' -# gkernel = 'structuralspkernel' -# Gn, y_all, graph_dir = get_dataset(ds_name) -# # remove graphs without nodes and edges. -# Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_nodes(G) != 0] -## and nx.number_of_edges(G) != 0)] -# idx = [G[0] for G in Gn] -# Gn = [G[1] for G in Gn] -# y_all = [y_all[i] for i in idx] -# y_idx = get_same_item_indices(y_all) -# # remove unused labels. -# for G in Gn: -# G.graph['edge_attrs'] = [] -# for edge in G.edges: -# del G.edges[edge]['attributes'] -# del G.edges[edge]['orient'] -# del G.edges[edge]['angle'] -## Gn = Gn[805:815] -## y_all = y_all[805:815] -# for G in Gn: -# G.graph['filename'] = 'graph' + str(G.graph['name']) + '.gxl' -# -# # compute/read Gram matrix and pair distances. -## Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered') -## np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm', -## Kmatrix=Kmatrix) -# gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz') -# Kmatrix = gmfile['Kmatrix'] -## run_time = gmfile['run_time'] -## Kmatrix = Kmatrix[[0,1,2,3,4],:] -## Kmatrix = Kmatrix[:,[0,1,2,3,4]] -## print('\nTime to compute Gram matrix for the whole dataset: ', run_time) -# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None, -# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True) -## Kmatrix = np.zeros((len(Gn), len(Gn))) -## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0 -# -# # fitting and computing. 
-# fit_methods = ['k-graphs', 'random', 'random', 'random'] -# for fit_method in fit_methods: -# print('\n-------------------------------------') -# print('fit method:', fit_method) -# parameters = {'ds_name': ds_name, -# 'gkernel': gkernel, -# 'edit_cost_name': 'LETTER2', -# 'ged_method': 'mIPFP', -# 'attr_distance': 'euclidean', -# 'fit_method': fit_method, -# 'init_ecc': [1,1,1,1,1]} # [0.525, 0.525, 0.001, 0.125, 0.125]} -# xp_fit_method_for_non_symbolic(parameters, save_results=True, -# initial_solutions=40, -# Gn_data = [Gn, y_all, graph_dir], -# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean], -# Kmatrix=Kmatrix) - - -# #### xp 6: Letter-med, sspkernel. -# # load dataset. -# print('getting dataset and computing kernel distance matrix first...') -# ds_name = 'Letter-med' -# gkernel = 'structuralspkernel' -# Gn, y_all, graph_dir = get_dataset(ds_name) -## Gn = Gn[0:50] -## y_all = y_all[0:50] -# -# # compute/read Gram matrix and pair distances. -# Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered') -# np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm', -# Kmatrix=Kmatrix) -## gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz') -## Kmatrix = gmfile['Kmatrix'] -## run_time = gmfile['run_time'] -## Kmatrix = Kmatrix[[0,1,2,3,4],:] -## Kmatrix = Kmatrix[:,[0,1,2,3,4]] -## print('\nTime to compute Gram matrix for the whole dataset: ', run_time) -# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None, -# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True) -## Kmatrix = np.zeros((len(Gn), len(Gn))) -## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0 -# -# # fitting and computing. -# fit_methods = ['k-graphs', 'expert', 'random', 'random', 'random'] -# for fit_method in fit_methods: -# print('\n-------------------------------------') -# print('fit method:', fit_method) -# parameters = {'ds_name': ds_name, -# 'gkernel': gkernel, -# 'edit_cost_name': 'LETTER2', -# 'ged_method': 'mIPFP', -# 'attr_distance': 'euclidean', -# 'fit_method': fit_method, -# 'init_ecc': [0.525, 0.525, 0.75, 0.475, 0.475]} -# print('parameters: ', parameters) -# xp_fit_method_for_non_symbolic(parameters, save_results=True, -# initial_solutions=40, -# Gn_data = [Gn, y_all, graph_dir], -# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean], -# Kmatrix=Kmatrix) - - -# #### xp 7: Letter-low, sspkernel. -# # load dataset. -# print('getting dataset and computing kernel distance matrix first...') -# ds_name = 'Letter-low' -# gkernel = 'structuralspkernel' -# Gn, y_all, graph_dir = get_dataset(ds_name) -## Gn = Gn[0:50] -## y_all = y_all[0:50] -# -# # compute/read Gram matrix and pair distances. -# Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered') -# np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm', -# Kmatrix=Kmatrix) -## gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz') -## Kmatrix = gmfile['Kmatrix'] -## run_time = gmfile['run_time'] -## Kmatrix = Kmatrix[[0,1,2,3,4],:] -## Kmatrix = Kmatrix[:,[0,1,2,3,4]] -## print('\nTime to compute Gram matrix for the whole dataset: ', run_time) -# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None, -# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True) -## Kmatrix = np.zeros((len(Gn), len(Gn))) -## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0 -# -# # fitting and computing. 
-# fit_methods = ['k-graphs', 'expert', 'random', 'random', 'random'] -# for fit_method in fit_methods: -# print('\n-------------------------------------') -# print('fit method:', fit_method) -# parameters = {'ds_name': ds_name, -# 'gkernel': gkernel, -# 'edit_cost_name': 'LETTER2', -# 'ged_method': 'mIPFP', -# 'attr_distance': 'euclidean', -# 'fit_method': fit_method, -# 'init_ecc': [0.075, 0.075, 0.25, 0.075, 0.075]} -# print('parameters: ', parameters) -# xp_fit_method_for_non_symbolic(parameters, save_results=True, -# initial_solutions=40, -# Gn_data = [Gn, y_all, graph_dir], -# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean], -# Kmatrix=Kmatrix) - - -# #### xp 8: Letter-med, spkernel. -# # load dataset. -# print('getting dataset and computing kernel distance matrix first...') -# ds_name = 'Letter-med' -# gkernel = 'spkernel' -# Gn, y_all, graph_dir = get_dataset(ds_name) -# # remove graphs without nodes and edges. -# Gn = [(idx, G) for idx, G in enumerate(Gn) if (nx.number_of_nodes(G) != 0 -# and nx.number_of_edges(G) != 0)] -# idx = [G[0] for G in Gn] -# Gn = [G[1] for G in Gn] -# y_all = [y_all[i] for i in idx] -## Gn = Gn[0:50] -## y_all = y_all[0:50] -# -# # compute/read Gram matrix and pair distances. -# Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered') -# np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm', -# Kmatrix=Kmatrix) -## gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz') -## Kmatrix = gmfile['Kmatrix'] -## run_time = gmfile['run_time'] -## Kmatrix = Kmatrix[[0,1,2,3,4],:] -## Kmatrix = Kmatrix[:,[0,1,2,3,4]] -## print('\nTime to compute Gram matrix for the whole dataset: ', run_time) -# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None, -# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True) -## Kmatrix = np.zeros((len(Gn), len(Gn))) -## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0 -# -# # fitting and computing. -# fit_methods = ['k-graphs', 'expert', 'random', 'random', 'random'] -# for fit_method in fit_methods: -# print('\n-------------------------------------') -# print('fit method:', fit_method) -# parameters = {'ds_name': ds_name, -# 'gkernel': gkernel, -# 'edit_cost_name': 'LETTER2', -# 'ged_method': 'mIPFP', -# 'attr_distance': 'euclidean', -# 'fit_method': fit_method, -# 'init_ecc': [0.525, 0.525, 0.75, 0.475, 0.475]} -# print('parameters: ', parameters) -# xp_fit_method_for_non_symbolic(parameters, save_results=True, -# initial_solutions=40, -# Gn_data = [Gn, y_all, graph_dir], -# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean], -# Kmatrix=Kmatrix) - - -# #### xp 9: Letter-low, spkernel. -# # load dataset. -# print('getting dataset and computing kernel distance matrix first...') -# ds_name = 'Letter-low' -# gkernel = 'spkernel' -# Gn, y_all, graph_dir = get_dataset(ds_name) -# # remove graphs without nodes and edges. -# Gn = [(idx, G) for idx, G in enumerate(Gn) if (nx.number_of_nodes(G) != 0 -# and nx.number_of_edges(G) != 0)] -# idx = [G[0] for G in Gn] -# Gn = [G[1] for G in Gn] -# y_all = [y_all[i] for i in idx] -## Gn = Gn[0:50] -## y_all = y_all[0:50] -# -# # compute/read Gram matrix and pair distances. -# Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered') -# np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm', -# Kmatrix=Kmatrix) -## gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' 
+ gkernel + '.gm.npz') -## Kmatrix = gmfile['Kmatrix'] -## run_time = gmfile['run_time'] -## Kmatrix = Kmatrix[[0,1,2,3,4],:] -## Kmatrix = Kmatrix[:,[0,1,2,3,4]] -## print('\nTime to compute Gram matrix for the whole dataset: ', run_time) -# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None, -# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True) -## Kmatrix = np.zeros((len(Gn), len(Gn))) -## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0 -# -# # fitting and computing. -# fit_methods = ['k-graphs', 'expert', 'random', 'random', 'random'] -# for fit_method in fit_methods: -# print('\n-------------------------------------') -# print('fit method:', fit_method) -# parameters = {'ds_name': ds_name, -# 'gkernel': gkernel, -# 'edit_cost_name': 'LETTER2', -# 'ged_method': 'mIPFP', -# 'attr_distance': 'euclidean', -# 'fit_method': fit_method, -# 'init_ecc': [0.075, 0.075, 0.25, 0.075, 0.075]} -# print('parameters: ', parameters) -# xp_fit_method_for_non_symbolic(parameters, save_results=True, -# initial_solutions=40, -# Gn_data = [Gn, y_all, graph_dir], -# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean], -# Kmatrix=Kmatrix) - - - #### xp 5: COIL-DEL, sspkernel, using LETTER2, only node attrs. - # load dataset. - print('getting dataset and computing kernel distance matrix first...') - ds_name = 'COIL-DEL' - gkernel = 'structuralspkernel' - Gn, y_all, graph_dir = get_dataset(ds_name) - # remove graphs without nodes and edges. - Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_nodes(G) != 0] -# and nx.number_of_edges(G) != 0)] - idx = [G[0] for G in Gn] - Gn = [G[1] for G in Gn] - y_all = [y_all[i] for i in idx] - # remove unused labels. - for G in Gn: - G.graph['edge_labels'] = [] - for edge in G.edges: - del G.edges[edge]['bond_type'] - del G.edges[edge]['valence'] -# Gn = Gn[805:815] -# y_all = y_all[805:815] - for G in Gn: - G.graph['filename'] = 'graph' + str(G.graph['name']) + '.gxl' - - # compute/read Gram matrix and pair distances. - is_separate = True - Kmatrix, run_time, k_dis_data = compute_gm_for_each_class(Gn, - y_all, - gkernel, - parallel='imap_unordered', - is_separate=is_separate) -# Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered') -# np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm', -# Kmatrix=Kmatrix) -# gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz') -# Kmatrix = gmfile['Kmatrix'] -# run_time = gmfile['run_time'] -# Kmatrix = Kmatrix[[0,1,2,3,4],:] -# Kmatrix = Kmatrix[:,[0,1,2,3,4]] -# print('\nTime to compute Gram matrix for the whole dataset: ', run_time) -# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None, -# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True) -# Kmatrix = np.zeros((len(Gn), len(Gn))) -# dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0 - - # fitting and computing. 
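-    # ('random' is listed three times so that the randomly initialized edit
-    # costs are refitted and evaluated over several independent draws.)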
- fit_methods = ['k-graphs', 'random', 'random', 'random']
- for fit_method in fit_methods:
- print('\n-------------------------------------')
- print('fit method:', fit_method)
- parameters = {'ds_name': ds_name,
- 'gkernel': gkernel,
- 'edit_cost_name': 'LETTER2',
- 'ged_method': 'mIPFP',
- 'attr_distance': 'euclidean',
- 'fit_method': fit_method,
- 'init_ecc': [3,3,1,3,3]} # [0.525, 0.525, 0.001, 0.125, 0.125]}
- xp_fit_method_for_non_symbolic(parameters, save_results=True,
- initial_solutions=40,
- Gn_data=[Gn, y_all, graph_dir],
- k_dis_data=k_dis_data,
- Kmatrix=Kmatrix,
- is_separate=is_separate)
\ No newline at end of file
diff --git a/gklearn/preimage/xp_letter_h.py b/gklearn/preimage/xp_letter_h.py
deleted file mode 100644
index 1e16fcf..0000000
--- a/gklearn/preimage/xp_letter_h.py
+++ /dev/null
@@ -1,476 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Tue Jan 14 15:39:29 2020

-@author: ljia
-"""
-import numpy as np
-import random
-import csv
-import os
-from shutil import copyfile
-import networkx as nx
-import matplotlib.pyplot as plt
-
-from gklearn.utils.graphfiles import loadDataset, loadGXL, saveGXL
-from gklearn.preimage.test_k_closest_graphs import median_on_k_closest_graphs, reform_attributes
-from gklearn.preimage.utils import get_same_item_indices, kernel_distance_matrix
-from gklearn.preimage.find_best_k import getRelations
-
-
-def xp_letter_h_LETTER2_cost():
- ds = {'dataset': 'cpp_ext/data/collections/Letter.xml',
- 'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir'])
-
- dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None, Kmatrix=None, gkernel='structuralspkernel')
- for G in Gn:
- reform_attributes(G)
-# ds = {'name': 'Letter-high',
-# 'dataset': '../datasets/Letter-high/Letter-high_A.txt'} # node/edge symb
-# Gn, y_all = loadDataset(ds['dataset'])
-# Gn = Gn[0:50]
- gkernel = 'structuralspkernel'
- node_label = None
- edge_label = None
- ds_name = 'letter-h'
- dir_output = 'results/xp_letter_h/'
- save_results = True
- cost = 'LETTER2'
-
- repeats = 1
-# k_list = range(2, 11)
- k_list = [150]
- fit_method = 'k-graphs'
- # get indices by classes.
- y_idx = get_same_item_indices(y_all)
-
- if save_results:
- # create result files.
- fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
- f_detail = open(dir_output + fn_output_detail, 'a')
- csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k',
- 'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
- 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
- 'dis_k gi -> GM', 'median set'])
- f_detail.close()
- fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.'
+ fit_method + '.csv' - f_summary = open(dir_output + fn_output_summary, 'a') - csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k', - 'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', - 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', - 'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM', - '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM', - 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM', - 'repeats better dis_k gi -> GM']) - f_summary.close() - - random.seed(1) - rdn_seed_list = random.sample(range(0, repeats * 100), repeats) - - for k in k_list: - print('\n--------- k =', k, '----------') - - sod_sm_mean_list = [] - sod_gm_mean_list = [] - dis_k_sm_mean_list = [] - dis_k_gm_mean_list = [] - dis_k_gi_min_mean_list = [] -# nb_sod_sm2gm = [0, 0, 0] -# nb_dis_k_sm2gm = [0, 0, 0] -# nb_dis_k_gi2sm = [0, 0, 0] -# nb_dis_k_gi2gm = [0, 0, 0] -# repeats_better_sod_sm2gm = [] -# repeats_better_dis_k_sm2gm = [] -# repeats_better_dis_k_gi2sm = [] -# repeats_better_dis_k_gi2gm = [] - - for i, (y, values) in enumerate(y_idx.items()): - print('\ny =', y) -# y = 'F' -# values = y_idx[y] -# values = values[0:10] - - k = len(values) - - sod_sm_list = [] - sod_gm_list = [] - dis_k_sm_list = [] - dis_k_gm_list = [] - dis_k_gi_min_list = [] - nb_sod_sm2gm = [0, 0, 0] - nb_dis_k_sm2gm = [0, 0, 0] - nb_dis_k_gi2sm = [0, 0, 0] - nb_dis_k_gi2gm = [0, 0, 0] - repeats_better_sod_sm2gm = [] - repeats_better_dis_k_sm2gm = [] - repeats_better_dis_k_gi2sm = [] - repeats_better_dis_k_gi2gm = [] - - for repeat in range(repeats): - print('\nrepeat =', repeat) - random.seed(rdn_seed_list[repeat]) - median_set_idx_idx = random.sample(range(0, len(values)), k) - median_set_idx = [values[idx] for idx in median_set_idx_idx] - print('median set: ', median_set_idx) - Gn_median = [Gn[g] for g in values] - - sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min \ - = median_on_k_closest_graphs(Gn_median, node_label, edge_label, - gkernel, k, fit_method=fit_method, graph_dir=ds['graph_dir'], - edit_costs=None, group_min=median_set_idx_idx, - dataset='Letter', cost=cost, parallel=False) - - # write result detail. - sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm)) - dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm)) - dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min)) - dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min)) - if save_results: - f_detail = open(dir_output + fn_output_detail, 'a') - csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k, - y, repeat, - sod_sm, sod_gm, dis_k_sm, dis_k_gm, - dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm, - dis_k_gi2gm, median_set_idx]) - f_detail.close() - - # compute result summary. 
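[Note: each detail row above records qualitative comparisons through getRelations(np.sign(b - a)), imported from the likewise-deleted find_best_k.py. A plausible minimal equivalent, with the exact label strings being an assumption:]

    import numpy as np

    def get_relations(sign):
        # sign is np.sign(b - a): -1 means b improved on a, 0 a tie,
        # +1 means b is worse.
        if sign == -1:
            return 'better'
        elif sign == 0:
            return 'same'
        else:
            return 'worse'

    # e.g. sod_sm2gm = get_relations(np.sign(sod_gm - sod_sm))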
- sod_sm_list.append(sod_sm) - sod_gm_list.append(sod_gm) - dis_k_sm_list.append(dis_k_sm) - dis_k_gm_list.append(dis_k_gm) - dis_k_gi_min_list.append(dis_k_gi_min) - # # SOD SM -> GM - if sod_sm > sod_gm: - nb_sod_sm2gm[0] += 1 - repeats_better_sod_sm2gm.append(repeat) - elif sod_sm == sod_gm: - nb_sod_sm2gm[1] += 1 - elif sod_sm < sod_gm: - nb_sod_sm2gm[2] += 1 - # # dis_k SM -> GM - if dis_k_sm > dis_k_gm: - nb_dis_k_sm2gm[0] += 1 - repeats_better_dis_k_sm2gm.append(repeat) - elif dis_k_sm == dis_k_gm: - nb_dis_k_sm2gm[1] += 1 - elif dis_k_sm < dis_k_gm: - nb_dis_k_sm2gm[2] += 1 - # # dis_k gi -> SM - if dis_k_gi_min > dis_k_sm: - nb_dis_k_gi2sm[0] += 1 - repeats_better_dis_k_gi2sm.append(repeat) - elif dis_k_gi_min == dis_k_sm: - nb_dis_k_gi2sm[1] += 1 - elif dis_k_gi_min < dis_k_sm: - nb_dis_k_gi2sm[2] += 1 - # # dis_k gi -> GM - if dis_k_gi_min > dis_k_gm: - nb_dis_k_gi2gm[0] += 1 - repeats_better_dis_k_gi2gm.append(repeat) - elif dis_k_gi_min == dis_k_gm: - nb_dis_k_gi2gm[1] += 1 - elif dis_k_gi_min < dis_k_gm: - nb_dis_k_gi2gm[2] += 1 - - # save median graphs. - fname_sm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/set_median.gxl' - fn_pre_sm_new = dir_output + 'medians/set_median.' + fit_method \ - + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) - copyfile(fname_sm, fn_pre_sm_new + '.gxl') - fname_gm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/gen_median.gxl' - fn_pre_gm_new = dir_output + 'medians/gen_median.' + fit_method \ - + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) - copyfile(fname_gm, fn_pre_gm_new + '.gxl') - G_best_kernel = Gn_median[idx_dis_k_gi_min].copy() - reform_attributes(G_best_kernel) - fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.' + fit_method \ - + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) - saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='gedlib-letter') - - # plot median graphs. - set_median = loadGXL(fn_pre_sm_new + '.gxl') - gen_median = loadGXL(fn_pre_gm_new + '.gxl') - draw_Letter_graph(set_median, fn_pre_sm_new) - draw_Letter_graph(gen_median, fn_pre_gm_new) - draw_Letter_graph(G_best_kernel, fn_pre_g_best_kernel) - - # write result summary for each letter. - sod_sm_mean_list.append(np.mean(sod_sm_list)) - sod_gm_mean_list.append(np.mean(sod_gm_list)) - dis_k_sm_mean_list.append(np.mean(dis_k_sm_list)) - dis_k_gm_mean_list.append(np.mean(dis_k_gm_list)) - dis_k_gi_min_mean_list.append(np.mean(dis_k_gi_min_list)) - sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean_list[-1] - sod_sm_mean_list[-1])) - dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1])) - dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1])) - dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1])) - if save_results: - f_summary = open(dir_output + fn_output_summary, 'a') - csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, y, - sod_sm_mean_list[-1], sod_gm_mean_list[-1], - dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1], - dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean, - dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm, - nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm, - repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm, - repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm]) - f_summary.close() - - - # write result summary for each letter. 
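[Note: in the overall summary block that follows (and in its twins in xp_letter_h() and xp_monoterpenoides()), dis_k_gi_min_mean averages dis_k_gi_min_list, the per-repeat values of whichever target class happened to run last, while its four neighbours all average the *_mean_list accumulators. This looks like a copy-paste slip; the consistent line would be:]

    dis_k_gi_min_mean = np.mean(dis_k_gi_min_mean_list)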
- sod_sm_mean = np.mean(sod_sm_mean_list) - sod_gm_mean = np.mean(sod_gm_mean_list) - dis_k_sm_mean = np.mean(dis_k_sm_mean_list) - dis_k_gm_mean = np.mean(dis_k_gm_mean_list) - dis_k_gi_min_mean = np.mean(dis_k_gi_min_list) - sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean)) - dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean)) - dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean)) - dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean)) - if save_results: - f_summary = open(dir_output + fn_output_summary, 'a') - csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, 'all', - sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean, - dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean, - dis_k_gi2sm_mean, dis_k_gi2gm_mean]) - f_summary.close() - - print('\ncomplete.') - - -def xp_letter_h(): - ds = {'dataset': 'cpp_ext/data/collections/Letter.xml', - 'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'} # node/edge symb - Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir']) - for G in Gn: - reform_attributes(G) -# ds = {'name': 'Letter-high', -# 'dataset': '../datasets/Letter-high/Letter-high_A.txt'} # node/edge symb -# Gn, y_all = loadDataset(ds['dataset']) -# Gn = Gn[0:50] - gkernel = 'structuralspkernel' - node_label = None - edge_label = None - ds_name = 'letter-h' - dir_output = 'results/xp_letter_h/' - save_results = False - - repeats = 1 -# k_list = range(2, 11) - k_list = [150] - fit_method = 'k-graphs' - # get indices by classes. - y_idx = get_same_item_indices(y_all) - - if save_results: - # create result files. - fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' - f_detail = open(dir_output + fn_output_detail, 'a') - csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k', - 'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', - 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', - 'dis_k gi -> GM', 'median set']) - f_detail.close() - fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' 
+ fit_method + '.csv' - f_summary = open(dir_output + fn_output_summary, 'a') - csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k', - 'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', - 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', - 'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM', - '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM', - 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM', - 'repeats better dis_k gi -> GM']) - f_summary.close() - - random.seed(1) - rdn_seed_list = random.sample(range(0, repeats * 100), repeats) - - for k in k_list: - print('\n--------- k =', k, '----------') - - sod_sm_mean_list = [] - sod_gm_mean_list = [] - dis_k_sm_mean_list = [] - dis_k_gm_mean_list = [] - dis_k_gi_min_mean_list = [] -# nb_sod_sm2gm = [0, 0, 0] -# nb_dis_k_sm2gm = [0, 0, 0] -# nb_dis_k_gi2sm = [0, 0, 0] -# nb_dis_k_gi2gm = [0, 0, 0] -# repeats_better_sod_sm2gm = [] -# repeats_better_dis_k_sm2gm = [] -# repeats_better_dis_k_gi2sm = [] -# repeats_better_dis_k_gi2gm = [] - - for i, (y, values) in enumerate(y_idx.items()): - print('\ny =', y) -# y = 'N' -# values = y_idx[y] -# values = values[0:10] - - k = len(values) - - sod_sm_list = [] - sod_gm_list = [] - dis_k_sm_list = [] - dis_k_gm_list = [] - dis_k_gi_min_list = [] - nb_sod_sm2gm = [0, 0, 0] - nb_dis_k_sm2gm = [0, 0, 0] - nb_dis_k_gi2sm = [0, 0, 0] - nb_dis_k_gi2gm = [0, 0, 0] - repeats_better_sod_sm2gm = [] - repeats_better_dis_k_sm2gm = [] - repeats_better_dis_k_gi2sm = [] - repeats_better_dis_k_gi2gm = [] - - for repeat in range(repeats): - print('\nrepeat =', repeat) - random.seed(rdn_seed_list[repeat]) - median_set_idx_idx = random.sample(range(0, len(values)), k) - median_set_idx = [values[idx] for idx in median_set_idx_idx] - print('median set: ', median_set_idx) - Gn_median = [Gn[g] for g in values] - - sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min \ - = median_on_k_closest_graphs(Gn_median, node_label, edge_label, - gkernel, k, fit_method=fit_method, graph_dir=ds['graph_dir'], - edit_costs=None, group_min=median_set_idx_idx, - dataset='Letter', parallel=False) - - # write result detail. - sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm)) - dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm)) - dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min)) - dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min)) - if save_results: - f_detail = open(dir_output + fn_output_detail, 'a') - csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k, - y, repeat, - sod_sm, sod_gm, dis_k_sm, dis_k_gm, - dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm, - dis_k_gi2gm, median_set_idx]) - f_detail.close() - - # compute result summary. 
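[Note: both functions in xp_letter_h.py build paths via os.path.dirname(os.path.realpath(__file__)), but the module never imports os (xp_monoterpenoides.py does so inside its function), so each would raise NameError as soon as it runs. The fix is one line in the module's import block:]

    import os  # used via os.path.dirname(os.path.realpath(__file__)) in both functions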
- sod_sm_list.append(sod_sm) - sod_gm_list.append(sod_gm) - dis_k_sm_list.append(dis_k_sm) - dis_k_gm_list.append(dis_k_gm) - dis_k_gi_min_list.append(dis_k_gi_min) - # # SOD SM -> GM - if sod_sm > sod_gm: - nb_sod_sm2gm[0] += 1 - repeats_better_sod_sm2gm.append(repeat) - elif sod_sm == sod_gm: - nb_sod_sm2gm[1] += 1 - elif sod_sm < sod_gm: - nb_sod_sm2gm[2] += 1 - # # dis_k SM -> GM - if dis_k_sm > dis_k_gm: - nb_dis_k_sm2gm[0] += 1 - repeats_better_dis_k_sm2gm.append(repeat) - elif dis_k_sm == dis_k_gm: - nb_dis_k_sm2gm[1] += 1 - elif dis_k_sm < dis_k_gm: - nb_dis_k_sm2gm[2] += 1 - # # dis_k gi -> SM - if dis_k_gi_min > dis_k_sm: - nb_dis_k_gi2sm[0] += 1 - repeats_better_dis_k_gi2sm.append(repeat) - elif dis_k_gi_min == dis_k_sm: - nb_dis_k_gi2sm[1] += 1 - elif dis_k_gi_min < dis_k_sm: - nb_dis_k_gi2sm[2] += 1 - # # dis_k gi -> GM - if dis_k_gi_min > dis_k_gm: - nb_dis_k_gi2gm[0] += 1 - repeats_better_dis_k_gi2gm.append(repeat) - elif dis_k_gi_min == dis_k_gm: - nb_dis_k_gi2gm[1] += 1 - elif dis_k_gi_min < dis_k_gm: - nb_dis_k_gi2gm[2] += 1 - - # save median graphs. - fname_sm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/set_median.gxl' - fn_pre_sm_new = dir_output + 'medians/set_median.' + fit_method \ - + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) - copyfile(fname_sm, fn_pre_sm_new + '.gxl') - fname_gm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/gen_median.gxl' - fn_pre_gm_new = dir_output + 'medians/gen_median.' + fit_method \ - + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) - copyfile(fname_gm, fn_pre_gm_new + '.gxl') - G_best_kernel = Gn_median[idx_dis_k_gi_min].copy() - reform_attributes(G_best_kernel) - fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.' + fit_method \ - + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) - saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='gedlib-letter') - - # plot median graphs. - set_median = loadGXL(fn_pre_sm_new + '.gxl') - gen_median = loadGXL(fn_pre_gm_new + '.gxl') - draw_Letter_graph(set_median, fn_pre_sm_new) - draw_Letter_graph(gen_median, fn_pre_gm_new) - draw_Letter_graph(G_best_kernel, fn_pre_g_best_kernel) - - # write result summary for each letter. - sod_sm_mean_list.append(np.mean(sod_sm_list)) - sod_gm_mean_list.append(np.mean(sod_gm_list)) - dis_k_sm_mean_list.append(np.mean(dis_k_sm_list)) - dis_k_gm_mean_list.append(np.mean(dis_k_gm_list)) - dis_k_gi_min_mean_list.append(np.mean(dis_k_gi_min_list)) - sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean_list[-1] - sod_sm_mean_list[-1])) - dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1])) - dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1])) - dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1])) - if save_results: - f_summary = open(dir_output + fn_output_summary, 'a') - csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, y, - sod_sm_mean_list[-1], sod_gm_mean_list[-1], - dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1], - dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean, - dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm, - nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm, - repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm, - repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm]) - f_summary.close() - - - # write result summary for each letter. 
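[Note: the four win/tie/loss if/elif chains earlier in this loop body, and their copies in the other two scripts, are identical up to the pair being compared. A hypothetical helper they could collapse into:]

    def tally(a, b, counts, better_repeats, repeat):
        # counts is a [b_better, equal, b_worse] triple such as nb_sod_sm2gm;
        # better_repeats collects the repeat indices where b improved on a.
        if a > b:
            counts[0] += 1
            better_repeats.append(repeat)
        elif a == b:
            counts[1] += 1
        else:
            counts[2] += 1

    # replacing the first chain:
    # tally(sod_sm, sod_gm, nb_sod_sm2gm, repeats_better_sod_sm2gm, repeat)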
- sod_sm_mean = np.mean(sod_sm_mean_list) - sod_gm_mean = np.mean(sod_gm_mean_list) - dis_k_sm_mean = np.mean(dis_k_sm_mean_list) - dis_k_gm_mean = np.mean(dis_k_gm_mean_list) - dis_k_gi_min_mean = np.mean(dis_k_gi_min_list) - sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean)) - dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean)) - dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean)) - dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean)) - if save_results: - f_summary = open(dir_output + fn_output_summary, 'a') - csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, 'all', - sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean, - dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean, - dis_k_gi2sm_mean, dis_k_gi2gm_mean]) - f_summary.close() - - print('\ncomplete.') - - -#Dessin median courrant -def draw_Letter_graph(graph, file_prefix): - plt.figure() - pos = {} - for n in graph.nodes: - pos[n] = np.array([float(graph.node[n]['x']),float(graph.node[n]['y'])]) - nx.draw_networkx(graph, pos) - plt.savefig(file_prefix + '.eps', format='eps', dpi=300) -# plt.show() - plt.clf() - - -if __name__ == "__main__": -# xp_letter_h() - xp_letter_h_LETTER2_cost() \ No newline at end of file diff --git a/gklearn/preimage/xp_monoterpenoides.py b/gklearn/preimage/xp_monoterpenoides.py deleted file mode 100644 index 2270471..0000000 --- a/gklearn/preimage/xp_monoterpenoides.py +++ /dev/null @@ -1,249 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Thu Jan 16 11:03:11 2020 - -@author: ljia -""" - -import numpy as np -import random -import csv -from shutil import copyfile -import networkx as nx -import matplotlib.pyplot as plt - -from gklearn.utils.graphfiles import loadDataset, loadGXL, saveGXL -from gklearn.preimage.test_k_closest_graphs import median_on_k_closest_graphs, reform_attributes -from gklearn.preimage.utils import get_same_item_indices -from gklearn.preimage.find_best_k import getRelations - -def xp_monoterpenoides(): - import os - - ds = {'dataset': '../../datasets/monoterpenoides/dataset_10+.ds', - 'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '../../datasets/monoterpenoides/'} # node/edge symb - Gn, y_all = loadDataset(ds['dataset']) -# ds = {'name': 'Letter-high', -# 'dataset': '../datasets/Letter-high/Letter-high_A.txt'} # node/edge symb -# Gn, y_all = loadDataset(ds['dataset']) -# Gn = Gn[0:50] - gkernel = 'treeletkernel' - node_label = 'atom' - edge_label = 'bond_type' - ds_name = 'monoterpenoides' - dir_output = 'results/xp_monoterpenoides/' - - repeats = 1 -# k_list = range(2, 11) - k_list = [0] - fit_method = 'k-graphs' - # get indices by classes. - y_idx = get_same_item_indices(y_all) - - # create result files. - fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' - f_detail = open(dir_output + fn_output_detail, 'a') - csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k', - 'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', - 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', - 'dis_k gi -> GM', 'median set']) - f_detail.close() - fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' 
+ fit_method + '.csv' - f_summary = open(dir_output + fn_output_summary, 'a') - csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k', - 'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', - 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', - 'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM', - '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM', - 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM', - 'repeats better dis_k gi -> GM']) - f_summary.close() - - random.seed(1) - rdn_seed_list = random.sample(range(0, repeats * 100), repeats) - - for k in k_list: - print('\n--------- k =', k, '----------') - - sod_sm_mean_list = [] - sod_gm_mean_list = [] - dis_k_sm_mean_list = [] - dis_k_gm_mean_list = [] - dis_k_gi_min_mean_list = [] -# nb_sod_sm2gm = [0, 0, 0] -# nb_dis_k_sm2gm = [0, 0, 0] -# nb_dis_k_gi2sm = [0, 0, 0] -# nb_dis_k_gi2gm = [0, 0, 0] -# repeats_better_sod_sm2gm = [] -# repeats_better_dis_k_sm2gm = [] -# repeats_better_dis_k_gi2sm = [] -# repeats_better_dis_k_gi2gm = [] - - for i, (y, values) in enumerate(y_idx.items()): - print('\ny =', y) -# y = 'I' -# values = y_idx[y] - - k = len(values) -# k = kkk - - sod_sm_list = [] - sod_gm_list = [] - dis_k_sm_list = [] - dis_k_gm_list = [] - dis_k_gi_min_list = [] - nb_sod_sm2gm = [0, 0, 0] - nb_dis_k_sm2gm = [0, 0, 0] - nb_dis_k_gi2sm = [0, 0, 0] - nb_dis_k_gi2gm = [0, 0, 0] - repeats_better_sod_sm2gm = [] - repeats_better_dis_k_sm2gm = [] - repeats_better_dis_k_gi2sm = [] - repeats_better_dis_k_gi2gm = [] - - for repeat in range(repeats): - print('\nrepeat =', repeat) - random.seed(rdn_seed_list[repeat]) - median_set_idx_idx = random.sample(range(0, len(values)), k) - median_set_idx = [values[idx] for idx in median_set_idx_idx] - print('median set: ', median_set_idx) - Gn_median = [Gn[g] for g in values] - - sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min \ - = median_on_k_closest_graphs(Gn_median, node_label, edge_label, - gkernel, k, fit_method=fit_method, graph_dir=ds['graph_dir'], - edit_costs=None, group_min=median_set_idx_idx, - dataset=ds_name, parallel=False) - - # write result detail. - sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm)) - dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm)) - dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min)) - dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min)) - f_detail = open(dir_output + fn_output_detail, 'a') - csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k, - y, repeat, - sod_sm, sod_gm, dis_k_sm, dis_k_gm, - dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm, - dis_k_gi2gm, median_set_idx]) - f_detail.close() - - # compute result summary. 
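[Note: every result row in these scripts is written by reopening the CSV in append mode, writing one row, and closing again; verbose, but partial results stay on disk if a long run dies. The idiom as a small hypothetical helper:]

    import csv

    def append_row(path, row):
        # One row per open/close, as in the scripts above, so results
        # written so far survive a crash mid-experiment.
        with open(path, 'a') as f:
            csv.writer(f).writerow(row)

    # e.g. append_row(dir_output + fn_output_detail,
    #                 [ds_name, gkernel, fit_method, k, y, repeat, ...])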
- sod_sm_list.append(sod_sm) - sod_gm_list.append(sod_gm) - dis_k_sm_list.append(dis_k_sm) - dis_k_gm_list.append(dis_k_gm) - dis_k_gi_min_list.append(dis_k_gi_min) - # # SOD SM -> GM - if sod_sm > sod_gm: - nb_sod_sm2gm[0] += 1 - repeats_better_sod_sm2gm.append(repeat) - elif sod_sm == sod_gm: - nb_sod_sm2gm[1] += 1 - elif sod_sm < sod_gm: - nb_sod_sm2gm[2] += 1 - # # dis_k SM -> GM - if dis_k_sm > dis_k_gm: - nb_dis_k_sm2gm[0] += 1 - repeats_better_dis_k_sm2gm.append(repeat) - elif dis_k_sm == dis_k_gm: - nb_dis_k_sm2gm[1] += 1 - elif dis_k_sm < dis_k_gm: - nb_dis_k_sm2gm[2] += 1 - # # dis_k gi -> SM - if dis_k_gi_min > dis_k_sm: - nb_dis_k_gi2sm[0] += 1 - repeats_better_dis_k_gi2sm.append(repeat) - elif dis_k_gi_min == dis_k_sm: - nb_dis_k_gi2sm[1] += 1 - elif dis_k_gi_min < dis_k_sm: - nb_dis_k_gi2sm[2] += 1 - # # dis_k gi -> GM - if dis_k_gi_min > dis_k_gm: - nb_dis_k_gi2gm[0] += 1 - repeats_better_dis_k_gi2gm.append(repeat) - elif dis_k_gi_min == dis_k_gm: - nb_dis_k_gi2gm[1] += 1 - elif dis_k_gi_min < dis_k_gm: - nb_dis_k_gi2gm[2] += 1 - - # save median graphs. - fname_sm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/set_median.gxl' - fn_pre_sm_new = dir_output + 'medians/set_median.' + fit_method \ - + '.k' + str(int(k)) + '.y' + str(int(y)) + '.repeat' + str(repeat) - copyfile(fname_sm, fn_pre_sm_new + '.gxl') - fname_gm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/gen_median.gxl' - fn_pre_gm_new = dir_output + 'medians/gen_median.' + fit_method \ - + '.k' + str(int(k)) + '.y' + str(int(y)) + '.repeat' + str(repeat) - copyfile(fname_gm, fn_pre_gm_new + '.gxl') - G_best_kernel = Gn_median[idx_dis_k_gi_min].copy() -# reform_attributes(G_best_kernel) - fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.' + fit_method \ - + '.k' + str(int(k)) + '.y' + str(int(y)) + '.repeat' + str(repeat) - saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='gedlib') - -# # plot median graphs. -# set_median = loadGXL(fn_pre_sm_new + '.gxl') -# gen_median = loadGXL(fn_pre_gm_new + '.gxl') -# draw_Letter_graph(set_median, fn_pre_sm_new) -# draw_Letter_graph(gen_median, fn_pre_gm_new) -# draw_Letter_graph(G_best_kernel, fn_pre_g_best_kernel) - - # write result summary for each letter. - sod_sm_mean_list.append(np.mean(sod_sm_list)) - sod_gm_mean_list.append(np.mean(sod_gm_list)) - dis_k_sm_mean_list.append(np.mean(dis_k_sm_list)) - dis_k_gm_mean_list.append(np.mean(dis_k_gm_list)) - dis_k_gi_min_mean_list.append(np.mean(dis_k_gi_min_list)) - sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean_list[-1] - sod_sm_mean_list[-1])) - dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1])) - dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1])) - dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1])) - f_summary = open(dir_output + fn_output_summary, 'a') - csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, y, - sod_sm_mean_list[-1], sod_gm_mean_list[-1], - dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1], - dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean, - dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm, - nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm, - repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm, - repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm]) - f_summary.close() - - - # write result summary for each letter. 
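[Note: the median-graph filenames above repeat the same concatenation four times per repeat. A hypothetical helper for the shared prefix; y is numeric here (monoterpenoides targets), so xp_letter_h.py would drop the int(y) cast:]

    def median_path(dir_output, kind, fit_method, k, y, repeat):
        # kind is 'set_median', 'gen_median' or 'g_best_kernel'.
        return (dir_output + 'medians/' + kind + '.' + fit_method
                + '.k' + str(int(k)) + '.y' + str(int(y))
                + '.repeat' + str(repeat))

    # e.g. copyfile(fname_sm, median_path(dir_output, 'set_median',
    #                                     fit_method, k, y, repeat) + '.gxl')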
-        sod_sm_mean = np.mean(sod_sm_mean_list)
-        sod_gm_mean = np.mean(sod_gm_mean_list)
-        dis_k_sm_mean = np.mean(dis_k_sm_mean_list)
-        dis_k_gm_mean = np.mean(dis_k_gm_mean_list)
-        dis_k_gi_min_mean = np.mean(dis_k_gi_min_mean_list)  # was dis_k_gi_min_list, the per-repeat list of the last class only
-        sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
-        dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
-        dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
-        dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
-        f_summary = open(dir_output + fn_output_summary, 'a')
-        csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, 'all',
-                  sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
-                  dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
-                  dis_k_gi2sm_mean, dis_k_gi2gm_mean])
-        f_summary.close()
-
-
-    print('\ncomplete.')
-
-
-# Draw the current median graph.
-def draw_Letter_graph(graph, file_prefix):
-    plt.figure()
-    pos = {}
-    for n in graph.nodes:
-        # Graph.node was removed in networkx 2.4; Graph.nodes is the current accessor.
-        pos[n] = np.array([float(graph.nodes[n]['x']), float(graph.nodes[n]['y'])])
-    nx.draw_networkx(graph, pos)
-    plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
-#    plt.show()
-    plt.clf()
-
-
-if __name__ == "__main__":
-    xp_monoterpenoides()
\ No newline at end of file