diff --git a/gklearn/preimage/find_best_k.py b/gklearn/preimage/find_best_k.py
deleted file mode 100644
index df38d32..0000000
--- a/gklearn/preimage/find_best_k.py
+++ /dev/null
@@ -1,170 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Thu Jan 9 11:54:32 2020
-
-@author: ljia
-"""
-import numpy as np
-import random
-import csv
-
-from gklearn.utils.graphfiles import loadDataset
-from gklearn.preimage.test_k_closest_graphs import median_on_k_closest_graphs
-
-def find_best_k():
- ds = {'name': 'monoterpenoides',
- 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'])
-# Gn = Gn[0:50]
- gkernel = 'treeletkernel'
- node_label = 'atom'
- edge_label = 'bond_type'
- ds_name = 'mono'
- dir_output = 'results/test_find_best_k/'
-
- repeats = 50
- k_list = range(2, 11)
- fit_method = 'k-graphs'
- # fitted on the whole dataset - treelet - mono
- edit_costs = [0.1268873773592978, 0.004084633224249829, 0.0897581955378986, 0.15328856114451297, 0.3109956881625734, 0.0]
-
- # create result files.
- fn_output_detail = 'results_detail.' + fit_method + '.csv'
- f_detail = open(dir_output + fn_output_detail, 'a')
- csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k',
- 'repeat', 'median set', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
- 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
- 'dis_k gi -> GM'])
- f_detail.close()
- fn_output_summary = 'results_summary.csv'
- f_summary = open(dir_output + fn_output_summary, 'a')
- csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k',
- 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
- 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
- 'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM',
- '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM',
- 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
- 'repeats better dis_k gi -> GM'])
- f_summary.close()
-
- random.seed(1)
- rdn_seed_list = random.sample(range(0, repeats * 100), repeats)
-
- for k in k_list:
- print('\n--------- k =', k, '----------')
-
- sod_sm_list = []
- sod_gm_list = []
- dis_k_sm_list = []
- dis_k_gm_list = []
- dis_k_gi_min_list = []
- nb_sod_sm2gm = [0, 0, 0]
- nb_dis_k_sm2gm = [0, 0, 0]
- nb_dis_k_gi2sm = [0, 0, 0]
- nb_dis_k_gi2gm = [0, 0, 0]
- repeats_better_sod_sm2gm = []
- repeats_better_dis_k_sm2gm = []
- repeats_better_dis_k_gi2sm = []
- repeats_better_dis_k_gi2gm = []
-
-
- for repeat in range(repeats):
- print('\nrepeat =', repeat)
- random.seed(rdn_seed_list[repeat])
- median_set_idx = random.sample(range(0, len(Gn)), k)
- print('median set: ', median_set_idx)
-
- sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \
- = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k,
- fit_method='k-graphs',
- edit_costs=edit_costs,
- group_min=median_set_idx,
- parallel=False)
-
- # write result detail.
- sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
- dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
- dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
- dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
- f_detail = open(dir_output + fn_output_detail, 'a')
- csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k, repeat,
- median_set_idx, sod_sm, sod_gm, dis_k_sm, dis_k_gm,
- dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
- dis_k_gi2gm])
- f_detail.close()
-
- # compute result summary.
- sod_sm_list.append(sod_sm)
- sod_gm_list.append(sod_gm)
- dis_k_sm_list.append(dis_k_sm)
- dis_k_gm_list.append(dis_k_gm)
- dis_k_gi_min_list.append(dis_k_gi_min)
- # # SOD SM -> GM
- if sod_sm > sod_gm:
- nb_sod_sm2gm[0] += 1
- repeats_better_sod_sm2gm.append(repeat)
- elif sod_sm == sod_gm:
- nb_sod_sm2gm[1] += 1
- elif sod_sm < sod_gm:
- nb_sod_sm2gm[2] += 1
- # # dis_k SM -> GM
- if dis_k_sm > dis_k_gm:
- nb_dis_k_sm2gm[0] += 1
- repeats_better_dis_k_sm2gm.append(repeat)
- elif dis_k_sm == dis_k_gm:
- nb_dis_k_sm2gm[1] += 1
- elif dis_k_sm < dis_k_gm:
- nb_dis_k_sm2gm[2] += 1
- # # dis_k gi -> SM
- if dis_k_gi_min > dis_k_sm:
- nb_dis_k_gi2sm[0] += 1
- repeats_better_dis_k_gi2sm.append(repeat)
- elif dis_k_gi_min == dis_k_sm:
- nb_dis_k_gi2sm[1] += 1
- elif dis_k_gi_min < dis_k_sm:
- nb_dis_k_gi2sm[2] += 1
- # # dis_k gi -> GM
- if dis_k_gi_min > dis_k_gm:
- nb_dis_k_gi2gm[0] += 1
- repeats_better_dis_k_gi2gm.append(repeat)
- elif dis_k_gi_min == dis_k_gm:
- nb_dis_k_gi2gm[1] += 1
- elif dis_k_gi_min < dis_k_gm:
- nb_dis_k_gi2gm[2] += 1
-
- # write result summary.
- sod_sm_mean = np.mean(sod_sm_list)
- sod_gm_mean = np.mean(sod_gm_list)
- dis_k_sm_mean = np.mean(dis_k_sm_list)
- dis_k_gm_mean = np.mean(dis_k_gm_list)
- dis_k_gi_min_mean = np.mean(dis_k_gi_min_list)
- sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
- dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
- dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
- dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
- f_summary = open(dir_output + fn_output_summary, 'a')
- csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k,
- sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
- dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
- dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm,
- nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
- repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
- repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])
- f_summary.close()
-
- print('\ncomplete.')
- return
-
-
-def getRelations(sign):
- if sign == -1:
- return 'better'
- elif sign == 0:
- return 'same'
- elif sign == 1:
- return 'worse'
-
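# Editor's sketch (illustration, not part of the deleted file): how
# getRelations, defined below, is used above. np.sign of a difference
# yields -1/0/1, which maps to a qualitative relation between two SODs
# or distances (a smaller new value means an improvement).
import numpy as np
assert getRelations(np.sign(3.2 - 4.1)) == 'better'  # new value shrank
assert getRelations(np.sign(4.1 - 4.1)) == 'same'
assert getRelations(np.sign(5.0 - 4.1)) == 'worse'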
-
-if __name__ == '__main__':
- find_best_k()
\ No newline at end of file
diff --git a/gklearn/preimage/fitDistance.py b/gklearn/preimage/fitDistance.py
deleted file mode 100644
index 234f7fc..0000000
--- a/gklearn/preimage/fitDistance.py
+++ /dev/null
@@ -1,430 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Wed Oct 16 14:20:06 2019
-
-@author: ljia
-"""
-import numpy as np
-from tqdm import tqdm
-from itertools import combinations_with_replacement, combinations
-import multiprocessing
-from multiprocessing import Pool
-from functools import partial
-import time
-import random
-import sys
-
-from scipy import optimize
-from scipy.optimize import minimize
-import cvxpy as cp
-
-from gklearn.preimage.ged import GED, get_nb_edit_operations, get_nb_edit_operations_letter, get_nb_edit_operations_nonsymbolic
-from gklearn.preimage.utils import kernel_distance_matrix
-
-def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max,
- params_ged={'lib': 'gedlibpy', 'cost': 'CONSTANT',
- 'method': 'IPFP', 'stabilizer': None},
- init_costs=[3, 3, 1, 3, 3, 1],
- dataset='monoterpenoides', Kmatrix=None,
- parallel=True):
-# dataset = dataset.lower()
-
- # c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them.
-# random.seed(1)
-# cost_rdm = random.sample(range(1, 10), 6)
-# init_costs = cost_rdm + [0]
-# init_costs = cost_rdm
-# init_costs = [3, 3, 1, 3, 3, 1]
-# init_costs = [i * 0.01 for i in cost_rdm] + [0]
-# init_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
-# init_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
-# init_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0]
-# idx_cost_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
-
- # compute distances in feature space.
- dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label,
- Kmatrix=Kmatrix, gkernel=gkernel)
- dis_k_vec = []
- for i in range(len(dis_k_mat)):
-# for j in range(i, len(dis_k_mat)):
- for j in range(i + 1, len(dis_k_mat)):
- dis_k_vec.append(dis_k_mat[i, j])
- dis_k_vec = np.array(dis_k_vec)
-
- # init ged.
- print('\ninitial:')
- time0 = time.time()
- params_ged['dataset'] = dataset
- params_ged['edit_cost_constant'] = init_costs
- ged_vec_init, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
- parallel=parallel)
- residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))]
- time_list = [time.time() - time0]
- edit_cost_list = [init_costs]
- nb_cost_mat = np.array(n_edit_operations)
- nb_cost_mat_list = [nb_cost_mat]
- print('edit_costs:', init_costs)
- print('residual_list:', residual_list)
-
- for itr in range(itr_max):
- print('\niteration', itr)
- time0 = time.time()
- # "fit" GEDs to distances in feature space by tuning edit costs using
- # least squares.
- np.savez('results/xp_fit_method/fit_data_debug' + str(itr) + '.gm',
- nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec,
- n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init,
- ged_mat=ged_mat)
- edit_costs_new, residual = update_costs(nb_cost_mat, dis_k_vec,
- dataset=dataset, cost=params_ged['cost'])
- for i in range(len(edit_costs_new)):
- if -1e-9 <= edit_costs_new[i] <= 1e-9:
- edit_costs_new[i] = 0
- if edit_costs_new[i] < 0:
- raise ValueError('The edit cost is negative.')
-# for i in range(len(edit_costs_new)):
-# if edit_costs_new[i] < 0:
-# edit_costs_new[i] = 0
-
- # compute new GEDs and numbers of edit operations.
- params_ged['edit_cost_constant'] = edit_costs_new # np.array([edit_costs_new[0], edit_costs_new[1], 0.75])
- ged_vec, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
- parallel=parallel)
- residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec))))
- time_list.append(time.time() - time0)
- edit_cost_list.append(edit_costs_new)
- nb_cost_mat = np.array(n_edit_operations)
- nb_cost_mat_list.append(nb_cost_mat)
- print('edit_costs:', edit_costs_new)
- print('residual_list:', residual_list)
-
- return edit_costs_new, residual_list, edit_cost_list, dis_k_mat, ged_mat, \
- time_list, nb_cost_mat_list
-
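# Editor's sketch (illustration, not part of the deleted file): each pass of
# the fitting loop above solves a non-negative least-squares problem
#     min_w || N w - d ||_2  s.t.  w >= 0,
# where row p of N counts the edit operations used for graph pair p and d is
# the kernel-distance vector. A minimal standalone version using scipy:
import numpy as np
from scipy.optimize import nnls

def fit_costs_nnls(nb_cost_mat, dis_k_vec):
    """Return non-negative edit costs w minimizing ||nb_cost_mat @ w - dis_k_vec||."""
    w, res = nnls(np.asarray(nb_cost_mat), np.asarray(dis_k_vec))
    return w, res

# e.g. with synthetic counts, the true costs are recovered:
# N = np.random.rand(20, 6)
# d = N @ np.array([3.0, 3.0, 1.0, 3.0, 3.0, 1.0])
# fit_costs_nnls(N, d)[0]  # ~ [3, 3, 1, 3, 3, 1]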
-
-def compute_geds(Gn, params_ged, parallel=False):
- edit_cost_name = params_ged['cost']
- if edit_cost_name == 'LETTER' or edit_cost_name == 'LETTER2':
- get_nb_eo = get_nb_edit_operations_letter
- elif edit_cost_name == 'NON_SYMBOLIC':
- get_nb_eo = get_nb_edit_operations_nonsymbolic
- else:
- get_nb_eo = get_nb_edit_operations
- ged_mat = np.zeros((len(Gn), len(Gn)))
- if parallel:
-# print('parallel')
-# len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
- len_itr = int(len(Gn) * (len(Gn) - 1) / 2)
- ged_vec = [0 for i in range(len_itr)]
- n_edit_operations = [0 for i in range(len_itr)]
-# itr = combinations_with_replacement(range(0, len(Gn)), 2)
- itr = combinations(range(0, len(Gn)), 2)
- n_jobs = multiprocessing.cpu_count()
- if len_itr < 100 * n_jobs:
- chunksize = int(len_itr / n_jobs) + 1
- else:
- chunksize = 100
- def init_worker(gn_toshare):
- global G_gn
- G_gn = gn_toshare
- do_partial = partial(_wrapper_compute_ged_parallel, params_ged, get_nb_eo)
- pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,))
- iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
- desc='computing GEDs', file=sys.stdout)
-# iterator = pool.imap_unordered(do_partial, itr, chunksize)
- for i, j, dis, n_eo_tmp in iterator:
- idx_itr = int(len(Gn) * i + j - (i + 1) * (i + 2) / 2)
- ged_vec[idx_itr] = dis
- ged_mat[i][j] = dis
- ged_mat[j][i] = dis
- n_edit_operations[idx_itr] = n_eo_tmp
-# print('\n-------------------------------------------')
-# print(i, j, idx_itr, dis)
- pool.close()
- pool.join()
-
- else:
- ged_vec = []
- n_edit_operations = []
- for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
-# for i in range(len(Gn)):
- for j in range(i + 1, len(Gn)):
- dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], **params_ged)
- ged_vec.append(dis)
- ged_mat[i][j] = dis
- ged_mat[j][i] = dis
- n_eo_tmp = get_nb_eo(Gn[i], Gn[j], pi_forward, pi_backward)
- n_edit_operations.append(n_eo_tmp)
-
- return ged_vec, ged_mat, n_edit_operations
-
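# Editor's self-check (illustration, not part of the deleted file): the
# parallel branch above maps a pair (i, j), i < j, to its position in the
# condensed ordering produced by itertools.combinations via
#     idx = n*i + j - (i + 1)*(i + 2)/2.
# A quick verification of that formula:
from itertools import combinations

n = 5
for idx_ref, (i, j) in enumerate(combinations(range(n), 2)):
    assert int(n * i + j - (i + 1) * (i + 2) / 2) == idx_ref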
-
-def _wrapper_compute_ged_parallel(params_ged, get_nb_eo, itr):
- i = itr[0]
- j = itr[1]
- dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], params_ged, get_nb_eo)
- return i, j, dis, n_eo_tmp
-
-
-def _compute_ged_parallel(g1, g2, params_ged, get_nb_eo):
- dis, pi_forward, pi_backward = GED(g1, g2, **params_ged)
- n_eo_tmp = get_nb_eo(g1, g2, pi_forward, pi_backward) # [0,0,0,0,0,0]
- return dis, n_eo_tmp
-
-
-def update_costs(nb_cost_mat, dis_k_vec, dataset='monoterpenoides',
- cost='CONSTANT', rw_constraints='inequality'):
-# if dataset == 'Letter-high':
- if cost == 'LETTER':
- pass
-# # method 1: set alpha automatically, just tune c_vir and c_eir by
-# # LMS using cvxpy.
-# alpha = 0.5
-# coeff = 100 # np.max(alpha * nb_cost_mat[:,4] / dis_k_vec)
-## if np.count_nonzero(nb_cost_mat[:,4]) == 0:
-## alpha = 0.75
-## else:
-## alpha = np.min([dis_k_vec / c_vs for c_vs in nb_cost_mat[:,4] if c_vs != 0])
-## alpha = alpha * 0.99
-# param_vir = alpha * (nb_cost_mat[:,0] + nb_cost_mat[:,1])
-# param_eir = (1 - alpha) * (nb_cost_mat[:,4] + nb_cost_mat[:,5])
-# nb_cost_mat_new = np.column_stack((param_vir, param_eir))
-# dis_new = coeff * dis_k_vec - alpha * nb_cost_mat[:,3]
-#
-# x = cp.Variable(nb_cost_mat_new.shape[1])
-# cost = cp.sum_squares(nb_cost_mat_new * x - dis_new)
-# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
-# prob = cp.Problem(cp.Minimize(cost), constraints)
-# prob.solve()
-# edit_costs_new = x.value
-# edit_costs_new = np.array([edit_costs_new[0], edit_costs_new[1], alpha])
-# residual = np.sqrt(prob.value)
-
-# # method 2: tune c_vir, c_eir and alpha by nonlinear programming by
-# # scipy.optimize.minimize.
-# w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1]
-# w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5]
-# w2 = nb_cost_mat[:,3]
-# w3 = dis_k_vec
-# func_min = lambda x: np.sum((w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \
-# + w2 * x[2] - w3 * x[3]) ** 2)
-# bounds = ((0, None), (0., None), (0.5, 0.5), (0, None))
-# res = minimize(func_min, [0.9, 1.7, 0.75, 10], bounds=bounds)
-# edit_costs_new = res.x[0:3]
-# residual = res.fun
-
- # method 3: tune c_vir, c_eir and alpha by nonlinear programming using cvxpy.
-
-
-# # method 4: tune c_vir, c_eir and alpha by QP function
-# # scipy.optimize.least_squares. An initial guess is required.
-# w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1]
-# w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5]
-# w2 = nb_cost_mat[:,3]
-# w3 = dis_k_vec
-# func = lambda x: (w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \
-# + w2 * x[2] - w3 * x[3]) ** 2
-# res = optimize.root(func, [0.9, 1.7, 0.75, 100])
-# edit_costs_new = res.x
-# residual = None
- elif cost == 'LETTER2':
-# # 1. if c_vi != c_vr, c_ei != c_er.
-# nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
-# x = cp.Variable(nb_cost_mat_new.shape[1])
-# cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
-## # 1.1 no constraints.
-## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
-# # 1.2 c_vs <= c_vi + c_vr.
-# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
-# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
-## # 2. if c_vi == c_vr, c_ei == c_er.
-## nb_cost_mat_new = nb_cost_mat[:,[0,3,4]]
-## nb_cost_mat_new[:,0] += nb_cost_mat[:,1]
-## nb_cost_mat_new[:,2] += nb_cost_mat[:,5]
-## x = cp.Variable(nb_cost_mat_new.shape[1])
-## cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
-## # 2.1 no constraints.
-## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
-### # 2.2 c_vs <= c_vi + c_vr.
-### constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
-### np.array([2.0, -1.0, 0.0]).T@x >= 0.0]
-#
-# prob = cp.Problem(cp.Minimize(cost_fun), constraints)
-# prob.solve()
-# edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]]
-# edit_costs_new = np.array(edit_costs_new)
-# residual = np.sqrt(prob.value)
- if rw_constraints == 'inequality':
- # c_vs <= c_vi + c_vr.
- nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
- x = cp.Variable(nb_cost_mat_new.shape[1])
- cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
- constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])],
- np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
- prob = cp.Problem(cp.Minimize(cost_fun), constraints)
- try:
- prob.solve(verbose=True)
- except MemoryError as error0:
- print('\nUsing solver "OSQP" caused a memory error.')
- print('The original error message is\n', error0)
- print('solver status: ', prob.status)
- print('trying solver "CVXOPT" instead...\n')
- try:
- prob.solve(solver=cp.CVXOPT, verbose=True)
- except Exception as error1:
- print('\nAn error occurred when using solver "CVXOPT".')
- print('The original error message is\n', error1)
- print('solver status: ', prob.status)
- print('trying solver "MOSEK" instead. Note that this solver is commercial and requires a license.\n')
- prob.solve(solver=cp.MOSEK, verbose=True)
- else:
- print('solver status: ', prob.status)
- else:
- print('solver status: ', prob.status)
- print()
- edit_costs_new = x.value
- residual = np.sqrt(prob.value)
- elif rw_constraints == '2constraints':
- # c_vs <= c_vi + c_vr and c_vi == c_vr, c_ei == c_er.
- nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
- x = cp.Variable(nb_cost_mat_new.shape[1])
- cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
- constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
- np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0,
- np.array([1.0, -1.0, 0.0, 0.0, 0.0]).T@x == 0.0,
- np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
- prob = cp.Problem(cp.Minimize(cost_fun), constraints)
- prob.solve()
- edit_costs_new = x.value
- residual = np.sqrt(prob.value)
- elif rw_constraints == 'no-constraint':
- # no constraint.
- nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
- x = cp.Variable(nb_cost_mat_new.shape[1])
- cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
- constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
- prob = cp.Problem(cp.Minimize(cost_fun), constraints)
- prob.solve()
- edit_costs_new = x.value
- residual = np.sqrt(prob.value)
-# elif method == 'inequality_modified':
-# # c_vs <= c_vi + c_vr.
-# nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
-# x = cp.Variable(nb_cost_mat_new.shape[1])
-# cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
-# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
-# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
-# prob = cp.Problem(cp.Minimize(cost_fun), constraints)
-# prob.solve()
-# # use same costs for insertion and removal rather than the fitted costs.
-# edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]]
-# edit_costs_new = np.array(edit_costs_new)
-# residual = np.sqrt(prob.value)
- elif cost == 'NON_SYMBOLIC':
- is_n_attr = np.count_nonzero(nb_cost_mat[:,2])
- is_e_attr = np.count_nonzero(nb_cost_mat[:,5])
-
- if dataset == 'SYNTHETICnew':
-# nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]]
- nb_cost_mat_new = nb_cost_mat[:,[2,3,4]]
- x = cp.Variable(nb_cost_mat_new.shape[1])
- cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
-# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
-# np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
-# constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])]]
- constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])],
- np.array([0.0, 1.0, -1.0]).T@x == 0.0]
- prob = cp.Problem(cp.Minimize(cost_fun), constraints)
- prob.solve()
-# print(x.value)
- edit_costs_new = np.concatenate((np.array([0.0, 0.0]), x.value,
- np.array([0.0])))
- residual = np.sqrt(prob.value)
-
- elif rw_constraints == 'inequality':
- # c_vs <= c_vi + c_vr.
- if is_n_attr and is_e_attr:
- nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]]
- x = cp.Variable(nb_cost_mat_new.shape[1])
- cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
- constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
- np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
- np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
- prob = cp.Problem(cp.Minimize(cost_fun), constraints)
- prob.solve()
- edit_costs_new = x.value
- residual = np.sqrt(prob.value)
- elif is_n_attr and not is_e_attr:
- nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]]
- x = cp.Variable(nb_cost_mat_new.shape[1])
- cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
- constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])],
- np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
- prob = cp.Problem(cp.Minimize(cost_fun), constraints)
- prob.solve()
- print(x.value)
- edit_costs_new = np.concatenate((x.value, np.array([0.0])))
- residual = np.sqrt(prob.value)
- elif not is_n_attr and is_e_attr:
- nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
- x = cp.Variable(nb_cost_mat_new.shape[1])
- cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
- constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
- np.array([0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
- prob = cp.Problem(cp.Minimize(cost_fun), constraints)
- prob.solve()
- edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:]))
- residual = np.sqrt(prob.value)
- else:
- nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]]
- x = cp.Variable(nb_cost_mat_new.shape[1])
- cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
- constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
- prob = cp.Problem(cp.Minimize(cost_fun), constraints)
- prob.solve()
- edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]),
- x.value[2:], np.array([0.0])))
- residual = np.sqrt(prob.value)
- else:
-# # method 1: simple least square method.
-# edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
-# rcond=None)
-
-# # method 2: least square method with x_i >= 0.
-# edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec)
-
- # method 3: solve as a quadratic program with constraints.
-# P = np.dot(nb_cost_mat.T, nb_cost_mat)
-# q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat)
-# G = -1 * np.identity(nb_cost_mat.shape[1])
-# h = np.array([0 for i in range(nb_cost_mat.shape[1])])
-# A = np.array([1 for i in range(nb_cost_mat.shape[1])])
-# b = 1
-# x = cp.Variable(nb_cost_mat.shape[1])
-# prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q_T@x),
-# [G@x <= h])
-# prob.solve()
-# edit_costs_new = x.value
-# residual = prob.value - np.dot(dis_k_vec.T, dis_k_vec)
-
-# G = -1 * np.identity(nb_cost_mat.shape[1])
-# h = np.array([0 for i in range(nb_cost_mat.shape[1])])
- x = cp.Variable(nb_cost_mat.shape[1])
- cost_fun = cp.sum_squares(nb_cost_mat * x - dis_k_vec)
- constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])],
- # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
- np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
- np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
- prob = cp.Problem(cp.Minimize(cost_fun), constraints)
- prob.solve()
- edit_costs_new = x.value
- residual = np.sqrt(prob.value)
-
- # method 4:
-
- return edit_costs_new, residual
-
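# Editor's sketch (illustration, not part of the deleted file): every branch
# of update_costs above builds the same kind of cvxpy program; here is a
# minimal standalone instance on synthetic data, with the triangle constraint
# c_vs <= c_vi + c_vr on the first three cost components:
import numpy as np
import cvxpy as cp

N = np.random.rand(20, 5)   # synthetic edit-operation counts
d = np.random.rand(20)      # synthetic kernel distances
x = cp.Variable(5)
prob = cp.Problem(cp.Minimize(cp.sum_squares(N @ x - d)),
                  [x >= 0.0,
                   np.array([1.0, 1.0, -1.0, 0.0, 0.0]) @ x >= 0.0])
prob.solve()
edit_costs, residual = x.value, np.sqrt(prob.value)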
-
-if __name__ == '__main__':
- print('check test_fitDistance.py')
\ No newline at end of file
diff --git a/gklearn/preimage/ged.py b/gklearn/preimage/ged.py
deleted file mode 100644
index a66baaf..0000000
--- a/gklearn/preimage/ged.py
+++ /dev/null
@@ -1,467 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Thu Oct 17 18:44:59 2019
-
-@author: ljia
-"""
-import numpy as np
-import networkx as nx
-from tqdm import tqdm
-import sys
-import multiprocessing
-from multiprocessing import Pool
-from functools import partial
-
-#from gedlibpy_linlin import librariesImport, gedlibpy
-from gklearn.gedlib import librariesImport, gedlibpy
-
-def GED(g1, g2, dataset='monoterpenoides', lib='gedlibpy', cost='CHEM_1', method='IPFP',
- edit_cost_constant=[], algo_options='', stabilizer='min', repeat=50):
- """
- Compute GED for 2 graphs.
- """
-
-# dataset = dataset.lower()
-
- if lib == 'gedlibpy':
- gedlibpy.restart_env()
- gedlibpy.add_nx_graph(convertGraph(g1, cost), "")
- gedlibpy.add_nx_graph(convertGraph(g2, cost), "")
-
- listID = gedlibpy.get_all_graph_ids()
- gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant)
- gedlibpy.init()
- gedlibpy.set_method(method, algo_options)
- gedlibpy.init_method()
-
- g = listID[0]
- h = listID[1]
- if stabilizer is None:
- gedlibpy.run_method(g, h)
- pi_forward = gedlibpy.get_forward_map(g, h)
- pi_backward = gedlibpy.get_backward_map(g, h)
- upper = gedlibpy.get_upper_bound(g, h)
- lower = gedlibpy.get_lower_bound(g, h)
- elif stabilizer == 'mean':
- # @todo: to be finished...
- upper_list = [np.inf] * repeat
- for itr in range(repeat):
- gedlibpy.run_method(g, h)
- upper_list[itr] = gedlibpy.get_upper_bound(g, h)
- pi_forward = gedlibpy.get_forward_map(g, h)
- pi_backward = gedlibpy.get_backward_map(g, h)
- lower = gedlibpy.get_lower_bound(g, h)
- upper = np.mean(upper_list)
- elif stabilizer == 'median':
- if repeat % 2 == 0:
- repeat += 1
- upper_list = [np.inf] * repeat
- pi_forward_list = [0] * repeat
- pi_backward_list = [0] * repeat
- for itr in range(repeat):
- gedlibpy.run_method(g, h)
- upper_list[itr] = gedlibpy.get_upper_bound(g, h)
- pi_forward_list[itr] = gedlibpy.get_forward_map(g, h)
- pi_backward_list[itr] = gedlibpy.get_backward_map(g, h)
- lower = gedlibpy.get_lower_bound(g, h)
- upper = np.median(upper_list)
- idx_median = upper_list.index(upper)
- pi_forward = pi_forward_list[idx_median]
- pi_backward = pi_backward_list[idx_median]
- elif stabilizer == 'min':
- upper = np.inf
- for itr in range(repeat):
- gedlibpy.run_method(g, h)
- upper_tmp = gedlibpy.get_upper_bound(g, h)
- if upper_tmp < upper:
- upper = upper_tmp
- pi_forward = gedlibpy.get_forward_map(g, h)
- pi_backward = gedlibpy.get_backward_map(g, h)
- lower = gedlibpy.get_lower_bound(g, h)
- if upper == 0:
- break
- elif stabilizer == 'max':
- upper = 0
- for itr in range(repeat):
- gedlibpy.run_method(g, h)
- upper_tmp = gedlibpy.get_upper_bound(g, h)
- if upper_tmp > upper:
- upper = upper_tmp
- pi_forward = gedlibpy.get_forward_map(g, h)
- pi_backward = gedlibpy.get_backward_map(g, h)
- lower = gedlibpy.get_lower_bound(g, h)
- elif stabilizer == 'gaussian':
- pass
-
- dis = upper
-
- elif lib == 'gedlib-bash':
- import time
- import random
- import os
- from gklearn.utils.graphfiles import saveDataset
-
- tmp_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/'
- if not os.path.exists(tmp_dir):
- os.makedirs(tmp_dir)
- fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, int(1e9)))
- xparams = {'method': 'gedlib', 'graph_dir': fn_collection}
- saveDataset([g1, g2], ['dummy', 'dummy'], gformat='gxl', group='xml',
- filename=fn_collection, xparams=xparams)
-
- command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n'
- command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n'
- command += 'export LD_LIBRARY_PATH\n'
- command += 'cd \'' + os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/bin\'\n'
- command += './ged_for_python_bash monoterpenoides ' + fn_collection \
- + ' \'' + algo_options + '\' '
- for ec in edit_cost_constant:
- command += str(ec) + ' '
-# output = os.system(command)
- stream = os.popen(command)
- output = stream.readlines()
-# print(output)
-
- dis = float(output[0].strip())
- runtime = float(output[1].strip())
- size_forward = int(output[2].strip())
- pi_forward = [int(item.strip()) for item in output[3:3+size_forward]]
- pi_backward = [int(item.strip()) for item in output[3+size_forward:]]
-
-# print(dis)
-# print(runtime)
-# print(size_forward)
-# print(pi_forward)
-# print(pi_backward)
-
-
- # correct the mapping labels (map removed nodes to np.inf).
- nodes1 = [n for n in g1.nodes()]
- nodes2 = [n for n in g2.nodes()]
- nb1 = nx.number_of_nodes(g1)
- nb2 = nx.number_of_nodes(g2)
- pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
- pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
-# print(pi_forward)
-
-
- return dis, pi_forward, pi_backward
-
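# Editor's sketch (hypothetical usage, not part of the deleted file): calling
# GED on two small molecule-like graphs, assuming the gedlibpy bindings are
# built. Attribute names follow the 'atom'/'bond_type' convention expected by
# convertGraph below for the default 'CHEM_1' cost.
import networkx as nx
g1 = nx.Graph()
g1.add_node(0, atom='C'); g1.add_node(1, atom='O')
g1.add_edge(0, 1, bond_type='1')
g2 = nx.Graph()
g2.add_node(0, atom='C'); g2.add_node(1, atom='N')
g2.add_edge(0, 1, bond_type='1')
dis, pi_forward, pi_backward = GED(g1, g2, cost='CHEM_1', method='IPFP',
                                   stabilizer='min', repeat=10)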
-
-def convertGraph(G, cost):
- """Convert a graph to the NetworkX format recognized by the
- gedlibpy library.
- """
- G_new = nx.Graph()
- if cost == 'LETTER' or cost == 'LETTER2':
- for nd, attrs in G.nodes(data=True):
- G_new.add_node(str(nd), x=str(attrs['attributes'][0]),
- y=str(attrs['attributes'][1]))
- for nd1, nd2, attrs in G.edges(data=True):
- G_new.add_edge(str(nd1), str(nd2))
- elif cost == 'NON_SYMBOLIC':
- for nd, attrs in G.nodes(data=True):
- G_new.add_node(str(nd))
- for a_name in G.graph['node_attrs']:
- G_new.nodes[str(nd)][a_name] = str(attrs[a_name])
- for nd1, nd2, attrs in G.edges(data=True):
- G_new.add_edge(str(nd1), str(nd2))
- for a_name in G.graph['edge_attrs']:
- G_new.edges[str(nd1), str(nd2)][a_name] = str(attrs[a_name])
- else:
- for nd, attrs in G.nodes(data=True):
- G_new.add_node(str(nd), chem=attrs['atom'])
- for nd1, nd2, attrs in G.edges(data=True):
- G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
-# G_new.add_edge(str(nd1), str(nd2))
-
- return G_new
-
-
-def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP',
- edit_cost_constant=[], stabilizer='min', repeat=50):
- """
- Compute GEDs for a group of graphs.
- @todo: unfinished; g1 and g2 in the body are undefined, so this
- function cannot run as written.
- """
- if lib == 'gedlibpy':
- def convertGraph(G):
- """Convert a graph to the NetworkX format recognized by the
- gedlibpy library.
- """
- G_new = nx.Graph()
- for nd, attrs in G.nodes(data=True):
- G_new.add_node(str(nd), chem=attrs['atom'])
- for nd1, nd2, attrs in G.edges(data=True):
-# G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
- G_new.add_edge(str(nd1), str(nd2))
-
- return G_new
-
- gedlibpy.restart_env()
- gedlibpy.add_nx_graph(convertGraph(g1), "")
- gedlibpy.add_nx_graph(convertGraph(g2), "")
-
- listID = gedlibpy.get_all_graph_ids()
- gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant)
- gedlibpy.init()
- gedlibpy.set_method(method, "")
- gedlibpy.init_method()
-
- g = listID[0]
- h = listID[1]
- if stabilizer is None:
- gedlibpy.run_method(g, h)
- pi_forward = gedlibpy.get_forward_map(g, h)
- pi_backward = gedlibpy.get_backward_map(g, h)
- upper = gedlibpy.get_upper_bound(g, h)
- lower = gedlibpy.get_lower_bound(g, h)
- elif stabilizer == 'min':
- upper = np.inf
- for itr in range(repeat):
- gedlibpy.run_method(g, h)
- upper_tmp = gedlibpy.get_upper_bound(g, h)
- if upper_tmp < upper:
- upper = upper_tmp
- pi_forward = gedlibpy.get_forward_map(g, h)
- pi_backward = gedlibpy.get_backward_map(g, h)
- lower = gedlibpy.get_lower_bound(g, h)
- if upper == 0:
- break
-
- dis = upper
-
- # correct the mapping labels (map removed nodes to np.inf).
- nodes1 = [n for n in g1.nodes()]
- nodes2 = [n for n in g2.nodes()]
- nb1 = nx.number_of_nodes(g1)
- nb2 = nx.number_of_nodes(g2)
- pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
- pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
-
- return dis, pi_forward, pi_backward
-
-
-def ged_median(Gn, Gn_median, verbose=False, params_ged={'lib': 'gedlibpy',
- 'cost': 'CHEM_1', 'method': 'IPFP', 'edit_cost_constant': [],
- 'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1',
- 'stabilizer': None}, parallel=False):
- if parallel:
- len_itr = int(len(Gn))
- pi_forward_list = [[] for i in range(len_itr)]
- dis_list = [0 for i in range(len_itr)]
-
- itr = range(0, len_itr)
- n_jobs = multiprocessing.cpu_count()
- if len_itr < 100 * n_jobs:
- chunksize = int(len_itr / n_jobs) + 1
- else:
- chunksize = 100
- def init_worker(gn_toshare, gn_median_toshare):
- global G_gn, G_gn_median
- G_gn = gn_toshare
- G_gn_median = gn_median_toshare
- do_partial = partial(_compute_ged_median, params_ged)
- pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn, Gn_median))
- if verbose:
- iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
- desc='computing GEDs', file=sys.stdout)
- else:
- iterator = pool.imap_unordered(do_partial, itr, chunksize)
- for i, dis_sum, pi_forward in iterator:
- pi_forward_list[i] = pi_forward
- dis_list[i] = dis_sum
-# print('\n-------------------------------------------')
-# print(i, j, idx_itr, dis)
- pool.close()
- pool.join()
-
- else:
- dis_list = []
- pi_forward_list = []
- for idx, G in tqdm(enumerate(Gn), desc='computing median distances',
- file=sys.stdout) if verbose else enumerate(Gn):
- dis_sum = 0
- pi_forward_list.append([])
- for G_p in Gn_median:
- dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p,
- **params_ged)
- pi_forward_list[idx].append(pi_tmp_forward)
- dis_sum += dis_tmp
- dis_list.append(dis_sum)
-
- return dis_list, pi_forward_list
-
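# Editor's note (illustration, not part of the deleted file): the argmin of
# dis_list returned by ged_median is the set median of Gn_median within Gn.
# A hypothetical call, assuming the gedlibpy bindings are built:
# dis_list, pi_forward_list = ged_median(Gn, Gn, parallel=False)
# set_median = Gn[int(np.argmin(dis_list))]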
-
-def _compute_ged_median(params_ged, itr):
-# print(itr)
- dis_sum = 0
- pi_forward = []
- for G_p in G_gn_median:
- dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G_gn[itr], G_p,
- **params_ged)
- pi_forward.append(pi_tmp_forward)
- dis_sum += dis_tmp
-
- return itr, dis_sum, pi_forward
-
-
-def get_nb_edit_operations(g1, g2, forward_map, backward_map):
- """Compute the number of each type of edit operation.
- """
- n_vi = 0
- n_vr = 0
- n_vs = 0
- n_ei = 0
- n_er = 0
- n_es = 0
-
- nodes1 = [n for n in g1.nodes()]
- for i, map_i in enumerate(forward_map):
- if map_i == np.inf:
- n_vr += 1
- elif g1.nodes[nodes1[i]]['atom'] != g2.nodes[map_i]['atom']:
- n_vs += 1
- for map_i in backward_map:
- if map_i == np.inf:
- n_vi += 1
-
-# idx_nodes1 = range(0, len(node1))
-
- edges1 = [e for e in g1.edges()]
- nb_edges2_cnted = 0
- for n1, n2 in edges1:
- idx1 = nodes1.index(n1)
- idx2 = nodes1.index(n2)
- # one of the nodes is removed, thus the edge is removed.
- if forward_map[idx1] == np.inf or forward_map[idx2] == np.inf:
- n_er += 1
- # corresponding edge is in g2.
- elif (forward_map[idx1], forward_map[idx2]) in g2.edges():
- nb_edges2_cnted += 1
- # edge labels are different.
- if g2.edges[((forward_map[idx1], forward_map[idx2]))]['bond_type'] \
- != g1.edges[(n1, n2)]['bond_type']:
- n_es += 1
- elif (forward_map[idx2], forward_map[idx1]) in g2.edges():
- nb_edges2_cnted += 1
- # edge labels are different.
- if g2.edges[((forward_map[idx2], forward_map[idx1]))]['bond_type'] \
- != g1.edges[(n1, n2)]['bond_type']:
- n_es += 1
- # corresponding nodes are in g2, however the edge is removed.
- else:
- n_er += 1
- n_ei = nx.number_of_edges(g2) - nb_edges2_cnted
-
- return n_vi, n_vr, n_vs, n_ei, n_er, n_es
-
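# Editor's worked example (illustration, not part of the deleted file): two
# tiny 'atom'/'bond_type'-labeled graphs with hand-made node maps. Node 0 is
# substituted (C -> N), node 1 is preserved, and g2's extra node and edge
# count as insertions.
import networkx as nx
import numpy as np
g1 = nx.Graph()
g1.add_node(0, atom='C'); g1.add_node(1, atom='O')
g1.add_edge(0, 1, bond_type='1')
g2 = nx.Graph()
g2.add_node(0, atom='N'); g2.add_node(1, atom='O'); g2.add_node(2, atom='C')
g2.add_edge(0, 1, bond_type='1'); g2.add_edge(1, 2, bond_type='1')
forward_map = [0, 1]            # g1 node i maps to g2 node forward_map[i]
backward_map = [0, 1, np.inf]   # g2 node 2 is unmapped, i.e. inserted
# (n_vi, n_vr, n_vs, n_ei, n_er, n_es)
assert get_nb_edit_operations(g1, g2, forward_map, backward_map) == (1, 0, 1, 1, 0, 0)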
-
-def get_nb_edit_operations_letter(g1, g2, forward_map, backward_map):
- """Compute the number of each type of edit operation.
- """
- n_vi = 0
- n_vr = 0
- n_vs = 0
- sod_vs = 0
- n_ei = 0
- n_er = 0
-
- nodes1 = [n for n in g1.nodes()]
- for i, map_i in enumerate(forward_map):
- if map_i == np.inf:
- n_vr += 1
- else:
- n_vs += 1
- diff_x = float(g1.nodes[nodes1[i]]['x']) - float(g2.nodes[map_i]['x'])
- diff_y = float(g1.nodes[nodes1[i]]['y']) - float(g2.nodes[map_i]['y'])
- sod_vs += np.sqrt(np.square(diff_x) + np.square(diff_y))
- for map_i in backward_map:
- if map_i == np.inf:
- n_vi += 1
-
-# idx_nodes1 = range(0, len(node1))
-
- edges1 = [e for e in g1.edges()]
- nb_edges2_cnted = 0
- for n1, n2 in edges1:
- idx1 = nodes1.index(n1)
- idx2 = nodes1.index(n2)
- # one of the nodes is removed, thus the edge is removed.
- if forward_map[idx1] == np.inf or forward_map[idx2] == np.inf:
- n_er += 1
- # corresponding edge is in g2. Edge label is not considered.
- elif (forward_map[idx1], forward_map[idx2]) in g2.edges() or \
- (forward_map[idx2], forward_map[idx1]) in g2.edges():
- nb_edges2_cnted += 1
- # corresponding nodes are in g2, however the edge is removed.
- else:
- n_er += 1
- n_ei = nx.number_of_edges(g2) - nb_edges2_cnted
-
- return n_vi, n_vr, n_vs, sod_vs, n_ei, n_er
-
-
-def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map):
- """Compute the number of each type of edit operation.
- """
- n_vi = 0
- n_vr = 0
- n_vs = 0
- sod_vs = 0
- n_ei = 0
- n_er = 0
- n_es = 0
- sod_es = 0
-
- nodes1 = [n for n in g1.nodes()]
- for i, map_i in enumerate(forward_map):
- if map_i == np.inf:
- n_vr += 1
- else:
- n_vs += 1
- sum_squares = 0
- for a_name in g1.graph['node_attrs']:
- diff = float(g1.nodes[nodes1[i]][a_name]) - float(g2.nodes[map_i][a_name])
- sum_squares += np.square(diff)
- sod_vs += np.sqrt(sum_squares)
- for map_i in backward_map:
- if map_i == np.inf:
- n_vi += 1
-
-# idx_nodes1 = range(0, len(node1))
-
- edges1 = [e for e in g1.edges()]
- for n1, n2 in edges1:
- idx1 = nodes1.index(n1)
- idx2 = nodes1.index(n2)
- n1_g2 = forward_map[idx1]
- n2_g2 = forward_map[idx2]
- # one of the nodes is removed, thus the edge is removed.
- if n1_g2 == np.inf or n2_g2 == np.inf:
- n_er += 1
- # corresponding edge is in g2.
- elif (n1_g2, n2_g2) in g2.edges():
- n_es += 1
- sum_squares = 0
- for a_name in g1.graph['edge_attrs']:
- diff = float(g1.edges[n1, n2][a_name]) - float(g2.edges[n1_g2, n2_g2][a_name])
- sum_squares += np.square(diff)
- sod_es += np.sqrt(sum_squares)
- elif (n2_g2, n1_g2) in g2.edges():
- n_es += 1
- sum_squares = 0
- for a_name in g1.graph['edge_attrs']:
- diff = float(g1.edges[n2, n1][a_name]) - float(g2.edges[n2_g2, n1_g2][a_name])
- sum_squares += np.square(diff)
- sod_es += np.sqrt(sum_squares)
- # corresponding nodes are in g2, however the edge is removed.
- else:
- n_er += 1
- n_ei = nx.number_of_edges(g2) - n_es
-
- return n_vi, n_vr, sod_vs, n_ei, n_er, sod_es
-
-
-if __name__ == '__main__':
- print('check test_ged.py')
\ No newline at end of file
diff --git a/gklearn/preimage/iam.py b/gklearn/preimage/iam.py
deleted file mode 100644
index f3e2165..0000000
--- a/gklearn/preimage/iam.py
+++ /dev/null
@@ -1,775 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Fri Apr 26 11:49:12 2019
-
-Iterative alternate minimizations using GED.
-@author: ljia
-"""
-import numpy as np
-import random
-import networkx as nx
-from tqdm import tqdm
-import sys
-
-from gklearn.utils.graphdataset import get_dataset_attributes
-from gklearn.utils.utils import graph_isIdentical, get_node_labels, get_edge_labels
-from gklearn.preimage.ged import GED, ged_median
-
-
-def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
- epsilon=0.001, node_label='atom', edge_label='bond_type',
- connected=False, removeNodes=True, allBestInit=False, allBestNodes=False,
- allBestEdges=False, allBestOutput=False,
- params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP',
- 'edit_cost_constant': [], 'stabilizer': None,
- 'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'}):
- """Compute a generalized median graph of Gn_median by iterative
- alternate minimization (IAM) based on GED, starting from Gn_candidate.
- """
-# Gn_median = Gn_median[0:10]
-# Gn_median = [nx.convert_node_labels_to_integers(g) for g in Gn_median]
- node_ir = np.inf # corresponds to node removal and insertion.
- label_r = 'thanksdanny' # the label marking node removal. # @todo: ensure this label cannot collide with real labels.
- ds_attrs = get_dataset_attributes(Gn_median + Gn_candidate,
- attr_names=['edge_labeled', 'node_attr_dim', 'edge_attr_dim'],
- edge_label=edge_label)
- node_label_set = get_node_labels(Gn_median, node_label)
- edge_label_set = get_edge_labels(Gn_median, edge_label)
-
-
- def generate_graph(G, pi_p_forward):
- G_new_list = [G.copy()] # all "best" graphs generated in this iteration.
-# nx.draw_networkx(G)
-# import matplotlib.pyplot as plt
-# plt.show()
-# print(pi_p_forward)
-
- # update vertex labels.
- # pre-compute h_i0 for each label.
-# for label in get_node_labels(Gn, node_label):
-# print(label)
-# for nd in G.nodes(data=True):
-# pass
- if not ds_attrs['node_attr_dim']: # labels are symbolic
- for ndi, (nd, _) in enumerate(G.nodes(data=True)):
- h_i0_list = []
- label_list = []
- for label in node_label_set:
- h_i0 = 0
- for idx, g in enumerate(Gn_median):
- pi_i = pi_p_forward[idx][ndi]
- if pi_i != node_ir and g.nodes[pi_i][node_label] == label:
- h_i0 += 1
- h_i0_list.append(h_i0)
- label_list.append(label)
- # case when the node is to be removed.
- if removeNodes:
- h_i0_remove = 0 # @todo: maybe this can be added to the node_label_set above.
- for idx, g in enumerate(Gn_median):
- pi_i = pi_p_forward[idx][ndi]
- if pi_i == node_ir:
- h_i0_remove += 1
- h_i0_list.append(h_i0_remove)
- label_list.append(label_r)
- # get the best labels.
- idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
- if allBestNodes: # choose all best graphs.
- nlabel_best = [label_list[idx] for idx in idx_max]
- # generate "best" graphs with regard to "best" node labels.
- G_new_list_nd = []
- for g in G_new_list: # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now.
- for nl in nlabel_best:
- g_tmp = g.copy()
- if nl == label_r:
- g_tmp.remove_node(nd)
- else:
- g_tmp.nodes[nd][node_label] = nl
- G_new_list_nd.append(g_tmp)
- # nx.draw_networkx(g_tmp)
- # import matplotlib.pyplot as plt
- # plt.show()
- # print(g_tmp.nodes(data=True))
- # print(g_tmp.edges(data=True))
- G_new_list = [ggg.copy() for ggg in G_new_list_nd]
- else:
- # choose one of the best randomly.
- idx_rdm = random.randint(0, len(idx_max) - 1)
- best_label = label_list[idx_max[idx_rdm]]
- h_i0_max = h_i0_list[idx_max[idx_rdm]]
-
- g_new = G_new_list[0]
- if best_label == label_r:
- g_new.remove_node(nd)
- else:
- g_new.nodes[nd][node_label] = best_label
- G_new_list = [g_new]
- else: # labels are non-symbolic
- for ndi, (nd, _) in enumerate(G.nodes(data=True)):
- Si_norm = 0
- phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
- for idx, g in enumerate(Gn_median):
- pi_i = pi_p_forward[idx][ndi]
- if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0?
- Si_norm += 1
- phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
- phi_i_bar /= Si_norm
- G_new_list[0].nodes[nd]['attributes'] = phi_i_bar
-
-# for g in G_new_list:
-# import matplotlib.pyplot as plt
-# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
-# plt.show()
-# print(g.nodes(data=True))
-# print(g.edges(data=True))
-
- # update edge labels and adjacency matrix.
- if ds_attrs['edge_labeled']:
- G_new_list_edge = []
- for g_new in G_new_list:
- nd_list = [n for n in g_new.nodes()]
- g_tmp_list = [g_new.copy()]
- for nd1i in range(nx.number_of_nodes(g_new)):
- nd1 = nd_list[nd1i]# @todo: not just edges, but all pairs of nodes
- for nd2i in range(nd1i + 1, nx.number_of_nodes(g_new)):
- nd2 = nd_list[nd2i]
-# for nd1, nd2, _ in g_new.edges(data=True):
- h_ij0_list = []
- label_list = []
- for label in edge_label_set:
- h_ij0 = 0
- for idx, g in enumerate(Gn_median):
- pi_i = pi_p_forward[idx][nd1i]
- pi_j = pi_p_forward[idx][nd2i]
- h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
- g.has_edge(pi_i, pi_j) and
- g.edges[pi_i, pi_j][edge_label] == label)
- h_ij0 += h_ij0_p
- h_ij0_list.append(h_ij0)
- label_list.append(label)
-
- # get the best labels.
- idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
- if allBestEdges: # choose all best graphs.
- elabel_best = [label_list[idx] for idx in idx_max]
- h_ij0_max = [h_ij0_list[idx] for idx in idx_max]
- # generate "best" graphs with regard to "best" node labels.
- G_new_list_ed = []
- for g_tmp in g_tmp_list: # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now.
- for idxl, el in enumerate(elabel_best):
- g_tmp_copy = g_tmp.copy()
- # check whether a_ij is 0 or 1.
- sij_norm = 0
- for idx, g in enumerate(Gn_median):
- pi_i = pi_p_forward[idx][nd1i]
- pi_j = pi_p_forward[idx][nd2i]
- if g.has_node(pi_i) and g.has_node(pi_j) and \
- g.has_edge(pi_i, pi_j):
- sij_norm += 1
- if h_ij0_max[idxl] > len(Gn_median) * c_er / c_es + \
- sij_norm * (1 - (c_er + c_ei) / c_es):
- if not g_tmp_copy.has_edge(nd1, nd2):
- g_tmp_copy.add_edge(nd1, nd2)
- g_tmp_copy.edges[nd1, nd2][edge_label] = elabel_best[idxl]
- else:
- if g_tmp_copy.has_edge(nd1, nd2):
- g_tmp_copy.remove_edge(nd1, nd2)
- G_new_list_ed.append(g_tmp_copy)
- g_tmp_list = [ggg.copy() for ggg in G_new_list_ed]
- else: # choose one of the best randomly.
- idx_rdm = random.randint(0, len(idx_max) - 1)
- best_label = label_list[idx_max[idx_rdm]]
- h_ij0_max = h_ij0_list[idx_max[idx_rdm]]
-
- # check whether a_ij is 0 or 1.
- sij_norm = 0
- for idx, g in enumerate(Gn_median):
- pi_i = pi_p_forward[idx][nd1i]
- pi_j = pi_p_forward[idx][nd2i]
- if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
- sij_norm += 1
- if h_ij0_max > len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
- if not g_new.has_edge(nd1, nd2):
- g_new.add_edge(nd1, nd2)
- g_new.edges[nd1, nd2][edge_label] = best_label
- else:
-# elif h_ij0_max < len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
- if g_new.has_edge(nd1, nd2):
- g_new.remove_edge(nd1, nd2)
- g_tmp_list = [g_new]
- G_new_list_edge += g_tmp_list
- G_new_list = [ggg.copy() for ggg in G_new_list_edge]
-
-
- else: # if edges are unlabeled
- # @todo: is this even right? G or g_tmp? check if the new one is right
- # @todo: works only for undirected graphs.
-
- for g_tmp in G_new_list:
- nd_list = [n for n in g_tmp.nodes()]
- for nd1i in range(nx.number_of_nodes(g_tmp)):
- nd1 = nd_list[nd1i]
- for nd2i in range(nd1i + 1, nx.number_of_nodes(g_tmp)):
- nd2 = nd_list[nd2i]
- sij_norm = 0
- for idx, g in enumerate(Gn_median):
- pi_i = pi_p_forward[idx][nd1i]
- pi_j = pi_p_forward[idx][nd2i]
- if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
- sij_norm += 1
- if sij_norm > len(Gn_median) * c_er / (c_er + c_ei):
- # @todo: should we consider if nd1 and nd2 in g_tmp?
- # or just add the edge anyway?
- if g_tmp.has_node(nd1) and g_tmp.has_node(nd2) \
- and not g_tmp.has_edge(nd1, nd2):
- g_tmp.add_edge(nd1, nd2)
- else: # @todo: which to use?
-# elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei):
- if g_tmp.has_edge(nd1, nd2):
- g_tmp.remove_edge(nd1, nd2)
- # do not change anything when equal.
-
-# for i, g in enumerate(G_new_list):
-# import matplotlib.pyplot as plt
-# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
-## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG")
-# plt.show()
-# print(g.nodes(data=True))
-# print(g.edges(data=True))
-
-# # find the best graph generated in this iteration and update pi_p.
- # @todo: should we update all graphs generated or just the best ones?
- dis_list, pi_forward_list = ged_median(G_new_list, Gn_median,
- params_ged=params_ged)
- # @todo: should we remove the identical and connectivity check?
- # Don't know which is faster.
- if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
- G_new_list, idx_list = remove_duplicates(G_new_list)
- pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
- dis_list = [dis_list[idx] for idx in idx_list]
-# if connected == True:
-# G_new_list, idx_list = remove_disconnected(G_new_list)
-# pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
-# idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
-# dis_min = dis_list[idx_min_tmp_list[0]]
-# pi_forward_list = [pi_forward_list[idx] for idx in idx_min_list]
-# G_new_list = [G_new_list[idx] for idx in idx_min_list]
-
-# for g in G_new_list:
-# import matplotlib.pyplot as plt
-# nx.draw_networkx(g)
-# plt.show()
-# print(g.nodes(data=True))
-# print(g.edges(data=True))
-
- return G_new_list, pi_forward_list, dis_list
-
-
- def best_median_graphs(Gn_candidate, pi_all_forward, dis_all):
- idx_min_list = np.argwhere(dis_all == np.min(dis_all)).flatten().tolist()
- dis_min = dis_all[idx_min_list[0]]
- pi_forward_min_list = [pi_all_forward[idx] for idx in idx_min_list]
- G_min_list = [Gn_candidate[idx] for idx in idx_min_list]
- return G_min_list, pi_forward_min_list, dis_min
-
-
- def iteration_proc(G, pi_p_forward, cur_sod):
- G_list = [G]
- pi_forward_list = [pi_p_forward]
- old_sod = cur_sod * 2
- sod_list = [cur_sod]
- dis_list = [cur_sod]
- # iterations.
- itr = 0
- # @todo: what if difference == 0?
-# while itr < ite_max and (np.abs(old_sod - cur_sod) > epsilon or
-# np.abs(old_sod - cur_sod) == 0):
- while itr < ite_max and np.abs(old_sod - cur_sod) > epsilon:
-# while itr < ite_max:
-# for itr in range(0, 5): # the convergence condition?
- print('itr_iam is', itr)
- G_new_list = []
- pi_forward_new_list = []
- dis_new_list = []
- for idx, g in enumerate(G_list):
-# label_set = get_node_labels(Gn_median + [g], node_label)
- G_tmp_list, pi_forward_tmp_list, dis_tmp_list = generate_graph(
- g, pi_forward_list[idx])
- G_new_list += G_tmp_list
- pi_forward_new_list += pi_forward_tmp_list
- dis_new_list += dis_tmp_list
- # @todo: need to remove duplicates here?
- G_list = [ggg.copy() for ggg in G_new_list]
- pi_forward_list = [pitem.copy() for pitem in pi_forward_new_list]
- dis_list = dis_new_list[:]
-
- old_sod = cur_sod
- cur_sod = np.min(dis_list)
- sod_list.append(cur_sod)
-
- itr += 1
-
- # @todo: do we return all graphs or the best ones?
- # get the best ones of the generated graphs.
- G_list, pi_forward_list, dis_min = best_median_graphs(
- G_list, pi_forward_list, dis_list)
-
- if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
- G_list, idx_list = remove_duplicates(G_list)
- pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
-# dis_list = [dis_list[idx] for idx in idx_list]
-
-# import matplotlib.pyplot as plt
-# for g in G_list:
-# nx.draw_networkx(g)
-# plt.show()
-# print(g.nodes(data=True))
-# print(g.edges(data=True))
-
- print('\nsods:', sod_list, '\n')
-
- return G_list, pi_forward_list, dis_min, sod_list
-
-
- def remove_duplicates(Gn):
- """Remove duplicate graphs from list.
- """
- Gn_new = []
- idx_list = []
- for idx, g in enumerate(Gn):
- dupl = False
- for g_new in Gn_new:
- if graph_isIdentical(g_new, g):
- dupl = True
- break
- if not dupl:
- Gn_new.append(g)
- idx_list.append(idx)
- return Gn_new, idx_list
-
-
- def remove_disconnected(Gn):
- """Remove disconnected graphs from list.
- """
- Gn_new = []
- idx_list = []
- for idx, g in enumerate(Gn):
- if nx.is_connected(g):
- Gn_new.append(g)
- idx_list.append(idx)
- return Gn_new, idx_list
-
-
- ###########################################################################
-
- # phase 1: initialize.
- # compute the set median.
- dis_min = np.inf
- dis_list, pi_forward_all = ged_median(Gn_candidate, Gn_median,
- params_ged=params_ged, parallel=True)
- print('finished computing GEDs.')
- # find all smallest distances.
- if allBestInit: # try all best init graphs.
- idx_min_list = range(len(dis_list))
- dis_min = dis_list
- else:
- idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
- dis_min = [dis_list[idx_min_list[0]]] * len(idx_min_list)
- idx_min_rdm = random.randint(0, len(idx_min_list) - 1)
- idx_min_list = [idx_min_list[idx_min_rdm]]
- sod_set_median = np.min(dis_min)
-
-
- # phase 2: iteration.
- G_list = []
- dis_list = []
- pi_forward_list = []
- G_set_median_list = []
-# sod_list = []
- for idx_tmp, idx_min in enumerate(idx_min_list):
-# print('idx_min is', idx_min)
- G = Gn_candidate[idx_min].copy()
- G_set_median_list.append(G.copy())
- # list of edit operations.
- pi_p_forward = pi_forward_all[idx_min]
-# pi_p_backward = pi_all_backward[idx_min]
- Gi_list, pi_i_forward_list, dis_i_min, sod_list = iteration_proc(G,
- pi_p_forward, dis_min[idx_tmp])
- G_list += Gi_list
- dis_list += [dis_i_min] * len(Gi_list)
- pi_forward_list += pi_i_forward_list
-
-
- if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
- G_list, idx_list = remove_duplicates(G_list)
- dis_list = [dis_list[idx] for idx in idx_list]
- pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
- if connected == True:
- G_list_con, idx_list = remove_disconnected(G_list)
- # if there are no connected graphs at all, keep the disconnected ones.
- if len(G_list_con) > 0: # @todo: is keeping the disconnected graphs the right fallback?
- G_list = G_list_con
- dis_list = [dis_list[idx] for idx in idx_list]
- pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
-
-# import matplotlib.pyplot as plt
-# for g in G_list:
-# nx.draw_networkx(g)
-# plt.show()
-# print(g.nodes(data=True))
-# print(g.edges(data=True))
-
- # get the best median graphs
- G_gen_median_list, pi_forward_min_list, sod_gen_median = best_median_graphs(
- G_list, pi_forward_list, dis_list)
-# for g in G_gen_median_list:
-# nx.draw_networkx(g)
-# plt.show()
-# print(g.nodes(data=True))
-# print(g.edges(data=True))
-
- if not allBestOutput:
- # randomly choose one graph.
- idx_rdm = random.randint(0, len(G_gen_median_list) - 1)
- G_gen_median_list = [G_gen_median_list[idx_rdm]]
-
- return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median
-
-
-def iam_bash(Gn_names, edit_cost_constant, cost='CONSTANT', initial_solutions=1,
- dataset='monoterpenoides',
- graph_dir=''):
- """Compute the IAM via the C++ implementation (GEDLIB) invoked through bash.
- """
- import os
- import time
-
- def createCollectionFile(Gn_names, y, filename):
- """Create collection file.
- """
- dirname_ds = os.path.dirname(filename)
- if dirname_ds != '':
- dirname_ds += '/'
- if not os.path.exists(dirname_ds) :
- os.makedirs(dirname_ds)
-
- with open(filename + '.xml', 'w') as fgroup:
- fgroup.write("<?xml version=\"1.0\"?>")
- fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">")
- fgroup.write("\n<GraphCollection>")
- for idx, fname in enumerate(Gn_names):
- fgroup.write("\n\t<graph file=\"" + fname + "\" class=\"" + str(y[idx]) + "\"/>")
- fgroup.write("\n</GraphCollection>")
- fgroup.close()
-
- tmp_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/'
- fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, int(1e9)))
- createCollectionFile(Gn_names, ['dummy'] * len(Gn_names), fn_collection)
-# fn_collection = tmp_dir + 'collection_for_debug'
-# graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/gxl'
-
-# if dataset == 'Letter-high' or dataset == 'Fingerprint':
-# dataset = 'letter'
- command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/Linlin/gedlib\'\n'
- command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n'
- command += 'export LD_LIBRARY_PATH\n'
- command += 'cd \'' + os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/bin\'\n'
- command += './iam_for_python_bash ' + dataset + ' ' + fn_collection \
- + ' \'' + graph_dir + '\' ' + ' ' + cost + ' ' + str(initial_solutions) + ' '
- if edit_cost_constant is None:
- command += 'None'
- else:
- for ec in edit_cost_constant:
- command += str(ec) + ' '
-# output = os.system(command)
- stream = os.popen(command)
-
- output = stream.readlines()
-# print(output)
- sod_sm = float(output[0].strip())
- sod_gm = float(output[1].strip())
-
- fname_sm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/set_median.gxl'
- fname_gm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/gen_median.gxl'
-
- return sod_sm, sod_gm, fname_sm, fname_gm
-
-
-
-###############################################################################
-# Old implementations.
-
-def iam(Gn, c_ei=3, c_er=3, c_es=1, node_label='atom', edge_label='bond_type',
- connected=True):
- """Compute a median graph of Gn by iterative alternate minimization
- (an older IAM implementation).
- """
-# Gn = Gn[0:10]
- Gn = [nx.convert_node_labels_to_integers(g) for g in Gn]
-
- # phase 1: initialize.
- # compute the set median.
- dis_min = np.inf
- pi_p = []
- pi_all = []
- for idx1, G_p in enumerate(Gn):
- dist_sum = 0
- pi_all.append([])
- for idx2, G_p_prime in enumerate(Gn):
- dist_tmp, pi_tmp, _ = GED(G_p, G_p_prime)
- pi_all[idx1].append(pi_tmp)
- dist_sum += dist_tmp
- if dist_sum < dis_min:
- dis_min = dist_sum
- G = G_p.copy()
- idx_min = idx1
- # list of edit operations.
- pi_p = pi_all[idx_min]
-
- # phase 2: iteration.
- ds_attrs = get_dataset_attributes(Gn, attr_names=['edge_labeled', 'node_attr_dim'],
- edge_label=edge_label)
- for itr in range(0, 10): # @todo: the convergence condition?
- G_new = G.copy()
- # update vertex labels.
- # pre-compute h_i0 for each label.
-# for label in get_node_labels(Gn, node_label):
-# print(label)
-# for nd in G.nodes(data=True):
-# pass
- if not ds_attrs['node_attr_dim']: # labels are symbolic
- for nd, _ in G.nodes(data=True):
- h_i0_list = []
- label_list = []
- for label in get_node_labels(Gn, node_label):
- h_i0 = 0
- for idx, g in enumerate(Gn):
- pi_i = pi_p[idx][nd]
- if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
- h_i0 += 1
- h_i0_list.append(h_i0)
- label_list.append(label)
- # choose one of the best randomly.
- idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
- idx_rdm = random.randint(0, len(idx_max) - 1)
- G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]]
- else: # labels are non-symbolic
- for nd, _ in G.nodes(data=True):
- Si_norm = 0
- phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
- for idx, g in enumerate(Gn):
- pi_i = pi_p[idx][nd]
- if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0?
- Si_norm += 1
- phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
- phi_i_bar /= Si_norm
- G_new.nodes[nd]['attributes'] = phi_i_bar
-
- # update edge labels and adjacency matrix.
- if ds_attrs['edge_labeled']:
- for nd1, nd2, _ in G.edges(data=True):
- h_ij0_list = []
- label_list = []
- for label in get_edge_labels(Gn, edge_label):
- h_ij0 = 0
- for idx, g in enumerate(Gn):
- pi_i = pi_p[idx][nd1]
- pi_j = pi_p[idx][nd2]
- h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
- g.has_edge(pi_i, pi_j) and
- g.edges[pi_i, pi_j][edge_label] == label)
- h_ij0 += h_ij0_p
- h_ij0_list.append(h_ij0)
- label_list.append(label)
- # choose one of the best randomly.
- idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
- h_ij0_max = h_ij0_list[idx_max[0]]
- idx_rdm = random.randint(0, len(idx_max) - 1)
- best_label = label_list[idx_max[idx_rdm]]
-
- # check whether a_ij is 0 or 1.
- sij_norm = 0
- for idx, g in enumerate(Gn):
- pi_i = pi_p[idx][nd1]
- pi_j = pi_p[idx][nd2]
- if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
- sij_norm += 1
- if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
- if not G_new.has_edge(nd1, nd2):
- G_new.add_edge(nd1, nd2)
- G_new.edges[nd1, nd2][edge_label] = best_label
- else:
- if G_new.has_edge(nd1, nd2):
- G_new.remove_edge(nd1, nd2)
- else: # if edges are unlabeled
- for nd1, nd2, _ in G.edges(data=True):
- sij_norm = 0
- for idx, g in enumerate(Gn):
- pi_i = pi_p[idx][nd1]
- pi_j = pi_p[idx][nd2]
- if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
- sij_norm += 1
- if sij_norm > len(Gn) * c_er / (c_er + c_ei):
- if not G_new.has_edge(nd1, nd2):
- G_new.add_edge(nd1, nd2)
- else:
- if G_new.has_edge(nd1, nd2):
- G_new.remove_edge(nd1, nd2)
-
- G = G_new.copy()
-
- # update pi_p
- pi_p = []
- for idx1, G_p in enumerate(Gn):
- dist_tmp, pi_tmp, _ = GED(G, G_p)
- pi_p.append(pi_tmp)
-
- return G
-
-# --------------------------- These are tests --------------------------------#
-
-def test_iam_with_more_graphs_as_init(Gn, G_candidate, c_ei=3, c_er=3, c_es=1,
- node_label='atom', edge_label='bond_type'):
-	"""Compute a median graph of Gn by IAM, initialized from the graphs in G_candidate.
-	"""
-# Gn = Gn[0:10]
- Gn = [nx.convert_node_labels_to_integers(g) for g in Gn]
-
-	# phase 1: initialize.
- # compute set-median.
- dis_min = np.inf
-# pi_p = []
- pi_all_forward = []
- pi_all_backward = []
- for idx1, G_p in tqdm(enumerate(G_candidate), desc='computing GEDs', file=sys.stdout):
- dist_sum = 0
- pi_all_forward.append([])
- pi_all_backward.append([])
- for idx2, G_p_prime in enumerate(Gn):
- dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G_p, G_p_prime)
- pi_all_forward[idx1].append(pi_tmp_forward)
- pi_all_backward[idx1].append(pi_tmp_backward)
- dist_sum += dist_tmp
- if dist_sum <= dis_min:
- dis_min = dist_sum
- G = G_p.copy()
- idx_min = idx1
- # list of edit operations.
- pi_p_forward = pi_all_forward[idx_min]
- pi_p_backward = pi_all_backward[idx_min]
-
- # phase 2: iteration.
- ds_attrs = get_dataset_attributes(Gn + [G], attr_names=['edge_labeled', 'node_attr_dim'],
- edge_label=edge_label)
- label_set = get_node_labels(Gn + [G], node_label)
- for itr in range(0, 10): # @todo: the convergence condition?
- G_new = G.copy()
- # update vertex labels.
- # pre-compute h_i0 for each label.
-# for label in get_node_labels(Gn, node_label):
-# print(label)
-# for nd in G.nodes(data=True):
-# pass
- if not ds_attrs['node_attr_dim']: # labels are symbolic
- for nd in G.nodes():
- h_i0_list = []
- label_list = []
- for label in label_set:
- h_i0 = 0
- for idx, g in enumerate(Gn):
- pi_i = pi_p_forward[idx][nd]
- if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
- h_i0 += 1
- h_i0_list.append(h_i0)
- label_list.append(label)
- # choose one of the best randomly.
- idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
- idx_rdm = random.randint(0, len(idx_max) - 1)
- G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]]
- else: # labels are non-symbolic
- for nd in G.nodes():
- Si_norm = 0
- phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
- for idx, g in enumerate(Gn):
- pi_i = pi_p_forward[idx][nd]
-					if g.has_node(pi_i): # @todo: what if no g has the node? phi_i_bar = 0?
-						Si_norm += 1
-						phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
-				if Si_norm > 0: # guard against division by zero when no graph maps this node.
-					phi_i_bar /= Si_norm
-				G_new.nodes[nd]['attributes'] = phi_i_bar
-
- # update edge labels and adjacency matrix.
- if ds_attrs['edge_labeled']:
- for nd1, nd2, _ in G.edges(data=True):
- h_ij0_list = []
- label_list = []
- for label in get_edge_labels(Gn, edge_label):
- h_ij0 = 0
- for idx, g in enumerate(Gn):
- pi_i = pi_p_forward[idx][nd1]
- pi_j = pi_p_forward[idx][nd2]
- h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
- g.has_edge(pi_i, pi_j) and
- g.edges[pi_i, pi_j][edge_label] == label)
- h_ij0 += h_ij0_p
- h_ij0_list.append(h_ij0)
- label_list.append(label)
- # choose one of the best randomly.
- idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
- h_ij0_max = h_ij0_list[idx_max[0]]
- idx_rdm = random.randint(0, len(idx_max) - 1)
- best_label = label_list[idx_max[idx_rdm]]
-
- # check whether a_ij is 0 or 1.
- sij_norm = 0
- for idx, g in enumerate(Gn):
- pi_i = pi_p_forward[idx][nd1]
- pi_j = pi_p_forward[idx][nd2]
- if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
- sij_norm += 1
- if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
- if not G_new.has_edge(nd1, nd2):
- G_new.add_edge(nd1, nd2)
- G_new.edges[nd1, nd2][edge_label] = best_label
- else:
- if G_new.has_edge(nd1, nd2):
- G_new.remove_edge(nd1, nd2)
- else: # if edges are unlabeled
- # @todo: works only for undirected graphs.
- for nd1 in range(nx.number_of_nodes(G)):
- for nd2 in range(nd1 + 1, nx.number_of_nodes(G)):
- sij_norm = 0
- for idx, g in enumerate(Gn):
- pi_i = pi_p_forward[idx][nd1]
- pi_j = pi_p_forward[idx][nd2]
- if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
- sij_norm += 1
- if sij_norm > len(Gn) * c_er / (c_er + c_ei):
- if not G_new.has_edge(nd1, nd2):
- G_new.add_edge(nd1, nd2)
- elif sij_norm < len(Gn) * c_er / (c_er + c_ei):
- if G_new.has_edge(nd1, nd2):
- G_new.remove_edge(nd1, nd2)
- # do not change anything when equal.
-
- G = G_new.copy()
-
- # update pi_p
- pi_p_forward = []
- for G_p in Gn:
- dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p)
- pi_p_forward.append(pi_tmp_forward)
-
- return G
-
-
-###############################################################################
-
-if __name__ == '__main__':
- from gklearn.utils.graphfiles import loadDataset
- ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
- 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb
-# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
-# 'extra_params': {}} # node nsymb
-# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
-# 'extra_params': {}}
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-
- iam(Gn)
\ No newline at end of file
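
The edge-update test in `iam()` and `test_iam_with_more_graphs_as_init()` above is easy to misread, so here is a minimal, self-contained sketch of that rule with made-up counts; `keep_edge` is a hypothetical helper, not part of the deleted module:

```python
# Sketch of the IAM edge rule: keep edge (i, j) with its majority label iff
# h_ij0_max > N*c_er/c_es + s_ij*(1 - (c_er + c_ei)/c_es), where h_ij0_max
# counts graphs whose mapped edge carries the majority label and s_ij counts
# graphs that have the mapped edge at all.

def keep_edge(h_ij0_max, sij_norm, n_graphs, c_ei=3, c_er=3, c_es=1):
    threshold = n_graphs * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es)
    return h_ij0_max > threshold

# With 10 graphs and the default costs, an edge whose label agrees in 8
# graphs clears the threshold (30 + 8 * -5 = -10), while one present in
# only 2 graphs does not (30 + 2 * -5 = 20).
print(keep_edge(8, 8, 10))  # True
print(keep_edge(2, 2, 10))  # False
```
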
diff --git a/gklearn/preimage/knn.py b/gklearn/preimage/knn.py
deleted file mode 100644
index c179287..0000000
--- a/gklearn/preimage/knn.py
+++ /dev/null
@@ -1,114 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Fri Jan 10 13:22:04 2020
-
-@author: ljia
-"""
-import numpy as np
-#import matplotlib.pyplot as plt
-from tqdm import tqdm
-import random
-#import csv
-from shutil import copyfile
-import os
-import sys # needed for sys.stdout in knn().
-
-from gklearn.preimage.iam import iam_bash
-from gklearn.utils.graphfiles import loadDataset, loadGXL
-from gklearn.preimage.ged import GED
-from gklearn.preimage.utils import get_same_item_indices
-
-def test_knn():
- ds = {'name': 'monoterpenoides',
- 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'])
-# Gn = Gn[0:50]
-# gkernel = 'treeletkernel'
-# node_label = 'atom'
-# edge_label = 'bond_type'
-# ds_name = 'mono'
- dir_output = 'results/knn/'
-	graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/../../datasets/monoterpenoides/'
-
- k_nn = 1
- percent = 0.1
- repeats = 50
- edit_cost_constant = [3, 3, 1, 3, 3, 1]
-
- # get indices by classes.
- y_idx = get_same_item_indices(y_all)
- for repeat in range(0, repeats):
- print('\n---------------------------------')
- print('repeat =', repeat)
- accuracy_sm_list = []
- accuracy_gm_list = []
- sod_sm_list = []
- sod_gm_list = []
-
- random.seed(repeat)
- set_median_list = []
- gen_median_list = []
- train_y_set = []
- for y, values in y_idx.items():
- print('\ny =', y)
- size_median_set = int(len(values) * percent)
- median_set_idx = random.sample(values, size_median_set)
- print('median set: ', median_set_idx)
-
- # compute set median and gen median using IAM (C++ through bash).
- # Gn_median = [Gn[idx] for idx in median_set_idx]
- group_fnames = [Gn[g].graph['filename'] for g in median_set_idx]
- sod_sm, sod_gm, fname_sm, fname_gm = iam_bash(group_fnames, edit_cost_constant,
- graph_dir=graph_dir)
- print('sod_sm, sod_gm:', sod_sm, sod_gm)
- sod_sm_list.append(sod_sm)
- sod_gm_list.append(sod_gm)
- fname_sm_new = dir_output + 'medians/set_median.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl'
- copyfile(fname_sm, fname_sm_new)
- fname_gm_new = dir_output + 'medians/gen_median.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl'
- copyfile(fname_gm, fname_gm_new)
- set_median_list.append(loadGXL(fname_sm_new))
- gen_median_list.append(loadGXL(fname_gm_new))
- train_y_set.append(int(y))
-
- print(sod_sm, sod_gm)
-
- # do 1-nn.
- test_y_set = [int(y) for y in y_all]
-		accuracy_sm = knn(set_median_list, train_y_set, Gn, test_y_set, k=k_nn, distance='ged')
-		accuracy_gm = knn(gen_median_list, train_y_set, Gn, test_y_set, k=k_nn, distance='ged')
- accuracy_sm_list.append(accuracy_sm)
- accuracy_gm_list.append(accuracy_gm)
- print('current accuracy sm and gm:', accuracy_sm, accuracy_gm)
-
- # output
- accuracy_sm_mean = np.mean(accuracy_sm_list)
- accuracy_gm_mean = np.mean(accuracy_gm_list)
- print('\ntotal average accuracy sm and gm:', accuracy_sm_mean, accuracy_gm_mean)
-
-
-def knn(train_set, train_y_set, test_set, test_y_set, k=1, distance='ged'):
-	if k != 1 or distance != 'ged':
-		raise NotImplementedError('only 1-nn with the GED distance is implemented.')
-	algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
-	params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
-				  'algo_options': algo_options, 'stabilizer': None}
- accuracy = 0
- for idx_test, g_test in tqdm(enumerate(test_set), desc='computing 1-nn',
- file=sys.stdout):
- dis = np.inf
- for idx_train, g_train in enumerate(train_set):
- dis_cur, _, _ = GED(g_test, g_train, **params_ged)
- if dis_cur < dis:
- dis = dis_cur
- test_y_cur = train_y_set[idx_train]
- if test_y_cur == test_y_set[idx_test]:
- accuracy += 1
- accuracy = accuracy / len(test_set)
-
- return accuracy
-
-
-
-if __name__ == '__main__':
- test_knn()
\ No newline at end of file
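
For reference, the 1-nn loop in the deleted `knn()` above is distance-agnostic apart from the GED call; below is a minimal sketch of the same logic with a toy numeric distance, all names hypothetical:

```python
# Distance-agnostic version of the 1-nn loop above; `dist` is any callable
# returning a dissimilarity (GED in the deleted code).

def one_nn_accuracy(train_set, train_y, test_set, test_y, dist):
    correct = 0
    for item, y_true in zip(test_set, test_y):
        # predict the label of the nearest training item under dist.
        y_pred = min(zip(train_set, train_y), key=lambda p: dist(item, p[0]))[1]
        correct += int(y_pred == y_true)
    return correct / len(test_set)

# Toy usage with |x - y| as the "distance".
print(one_nn_accuracy([1, 10], ['a', 'b'], [2, 9], ['a', 'b'],
                      dist=lambda x, y: abs(x - y)))  # 1.0
```
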
diff --git a/gklearn/preimage/libs.py b/gklearn/preimage/libs.py
deleted file mode 100644
index 76005c6..0000000
--- a/gklearn/preimage/libs.py
+++ /dev/null
@@ -1,6 +0,0 @@
-import sys
-import pathlib
-
-# insert gedlibpy library.
-sys.path.insert(0, "../../../")
-from gedlibpy import librariesImport, gedlibpy
diff --git a/gklearn/preimage/median.py b/gklearn/preimage/median.py
deleted file mode 100644
index 1c5bb0f..0000000
--- a/gklearn/preimage/median.py
+++ /dev/null
@@ -1,218 +0,0 @@
-import sys
-sys.path.insert(0, "../")
-#import pathlib
-import numpy as np
-import networkx as nx
-import time
-
-from gedlibpy import librariesImport, gedlibpy
-#import script
-sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/")
-import gklearn
-from gklearn.utils.graphfiles import loadDataset
-
-def replace_graph_in_env(script, graph, old_id, label='median'):
-	"""
-	Replace a graph in the environment handled by script.
-
-	If old_id is -1, add a new graph to the environment instead.
-	"""
- if(old_id > -1):
- script.PyClearGraph(old_id)
- new_id = script.PyAddGraph(label)
- for i in graph.nodes():
-		script.PyAddNode(new_id, str(i), graph.node[i]) # !! strings are required by gedlib
- for e in graph.edges:
- script.PyAddEdge(new_id, str(e[0]),str(e[1]), {})
- script.PyInitEnv()
- script.PySetMethod("IPFP", "")
- script.PyInitMethod()
-
- return new_id
-
-# Draw the current median.
-def draw_Letter_graph(graph, savepath=''):
- import numpy as np
- import networkx as nx
- import matplotlib.pyplot as plt
- plt.figure()
- pos = {}
- for n in graph.nodes:
- pos[n] = np.array([float(graph.node[n]['attributes'][0]),
- float(graph.node[n]['attributes'][1])])
- nx.draw_networkx(graph, pos)
- if savepath != '':
- plt.savefig(savepath + str(time.time()) + '.eps', format='eps', dpi=300)
- plt.show()
- plt.clf()
-
-#compute new mappings
-def update_mappings(script,median_id,listID):
- med_distances = {}
- med_mappings = {}
- sod = 0
- for i in range(0,len(listID)):
- script.PyRunMethod(median_id,listID[i])
- med_distances[i] = script.PyGetUpperBound(median_id,listID[i])
- med_mappings[i] = script.PyGetForwardMap(median_id,listID[i])
- sod += med_distances[i]
- return med_distances, med_mappings, sod
-
-def calcul_Sij(all_mappings, all_graphs,i,j):
- s_ij = 0
- for k in range(0,len(all_mappings)):
- cur_graph = all_graphs[k]
- cur_mapping = all_mappings[k]
- size_graph = cur_graph.order()
- if ((cur_mapping[i] < size_graph) and
- (cur_mapping[j] < size_graph) and
- (cur_graph.has_edge(cur_mapping[i], cur_mapping[j]) == True)):
- s_ij += 1
-
- return s_ij
-
-# def update_median_nodes_L1(median,listIdSet,median_id,dataset, mappings):
-# from scipy.stats.mstats import gmean
-
-# for i in median.nodes():
-# for k in listIdSet:
-# vectors = [] #np.zeros((len(listIdSet),2))
-# if(k != median_id):
-# phi_i = mappings[k][i]
-# if(phi_i < dataset[k].order()):
-# vectors.append([float(dataset[k].node[phi_i]['x']),float(dataset[k].node[phi_i]['y'])])
-
-# new_labels = gmean(vectors)
-# median.node[i]['x'] = str(new_labels[0])
-# median.node[i]['y'] = str(new_labels[1])
-# return median
-
-def update_median_nodes(median,dataset,mappings):
- #update node attributes
- for i in median.nodes():
- nb_sub=0
- mean_label = {'x' : 0, 'y' : 0}
- for k in range(0,len(mappings)):
- phi_i = mappings[k][i]
- if ( phi_i < dataset[k].order() ):
- nb_sub += 1
- mean_label['x'] += 0.75*float(dataset[k].node[phi_i]['x'])
- mean_label['y'] += 0.75*float(dataset[k].node[phi_i]['y'])
- median.node[i]['x'] = str((1/0.75)*(mean_label['x']/nb_sub))
- median.node[i]['y'] = str((1/0.75)*(mean_label['y']/nb_sub))
- return median
-
-def update_median_edges(dataset, mappings, median, cei=0.425,cer=0.425):
-#for letter high, ceir = 1.7, alpha = 0.75
- size_dataset = len(dataset)
- ratio_cei_cer = cer/(cei + cer)
- threshold = size_dataset*ratio_cei_cer
- order_graph_median = median.order()
- for i in range(0,order_graph_median):
- for j in range(i+1,order_graph_median):
- s_ij = calcul_Sij(mappings,dataset,i,j)
- if(s_ij > threshold):
- median.add_edge(i,j)
- else:
- if(median.has_edge(i,j)):
- median.remove_edge(i,j)
- return median
-
-
-
-def compute_median(script, listID, dataset, verbose=False):
-	"""Compute a graph median of a dataset according to an environment.
-
-	Parameters
-	----------
-	script : a gedlib-initialized environment.
-	listID (list): a list of graph IDs in script; encodes the dataset.
-	dataset (list): the corresponding graphs in NetworkX format. We assume
-		that the graph with ID listID[i] corresponds to dataset[i].
-
-	Returns
-	-------
-	The median graph, its SOD, the SODs of all iterations, and the set median.
-	"""
- print(len(listID))
- median_set_index, median_set_sod = compute_median_set(script, listID)
- print(median_set_index)
- print(median_set_sod)
- sods = []
-	# add the median to the environment.
- set_median = dataset[median_set_index].copy()
- median = dataset[median_set_index].copy()
- cur_med_id = replace_graph_in_env(script,median,-1)
- med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID)
- sods.append(cur_sod)
- if(verbose):
- print(cur_sod)
-	ite_max = 50
-	old_sod = cur_sod * 2
-	ite = 0
-	epsilon = 0.001
-
-	while (ite < ite_max) and (np.abs(old_sod - cur_sod) > epsilon):
-		old_sod = cur_sod # remember the previous SOD for the convergence test.
-		median = update_median_nodes(median, dataset, med_mappings)
-		median = update_median_edges(dataset, med_mappings, median)
-
-		cur_med_id = replace_graph_in_env(script, median, cur_med_id)
-		med_distances, med_mappings, cur_sod = update_mappings(script, cur_med_id, listID)
-
-		sods.append(cur_sod)
-		if verbose:
-			print(cur_sod)
-		ite += 1
-	return median, cur_sod, sods, set_median
-
-
-def compute_median_set(script, listID):
-	"""Return the index in listID of the set median, together with its SOD."""
-	# compute the median set.
- N=len(listID)
- map_id_to_index = {}
- map_index_to_id = {}
- for i in range(0,len(listID)):
- map_id_to_index[listID[i]] = i
- map_index_to_id[i] = listID[i]
-
- distances = np.zeros((N,N))
- for i in listID:
- for j in listID:
- script.PyRunMethod(i,j)
- distances[map_id_to_index[i],map_id_to_index[j]] = script.PyGetUpperBound(i,j)
-
- median_set_index = np.argmin(np.sum(distances,0))
- sod = np.min(np.sum(distances,0))
-
- return median_set_index, sod
-
-if __name__ == "__main__":
-	# load the dataset.
-	gedlibpy.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml')
-	gedlibpy.PySetEditCost("LETTER")
-	gedlibpy.PyInitEnv()
-	gedlibpy.PySetMethod("IPFP", "")
-	gedlibpy.PyInitMethod()
-
-	dataset, my_y = gklearn.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl")
-
-	listID = gedlibpy.PyGetAllGraphIds()
-	median, sod, sods, set_median = compute_median(gedlibpy, listID, dataset, verbose=True)
-
- print(sod)
- draw_Letter_graph(median)
-
-
-#if __name__ == '__main__':
-# # test draw_Letter_graph
-# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
-# 'extra_params': {}} # node nsymb
-# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-# print(y_all)
-# for g in Gn:
-# draw_Letter_graph(g)
\ No newline at end of file
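
The edge rule in `update_median_edges()` above reduces to a majority vote when c_ei equals c_er; here is a small sketch of that decision with made-up counts, using a hypothetical helper name:

```python
# Edge (i, j) is kept in the median iff s_ij > N * c_er / (c_ei + c_er),
# where s_ij (from calcul_Sij) counts dataset graphs whose mapped nodes are
# adjacent. With c_ei == c_er the threshold is N / 2: a simple majority.

def median_has_edge(s_ij, n_graphs, cei=0.425, cer=0.425):
    return s_ij > n_graphs * cer / (cei + cer)

print(median_has_edge(6, 10))  # True: 6 of 10 graphs have the edge
print(median_has_edge(5, 10))  # False: a tie removes the edge
```
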
diff --git a/gklearn/preimage/median_benoit.py b/gklearn/preimage/median_benoit.py
deleted file mode 100644
index 6712196..0000000
--- a/gklearn/preimage/median_benoit.py
+++ /dev/null
@@ -1,201 +0,0 @@
-import sys
-import pathlib
-import numpy as np
-import networkx as nx
-
-import librariesImport
-import script
-sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/")
-import gklearn
-
-def replace_graph_in_env(script, graph, old_id, label='median'):
-	"""
-	Replace a graph in the environment handled by script.
-
-	If old_id is -1, add a new graph to the environment instead.
-	"""
- if(old_id > -1):
- script.PyClearGraph(old_id)
- new_id = script.PyAddGraph(label)
- for i in graph.nodes():
-		script.PyAddNode(new_id, str(i), graph.node[i]) # !! strings are required by gedlib
- for e in graph.edges:
- script.PyAddEdge(new_id, str(e[0]),str(e[1]), {})
- script.PyInitEnv()
- script.PySetMethod("IPFP", "")
- script.PyInitMethod()
-
- return new_id
-
-# Draw the current median.
-def draw_Letter_graph(graph):
- import numpy as np
- import networkx as nx
- import matplotlib.pyplot as plt
- plt.figure()
- pos = {}
- for n in graph.nodes:
- pos[n] = np.array([float(graph.node[n]['x']),float(graph.node[n]['y'])])
- nx.draw_networkx(graph,pos)
- plt.show()
-
-#compute new mappings
-def update_mappings(script,median_id,listID):
- med_distances = {}
- med_mappings = {}
- sod = 0
- for i in range(0,len(listID)):
- script.PyRunMethod(median_id,listID[i])
- med_distances[i] = script.PyGetUpperBound(median_id,listID[i])
- med_mappings[i] = script.PyGetForwardMap(median_id,listID[i])
- sod += med_distances[i]
- return med_distances, med_mappings, sod
-
-def calcul_Sij(all_mappings, all_graphs,i,j):
- s_ij = 0
- for k in range(0,len(all_mappings)):
- cur_graph = all_graphs[k]
- cur_mapping = all_mappings[k]
- size_graph = cur_graph.order()
- if ((cur_mapping[i] < size_graph) and
- (cur_mapping[j] < size_graph) and
- (cur_graph.has_edge(cur_mapping[i], cur_mapping[j]) == True)):
- s_ij += 1
-
- return s_ij
-
-# def update_median_nodes_L1(median,listIdSet,median_id,dataset, mappings):
-# from scipy.stats.mstats import gmean
-
-# for i in median.nodes():
-# for k in listIdSet:
-# vectors = [] #np.zeros((len(listIdSet),2))
-# if(k != median_id):
-# phi_i = mappings[k][i]
-# if(phi_i < dataset[k].order()):
-# vectors.append([float(dataset[k].node[phi_i]['x']),float(dataset[k].node[phi_i]['y'])])
-
-# new_labels = gmean(vectors)
-# median.node[i]['x'] = str(new_labels[0])
-# median.node[i]['y'] = str(new_labels[1])
-# return median
-
-def update_median_nodes(median,dataset,mappings):
- #update node attributes
- for i in median.nodes():
- nb_sub=0
- mean_label = {'x' : 0, 'y' : 0}
- for k in range(0,len(mappings)):
- phi_i = mappings[k][i]
- if ( phi_i < dataset[k].order() ):
- nb_sub += 1
- mean_label['x'] += 0.75*float(dataset[k].node[phi_i]['x'])
- mean_label['y'] += 0.75*float(dataset[k].node[phi_i]['y'])
- median.node[i]['x'] = str((1/0.75)*(mean_label['x']/nb_sub))
- median.node[i]['y'] = str((1/0.75)*(mean_label['y']/nb_sub))
- return median
-
-def update_median_edges(dataset, mappings, median, cei=0.425,cer=0.425):
-#for letter high, ceir = 1.7, alpha = 0.75
- size_dataset = len(dataset)
- ratio_cei_cer = cer/(cei + cer)
- threshold = size_dataset*ratio_cei_cer
- order_graph_median = median.order()
- for i in range(0,order_graph_median):
- for j in range(i+1,order_graph_median):
- s_ij = calcul_Sij(mappings,dataset,i,j)
- if(s_ij > threshold):
- median.add_edge(i,j)
- else:
- if(median.has_edge(i,j)):
- median.remove_edge(i,j)
- return median
-
-
-
-def compute_median(script, listID, dataset, verbose=False):
-	"""Compute a graph median of a dataset according to an environment.
-
-	Parameters
-	----------
-	script : a gedlib-initialized environment.
-	listID (list): a list of graph IDs in script; encodes the dataset.
-	dataset (list): the corresponding graphs in NetworkX format. We assume
-		that the graph with ID listID[i] corresponds to dataset[i].
-
-	Returns
-	-------
-	The median graph, its SOD, the SODs of all iterations, and the set median.
-	"""
- print(len(listID))
- median_set_index, median_set_sod = compute_median_set(script, listID)
- print(median_set_index)
- print(median_set_sod)
- sods = []
-	# add the median to the environment.
- set_median = dataset[median_set_index].copy()
- median = dataset[median_set_index].copy()
- cur_med_id = replace_graph_in_env(script,median,-1)
- med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID)
- sods.append(cur_sod)
- if(verbose):
- print(cur_sod)
-	ite_max = 50
-	old_sod = cur_sod * 2
-	ite = 0
-	epsilon = 0.001
-
-	while (ite < ite_max) and (np.abs(old_sod - cur_sod) > epsilon):
-		old_sod = cur_sod # remember the previous SOD for the convergence test.
-		median = update_median_nodes(median, dataset, med_mappings)
-		median = update_median_edges(dataset, med_mappings, median)
-
-		cur_med_id = replace_graph_in_env(script, median, cur_med_id)
-		med_distances, med_mappings, cur_sod = update_mappings(script, cur_med_id, listID)
-
-		sods.append(cur_sod)
-		if verbose:
-			print(cur_sod)
-		ite += 1
-	return median, cur_sod, sods, set_median
-
-
-def compute_median_set(script, listID):
-	"""Return the index in listID of the set median, together with its SOD."""
-	# compute the median set.
- N=len(listID)
- map_id_to_index = {}
- map_index_to_id = {}
- for i in range(0,len(listID)):
- map_id_to_index[listID[i]] = i
- map_index_to_id[i] = listID[i]
-
- distances = np.zeros((N,N))
- for i in listID:
- for j in listID:
- script.PyRunMethod(i,j)
- distances[map_id_to_index[i],map_id_to_index[j]] = script.PyGetUpperBound(i,j)
-
- median_set_index = np.argmin(np.sum(distances,0))
- sod = np.min(np.sum(distances,0))
-
- return median_set_index, sod
-
-if __name__ == "__main__":
-	# load the dataset.
- script.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml')
- script.PySetEditCost("LETTER")
- script.PyInitEnv()
- script.PySetMethod("IPFP", "")
- script.PyInitMethod()
-
- dataset,my_y = gklearn.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl")
-
- listID = script.PyGetAllGraphIds()
-	median, sod, sods, set_median = compute_median(script, listID, dataset, verbose=True)
-
- print(sod)
- draw_Letter_graph(median)
diff --git a/gklearn/preimage/median_linlin.py b/gklearn/preimage/median_linlin.py
deleted file mode 100644
index 6139558..0000000
--- a/gklearn/preimage/median_linlin.py
+++ /dev/null
@@ -1,215 +0,0 @@
-import sys
-import pathlib
-import numpy as np
-import networkx as nx
-
-from gedlibpy import librariesImport, gedlibpy
-sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/")
-import gklearn
-
-def replace_graph_in_env(script, graph, old_id, label='median'):
-	"""
-	Replace a graph in the environment handled by script.
-
-	If old_id is -1, add a new graph to the environment instead.
-	"""
- if(old_id > -1):
- script.PyClearGraph(old_id)
- new_id = script.PyAddGraph(label)
- for i in graph.nodes():
-		script.PyAddNode(new_id, str(i), graph.node[i]) # !! strings are required by gedlib
- for e in graph.edges:
- script.PyAddEdge(new_id, str(e[0]),str(e[1]), {})
- script.PyInitEnv()
- script.PySetMethod("IPFP", "")
- script.PyInitMethod()
-
- return new_id
-
-# Draw the current median.
-def draw_Letter_graph(graph):
- import numpy as np
- import networkx as nx
- import matplotlib.pyplot as plt
- plt.figure()
- pos = {}
- for n in graph.nodes:
- pos[n] = np.array([float(graph.node[n]['x']),float(graph.node[n]['y'])])
- nx.draw_networkx(graph,pos)
- plt.show()
-
-#compute new mappings
-def update_mappings(script,median_id,listID):
- med_distances = {}
- med_mappings = {}
- sod = 0
- for i in range(0,len(listID)):
- script.PyRunMethod(median_id,listID[i])
- med_distances[i] = script.PyGetUpperBound(median_id,listID[i])
- med_mappings[i] = script.PyGetForwardMap(median_id,listID[i])
- sod += med_distances[i]
- return med_distances, med_mappings, sod
-
-def calcul_Sij(all_mappings, all_graphs,i,j):
- s_ij = 0
- for k in range(0,len(all_mappings)):
- cur_graph = all_graphs[k]
- cur_mapping = all_mappings[k]
- size_graph = cur_graph.order()
- if ((cur_mapping[i] < size_graph) and
- (cur_mapping[j] < size_graph) and
- (cur_graph.has_edge(cur_mapping[i], cur_mapping[j]) == True)):
- s_ij += 1
-
- return s_ij
-
-# def update_median_nodes_L1(median,listIdSet,median_id,dataset, mappings):
-# from scipy.stats.mstats import gmean
-
-# for i in median.nodes():
-# for k in listIdSet:
-# vectors = [] #np.zeros((len(listIdSet),2))
-# if(k != median_id):
-# phi_i = mappings[k][i]
-# if(phi_i < dataset[k].order()):
-# vectors.append([float(dataset[k].node[phi_i]['x']),float(dataset[k].node[phi_i]['y'])])
-
-# new_labels = gmean(vectors)
-# median.node[i]['x'] = str(new_labels[0])
-# median.node[i]['y'] = str(new_labels[1])
-# return median
-
-def update_median_nodes(median,dataset,mappings):
- #update node attributes
- for i in median.nodes():
- nb_sub=0
- mean_label = {'x' : 0, 'y' : 0}
- for k in range(0,len(mappings)):
- phi_i = mappings[k][i]
- if ( phi_i < dataset[k].order() ):
- nb_sub += 1
- mean_label['x'] += 0.75*float(dataset[k].node[phi_i]['x'])
- mean_label['y'] += 0.75*float(dataset[k].node[phi_i]['y'])
- median.node[i]['x'] = str((1/0.75)*(mean_label['x']/nb_sub))
- median.node[i]['y'] = str((1/0.75)*(mean_label['y']/nb_sub))
- return median
-
-def update_median_edges(dataset, mappings, median, cei=0.425,cer=0.425):
-#for letter high, ceir = 1.7, alpha = 0.75
- size_dataset = len(dataset)
- ratio_cei_cer = cer/(cei + cer)
- threshold = size_dataset*ratio_cei_cer
- order_graph_median = median.order()
- for i in range(0,order_graph_median):
- for j in range(i+1,order_graph_median):
- s_ij = calcul_Sij(mappings,dataset,i,j)
- if(s_ij > threshold):
- median.add_edge(i,j)
- else:
- if(median.has_edge(i,j)):
- median.remove_edge(i,j)
- return median
-
-
-
-def compute_median(script, listID, dataset, verbose=False):
-	"""Compute a graph median of a dataset according to an environment.
-
-	Parameters
-	----------
-	script : a gedlib-initialized environment.
-	listID (list): a list of graph IDs in script; encodes the dataset.
-	dataset (list): the corresponding graphs in NetworkX format. We assume
-		that the graph with ID listID[i] corresponds to dataset[i].
-
-	Returns
-	-------
-	The median graph, its SOD, the SODs of all iterations, and the set median.
-	"""
- print(len(listID))
- median_set_index, median_set_sod = compute_median_set(script, listID)
- print(median_set_index)
- print(median_set_sod)
- sods = []
-	# add the median to the environment.
- set_median = dataset[median_set_index].copy()
- median = dataset[median_set_index].copy()
- cur_med_id = replace_graph_in_env(script,median,-1)
- med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID)
- sods.append(cur_sod)
- if(verbose):
- print(cur_sod)
-	ite_max = 50
-	old_sod = cur_sod * 2
-	ite = 0
-	epsilon = 0.001
-
-	while (ite < ite_max) and (np.abs(old_sod - cur_sod) > epsilon):
-		old_sod = cur_sod # remember the previous SOD for the convergence test.
-		median = update_median_nodes(median, dataset, med_mappings)
-		median = update_median_edges(dataset, med_mappings, median)
-
-		cur_med_id = replace_graph_in_env(script, median, cur_med_id)
-		med_distances, med_mappings, cur_sod = update_mappings(script, cur_med_id, listID)
-
-		sods.append(cur_sod)
-		if verbose:
-			print(cur_sod)
-		ite += 1
-	return median, cur_sod, sods, set_median
-
-
-def compute_median_set(script, listID):
-	"""Return the index in listID of the set median, together with its SOD."""
-	# compute the median set.
- N=len(listID)
- map_id_to_index = {}
- map_index_to_id = {}
- for i in range(0,len(listID)):
- map_id_to_index[listID[i]] = i
- map_index_to_id[i] = listID[i]
-
- distances = np.zeros((N,N))
- for i in listID:
- for j in listID:
- script.PyRunMethod(i,j)
- distances[map_id_to_index[i],map_id_to_index[j]] = script.PyGetUpperBound(i,j)
-
- median_set_index = np.argmin(np.sum(distances,0))
- sod = np.min(np.sum(distances,0))
-
- return median_set_index, sod
-
-def _convertGraph(G):
- """Convert a graph to the proper NetworkX format that can be
- recognized by library gedlibpy.
- """
- G_new = nx.Graph()
- for nd, attrs in G.nodes(data=True):
- G_new.add_node(str(nd), chem=attrs['atom'])
-# G_new.add_node(str(nd), x=str(attrs['attributes'][0]),
-# y=str(attrs['attributes'][1]))
- for nd1, nd2, attrs in G.edges(data=True):
- G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
-# G_new.add_edge(str(nd1), str(nd2))
-
- return G_new
-
-if __name__ == "__main__":
-	# load the dataset.
- gedlibpy.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml')
- gedlibpy.PySetEditCost("LETTER")
- gedlibpy.PyInitEnv()
- gedlibpy.PySetMethod("IPFP", "")
- gedlibpy.PyInitMethod()
-
- dataset,my_y = gklearn.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl")
-
- listID = gedlibpy.PyGetAllGraphIds()
-	median, sod, sods, set_median = compute_median(gedlibpy, listID, dataset, verbose=True)
-
- print(sod)
- draw_Letter_graph(median)
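
A toy illustration of what `_convertGraph()` above produces, assuming it is run in the deleted module's context; according to the code, 'chem' and 'valence' are the attribute names gedlibpy expects:

```python
import networkx as nx

g = nx.Graph()
g.add_node(0, atom='C')
g.add_node(1, atom='O')
g.add_edge(0, 1, bond_type=2)

g_new = _convertGraph(g)  # node ids become strings; labels are renamed.
print(list(g_new.nodes(data=True)))  # [('0', {'chem': 'C'}), ('1', {'chem': 'O'})]
print(list(g_new.edges(data=True)))  # [('0', '1', {'valence': 2})]
```
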
diff --git a/gklearn/preimage/pathfrequency.py b/gklearn/preimage/pathfrequency.py
deleted file mode 100644
index 3bca1bc..0000000
--- a/gklearn/preimage/pathfrequency.py
+++ /dev/null
@@ -1,201 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Wed Mar 20 10:12:15 2019
-
-inferring a graph from path frequency.
-@author: ljia
-"""
-#import numpy as np
-import networkx as nx
-from scipy.spatial.distance import hamming
-import itertools
-
-def SISF(K, v):
-	# @todo: not implemented; `output` was never defined in the original code.
-	raise NotImplementedError('SISF is not implemented.')
-
-
-def SISF_M(K, v):
-	# @todo: not implemented; `output` was never defined in the original code.
-	raise NotImplementedError('SISF_M is not implemented.')
-
-
-def GIPF_tree(v_obj, K=1, alphabet=[0, 1]):
- if K == 1:
- n_graph = v_obj[0] + v_obj[1]
- D_T, father_idx = getDynamicTable(n_graph, alphabet)
-
- # get the vector the closest to v_obj.
- if v_obj not in D_T:
- print('no exact solution')
- dis_lim = 1 / len(v_obj) # the possible shortest distance.
- dis_min = 1.0 # minimum proportional distance
- v_min = v_obj
- for vc in D_T:
- if vc[0] + vc[1] == n_graph:
-# print(vc)
- dis = hamming(vc, v_obj)
- if dis < dis_min:
- dis_min = dis
- v_min = vc
- if dis_min <= dis_lim:
- break
- v_obj = v_min
-
- # obtain required graph by traceback procedure.
- return getObjectGraph(v_obj, D_T, father_idx, alphabet), v_obj
-
-def GIPF_M(K, v):
-	# @todo: not implemented; `G` was never defined in the original code.
-	raise NotImplementedError('GIPF_M is not implemented.')
-
-
-def getDynamicTable(n_graph, alphabet=[0, 1]):
- # init. When only one node exists.
- D_T = [(1, 0, 0, 0, 0, 0), (0, 1, 0, 0, 0, 0)]
- father_idx = [-1, -1] # index of each vector's father
- # add possible vectors.
- for idx, v in enumerate(D_T):
- if v[0] + v[1] < n_graph:
- D_T.append((v[0] + 1, v[1], v[2] + 2, v[3], v[4], v[5]))
- D_T.append((v[0] + 1, v[1], v[2], v[3] + 1, v[4] + 1, v[5]))
- D_T.append((v[0], v[1] + 1, v[2], v[3] + 1, v[4] + 1, v[5]))
- D_T.append((v[0], v[1] + 1, v[2], v[3], v[4], v[5] + 2))
- father_idx += [idx, idx, idx, idx]
-
-# D_T = itertools.chain([(1, 0, 0, 0, 0, 0)], [(0, 1, 0, 0, 0, 0)])
-# father_idx = itertools.chain([-1], [-1]) # index of each vector's father
-# # add possible vectors.
-# for idx, v in enumerate(D_T):
-# if v[0] + v[1] < n_graph:
-# D_T = itertools.chain(D_T, [(v[0] + 1, v[1], v[2] + 2, v[3], v[4], v[5])])
-# D_T = itertools.chain(D_T, [(v[0] + 1, v[1], v[2], v[3] + 1, v[4] + 1, v[5])])
-# D_T = itertools.chain(D_T, [(v[0], v[1] + 1, v[2], v[3] + 1, v[4] + 1, v[5])])
-# D_T = itertools.chain(D_T, [(v[0], v[1] + 1, v[2], v[3], v[4], v[5] + 2)])
-# father_idx = itertools.chain(father_idx, [idx, idx, idx, idx])
- return D_T, father_idx
-
-
-def getObjectGraph(v_obj, D_T, father_idx, alphabet=[0, 1]):
- g_obj = nx.Graph()
-
- # do vector traceback.
- v_tb = [list(v_obj)] # traceback vectors.
- v_tb_idx = [D_T.index(v_obj)] # indices of traceback vectors.
- while v_tb_idx[-1] > 1:
- idx_pre = father_idx[v_tb_idx[-1]]
- v_tb_idx.append(idx_pre)
- v_tb.append(list(D_T[idx_pre]))
- v_tb = v_tb[::-1] # reverse
-# v_tb_idx = v_tb_idx[::-1]
-
- # construct tree.
- v_c = v_tb[0] # current vector.
- if v_c[0] == 1:
- g_obj.add_node(0, node_label=alphabet[0])
- else:
- g_obj.add_node(0, node_label=alphabet[1])
- for vct in v_tb[1:]:
- if vct[0] - v_c[0] == 1:
- if vct[2] - v_c[2] == 2: # transfer 1
- label1 = alphabet[0]
- label2 = alphabet[0]
- else: # transfer 2
- label1 = alphabet[1]
- label2 = alphabet[0]
- else:
- if vct[3] - v_c[3] == 1: # transfer 3
- label1 = alphabet[0]
- label2 = alphabet[1]
- else: # transfer 4
- label1 = alphabet[1]
- label2 = alphabet[1]
- for nd, attr in g_obj.nodes(data=True):
- if attr['node_label'] == label1:
- nb_node = nx.number_of_nodes(g_obj)
- g_obj.add_node(nb_node, node_label=label2)
- g_obj.add_edge(nd, nb_node)
- break
- v_c = vct
- return g_obj
-
-
-import random
-def hierarchy_pos(G, root=None, width=1., vert_gap = 0.2, vert_loc = 0, xcenter = 0.5):
-
- '''
- From Joel's answer at https://stackoverflow.com/a/29597209/2966723.
- Licensed under Creative Commons Attribution-Share Alike
-
- If the graph is a tree this will return the positions to plot this in a
- hierarchical layout.
-
- G: the graph (must be a tree)
-
- root: the root node of current branch
- - if the tree is directed and this is not given,
- the root will be found and used
- - if the tree is directed and this is given, then
- the positions will be just for the descendants of this node.
- - if the tree is undirected and not given,
- then a random choice will be used.
-
- width: horizontal space allocated for this branch - avoids overlap with other branches
-
- vert_gap: gap between levels of hierarchy
-
- vert_loc: vertical location of root
-
- xcenter: horizontal location of root
- '''
- if not nx.is_tree(G):
- raise TypeError('cannot use hierarchy_pos on a graph that is not a tree')
-
- if root is None:
- if isinstance(G, nx.DiGraph):
- root = next(iter(nx.topological_sort(G))) #allows back compatibility with nx version 1.11
- else:
- root = random.choice(list(G.nodes))
-
- def _hierarchy_pos(G, root, width=1., vert_gap = 0.2, vert_loc = 0, xcenter = 0.5, pos = None, parent = None):
- '''
- see hierarchy_pos docstring for most arguments
-
- pos: a dict saying where all nodes go if they have been assigned
- parent: parent of this branch. - only affects it if non-directed
-
- '''
-
- if pos is None:
- pos = {root:(xcenter,vert_loc)}
- else:
- pos[root] = (xcenter, vert_loc)
- children = list(G.neighbors(root))
- if not isinstance(G, nx.DiGraph) and parent is not None:
- children.remove(parent)
- if len(children)!=0:
- dx = width/len(children)
- nextx = xcenter - width/2 - dx/2
- for child in children:
- nextx += dx
- pos = _hierarchy_pos(G,child, width = dx, vert_gap = vert_gap,
- vert_loc = vert_loc-vert_gap, xcenter=nextx,
- pos=pos, parent = root)
- return pos
-
-
- return _hierarchy_pos(G, root, width, vert_gap, vert_loc, xcenter)
-
-
-if __name__ == '__main__':
- v_obj = (6, 4, 10, 3, 3, 2)
-# v_obj = (6, 5, 10, 3, 3, 2)
- tree_obj, v_obj = GIPF_tree(v_obj)
- print('One closest vector is', v_obj)
- # plot
- pos = hierarchy_pos(tree_obj, 0)
- node_labels = nx.get_node_attributes(tree_obj, 'node_label')
- nx.draw(tree_obj, pos=pos, labels=node_labels, with_labels=True)
\ No newline at end of file
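
When `GIPF_tree()` above cannot match the requested path-frequency vector exactly, it falls back to the nearest table entry under the Hamming distance among vectors with the same node count. A toy sketch of that step follows; the vectors are made up, and the interpretation of the six entries is inferred from `getDynamicTable()`:

```python
from scipy.spatial.distance import hamming

# v = (#nodes labeled 0, #nodes labeled 1, #paths 0-0, 0-1, 1-0, 1-1).
v_obj = (6, 4, 10, 3, 3, 2)
table = [(6, 4, 8, 4, 4, 2), (5, 5, 8, 4, 4, 2), (6, 4, 10, 3, 3, 2)]

n_graph = v_obj[0] + v_obj[1]
# only vectors with the same number of nodes are candidates.
candidates = [v for v in table if v[0] + v[1] == n_graph]
v_min = min(candidates, key=lambda v: hamming(v, v_obj))
print(v_min)  # (6, 4, 10, 3, 3, 2): the exact match wins with distance 0
```
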
diff --git a/gklearn/preimage/preimage_iam.py b/gklearn/preimage/preimage_iam.py
deleted file mode 100644
index bf79d0e..0000000
--- a/gklearn/preimage/preimage_iam.py
+++ /dev/null
@@ -1,705 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Tue Apr 30 17:07:43 2019
-
-A graph pre-image method combining the iterative pre-image method of
-reference [1] with the iterative alternate minimization (IAM) of reference [2].
-@author: ljia
-@references:
-	[1] Gökhan H. Bakir, Alexander Zien, and Koji Tsuda. Learning to find graph
-	pre-images. In Joint Pattern Recognition Symposium, pages 253-261. Springer, 2004.
- [2] Generalized median graph via iterative alternate minimization.
-"""
-import sys
-import numpy as np
-from tqdm import tqdm
-import networkx as nx
-import matplotlib.pyplot as plt
-import random
-
-from gklearn.preimage.iam import iam_upgraded
-from gklearn.preimage.utils import dis_gstar, compute_kernel
-
-
-def preimage_iam(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max,
- gkernel, epsilon=0.001, InitIAMWithAllDk=False,
- params_iam={'c_ei': 1, 'c_er': 1, 'c_es': 1,
- 'ite_max': 50, 'epsilon': 0.001,
- 'removeNodes': True, 'connected': False},
- params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP',
- 'edit_cost_constant': [], 'stabilizer': 'min',
- 'repeat': 50}):
-	"""This function constructs a graph pre-image by the iterative pre-image
- framework in reference [1], algorithm 1, where the step of generating new
- graphs randomly is replaced by the IAM algorithm in reference [2].
-
- notes
- -----
- Every time a set of n better graphs is acquired, their distances in kernel space are
- compared with the k nearest ones, and the k nearest distances from the k+n
- distances will be used as the new ones.
- """
- # compute k nearest neighbors of phi in DN.
- dis_all = [] # distance between g_star and each graph.
- term3 = 0
- for i1, a1 in enumerate(alpha):
- for i2, a2 in enumerate(alpha):
- term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
- for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout):
- dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
- dis_all.append(dtemp)
-
- # sort
- sort_idx = np.argsort(dis_all)
- dis_k = [dis_all[idis] for idis in sort_idx[0:k]] # the k shortest distances
-	nb_best = len(np.argwhere(np.array(dis_k) == dis_k[0]).flatten().tolist())
- ghat_list = [Gn_init[idx].copy() for idx in sort_idx[0:nb_best]] # the nearest neighbors of phi in DN
-	if dis_k[0] == 0: # the exact pre-image.
-		print('The exact pre-image is found from the input dataset.')
-		return 0, ghat_list, 0, 0, 0
- dhat = dis_k[0] # the nearest distance
-# for g in ghat_list:
-# draw_Letter_graph(g)
-# nx.draw_networkx(g)
-# plt.show()
-# print(g.nodes(data=True))
-# print(g.edges(data=True))
- Gk = [Gn_init[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors
-# for gi in Gk:
-# nx.draw(gi, labels=nx.get_node_attributes(gi, 'atom'), with_labels=True)
-## nx.draw_networkx(gi)
-# plt.show()
-## draw_Letter_graph(g)
-# print(gi.nodes(data=True))
-# print(gi.edges(data=True))
-
-# i = 1
- r = 0
- itr_total = 0
- dis_of_each_itr = [dhat]
- found = False
- nb_updated = 0
- nb_updated_k = 0
- while r < r_max:# and not found: # @todo: if not found?# and np.abs(old_dis - cur_dis) > epsilon:
- print('\n-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-')
- print('Current preimage iteration =', r)
- print('Total preimage iteration =', itr_total, '\n')
- found = False
-
- Gn_nearest_median = [g.copy() for g in Gk]
- if InitIAMWithAllDk: # each graph in D_k is used to initialize IAM.
- ghat_new_list = []
- for g_tmp in Gk:
- Gn_nearest_init = [g_tmp.copy()]
- ghat_new_list_tmp, _, _ = iam_upgraded(Gn_nearest_median,
- Gn_nearest_init, params_ged=params_ged, **params_iam)
- ghat_new_list += ghat_new_list_tmp
- else: # only the best graph in D_k is used to initialize IAM.
- Gn_nearest_init = [g.copy() for g in Gk]
- ghat_new_list, _, _ = iam_upgraded(Gn_nearest_median, Gn_nearest_init,
- params_ged=params_ged, **params_iam)
-
-# for g in g_tmp_list:
-# nx.draw_networkx(g)
-# plt.show()
-# draw_Letter_graph(g)
-# print(g.nodes(data=True))
-# print(g.edges(data=True))
-
- # compute distance between \psi and the new generated graphs.
- knew = compute_kernel(ghat_new_list + Gn_median, gkernel, False)
- dhat_new_list = []
- for idx, g_tmp in enumerate(ghat_new_list):
- # @todo: the term3 below could use the one at the beginning of the function.
- dhat_new_list.append(dis_gstar(idx, range(len(ghat_new_list),
- len(ghat_new_list) + len(Gn_median) + 1),
- alpha, knew, withterm3=False))
-
- for idx_g, ghat_new in enumerate(ghat_new_list):
- dhat_new = dhat_new_list[idx_g]
-
- # if the new distance is smaller than the max of D_k.
- if dhat_new < dis_k[-1] and np.abs(dhat_new - dis_k[-1]) >= epsilon:
- # check if the new distance is the same as one in D_k.
- is_duplicate = False
- for dis_tmp in dis_k[1:-1]:
- if np.abs(dhat_new - dis_tmp) < epsilon:
- is_duplicate = True
- print('IAM: duplicate k nearest graph generated.')
- break
- if not is_duplicate:
- if np.abs(dhat_new - dhat) < epsilon:
- print('IAM: I am equal!')
-# dhat = dhat_new
-# ghat_list = [ghat_new.copy()]
- else:
- print('IAM: we got better k nearest neighbors!')
- nb_updated_k += 1
- print('the k nearest neighbors are updated',
- nb_updated_k, 'times.')
-
- dis_k = [dhat_new] + dis_k[0:k-1] # add the new nearest distance.
- Gk = [ghat_new.copy()] + Gk[0:k-1] # add the corresponding graph.
- sort_idx = np.argsort(dis_k)
- dis_k = [dis_k[idx] for idx in sort_idx[0:k]] # the new k nearest distances.
- Gk = [Gk[idx] for idx in sort_idx[0:k]]
- if dhat_new < dhat:
- print('IAM: I have smaller distance!')
- print(str(dhat) + '->' + str(dhat_new))
- dhat = dhat_new
- ghat_list = [Gk[0].copy()]
- r = 0
- nb_updated += 1
-
- print('the graph is updated', nb_updated, 'times.')
- nx.draw(Gk[0], labels=nx.get_node_attributes(Gk[0], 'atom'),
- with_labels=True)
- ## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG")
- plt.show()
-
- found = True
- if not found:
- r += 1
-
- dis_of_each_itr.append(dhat)
- itr_total += 1
- print('\nthe k shortest distances are', dis_k)
- print('the shortest distances for previous iterations are', dis_of_each_itr)
-
- print('\n\nthe graph is updated', nb_updated, 'times.')
- print('\nthe k nearest neighbors are updated', nb_updated_k, 'times.')
- print('distances in kernel space:', dis_of_each_itr, '\n')
-
- return dhat, ghat_list, dis_of_each_itr[-1], nb_updated, nb_updated_k
-
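
The distance that drives the search above is `dis_gstar()` from the deleted utils module; up to implementation details it evaluates d(g, g*)^2 = k(g, g) - 2*sum_i a_i*k(g, g_i) + term3, with term3 = sum_{i,j} a_i*a_j*k(g_i, g_j) precomputed once as in the function body. A toy sketch under that assumption, with a hypothetical helper name:

```python
import numpy as np

def dis_gstar_sketch(ig, idx_gi, alpha, K, term3):
    # distance in kernel space between graph ig and the weighted point g*.
    term1 = K[ig, ig]
    term2 = 2 * sum(a * K[ig, i] for a, i in zip(alpha, idx_gi))
    return np.sqrt(max(term1 - term2 + term3, 0.0))

# toy kernel matrix over 3 graphs; g* is the even mixture of graphs 1 and 2.
K = np.array([[1.0, 0.5, 0.2],
              [0.5, 1.0, 0.4],
              [0.2, 0.4, 1.0]])
alpha, idx_gi = [0.5, 0.5], [1, 2]
term3 = sum(a1 * a2 * K[i1, i2]
            for a1, i1 in zip(alpha, idx_gi)
            for a2, i2 in zip(alpha, idx_gi))
print(dis_gstar_sketch(0, idx_gi, alpha, K, term3))  # 1.0
```
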
-
-
-
-def preimage_iam_random_mix(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max,
- l_max, gkernel, epsilon=0.001,
- InitIAMWithAllDk=False, InitRandomWithAllDk=True,
- params_iam={'c_ei': 1, 'c_er': 1, 'c_es': 1,
- 'ite_max': 50, 'epsilon': 0.001,
- 'removeNodes': True, 'connected': False},
- params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1',
- 'method': 'IPFP', 'edit_cost_constant': [],
- 'stabilizer': 'min', 'repeat': 50}):
-	"""This function constructs a graph pre-image by the iterative pre-image
- framework in reference [1], algorithm 1, where new graphs are generated
- randomly and by the IAM algorithm in reference [2].
-
- notes
- -----
- Every time a set of n better graphs is acquired, their distances in kernel space are
- compared with the k nearest ones, and the k nearest distances from the k+n
- distances will be used as the new ones.
- """
- Gn_init = [nx.convert_node_labels_to_integers(g) for g in Gn_init]
- # compute k nearest neighbors of phi in DN.
- dis_all = [] # distance between g_star and each graph.
- term3 = 0
- for i1, a1 in enumerate(alpha):
- for i2, a2 in enumerate(alpha):
- term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
- for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout):
- dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
- dis_all.append(dtemp)
-
- # sort
- sort_idx = np.argsort(dis_all)
- dis_k = [dis_all[idis] for idis in sort_idx[0:k]] # the k shortest distances
-	nb_best = len(np.argwhere(np.array(dis_k) == dis_k[0]).flatten().tolist())
- ghat_list = [Gn_init[idx].copy() for idx in sort_idx[0:nb_best]] # the nearest neighbors of psi in DN
-	if dis_k[0] == 0: # the exact pre-image.
-		print('The exact pre-image is found from the input dataset.')
-		return 0, ghat_list, 0, 0, 0, 0, 0
- dhat = dis_k[0] # the nearest distance
-# for g in ghat_list:
-# draw_Letter_graph(g)
-# nx.draw_networkx(g)
-# plt.show()
-# print(g.nodes(data=True))
-# print(g.edges(data=True))
- Gk = [Gn_init[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors
-# for gi in Gk:
-# nx.draw(gi, labels=nx.get_node_attributes(gi, 'atom'), with_labels=True)
-## nx.draw_networkx(gi)
-# plt.show()
-## draw_Letter_graph(g)
-# print(gi.nodes(data=True))
-# print(gi.edges(data=True))
-
- r = 0
- itr_total = 0
- dis_of_each_itr = [dhat]
- nb_updated_iam = 0
- nb_updated_k_iam = 0
- nb_updated_random = 0
- nb_updated_k_random = 0
-# is_iam_duplicate = False
- while r < r_max: # and not found: # @todo: if not found?# and np.abs(old_dis - cur_dis) > epsilon:
- print('\n-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-')
- print('Current preimage iteration =', r)
- print('Total preimage iteration =', itr_total, '\n')
- found_iam = False
-
- Gn_nearest_median = [g.copy() for g in Gk]
- if InitIAMWithAllDk: # each graph in D_k is used to initialize IAM.
- ghat_new_list = []
- for g_tmp in Gk:
- Gn_nearest_init = [g_tmp.copy()]
- ghat_new_list_tmp, _ = iam_upgraded(Gn_nearest_median,
- Gn_nearest_init, params_ged=params_ged, **params_iam)
- ghat_new_list += ghat_new_list_tmp
- else: # only the best graph in D_k is used to initialize IAM.
- Gn_nearest_init = [g.copy() for g in Gk]
- ghat_new_list, _ = iam_upgraded(Gn_nearest_median, Gn_nearest_init,
- params_ged=params_ged, **params_iam)
-
-# for g in g_tmp_list:
-# nx.draw_networkx(g)
-# plt.show()
-# draw_Letter_graph(g)
-# print(g.nodes(data=True))
-# print(g.edges(data=True))
-
- # compute distance between \psi and the new generated graphs.
- knew = compute_kernel(ghat_new_list + Gn_median, gkernel, False)
- dhat_new_list = []
-
- for idx, g_tmp in enumerate(ghat_new_list):
- # @todo: the term3 below could use the one at the beginning of the function.
- dhat_new_list.append(dis_gstar(idx, range(len(ghat_new_list),
- len(ghat_new_list) + len(Gn_median) + 1),
- alpha, knew, withterm3=False))
-
- # find the new k nearest graphs.
- for idx_g, ghat_new in enumerate(ghat_new_list):
- dhat_new = dhat_new_list[idx_g]
-
- # if the new distance is smaller than the max of D_k.
- if dhat_new < dis_k[-1] and np.abs(dhat_new - dis_k[-1]) >= epsilon:
- # check if the new distance is the same as one in D_k.
- is_duplicate = False
- for dis_tmp in dis_k[1:-1]:
- if np.abs(dhat_new - dis_tmp) < epsilon:
- is_duplicate = True
- print('IAM: duplicate k nearest graph generated.')
- break
- if not is_duplicate:
- if np.abs(dhat_new - dhat) < epsilon:
- print('IAM: I am equal!')
-# dhat = dhat_new
-# ghat_list = [ghat_new.copy()]
- else:
- print('IAM: we got better k nearest neighbors!')
- nb_updated_k_iam += 1
- print('the k nearest neighbors are updated',
- nb_updated_k_iam, 'times.')
-
- dis_k = [dhat_new] + dis_k[0:k-1] # add the new nearest distance.
- Gk = [ghat_new.copy()] + Gk[0:k-1] # add the corresponding graph.
- sort_idx = np.argsort(dis_k)
- dis_k = [dis_k[idx] for idx in sort_idx[0:k]] # the new k nearest distances.
- Gk = [Gk[idx] for idx in sort_idx[0:k]]
- if dhat_new < dhat:
- print('IAM: I have smaller distance!')
- print(str(dhat) + '->' + str(dhat_new))
- dhat = dhat_new
- ghat_list = [Gk[0].copy()]
- r = 0
- nb_updated_iam += 1
-
- print('the graph is updated by IAM', nb_updated_iam,
- 'times.')
- nx.draw(Gk[0], labels=nx.get_node_attributes(Gk[0], 'atom'),
- with_labels=True)
- ## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG")
- plt.show()
-
- found_iam = True
-
- # when new distance is not smaller than the max of D_k, use random generation.
- if not found_iam:
- print('Distance not better, switching to random generation now.')
- print(str(dhat) + '->' + str(dhat_new))
-
- if InitRandomWithAllDk: # use all k nearest graphs as the initials.
- init_list = [g_init.copy() for g_init in Gk]
- else: # use just the nearest graph as the initial.
- init_list = [Gk[0].copy()]
-
- # number of edges to be changed.
- if len(init_list) == 1:
-				# @todo what if the log is negative? how to choose alpha (scalar)? seems fdgs is always 1.
- # fdgs = dhat_new
- fdgs = nb_updated_random + 1
- if fdgs < 1:
- fdgs = 1
- fdgs = int(np.ceil(np.log(fdgs)))
- if fdgs < 1:
- fdgs += 1
- # fdgs = nb_updated_random + 1 # @todo:
- fdgs_list = [fdgs]
- else:
-				# @todo what if the log is negative? how to choose alpha (scalar)?
- fdgs_list = np.array(dis_k[:])
- if np.min(fdgs_list) < 1:
- fdgs_list /= dis_k[0]
- fdgs_list = [int(item) for item in np.ceil(np.log(fdgs_list))]
- if np.min(fdgs_list) < 1:
- fdgs_list = np.array(fdgs_list) + 1
-
- l = 0
- found_random = False
- while l < l_max and not found_random:
- for idx_g, g_tmp in enumerate(init_list):
- # add and delete edges.
- ghat_new = nx.convert_node_labels_to_integers(g_tmp.copy())
- # @todo: should we use just half of the adjacency matrix for undirected graphs?
- nb_vpairs = nx.number_of_nodes(ghat_new) * (nx.number_of_nodes(ghat_new) - 1)
- np.random.seed()
- # which edges to change.
- # @todo: what if fdgs is bigger than nb_vpairs?
- idx_change = random.sample(range(nb_vpairs), fdgs_list[idx_g] if
- fdgs_list[idx_g] < nb_vpairs else nb_vpairs)
-# idx_change = np.random.randint(0, nx.number_of_nodes(gs) *
-# (nx.number_of_nodes(gs) - 1), fdgs)
- for item in idx_change:
- node1 = int(item / (nx.number_of_nodes(ghat_new) - 1))
- node2 = (item - node1 * (nx.number_of_nodes(ghat_new) - 1))
- if node2 >= node1: # skip the self pair.
- node2 += 1
- # @todo: is the randomness correct?
- if not ghat_new.has_edge(node1, node2):
- ghat_new.add_edge(node1, node2)
- # nx.draw_networkx(gs)
- # plt.show()
- # nx.draw_networkx(ghat_new)
- # plt.show()
- else:
- ghat_new.remove_edge(node1, node2)
- # nx.draw_networkx(gs)
- # plt.show()
- # nx.draw_networkx(ghat_new)
- # plt.show()
- # nx.draw_networkx(ghat_new)
- # plt.show()
-
- # compute distance between \psi and the new generated graph.
- knew = compute_kernel([ghat_new] + Gn_median, gkernel, verbose=False)
- dhat_new = dis_gstar(0, range(1, len(Gn_median) + 1),
- alpha, knew, withterm3=False)
- # @todo: the new distance is smaller or also equal?
- if dhat_new < dis_k[-1] and np.abs(dhat_new - dis_k[-1]) >= epsilon:
- # check if the new distance is the same as one in D_k.
- is_duplicate = False
- for dis_tmp in dis_k[1:-1]:
- if np.abs(dhat_new - dis_tmp) < epsilon:
- is_duplicate = True
- print('Random: duplicate k nearest graph generated.')
- break
- if not is_duplicate:
- if np.abs(dhat_new - dhat) < epsilon:
- print('Random: I am equal!')
- # dhat = dhat_new
- # ghat_list = [ghat_new.copy()]
- else:
- print('Random: we got better k nearest neighbors!')
- print('l =', str(l))
- nb_updated_k_random += 1
- print('the k nearest neighbors are updated by random generation',
- nb_updated_k_random, 'times.')
-
- dis_k = [dhat_new] + dis_k # add the new nearest distances.
- Gk = [ghat_new.copy()] + Gk # add the corresponding graphs.
- sort_idx = np.argsort(dis_k)
- dis_k = [dis_k[idx] for idx in sort_idx[0:k]] # the new k nearest distances.
- Gk = [Gk[idx] for idx in sort_idx[0:k]]
- if dhat_new < dhat:
- print('\nRandom: I am smaller!')
- print('l =', str(l))
- print(dhat, '->', dhat_new)
- dhat = dhat_new
- ghat_list = [ghat_new.copy()]
- r = 0
- nb_updated_random += 1
-
- print('the graph is updated by random generation',
- nb_updated_random, 'times.')
-
- nx.draw(ghat_new, labels=nx.get_node_attributes(ghat_new, 'atom'),
- with_labels=True)
- ## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG")
- plt.show()
- found_random = True
- break
- l += 1
- if not found_random: # l == l_max:
- r += 1
-
- dis_of_each_itr.append(dhat)
- itr_total += 1
- print('\nthe k shortest distances are', dis_k)
- print('the shortest distances for previous iterations are', dis_of_each_itr)
-
- print('\n\nthe graph is updated by IAM', nb_updated_iam, 'times, and by random generation',
- nb_updated_random, 'times.')
- print('\nthe k nearest neighbors are updated by IAM', nb_updated_k_iam,
- 'times, and by random generation', nb_updated_k_random, 'times.')
- print('distances in kernel space:', dis_of_each_itr, '\n')
-
- return dhat, ghat_list, dis_of_each_itr[-1], \
- nb_updated_iam, nb_updated_random, nb_updated_k_iam, nb_updated_k_random
-
-
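
The random-generation branch above flips `fdgs` edges chosen among the n*(n-1) ordered vertex pairs, decoding each sampled index into a pair while skipping self-loops. A self-contained sketch of that perturbation on a toy graph, with a hypothetical helper name:

```python
import random
import networkx as nx

def perturb_edges(g, fdgs, seed=None):
    """Toggle fdgs randomly chosen edges, decoding pair indices as above."""
    g = nx.convert_node_labels_to_integers(g.copy())
    n = nx.number_of_nodes(g)
    random.seed(seed)
    nb_vpairs = n * (n - 1)  # ordered pairs; self-pairs excluded by decoding.
    for item in random.sample(range(nb_vpairs), min(fdgs, nb_vpairs)):
        node1 = item // (n - 1)
        node2 = item - node1 * (n - 1)
        if node2 >= node1:  # skip the self pair.
            node2 += 1
        if g.has_edge(node1, node2):
            g.remove_edge(node1, node2)
        else:
            g.add_edge(node1, node2)
    return g

print(sorted(perturb_edges(nx.path_graph(4), 2, seed=0).edges()))
```
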
-###############################################################################
-# Old implementations.
-
-#def gk_iam(Gn, alpha):
-# """This function constructs graph pre-image by the iterative pre-image
-# framework in reference [1], algorithm 1, where the step of generating new
-# graphs randomly is replaced by the IAM algorithm in reference [2].
-#
-# notes
-# -----
-# Every time a better graph is acquired, the older one is replaced by it.
-# """
-# pass
-# # compute k nearest neighbors of phi in DN.
-# dis_list = [] # distance between g_star and each graph.
-# for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
-# dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
-# k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
-# (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
-# k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
-# dis_list.append(dtemp)
-#
-# # sort
-# sort_idx = np.argsort(dis_list)
-# dis_gs = [dis_list[idis] for idis in sort_idx[0:k]]
-# g0hat = Gn[sort_idx[0]] # the nearest neighbor of phi in DN
-# if dis_gs[0] == 0: # the exact pre-image.
-# print('The exact pre-image is found from the input dataset.')
-# return 0, g0hat
-# dhat = dis_gs[0] # the nearest distance
-# Gk = [Gn[ig] for ig in sort_idx[0:k]] # the k nearest neighbors
-# gihat_list = []
-#
-## i = 1
-# r = 1
-# while r < r_max:
-# print('r =', r)
-## found = False
-# Gs_nearest = Gk + gihat_list
-# g_tmp = iam(Gs_nearest)
-#
-# # compute distance between \psi and the new generated graph.
-# knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None,
-# p_quit=lmbda, n_iteration=20, remove_totters=False,
-# n_jobs=multiprocessing.cpu_count(), verbose=False)
-# dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) *
-# knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
-# (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
-# k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
-# if dnew <= dhat: # the new distance is smaller
-# print('I am smaller!')
-# dhat = dnew
-# g_new = g_tmp.copy() # found better graph.
-# gihat_list = [g_new]
-# dis_gs.append(dhat)
-# r = 0
-# else:
-# r += 1
-#
-# ghat = ([g0hat] if len(gihat_list) == 0 else gihat_list)
-#
-# return dhat, ghat
-
-
-#def gk_iam_nearest(Gn, alpha, idx_gi, Kmatrix, k, r_max):
-# """This function constructs graph pre-image by the iterative pre-image
-# framework in reference [1], algorithm 1, where the step of generating new
-# graphs randomly is replaced by the IAM algorithm in reference [2].
-#
-# notes
-# -----
-# Every time a better graph is acquired, its distance in kernel space is
-# compared with the k nearest ones, and the k nearest distances from the k+1
-# distances will be used as the new ones.
-# """
-# # compute k nearest neighbors of phi in DN.
-# dis_list = [] # distance between g_star and each graph.
-# for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
-# dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix)
-## dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
-## k_g2_list[ig]) + (alpha * alpha * k_list[0] + alpha *
-## (1 - alpha) * k_g2_list[0] + (1 - alpha) * alpha *
-## k_g1_list[6] + (1 - alpha) * (1 - alpha) * k_list[6])
-# dis_list.append(dtemp)
-#
-# # sort
-# sort_idx = np.argsort(dis_list)
-# dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] # the k shortest distances
-# g0hat = Gn[sort_idx[0]] # the nearest neighbor of phi in DN
-# if dis_gs[0] == 0: # the exact pre-image.
-# print('The exact pre-image is found from the input dataset.')
-# return 0, g0hat
-# dhat = dis_gs[0] # the nearest distance
-# ghat = g0hat.copy()
-# Gk = [Gn[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors
-# for gi in Gk:
-# nx.draw_networkx(gi)
-# plt.show()
-# print(gi.nodes(data=True))
-# print(gi.edges(data=True))
-# Gs_nearest = Gk.copy()
-## gihat_list = []
-#
-## i = 1
-# r = 1
-# while r < r_max:
-# print('r =', r)
-## found = False
-## Gs_nearest = Gk + gihat_list
-## g_tmp = iam(Gs_nearest)
-# g_tmp = test_iam_with_more_graphs_as_init(Gs_nearest, Gs_nearest, c_ei=1, c_er=1, c_es=1)
-# nx.draw_networkx(g_tmp)
-# plt.show()
-# print(g_tmp.nodes(data=True))
-# print(g_tmp.edges(data=True))
-#
-# # compute distance between \psi and the new generated graph.
-# gi_list = [Gn[i] for i in idx_gi]
-# knew = compute_kernel([g_tmp] + gi_list, 'untilhpathkernel', False)
-# dnew = dis_gstar(0, range(1, len(gi_list) + 1), alpha, knew)
-#
-## dnew = knew[0, 0] - 2 * (alpha[0] * knew[0, 1] + alpha[1] *
-## knew[0, 2]) + (alpha[0] * alpha[0] * k_list[0] + alpha[0] *
-## alpha[1] * k_g2_list[0] + alpha[1] * alpha[0] *
-## k_g1_list[1] + alpha[1] * alpha[1] * k_list[1])
-# if dnew <= dhat and g_tmp != ghat: # the new distance is smaller
-# print('I am smaller!')
-# print(str(dhat) + '->' + str(dnew))
-## nx.draw_networkx(ghat)
-## plt.show()
-## print('->')
-## nx.draw_networkx(g_tmp)
-## plt.show()
-#
-# dhat = dnew
-# g_new = g_tmp.copy() # found better graph.
-# ghat = g_tmp.copy()
-# dis_gs.append(dhat) # add the new nearest distance.
-# Gs_nearest.append(g_new) # add the corresponding graph.
-# sort_idx = np.argsort(dis_gs)
-# dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]] # the new k nearest distances.
-# Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
-# r = 0
-# else:
-# r += 1
-#
-# return dhat, ghat
-
-
-#def gk_iam_nearest_multi(Gn, alpha, idx_gi, Kmatrix, k, r_max):
-#    """This function constructs a graph pre-image using the iterative pre-image
-#    framework of reference [1], algorithm 1, where the step of generating new
-#    graphs randomly is replaced by the IAM algorithm of reference [2].
-#
-#    notes
-#    -----
-#    Every time a set of n better graphs is acquired, their distances in kernel
-#    space are compared with the k current nearest ones, and the k smallest of
-#    these k+n distances are kept as the new nearest set.
-# """
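-#    # Same idea as in gk_iam_nearest, generalized to n candidates: the n new
-#    # distances are merged with the current k and only the k smallest survive
-#    # (see the np.argsort block below).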
-# Gn_median = [Gn[idx].copy() for idx in idx_gi]
-# # compute k nearest neighbors of phi in DN.
-# dis_list = [] # distance between g_star and each graph.
-# for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
-# dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix)
-## dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
-## k_g2_list[ig]) + (alpha * alpha * k_list[0] + alpha *
-## (1 - alpha) * k_g2_list[0] + (1 - alpha) * alpha *
-## k_g1_list[6] + (1 - alpha) * (1 - alpha) * k_list[6])
-# dis_list.append(dtemp)
-#
-# # sort
-# sort_idx = np.argsort(dis_list)
-# dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] # the k shortest distances
-# nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist())
-# g0hat_list = [Gn[idx] for idx in sort_idx[0:nb_best]] # the nearest neighbors of phi in DN
-# if dis_gs[0] == 0: # the exact pre-image.
-# print('The exact pre-image is found from the input dataset.')
-# return 0, g0hat_list
-# dhat = dis_gs[0] # the nearest distance
-# ghat_list = [g.copy() for g in g0hat_list]
-# for g in ghat_list:
-# nx.draw_networkx(g)
-# plt.show()
-# print(g.nodes(data=True))
-# print(g.edges(data=True))
-# Gk = [Gn[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors
-# for gi in Gk:
-# nx.draw_networkx(gi)
-# plt.show()
-# print(gi.nodes(data=True))
-# print(gi.edges(data=True))
-# Gs_nearest = Gk.copy()
-## gihat_list = []
-#
-## i = 1
-# r = 1
-# while r < r_max:
-# print('r =', r)
-## found = False
-## Gs_nearest = Gk + gihat_list
-## g_tmp = iam(Gs_nearest)
-# g_tmp_list = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
-# Gn_median, Gs_nearest, c_ei=1, c_er=1, c_es=1)
-# for g in g_tmp_list:
-# nx.draw_networkx(g)
-# plt.show()
-# print(g.nodes(data=True))
-# print(g.edges(data=True))
-#
-# # compute distance between \psi and the new generated graphs.
-# gi_list = [Gn[i] for i in idx_gi]
-# knew = compute_kernel(g_tmp_list + gi_list, 'marginalizedkernel', False)
-# dnew_list = []
-# for idx, g_tmp in enumerate(g_tmp_list):
-# dnew_list.append(dis_gstar(idx, range(len(g_tmp_list),
-# len(g_tmp_list) + len(gi_list) + 1), alpha, knew))
-#
-## dnew = knew[0, 0] - 2 * (alpha[0] * knew[0, 1] + alpha[1] *
-## knew[0, 2]) + (alpha[0] * alpha[0] * k_list[0] + alpha[0] *
-## alpha[1] * k_g2_list[0] + alpha[1] * alpha[0] *
-## k_g1_list[1] + alpha[1] * alpha[1] * k_list[1])
-#
-# # find the new k nearest graphs.
-# dis_gs = dnew_list + dis_gs # add the new nearest distances.
-# Gs_nearest = [g.copy() for g in g_tmp_list] + Gs_nearest # add the corresponding graphs.
-# sort_idx = np.argsort(dis_gs)
-# if len([i for i in sort_idx[0:k] if i < len(dnew_list)]) > 0:
-# print('We got better k nearest neighbors! Hurray!')
-# dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]] # the new k nearest distances.
-# print(dis_gs[-1])
-# Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
-# nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist())
-# if len([i for i in sort_idx[0:nb_best] if i < len(dnew_list)]) > 0:
-# print('I have smaller or equal distance!')
-# dhat = dis_gs[0]
-# print(str(dhat) + '->' + str(dhat))
-# idx_best_list = np.argwhere(dnew_list == dhat).flatten().tolist()
-# ghat_list = [g_tmp_list[idx].copy() for idx in idx_best_list]
-# for g in ghat_list:
-# nx.draw_networkx(g)
-# plt.show()
-# print(g.nodes(data=True))
-# print(g.edges(data=True))
-# r = 0
-# else:
-# r += 1
-#
-# return dhat, ghat_list
\ No newline at end of file
diff --git a/gklearn/preimage/preimage_random.py b/gklearn/preimage/preimage_random.py
deleted file mode 100644
index e5f74cd..0000000
--- a/gklearn/preimage/preimage_random.py
+++ /dev/null
@@ -1,309 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Wed Mar 6 16:03:11 2019
-
-pre-image
-@author: ljia
-"""
-
-import sys
-import numpy as np
-import random
-from tqdm import tqdm
-import networkx as nx
-import matplotlib.pyplot as plt
-
-from gklearn.preimage.utils import compute_kernel, dis_gstar
-
-
-def preimage_random(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max, l, gkernel):
- Gn_init = [nx.convert_node_labels_to_integers(g) for g in Gn_init]
-
- # compute k nearest neighbors of phi in DN.
- dis_list = [] # distance between g_star and each graph.
- term3 = 0
- for i1, a1 in enumerate(alpha):
- for i2, a2 in enumerate(alpha):
- term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
- for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout):
- dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
- dis_list.append(dtemp)
-# print(np.max(dis_list))
-# print(np.min(dis_list))
-# print(np.min([item for item in dis_list if item != 0]))
-# print(np.mean(dis_list))
-
- # sort
- sort_idx = np.argsort(dis_list)
- dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] # the k shortest distances
-    nb_best = len(np.argwhere(np.array(dis_gs) == dis_gs[0]).flatten().tolist()) # np.array so the comparison is elementwise.
- g0hat_list = [Gn_init[idx] for idx in sort_idx[0:nb_best]] # the nearest neighbors of phi in DN
- if dis_gs[0] == 0: # the exact pre-image.
- print('The exact pre-image is found from the input dataset.')
- return 0, g0hat_list[0], 0
- dhat = dis_gs[0] # the nearest distance
-# ghat_list = [g.copy() for g in g0hat_list]
-# for g in ghat_list:
-# draw_Letter_graph(g)
-# nx.draw_networkx(g)
-# plt.show()
-# print(g.nodes(data=True))
-# print(g.edges(data=True))
- Gk = [Gn_init[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors
-# for gi in Gk:
-## nx.draw_networkx(gi)
-## plt.show()
-# draw_Letter_graph(g)
-# print(gi.nodes(data=True))
-# print(gi.edges(data=True))
- Gs_nearest = [g.copy() for g in Gk]
- gihat_list = []
- dihat_list = []
-
-# i = 1
- r = 0
-# sod_list = [dhat]
-# found = False
- dis_of_each_itr = [dhat]
- nb_updated = 0
- g_best = []
- while r < r_max:
- print('\nr =', r)
- print('itr for gk =', nb_updated, '\n')
- found = False
- dis_bests = dis_gs + dihat_list
-        # @todo: what if the log is negative? how to choose alpha (scalar)?
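-        # What the lines below implement: rescale the distances so the smallest
-        # is at least 1 (when needed), take ceil(log(.)) of each as the number
-        # of edge edits to try on that candidate, and raise all counts by one
-        # if the smallest count is still below one.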
- fdgs_list = np.array(dis_bests)
- if np.min(fdgs_list) < 1:
- fdgs_list /= np.min(dis_bests)
- fdgs_list = [int(item) for item in np.ceil(np.log(fdgs_list))]
- if np.min(fdgs_list) < 1:
- fdgs_list = np.array(fdgs_list) + 1
-
- for ig, gs in enumerate(Gs_nearest + gihat_list):
-# nx.draw_networkx(gs)
-# plt.show()
-            for trial in range(0, l):
-#            for trial in tqdm(range(0, l), desc='l loops', file=sys.stdout):
- # add and delete edges.
- gtemp = gs.copy()
- np.random.seed()
- # which edges to change.
- # @todo: should we use just half of the adjacency matrix for undirected graphs?
- nb_vpairs = nx.number_of_nodes(gs) * (nx.number_of_nodes(gs) - 1)
- # @todo: what if fdgs is bigger than nb_vpairs?
- idx_change = random.sample(range(nb_vpairs), fdgs_list[ig] if
- fdgs_list[ig] < nb_vpairs else nb_vpairs)
-# idx_change = np.random.randint(0, nx.number_of_nodes(gs) *
-# (nx.number_of_nodes(gs) - 1), fdgs)
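-                # Each index in [0, n*(n-1)) encodes an ordered vertex pair:
-                # node1 = item // (n - 1), node2 = the remainder, shifted past
-                # node1 so the self pair (node1, node1) is never produced; the
-                # edge between the pair is then toggled.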
- for item in idx_change:
- node1 = int(item / (nx.number_of_nodes(gs) - 1))
- node2 = (item - node1 * (nx.number_of_nodes(gs) - 1))
- if node2 >= node1: # skip the self pair.
- node2 += 1
- # @todo: is the randomness correct?
- if not gtemp.has_edge(node1, node2):
- gtemp.add_edge(node1, node2)
-# nx.draw_networkx(gs)
-# plt.show()
-# nx.draw_networkx(gtemp)
-# plt.show()
- else:
- gtemp.remove_edge(node1, node2)
-# nx.draw_networkx(gs)
-# plt.show()
-# nx.draw_networkx(gtemp)
-# plt.show()
-# nx.draw_networkx(gtemp)
-# plt.show()
-
- # compute distance between \psi and the new generated graph.
-# knew = marginalizedkernel([gtemp, g1, g2], node_label='atom', edge_label=None,
-# p_quit=lmbda, n_iteration=20, remove_totters=False,
-# n_jobs=multiprocessing.cpu_count(), verbose=False)
- knew = compute_kernel([gtemp] + Gn_median, gkernel, verbose=False)
- dnew = dis_gstar(0, range(1, len(Gn_median) + 1), alpha, knew,
- withterm3=False)
- if dnew <= dhat: # @todo: the new distance is smaller or also equal?
- if dnew < dhat:
- print('\nI am smaller!')
-                        print('ig =', str(ig), ', l =', str(trial))
- print(dhat, '->', dnew)
- nb_updated += 1
- elif dnew == dhat:
- print('I am equal!')
-# nx.draw_networkx(gtemp)
-# plt.show()
-# print(gtemp.nodes(data=True))
-# print(gtemp.edges(data=True))
- dhat = dnew
- gnew = gtemp.copy()
- found = True # found better graph.
- if found:
- r = 0
- gihat_list = [gnew]
- dihat_list = [dhat]
- else:
- r += 1
-
- dis_of_each_itr.append(dhat)
- print('the shortest distances for previous iterations are', dis_of_each_itr)
-# dis_best.append(dhat)
- g_best = (g0hat_list[0] if len(gihat_list) == 0 else gihat_list[0])
- print('distances in kernel space:', dis_of_each_itr, '\n')
-
- return dhat, g_best, nb_updated
-# return 0, 0, 0
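-
-# A minimal usage sketch (hypothetical values; Kmatrix is the precomputed kernel
-# matrix over Gn_init and idx_gi gives the indices of the median graphs in it):
-#     dhat, g_best, nb_updated = preimage_random(Gn, [Gn[0], Gn[6]], [0.5, 0.5],
-#                                                [0, 6], Kmatrix, k=10, r_max=3,
-#                                                l=500, gkernel='untilhpathkernel')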
-
-
-if __name__ == '__main__':
- from gklearn.utils.graphfiles import loadDataset
-
-# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
-# 'extra_params': {}} # node/edge symb
- ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
- 'extra_params': {}} # node nsymb
-# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
-# 'extra_params': {}}
-# ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
-# 'extra_params': {}} # node symb
-
- DN, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
- #DN = DN[0:10]
-
-    lmbda = 0.03 # termination probability
- r_max = 3 # 10 # iteration limit.
- l = 500
- alpha_range = np.linspace(0.5, 0.5, 1)
- #alpha_range = np.linspace(0.1, 0.9, 9)
- k = 10 # 5 # k nearest neighbors
-
- # randomly select two molecules
- #np.random.seed(1)
- #idx1, idx2 = np.random.randint(0, len(DN), 2)
- #g1 = DN[idx1]
- #g2 = DN[idx2]
- idx1 = 0
- idx2 = 6
- g1 = DN[idx1]
- g2 = DN[idx2]
-
- # compute
- k_list = [] # kernel between each graph and itself.
- k_g1_list = [] # kernel between each graph and g1
- k_g2_list = [] # kernel between each graph and g2
- for ig, g in tqdm(enumerate(DN), desc='computing self kernels', file=sys.stdout):
- # ktemp = marginalizedkernel([g, g1, g2], node_label='atom', edge_label=None,
- # p_quit=lmbda, n_iteration=20, remove_totters=False,
- # n_jobs=multiprocessing.cpu_count(), verbose=False)
- ktemp = compute_kernel([g, g1, g2], 'untilhpathkernel', verbose=False)
- k_list.append(ktemp[0, 0])
- k_g1_list.append(ktemp[0, 1])
- k_g2_list.append(ktemp[0, 2])
-
- g_best = []
- dis_best = []
- # for each alpha
- for alpha in alpha_range:
- print('alpha =', alpha)
- # compute k nearest neighbors of phi in DN.
- dis_list = [] # distance between g_star and each graph.
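-        # Each dtemp below expands the squared kernel-space distance to
-        # psi = alpha * phi(g1) + (1 - alpha) * phi(g2) via the kernel trick;
-        # the sqrt turns it into the kernel-space distance.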
- for ig, g in tqdm(enumerate(DN), desc='computing distances', file=sys.stdout):
- dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
- k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
- (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
- k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
- dis_list.append(np.sqrt(dtemp))
-
- # sort
- sort_idx = np.argsort(dis_list)
- dis_gs = [dis_list[idis] for idis in sort_idx[0:k]]
- g0hat = DN[sort_idx[0]] # the nearest neighbor of phi in DN
- if dis_gs[0] == 0: # the exact pre-image.
- print('The exact pre-image is found from the input dataset.')
- g_pimg = g0hat
- break
- dhat = dis_gs[0] # the nearest distance
- Dk = [DN[ig] for ig in sort_idx[0:k]] # the k nearest neighbors
- gihat_list = []
-
- i = 1
- r = 1
- while r < r_max:
- print('r =', r)
- found = False
- for ig, gs in enumerate(Dk + gihat_list):
- # nx.draw_networkx(gs)
- # plt.show()
-                # @todo: what if the log is negative?
-                fdgs = int(np.abs(np.ceil(np.log(alpha * dis_gs[ig]))))
-                for trial in tqdm(range(0, l), desc='l loop', file=sys.stdout):
- # add and delete edges.
- gtemp = gs.copy()
- np.random.seed()
- # which edges to change.
- # @todo: should we use just half of the adjacency matrix for undirected graphs?
- nb_vpairs = nx.number_of_nodes(gs) * (nx.number_of_nodes(gs) - 1)
- # @todo: what if fdgs is bigger than nb_vpairs?
- idx_change = random.sample(range(nb_vpairs), fdgs if fdgs < nb_vpairs else nb_vpairs)
- # idx_change = np.random.randint(0, nx.number_of_nodes(gs) *
- # (nx.number_of_nodes(gs) - 1), fdgs)
- for item in idx_change:
- node1 = int(item / (nx.number_of_nodes(gs) - 1))
- node2 = (item - node1 * (nx.number_of_nodes(gs) - 1))
- if node2 >= node1: # skip the self pair.
- node2 += 1
- # @todo: is the randomness correct?
- if not gtemp.has_edge(node1, node2):
- # @todo: how to update the bond_type? 0 or 1?
- gtemp.add_edges_from([(node1, node2, {'bond_type': 1})])
- # nx.draw_networkx(gs)
- # plt.show()
- # nx.draw_networkx(gtemp)
- # plt.show()
- else:
- gtemp.remove_edge(node1, node2)
- # nx.draw_networkx(gs)
- # plt.show()
- # nx.draw_networkx(gtemp)
- # plt.show()
- # nx.draw_networkx(gtemp)
- # plt.show()
-
- # compute distance between phi and the new generated graph.
- # knew = marginalizedkernel([gtemp, g1, g2], node_label='atom', edge_label=None,
- # p_quit=lmbda, n_iteration=20, remove_totters=False,
- # n_jobs=multiprocessing.cpu_count(), verbose=False)
- knew = compute_kernel([gtemp, g1, g2], 'untilhpathkernel', verbose=False)
- dnew = np.sqrt(knew[0, 0] - 2 * (alpha * knew[0, 1] + (1 - alpha) *
- knew[0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
- (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
- k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2]))
- if dnew < dhat: # @todo: the new distance is smaller or also equal?
- print('I am smaller!')
- print(dhat, '->', dnew)
- nx.draw_networkx(gtemp)
- plt.show()
- print(gtemp.nodes(data=True))
- print(gtemp.edges(data=True))
- dhat = dnew
- gnew = gtemp.copy()
- found = True # found better graph.
- r = 0
- elif dnew == dhat:
- print('I am equal!')
- if found:
- gihat_list = [gnew]
- dis_gs.append(dhat)
- else:
- r += 1
- dis_best.append(dhat)
- g_best += ([g0hat] if len(gihat_list) == 0 else gihat_list)
-
-
- for idx, item in enumerate(alpha_range):
- print('when alpha is', item, 'the shortest distance is', dis_best[idx])
- print('the corresponding pre-image is')
- nx.draw_networkx(g_best[idx])
- plt.show()
\ No newline at end of file
diff --git a/gklearn/preimage/python_code.py b/gklearn/preimage/python_code.py
deleted file mode 100644
index 3772526..0000000
--- a/gklearn/preimage/python_code.py
+++ /dev/null
@@ -1,122 +0,0 @@
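-# Note: this fragment ports C++ option-parsing code (GEDLIB-style) to Python.
-# It continues an if/elif chain over (opt_name, opt_val) pairs and assumes the
-# enclosing module defines the Error exception class, so it is not runnable on
-# its own.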
-elif opt_name == 'random-inits':
-    try:
-        num_random_inits_ = int(opt_val)
-        desired_num_random_inits_ = num_random_inits_
-    except ValueError:
-        raise Error('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits ]"')
-
-    if num_random_inits_ <= 0:
-        raise Error('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits ]"')
-
-elif opt_name == 'randomness':
-    if opt_val == 'PSEUDO':
-        use_real_randomness_ = False
-    elif opt_val == 'REAL':
-        use_real_randomness_ = True
-    else:
-        raise Error('Invalid argument "' + opt_val + '" for option randomness. Usage: options = "[--randomness REAL|PSEUDO] [...]"')
-
-elif opt_name == 'stdout':
-    if opt_val == '0':
-        print_to_stdout_ = 0
-    elif opt_val == '1':
-        print_to_stdout_ = 1
-    elif opt_val == '2':
-        print_to_stdout_ = 2
-    else:
-        raise Error('Invalid argument "' + opt_val + '" for option stdout. Usage: options = "[--stdout 0|1|2] [...]"')
-
-elif opt_name == 'refine':
-    if opt_val == 'TRUE':
-        refine_ = True
-    elif opt_val == 'FALSE':
-        refine_ = False
-    else:
-        raise Error('Invalid argument "' + opt_val + '" for option refine. Usage: options = "[--refine TRUE|FALSE] [...]"')
-
-elif opt_name == 'time-limit':
-    try:
-        time_limit_in_sec_ = float(opt_val)
-    except ValueError:
-        raise Error('Invalid argument "' + opt_val + '" for option time-limit. Usage: options = "[--time-limit ] [...]')
-
-elif opt_name == 'max-itrs':
-    try:
-        max_itrs_ = int(opt_val)
-    except ValueError:
-        raise Error('Invalid argument "' + opt_val + '" for option max-itrs. Usage: options = "[--max-itrs ] [...]')
-
-elif opt_name == 'max-itrs-without-update':
-    try:
-        max_itrs_without_update_ = int(opt_val)
-    except ValueError:
-        raise Error('Invalid argument "' + opt_val + '" for option max-itrs-without-update. Usage: options = "[--max-itrs-without-update ] [...]')
-
-elif opt_name == 'seed':
-    try:
-        seed_ = int(opt_val)
-    except ValueError:
-        raise Error('Invalid argument "' + opt_val + '" for option seed. Usage: options = "[--seed ] [...]')
-
-elif opt_name == 'epsilon':
-    try:
-        epsilon_ = float(opt_val)
-    except ValueError:
-        raise Error('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon ] [...]')
-
-    if epsilon_ <= 0:
-        raise Error('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon ] [...]')
-
-elif opt_name == 'inits-increase-order':
-    try:
-        num_inits_increase_order_ = int(opt_val)
-    except ValueError:
-        raise Error('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order ]"')
-
-    if num_inits_increase_order_ <= 0:
-        raise Error('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order ]"')
-
-elif opt_name == 'init-type-increase-order':
-    init_type_increase_order_ = opt_val
-    if opt_val != 'CLUSTERS' and opt_val != 'K-MEANS++':
-        raise Exception('Invalid argument ' + opt_val + ' for option init-type-increase-order. Usage: options = "[--init-type-increase-order CLUSTERS|K-MEANS++] [...]"')
-
-elif opt_name == 'max-itrs-increase-order':
-    try:
-        max_itrs_increase_order_ = int(opt_val)
-    except ValueError:
-        raise Error('Invalid argument "' + opt_val + '" for option max-itrs-increase-order. Usage: options = "[--max-itrs-increase-order ] [...]')
-
-else:
-    valid_options = '[--init-type ] [--random-inits ] [--randomness ] [--seed ] [--stdout ] '
-    valid_options += '[--time-limit ] [--max-itrs ] [--epsilon ] '
-    valid_options += '[--inits-increase-order ] [--init-type-increase-order ] [--max-itrs-increase-order ]'
-    raise Error('Invalid option "' + opt_name + '". Usage: options = "' + valid_options + '"')
-
diff --git a/gklearn/preimage/test.py b/gklearn/preimage/test.py
deleted file mode 100644
index 4110a6f..0000000
--- a/gklearn/preimage/test.py
+++ /dev/null
@@ -1,83 +0,0 @@
-#export LD_LIBRARY_PATH=.:/export/home/lambertn/Documents/gedlibpy/lib/fann/:/export/home/lambertn/Documents/gedlibpy/lib/libsvm.3.22:/export/home/lambertn/Documents/gedlibpy/lib/nomad
-
-#So that "import script" finds the libraries GedLib needs
-#Equivalent to setting the LD_LIBRARY_PATH environment variable in a bash shell
-import gedlibpy.librariesImport
-from gedlibpy import gedlibpy
-import networkx as nx
-
-
-def init():
-    print("List of Edit Cost Options : ")
-    for i in gedlibpy.list_of_edit_cost_options:
-        print(i)
-    print("")
-
-    print("List of Method Options : ")
-    for j in gedlibpy.list_of_method_options:
-        print(j)
-    print("")
-
-    print("List of Init Options : ")
-    for k in gedlibpy.list_of_init_options:
-        print(k)
-    print("")
-
-def test():
-
- gedlibpy.load_GXL_graphs('include/gedlib-master/data/datasets/Mutagenicity/data/', 'collections/MUTA_10.xml')
- listID = gedlibpy.get_all_graph_ids()
- gedlibpy.set_edit_cost("CHEM_1")
- gedlibpy.init()
- gedlibpy.set_method("IPFP", "")
- gedlibpy.init_method()
- g = listID[0]
- h = listID[1]
- gedlibpy.run_method(g, h)
-    print("Node Map : ", gedlibpy.get_node_map(g, h))
-    print("Forward map : ", gedlibpy.get_forward_map(g, h), ", Backward map : ", gedlibpy.get_backward_map(g, h))
-    print("Assignment Matrix : ")
-    print(gedlibpy.get_assignment_matrix(g, h))
-    print("Upper Bound = " + str(gedlibpy.get_upper_bound(g, h)) + ", Lower Bound = " + str(gedlibpy.get_lower_bound(g, h)) + ", Runtime = " + str(gedlibpy.get_runtime(g, h)))
-
-
-def convertGraph(G):
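-    """Relabel a gklearn graph for gedlibpy: string node ids, node attribute
-    'atom' mapped to 'chem', edge attribute 'bond_type' mapped to 'valence'.
-    """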
- G_new = nx.Graph()
- for nd, attrs in G.nodes(data=True):
- G_new.add_node(str(nd), chem=attrs['atom'])
- for nd1, nd2, attrs in G.edges(data=True):
- G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
-
- return G_new
-
-
-def testNxGraph():
- from gklearn.utils.graphfiles import loadDataset
- ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
- 'extra_params': {}} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-
- gedlibpy.restart_env()
- for graph in Gn:
- g_new = convertGraph(graph)
- gedlibpy.add_nx_graph(g_new, "")
-
- listID = gedlibpy.get_all_graph_ids()
- gedlibpy.set_edit_cost("CHEM_1")
- gedlibpy.init()
- gedlibpy.set_method("IPFP", "")
- gedlibpy.init_method()
-
- print(listID)
- g = listID[0]
- h = listID[1]
-
- gedlibpy.run_method(g, h)
-
-    print("Node Map : ", gedlibpy.get_node_map(g, h))
-    print("Forward map : ", gedlibpy.get_forward_map(g, h), ", Backward map : ", gedlibpy.get_backward_map(g, h))
-    print("Upper Bound = " + str(gedlibpy.get_upper_bound(g, h)) + ", Lower Bound = " + str(gedlibpy.get_lower_bound(g, h)) + ", Runtime = " + str(gedlibpy.get_runtime(g, h)))
-
-#test()
-init()
-#testNxGraph()
diff --git a/gklearn/preimage/test_fitDistance.py b/gklearn/preimage/test_fitDistance.py
deleted file mode 100644
index 2945a24..0000000
--- a/gklearn/preimage/test_fitDistance.py
+++ /dev/null
@@ -1,648 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Thu Oct 24 11:50:56 2019
-
-@author: ljia
-"""
-import sys
-import os
-
-from matplotlib import pyplot as plt
-import numpy as np
-from tqdm import tqdm
-
-from gklearn.utils.graphfiles import loadDataset
-from gklearn.preimage.utils import remove_edges
-from gklearn.preimage.fitDistance import fit_GED_to_kernel_distance
-from gklearn.preimage.utils import normalize_distance_matrix
-
-
-    from gklearn.preimage.fitDistance import update_costs
- from preimage.fitDistance import update_costs
- import cvxpy as cp
-
- ds = np.load('results/xp_fit_method/fit_data_debug4.gm.npz')
- nb_cost_mat = ds['nb_cost_mat']
- dis_k_vec = ds['dis_k_vec']
- n_edit_operations = ds['n_edit_operations']
- ged_vec_init = ds['ged_vec_init']
- ged_mat = ds['ged_mat']
-
- nb_cost_mat_new = nb_cost_mat[:,[2,3,4]]
- x = cp.Variable(nb_cost_mat_new.shape[1])
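-    # Constrained least squares: find edit costs x >= 0 that minimize
-    # ||nb_cost_mat_new @ x - dis_k_vec||^2, i.e. make the GED predicted from
-    # the operation counts match the kernel distances as closely as possible.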
- cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
-# constraints = [x >= [0.000 for i in range(nb_cost_mat_new.shape[1])],
-# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
-# constraints = [x >= [0.000 for i in range(nb_cost_mat_new.shape[1])],
-# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0,
-# np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
- constraints = [x >= [0.00 for i in range(nb_cost_mat_new.shape[1])],
- np.array([0.0, 1.0, -1.0]).T@x == 0.0]
-# constraints = [x >= [0.00000 for i in range(nb_cost_mat_new.shape[1])]]
- prob = cp.Problem(cp.Minimize(cost_fun), constraints)
- prob.solve()
- print(x.value)
- edit_costs_new = np.concatenate((x.value, np.array([0.0])))
- residual = np.sqrt(prob.value)
-
-
-def median_paper_clcpc_python_best():
-    """Fit edit costs under the constraints c_vs <= c_vi + c_vr and
-    c_es <= c_ei + c_er, with GED computed by Python invoking the C++ code
-    through a bash command (with the updated library).
-    """
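-    # These constraints keep a substitution no more expensive than a removal
-    # plus an insertion, so optimal edit paths remain free to substitute.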
-# ds = {'name': 'monoterpenoides',
-# 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
-# _, y_all = loadDataset(ds['dataset'])
- gkernel = 'untilhpathkernel'
- node_label = 'atom'
- edge_label = 'bond_type'
- itr_max = 6
- algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
- params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
- 'algo_options': algo_options, 'stabilizer': None}
-
- y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
- repeats = 50
- collection_path = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/'
- graph_dir = collection_path + 'gxl/'
-
- fn_edit_costs_output = 'results/median_paper/edit_costs_output.python_init40.k10.txt'
-
- for y in y_all:
- for repeat in range(repeats):
- edit_costs_output_file = open(fn_edit_costs_output, 'a')
- collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
- Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
- edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
- nb_cost_mat_list = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
- gkernel, itr_max, params_ged=params_ged,
- parallel=True)
- total_time = np.sum(time_list)
-# print('\nedit_costs:', edit_costs)
-# print('\nresidual_list:', residual_list)
-# print('\nedit_cost_list:', edit_cost_list)
-# print('\ndistance matrix in kernel space:', dis_k_mat)
-# print('\nged matrix:', ged_mat)
-# print('\ntotal time:', total_time)
-# print('\nnb_cost_mat:', nb_cost_mat_list[-1])
- np.savez('results/median_paper/fit_distance.clcpc.python_init40.monot.elabeled.uhpkernel.y'
- + y + '.repeat' + str(repeat) + '.k10..gm',
- edit_costs=edit_costs,
- residual_list=residual_list, edit_cost_list=edit_cost_list,
- dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
- total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
-
- for ec in edit_costs:
- edit_costs_output_file.write(str(ec) + ' ')
- edit_costs_output_file.write('\n')
- edit_costs_output_file.close()
-
-
-# # normalized distance matrices.
-# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
-# edit_costs = gmfile['edit_costs']
-# residual_list = gmfile['residual_list']
-# edit_cost_list = gmfile['edit_cost_list']
-# dis_k_mat = gmfile['dis_k_mat']
-# ged_mat = gmfile['ged_mat']
-# total_time = gmfile['total_time']
-# nb_cost_mat_list = gmfile['nb_cost_mat_list']
-
- nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
- print(nb_consistent, nb_inconsistent, ratio_consistent)
-
-# norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
-# plt.imshow(norm_dis_k_mat)
-# plt.colorbar()
-# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
-# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
-# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
-# + y + '.repeat' + str(repeat) + '.png', format='png')
-# # plt.show()
-# plt.clf()
-#
-# norm_ged_mat = normalize_distance_matrix(ged_mat)
-# plt.imshow(norm_ged_mat)
-# plt.colorbar()
-# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
-# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
-# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
-# + y + '.repeat' + str(repeat) + '.png', format='png')
-# # plt.show()
-# plt.clf()
-#
-# norm_diff = norm_ged_mat - norm_dis_k_mat
-# plt.imshow(norm_diff)
-# plt.colorbar()
-# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y'
-# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
-# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y'
-# + y + '.repeat' + str(repeat) + '.png', format='png')
-# # plt.show()
-# plt.clf()
-# # draw_count_bar(norm_diff)
-
-
-def median_paper_clcpc_python_bash_cpp():
-    """Fit edit costs under the constraints c_vs <= c_vi + c_vr and
-    c_es <= c_ei + c_er, with GED computed by Python invoking the C++ code
-    through a bash command (with the updated library).
-    """
-# ds = {'name': 'monoterpenoides',
-# 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
-# _, y_all = loadDataset(ds['dataset'])
- gkernel = 'untilhpathkernel'
- node_label = 'atom'
- edge_label = 'bond_type'
- itr_max = 20
- algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
- params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP',
- 'algo_options': algo_options}
-
- y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
- repeats = 50
- collection_path = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/'
- graph_dir = collection_path + 'gxl/'
-
- fn_edit_costs_output = 'results/median_paper/edit_costs_output.txt'
-
- for y in y_all:
- for repeat in range(repeats):
- edit_costs_output_file = open(fn_edit_costs_output, 'a')
- collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
- Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
- edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
- nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
- gkernel, itr_max, params_ged=params_ged,
- parallel=False)
- total_time = np.sum(time_list)
-# print('\nedit_costs:', edit_costs)
-# print('\nresidual_list:', residual_list)
-# print('\nedit_cost_list:', edit_cost_list)
-# print('\ndistance matrix in kernel space:', dis_k_mat)
-# print('\nged matrix:', ged_mat)
-# print('\ntotal time:', total_time)
-# print('\nnb_cost_mat:', nb_cost_mat_list[-1])
- np.savez('results/median_paper/fit_distance.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
- + y + '.repeat' + str(repeat) + '.gm',
- edit_costs=edit_costs,
- residual_list=residual_list, edit_cost_list=edit_cost_list,
- dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
- total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
- coef_dk=coef_dk)
-
- for ec in edit_costs:
- edit_costs_output_file.write(str(ec) + ' ')
- edit_costs_output_file.write('\n')
- edit_costs_output_file.close()
-
-
-# # normalized distance matrices.
-# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
-# edit_costs = gmfile['edit_costs']
-# residual_list = gmfile['residual_list']
-# edit_cost_list = gmfile['edit_cost_list']
-# dis_k_mat = gmfile['dis_k_mat']
-# ged_mat = gmfile['ged_mat']
-# total_time = gmfile['total_time']
-# nb_cost_mat_list = gmfile['nb_cost_mat_list']
-# coef_dk = gmfile['coef_dk']
-
- nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
- print(nb_consistent, nb_inconsistent, ratio_consistent)
-
-# norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
-# plt.imshow(norm_dis_k_mat)
-# plt.colorbar()
-# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
-# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
-# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
-# + y + '.repeat' + str(repeat) + '.png', format='png')
-# # plt.show()
-# plt.clf()
-#
-# norm_ged_mat = normalize_distance_matrix(ged_mat)
-# plt.imshow(norm_ged_mat)
-# plt.colorbar()
-# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
-# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
-# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
-# + y + '.repeat' + str(repeat) + '.png', format='png')
-# # plt.show()
-# plt.clf()
-#
-# norm_diff = norm_ged_mat - norm_dis_k_mat
-# plt.imshow(norm_diff)
-# plt.colorbar()
-# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
-# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
-# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
-# + y + '.repeat' + str(repeat) + '.png', format='png')
-# # plt.show()
-# plt.clf()
-# # draw_count_bar(norm_diff)
-
-
-
-
-
-def test_cs_leq_ci_plus_cr_python_bash_cpp():
-    """Fit edit costs under the constraints c_vs <= c_vi + c_vr and
-    c_es <= c_ei + c_er, with GED computed by Python invoking the C++ code
-    through a bash command (with the updated library).
-    """
- ds = {'name': 'monoterpenoides',
- 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'])
-# Gn = Gn[0:10]
- gkernel = 'untilhpathkernel'
- node_label = 'atom'
- edge_label = 'bond_type'
- itr_max = 10
- algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
- params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP',
- 'algo_options': algo_options}
- edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
- nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
- gkernel, itr_max, params_ged=params_ged,
- parallel=False)
- total_time = np.sum(time_list)
- print('\nedit_costs:', edit_costs)
- print('\nresidual_list:', residual_list)
- print('\nedit_cost_list:', edit_cost_list)
- print('\ndistance matrix in kernel space:', dis_k_mat)
- print('\nged matrix:', ged_mat)
- print('\ntotal time:', total_time)
- print('\nnb_cost_mat:', nb_cost_mat_list[-1])
- np.savez('results/fit_distance.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel.gm',
- edit_costs=edit_costs,
- residual_list=residual_list, edit_cost_list=edit_cost_list,
- dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
- total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
- coef_dk=coef_dk)
-
-# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
-# 'extra_params': {}} # node/edge symb
-# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-## Gn = Gn[0:10]
-## remove_edges(Gn)
-# gkernel = 'untilhpathkernel'
-# node_label = 'atom'
-# edge_label = 'bond_type'
-# itr_max = 10
-# edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
-# nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
-# gkernel, itr_max)
-# total_time = np.sum(time_list)
-# print('\nedit_costs:', edit_costs)
-# print('\nresidual_list:', residual_list)
-# print('\nedit_cost_list:', edit_cost_list)
-# print('\ndistance matrix in kernel space:', dis_k_mat)
-# print('\nged matrix:', ged_mat)
-# print('\ntotal time:', total_time)
-# print('\nnb_cost_mat:', nb_cost_mat_list[-1])
-# np.savez('results/fit_distance.cs_leq_ci_plus_cr.mutag.elabeled.uhpkernel.gm',
-# edit_costs=edit_costs,
-# residual_list=residual_list, edit_cost_list=edit_cost_list,
-# dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
-# total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk)
-
-
-# # normalized distance matrices.
-# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.monot.elabeled.uhpkernel.gm.npz')
-# edit_costs = gmfile['edit_costs']
-# residual_list = gmfile['residual_list']
-# edit_cost_list = gmfile['edit_cost_list']
-# dis_k_mat = gmfile['dis_k_mat']
-# ged_mat = gmfile['ged_mat']
-# total_time = gmfile['total_time']
-# nb_cost_mat_list = gmfile['nb_cost_mat_list']
-# coef_dk = gmfile['coef_dk']
-
- nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
- print(nb_consistent, nb_inconsistent, ratio_consistent)
-
-# dis_k_sub = pairwise_substitution(dis_k_mat)
-# ged_sub = pairwise_substitution(ged_mat)
-# np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.gm',
-# dis_k_sub=dis_k_sub, ged_sub=ged_sub)
-
-
- norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
- plt.imshow(norm_dis_k_mat)
- plt.colorbar()
- plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
- + '.eps', format='eps', dpi=300)
- plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
- + '.png', format='png')
-# plt.show()
- plt.clf()
-
- norm_ged_mat = normalize_distance_matrix(ged_mat)
- plt.imshow(norm_ged_mat)
- plt.colorbar()
- plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
- + '.eps', format='eps', dpi=300)
- plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
- + '.png', format='png')
-# plt.show()
- plt.clf()
-
- norm_diff = norm_ged_mat - norm_dis_k_mat
- plt.imshow(norm_diff)
- plt.colorbar()
- plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
- + '.eps', format='eps', dpi=300)
- plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
- + '.png', format='png')
-# plt.show()
- plt.clf()
-# draw_count_bar(norm_diff)
-
-
-def test_anycosts():
- ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
- 'extra_params': {}} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-# Gn = Gn[0:10]
- remove_edges(Gn)
- gkernel = 'marginalizedkernel'
- itr_max = 10
- edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
- nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, gkernel, itr_max)
- total_time = np.sum(time_list)
- print('\nedit_costs:', edit_costs)
- print('\nresidual_list:', residual_list)
- print('\nedit_cost_list:', edit_cost_list)
- print('\ndistance matrix in kernel space:', dis_k_mat)
- print('\nged matrix:', ged_mat)
- print('\ntotal time:', total_time)
- print('\nnb_cost_mat:', nb_cost_mat_list[-1])
- np.savez('results/fit_distance.any_costs.gm', edit_costs=edit_costs,
- residual_list=residual_list, edit_cost_list=edit_cost_list,
- dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
- total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
-
-# # normalized distance matrices.
-# gmfile = np.load('results/fit_distance.any_costs.gm.npz')
-# edit_costs = gmfile['edit_costs']
-# residual_list = gmfile['residual_list']
-# edit_cost_list = gmfile['edit_cost_list']
-# dis_k_mat = gmfile['dis_k_mat']
-# ged_mat = gmfile['ged_mat']
-# total_time = gmfile['total_time']
-## nb_cost_mat_list = gmfile['nb_cost_mat_list']
-
- norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
- plt.imshow(norm_dis_k_mat)
- plt.colorbar()
- plt.savefig('results/norm_dis_k_mat.any_costs' + '.eps', format='eps', dpi=300)
-# plt.savefig('results/norm_dis_k_mat.any_costs' + '.png', format='png')
-# plt.show()
- plt.clf()
-
- norm_ged_mat = normalize_distance_matrix(ged_mat)
- plt.imshow(norm_ged_mat)
- plt.colorbar()
- plt.savefig('results/norm_ged_mat.any_costs' + '.eps', format='eps', dpi=300)
-# plt.savefig('results/norm_ged_mat.any_costs' + '.png', format='png')
-# plt.show()
- plt.clf()
-
- norm_diff = norm_ged_mat - norm_dis_k_mat
- plt.imshow(norm_diff)
- plt.colorbar()
- plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.eps', format='eps', dpi=300)
-# plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.png', format='png')
-# plt.show()
- plt.clf()
-# draw_count_bar(norm_diff)
-
-
-def test_cs_leq_ci_plus_cr():
- """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er
- """
- ds = {'name': 'monoterpenoides',
- 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'])
-# Gn = Gn[0:10]
- gkernel = 'untilhpathkernel'
- node_label = 'atom'
- edge_label = 'bond_type'
- itr_max = 10
- edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
- nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
- gkernel, itr_max,
- fitkernel='gaussian')
- total_time = np.sum(time_list)
- print('\nedit_costs:', edit_costs)
- print('\nresidual_list:', residual_list)
- print('\nedit_cost_list:', edit_cost_list)
- print('\ndistance matrix in kernel space:', dis_k_mat)
- print('\nged matrix:', ged_mat)
- print('\ntotal time:', total_time)
- print('\nnb_cost_mat:', nb_cost_mat_list[-1])
- np.savez('results/fit_distance.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel.gm',
- edit_costs=edit_costs,
- residual_list=residual_list, edit_cost_list=edit_cost_list,
- dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
- total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
- coef_dk=coef_dk)
-
-# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
-# 'extra_params': {}} # node/edge symb
-# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-## Gn = Gn[0:10]
-## remove_edges(Gn)
-# gkernel = 'untilhpathkernel'
-# node_label = 'atom'
-# edge_label = 'bond_type'
-# itr_max = 10
-# edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
-# nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
-# gkernel, itr_max)
-# total_time = np.sum(time_list)
-# print('\nedit_costs:', edit_costs)
-# print('\nresidual_list:', residual_list)
-# print('\nedit_cost_list:', edit_cost_list)
-# print('\ndistance matrix in kernel space:', dis_k_mat)
-# print('\nged matrix:', ged_mat)
-# print('\ntotal time:', total_time)
-# print('\nnb_cost_mat:', nb_cost_mat_list[-1])
-# np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.mutag.elabeled.uhpkernel.gm',
-# edit_costs=edit_costs,
-# residual_list=residual_list, edit_cost_list=edit_cost_list,
-# dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
-# total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk)
-
-
-# # normalized distance matrices.
-# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
-# edit_costs = gmfile['edit_costs']
-# residual_list = gmfile['residual_list']
-# edit_cost_list = gmfile['edit_cost_list']
-# dis_k_mat = gmfile['dis_k_mat']
-# ged_mat = gmfile['ged_mat']
-# total_time = gmfile['total_time']
-# nb_cost_mat_list = gmfile['nb_cost_mat_list']
-# coef_dk = gmfile['coef_dk']
-
- nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
- print(nb_consistent, nb_inconsistent, ratio_consistent)
-
-# dis_k_sub = pairwise_substitution(dis_k_mat)
-# ged_sub = pairwise_substitution(ged_mat)
-# np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.cost_leq_1en2.gm',
-# dis_k_sub=dis_k_sub, ged_sub=ged_sub)
-
-
- norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
- plt.imshow(norm_dis_k_mat)
- plt.colorbar()
- plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
- + '.eps', format='eps', dpi=300)
- plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
- + '.png', format='png')
-# plt.show()
- plt.clf()
-
- norm_ged_mat = normalize_distance_matrix(ged_mat)
- plt.imshow(norm_ged_mat)
- plt.colorbar()
- plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
- + '.eps', format='eps', dpi=300)
- plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
- + '.png', format='png')
-# plt.show()
- plt.clf()
-
- norm_diff = norm_ged_mat - norm_dis_k_mat
- plt.imshow(norm_diff)
- plt.colorbar()
- plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
- + '.eps', format='eps', dpi=300)
- plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
- + '.png', format='png')
-# plt.show()
- plt.clf()
-# draw_count_bar(norm_diff)
-
-
-def test_unfitted():
-    """Baseline with unfitted edit costs.
-    """
-    from gklearn.preimage.fitDistance import compute_geds
-    from gklearn.preimage.utils import kernel_distance_matrix
- ds = {'name': 'monoterpenoides',
- 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'])
-# Gn = Gn[0:10]
- gkernel = 'untilhpathkernel'
- node_label = 'atom'
- edge_label = 'bond_type'
-
-
-# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
-# 'extra_params': {}} # node/edge symb
-# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-## Gn = Gn[0:10]
-## remove_edges(Gn)
-# gkernel = 'marginalizedkernel'
-
- dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, gkernel=gkernel)
- ged_all, ged_mat, n_edit_operations = compute_geds(Gn, [3, 3, 1, 3, 3, 1],
- [0, 1, 2, 3, 4, 5], parallel=True)
- print('\ndistance matrix in kernel space:', dis_k_mat)
- print('\nged matrix:', ged_mat)
-# np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.gm', edit_costs=edit_costs,
-# residual_list=residual_list, edit_cost_list=edit_cost_list,
-# dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
-# total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
-
- # normalized distance matrices.
-# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en3.gm.npz')
-# edit_costs = gmfile['edit_costs']
-# residual_list = gmfile['residual_list']
-# edit_cost_list = gmfile['edit_cost_list']
-# dis_k_mat = gmfile['dis_k_mat']
-# ged_mat = gmfile['ged_mat']
-# total_time = gmfile['total_time']
-# nb_cost_mat_list = gmfile['nb_cost_mat_list']
-
- nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
- print(nb_consistent, nb_inconsistent, ratio_consistent)
-
- norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
- plt.imshow(norm_dis_k_mat)
- plt.colorbar()
- plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
- plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.png', format='png')
-# plt.show()
- plt.clf()
-
- norm_ged_mat = normalize_distance_matrix(ged_mat)
- plt.imshow(norm_ged_mat)
- plt.colorbar()
- plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
- plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.png', format='png')
-# plt.show()
- plt.clf()
-
- norm_diff = norm_ged_mat - norm_dis_k_mat
- plt.imshow(norm_diff)
- plt.colorbar()
- plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
- plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.png', format='png')
-# plt.show()
- plt.clf()
- draw_count_bar(norm_diff)
-
-
-def pairwise_substitution_consistence(mat1, mat2):
-    """Count, over every pair of upper-triangle entries, whether the sign of
-    their difference is the same in both distance matrices, and return the
-    consistent count, the inconsistent count, and the consistency ratio.
-    """
-    nb_consistent = 0
-    nb_inconsistent = 0
-    # both matrices are considered symmetric, so the upper triangles suffice;
-    # use triu for both so the entries of the two vectors align pairwise.
-    upper_tri1 = mat1[np.triu_indices_from(mat1)]
-    upper_tri2 = mat2[np.triu_indices_from(mat2)]
-    for i in tqdm(range(len(upper_tri1)), desc='computing consistency', file=sys.stdout):
- for j in range(i, len(upper_tri1)):
- if np.sign(upper_tri1[i] - upper_tri1[j]) == np.sign(upper_tri2[i] - upper_tri2[j]):
- nb_consistent += 1
- else:
- nb_inconsistent += 1
- return nb_consistent, nb_inconsistent, nb_consistent / (nb_consistent + nb_inconsistent)
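-
-# Sanity check (hypothetical): calling pairwise_substitution_consistence(m, m)
-# on any symmetric matrix makes every sign comparison agree, so the returned
-# ratio is 1.0.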
-
-
-def pairwise_substitution(mat):
- # the matrix is considered symmetric.
- upper_tri = mat[np.triu_indices_from(mat)]
- sub_list = []
- for i in tqdm(range(len(upper_tri)), desc='computing', file=sys.stdout):
- for j in range(i, len(upper_tri)):
- sub_list.append(upper_tri[i] - upper_tri[j])
- return sub_list
-
-
-def draw_count_bar(norm_diff):
- import pandas
- from collections import Counter, OrderedDict
- norm_diff_cnt = norm_diff.flatten()
- norm_diff_cnt = norm_diff_cnt * 10
- norm_diff_cnt = np.floor(norm_diff_cnt)
- norm_diff_cnt = Counter(norm_diff_cnt)
- norm_diff_cnt = OrderedDict(sorted(norm_diff_cnt.items()))
-    df = pandas.DataFrame.from_dict(norm_diff_cnt, orient='index')
-    df.plot(kind='bar')
-    plt.show() # needed to render the bar plot when run as a script.
-
-
-if __name__ == '__main__':
-# test_anycosts()
-# test_cs_leq_ci_plus_cr()
-# test_unfitted()
-
-# test_cs_leq_ci_plus_cr_python_bash_cpp()
-# median_paper_clcpc_python_bash_cpp()
-# median_paper_clcpc_python_best()
-
-# x = np.array([[1,2,3],[4,5,6],[7,8,9]])
-# xx = pairwise_substitution(x)
-
- test_update_costs()
\ No newline at end of file
diff --git a/gklearn/preimage/test_ged.py b/gklearn/preimage/test_ged.py
deleted file mode 100644
index 74e18a0..0000000
--- a/gklearn/preimage/test_ged.py
+++ /dev/null
@@ -1,520 +0,0 @@
-#export LD_LIBRARY_PATH=.:/export/home/lambertn/Documents/gedlibpy/lib/fann/:/export/home/lambertn/Documents/gedlibpy/lib/libsvm.3.22:/export/home/lambertn/Documents/gedlibpy/lib/nomad
-
-#So that "import script" finds the libraries GedLib needs
-#Equivalent to setting the LD_LIBRARY_PATH environment variable in a bash shell
-#import gedlibpy_linlin.librariesImport
-#from gedlibpy_linlin import gedlibpy
-from libs import *
-import networkx as nx
-import numpy as np
-from tqdm import tqdm
-import sys
-import os
-
-
-def test_NON_SYMBOLIC_cost():
-    """Test edit cost NON_SYMBOLIC.
-    """
-    from gklearn.preimage.ged import GED, get_nb_edit_operations_nonsymbolic
- from gklearn.preimage.test_k_closest_graphs import reform_attributes
- from gklearn.utils.graphfiles import loadDataset
-
- dataset = '../../datasets/Letter-high/Letter-high_A.txt'
- Gn, y_all = loadDataset(dataset)
-
- g1 = Gn[200]
- g2 = Gn[1780]
- reform_attributes(g1)
- reform_attributes(g2)
-
- c_vi = 0.675
- c_vr = 0.675
- c_vs = 0.75
- c_ei = 0.425
- c_er = 0.425
- c_es = 0
-
- edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
- dis, pi_forward, pi_backward = GED(g1, g2, lib='gedlibpy',
- cost='NON_SYMBOLIC', method='IPFP', edit_cost_constant=edit_cost_constant,
- algo_options='', stabilizer=None)
- n_vi, n_vr, sod_vs, n_ei, n_er, sod_es = get_nb_edit_operations_nonsymbolic(g1, g2,
- pi_forward, pi_backward)
-
- print('# of operations:', n_vi, n_vr, sod_vs, n_ei, n_er, sod_es)
-    print('c_vi, c_vr, c_vs, c_ei, c_er, c_es:', c_vi, c_vr, c_vs, c_ei, c_er, c_es)
- cost_computed = c_vi * n_vi + c_vr * n_vr + c_vs * sod_vs \
- + c_ei * n_ei + c_er * n_er + c_es * sod_es
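-    # If the binding and the operation counting are consistent, the two values
-    # printed below should match up to the solver's tolerance.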
- print('dis (cost computed by GED):', dis)
- print('cost computed by # of operations and edit cost constants:', cost_computed)
-
-
-def test_LETTER2_cost():
- """Test edit cost LETTER2.
- """
- from gklearn.preimage.ged import GED, get_nb_edit_operations_letter
- from gklearn.preimage.test_k_closest_graphs import reform_attributes
- from gklearn.utils.graphfiles import loadDataset
-
- ds = {'dataset': 'cpp_ext/data/collections/Letter.xml',
- 'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir'])
-
- g1 = Gn[200]
- g2 = Gn[1780]
- reform_attributes(g1)
- reform_attributes(g2)
-
- c_vi = 0.675
- c_vr = 0.675
- c_vs = 0.75
- c_ei = 0.425
- c_er = 0.425
-
- edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er]
- dis, pi_forward, pi_backward = GED(g1, g2, dataset='letter', lib='gedlibpy',
- cost='LETTER2', method='IPFP', edit_cost_constant=edit_cost_constant,
- algo_options='', stabilizer=None)
- n_vi, n_vr, n_vs, sod_vs, n_ei, n_er = get_nb_edit_operations_letter(g1, g2,
- pi_forward, pi_backward)
-
- print('# of operations:', n_vi, n_vr, n_vs, sod_vs, n_ei, n_er)
- print('c_vi, c_vr, c_vs, c_ei, c_er:', c_vi, c_vr, c_vs, c_ei, c_er)
- cost_computed = c_vi * n_vi + c_vr * n_vr + c_vs * sod_vs \
- + c_ei * n_ei + c_er * n_er
- print('dis (cost computed by GED):', dis)
- print('cost computed by # of operations and edit cost constants:', cost_computed)
-
-
-
-def test_get_nb_edit_operations_letter():
- """Test whether function preimage.ged.get_nb_edit_operations_letter returns
- correct numbers of edit operations. The distance/cost computed by GED
- should be the same as the cost computed by number of operations and edit
- cost constants.
- """
- from gklearn.preimage.ged import GED, get_nb_edit_operations_letter
- from gklearn.preimage.test_k_closest_graphs import reform_attributes
- from gklearn.utils.graphfiles import loadDataset
-
- ds = {'dataset': 'cpp_ext/data/collections/Letter.xml',
- 'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir'])
-
- g1 = Gn[200]
- g2 = Gn[1780]
- reform_attributes(g1)
- reform_attributes(g2)
-
- c_vir = 0.9
- c_eir = 1.7
- alpha = 0.75
-
- edit_cost_constant = [c_vir, c_eir, alpha]
- dis, pi_forward, pi_backward = GED(g1, g2, dataset='letter', lib='gedlibpy',
- cost='LETTER', method='IPFP', edit_cost_constant=edit_cost_constant,
- algo_options='', stabilizer=None)
- n_vi, n_vr, n_vs, c_vs, n_ei, n_er = get_nb_edit_operations_letter(g1, g2,
- pi_forward, pi_backward)
-
- print('# of operations and costs:', n_vi, n_vr, n_vs, c_vs, n_ei, n_er)
- print('c_vir, c_eir, alpha:', c_vir, c_eir, alpha)
- cost_computed = alpha * c_vir * (n_vi + n_vr) \
- + alpha * c_vs \
- + (1 - alpha) * c_eir * (n_ei + n_er)
- print('dis (cost computed by GED):', dis)
- print('cost computed by # of operations and edit cost constants:', cost_computed)
-
-
-def test_get_nb_edit_operations():
- """Test whether function preimage.ged.get_nb_edit_operations returns correct
- numbers of edit operations. The distance/cost computed by GED should be the
- same as the cost computed by number of operations and edit cost constants.
- """
- from gklearn.preimage.ged import GED, get_nb_edit_operations
- from gklearn.utils.graphfiles import loadDataset
- import os
-
- ds = {'dataset': '../../datasets/monoterpenoides/dataset_10+.ds',
-          'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/../../datasets/monoterpenoides/'} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'])
-
- g1 = Gn[20]
- g2 = Gn[108]
-
- c_vi = 3
- c_vr = 3
- c_vs = 1
- c_ei = 3
- c_er = 3
- c_es = 1
-
- edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
- dis, pi_forward, pi_backward = GED(g1, g2, dataset='monoterpenoides', lib='gedlibpy',
- cost='CONSTANT', method='IPFP', edit_cost_constant=edit_cost_constant,
- algo_options='', stabilizer=None)
- n_vi, n_vr, n_vs, n_ei, n_er, n_es = get_nb_edit_operations(g1, g2,
- pi_forward, pi_backward)
-
- print('# of operations and costs:', n_vi, n_vr, n_vs, n_ei, n_er, n_es)
- print('edit costs:', c_vi, c_vr, c_vs, c_ei, c_er, c_es)
- cost_computed = n_vi * c_vi + n_vr * c_vr + n_vs * c_vs \
- + n_ei * c_ei + n_er * c_er + n_es * c_es
- print('dis (cost computed by GED):', dis)
- print('cost computed by # of operations and edit cost constants:', cost_computed)
-
-
-    """Test GED computation with Python invoking the C++ code through a bash
-    command (with the updated library).
-    """
- """
- from gklearn.utils.graphfiles import loadDataset
- from gklearn.preimage.ged import GED
-
- data_dir_prefix = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/'
-# collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/monoterpenoides.xml'
- collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/monoterpenoides_3_20.xml'
-	graph_dir = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/'
-
- Gn, y = loadDataset(collection_file, extra_params=graph_dir)
-
- algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
-
- for repeat in range(0, 3):
- # Generate the result file.
- ged_filename = data_dir_prefix + 'output/test_ged/ged_mat_python_bash_' + str(repeat) + '_init40.3_20.txt'
-# runtime_filename = data_dir_prefix + 'output/test_ged/runtime_mat_python_min_' + str(repeat) + '.txt'
-
- ged_file = open(ged_filename, 'a')
-# runtime_file = open(runtime_filename, 'a')
-
- ged_mat = np.empty((len(Gn), len(Gn)))
-# runtime_mat = np.empty((len(Gn), len(Gn)))
-
- for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
- for j in range(len(Gn)):
- print(i, j)
- g1 = Gn[i]
- g2 = Gn[j]
- upper_bound, _, _ = GED(g1, g2, lib='gedlib-bash', cost='CONSTANT',
- method='IPFP',
- edit_cost_constant=[3.0, 3.0, 1.0, 3.0, 3.0, 1.0],
- algo_options=algo_options)
-# runtime = gedlibpy.get_runtime(g1, g2)
- ged_mat[i][j] = upper_bound
-# runtime_mat[i][j] = runtime
-
- # Write to files.
- ged_file.write(str(int(upper_bound)) + ' ')
-# runtime_file.write(str(runtime) + ' ')
-
- ged_file.write('\n')
-# runtime_file.write('\n')
-
- ged_file.close()
-# runtime_file.close()
-
- print('ged_mat')
- print(ged_mat)
-# print('runtime_mat:')
-# print(runtime_mat)
-
- return
-
-
-
-def test_ged_best_settings_updated():
- """Test ged computation with best settings the same as in the C++ code (with updated library).
- """
-
- data_dir_prefix = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/'
- collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/monoterpenoides.xml'
-# collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/monoterpenoides_3_20.xml'
-
-	graph_dir = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/'
-
- algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
-
- for repeat in range(0, 3):
- # Generate the result file.
- ged_filename = data_dir_prefix + 'output/test_ged/ged_mat_python_updated_' + str(repeat) + '_init40.txt'
- runtime_filename = data_dir_prefix + 'output/test_ged/runtime_mat_python_updated_' + str(repeat) + '_init40.txt'
-
- gedlibpy.restart_env()
- gedlibpy.load_GXL_graphs(graph_dir, collection_file)
- listID = gedlibpy.get_all_graph_ids()
- gedlibpy.set_edit_cost('CONSTANT', [3.0, 3.0, 1.0, 3.0, 3.0, 1.0])
- gedlibpy.init()
- gedlibpy.set_method("IPFP", algo_options)
- gedlibpy.init_method()
-
- ged_mat = np.empty((len(listID), len(listID)))
- runtime_mat = np.empty((len(listID), len(listID)))
-
- for i in tqdm(range(len(listID)), desc='computing GEDs', file=sys.stdout):
- ged_file = open(ged_filename, 'a')
- runtime_file = open(runtime_filename, 'a')
-
- for j in range(len(listID)):
- g1 = listID[i]
- g2 = listID[j]
- gedlibpy.run_method(g1, g2)
- upper_bound = gedlibpy.get_upper_bound(g1, g2)
- runtime = gedlibpy.get_runtime(g1, g2)
- ged_mat[i][j] = upper_bound
- runtime_mat[i][j] = runtime
-
- # Write to files.
- ged_file.write(str(int(upper_bound)) + ' ')
- runtime_file.write(str(runtime) + ' ')
-
- ged_file.write('\n')
- runtime_file.write('\n')
-
- ged_file.close()
- runtime_file.close()
-
- print('ged_mat')
- print(ged_mat)
- print('runtime_mat:')
- print(runtime_mat)
-
- return
-
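# Editor's sketch: the gedlibpy call sequence the tests above repeat, reduced
# to its minimal form. It assumes the module-level gedlibpy import used
# throughout this file; `ged_between_first_two` and its arguments are
# hypothetical names. Note the ordering constraints: init() must follow
# set_edit_cost(), and init_method() must follow set_method().
def ged_between_first_two(graph_dir, collection_file):
    gedlibpy.restart_env()                                # fresh C++ environment
    gedlibpy.load_GXL_graphs(graph_dir, collection_file)  # register the dataset
    ids = gedlibpy.get_all_graph_ids()
    gedlibpy.set_edit_cost('CONSTANT', [3.0, 3.0, 1.0, 3.0, 3.0, 1.0])
    gedlibpy.init()
    gedlibpy.set_method('IPFP', '')                       # default IPFP options
    gedlibpy.init_method()
    gedlibpy.run_method(ids[0], ids[1])
    return gedlibpy.get_upper_bound(ids[0], ids[1])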
-
-def test_ged_best_settings():
- """Test ged computation with best settings the same as in the C++ code.
- """
-
- data_dir_prefix = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/'
- collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/monoterpenoides.xml'
-	graph_dir = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/'
-
- algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
-
- for repeat in range(0, 3):
- # Generate the result file.
- ged_filename = data_dir_prefix + 'output/test_ged/ged_mat_python_best_settings_' + str(repeat) + '.txt'
- runtime_filename = data_dir_prefix + 'output/test_ged/runtime_mat_python_best_settings_' + str(repeat) + '.txt'
-
- ged_file = open(ged_filename, 'a')
- runtime_file = open(runtime_filename, 'a')
-
- gedlibpy.restart_env()
- gedlibpy.load_GXL_graphs(graph_dir, collection_file)
- listID = gedlibpy.get_all_graph_ids()
- gedlibpy.set_edit_cost('CONSTANT', [3.0, 3.0, 1.0, 3.0, 3.0, 1.0])
- gedlibpy.init()
- gedlibpy.set_method("IPFP", algo_options)
- gedlibpy.init_method()
-
- ged_mat = np.empty((len(listID), len(listID)))
- runtime_mat = np.empty((len(listID), len(listID)))
-
- for i in tqdm(range(len(listID)), desc='computing GEDs', file=sys.stdout):
- for j in range(len(listID)):
- g1 = listID[i]
- g2 = listID[j]
- gedlibpy.run_method(g1, g2)
- upper_bound = gedlibpy.get_upper_bound(g1, g2)
- runtime = gedlibpy.get_runtime(g1, g2)
- ged_mat[i][j] = upper_bound
- runtime_mat[i][j] = runtime
-
- # Write to files.
- ged_file.write(str(int(upper_bound)) + ' ')
- runtime_file.write(str(runtime) + ' ')
-
- ged_file.write('\n')
- runtime_file.write('\n')
-
- ged_file.close()
- runtime_file.close()
-
- print('ged_mat')
- print(ged_mat)
- print('runtime_mat:')
- print(runtime_mat)
-
- return
-
-
-
-def test_ged_default():
- """Test ged computation with default settings.
- """
-
- data_dir_prefix = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/'
- collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/monoterpenoides.xml'
-	graph_dir = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/'
-
- for repeat in range(3):
- # Generate the result file.
- ged_filename = data_dir_prefix + 'output/test_ged/ged_mat_python_default_' + str(repeat) + '.txt'
- runtime_filename = data_dir_prefix + 'output/test_ged/runtime_mat_python_default_' + str(repeat) + '.txt'
-
- ged_file = open(ged_filename, 'a')
- runtime_file = open(runtime_filename, 'a')
-
- gedlibpy.restart_env()
- gedlibpy.load_GXL_graphs(graph_dir, collection_file)
- listID = gedlibpy.get_all_graph_ids()
- gedlibpy.set_edit_cost('CONSTANT', [3.0, 3.0, 1.0, 3.0, 3.0, 1.0])
- gedlibpy.init()
- gedlibpy.set_method("IPFP", "")
- gedlibpy.init_method()
-
- ged_mat = np.empty((len(listID), len(listID)))
- runtime_mat = np.empty((len(listID), len(listID)))
-
- for i in tqdm(range(len(listID)), desc='computing GEDs', file=sys.stdout):
- for j in range(len(listID)):
- g1 = listID[i]
- g2 = listID[j]
- gedlibpy.run_method(g1, g2)
- upper_bound = gedlibpy.get_upper_bound(g1, g2)
- runtime = gedlibpy.get_runtime(g1, g2)
- ged_mat[i][j] = upper_bound
- runtime_mat[i][j] = runtime
-
- # Write to files.
- ged_file.write(str(int(upper_bound)) + ' ')
- runtime_file.write(str(runtime) + ' ')
-
- ged_file.write('\n')
- runtime_file.write('\n')
-
- ged_file.close()
- runtime_file.close()
-
- print('ged_mat')
- print(ged_mat)
- print('runtime_mat:')
- print(runtime_mat)
-
- return
-
-
-def test_ged_min():
- """Test ged computation with the "min" stabilizer.
- """
- from gklearn.utils.graphfiles import loadDataset
- from gklearn.preimage.ged import GED
-
- data_dir_prefix = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/'
- collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/monoterpenoides.xml'
-	graph_dir = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/'
-
- Gn, y = loadDataset(collection_file, extra_params=graph_dir)
-
-# algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
-
- for repeat in range(0, 3):
- # Generate the result file.
- ged_filename = data_dir_prefix + 'output/test_ged/ged_mat_python_min_' + str(repeat) + '.txt'
-# runtime_filename = data_dir_prefix + 'output/test_ged/runtime_mat_python_min_' + str(repeat) + '.txt'
-
- ged_file = open(ged_filename, 'a')
-# runtime_file = open(runtime_filename, 'a')
-
- ged_mat = np.empty((len(Gn), len(Gn)))
-# runtime_mat = np.empty((len(Gn), len(Gn)))
-
- for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
- for j in range(len(Gn)):
- g1 = Gn[i]
- g2 = Gn[j]
- upper_bound, _, _ = GED(g1, g2, lib='gedlibpy', cost='CONSTANT',
- method='IPFP',
- edit_cost_constant=[3.0, 3.0, 1.0, 3.0, 3.0, 1.0],
- stabilizer='min', repeat=10)
-# runtime = gedlibpy.get_runtime(g1, g2)
- ged_mat[i][j] = upper_bound
-# runtime_mat[i][j] = runtime
-
- # Write to files.
- ged_file.write(str(int(upper_bound)) + ' ')
-# runtime_file.write(str(runtime) + ' ')
-
- ged_file.write('\n')
-# runtime_file.write('\n')
-
- ged_file.close()
-# runtime_file.close()
-
- print('ged_mat')
- print(ged_mat)
-# print('runtime_mat:')
-# print(runtime_mat)
-
- return
-
-
-def init():
-	print("List of Edit Cost Options:")
-	for i in gedlibpy.list_of_edit_cost_options:
-		print(i)
-	print("")
-
-	print("List of Method Options:")
-	for j in gedlibpy.list_of_method_options:
-		print(j)
-	print("")
-
-	print("List of Init Options:")
-	for k in gedlibpy.list_of_init_options:
-		print(k)
-	print("")
-
-
-
-
-def convertGraph(G):
- G_new = nx.Graph()
- for nd, attrs in G.nodes(data=True):
- G_new.add_node(str(nd), chem=attrs['atom'])
- for nd1, nd2, attrs in G.edges(data=True):
- G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
-
- return G_new
-
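# Editor's note: convertGraph() renames the dataset's 'atom'/'bond_type'
# attributes to the 'chem'/'valence' names expected by the CHEM_1 edit cost,
# and stringifies node ids for gedlibpy. A minimal sketch of the mapping
# (illustrative graph, networkx 2.x attribute access):
g = nx.Graph()
g.add_node(0, atom='C')
g.add_node(1, atom='O')
g.add_edge(0, 1, bond_type='1')
g_new = convertGraph(g)
assert g_new.nodes['0']['chem'] == 'C'
assert g_new.edges['0', '1']['valence'] == '1'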
-
-def testNxGrapĥ():
- from gklearn.utils.graphfiles import loadDataset
- ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
- 'extra_params': {}} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-
- gedlibpy.restart_env()
- for graph in Gn:
- g_new = convertGraph(graph)
- gedlibpy.add_nx_graph(g_new, "")
-
- listID = gedlibpy.get_all_graph_ids()
- gedlibpy.set_edit_cost("CHEM_1")
- gedlibpy.init()
- gedlibpy.set_method("IPFP", "")
- gedlibpy.init_method()
-
- print(listID)
- g = listID[0]
- h = listID[1]
-
- gedlibpy.run_method(g, h)
-
- print("Node Map : ", gedlibpy.get_node_map(g, h))
- print("Forward map : " , gedlibpy.get_forward_map(g, h), ", Backward map : ", gedlibpy.get_backward_map(g, h))
- print ("Upper Bound = " + str(gedlibpy.get_upper_bound(g, h)) + ", Lower Bound = " + str(gedlibpy.get_lower_bound(g, h)) + ", Runtime = " + str(gedlibpy.get_runtime(g, h)))
-
-if __name__ == '__main__':
-# test_ged_default()
-# test_ged_min()
-# test_ged_best_settings()
-# test_ged_best_settings_updated()
-# test_ged_python_bash_cpp()
-# test_get_nb_edit_operations()
-# test_get_nb_edit_operations_letter()
-# test_LETTER2_cost()
- test_NON_SYMBOLIC_cost()
-
-
- #init()
- #testNxGrapĥ()
diff --git a/gklearn/preimage/test_iam.py b/gklearn/preimage/test_iam.py
deleted file mode 100644
index 5897f50..0000000
--- a/gklearn/preimage/test_iam.py
+++ /dev/null
@@ -1,964 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Thu Sep 5 15:59:00 2019
-
-@author: ljia
-"""
-
-import numpy as np
-import networkx as nx
-import matplotlib.pyplot as plt
-import time
-import random
-import os
-#from tqdm import tqdm
-
-from gklearn.utils.graphfiles import loadDataset
-#from gklearn.utils.logger2file import *
-from gklearn.preimage.iam import iam_upgraded
-from gklearn.preimage.utils import remove_edges, compute_kernel, get_same_item_indices, dis_gstar
-#from gklearn.preimage.ged import ged_median
-
-
-def test_iam_monoterpenoides_with_init40():
- gkernel = 'untilhpathkernel'
- node_label = 'atom'
- edge_label = 'bond_type'
- # unfitted edit costs.
- c_vi = 3
- c_vr = 3
- c_vs = 1
- c_ei = 3
- c_er = 3
- c_es = 1
- ite_max_iam = 50
- epsilon_iam = 0.0001
- removeNodes = False
- connected_iam = False
- # parameters for IAM function
-	ged_cost = 'CONSTANT'
- ged_method = 'IPFP'
- edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
- ged_stabilizer = None
-# ged_repeat = 50
- algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
- params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
- 'edit_cost_constant': edit_cost_constant,
- 'algo_options': algo_options,
- 'stabilizer': ged_stabilizer}
-
-
- collection_path = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/'
- graph_dir = collection_path + 'gxl/'
- y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
- repeats = 50
-
- # classify graphs according to classes.
- time_list = []
- dis_ks_min_list = []
- dis_ks_set_median_list = []
- sod_gs_list = []
- g_best = []
- sod_set_median_list = []
- sod_list_list = []
- for y in y_all:
- print('\n-------------------------------------------------------')
- print('class of y:', y)
-
- time_list.append([])
- dis_ks_min_list.append([])
- dis_ks_set_median_list.append([])
- sod_gs_list.append([])
- g_best.append([])
- sod_set_median_list.append([])
-
- for repeat in range(repeats):
- # load median set.
- collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
- Gn_median, _ = loadDataset(collection_file, extra_params=graph_dir)
- Gn_candidate = [g.copy() for g in Gn_median]
-
- time0 = time.time()
- G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \
- = iam_upgraded(Gn_median,
- Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
- epsilon=epsilon_iam, node_label=node_label, edge_label=edge_label,
- connected=connected_iam, removeNodes=removeNodes,
- params_ged=params_ged)
- time_total = time.time() - time0
- print('\ntime: ', time_total)
- time_list[-1].append(time_total)
- g_best[-1].append(G_gen_median_list[0])
- sod_set_median_list[-1].append(sod_set_median)
- print('\nsmallest sod of the set median:', sod_set_median)
- sod_gs_list[-1].append(sod_gen_median)
- print('\nsmallest sod in graph space:', sod_gen_median)
- sod_list_list.append(sod_list)
-
-# # show the best graph and save it to file.
-# print('one of the possible corresponding pre-images is')
-# nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'),
-# with_labels=True)
-## plt.show()
-# # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) +
-## plt.savefig('results/iam/paper_compare/monoter_y' + str(y_class) +
-## '_repeat' + str(repeat) + '_' + str(time.time()) +
-## '.png', format="PNG")
-# plt.clf()
-# # print(G_gen_median_list[0].nodes(data=True))
-# # print(G_gen_median_list[0].edges(data=True))
-
- print('\nsods of the set median for this class:', sod_set_median_list[-1])
- print('\nsods in graph space for this class:', sod_gs_list[-1])
-# print('\ndistance in kernel space of set median for this class:',
-# dis_ks_set_median_list[-1])
-# print('\nsmallest distances in kernel space for this class:',
-# dis_ks_min_list[-1])
- print('\ntimes for this class:', time_list[-1])
-
- sod_set_median_list[-1] = np.mean(sod_set_median_list[-1])
- sod_gs_list[-1] = np.mean(sod_gs_list[-1])
-# dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1])
-# dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1])
- time_list[-1] = np.mean(time_list[-1])
-
- print()
- print('\nmean sods of the set median for each class:', sod_set_median_list)
- print('\nmean sods in graph space for each class:', sod_gs_list)
-# print('\ndistances in kernel space of set median for each class:',
-# dis_ks_set_median_list)
-# print('\nmean smallest distances in kernel space for each class:',
-# dis_ks_min_list)
- print('\nmean times for each class:', time_list)
-
- print('\nmean sods of the set median of all:', np.mean(sod_set_median_list))
- print('\nmean sods in graph space of all:', np.mean(sod_gs_list))
-# print('\nmean distances in kernel space of set median of all:',
-# np.mean(dis_ks_set_median_list))
-# print('\nmean smallest distances in kernel space of all:',
-# np.mean(dis_ks_min_list))
- print('\nmean times of all:', np.mean(time_list))
-
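# Editor's sketch of the quantity reported above: the SOD (sum of distances)
# of a candidate median g over a median set, for any distance function d.
# `sod` is a hypothetical helper, not part of the original module.
def sod(g, graphs, d):
    """Sum of distances from g to every graph in `graphs`."""
    return sum(d(g, gi) for gi in graphs)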
-
-
-
-def test_iam_monoterpenoides():
- ds = {'name': 'monoterpenoides',
- 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'])
-# Gn = Gn[0:50]
- gkernel = 'untilhpathkernel'
- node_label = 'atom'
- edge_label = 'bond_type'
-
- # parameters for GED function from the IAM paper.
- # fitted edit costs (Gaussian).
- c_vi = 0.03620133402089074
- c_vr = 0.0417574590207099
- c_vs = 0.009992282328587499
- c_ei = 0.08293120042342755
- c_er = 0.09512220476358019
- c_es = 0.09222529696841467
-# # fitted edit costs (linear combinations).
-# c_vi = 0.1749684054238749
-# c_vr = 0.0734054228711457
-# c_vs = 0.05017781726016715
-# c_ei = 0.1869431164806936
-# c_er = 0.32055856948274
-# c_es = 0.2569469379247611
-# # unfitted edit costs.
-# c_vi = 3
-# c_vr = 3
-# c_vs = 1
-# c_ei = 3
-# c_er = 3
-# c_es = 1
- ite_max_iam = 50
- epsilon_iam = 0.001
- removeNodes = False
- connected_iam = False
- # parameters for IAM function
-	ged_cost = 'CONSTANT'
- ged_method = 'IPFP'
- edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
-# edit_cost_constant = []
- ged_stabilizer = 'min'
- ged_repeat = 50
- params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
- 'edit_cost_constant': edit_cost_constant,
- 'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
-
-	# classify graphs according to classes.
- time_list = []
- dis_ks_min_list = []
- dis_ks_set_median_list = []
- sod_gs_list = []
- g_best = []
- sod_set_median_list = []
- sod_list_list = []
- idx_dict = get_same_item_indices(y_all)
- for y_class in idx_dict:
- print('\n-------------------------------------------------------')
- print('class of y:', y_class)
- Gn_class = [Gn[i].copy() for i in idx_dict[y_class]]
-
- time_list.append([])
- dis_ks_min_list.append([])
- dis_ks_set_median_list.append([])
- sod_gs_list.append([])
- g_best.append([])
- sod_set_median_list.append([])
-
- for repeat in range(50):
- idx_rdm = random.sample(range(len(Gn_class)), 10)
- print('graphs chosen:', idx_rdm)
- Gn_median = [Gn_class[idx].copy() for idx in idx_rdm]
- Gn_candidate = [g.copy() for g in Gn_median]
-
- alpha_range = [1 / len(Gn_median)] * len(Gn_median)
- time0 = time.time()
- G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \
- = iam_upgraded(Gn_median,
- Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
- epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes,
- params_ged=params_ged)
- time_total = time.time() - time0
- print('\ntime: ', time_total)
- time_list[-1].append(time_total)
- g_best[-1].append(G_gen_median_list[0])
- sod_set_median_list[-1].append(sod_set_median)
- print('\nsmallest sod of the set median:', sod_set_median)
- sod_gs_list[-1].append(sod_gen_median)
- print('\nsmallest sod in graph space:', sod_gen_median)
- sod_list_list.append(sod_list)
-
- # show the best graph and save it to file.
- print('one of the possible corresponding pre-images is')
- nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'),
- with_labels=True)
-# plt.show()
- # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) +
-# plt.savefig('results/iam/paper_compare/monoter_y' + str(y_class) +
-# '_repeat' + str(repeat) + '_' + str(time.time()) +
-# '.png', format="PNG")
- plt.clf()
- # print(G_gen_median_list[0].nodes(data=True))
- # print(G_gen_median_list[0].edges(data=True))
-
-
- # compute distance between \psi and the set median graph.
- knew_set_median = compute_kernel(G_set_median_list + Gn_median,
- gkernel, node_label, edge_label, False)
- dhat_new_set_median_list = []
- for idx, g_tmp in enumerate(G_set_median_list):
- # @todo: the term3 below could use the one at the beginning of the function.
- dhat_new_set_median_list.append(dis_gstar(idx, range(len(G_set_median_list),
- len(G_set_median_list) + len(Gn_median) + 1),
- alpha_range, knew_set_median, withterm3=False))
-
- print('\ndistance in kernel space of set median: ', dhat_new_set_median_list[0])
- dis_ks_set_median_list[-1].append(dhat_new_set_median_list[0])
-
-
- # compute distance between \psi and the new generated graphs.
- knew = compute_kernel(G_gen_median_list + Gn_median, gkernel, node_label,
- edge_label, False)
- dhat_new_list = []
- for idx, g_tmp in enumerate(G_gen_median_list):
- # @todo: the term3 below could use the one at the beginning of the function.
- dhat_new_list.append(dis_gstar(idx, range(len(G_gen_median_list),
- len(G_gen_median_list) + len(Gn_median) + 1),
- alpha_range, knew, withterm3=False))
-
- print('\nsmallest distance in kernel space: ', dhat_new_list[0])
- dis_ks_min_list[-1].append(dhat_new_list[0])
-
-
- print('\nsods of the set median for this class:', sod_set_median_list[-1])
- print('\nsods in graph space for this class:', sod_gs_list[-1])
- print('\ndistance in kernel space of set median for this class:',
- dis_ks_set_median_list[-1])
- print('\nsmallest distances in kernel space for this class:',
- dis_ks_min_list[-1])
- print('\ntimes for this class:', time_list[-1])
-
- sod_set_median_list[-1] = np.mean(sod_set_median_list[-1])
- sod_gs_list[-1] = np.mean(sod_gs_list[-1])
- dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1])
- dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1])
- time_list[-1] = np.mean(time_list[-1])
-
- print()
- print('\nmean sods of the set median for each class:', sod_set_median_list)
- print('\nmean sods in graph space for each class:', sod_gs_list)
- print('\ndistances in kernel space of set median for each class:',
- dis_ks_set_median_list)
- print('\nmean smallest distances in kernel space for each class:',
- dis_ks_min_list)
- print('\nmean times for each class:', time_list)
-
- print('\nmean sods of the set median of all:', np.mean(sod_set_median_list))
- print('\nmean sods in graph space of all:', np.mean(sod_gs_list))
- print('\nmean distances in kernel space of set median of all:',
- np.mean(dis_ks_set_median_list))
- print('\nmean smallest distances in kernel space of all:',
- np.mean(dis_ks_min_list))
- print('\nmean times of all:', np.mean(time_list))
-
- nb_better_sods = 0
- nb_worse_sods = 0
- nb_same_sods = 0
- for sods in sod_list_list:
- if sods[0] > sods[-1]:
- nb_better_sods += 1
- elif sods[0] < sods[-1]:
- nb_worse_sods += 1
- else:
- nb_same_sods += 1
-	print('\nIn', str(len(sod_list_list)), 'sod lists,', str(nb_better_sods),
-		  'improved,', str(nb_worse_sods), 'worsened,',
-		  str(nb_same_sods), 'unchanged; fraction improved:',
-		  str(nb_better_sods / len(sod_list_list)))
-
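# Editor's sketch: the tally above classifies each IAM run by comparing the
# first (initial) and last (converged) SOD of its trace; a strictly smaller
# final value counts as an improvement. `classify_sod_traces` is a
# hypothetical helper.
def classify_sod_traces(sod_lists):
    better = sum(1 for s in sod_lists if s[0] > s[-1])
    worse = sum(1 for s in sod_lists if s[0] < s[-1])
    return better, worse, len(sod_lists) - better - worse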
-
-def test_iam_mutag():
- ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
- 'extra_params': {}} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-# Gn = Gn[0:50]
- gkernel = 'untilhpathkernel'
- node_label = 'atom'
- edge_label = 'bond_type'
-
- # parameters for GED function from the IAM paper.
- # fitted edit costs.
- c_vi = 0.03523843108436513
- c_vr = 0.03347339739350128
- c_vs = 0.06871290673612238
- c_ei = 0.08591999846720685
- c_er = 0.07962086440894103
- c_es = 0.08596855855478233
- # unfitted edit costs.
-# c_vi = 3
-# c_vr = 3
-# c_vs = 1
-# c_ei = 3
-# c_er = 3
-# c_es = 1
- ite_max_iam = 50
- epsilon_iam = 0.001
- removeNodes = False
- connected_iam = False
- # parameters for IAM function
-	ged_cost = 'CONSTANT'
- ged_method = 'IPFP'
- edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
-# edit_cost_constant = []
- ged_stabilizer = 'min'
- ged_repeat = 50
- params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
- 'edit_cost_constant': edit_cost_constant,
- 'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
-
-	# classify graphs according to classes.
- time_list = []
- dis_ks_min_list = []
- dis_ks_set_median_list = []
- sod_gs_list = []
- g_best = []
- sod_set_median_list = []
- sod_list_list = []
- idx_dict = get_same_item_indices(y_all)
- for y_class in idx_dict:
- print('\n-------------------------------------------------------')
- print('class of y:', y_class)
- Gn_class = [Gn[i].copy() for i in idx_dict[y_class]]
-
- time_list.append([])
- dis_ks_min_list.append([])
- dis_ks_set_median_list.append([])
- sod_gs_list.append([])
- g_best.append([])
- sod_set_median_list.append([])
-
- for repeat in range(50):
- idx_rdm = random.sample(range(len(Gn_class)), 10)
- print('graphs chosen:', idx_rdm)
- Gn_median = [Gn_class[idx].copy() for idx in idx_rdm]
- Gn_candidate = [g.copy() for g in Gn_median]
-
- alpha_range = [1 / len(Gn_median)] * len(Gn_median)
- time0 = time.time()
- G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \
- = iam_upgraded(Gn_median,
- Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
- epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes,
- params_ged=params_ged)
- time_total = time.time() - time0
- print('\ntime: ', time_total)
- time_list[-1].append(time_total)
- g_best[-1].append(G_gen_median_list[0])
- sod_set_median_list[-1].append(sod_set_median)
- print('\nsmallest sod of the set median:', sod_set_median)
- sod_gs_list[-1].append(sod_gen_median)
- print('\nsmallest sod in graph space:', sod_gen_median)
- sod_list_list.append(sod_list)
-
- # show the best graph and save it to file.
- print('one of the possible corresponding pre-images is')
- nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'),
- with_labels=True)
-# plt.show()
- # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) +
-# plt.savefig('results/iam/paper_compare/mutag_y' + str(y_class) +
-# '_repeat' + str(repeat) + '_' + str(time.time()) +
-# '.png', format="PNG")
- plt.clf()
- # print(G_gen_median_list[0].nodes(data=True))
- # print(G_gen_median_list[0].edges(data=True))
-
-
- # compute distance between \psi and the set median graph.
- knew_set_median = compute_kernel(G_set_median_list + Gn_median,
- gkernel, node_label, edge_label, False)
- dhat_new_set_median_list = []
- for idx, g_tmp in enumerate(G_set_median_list):
- # @todo: the term3 below could use the one at the beginning of the function.
- dhat_new_set_median_list.append(dis_gstar(idx, range(len(G_set_median_list),
- len(G_set_median_list) + len(Gn_median) + 1),
- alpha_range, knew_set_median, withterm3=False))
-
- print('\ndistance in kernel space of set median: ', dhat_new_set_median_list[0])
- dis_ks_set_median_list[-1].append(dhat_new_set_median_list[0])
-
-
- # compute distance between \psi and the new generated graphs.
- knew = compute_kernel(G_gen_median_list + Gn_median, gkernel, node_label,
- edge_label, False)
- dhat_new_list = []
- for idx, g_tmp in enumerate(G_gen_median_list):
- # @todo: the term3 below could use the one at the beginning of the function.
- dhat_new_list.append(dis_gstar(idx, range(len(G_gen_median_list),
- len(G_gen_median_list) + len(Gn_median) + 1),
- alpha_range, knew, withterm3=False))
-
- print('\nsmallest distance in kernel space: ', dhat_new_list[0])
- dis_ks_min_list[-1].append(dhat_new_list[0])
-
-
- print('\nsods of the set median for this class:', sod_set_median_list[-1])
- print('\nsods in graph space for this class:', sod_gs_list[-1])
- print('\ndistance in kernel space of set median for this class:',
- dis_ks_set_median_list[-1])
- print('\nsmallest distances in kernel space for this class:',
- dis_ks_min_list[-1])
- print('\ntimes for this class:', time_list[-1])
-
- sod_set_median_list[-1] = np.mean(sod_set_median_list[-1])
- sod_gs_list[-1] = np.mean(sod_gs_list[-1])
- dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1])
- dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1])
- time_list[-1] = np.mean(time_list[-1])
-
- print()
- print('\nmean sods of the set median for each class:', sod_set_median_list)
- print('\nmean sods in graph space for each class:', sod_gs_list)
- print('\ndistances in kernel space of set median for each class:',
- dis_ks_set_median_list)
- print('\nmean smallest distances in kernel space for each class:',
- dis_ks_min_list)
- print('\nmean times for each class:', time_list)
-
- print('\nmean sods of the set median of all:', np.mean(sod_set_median_list))
- print('\nmean sods in graph space of all:', np.mean(sod_gs_list))
- print('\nmean distances in kernel space of set median of all:',
- np.mean(dis_ks_set_median_list))
- print('\nmean smallest distances in kernel space of all:',
- np.mean(dis_ks_min_list))
- print('\nmean times of all:', np.mean(time_list))
-
- nb_better_sods = 0
- nb_worse_sods = 0
- nb_same_sods = 0
- for sods in sod_list_list:
- if sods[0] > sods[-1]:
- nb_better_sods += 1
- elif sods[0] < sods[-1]:
- nb_worse_sods += 1
- else:
- nb_same_sods += 1
-	print('\nIn', str(len(sod_list_list)), 'sod lists,', str(nb_better_sods),
-		  'improved,', str(nb_worse_sods), 'worsened,',
-		  str(nb_same_sods), 'unchanged; fraction improved:',
-		  str(nb_better_sods / len(sod_list_list)))
-
-
-###############################################################################
-# tests on different numbers of median-sets.
-
-def test_iam_median_nb():
-
- ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
- 'extra_params': {}} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-# Gn = Gn[0:50]
- remove_edges(Gn)
- gkernel = 'marginalizedkernel'
-
-	lmbda = 0.03 # termination probability
-# # parameters for GED function
-# c_vi = 0.037
-# c_vr = 0.038
-# c_vs = 0.075
-# c_ei = 0.001
-# c_er = 0.001
-# c_es = 0.0
-# ite_max_iam = 50
-# epsilon_iam = 0.001
-# removeNodes = False
-# connected_iam = False
-# # parameters for IAM function
-# ged_cost = 'CONSTANT'
-# ged_method = 'IPFP'
-# edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
-# ged_stabilizer = 'min'
-# ged_repeat = 50
-# params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
-# 'edit_cost_constant': edit_cost_constant,
-# 'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
-
- # parameters for GED function
- c_vi = 4
- c_vr = 4
- c_vs = 2
- c_ei = 1
- c_er = 1
- c_es = 1
- ite_max_iam = 50
- epsilon_iam = 0.001
- removeNodes = False
- connected_iam = False
- # parameters for IAM function
- ged_cost = 'CHEM_1'
- ged_method = 'IPFP'
- edit_cost_constant = []
- ged_stabilizer = 'min'
- ged_repeat = 50
- params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
- 'edit_cost_constant': edit_cost_constant,
- 'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
-
- # find out all the graphs classified to positive group 1.
- idx_dict = get_same_item_indices(y_all)
- Gn = [Gn[i] for i in idx_dict[1]]
-
-	# number of graphs whose median we want to compute.
-# nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
- nb_median_range = [len(Gn)]
-
-# # compute Gram matrix.
-# time0 = time.time()
-# km = compute_kernel(Gn, gkernel, True)
-# time_km = time.time() - time0
-# # write Gram matrix to file.
-# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
-
- time_list = []
- dis_ks_min_list = []
- sod_gs_list = []
-# sod_gs_min_list = []
-# nb_updated_list = []
-# nb_updated_k_list = []
- g_best = []
- for nb_median in nb_median_range:
- print('\n-------------------------------------------------------')
- print('number of median graphs =', nb_median)
- random.seed(1)
- idx_rdm = random.sample(range(len(Gn)), nb_median)
- print('graphs chosen:', idx_rdm)
- Gn_median = [Gn[idx].copy() for idx in idx_rdm]
- Gn_candidate = [g.copy() for g in Gn]
-
-# for g in Gn_median:
-# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
-## plt.savefig("results/preimage_mix/mutag.png", format="PNG")
-# plt.show()
-# plt.clf()
-
- ###################################################################
-# gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
-# km_tmp = gmfile['gm']
-# time_km = gmfile['gmtime']
-# # modify mixed gram matrix.
-# km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
-# for i in range(len(Gn)):
-# for j in range(i, len(Gn)):
-# km[i, j] = km_tmp[i, j]
-# km[j, i] = km[i, j]
-# for i in range(len(Gn)):
-# for j, idx in enumerate(idx_rdm):
-# km[i, len(Gn) + j] = km[i, idx]
-# km[len(Gn) + j, i] = km[i, idx]
-# for i, idx1 in enumerate(idx_rdm):
-# for j, idx2 in enumerate(idx_rdm):
-# km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]
-
- ###################################################################
- alpha_range = [1 / nb_median] * nb_median
- time0 = time.time()
- ghat_new_list, sod_min = iam_upgraded(Gn_median, Gn_candidate,
- c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
- epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes,
- params_ged=params_ged)
-
- time_total = time.time() - time0
- print('\ntime: ', time_total)
- time_list.append(time_total)
-
- # compute distance between \psi and the new generated graphs.
- knew = compute_kernel(ghat_new_list + Gn_median, gkernel, False)
- dhat_new_list = []
- for idx, g_tmp in enumerate(ghat_new_list):
- # @todo: the term3 below could use the one at the beginning of the function.
- dhat_new_list.append(dis_gstar(idx, range(len(ghat_new_list),
- len(ghat_new_list) + len(Gn_median) + 1),
- alpha_range, knew, withterm3=False))
-
- print('\nsmallest distance in kernel space: ', dhat_new_list[0])
- dis_ks_min_list.append(dhat_new_list[0])
- g_best.append(ghat_new_list[0])
-
- # show the best graph and save it to file.
-# print('the shortest distance is', dhat)
- print('one of the possible corresponding pre-images is')
- nx.draw(ghat_new_list[0], labels=nx.get_node_attributes(ghat_new_list[0], 'atom'),
- with_labels=True)
- plt.show()
-# plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) +
- plt.savefig('results/iam/mutag_median_unfit2.nb' + str(nb_median) +
- '.png', format="PNG")
- plt.clf()
-# print(ghat_list[0].nodes(data=True))
-# print(ghat_list[0].edges(data=True))
-
- sod_gs_list.append(sod_min)
-# sod_gs_min_list.append(np.min(sod_min))
- print('\nsmallest sod in graph space: ', sod_min)
-
- print('\nsods in graph space: ', sod_gs_list)
-# print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list)
- print('\nsmallest distance in kernel space for each set of median graphs: ',
- dis_ks_min_list)
-# print('\nnumber of updates of the best graph for each set of median graphs by IAM: ',
-# nb_updated_list)
-# print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ',
-# nb_updated_k_list)
- print('\ntimes:', time_list)
-
-
-def test_iam_letter_h():
- from median import draw_Letter_graph
- ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
- 'extra_params': {}} # node nsymb
-# ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt',
-# 'extra_params': {}} # node nsymb
-# Gn = Gn[0:50]
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
- gkernel = 'structuralspkernel'
-
- # parameters for GED function from the IAM paper.
- c_vi = 3
- c_vr = 3
- c_vs = 1
- c_ei = 3
- c_er = 3
- c_es = 1
- ite_max_iam = 50
- epsilon_iam = 0.001
- removeNodes = False
- connected_iam = False
- # parameters for IAM function
-# ged_cost = 'CONSTANT'
- ged_cost = 'LETTER'
- ged_method = 'IPFP'
-# edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
- edit_cost_constant = []
- ged_stabilizer = 'min'
- ged_repeat = 50
- params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
- 'edit_cost_constant': edit_cost_constant,
- 'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
-
- # classify graphs according to letters.
- time_list = []
- dis_ks_min_list = []
- sod_gs_list = []
- g_best = []
- sod_set_median_list = []
- idx_dict = get_same_item_indices(y_all)
- for letter in idx_dict:
- print('\n-------------------------------------------------------')
- print('letter', letter)
- Gn_let = [Gn[i].copy() for i in idx_dict[letter]]
-
- time_list.append([])
- dis_ks_min_list.append([])
- sod_gs_list.append([])
- g_best.append([])
- sod_set_median_list.append([])
-
- for repeat in range(50):
- idx_rdm = random.sample(range(len(Gn_let)), 50)
- print('graphs chosen:', idx_rdm)
- Gn_median = [Gn_let[idx].copy() for idx in idx_rdm]
- Gn_candidate = [g.copy() for g in Gn_median]
-
- alpha_range = [1 / len(Gn_median)] * len(Gn_median)
- time0 = time.time()
- ghat_new_list, sod_min, sod_set_median = iam_upgraded(Gn_median,
- Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
- epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes,
- params_ged=params_ged)
- time_total = time.time() - time0
- print('\ntime: ', time_total)
- time_list[-1].append(time_total)
- g_best[-1].append(ghat_new_list[0])
- sod_set_median_list[-1].append(sod_set_median)
- print('\nsmallest sod of the set median:', sod_set_median)
- sod_gs_list[-1].append(sod_min)
- print('\nsmallest sod in graph space:', sod_min)
-
- # show the best graph and save it to file.
- print('one of the possible corresponding pre-images is')
- draw_Letter_graph(ghat_new_list[0], savepath='results/iam/paper_compare/')
-
- # compute distance between \psi and the new generated graphs.
- knew = compute_kernel(ghat_new_list + Gn_median, gkernel, False)
- dhat_new_list = []
- for idx, g_tmp in enumerate(ghat_new_list):
- # @todo: the term3 below could use the one at the beginning of the function.
- dhat_new_list.append(dis_gstar(idx, range(len(ghat_new_list),
- len(ghat_new_list) + len(Gn_median) + 1),
- alpha_range, knew, withterm3=False))
-
- print('\nsmallest distance in kernel space: ', dhat_new_list[0])
- dis_ks_min_list[-1].append(dhat_new_list[0])
-
- print('\nsods of the set median for this letter:', sod_set_median_list[-1])
- print('\nsods in graph space for this letter:', sod_gs_list[-1])
- print('\nsmallest distances in kernel space for this letter:',
- dis_ks_min_list[-1])
- print('\ntimes for this letter:', time_list[-1])
-
- sod_set_median_list[-1] = np.mean(sod_set_median_list[-1])
- sod_gs_list[-1] = np.mean(sod_gs_list[-1])
- dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1])
- time_list[-1] = np.mean(time_list[-1])
-
- print('\nmean sods of the set median for each letter:', sod_set_median_list)
- print('\nmean sods in graph space for each letter:', sod_gs_list)
- print('\nmean smallest distances in kernel space for each letter:',
- dis_ks_min_list)
- print('\nmean times for each letter:', time_list)
-
- print('\nmean sods of the set median of all:', np.mean(sod_set_median_list))
- print('\nmean sods in graph space of all:', np.mean(sod_gs_list))
- print('\nmean smallest distances in kernel space of all:',
- np.mean(dis_ks_min_list))
- print('\nmean times of all:', np.mean(time_list))
-
-
-
-
-
-
-
-
-
-def test_iam_fitdistance():
-
- ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
- 'extra_params': {}} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-# Gn = Gn[0:50]
-# remove_edges(Gn)
- gkernel = 'marginalizedkernel'
- node_label = 'atom'
- edge_label = 'bond_type'
-
-#	lmbda = 0.03 # termination probability
-# # parameters for GED function
-# c_vi = 0.037
-# c_vr = 0.038
-# c_vs = 0.075
-# c_ei = 0.001
-# c_er = 0.001
-# c_es = 0.0
-# ite_max_iam = 50
-# epsilon_iam = 0.001
-# removeNodes = False
-# connected_iam = False
-# # parameters for IAM function
-# ged_cost = 'CONSTANT'
-# ged_method = 'IPFP'
-# edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
-# ged_stabilizer = 'min'
-# ged_repeat = 50
-# params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
-# 'edit_cost_constant': edit_cost_constant,
-# 'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
-
- # parameters for GED function
- c_vi = 4
- c_vr = 4
- c_vs = 2
- c_ei = 1
- c_er = 1
- c_es = 1
- ite_max_iam = 50
- epsilon_iam = 0.001
- removeNodes = False
- connected_iam = False
- # parameters for IAM function
- ged_cost = 'CHEM_1'
- ged_method = 'IPFP'
- edit_cost_constant = []
- ged_stabilizer = 'min'
- ged_repeat = 50
- params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
- 'edit_cost_constant': edit_cost_constant,
- 'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
-
- # find out all the graphs classified to positive group 1.
- idx_dict = get_same_item_indices(y_all)
- Gn = [Gn[i] for i in idx_dict[1]]
-
-	# number of graphs whose median we want to compute.
-# nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
- nb_median_range = [10]
-
-# # compute Gram matrix.
-# time0 = time.time()
-# km = compute_kernel(Gn, gkernel, True)
-# time_km = time.time() - time0
-# # write Gram matrix to file.
-# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
-
- time_list = []
- dis_ks_min_list = []
- dis_ks_gen_median_list = []
- sod_gs_list = []
-# sod_gs_min_list = []
-# nb_updated_list = []
-# nb_updated_k_list = []
- g_best = []
- for nb_median in nb_median_range:
- print('\n-------------------------------------------------------')
- print('number of median graphs =', nb_median)
- random.seed(1)
- idx_rdm = random.sample(range(len(Gn)), nb_median)
- print('graphs chosen:', idx_rdm)
- Gn_median = [Gn[idx].copy() for idx in idx_rdm]
- Gn_candidate = [g.copy() for g in Gn_median]
-
-# for g in Gn_median:
-# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
-## plt.savefig("results/preimage_mix/mutag.png", format="PNG")
-# plt.show()
-# plt.clf()
-
- ###################################################################
-# gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
-# km_tmp = gmfile['gm']
-# time_km = gmfile['gmtime']
-# # modify mixed gram matrix.
-# km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
-# for i in range(len(Gn)):
-# for j in range(i, len(Gn)):
-# km[i, j] = km_tmp[i, j]
-# km[j, i] = km[i, j]
-# for i in range(len(Gn)):
-# for j, idx in enumerate(idx_rdm):
-# km[i, len(Gn) + j] = km[i, idx]
-# km[len(Gn) + j, i] = km[i, idx]
-# for i, idx1 in enumerate(idx_rdm):
-# for j, idx2 in enumerate(idx_rdm):
-# km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]
-
- ###################################################################
- alpha_range = [1 / nb_median] * nb_median
- time0 = time.time()
- G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \
- = iam_upgraded(Gn_median, Gn_candidate,
- c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
- epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes,
- params_ged=params_ged)
-
- time_total = time.time() - time0
- print('\ntime: ', time_total)
- time_list.append(time_total)
-
- # compute distance between \psi and the new generated graphs.
- knew = compute_kernel(G_gen_median_list + Gn_median, gkernel, node_label,
- edge_label, False)
- dhat_new_list = []
- for idx, g_tmp in enumerate(G_gen_median_list):
- # @todo: the term3 below could use the one at the beginning of the function.
- dhat_new_list.append(dis_gstar(idx, range(len(G_gen_median_list),
- len(G_gen_median_list) + len(Gn_median) + 1),
- alpha_range, knew, withterm3=False))
-
- print('\nsmallest distance in kernel space: ', dhat_new_list[0])
- dis_ks_min_list.append(dhat_new_list[0])
- g_best.append(G_gen_median_list[0])
-
- # show the best graph and save it to file.
-# print('the shortest distance is', dhat)
- print('one of the possible corresponding pre-images is')
- nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'),
- with_labels=True)
- plt.show()
-# plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) +
-# plt.savefig('results/iam/mutag_median_unfit2.nb' + str(nb_median) +
-# '.png', format="PNG")
- plt.clf()
-# print(ghat_list[0].nodes(data=True))
-# print(ghat_list[0].edges(data=True))
-
- sod_gs_list.append(sod_gen_median)
-# sod_gs_min_list.append(np.min(sod_gen_median))
- print('\nsmallest sod in graph space: ', sod_gen_median)
- print('\nsmallest sod of set median in graph space: ', sod_set_median)
-
- print('\nsods in graph space: ', sod_gs_list)
-# print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list)
- print('\nsmallest distance in kernel space for each set of median graphs: ',
- dis_ks_min_list)
-# print('\nnumber of updates of the best graph for each set of median graphs by IAM: ',
-# nb_updated_list)
-# print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ',
-# nb_updated_k_list)
- print('\ntimes:', time_list)
-
-
-
-
-
-###############################################################################
-
-
-if __name__ == '__main__':
-###############################################################################
-# tests on different numbers of median-sets.
-# test_iam_median_nb()
-# test_iam_letter_h()
-# test_iam_monoterpenoides()
-# test_iam_mutag()
-
-# test_iam_fitdistance()
-# print("test log")
-
- test_iam_monoterpenoides_with_init40()
diff --git a/gklearn/preimage/test_k_closest_graphs.py b/gklearn/preimage/test_k_closest_graphs.py
deleted file mode 100644
index 56971c7..0000000
--- a/gklearn/preimage/test_k_closest_graphs.py
+++ /dev/null
@@ -1,462 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Mon Dec 16 11:53:54 2019
-
-@author: ljia
-"""
-import numpy as np
-import math
-import networkx as nx
-import matplotlib.pyplot as plt
-import time
-import random
-import os
-import sys
-from tqdm import tqdm
-from itertools import combinations, islice
-import multiprocessing
-from multiprocessing import Pool
-from functools import partial
-
-from gklearn.utils.graphfiles import loadDataset, loadGXL
-#from gklearn.utils.logger2file import *
-from gklearn.preimage.iam import iam_upgraded, iam_bash
-from gklearn.preimage.utils import compute_kernel, dis_gstar, kernel_distance_matrix
-from gklearn.preimage.fitDistance import fit_GED_to_kernel_distance
-#from gklearn.preimage.ged import ged_median
-
-
-def fit_edit_cost_constants(fit_method, edit_cost_name,
- edit_cost_constants=None, initial_solutions=1,
- Gn_median=None, node_label=None, edge_label=None,
- gkernel=None, dataset=None, init_ecc=None,
- Gn=None, Kmatrix_median=None):
- """fit edit cost constants.
- """
- if fit_method == 'random': # random
- if edit_cost_name == 'LETTER':
- edit_cost_constants = random.sample(range(1, 10), 3)
- edit_cost_constants = [item * 0.1 for item in edit_cost_constants]
- elif edit_cost_name == 'LETTER2':
- random.seed(time.time())
- edit_cost_constants = random.sample(range(1, 10), 5)
-# edit_cost_constants = [item * 0.1 for item in edit_cost_constants]
- elif edit_cost_name == 'NON_SYMBOLIC':
- edit_cost_constants = random.sample(range(1, 10), 6)
- if Gn_median[0].graph['node_attrs'] == []:
- edit_cost_constants[2] = 0
- if Gn_median[0].graph['edge_attrs'] == []:
- edit_cost_constants[5] = 0
- else:
- edit_cost_constants = random.sample(range(1, 10), 6)
- print('edit cost constants used:', edit_cost_constants)
- elif fit_method == 'expert': # expert
- if init_ecc is None:
- if edit_cost_name == 'LETTER':
- edit_cost_constants = [0.9, 1.7, 0.75]
- elif edit_cost_name == 'LETTER2':
- edit_cost_constants = [0.675, 0.675, 0.75, 0.425, 0.425]
- else:
- edit_cost_constants = [3, 3, 1, 3, 3, 1]
- else:
- edit_cost_constants = init_ecc
- elif fit_method == 'k-graphs':
- itr_max = 6
- if init_ecc is None:
- if edit_cost_name == 'LETTER':
- init_costs = [0.9, 1.7, 0.75]
- elif edit_cost_name == 'LETTER2':
- init_costs = [0.675, 0.675, 0.75, 0.425, 0.425]
- elif edit_cost_name == 'NON_SYMBOLIC':
- init_costs = [0, 0, 1, 1, 1, 0]
- if Gn_median[0].graph['node_attrs'] == []:
- init_costs[2] = 0
- if Gn_median[0].graph['edge_attrs'] == []:
- init_costs[5] = 0
- else:
- init_costs = [3, 3, 1, 3, 3, 1]
- else:
- init_costs = init_ecc
- algo_options = '--threads 1 --initial-solutions ' \
- + str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1'
- params_ged = {'lib': 'gedlibpy', 'cost': edit_cost_name, 'method': 'IPFP',
- 'algo_options': algo_options, 'stabilizer': None}
- # fit on k-graph subset
- edit_cost_constants, _, _, _, _, _, _ = fit_GED_to_kernel_distance(Gn_median,
- node_label, edge_label, gkernel, itr_max, params_ged=params_ged,
- init_costs=init_costs, dataset=dataset, Kmatrix=Kmatrix_median,
- parallel=True)
- elif fit_method == 'whole-dataset':
- itr_max = 6
- if init_ecc is None:
- if edit_cost_name == 'LETTER':
- init_costs = [0.9, 1.7, 0.75]
- elif edit_cost_name == 'LETTER2':
- init_costs = [0.675, 0.675, 0.75, 0.425, 0.425]
- else:
- init_costs = [3, 3, 1, 3, 3, 1]
- else:
- init_costs = init_ecc
- algo_options = '--threads 1 --initial-solutions ' \
- + str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1'
- params_ged = {'lib': 'gedlibpy', 'cost': edit_cost_name, 'method': 'IPFP',
- 'algo_options': algo_options, 'stabilizer': None}
- # fit on all subset
- edit_cost_constants, _, _, _, _, _, _ = fit_GED_to_kernel_distance(Gn,
- node_label, edge_label, gkernel, itr_max, params_ged=params_ged,
- init_costs=init_costs, dataset=dataset, parallel=True)
- elif fit_method == 'precomputed':
- pass
-
- return edit_cost_constants
-
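# Editor's usage sketch: the 'expert' strategy needs no data, so it is the
# cheapest way to exercise the dispatch above (values from the LETTER branch).
ecc = fit_edit_cost_constants('expert', 'LETTER')  # -> [0.9, 1.7, 0.75]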
-
-def compute_distances_to_true_median(Gn_median, fname_sm, fname_gm,
- gkernel, edit_cost_name,
- Kmatrix_median=None):
- # reform graphs.
- set_median = loadGXL(fname_sm)
- gen_median = loadGXL(fname_gm)
-# print(gen_median.nodes(data=True))
-# print(gen_median.edges(data=True))
-	if edit_cost_name in ('LETTER', 'LETTER2', 'NON_SYMBOLIC'):
-# dataset == 'Fingerprint':
-# for g in Gn_median:
-# reform_attributes(g)
- reform_attributes(set_median, Gn_median[0].graph['node_attrs'],
- Gn_median[0].graph['edge_attrs'])
- reform_attributes(gen_median, Gn_median[0].graph['node_attrs'],
- Gn_median[0].graph['edge_attrs'])
-
-	if edit_cost_name in ('LETTER', 'LETTER2', 'NON_SYMBOLIC'):
- node_label = None
- edge_label = None
- else:
- node_label = 'chem'
- edge_label = 'valence'
-
- # compute Gram matrix for median set.
- if Kmatrix_median is None:
- Kmatrix_median = compute_kernel(Gn_median, gkernel, node_label, edge_label, False)
-
- # compute distance in kernel space for set median.
- kernel_sm = []
- for G_median in Gn_median:
- km_tmp = compute_kernel([set_median, G_median], gkernel, node_label, edge_label, False)
- kernel_sm.append(km_tmp[0, 1])
- Kmatrix_sm = np.concatenate((np.array([kernel_sm]), np.copy(Kmatrix_median)), axis=0)
- Kmatrix_sm = np.concatenate((np.array([[km_tmp[0, 0]] + kernel_sm]).T, Kmatrix_sm), axis=1)
-# Kmatrix_sm = compute_kernel([set_median] + Gn_median, gkernel,
-# node_label, edge_label, False)
- dis_k_sm = dis_gstar(0, range(1, 1+len(Gn_median)),
- [1 / len(Gn_median)] * len(Gn_median), Kmatrix_sm, withterm3=False)
-# print(gen_median.nodes(data=True))
-# print(gen_median.edges(data=True))
-# print(set_median.nodes(data=True))
-# print(set_median.edges(data=True))
-
- # compute distance in kernel space for generalized median.
- kernel_gm = []
- for G_median in Gn_median:
- km_tmp = compute_kernel([gen_median, G_median], gkernel, node_label, edge_label, False)
- kernel_gm.append(km_tmp[0, 1])
- Kmatrix_gm = np.concatenate((np.array([kernel_gm]), np.copy(Kmatrix_median)), axis=0)
- Kmatrix_gm = np.concatenate((np.array([[km_tmp[0, 0]] + kernel_gm]).T, Kmatrix_gm), axis=1)
-# Kmatrix_gm = compute_kernel([gen_median] + Gn_median, gkernel,
-# node_label, edge_label, False)
- dis_k_gm = dis_gstar(0, range(1, 1+len(Gn_median)),
- [1 / len(Gn_median)] * len(Gn_median), Kmatrix_gm, withterm3=False)
-
- # compute distance in kernel space for each graph in median set.
- dis_k_gi = []
- for idx in range(len(Gn_median)):
- dis_k_gi.append(dis_gstar(idx+1, range(1, 1+len(Gn_median)),
- [1 / len(Gn_median)] * len(Gn_median), Kmatrix_gm, withterm3=False))
-
- print('dis_k_sm:', dis_k_sm)
- print('dis_k_gm:', dis_k_gm)
- print('dis_k_gi:', dis_k_gi)
- idx_dis_k_gi_min = np.argmin(dis_k_gi)
- dis_k_gi_min = dis_k_gi[idx_dis_k_gi_min]
- print('min dis_k_gi:', dis_k_gi_min)
-
- return dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min
-
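# Editor's sketch of the kernel-space distance computed by dis_gstar above:
# with uniform weights alpha_i = 1/n and graph g stored at index 0 of the
# augmented Gram matrix K, the squared distance to the mean of the median set
# is K[0,0] - (2/n)*sum_i K[0,i] + (1/n^2)*sum_{i,j} K[i,j]; the last term is
# the "term3" that the calls above omit. `dis_to_mean` is a hypothetical
# stand-in, not the library function.
import numpy as np

def dis_to_mean(K, withterm3=True):
    n = K.shape[0] - 1
    alpha = np.full(n, 1.0 / n)
    d2 = K[0, 0] - 2 * alpha @ K[0, 1:]
    if withterm3:
        d2 += alpha @ K[1:, 1:] @ alpha
    return np.sqrt(max(d2, 0.0))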
-
-def median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, fit_method,
- graph_dir=None, initial_solutions=1,
- edit_cost_constants=None, group_min=None,
- dataset=None, edit_cost_name=None, init_ecc=None,
- Kmatrix=None, parallel=True):
-# dataset = dataset.lower()
-
-# # compute distances in kernel space.
-# dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label,
-# Kmatrix=None, gkernel=gkernel)
-# # ged.
-# gmfile = np.load('results/test_k_closest_graphs/ged_mat.fit_on_whole_dataset.with_medians.gm.npz')
-# ged_mat = gmfile['ged_mat']
-# dis_mat = ged_mat[0:len(Gn), 0:len(Gn)]
-
-# # choose k closest graphs
-# time0 = time.time()
-# sod_ks_min, group_min = get_closest_k_graphs(dis_mat, k, parallel)
-# time_spent = time.time() - time0
-# print('closest graphs:', sod_ks_min, group_min)
-# print('time spent:', time_spent)
-# group_min = (12, 13, 22, 29) # closest w.r.t path kernel
-# group_min = (77, 85, 160, 171) # closest w.r.t ged
-# group_min = (0,1,2,3,4,5,6,7,8,9,10,11) # closest w.r.t treelet kernel
- Gn_median = [Gn[g].copy() for g in group_min]
- if Kmatrix is not None:
- Kmatrix_median = np.copy(Kmatrix[group_min,:])
- Kmatrix_median = Kmatrix_median[:,group_min]
- else:
- Kmatrix_median = None
-
-
- # 1. fit edit cost constants.
- time0 = time.time()
- edit_cost_constants = fit_edit_cost_constants(fit_method, edit_cost_name,
- edit_cost_constants=edit_cost_constants, initial_solutions=initial_solutions,
- Gn_median=Gn_median, node_label=node_label, edge_label=edge_label,
- gkernel=gkernel, dataset=dataset, init_ecc=init_ecc,
- Gn=Gn, Kmatrix_median=Kmatrix_median)
- time_fitting = time.time() - time0
-
-
- # 2. compute set median and gen median using IAM (C++ through bash).
- print('\nstart computing set median and gen median using IAM (C++ through bash)...\n')
- group_fnames = [Gn[g].graph['filename'] for g in group_min]
- time0 = time.time()
- sod_sm, sod_gm, fname_sm, fname_gm = iam_bash(group_fnames, edit_cost_constants,
- cost=edit_cost_name, initial_solutions=initial_solutions,
- graph_dir=graph_dir, dataset=dataset)
- time_generating = time.time() - time0
- print('\nmedians computed.\n')
-
-
- # 3. compute distances to the true median.
- print('\nstart computing distances to true median....\n')
- Gn_median = [Gn[g].copy() for g in group_min]
- dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min = \
- compute_distances_to_true_median(Gn_median, fname_sm, fname_gm,
- gkernel, edit_cost_name,
- Kmatrix_median=Kmatrix_median)
- idx_dis_k_gi_min = group_min[idx_dis_k_gi_min]
- print('index min dis_k_gi:', idx_dis_k_gi_min)
- print('sod_sm:', sod_sm)
- print('sod_gm:', sod_gm)
-
- # collect return values.
- return (sod_sm, sod_gm), \
- (dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min), \
- (time_fitting, time_generating)
-
-
-def reform_attributes(G, na_names=[], ea_names=[]):
-	if na_names:
-		for node in G.nodes:
-			G.nodes[node]['attributes'] = [G.nodes[node][a_name] for a_name in na_names]
-	if ea_names:
-		for edge in G.edges:
-			G.edges[edge]['attributes'] = [G.edges[edge][a_name] for a_name in ea_names]
-
-
-def get_closest_k_graphs(dis_mat, k, parallel):
- k_graph_groups = combinations(range(0, len(dis_mat)), k)
- sod_ks_min = np.inf
- if parallel:
- len_combination = get_combination_length(len(dis_mat), k)
- len_itr_max = int(len_combination if len_combination < 1e7 else 1e7)
-# pos_cur = 0
- graph_groups_slices = split_iterable(k_graph_groups, len_itr_max, len_combination)
- for graph_groups_cur in graph_groups_slices:
-# while True:
-# graph_groups_cur = islice(k_graph_groups, pos_cur, pos_cur + len_itr_max)
- graph_groups_cur_list = list(graph_groups_cur)
- print('current position:', graph_groups_cur_list[0])
- len_itr_cur = len(graph_groups_cur_list)
-# if len_itr_cur < len_itr_max:
-# break
-
- itr = zip(graph_groups_cur_list, range(0, len_itr_cur))
- sod_k_list = np.empty(len_itr_cur)
- graphs_list = [None] * len_itr_cur
-			n_jobs = multiprocessing.cpu_count()
-			chunksize = int(len_itr_max / n_jobs + 1)
- def init_worker(dis_mat_toshare):
- global G_dis_mat
- G_dis_mat = dis_mat_toshare
- pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(dis_mat,))
-# iterator = tqdm(pool.imap_unordered(_get_closest_k_graphs_parallel,
-# itr, chunksize),
-# desc='Choosing k closest graphs', file=sys.stdout)
- iterator = pool.imap_unordered(_get_closest_k_graphs_parallel, itr, chunksize)
- for graphs, i, sod_ks in iterator:
- sod_k_list[i] = sod_ks
- graphs_list[i] = graphs
- pool.close()
- pool.join()
-
- arg_min = np.argmin(sod_k_list)
- sod_ks_cur = sod_k_list[arg_min]
- group_cur = graphs_list[arg_min]
- if sod_ks_cur < sod_ks_min:
- sod_ks_min = sod_ks_cur
- group_min = group_cur
- print('get closer graphs:', sod_ks_min, group_min)
- else:
- for items in tqdm(k_graph_groups, desc='Choosing k closest graphs', file=sys.stdout):
- k_graph_pairs = combinations(items, 2)
- sod_ks = 0
- for i1, i2 in k_graph_pairs:
- sod_ks += dis_mat[i1, i2]
- if sod_ks < sod_ks_min:
- sod_ks_min = sod_ks
- group_min = items
- print('get closer graphs:', sod_ks_min, group_min)
-
- return sod_ks_min, group_min
-
-
-def _get_closest_k_graphs_parallel(itr):
- k_graph_pairs = combinations(itr[0], 2)
- sod_ks = 0
- for i1, i2 in k_graph_pairs:
- sod_ks += G_dis_mat[i1, i2]
-
- return itr[0], itr[1], sod_ks
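-# _get_closest_k_graphs_parallel reads G_dis_mat, which init_worker above
-# binds as a module-level global in each worker process, so the distance
-# matrix is shared with the workers once instead of being shipped with every
-# task. A self-contained sketch of the same initializer pattern (the names
-# below are illustrative only, not part of this library):
-#
-#     from multiprocessing import Pool
-#
-#     def _init(shared):
-#         global _SHARED
-#         _SHARED = shared
-#
-#     def _double(i):
-#         return _SHARED[i] * 2
-#
-#     if __name__ == '__main__':
-#         with Pool(initializer=_init, initargs=([1, 2, 3],)) as pool:
-#             print(pool.map(_double, range(3)))  # -> [2, 4, 6]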
-
-
-def split_iterable(iterable, n, len_iter):
- it = iter(iterable)
- for i in range(0, len_iter, n):
- piece = islice(it, n)
- yield piece
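-# split_iterable lazily cuts an iterator into islice chunks of at most n
-# items; it is used above to bound how many k-combinations are materialized
-# at once. For example:
-#
-#     list(map(list, split_iterable(range(7), 3, 7)))
-#     # -> [[0, 1, 2], [3, 4, 5], [6]]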
-
-
-def get_combination_length(n, k):
- len_combination = 1
- for i in range(n, n - k, -1):
- len_combination *= i
- return int(len_combination / math.factorial(k))
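-# get_combination_length(n, k) evaluates the binomial coefficient
-# C(n, k) = n * (n-1) * ... * (n-k+1) / k!, e.g. C(5, 2) = 5*4/2 = 10.
-# On Python >= 3.8 the same value is given by math.comb(n, k).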
-
-
-###############################################################################
-
-def test_k_closest_graphs():
- ds = {'name': 'monoterpenoides',
- 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'])
-# Gn = Gn[0:50]
-# gkernel = 'untilhpathkernel'
-# gkernel = 'weisfeilerlehmankernel'
- gkernel = 'treeletkernel'
- node_label = 'atom'
- edge_label = 'bond_type'
-
- k = 5
- edit_costs = [0.16229209837639536, 0.06612870523413916, 0.04030113378793905, 0.20723547009415202, 0.3338607220394598, 0.27054392518077297]
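-    # Note: edit_costs above is only consumed by the commented-out
-    # 'precomputed' variant below; the active call uses the 'expert'
-    # fit method, which ignores it.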
-
-# sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \
-# = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k,
-# 'precomputed', edit_costs=edit_costs,
-## 'k-graphs',
-# parallel=False)
-#
-# sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \
-# = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k,
-# 'expert', parallel=False)
-
- sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \
- = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k,
- 'expert', parallel=False)
- return
-
-
-def test_k_closest_graphs_with_cv():
- gkernel = 'untilhpathkernel'
- node_label = 'atom'
- edge_label = 'bond_type'
-
- k = 4
-
- y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
- repeats = 50
- collection_path = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/'
- graph_dir = collection_path + 'gxl/'
-
- sod_sm_list = []
- sod_gm_list = []
- dis_k_sm_list = []
- dis_k_gm_list = []
- dis_k_gi_min_list = []
- for y in y_all:
- print('\n-------------------------------------------------------')
- print('class of y:', y)
-
- sod_sm_list.append([])
- sod_gm_list.append([])
- dis_k_sm_list.append([])
- dis_k_gm_list.append([])
- dis_k_gi_min_list.append([])
-
- for repeat in range(repeats):
- print('\nrepeat ', repeat)
- collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
- Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
- sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \
- = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel,
- k, 'whole-dataset', graph_dir=graph_dir,
- parallel=False)
-
- sod_sm_list[-1].append(sod_sm)
- sod_gm_list[-1].append(sod_gm)
- dis_k_sm_list[-1].append(dis_k_sm)
- dis_k_gm_list[-1].append(dis_k_gm)
- dis_k_gi_min_list[-1].append(dis_k_gi_min)
-
- print('\nsods of the set median for this class:', sod_sm_list[-1])
- print('\nsods of the gen median for this class:', sod_gm_list[-1])
- print('\ndistances in kernel space of set median for this class:',
- dis_k_sm_list[-1])
- print('\ndistances in kernel space of gen median for this class:',
- dis_k_gm_list[-1])
- print('\ndistances in kernel space of min graph for this class:',
- dis_k_gi_min_list[-1])
-
- sod_sm_list[-1] = np.mean(sod_sm_list[-1])
- sod_gm_list[-1] = np.mean(sod_gm_list[-1])
- dis_k_sm_list[-1] = np.mean(dis_k_sm_list[-1])
- dis_k_gm_list[-1] = np.mean(dis_k_gm_list[-1])
- dis_k_gi_min_list[-1] = np.mean(dis_k_gi_min_list[-1])
-
- print()
- print('\nmean sods of the set median for each class:', sod_sm_list)
- print('\nmean sods of the gen median for each class:', sod_gm_list)
- print('\nmean distance in kernel space of set median for each class:',
- dis_k_sm_list)
- print('\nmean distances in kernel space of gen median for each class:',
- dis_k_gm_list)
- print('\nmean distances in kernel space of min graph for each class:',
- dis_k_gi_min_list)
-
- print('\nmean sods of the set median of all:', np.mean(sod_sm_list))
- print('\nmean sods of the gen median of all:', np.mean(sod_gm_list))
- print('\nmean distances in kernel space of set median of all:',
- np.mean(dis_k_sm_list))
- print('\nmean distances in kernel space of gen median of all:',
- np.mean(dis_k_gm_list))
- print('\nmean distances in kernel space of min graph of all:',
- np.mean(dis_k_gi_min_list))
-
- return
-
-
-if __name__ == '__main__':
- test_k_closest_graphs()
-# test_k_closest_graphs_with_cv()
\ No newline at end of file
diff --git a/gklearn/preimage/test_median_preimage_generator.py b/gklearn/preimage/test_median_preimage_generator.py
deleted file mode 100644
index 2f458af..0000000
--- a/gklearn/preimage/test_median_preimage_generator.py
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Fri Mar 27 17:30:55 2020
-
-@author: ljia
-"""
-import multiprocessing
-import functools
-from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
-from gklearn.preimage import MedianPreimageGenerator
-from gklearn.utils import Dataset
-
-
-def test_median_preimage_generator():
-
- # 1. set parameters.
- print('1. setting parameters...')
- ds_name = 'Letter-high'
- mpg = MedianPreimageGenerator()
- mpg_options = {'fit_method': 'k-graphs',
- 'init_ecc': [3, 3, 1, 3, 3],
- 'ds_name': 'Letter-high',
- 'parallel': True,
- 'time_limit_in_sec': 0,
- 'max_itrs': 100,
- 'max_itrs_without_update': 3,
- 'epsilon_ratio': 0.01,
- 'verbose': 2}
- mpg.set_options(**mpg_options)
- mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
- sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
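-    # mixkernel pairs the two base kernels (kernelproduct of deltakernel for
-    # symbolic labels and gaussiankernel for numeric ones); the sub_kernels
-    # dict lets the graph kernel pick the right one per attribute type
-    # ('symb' / 'nsymb') or the product for mixed attributes.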
- mpg.kernel_options = {'name': 'structuralspkernel',
- 'edge_weight': None,
- 'node_kernels': sub_kernels,
- 'edge_kernels': sub_kernels,
- 'compute_method': 'naive',
- 'parallel': 'imap_unordered',
-# 'parallel': None,
- 'n_jobs': multiprocessing.cpu_count(),
- 'normalize': True,
- 'verbose': 2}
- mpg.ged_options = {'method': 'IPFP',
- 'initial_solutions': 40,
- 'edit_cost': 'LETTER2',
- 'attr_distance': 'euclidean',
- 'ratio_runs_from_initial_solutions': 1,
- 'threads': multiprocessing.cpu_count(),
- 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'}
- mpg.mge_options = {'init_type': 'MEDOID',
- 'random_inits': 10,
- 'time_limit': 600,
- 'verbose': 2,
- 'refine': False}
-
-
- # 2. get dataset.
- print('2. getting dataset...')
- mpg.dataset = Dataset()
- mpg.dataset.load_predefined_dataset(ds_name)
- mpg.dataset.cut_graphs(range(0, 10))
-
- # 3. compute median preimage.
- print('3. computing median preimage...')
- mpg.run()
-
-
-if __name__ == '__main__':
- test_median_preimage_generator()
\ No newline at end of file
diff --git a/gklearn/preimage/test_others.py b/gklearn/preimage/test_others.py
deleted file mode 100644
index a277a17..0000000
--- a/gklearn/preimage/test_others.py
+++ /dev/null
@@ -1,686 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Thu Jul 4 12:20:16 2019
-
-@author: ljia
-"""
-import numpy as np
-import networkx as nx
-import matplotlib.pyplot as plt
-import sys
-import time
-from tqdm import tqdm
-
-from gklearn.utils.graphfiles import loadDataset
-from gklearn.preimage.median import draw_Letter_graph
-from gklearn.preimage.ged import GED, ged_median
-from gklearn.preimage.utils import get_same_item_indices, compute_kernel, gram2distances, \
- dis_gstar, remove_edges
-
-
-# --------------------------- These are tests --------------------------------#
-
-def test_who_is_the_closest_in_kernel_space(Gn):
- idx_gi = [0, 6]
- g1 = Gn[idx_gi[0]]
- g2 = Gn[idx_gi[1]]
- # create the "median" graph.
- gnew = g2.copy()
- gnew.remove_node(0)
- nx.draw_networkx(gnew)
- plt.show()
- print(gnew.nodes(data=True))
- Gn = [gnew] + Gn
-
- # compute gram matrix
- Kmatrix = compute_kernel(Gn, 'untilhpathkernel', True)
- # the distance matrix
- dmatrix = gram2distances(Kmatrix)
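-    # gram2distances maps the Gram matrix K to the kernel-induced metric
-    # d(g, h) = sqrt(K[g, g] + K[h, h] - 2 * K[g, h]), i.e. the Euclidean
-    # distance between the graphs' images in kernel feature space.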
- print(np.sort(dmatrix[idx_gi[0] + 1]))
- print(np.argsort(dmatrix[idx_gi[0] + 1]))
- print(np.sort(dmatrix[idx_gi[1] + 1]))
- print(np.argsort(dmatrix[idx_gi[1] + 1]))
- # for all g in Gn, compute (d(g1, g) + d(g2, g)) / 2
- dis_median = [(dmatrix[i, idx_gi[0] + 1] + dmatrix[i, idx_gi[1] + 1]) / 2 for i in range(len(Gn))]
- print(np.sort(dis_median))
- print(np.argsort(dis_median))
- return
-
-
-def test_who_is_the_closest_in_GED_space(Gn):
- idx_gi = [0, 6]
- g1 = Gn[idx_gi[0]]
- g2 = Gn[idx_gi[1]]
- # create the "median" graph.
- gnew = g2.copy()
- gnew.remove_node(0)
- nx.draw_networkx(gnew)
- plt.show()
- print(gnew.nodes(data=True))
- Gn = [gnew] + Gn
-
- # compute GEDs
- ged_matrix = np.zeros((len(Gn), len(Gn)))
- for i1 in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
- for i2 in range(len(Gn)):
- dis, _, _ = GED(Gn[i1], Gn[i2], lib='gedlib')
- ged_matrix[i1, i2] = dis
- print(np.sort(ged_matrix[idx_gi[0] + 1]))
- print(np.argsort(ged_matrix[idx_gi[0] + 1]))
- print(np.sort(ged_matrix[idx_gi[1] + 1]))
- print(np.argsort(ged_matrix[idx_gi[1] + 1]))
- # for all g in Gn, compute (GED(g1, g) + GED(g2, g)) / 2
- dis_median = [(ged_matrix[i, idx_gi[0] + 1] + ged_matrix[i, idx_gi[1] + 1]) / 2 for i in range(len(Gn))]
- print(np.sort(dis_median))
- print(np.argsort(dis_median))
- return
-
-
-def test_will_IAM_give_the_median_graph_we_wanted(Gn):
- idx_gi = [0, 6]
- g1 = Gn[idx_gi[0]].copy()
- g2 = Gn[idx_gi[1]].copy()
-# del Gn[idx_gi[0]]
-# del Gn[idx_gi[1] - 1]
- g_median = test_iam_with_more_graphs_as_init([g1, g2], [g1, g2], c_ei=1, c_er=1, c_es=1)
-# g_median = test_iam_with_more_graphs_as_init(Gn, Gn, c_ei=1, c_er=1, c_es=1)
- nx.draw_networkx(g_median)
- plt.show()
- print(g_median.nodes(data=True))
- print(g_median.edges(data=True))
-
-
-def test_new_IAM_allGraph_deleteNodes(Gn):
- idx_gi = [0, 6]
-# g1 = Gn[idx_gi[0]].copy()
-# g2 = Gn[idx_gi[1]].copy()
-
-# g1 = nx.Graph(name='haha')
-# g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})])
-# g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'})])
-# g2 = nx.Graph(name='hahaha')
-# g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'}),
-# (3, {'atom': 'O'}), (4, {'atom': 'C'})])
-# g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}),
-# (2, 3, {'bond_type': '1'}), (3, 4, {'bond_type': '1'})])
-
- g1 = nx.Graph(name='haha')
- g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'C'}),
- (3, {'atom': 'S'}), (4, {'atom': 'S'})])
- g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}),
- (2, 3, {'bond_type': '1'}), (2, 4, {'bond_type': '1'})])
- g2 = nx.Graph(name='hahaha')
- g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'C'}),
- (3, {'atom': 'O'}), (4, {'atom': 'O'})])
- g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}),
- (2, 3, {'bond_type': '1'}), (2, 4, {'bond_type': '1'})])
-
-# g2 = g1.copy()
-# g2.add_nodes_from([(3, {'atom': 'O'})])
-# g2.add_nodes_from([(4, {'atom': 'C'})])
-# g2.add_edges_from([(1, 3, {'bond_type': '1'})])
-# g2.add_edges_from([(3, 4, {'bond_type': '1'})])
-
-# del Gn[idx_gi[0]]
-# del Gn[idx_gi[1] - 1]
-
- nx.draw_networkx(g1)
- plt.show()
- print(g1.nodes(data=True))
- print(g1.edges(data=True))
- nx.draw_networkx(g2)
- plt.show()
- print(g2.nodes(data=True))
- print(g2.edges(data=True))
-
- g_median = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations([g1, g2], [g1, g2], c_ei=1, c_er=1, c_es=1)
-# g_median = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(Gn, Gn, c_ei=1, c_er=1, c_es=1)
- nx.draw_networkx(g_median)
- plt.show()
- print(g_median.nodes(data=True))
- print(g_median.edges(data=True))
-
-
-def test_the_simple_two(Gn, gkernel):
- from gk_iam import gk_iam_nearest_multi
-    lmbda = 0.03 # termination probability
- r_max = 10 # recursions
- l = 500
- alpha_range = np.linspace(0.5, 0.5, 1)
- k = 2 # k nearest neighbors
-
- # randomly select two molecules
- np.random.seed(1)
- idx_gi = [0, 6] # np.random.randint(0, len(Gn), 2)
- g1 = Gn[idx_gi[0]]
- g2 = Gn[idx_gi[1]]
- Gn_mix = [g.copy() for g in Gn]
- Gn_mix.append(g1.copy())
- Gn_mix.append(g2.copy())
-
-# g_tmp = iam([g1, g2])
-# nx.draw_networkx(g_tmp)
-# plt.show()
-
- # compute
-# k_list = [] # kernel between each graph and itself.
-# k_g1_list = [] # kernel between each graph and g1
-# k_g2_list = [] # kernel between each graph and g2
-# for ig, g in tqdm(enumerate(Gn), desc='computing self kernels', file=sys.stdout):
-# ktemp = compute_kernel([g, g1, g2], 'marginalizedkernel', False)
-# k_list.append(ktemp[0][0, 0])
-# k_g1_list.append(ktemp[0][0, 1])
-# k_g2_list.append(ktemp[0][0, 2])
-
- km = compute_kernel(Gn_mix, gkernel, True)
-# k_list = np.diag(km) # kernel between each graph and itself.
-# k_g1_list = km[idx_gi[0]] # kernel between each graph and g1
-# k_g2_list = km[idx_gi[1]] # kernel between each graph and g2
-
- g_best = []
- dis_best = []
- # for each alpha
- for alpha in alpha_range:
- print('alpha =', alpha)
- dhat, ghat_list = gk_iam_nearest_multi(Gn, [g1, g2], [alpha, 1 - alpha],
- range(len(Gn), len(Gn) + 2), km,
-                                             k, r_max, gkernel)
- dis_best.append(dhat)
- g_best.append(ghat_list)
-
- for idx, item in enumerate(alpha_range):
- print('when alpha is', item, 'the shortest distance is', dis_best[idx])
- print('the corresponding pre-images are')
- for g in g_best[idx]:
- nx.draw_networkx(g)
- plt.show()
- print(g.nodes(data=True))
- print(g.edges(data=True))
-
-
-def test_remove_bests(Gn, gkernel):
- from gk_iam import gk_iam_nearest_multi
-    lmbda = 0.03 # termination probability
- r_max = 10 # recursions
- l = 500
- alpha_range = np.linspace(0.5, 0.5, 1)
- k = 20 # k nearest neighbors
-
- # randomly select two molecules
- np.random.seed(1)
- idx_gi = [0, 6] # np.random.randint(0, len(Gn), 2)
- g1 = Gn[idx_gi[0]]
- g2 = Gn[idx_gi[1]]
- # remove the best 2 graphs.
- del Gn[idx_gi[0]]
- del Gn[idx_gi[1] - 1]
-# del Gn[8]
-
- Gn_mix = [g.copy() for g in Gn]
- Gn_mix.append(g1.copy())
- Gn_mix.append(g2.copy())
-
-
- # compute
- km = compute_kernel(Gn_mix, gkernel, True)
- g_best = []
- dis_best = []
- # for each alpha
- for alpha in alpha_range:
- print('alpha =', alpha)
- dhat, ghat_list = gk_iam_nearest_multi(Gn, [g1, g2], [alpha, 1 - alpha],
- range(len(Gn), len(Gn) + 2), km,
- k, r_max, gkernel)
- dis_best.append(dhat)
- g_best.append(ghat_list)
-
- for idx, item in enumerate(alpha_range):
- print('when alpha is', item, 'the shortest distance is', dis_best[idx])
- print('the corresponding pre-images are')
- for g in g_best[idx]:
- draw_Letter_graph(g)
-# nx.draw_networkx(g)
-# plt.show()
- print(g.nodes(data=True))
- print(g.edges(data=True))
-
-
-###############################################################################
-# Tests on dataset Letter-H.
-
-def test_gkiam_letter_h():
- from gk_iam import gk_iam_nearest_multi
- ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
- 'extra_params': {}} # node nsymb
-# ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt',
-# 'extra_params': {}} # node nsymb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
- gkernel = 'structuralspkernel'
-
-    lmbda = 0.03 # termination probability
- r_max = 3 # recursions
-# alpha_range = np.linspace(0.5, 0.5, 1)
- k = 10 # k nearest neighbors
-
- # classify graphs according to letters.
- idx_dict = get_same_item_indices(y_all)
- time_list = []
- sod_ks_min_list = []
- sod_gs_list = []
- sod_gs_min_list = []
- nb_updated_list = []
- for letter in idx_dict:
- print('\n-------------------------------------------------------\n')
- Gn_let = [Gn[i].copy() for i in idx_dict[letter]]
- Gn_mix = Gn_let + [g.copy() for g in Gn_let]
-
- alpha_range = np.linspace(1 / len(Gn_let), 1 / len(Gn_let), 1)
-
- # compute
- time0 = time.time()
- km = compute_kernel(Gn_mix, gkernel, True)
- g_best = []
- dis_best = []
- # for each alpha
- for alpha in alpha_range:
- print('alpha =', alpha)
- dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn_let,
- Gn_let, [alpha] * len(Gn_let), range(len(Gn_let), len(Gn_mix)),
- km, k, r_max, gkernel, c_ei=1.7, c_er=1.7, c_es=1.7,
- ged_cost='LETTER', ged_method='IPFP', saveGXL='gedlib-letter')
- dis_best.append(dhat)
- g_best.append(ghat_list)
- time_list.append(time.time() - time0)
-
- # show best graphs and save them to file.
- for idx, item in enumerate(alpha_range):
- print('when alpha is', item, 'the shortest distance is', dis_best[idx])
- print('the corresponding pre-images are')
- for g in g_best[idx]:
- draw_Letter_graph(g, savepath='results/gk_iam/')
-# nx.draw_networkx(g)
-# plt.show()
- print(g.nodes(data=True))
- print(g.edges(data=True))
-
- # compute the corresponding sod in graph space. (alpha range not considered.)
- sod_tmp, _ = ged_median(g_best[0], Gn_let, ged_cost='LETTER',
- ged_method='IPFP', saveGXL='gedlib-letter')
- sod_gs_list.append(sod_tmp)
- sod_gs_min_list.append(np.min(sod_tmp))
- sod_ks_min_list.append(sod_ks)
- nb_updated_list.append(nb_updated)
-
-
- print('\nsods in graph space: ', sod_gs_list)
- print('\nsmallest sod in graph space for each letter: ', sod_gs_min_list)
- print('\nsmallest sod in kernel space for each letter: ', sod_ks_min_list)
- print('\nnumber of updates for each letter: ', nb_updated_list)
- print('\ntimes:', time_list)
-
-#def compute_letter_median_by_average(Gn):
-# return g_median
-
-
-def test_iam_letter_h():
- from iam import test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations
- ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
- 'extra_params': {}} # node nsymb
-# ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt',
-# 'extra_params': {}} # node nsymb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-
-    lmbda = 0.03 # termination probability
-# alpha_range = np.linspace(0.5, 0.5, 1)
-
- # classify graphs according to letters.
- idx_dict = get_same_item_indices(y_all)
- time_list = []
- sod_list = []
- sod_min_list = []
- for letter in idx_dict:
- Gn_let = [Gn[i].copy() for i in idx_dict[letter]]
-
- alpha_range = np.linspace(1 / len(Gn_let), 1 / len(Gn_let), 1)
-
- # compute
- g_best = []
- dis_best = []
- time0 = time.time()
- # for each alpha
- for alpha in alpha_range:
- print('alpha =', alpha)
- ghat_list, dhat = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
- Gn_let, Gn_let, c_ei=1.7, c_er=1.7, c_es=1.7,
- ged_cost='LETTER', ged_method='IPFP', saveGXL='gedlib-letter')
- dis_best.append(dhat)
- g_best.append(ghat_list)
- time_list.append(time.time() - time0)
-
- # show best graphs and save them to file.
- for idx, item in enumerate(alpha_range):
- print('when alpha is', item, 'the shortest distance is', dis_best[idx])
- print('the corresponding pre-images are')
- for g in g_best[idx]:
- draw_Letter_graph(g, savepath='results/iam/')
-# nx.draw_networkx(g)
-# plt.show()
- print(g.nodes(data=True))
- print(g.edges(data=True))
-
- # compute the corresponding sod in kernel space. (alpha range not considered.)
- gkernel = 'structuralspkernel'
- sod_tmp = []
- Gn_mix = g_best[0] + Gn_let
- km = compute_kernel(Gn_mix, gkernel, True)
- for ig, g in tqdm(enumerate(g_best[0]), desc='computing kernel sod', file=sys.stdout):
- dtemp = dis_gstar(ig, range(len(g_best[0]), len(Gn_mix)),
- [alpha_range[0]] * len(Gn_let), km, withterm3=False)
- sod_tmp.append(dtemp)
- sod_list.append(sod_tmp)
- sod_min_list.append(np.min(sod_tmp))
-
-
- print('\nsods in kernel space: ', sod_list)
- print('\nsmallest sod in kernel space for each letter: ', sod_min_list)
- print('\ntimes:', time_list)
-
-
-def test_random_preimage_letter_h():
- from preimage_random import preimage_random
- ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
- 'extra_params': {}} # node nsymb
-# ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt',
-# 'extra_params': {}} # node nsymb
- # ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
-# 'extra_params': {}} # node/edge symb
-# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
-# 'extra_params': {}}
-# ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
-# 'extra_params': {}} # node symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
- gkernel = 'structuralspkernel'
-
-#    lmbda = 0.03 # termination probability
- r_max = 3 # 10 # recursions
- l = 500
-# alpha_range = np.linspace(0.5, 0.5, 1)
- #alpha_range = np.linspace(0.1, 0.9, 9)
- k = 10 # 5 # k nearest neighbors
-
- # classify graphs according to letters.
- idx_dict = get_same_item_indices(y_all)
- time_list = []
- sod_list = []
- sod_min_list = []
- for letter in idx_dict:
- print('\n-------------------------------------------------------\n')
- Gn_let = [Gn[i].copy() for i in idx_dict[letter]]
- Gn_mix = Gn_let + [g.copy() for g in Gn_let]
-
- alpha_range = np.linspace(1 / len(Gn_let), 1 / len(Gn_let), 1)
-
- # compute
- time0 = time.time()
- km = compute_kernel(Gn_mix, gkernel, True)
- g_best = []
- dis_best = []
- # for each alpha
- for alpha in alpha_range:
- print('alpha =', alpha)
- dhat, ghat_list = preimage_random(Gn_let, Gn_let, [alpha] * len(Gn_let),
- range(len(Gn_let), len(Gn_mix)), km,
- k, r_max, gkernel, c_ei=1.7,
- c_er=1.7, c_es=1.7)
- dis_best.append(dhat)
- g_best.append(ghat_list)
- time_list.append(time.time() - time0)
-
- # show best graphs and save them to file.
- for idx, item in enumerate(alpha_range):
- print('when alpha is', item, 'the shortest distance is', dis_best[idx])
- print('the corresponding pre-images are')
- for g in g_best[idx]:
- draw_Letter_graph(g, savepath='results/gk_iam/')
-# nx.draw_networkx(g)
-# plt.show()
- print(g.nodes(data=True))
- print(g.edges(data=True))
-
- # compute the corresponding sod in graph space. (alpha range not considered.)
- sod_tmp, _ = ged_median(g_best[0], Gn_let)
- sod_list.append(sod_tmp)
- sod_min_list.append(np.min(sod_tmp))
-
-
- print('\nsods in graph space: ', sod_list)
- print('\nsmallest sod in graph space for each letter: ', sod_min_list)
- print('\ntimes:', time_list)
-
-
-def test_gkiam_mutag():
- from gk_iam import gk_iam_nearest_multi
- ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
- 'extra_params': {}} # node nsymb
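-    # NOTE: despite its name, this test loads the Letter-high dataset, not
-    # MUTAG; the body mirrors test_gkiam_letter_h() above.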
-# ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt',
-# 'extra_params': {}} # node nsymb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
- gkernel = 'structuralspkernel'
-
-    lmbda = 0.03 # termination probability
- r_max = 3 # recursions
-# alpha_range = np.linspace(0.5, 0.5, 1)
- k = 20 # k nearest neighbors
-
- # classify graphs according to letters.
- idx_dict = get_same_item_indices(y_all)
- time_list = []
- sod_ks_min_list = []
- sod_gs_list = []
- sod_gs_min_list = []
- nb_updated_list = []
- for letter in idx_dict:
- print('\n-------------------------------------------------------\n')
- Gn_let = [Gn[i].copy() for i in idx_dict[letter]]
- Gn_mix = Gn_let + [g.copy() for g in Gn_let]
-
- alpha_range = np.linspace(1 / len(Gn_let), 1 / len(Gn_let), 1)
-
- # compute
- time0 = time.time()
- km = compute_kernel(Gn_mix, gkernel, True)
- g_best = []
- dis_best = []
- # for each alpha
- for alpha in alpha_range:
- print('alpha =', alpha)
- dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn_let, Gn_let, [alpha] * len(Gn_let),
- range(len(Gn_let), len(Gn_mix)), km,
- k, r_max, gkernel, c_ei=1.7,
- c_er=1.7, c_es=1.7)
- dis_best.append(dhat)
- g_best.append(ghat_list)
- time_list.append(time.time() - time0)
-
- # show best graphs and save them to file.
- for idx, item in enumerate(alpha_range):
- print('when alpha is', item, 'the shortest distance is', dis_best[idx])
- print('the corresponding pre-images are')
- for g in g_best[idx]:
- draw_Letter_graph(g, savepath='results/gk_iam/')
-# nx.draw_networkx(g)
-# plt.show()
- print(g.nodes(data=True))
- print(g.edges(data=True))
-
- # compute the corresponding sod in graph space. (alpha range not considered.)
- sod_tmp, _ = ged_median(g_best[0], Gn_let)
- sod_gs_list.append(sod_tmp)
- sod_gs_min_list.append(np.min(sod_tmp))
- sod_ks_min_list.append(sod_ks)
- nb_updated_list.append(nb_updated)
-
-
- print('\nsods in graph space: ', sod_gs_list)
- print('\nsmallest sod in graph space for each letter: ', sod_gs_min_list)
- print('\nsmallest sod in kernel space for each letter: ', sod_ks_min_list)
- print('\nnumber of updates for each letter: ', nb_updated_list)
- print('\ntimes:', time_list)
-
-
-###############################################################################
-# Re-test.
-
-def retest_the_simple_two():
- from gk_iam import gk_iam_nearest_multi
-
- # The two simple graphs.
-# g1 = nx.Graph(name='haha')
-# g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})])
-# g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'})])
-# g2 = nx.Graph(name='hahaha')
-# g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'}),
-# (3, {'atom': 'O'}), (4, {'atom': 'C'})])
-# g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}),
-# (2, 3, {'bond_type': '1'}), (3, 4, {'bond_type': '1'})])
-
- g1 = nx.Graph(name='haha')
- g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'C'}),
- (3, {'atom': 'S'}), (4, {'atom': 'S'})])
- g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}),
- (2, 3, {'bond_type': '1'}), (2, 4, {'bond_type': '1'})])
- g2 = nx.Graph(name='hahaha')
- g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'C'}),
- (3, {'atom': 'O'}), (4, {'atom': 'O'})])
- g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}),
- (2, 3, {'bond_type': '1'}), (2, 4, {'bond_type': '1'})])
-
-# # randomly select two molecules
-# np.random.seed(1)
-# idx_gi = [0, 6] # np.random.randint(0, len(Gn), 2)
-# g1 = Gn[idx_gi[0]]
-# g2 = Gn[idx_gi[1]]
-# Gn_mix = [g.copy() for g in Gn]
-# Gn_mix.append(g1.copy())
-# Gn_mix.append(g2.copy())
-
- Gn = [g1.copy(), g2.copy()]
- remove_edges(Gn)
- gkernel = 'marginalizedkernel'
-
-    lmbda = 0.03 # termination probability
- r_max = 10 # recursions
-# l = 500
- alpha_range = np.linspace(0.5, 0.5, 1)
- k = 2 # k nearest neighbors
- epsilon = 1e-6
-    ged_cost = 'CHEM_1'
-    ged_method = 'IPFP'
-    saveGXL = 'gedlib'
-    c_ei = 1
-    c_er = 1
-    c_es = 1
-
- Gn_mix = Gn + [g1.copy(), g2.copy()]
-
- # compute
- time0 = time.time()
- km = compute_kernel(Gn_mix, gkernel, True)
- time_km = time.time() - time0
-
- time_list = []
- sod_ks_min_list = []
- sod_gs_list = []
- sod_gs_min_list = []
- nb_updated_list = []
- g_best = []
- # for each alpha
- for alpha in alpha_range:
- print('\n-------------------------------------------------------\n')
- print('alpha =', alpha)
- time0 = time.time()
- dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn, [g1, g2],
- [alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max,
- gkernel, c_ei=c_ei, c_er=c_er, c_es=c_es, epsilon=epsilon,
- ged_cost=ged_cost, ged_method=ged_method, saveGXL=saveGXL)
- time_total = time.time() - time0 + time_km
- print('time: ', time_total)
- time_list.append(time_total)
- sod_ks_min_list.append(dhat)
- g_best.append(ghat_list)
- nb_updated_list.append(nb_updated)
-
- # show best graphs and save them to file.
- for idx, item in enumerate(alpha_range):
- print('when alpha is', item, 'the shortest distance is', sod_ks_min_list[idx])
- print('one of the possible corresponding pre-images is')
- nx.draw(g_best[idx][0], labels=nx.get_node_attributes(g_best[idx][0], 'atom'),
- with_labels=True)
- plt.savefig('results/gk_iam/mutag_alpha' + str(item) + '.png', format="PNG")
- plt.show()
- print(g_best[idx][0].nodes(data=True))
- print(g_best[idx][0].edges(data=True))
-
-# for g in g_best[idx]:
-# draw_Letter_graph(g, savepath='results/gk_iam/')
-## nx.draw_networkx(g)
-## plt.show()
-# print(g.nodes(data=True))
-# print(g.edges(data=True))
-
- # compute the corresponding sod in graph space.
- for idx, item in enumerate(alpha_range):
-        sod_tmp, _ = ged_median(g_best[idx], [g1, g2], ged_cost=ged_cost,
- ged_method=ged_method, saveGXL=saveGXL)
- sod_gs_list.append(sod_tmp)
- sod_gs_min_list.append(np.min(sod_tmp))
-
- print('\nsods in graph space: ', sod_gs_list)
- print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list)
- print('\nsmallest sod in kernel space for each alpha: ', sod_ks_min_list)
- print('\nnumber of updates for each alpha: ', nb_updated_list)
- print('\ntimes:', time_list)
-
-
-
-if __name__ == '__main__':
-# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
-# 'extra_params': {}} # node/edge symb
-# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
-# 'extra_params': {}} # node nsymb
-# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
-# 'extra_params': {}}
-# ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
-# 'extra_params': {}} # node symb
-# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-# Gn = Gn[0:20]
-
-# import networkx.algorithms.isomorphism as iso
-# G1 = nx.MultiDiGraph()
-# G2 = nx.MultiDiGraph()
-# G1.add_nodes_from([1,2,3], fill='red')
-# G2.add_nodes_from([10,20,30,40], fill='red')
-# nx.add_path(G1, [1,2,3,4], weight=3, linewidth=2.5)
-# nx.add_path(G2, [10,20,30,40], weight=3)
-# nm = iso.categorical_node_match('fill', 'red')
-# print(nx.is_isomorphic(G1, G2, node_match=nm))
-#
-# test_new_IAM_allGraph_deleteNodes(Gn)
-# test_will_IAM_give_the_median_graph_we_wanted(Gn)
-# test_who_is_the_closest_in_GED_space(Gn)
-# test_who_is_the_closest_in_kernel_space(Gn)
-
-# test_the_simple_two(Gn, 'untilhpathkernel')
-# test_remove_bests(Gn, 'untilhpathkernel')
-# test_gkiam_letter_h()
-# test_iam_letter_h()
-#    test_random_preimage_letter_h()
-
-###############################################################################
-# retests.
- retest_the_simple_two()
\ No newline at end of file
diff --git a/gklearn/preimage/test_preimage_iam.py b/gklearn/preimage/test_preimage_iam.py
deleted file mode 100644
index 9b05dd9..0000000
--- a/gklearn/preimage/test_preimage_iam.py
+++ /dev/null
@@ -1,620 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Thu Sep 5 15:59:00 2019
-
-@author: ljia
-"""
-
-import numpy as np
-import networkx as nx
-import matplotlib.pyplot as plt
-import time
-import random
-#from tqdm import tqdm
-
-from gklearn.utils.graphfiles import loadDataset
-from gklearn.preimage.utils import remove_edges, compute_kernel, get_same_item_indices
-from gklearn.preimage.ged import ged_median
-
-from gklearn.preimage.preimage_iam import preimage_iam
-
-
-###############################################################################
-# tests on different values on grid of median-sets and k.
-
-def test_preimage_iam_grid_k_median_nb():
- ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
- 'extra_params': {}} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-# Gn = Gn[0:50]
- remove_edges(Gn)
- gkernel = 'marginalizedkernel'
-
-    lmbda = 0.03 # termination probability
- r_max = 5 # iteration limit for pre-image.
-# alpha_range = np.linspace(0.5, 0.5, 1)
-# k = 5 # k nearest neighbors
- epsilon = 1e-6
- InitIAMWithAllDk = True
- # parameters for GED function
-    ged_cost = 'CHEM_1'
-    ged_method = 'IPFP'
-    saveGXL = 'gedlib'
-    # parameters for IAM function
-    c_ei = 1
-    c_er = 1
-    c_es = 1
- ite_max_iam = 50
- epsilon_iam = 0.001
- removeNodes = True
- connected_iam = False
-
-    # number of graphs; we want to compute the median of these graphs.
- nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
- # number of nearest neighbors.
- k_range = [5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 100]
-
- # find out all the graphs classified to positive group 1.
- idx_dict = get_same_item_indices(y_all)
- Gn = [Gn[i] for i in idx_dict[1]]
-
-# # compute Gram matrix.
-# time0 = time.time()
-# km = compute_kernel(Gn, gkernel, True)
-# time_km = time.time() - time0
-# # write Gram matrix to file.
-# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
-
-
- time_list = []
- dis_ks_min_list = []
- sod_gs_list = []
- sod_gs_min_list = []
- nb_updated_list = []
- nb_updated_k_list = []
- g_best = []
- for idx_nb, nb_median in enumerate(nb_median_range):
- print('\n-------------------------------------------------------')
- print('number of median graphs =', nb_median)
- random.seed(1)
- idx_rdm = random.sample(range(len(Gn)), nb_median)
- print('graphs chosen:', idx_rdm)
- Gn_median = [Gn[idx].copy() for idx in idx_rdm]
-
-# for g in Gn_median:
-# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
-## plt.savefig("results/preimage_mix/mutag.png", format="PNG")
-# plt.show()
-# plt.clf()
-
- ###################################################################
- gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
- km_tmp = gmfile['gm']
- time_km = gmfile['gmtime']
- # modify mixed gram matrix.
- km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
- for i in range(len(Gn)):
- for j in range(i, len(Gn)):
- km[i, j] = km_tmp[i, j]
- km[j, i] = km[i, j]
- for i in range(len(Gn)):
- for j, idx in enumerate(idx_rdm):
- km[i, len(Gn) + j] = km[i, idx]
- km[len(Gn) + j, i] = km[i, idx]
- for i, idx1 in enumerate(idx_rdm):
- for j, idx2 in enumerate(idx_rdm):
- km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]
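-        # The loops above grow the precomputed Gram matrix km_tmp from shape
-        # (len(Gn), len(Gn)) to (len(Gn) + nb_median, len(Gn) + nb_median):
-        # the extra rows/columns are copies of the rows/columns of the chosen
-        # median graphs, so no kernel value is recomputed for the duplicates
-        # appended to the candidate set.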
-
- ###################################################################
- alpha_range = [1 / nb_median] * nb_median
-
- time_list.append([])
- dis_ks_min_list.append([])
- sod_gs_list.append([])
- sod_gs_min_list.append([])
- nb_updated_list.append([])
- nb_updated_k_list.append([])
- g_best.append([])
-
- for k in k_range:
- print('\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
- print('k =', k)
- time0 = time.time()
- dhat, ghat_list, dis_of_each_itr, nb_updated, nb_updated_k = \
- preimage_iam(Gn, Gn_median,
- alpha_range, range(len(Gn), len(Gn) + nb_median), km, k, r_max,
- gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk,
- params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es,
- 'ite_max': ite_max_iam, 'epsilon': epsilon_iam,
- 'removeNodes': removeNodes, 'connected': connected_iam},
- params_ged={'ged_cost': ged_cost, 'ged_method': ged_method,
- 'saveGXL': saveGXL})
-
- time_total = time.time() - time0 + time_km
- print('time: ', time_total)
- time_list[idx_nb].append(time_total)
- print('\nsmallest distance in kernel space: ', dhat)
- dis_ks_min_list[idx_nb].append(dhat)
- g_best[idx_nb].append(ghat_list)
- print('\nnumber of updates of the best graph by IAM: ', nb_updated)
- nb_updated_list[idx_nb].append(nb_updated)
- print('\nnumber of updates of k nearest graphs by IAM: ', nb_updated_k)
- nb_updated_k_list[idx_nb].append(nb_updated_k)
-
- # show the best graph and save it to file.
- print('the shortest distance is', dhat)
- print('one of the possible corresponding pre-images is')
- nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'),
- with_labels=True)
- plt.savefig('results/preimage_iam/mutag_median_nb' + str(nb_median) +
- '_k' + str(k) + '.png', format="PNG")
- # plt.show()
- plt.clf()
- # print(ghat_list[0].nodes(data=True))
- # print(ghat_list[0].edges(data=True))
-
- # compute the corresponding sod in graph space.
- sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, ged_cost=ged_cost,
- ged_method=ged_method, saveGXL=saveGXL)
- sod_gs_list[idx_nb].append(sod_tmp)
- sod_gs_min_list[idx_nb].append(np.min(sod_tmp))
- print('\nsmallest sod in graph space: ', np.min(sod_tmp))
-
- print('\nsods in graph space: ', sod_gs_list)
- print('\nsmallest sod in graph space for each set of median graphs and k: ',
- sod_gs_min_list)
- print('\nsmallest distance in kernel space for each set of median graphs and k: ',
- dis_ks_min_list)
- print('\nnumber of updates of the best graph for each set of median graphs and k by IAM: ',
- nb_updated_list)
- print('\nnumber of updates of k nearest graphs for each set of median graphs and k by IAM: ',
- nb_updated_k_list)
- print('\ntimes:', time_list)
-
-
-###############################################################################
-# tests on different numbers of median-sets.
-
-def test_preimage_iam_median_nb():
- ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
- 'extra_params': {}} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-# Gn = Gn[0:50]
- remove_edges(Gn)
- gkernel = 'marginalizedkernel'
-
-    lmbda = 0.03 # termination probability
- r_max = 3 # iteration limit for pre-image.
-# alpha_range = np.linspace(0.5, 0.5, 1)
- k = 5 # k nearest neighbors
- epsilon = 1e-6
- InitIAMWithAllDk = True
- # parameters for IAM function
-# c_vi = 0.037
-# c_vr = 0.038
-# c_vs = 0.075
-# c_ei = 0.001
-# c_er = 0.001
-# c_es = 0.0
- c_vi = 4
- c_vr = 4
- c_vs = 2
- c_ei = 1
- c_er = 1
- c_es = 1
- ite_max_iam = 50
- epsilon_iam = 0.001
- removeNodes = True
- connected_iam = False
- # parameters for GED function
-# ged_cost='CHEM_1'
- ged_cost = 'CONSTANT'
- ged_method = 'IPFP'
- edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
- ged_stabilizer = 'min'
- ged_repeat = 50
- params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
- 'edit_cost_constant': edit_cost_constant,
- 'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
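-    # With the 'CONSTANT' cost model, edit_cost_constant lists the six costs
-    # in the order [c_vi, c_vr, c_vs, c_ei, c_er, c_es]: vertex insertion,
-    # removal and substitution, then the same three operations for edges
-    # (inferred from the variable names above).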
-
-    # number of graphs; we want to compute the median of these graphs.
-# nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
- nb_median_range = [2]
-
- # find out all the graphs classified to positive group 1.
- idx_dict = get_same_item_indices(y_all)
- Gn = [Gn[i] for i in idx_dict[1]]
-
-# # compute Gram matrix.
-# time0 = time.time()
-# km = compute_kernel(Gn, gkernel, True)
-# time_km = time.time() - time0
-# # write Gram matrix to file.
-# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
-
-
- time_list = []
- dis_ks_min_list = []
- sod_gs_list = []
- sod_gs_min_list = []
- nb_updated_list = []
- nb_updated_k_list = []
- g_best = []
- for nb_median in nb_median_range:
- print('\n-------------------------------------------------------')
- print('number of median graphs =', nb_median)
- random.seed(1)
- idx_rdm = random.sample(range(len(Gn)), nb_median)
- print('graphs chosen:', idx_rdm)
- Gn_median = [Gn[idx].copy() for idx in idx_rdm]
-
-# for g in Gn_median:
-# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
-## plt.savefig("results/preimage_mix/mutag.png", format="PNG")
-# plt.show()
-# plt.clf()
-
- ###################################################################
- gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
- km_tmp = gmfile['gm']
- time_km = gmfile['gmtime']
- # modify mixed gram matrix.
- km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
- for i in range(len(Gn)):
- for j in range(i, len(Gn)):
- km[i, j] = km_tmp[i, j]
- km[j, i] = km[i, j]
- for i in range(len(Gn)):
- for j, idx in enumerate(idx_rdm):
- km[i, len(Gn) + j] = km[i, idx]
- km[len(Gn) + j, i] = km[i, idx]
- for i, idx1 in enumerate(idx_rdm):
- for j, idx2 in enumerate(idx_rdm):
- km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]
-
- ###################################################################
- alpha_range = [1 / nb_median] * nb_median
- time0 = time.time()
- dhat, ghat_list, dis_of_each_itr, nb_updated, nb_updated_k = \
- preimage_iam(Gn, Gn_median,
- alpha_range, range(len(Gn), len(Gn) + nb_median), km, k, r_max,
- gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk,
- params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es,
- 'ite_max': ite_max_iam, 'epsilon': epsilon_iam,
- 'removeNodes': removeNodes, 'connected': connected_iam},
- params_ged=params_ged)
-
- time_total = time.time() - time0 + time_km
- print('\ntime: ', time_total)
- time_list.append(time_total)
- print('\nsmallest distance in kernel space: ', dhat)
- dis_ks_min_list.append(dhat)
- g_best.append(ghat_list)
- print('\nnumber of updates of the best graph: ', nb_updated)
- nb_updated_list.append(nb_updated)
- print('\nnumber of updates of k nearest graphs: ', nb_updated_k)
- nb_updated_k_list.append(nb_updated_k)
-
- # show the best graph and save it to file.
- print('the shortest distance is', dhat)
- print('one of the possible corresponding pre-images is')
- nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'),
- with_labels=True)
- plt.show()
-# plt.savefig('results/preimage_iam/mutag_median_cs.001_nb' + str(nb_median) +
-# '.png', format="PNG")
- plt.clf()
-# print(ghat_list[0].nodes(data=True))
-# print(ghat_list[0].edges(data=True))
-
- # compute the corresponding sod in graph space.
- sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, params_ged=params_ged)
- sod_gs_list.append(sod_tmp)
- sod_gs_min_list.append(np.min(sod_tmp))
- print('\nsmallest sod in graph space: ', np.min(sod_tmp))
-
- print('\nsods in graph space: ', sod_gs_list)
- print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list)
- print('\nsmallest distance in kernel space for each set of median graphs: ',
- dis_ks_min_list)
- print('\nnumber of updates of the best graph for each set of median graphs by IAM: ',
- nb_updated_list)
- print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ',
- nb_updated_k_list)
- print('\ntimes:', time_list)
-
-
-###############################################################################
-# test on the combination of the two randomly chosen graphs. (the same as in the
-# random pre-image paper.)
-
-def test_gkiam_2combination_all_pairs():
- ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
- 'extra_params': {}} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-# Gn = Gn[0:50]
- remove_edges(Gn)
- gkernel = 'marginalizedkernel'
-
-    lmbda = 0.03 # termination probability
- r_max = 10 # iteration limit for pre-image.
- alpha_range = np.linspace(0.5, 0.5, 1)
- k = 5 # k nearest neighbors
- epsilon = 1e-6
- InitIAMWithAllDk = False
- # parameters for GED function
-    ged_cost = 'CHEM_1'
-    ged_method = 'IPFP'
-    saveGXL = 'gedlib'
-    # parameters for IAM function
-    c_ei = 1
-    c_er = 1
-    c_es = 1
- ite_max_iam = 50
- epsilon_iam = 0.001
- removeNodes = True
- connected_iam = False
-
- nb_update_mat = np.full((len(Gn), len(Gn)), np.inf)
- # test on each pair of graphs.
-# for idx1 in range(len(Gn) - 1, -1, -1):
-# for idx2 in range(idx1, -1, -1):
- for idx1 in range(187, 188):
- for idx2 in range(167, 168):
- g1 = Gn[idx1].copy()
- g2 = Gn[idx2].copy()
- # Gn[10] = []
- # Gn[10] = []
-
- nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True)
-            plt.savefig("results/gk_iam/all_pairs/mutag" + str(idx1) + ".png", format="PNG")
- plt.show()
- plt.clf()
- nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True)
-            plt.savefig("results/gk_iam/all_pairs/mutag" + str(idx2) + ".png", format="PNG")
- plt.show()
- plt.clf()
-
- ###################################################################
-# Gn_mix = [g.copy() for g in Gn]
-# Gn_mix.append(g1.copy())
-# Gn_mix.append(g2.copy())
-#
-# # compute
-# time0 = time.time()
-# km = compute_kernel(Gn_mix, gkernel, True)
-# time_km = time.time() - time0
-#
-# # write Gram matrix to file and read it.
-# np.savez('results/gram_matrix_uhpath_itr7_pq0.8.gm', gm=km, gmtime=time_km)
-
- ###################################################################
- gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03.gm.npz')
- km = gmfile['gm']
- time_km = gmfile['gmtime']
- # modify mixed gram matrix.
- for i in range(len(Gn)):
- km[i, len(Gn)] = km[i, idx1]
- km[i, len(Gn) + 1] = km[i, idx2]
- km[len(Gn), i] = km[i, idx1]
- km[len(Gn) + 1, i] = km[i, idx2]
- km[len(Gn), len(Gn)] = km[idx1, idx1]
- km[len(Gn), len(Gn) + 1] = km[idx1, idx2]
- km[len(Gn) + 1, len(Gn)] = km[idx2, idx1]
- km[len(Gn) + 1, len(Gn) + 1] = km[idx2, idx2]
-
- ###################################################################
-# # use only the two graphs in median set as candidates.
-# Gn = [g1.copy(), g2.copy()]
-# Gn_mix = Gn + [g1.copy(), g2.copy()]
-# # compute
-# time0 = time.time()
-# km = compute_kernel(Gn_mix, gkernel, True)
-# time_km = time.time() - time0
-
-
- time_list = []
- dis_ks_min_list = []
- sod_gs_list = []
- sod_gs_min_list = []
- nb_updated_list = []
- nb_updated_k_list = []
- g_best = []
- # for each alpha
- for alpha in alpha_range:
- print('\n-------------------------------------------------------\n')
- print('alpha =', alpha)
- time0 = time.time()
- dhat, ghat_list, sod_ks, nb_updated, nb_updated_k = \
- preimage_iam(Gn, [g1, g2],
- [alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max,
- gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk,
- params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es,
- 'ite_max': ite_max_iam, 'epsilon': epsilon_iam,
- 'removeNodes': removeNodes, 'connected': connected_iam},
- params_ged={'ged_cost': ged_cost, 'ged_method': ged_method,
- 'saveGXL': saveGXL})
- time_total = time.time() - time0 + time_km
- print('time: ', time_total)
- time_list.append(time_total)
- dis_ks_min_list.append(dhat)
- g_best.append(ghat_list)
- nb_updated_list.append(nb_updated)
- nb_updated_k_list.append(nb_updated_k)
-
- # show best graphs and save them to file.
- for idx, item in enumerate(alpha_range):
- print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx])
- print('one of the possible corresponding pre-images is')
- nx.draw(g_best[idx][0], labels=nx.get_node_attributes(g_best[idx][0], 'atom'),
- with_labels=True)
- plt.savefig('results/gk_iam/mutag' + str(idx1) + '_' + str(idx2)
- + '_alpha' + str(item) + '.png', format="PNG")
-# plt.show()
- plt.clf()
-# print(g_best[idx][0].nodes(data=True))
-# print(g_best[idx][0].edges(data=True))
-
- # for g in g_best[idx]:
- # draw_Letter_graph(g, savepath='results/gk_iam/')
- ## nx.draw_networkx(g)
- ## plt.show()
- # print(g.nodes(data=True))
- # print(g.edges(data=True))
-
- # compute the corresponding sod in graph space.
- for idx, item in enumerate(alpha_range):
-                sod_tmp, _ = ged_median(g_best[idx], [g1, g2], ged_cost=ged_cost,
- ged_method=ged_method, saveGXL=saveGXL)
- sod_gs_list.append(sod_tmp)
- sod_gs_min_list.append(np.min(sod_tmp))
-
- print('\nsods in graph space: ', sod_gs_list)
- print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list)
- print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list)
- print('\nnumber of updates of the best graph for each alpha: ',
- nb_updated_list)
- print('\nnumber of updates of the k nearest graphs for each alpha: ',
- nb_updated_k_list)
- print('\ntimes:', time_list)
- nb_update_mat[idx1, idx2] = nb_updated_list[0]
-
- str_fw = 'graphs %d and %d: %d.\n' % (idx1, idx2, nb_updated_list[0])
-            with open('results/gk_iam/all_pairs/nb_updates.txt', 'r+') as file:  # the file must already exist for mode 'r+'
- content = file.read()
- file.seek(0, 0)
- file.write(str_fw + content)
-
-
-
-def test_gkiam_2combination():
- from gk_iam import gk_iam_nearest_multi
- ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
- 'extra_params': {}} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-# Gn = Gn[0:50]
- remove_edges(Gn)
- gkernel = 'marginalizedkernel'
-
-    lmbda = 0.03 # termination probability
- r_max = 10 # iteration limit for pre-image.
- alpha_range = np.linspace(0.5, 0.5, 1)
- k = 20 # k nearest neighbors
- epsilon = 1e-6
-    ged_cost = 'CHEM_1'
-    ged_method = 'IPFP'
-    saveGXL = 'gedlib'
-    c_ei = 1
-    c_er = 1
-    c_es = 1
-
- # randomly select two molecules
- np.random.seed(1)
- idx_gi = [10, 11] # np.random.randint(0, len(Gn), 2)
- g1 = Gn[idx_gi[0]].copy()
- g2 = Gn[idx_gi[1]].copy()
-# Gn[10] = []
-# Gn[10] = []
-
-# nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True)
-# plt.savefig("results/random_preimage/mutag10.png", format="PNG")
-# plt.show()
-# nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True)
-# plt.savefig("results/random_preimage/mutag11.png", format="PNG")
-# plt.show()
-
- Gn_mix = [g.copy() for g in Gn]
- Gn_mix.append(g1.copy())
- Gn_mix.append(g2.copy())
-
- # compute
-# time0 = time.time()
-# km = compute_kernel(Gn_mix, gkernel, True)
-# time_km = time.time() - time0
-
- # write Gram matrix to file and read it.
-# np.savez('results/gram_matrix.gm', gm=km, gmtime=time_km)
- gmfile = np.load('results/gram_matrix.gm.npz')
- km = gmfile['gm']
- time_km = gmfile['gmtime']
-
- time_list = []
- dis_ks_min_list = []
- sod_gs_list = []
- sod_gs_min_list = []
- nb_updated_list = []
- g_best = []
- # for each alpha
- for alpha in alpha_range:
- print('\n-------------------------------------------------------\n')
- print('alpha =', alpha)
- time0 = time.time()
- dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn, [g1, g2],
- [alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max,
- gkernel, c_ei=c_ei, c_er=c_er, c_es=c_es, epsilon=epsilon,
- ged_cost=ged_cost, ged_method=ged_method, saveGXL=saveGXL)
- time_total = time.time() - time0 + time_km
- print('time: ', time_total)
- time_list.append(time_total)
- dis_ks_min_list.append(dhat)
- g_best.append(ghat_list)
- nb_updated_list.append(nb_updated)
-
- # show best graphs and save them to file.
- for idx, item in enumerate(alpha_range):
- print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx])
- print('one of the possible corresponding pre-images is')
- nx.draw(g_best[idx][0], labels=nx.get_node_attributes(g_best[idx][0], 'atom'),
- with_labels=True)
- plt.savefig('results/gk_iam/mutag_alpha' + str(item) + '.png', format="PNG")
- plt.show()
- print(g_best[idx][0].nodes(data=True))
- print(g_best[idx][0].edges(data=True))
-
-# for g in g_best[idx]:
-# draw_Letter_graph(g, savepath='results/gk_iam/')
-## nx.draw_networkx(g)
-## plt.show()
-# print(g.nodes(data=True))
-# print(g.edges(data=True))
-
- # compute the corresponding sod in graph space.
- for idx, item in enumerate(alpha_range):
-        sod_tmp, _ = ged_median(g_best[idx], [g1, g2], ged_cost=ged_cost,
- ged_method=ged_method, saveGXL=saveGXL)
- sod_gs_list.append(sod_tmp)
- sod_gs_min_list.append(np.min(sod_tmp))
-
- print('\nsods in graph space: ', sod_gs_list)
- print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list)
- print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list)
- print('\nnumber of updates for each alpha: ', nb_updated_list)
- print('\ntimes:', time_list)
-
-
-###############################################################################
-
-
-if __name__ == '__main__':
-###############################################################################
-# test on the combination of the two randomly chosen graphs. (the same as in the
-# random pre-image paper.)
-# test_gkiam_2combination()
-# test_gkiam_2combination_all_pairs()
-
-###############################################################################
-# tests on different numbers of median-sets.
- test_preimage_iam_median_nb()
-
-###############################################################################
-# tests on different values on grid of median-sets and k.
-# test_preimage_iam_grid_k_median_nb()
\ No newline at end of file
diff --git a/gklearn/preimage/test_preimage_mix.py b/gklearn/preimage/test_preimage_mix.py
deleted file mode 100644
index 888de86..0000000
--- a/gklearn/preimage/test_preimage_mix.py
+++ /dev/null
@@ -1,539 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Thu Sep 5 15:59:00 2019
-
-@author: ljia
-"""
-
-import numpy as np
-import networkx as nx
-import matplotlib.pyplot as plt
-import time
-import random
-#from tqdm import tqdm
-
-from gklearn.utils.graphfiles import loadDataset
-from gklearn.preimage.ged import ged_median
-from gklearn.preimage.utils import compute_kernel, get_same_item_indices, remove_edges
-from gklearn.preimage.preimage_iam import preimage_iam_random_mix
-
-###############################################################################
-# tests on different values on grid of median-sets and k.
-
-def test_preimage_mix_grid_k_median_nb():
- ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
- 'extra_params': {}} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-# Gn = Gn[0:50]
- remove_edges(Gn)
- gkernel = 'marginalizedkernel'
-
-    lmbda = 0.03 # termination probability
- r_max = 5 # iteration limit for pre-image.
- l_max = 500 # update limit for random generation
-# alpha_range = np.linspace(0.5, 0.5, 1)
-# k = 5 # k nearest neighbors
- epsilon = 1e-6
- InitIAMWithAllDk = True
- InitRandomWithAllDk = True
- # parameters for GED function
-    ged_cost = 'CHEM_1'
-    ged_method = 'IPFP'
-    saveGXL = 'gedlib'
-    # parameters for IAM function
-    c_ei = 1
-    c_er = 1
-    c_es = 1
- ite_max_iam = 50
- epsilon_iam = 0.001
- removeNodes = True
- connected_iam = False
-
-    # number of graphs; we want to compute the median of these graphs.
- nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
- # number of nearest neighbors.
- k_range = [5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 100]
-
- # find out all the graphs classified to positive group 1.
- idx_dict = get_same_item_indices(y_all)
- Gn = [Gn[i] for i in idx_dict[1]]
-
-# # compute Gram matrix.
-# time0 = time.time()
-# km = compute_kernel(Gn, gkernel, True)
-# time_km = time.time() - time0
-# # write Gram matrix to file.
-# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
-
-
- time_list = []
- dis_ks_min_list = []
- sod_gs_list = []
- sod_gs_min_list = []
- nb_updated_list_iam = []
- nb_updated_list_random = []
- nb_updated_k_list_iam = []
- nb_updated_k_list_random = []
- g_best = []
- for idx_nb, nb_median in enumerate(nb_median_range):
- print('\n-------------------------------------------------------')
- print('number of median graphs =', nb_median)
- random.seed(1)
- idx_rdm = random.sample(range(len(Gn)), nb_median)
- print('graphs chosen:', idx_rdm)
- Gn_median = [Gn[idx].copy() for idx in idx_rdm]
-
-# for g in Gn_median:
-# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
-## plt.savefig("results/preimage_mix/mutag.png", format="PNG")
-# plt.show()
-# plt.clf()
-
- ###################################################################
- gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
- km_tmp = gmfile['gm']
- time_km = gmfile['gmtime']
- # modify mixed gram matrix.
- km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
- for i in range(len(Gn)):
- for j in range(i, len(Gn)):
- km[i, j] = km_tmp[i, j]
- km[j, i] = km[i, j]
- for i in range(len(Gn)):
- for j, idx in enumerate(idx_rdm):
- km[i, len(Gn) + j] = km[i, idx]
- km[len(Gn) + j, i] = km[i, idx]
- for i, idx1 in enumerate(idx_rdm):
- for j, idx2 in enumerate(idx_rdm):
- km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]
-
- ###################################################################
- alpha_range = [1 / nb_median] * nb_median
-
- time_list.append([])
- dis_ks_min_list.append([])
- sod_gs_list.append([])
- sod_gs_min_list.append([])
- nb_updated_list_iam.append([])
- nb_updated_list_random.append([])
- nb_updated_k_list_iam.append([])
- nb_updated_k_list_random.append([])
- g_best.append([])
-
- for k in k_range:
- print('\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
- print('k =', k)
- time0 = time.time()
- dhat, ghat_list, dis_of_each_itr, nb_updated_iam, nb_updated_random, \
- nb_updated_k_iam, nb_updated_k_random = \
- preimage_iam_random_mix(Gn, Gn_median,
- alpha_range, range(len(Gn), len(Gn) + nb_median), km, k, r_max,
- l_max, gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk,
- InitRandomWithAllDk=InitRandomWithAllDk,
- params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es,
- 'ite_max': ite_max_iam, 'epsilon': epsilon_iam,
- 'removeNodes': removeNodes, 'connected': connected_iam},
- params_ged={'ged_cost': ged_cost, 'ged_method': ged_method,
- 'saveGXL': saveGXL})
-
- time_total = time.time() - time0 + time_km
- print('time: ', time_total)
- time_list[idx_nb].append(time_total)
- print('\nsmallest distance in kernel space: ', dhat)
- dis_ks_min_list[idx_nb].append(dhat)
- g_best[idx_nb].append(ghat_list)
- print('\nnumber of updates of the best graph by IAM: ', nb_updated_iam)
- nb_updated_list_iam[idx_nb].append(nb_updated_iam)
- print('\nnumber of updates of the best graph by random generation: ',
- nb_updated_random)
- nb_updated_list_random[idx_nb].append(nb_updated_random)
- print('\nnumber of updates of k nearest graphs by IAM: ', nb_updated_k_iam)
- nb_updated_k_list_iam[idx_nb].append(nb_updated_k_iam)
- print('\nnumber of updates of k nearest graphs by random generation: ',
- nb_updated_k_random)
- nb_updated_k_list_random[idx_nb].append(nb_updated_k_random)
-
- # show the best graph and save it to file.
- print('the shortest distance is', dhat)
- print('one of the possible corresponding pre-images is')
- nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'),
- with_labels=True)
- plt.savefig('results/preimage_mix/mutag_median_nb' + str(nb_median) +
- '_k' + str(k) + '.png', format="PNG")
- # plt.show()
- plt.clf()
- # print(ghat_list[0].nodes(data=True))
- # print(ghat_list[0].edges(data=True))
-
- # compute the corresponding sod in graph space.
- sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, ged_cost=ged_cost,
- ged_method=ged_method, saveGXL=saveGXL)
- sod_gs_list[idx_nb].append(sod_tmp)
- sod_gs_min_list[idx_nb].append(np.min(sod_tmp))
- print('\nsmallest sod in graph space: ', np.min(sod_tmp))
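The SOD (sum of distances) computed here is the graph-space counterpart of dhat: the total edit distance from the returned pre-image to every graph in the median set. In sketch form (assuming a pairwise GED function is available; the original obtains it through ged_median):

```python
def sod(candidate, median_set, ged):
    # Sum of graph edit distances from one candidate median to the set.
    return sum(ged(candidate, g) for g in median_set)
```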
-
- print('\nsods in graph space: ', sod_gs_list)
- print('\nsmallest sod in graph space for each set of median graphs and k: ',
- sod_gs_min_list)
- print('\nsmallest distance in kernel space for each set of median graphs and k: ',
- dis_ks_min_list)
- print('\nnumber of updates of the best graph for each set of median graphs and k by IAM: ',
- nb_updated_list_iam)
- print('\nnumber of updates of the best graph for each set of median graphs and k by random generation: ',
- nb_updated_list_random)
- print('\nnumber of updates of k nearest graphs for each set of median graphs and k by IAM: ',
- nb_updated_k_list_iam)
- print('\nnumber of updates of k nearest graphs for each set of median graphs and k by random generation: ',
- nb_updated_k_list_random)
- print('\ntimes:', time_list)
-
-
-
-
-###############################################################################
-# tests on different numbers of median-sets.
-
-def test_preimage_mix_median_nb():
- ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
- 'extra_params': {}} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-# Gn = Gn[0:50]
- remove_edges(Gn)
- gkernel = 'marginalizedkernel'
-
-    lmbda = 0.03 # termination probability
- r_max = 5 # iteration limit for pre-image.
- l_max = 500 # update limit for random generation
-# alpha_range = np.linspace(0.5, 0.5, 1)
- k = 5 # k nearest neighbors
- epsilon = 1e-6
- InitIAMWithAllDk = True
- InitRandomWithAllDk = True
- # parameters for GED function
- ged_cost='CHEM_1'
- ged_method='IPFP'
- saveGXL='gedlib'
- # parameters for IAM function
- c_ei=1
- c_er=1
- c_es=1
- ite_max_iam = 50
- epsilon_iam = 0.001
- removeNodes = True
- connected_iam = False
-
-    # number of graphs whose median we want to compute.
- nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
-
-    # find all the graphs classified into positive group 1.
- idx_dict = get_same_item_indices(y_all)
- Gn = [Gn[i] for i in idx_dict[1]]
-
-# # compute Gram matrix.
-# time0 = time.time()
-# km = compute_kernel(Gn, gkernel, True)
-# time_km = time.time() - time0
-# # write Gram matrix to file.
-# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
-
-
- time_list = []
- dis_ks_min_list = []
- sod_gs_list = []
- sod_gs_min_list = []
- nb_updated_list_iam = []
- nb_updated_list_random = []
- nb_updated_k_list_iam = []
- nb_updated_k_list_random = []
- g_best = []
- for nb_median in nb_median_range:
- print('\n-------------------------------------------------------')
- print('number of median graphs =', nb_median)
- random.seed(1)
- idx_rdm = random.sample(range(len(Gn)), nb_median)
- print('graphs chosen:', idx_rdm)
- Gn_median = [Gn[idx].copy() for idx in idx_rdm]
-
-# for g in Gn_median:
-# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
-## plt.savefig("results/preimage_mix/mutag.png", format="PNG")
-# plt.show()
-# plt.clf()
-
- ###################################################################
- gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
- km_tmp = gmfile['gm']
- time_km = gmfile['gmtime']
- # modify mixed gram matrix.
- km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
- for i in range(len(Gn)):
- for j in range(i, len(Gn)):
- km[i, j] = km_tmp[i, j]
- km[j, i] = km[i, j]
- for i in range(len(Gn)):
- for j, idx in enumerate(idx_rdm):
- km[i, len(Gn) + j] = km[i, idx]
- km[len(Gn) + j, i] = km[i, idx]
- for i, idx1 in enumerate(idx_rdm):
- for j, idx2 in enumerate(idx_rdm):
- km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]
-
- ###################################################################
- alpha_range = [1 / nb_median] * nb_median
- time0 = time.time()
- dhat, ghat_list, dis_of_each_itr, nb_updated_iam, nb_updated_random, \
- nb_updated_k_iam, nb_updated_k_random = \
- preimage_iam_random_mix(Gn, Gn_median,
- alpha_range, range(len(Gn), len(Gn) + nb_median), km, k, r_max,
- l_max, gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk,
- InitRandomWithAllDk=InitRandomWithAllDk,
- params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es,
- 'ite_max': ite_max_iam, 'epsilon': epsilon_iam,
- 'removeNodes': removeNodes, 'connected': connected_iam},
- params_ged={'ged_cost': ged_cost, 'ged_method': ged_method,
- 'saveGXL': saveGXL})
-
- time_total = time.time() - time0 + time_km
- print('time: ', time_total)
- time_list.append(time_total)
- print('\nsmallest distance in kernel space: ', dhat)
- dis_ks_min_list.append(dhat)
- g_best.append(ghat_list)
- print('\nnumber of updates of the best graph by IAM: ', nb_updated_iam)
- nb_updated_list_iam.append(nb_updated_iam)
- print('\nnumber of updates of the best graph by random generation: ',
- nb_updated_random)
- nb_updated_list_random.append(nb_updated_random)
- print('\nnumber of updates of k nearest graphs by IAM: ', nb_updated_k_iam)
- nb_updated_k_list_iam.append(nb_updated_k_iam)
- print('\nnumber of updates of k nearest graphs by random generation: ',
- nb_updated_k_random)
- nb_updated_k_list_random.append(nb_updated_k_random)
-
- # show the best graph and save it to file.
- print('the shortest distance is', dhat)
- print('one of the possible corresponding pre-images is')
- nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'),
- with_labels=True)
- plt.savefig('results/preimage_mix/mutag_median_nb' + str(nb_median) +
- '.png', format="PNG")
-# plt.show()
- plt.clf()
-# print(ghat_list[0].nodes(data=True))
-# print(ghat_list[0].edges(data=True))
-
- # compute the corresponding sod in graph space.
- sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, ged_cost=ged_cost,
- ged_method=ged_method, saveGXL=saveGXL)
- sod_gs_list.append(sod_tmp)
- sod_gs_min_list.append(np.min(sod_tmp))
- print('\nsmallest sod in graph space: ', np.min(sod_tmp))
-
- print('\nsods in graph space: ', sod_gs_list)
- print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list)
- print('\nsmallest distance in kernel space for each set of median graphs: ',
- dis_ks_min_list)
- print('\nnumber of updates of the best graph for each set of median graphs by IAM: ',
- nb_updated_list_iam)
- print('\nnumber of updates of the best graph for each set of median graphs by random generation: ',
- nb_updated_list_random)
- print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ',
- nb_updated_k_list_iam)
- print('\nnumber of updates of k nearest graphs for each set of median graphs by random generation: ',
- nb_updated_k_list_random)
- print('\ntimes:', time_list)
-
-
-
-###############################################################################
-# test on combinations of two randomly chosen graphs (the same setting as in
-# the random pre-image paper).
-
-def test_preimage_mix_2combination_all_pairs():
- ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
- 'extra_params': {}} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-# Gn = Gn[0:50]
- remove_edges(Gn)
- gkernel = 'marginalizedkernel'
-
-    lmbda = 0.03 # termination probability
- r_max = 10 # iteration limit for pre-image.
- l_max = 500 # update limit for random generation
- alpha_range = np.linspace(0.5, 0.5, 1)
- k = 5 # k nearest neighbors
- epsilon = 1e-6
- InitIAMWithAllDk = True
- InitRandomWithAllDk = True
- # parameters for GED function
- ged_cost='CHEM_1'
- ged_method='IPFP'
- saveGXL='gedlib'
- # parameters for IAM function
- c_ei=1
- c_er=1
- c_es=1
- ite_max_iam = 50
- epsilon_iam = 0.001
- removeNodes = True
- connected_iam = False
-
- nb_update_mat_iam = np.full((len(Gn), len(Gn)), np.inf)
- nb_update_mat_random = np.full((len(Gn), len(Gn)), np.inf)
- # test on each pair of graphs.
-# for idx1 in range(len(Gn) - 1, -1, -1):
-# for idx2 in range(idx1, -1, -1):
- for idx1 in range(187, 188):
- for idx2 in range(167, 168):
- g1 = Gn[idx1].copy()
- g2 = Gn[idx2].copy()
- # Gn[10] = []
-
- nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True)
- plt.savefig("results/preimage_mix/mutag187.png", format="PNG")
- plt.show()
- plt.clf()
- nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True)
- plt.savefig("results/preimage_mix/mutag167.png", format="PNG")
- plt.show()
- plt.clf()
-
- ###################################################################
-# Gn_mix = [g.copy() for g in Gn]
-# Gn_mix.append(g1.copy())
-# Gn_mix.append(g2.copy())
-#
-# # compute
-# time0 = time.time()
-# km = compute_kernel(Gn_mix, gkernel, True)
-# time_km = time.time() - time0
-#
-# # write Gram matrix to file and read it.
-# np.savez('results/gram_matrix_uhpath_itr7_pq0.8.gm', gm=km, gmtime=time_km)
-
- ###################################################################
- gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03.gm.npz')
- km = gmfile['gm']
- time_km = gmfile['gmtime']
- # modify mixed gram matrix.
- for i in range(len(Gn)):
- km[i, len(Gn)] = km[i, idx1]
- km[i, len(Gn) + 1] = km[i, idx2]
- km[len(Gn), i] = km[i, idx1]
- km[len(Gn) + 1, i] = km[i, idx2]
- km[len(Gn), len(Gn)] = km[idx1, idx1]
- km[len(Gn), len(Gn) + 1] = km[idx1, idx2]
- km[len(Gn) + 1, len(Gn)] = km[idx2, idx1]
- km[len(Gn) + 1, len(Gn) + 1] = km[idx2, idx2]
-
- ###################################################################
-# # use only the two graphs in median set as candidates.
-# Gn = [g1.copy(), g2.copy()]
-# Gn_mix = Gn + [g1.copy(), g2.copy()]
-# # compute
-# time0 = time.time()
-# km = compute_kernel(Gn_mix, gkernel, True)
-# time_km = time.time() - time0
-
-
- time_list = []
- dis_ks_min_list = []
- sod_gs_list = []
- sod_gs_min_list = []
- nb_updated_list_iam = []
- nb_updated_list_random = []
- nb_updated_k_list_iam = []
- nb_updated_k_list_random = []
- g_best = []
- # for each alpha
- for alpha in alpha_range:
- print('\n-------------------------------------------------------\n')
- print('alpha =', alpha)
- time0 = time.time()
- dhat, ghat_list, dis_of_each_itr, nb_updated_iam, nb_updated_random, \
- nb_updated_k_iam, nb_updated_k_random = \
- preimage_iam_random_mix(Gn, [g1, g2],
- [alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max,
- l_max, gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk,
- InitRandomWithAllDk=InitRandomWithAllDk,
- params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es,
- 'ite_max': ite_max_iam, 'epsilon': epsilon_iam,
- 'removeNodes': removeNodes, 'connected': connected_iam},
- params_ged={'ged_cost': ged_cost, 'ged_method': ged_method,
- 'saveGXL': saveGXL})
- time_total = time.time() - time0 + time_km
- print('time: ', time_total)
- time_list.append(time_total)
- dis_ks_min_list.append(dhat)
- g_best.append(ghat_list)
- nb_updated_list_iam.append(nb_updated_iam)
- nb_updated_list_random.append(nb_updated_random)
- nb_updated_k_list_iam.append(nb_updated_k_iam)
- nb_updated_k_list_random.append(nb_updated_k_random)
-
- # show best graphs and save them to file.
- for idx, item in enumerate(alpha_range):
- print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx])
- print('one of the possible corresponding pre-images is')
- nx.draw(g_best[idx][0], labels=nx.get_node_attributes(g_best[idx][0], 'atom'),
- with_labels=True)
- plt.savefig('results/preimage_mix/mutag' + str(idx1) + '_' + str(idx2)
- + '_alpha' + str(item) + '.png', format="PNG")
-# plt.show()
- plt.clf()
-# print(g_best[idx][0].nodes(data=True))
-# print(g_best[idx][0].edges(data=True))
-
- # for g in g_best[idx]:
- # draw_Letter_graph(g, savepath='results/gk_iam/')
- ## nx.draw_networkx(g)
- ## plt.show()
- # print(g.nodes(data=True))
- # print(g.edges(data=True))
-
- # compute the corresponding sod in graph space.
- for idx, item in enumerate(alpha_range):
-            sod_tmp, _ = ged_median([g_best[idx][0]], [g1, g2], ged_cost=ged_cost,
-                                    ged_method=ged_method, saveGXL=saveGXL)
- sod_gs_list.append(sod_tmp)
- sod_gs_min_list.append(np.min(sod_tmp))
-
- print('\nsods in graph space: ', sod_gs_list)
- print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list)
- print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list)
- print('\nnumber of updates of the best graph for each alpha by IAM: ', nb_updated_list_iam)
- print('\nnumber of updates of the best graph for each alpha by random generation: ',
- nb_updated_list_random)
- print('\nnumber of updates of k nearest graphs for each alpha by IAM: ',
- nb_updated_k_list_iam)
- print('\nnumber of updates of k nearest graphs for each alpha by random generation: ',
- nb_updated_k_list_random)
- print('\ntimes:', time_list)
- nb_update_mat_iam[idx1, idx2] = nb_updated_list_iam[0]
- nb_update_mat_random[idx1, idx2] = nb_updated_list_random[0]
-
- str_fw = 'graphs %d and %d: %d times by IAM, %d times by random generation.\n' \
- % (idx1, idx2, nb_updated_list_iam[0], nb_updated_list_random[0])
- with open('results/preimage_mix/nb_updates.txt', 'r+') as file:
- content = file.read()
- file.seek(0, 0)
- file.write(str_fw + content)
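The read/seek/write sequence above prepends the newest summary line to nb_updates.txt; note that mode 'r+' fails if the file does not exist yet. A sketch of the same idiom that also tolerates a missing file (an assumption, not the original behaviour):

```python
import os

def prepend_line(path, line):
    # Read the old content (if any), then rewrite the file with the new
    # line in front, mirroring the file.seek(0, 0) idiom above.
    content = ''
    if os.path.exists(path):
        with open(path) as f:
            content = f.read()
    with open(path, 'w') as f:
        f.write(line + content)
```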
-
-###############################################################################
-
-
-if __name__ == '__main__':
-###############################################################################
-# test on combinations of two randomly chosen graphs (the same setting as in
-# the random pre-image paper).
-# test_preimage_mix_2combination_all_pairs()
-
-###############################################################################
-# tests on different numbers of median-sets.
-# test_preimage_mix_median_nb()
-
-###############################################################################
-# tests on a grid of median-set sizes and k values.
- test_preimage_mix_grid_k_median_nb()
\ No newline at end of file
diff --git a/gklearn/preimage/test_preimage_random.py b/gklearn/preimage/test_preimage_random.py
deleted file mode 100644
index bb77d2f..0000000
--- a/gklearn/preimage/test_preimage_random.py
+++ /dev/null
@@ -1,398 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Thu Sep 5 15:59:00 2019
-
-@author: ljia
-"""
-
-import numpy as np
-import networkx as nx
-import matplotlib.pyplot as plt
-import time
-import random
-#from tqdm import tqdm
-
-from gklearn.utils.graphfiles import loadDataset
-from gklearn.preimage.preimage_random import preimage_random
-from gklearn.preimage.ged import ged_median
-from gklearn.preimage.utils import compute_kernel, get_same_item_indices, remove_edges
-
-
-###############################################################################
-# tests on a grid of median-set sizes and k values.
-
-def test_preimage_random_grid_k_median_nb():
- ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
- 'extra_params': {}} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-# Gn = Gn[0:50]
- remove_edges(Gn)
- gkernel = 'marginalizedkernel'
-
-    lmbda = 0.03 # termination probability
- r_max = 5 # iteration limit for pre-image.
- l = 500 # update limit for random generation
-# alpha_range = np.linspace(0.5, 0.5, 1)
-# k = 5 # k nearest neighbors
- # parameters for GED function
- ged_cost='CHEM_1'
- ged_method='IPFP'
- saveGXL='gedlib'
-
-    # number of graphs whose median we want to compute.
- nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
- # number of nearest neighbors.
- k_range = [5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 100]
-
-    # find all the graphs classified into positive group 1.
- idx_dict = get_same_item_indices(y_all)
- Gn = [Gn[i] for i in idx_dict[1]]
-
-# # compute Gram matrix.
-# time0 = time.time()
-# km = compute_kernel(Gn, gkernel, True)
-# time_km = time.time() - time0
-# # write Gram matrix to file.
-# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
-
-
- time_list = []
- dis_ks_min_list = []
- sod_gs_list = []
- sod_gs_min_list = []
- nb_updated_list = []
- g_best = []
- for idx_nb, nb_median in enumerate(nb_median_range):
- print('\n-------------------------------------------------------')
- print('number of median graphs =', nb_median)
- random.seed(1)
- idx_rdm = random.sample(range(len(Gn)), nb_median)
- print('graphs chosen:', idx_rdm)
- Gn_median = [Gn[idx].copy() for idx in idx_rdm]
-
-# for g in Gn_median:
-# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
-## plt.savefig("results/preimage_mix/mutag.png", format="PNG")
-# plt.show()
-# plt.clf()
-
- ###################################################################
- gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
- km_tmp = gmfile['gm']
- time_km = gmfile['gmtime']
- # modify mixed gram matrix.
- km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
- for i in range(len(Gn)):
- for j in range(i, len(Gn)):
- km[i, j] = km_tmp[i, j]
- km[j, i] = km[i, j]
- for i in range(len(Gn)):
- for j, idx in enumerate(idx_rdm):
- km[i, len(Gn) + j] = km[i, idx]
- km[len(Gn) + j, i] = km[i, idx]
- for i, idx1 in enumerate(idx_rdm):
- for j, idx2 in enumerate(idx_rdm):
- km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]
-
- ###################################################################
- alpha_range = [1 / nb_median] * nb_median
-
- time_list.append([])
- dis_ks_min_list.append([])
- sod_gs_list.append([])
- sod_gs_min_list.append([])
- nb_updated_list.append([])
- g_best.append([])
-
- for k in k_range:
- print('\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
- print('k =', k)
- time0 = time.time()
- dhat, ghat, nb_updated = preimage_random(Gn, Gn_median, alpha_range,
- range(len(Gn), len(Gn) + nb_median), km, k, r_max, l, gkernel)
-
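For orientation, the random generation bounded by l can be pictured as a hill-descent over random edits. The schematic below is a deliberate simplification for illustration, not the actual preimage_random implementation; g_init, random_edit and dist are assumed callables:

```python
def random_search(g_init, random_edit, dist, l=500):
    # Apply up to l random modifications, accepting only those that reduce
    # the kernel-space distance; the improvement count mirrors nb_updated.
    g_best, d_best, nb_updated = g_init, dist(g_init), 0
    for _ in range(l):
        g_new = random_edit(g_best)
        d_new = dist(g_new)
        if d_new < d_best:
            g_best, d_best, nb_updated = g_new, d_new, nb_updated + 1
    return d_best, g_best, nb_updated
```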
- time_total = time.time() - time0 + time_km
- print('time: ', time_total)
- time_list[idx_nb].append(time_total)
- print('\nsmallest distance in kernel space: ', dhat)
- dis_ks_min_list[idx_nb].append(dhat)
- g_best[idx_nb].append(ghat)
- print('\nnumber of updates of the best graph: ', nb_updated)
- nb_updated_list[idx_nb].append(nb_updated)
-
- # show the best graph and save it to file.
- print('the shortest distance is', dhat)
- print('one of the possible corresponding pre-images is')
- nx.draw(ghat, labels=nx.get_node_attributes(ghat, 'atom'),
- with_labels=True)
- plt.savefig('results/preimage_random/mutag_median_nb' + str(nb_median) +
- '_k' + str(k) + '.png', format="PNG")
- # plt.show()
- plt.clf()
- # print(ghat_list[0].nodes(data=True))
- # print(ghat_list[0].edges(data=True))
-
- # compute the corresponding sod in graph space.
- sod_tmp, _ = ged_median([ghat], Gn_median, ged_cost=ged_cost,
- ged_method=ged_method, saveGXL=saveGXL)
- sod_gs_list[idx_nb].append(sod_tmp)
- sod_gs_min_list[idx_nb].append(np.min(sod_tmp))
- print('\nsmallest sod in graph space: ', np.min(sod_tmp))
-
- print('\nsods in graph space: ', sod_gs_list)
- print('\nsmallest sod in graph space for each set of median graphs and k: ',
- sod_gs_min_list)
- print('\nsmallest distance in kernel space for each set of median graphs and k: ',
- dis_ks_min_list)
- print('\nnumber of updates of the best graph for each set of median graphs and k by IAM: ',
- nb_updated_list)
- print('\ntimes:', time_list)
-
-
-
-
-###############################################################################
-# tests on different numbers of median-sets.
-
-def test_preimage_random_median_nb():
- ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
- 'extra_params': {}} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-# Gn = Gn[0:50]
- remove_edges(Gn)
- gkernel = 'marginalizedkernel'
-
-    lmbda = 0.03 # termination probability
- r_max = 5 # iteration limit for pre-image.
- l = 500 # update limit for random generation
-# alpha_range = np.linspace(0.5, 0.5, 1)
- k = 5 # k nearest neighbors
- # parameters for GED function
- ged_cost='CHEM_1'
- ged_method='IPFP'
- saveGXL='gedlib'
-
-    # number of graphs whose median we want to compute.
- nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
-
-    # find all the graphs classified into positive group 1.
- idx_dict = get_same_item_indices(y_all)
- Gn = [Gn[i] for i in idx_dict[1]]
-
-# # compute Gram matrix.
-# time0 = time.time()
-# km = compute_kernel(Gn, gkernel, True)
-# time_km = time.time() - time0
-# # write Gram matrix to file.
-# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
-
-
- time_list = []
- dis_ks_min_list = []
- sod_gs_list = []
- sod_gs_min_list = []
- nb_updated_list = []
- g_best = []
- for nb_median in nb_median_range:
- print('\n-------------------------------------------------------')
- print('number of median graphs =', nb_median)
- random.seed(1)
- idx_rdm = random.sample(range(len(Gn)), nb_median)
- print('graphs chosen:', idx_rdm)
- Gn_median = [Gn[idx].copy() for idx in idx_rdm]
-
-# for g in Gn_median:
-# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
-## plt.savefig("results/preimage_mix/mutag.png", format="PNG")
-# plt.show()
-# plt.clf()
-
- ###################################################################
- gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
- km_tmp = gmfile['gm']
- time_km = gmfile['gmtime']
- # modify mixed gram matrix.
- km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
- for i in range(len(Gn)):
- for j in range(i, len(Gn)):
- km[i, j] = km_tmp[i, j]
- km[j, i] = km[i, j]
- for i in range(len(Gn)):
- for j, idx in enumerate(idx_rdm):
- km[i, len(Gn) + j] = km[i, idx]
- km[len(Gn) + j, i] = km[i, idx]
- for i, idx1 in enumerate(idx_rdm):
- for j, idx2 in enumerate(idx_rdm):
- km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]
-
- ###################################################################
- alpha_range = [1 / nb_median] * nb_median
- time0 = time.time()
- dhat, ghat, nb_updated = preimage_random(Gn, Gn_median, alpha_range,
- range(len(Gn), len(Gn) + nb_median), km, k, r_max, l, gkernel)
-
- time_total = time.time() - time0 + time_km
- print('time: ', time_total)
- time_list.append(time_total)
- print('\nsmallest distance in kernel space: ', dhat)
- dis_ks_min_list.append(dhat)
- g_best.append(ghat)
- print('\nnumber of updates of the best graph: ', nb_updated)
- nb_updated_list.append(nb_updated)
-
- # show the best graph and save it to file.
- print('the shortest distance is', dhat)
- print('one of the possible corresponding pre-images is')
- nx.draw(ghat, labels=nx.get_node_attributes(ghat, 'atom'),
- with_labels=True)
- plt.savefig('results/preimage_random/mutag_median_nb' + str(nb_median) +
- '.png', format="PNG")
-# plt.show()
- plt.clf()
-# print(ghat_list[0].nodes(data=True))
-# print(ghat_list[0].edges(data=True))
-
- # compute the corresponding sod in graph space.
- sod_tmp, _ = ged_median([ghat], Gn_median, ged_cost=ged_cost,
- ged_method=ged_method, saveGXL=saveGXL)
- sod_gs_list.append(sod_tmp)
- sod_gs_min_list.append(np.min(sod_tmp))
- print('\nsmallest sod in graph space: ', np.min(sod_tmp))
-
- print('\nsods in graph space: ', sod_gs_list)
- print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list)
- print('\nsmallest distance in kernel space for each set of median graphs: ',
- dis_ks_min_list)
- print('\nnumber of updates of the best graph for each set of median graphs: ',
- nb_updated_list)
- print('\ntimes:', time_list)
-
-
-
-###############################################################################
-# test on combinations of two randomly chosen graphs (the same setting as in
-# the random pre-image paper).
-
-def test_random_preimage_2combination():
- ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
- 'extra_params': {}} # node/edge symb
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
-# Gn = Gn[0:12]
- remove_edges(Gn)
- gkernel = 'marginalizedkernel'
-
-# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, gkernel=gkernel)
-# print(dis_max, dis_min, dis_mean)
-
-    lmbda = 0.03 # termination probability
- r_max = 10 # iteration limit for pre-image.
-    l = 500 # update limit for random generation
- alpha_range = np.linspace(0, 1, 11)
- k = 5 # k nearest neighbors
-
- # randomly select two molecules
- np.random.seed(1)
- idx_gi = [187, 167] # np.random.randint(0, len(Gn), 2)
- g1 = Gn[idx_gi[0]].copy()
- g2 = Gn[idx_gi[1]].copy()
-
-# nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True)
-# plt.savefig("results/random_preimage/mutag10.png", format="PNG")
-# plt.show()
-# nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True)
-# plt.savefig("results/random_preimage/mutag11.png", format="PNG")
-# plt.show()
-
- ######################################################################
-# Gn_mix = [g.copy() for g in Gn]
-# Gn_mix.append(g1.copy())
-# Gn_mix.append(g2.copy())
-#
-## g_tmp = iam([g1, g2])
-## nx.draw_networkx(g_tmp)
-## plt.show()
-#
-# # compute
-# time0 = time.time()
-# km = compute_kernel(Gn_mix, gkernel, True)
-# time_km = time.time() - time0
-
- ###################################################################
- idx1 = idx_gi[0]
- idx2 = idx_gi[1]
- gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03.gm.npz')
- km = gmfile['gm']
- time_km = gmfile['gmtime']
- # modify mixed gram matrix.
- for i in range(len(Gn)):
- km[i, len(Gn)] = km[i, idx1]
- km[i, len(Gn) + 1] = km[i, idx2]
- km[len(Gn), i] = km[i, idx1]
- km[len(Gn) + 1, i] = km[i, idx2]
- km[len(Gn), len(Gn)] = km[idx1, idx1]
- km[len(Gn), len(Gn) + 1] = km[idx1, idx2]
- km[len(Gn) + 1, len(Gn)] = km[idx2, idx1]
- km[len(Gn) + 1, len(Gn) + 1] = km[idx2, idx2]
-
- ###################################################################
-
- time_list = []
- nb_updated_list = []
- g_best = []
- dis_ks_min_list = []
- # for each alpha
- for alpha in alpha_range:
- print('\n-------------------------------------------------------\n')
- print('alpha =', alpha)
- time0 = time.time()
- dhat, ghat, nb_updated = preimage_random(Gn, [g1, g2], [alpha, 1 - alpha],
- range(len(Gn), len(Gn) + 2), km,
- k, r_max, l, gkernel)
- time_total = time.time() - time0 + time_km
- print('time: ', time_total)
- time_list.append(time_total)
- dis_ks_min_list.append(dhat)
- g_best.append(ghat)
- nb_updated_list.append(nb_updated)
-
- # show best graphs and save them to file.
- for idx, item in enumerate(alpha_range):
- print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx])
- print('one of the possible corresponding pre-images is')
- nx.draw(g_best[idx], labels=nx.get_node_attributes(g_best[idx], 'atom'),
- with_labels=True)
-        plt.savefig('results/random_preimage/mutag_alpha' + str(item) + '.png', format="PNG")
-        plt.show()  # show after saving; show() may clear the current figure
- plt.clf()
- print(g_best[idx].nodes(data=True))
- print(g_best[idx].edges(data=True))
-
-# # compute the corresponding sod in graph space. (alpha range not considered.)
-# sod_tmp, _ = median_distance(g_best[0], Gn_let)
-# sod_gs_list.append(sod_tmp)
-# sod_gs_min_list.append(np.min(sod_tmp))
-# sod_ks_min_list.append(sod_ks)
-# nb_updated_list.append(nb_updated)
-
-# print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list)
- print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list)
- print('\nnumber of updates for each alpha: ', nb_updated_list)
- print('\ntimes:', time_list)
-
-###############################################################################
-
-
-if __name__ == '__main__':
-###############################################################################
-# test on combinations of two randomly chosen graphs (the same setting as in
-# the random pre-image paper).
-# test_random_preimage_2combination()
-
-###############################################################################
-# tests all algorithms on different numbers of median-sets.
- test_preimage_random_median_nb()
-
-###############################################################################
-# tests all algorithms on a grid of median-set sizes and k values.
-# test_preimage_random_grid_k_median_nb()
\ No newline at end of file
diff --git a/gklearn/preimage/xp_fit_method.py b/gklearn/preimage/xp_fit_method.py
deleted file mode 100644
index ead2786..0000000
--- a/gklearn/preimage/xp_fit_method.py
+++ /dev/null
@@ -1,935 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Tue Jan 14 15:39:29 2020
-
-@author: ljia
-"""
-import numpy as np
-import random
-import csv
-from shutil import copyfile
-import networkx as nx
-import matplotlib.pyplot as plt
-import os
-import time
-
-from gklearn.utils.graphfiles import loadDataset, loadGXL, saveGXL
-from gklearn.preimage.test_k_closest_graphs import median_on_k_closest_graphs, reform_attributes
-from gklearn.preimage.utils import get_same_item_indices, kernel_distance_matrix, compute_kernel
-from gklearn.preimage.find_best_k import getRelations
-
-
-def get_dataset(ds_name):
- if ds_name == 'Letter-high': # node non-symb
- dataset = 'cpp_ext/data/collections/Letter.xml'
- graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'
- Gn, y_all = loadDataset(dataset, extra_params=graph_dir)
- for G in Gn:
- reform_attributes(G, na_names=['x', 'y'])
- G.graph['node_labels'] = []
- G.graph['edge_labels'] = []
- G.graph['node_attrs'] = ['x', 'y']
- G.graph['edge_attrs'] = []
- elif ds_name == 'Letter-med': # node non-symb
- dataset = 'cpp_ext/data/collections/Letter.xml'
- graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/MED/'
- Gn, y_all = loadDataset(dataset, extra_params=graph_dir)
- for G in Gn:
- reform_attributes(G, na_names=['x', 'y'])
- G.graph['node_labels'] = []
- G.graph['edge_labels'] = []
- G.graph['node_attrs'] = ['x', 'y']
- G.graph['edge_attrs'] = []
- elif ds_name == 'Letter-low': # node non-symb
- dataset = 'cpp_ext/data/collections/Letter.xml'
- graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/LOW/'
- Gn, y_all = loadDataset(dataset, extra_params=graph_dir)
- for G in Gn:
- reform_attributes(G, na_names=['x', 'y'])
- G.graph['node_labels'] = []
- G.graph['edge_labels'] = []
- G.graph['node_attrs'] = ['x', 'y']
- G.graph['edge_attrs'] = []
- elif ds_name == 'Fingerprint':
-# dataset = 'cpp_ext/data/collections/Fingerprint.xml'
-# graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/Fingerprint/node_attrs/'
-# Gn, y_all = loadDataset(dataset, extra_params=graph_dir)
-# for G in Gn:
-# reform_attributes(G)
- dataset = '../../datasets/Fingerprint/Fingerprint_A.txt'
- graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/Fingerprint/node_attrs/'
- Gn, y_all = loadDataset(dataset)
- elif ds_name == 'SYNTHETIC':
- pass
- elif ds_name == 'SYNTHETICnew':
- dataset = '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt'
- graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/SYNTHETICnew'
-# dataset = '../../datasets/Letter-high/Letter-high_A.txt'
-# graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'
- Gn, y_all = loadDataset(dataset)
- elif ds_name == 'Synthie':
- pass
- elif ds_name == 'COIL-DEL':
- dataset = '../../datasets/COIL-DEL/COIL-DEL_A.txt'
- graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/COIL-DEL/'
- Gn, y_all = loadDataset(dataset)
- elif ds_name == 'COIL-RAG':
- pass
- elif ds_name == 'COLORS-3':
- pass
- elif ds_name == 'FRANKENSTEIN':
- pass
-
- return Gn, y_all, graph_dir
-
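The three Letter-* branches above differ only in the HIGH/MED/LOW subdirectory. A parametrized helper (a possible refactoring, not part of the original module) would remove the duplication:

```python
import os
from gklearn.utils.graphfiles import loadDataset
from gklearn.preimage.test_k_closest_graphs import reform_attributes

def get_letter_dataset(level):
    # level is one of 'HIGH', 'MED', 'LOW'.
    dataset = 'cpp_ext/data/collections/Letter.xml'
    graph_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             'cpp_ext/data/datasets/Letter', level)
    Gn, y_all = loadDataset(dataset, extra_params=graph_dir)
    for G in Gn:
        reform_attributes(G, na_names=['x', 'y'])
        G.graph['node_labels'], G.graph['edge_labels'] = [], []
        G.graph['node_attrs'], G.graph['edge_attrs'] = ['x', 'y'], []
    return Gn, y_all, graph_dir
```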
-
-def init_output_file(ds_name, gkernel, fit_method, dir_output):
-# fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
- fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv'
- f_detail = open(dir_output + fn_output_detail, 'a')
- csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'edit cost',
- 'GED method', 'attr distance', 'fit method', 'k',
- 'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
- 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
- 'dis_k gi -> GM', 'fitting time', 'generating time', 'total time',
- 'median set'])
- f_detail.close()
-
-# fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
- fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.csv'
- f_summary = open(dir_output + fn_output_summary, 'a')
- csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'edit cost',
- 'GED method', 'attr distance', 'fit method', 'k',
- 'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
- 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
- 'dis_k gi -> GM', 'fitting time', 'generating time', 'total time',
- '# SOD SM -> GM', '# dis_k SM -> GM',
- '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM',
- 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
- 'repeats better dis_k gi -> GM'])
- f_summary.close()
-
- return fn_output_detail, fn_output_summary
-
-
-def xp_fit_method_for_non_symbolic(parameters, save_results=True, initial_solutions=1,
- Gn_data=None, k_dis_data=None, Kmatrix=None,
- is_separate=False):
-
- # 1. set parameters.
- print('1. setting parameters...')
- ds_name = parameters['ds_name']
- gkernel = parameters['gkernel']
- edit_cost_name = parameters['edit_cost_name']
- ged_method = parameters['ged_method']
- attr_distance = parameters['attr_distance']
- fit_method = parameters['fit_method']
- init_ecc = parameters['init_ecc']
-
- node_label = None
- edge_label = None
- dir_output = 'results/xp_fit_method/'
-
-
- # 2. get dataset.
- print('2. getting dataset...')
- if Gn_data is None:
- Gn, y_all, graph_dir = get_dataset(ds_name)
- else:
- Gn = Gn_data[0]
- y_all = Gn_data[1]
- graph_dir = Gn_data[2]
-
-
- # 3. compute kernel distance matrix.
- print('3. computing kernel distance matrix...')
- if k_dis_data is None:
- dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None,
- None, Kmatrix=Kmatrix, gkernel=gkernel)
- else:
-# dis_mat = k_dis_data[0]
-# dis_max = k_dis_data[1]
-# dis_min = k_dis_data[2]
-# dis_mean = k_dis_data[3]
-# print('pair distances - dis_max, dis_min, dis_mean:', dis_max, dis_min, dis_mean)
- pass
-
-
- if save_results:
- # create result files.
- print('creating output files...')
- fn_output_detail, fn_output_summary = init_output_file(ds_name, gkernel,
- fit_method, dir_output)
-
-
- # start repeats.
- repeats = 1
-# k_list = range(2, 11)
- k_list = [0]
- # get indices by classes.
- y_idx = get_same_item_indices(y_all)
- random.seed(1)
- rdn_seed_list = random.sample(range(0, repeats * 100), repeats)
-
- for k in k_list:
-# print('\n--------- k =', k, '----------')
-
- sod_sm_mean_list = []
- sod_gm_mean_list = []
- dis_k_sm_mean_list = []
- dis_k_gm_mean_list = []
- dis_k_gi_min_mean_list = []
- time_fitting_mean_list = []
- time_generating_mean_list = []
- time_total_mean_list = []
-
-        # 4. start generating and computing over targets.
-        print('4. starting generation and computation over targets...')
- for i, (y, values) in enumerate(y_idx.items()):
-# y = 'I'
-# values = y_idx[y]
-# values = values[0:10]
- print('\ny =', y)
-# if y.strip() == 'A':
-# continue
-
- k = len(values)
- print('\n--------- k =', k, '----------')
-
- if k < 2:
- print('\nk = ', k, ', skip.\n')
- continue
-
- sod_sm_list = []
- sod_gm_list = []
- dis_k_sm_list = []
- dis_k_gm_list = []
- dis_k_gi_min_list = []
- time_fitting_list = []
- time_generating_list = []
- time_total_list = []
- nb_sod_sm2gm = [0, 0, 0]
- nb_dis_k_sm2gm = [0, 0, 0]
- nb_dis_k_gi2sm = [0, 0, 0]
- nb_dis_k_gi2gm = [0, 0, 0]
- repeats_better_sod_sm2gm = []
- repeats_better_dis_k_sm2gm = []
- repeats_better_dis_k_gi2sm = []
- repeats_better_dis_k_gi2gm = []
-
- # get Gram matrix for this part of data.
- if Kmatrix is not None:
- if is_separate:
- Kmatrix_sub = Kmatrix[i].copy()
- else:
- Kmatrix_sub = Kmatrix[values,:]
- Kmatrix_sub = Kmatrix_sub[:,values]
- else:
- Kmatrix_sub = None
-
- for repeat in range(repeats):
- print('\nrepeat =', repeat)
- random.seed(rdn_seed_list[repeat])
- median_set_idx_idx = random.sample(range(0, len(values)), k)
- median_set_idx = [values[idx] for idx in median_set_idx_idx]
- print('median set: ', median_set_idx)
- Gn_median = [Gn[g] for g in values]
-# from notebooks.utils.plot_all_graphs import draw_Fingerprint_graph
-# for Gn in Gn_median:
-# draw_Fingerprint_graph(Gn, save=None)
-
- # GENERATING & COMPUTING!!
- res_sods, res_dis_ks, res_times = median_on_k_closest_graphs(Gn_median,
- node_label, edge_label,
- gkernel, k, fit_method=fit_method, graph_dir=graph_dir,
- edit_cost_constants=None, group_min=median_set_idx_idx,
- dataset=ds_name, initial_solutions=initial_solutions,
- edit_cost_name=edit_cost_name, init_ecc=init_ecc,
- Kmatrix=Kmatrix_sub, parallel=False)
- sod_sm = res_sods[0]
- sod_gm = res_sods[1]
- dis_k_sm = res_dis_ks[0]
- dis_k_gm = res_dis_ks[1]
- dis_k_gi = res_dis_ks[2]
- dis_k_gi_min = res_dis_ks[3]
- idx_dis_k_gi_min = res_dis_ks[4]
- time_fitting = res_times[0]
- time_generating = res_times[1]
-
- # write result detail.
- sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
- dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
- dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
- dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
- if save_results:
- f_detail = open(dir_output + fn_output_detail, 'a')
- csv.writer(f_detail).writerow([ds_name, gkernel,
- edit_cost_name, ged_method, attr_distance,
- fit_method, k, y, repeat,
- sod_sm, sod_gm, dis_k_sm, dis_k_gm,
- dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
- dis_k_gi2gm, time_fitting, time_generating,
- time_fitting + time_generating, median_set_idx])
- f_detail.close()
-
- # compute result summary.
- sod_sm_list.append(sod_sm)
- sod_gm_list.append(sod_gm)
- dis_k_sm_list.append(dis_k_sm)
- dis_k_gm_list.append(dis_k_gm)
- dis_k_gi_min_list.append(dis_k_gi_min)
- time_fitting_list.append(time_fitting)
- time_generating_list.append(time_generating)
- time_total_list.append(time_fitting + time_generating)
- # # SOD SM -> GM
- if sod_sm > sod_gm:
- nb_sod_sm2gm[0] += 1
- repeats_better_sod_sm2gm.append(repeat)
- elif sod_sm == sod_gm:
- nb_sod_sm2gm[1] += 1
- elif sod_sm < sod_gm:
- nb_sod_sm2gm[2] += 1
- # # dis_k SM -> GM
- if dis_k_sm > dis_k_gm:
- nb_dis_k_sm2gm[0] += 1
- repeats_better_dis_k_sm2gm.append(repeat)
- elif dis_k_sm == dis_k_gm:
- nb_dis_k_sm2gm[1] += 1
- elif dis_k_sm < dis_k_gm:
- nb_dis_k_sm2gm[2] += 1
- # # dis_k gi -> SM
- if dis_k_gi_min > dis_k_sm:
- nb_dis_k_gi2sm[0] += 1
- repeats_better_dis_k_gi2sm.append(repeat)
- elif dis_k_gi_min == dis_k_sm:
- nb_dis_k_gi2sm[1] += 1
- elif dis_k_gi_min < dis_k_sm:
- nb_dis_k_gi2sm[2] += 1
- # # dis_k gi -> GM
- if dis_k_gi_min > dis_k_gm:
- nb_dis_k_gi2gm[0] += 1
- repeats_better_dis_k_gi2gm.append(repeat)
- elif dis_k_gi_min == dis_k_gm:
- nb_dis_k_gi2gm[1] += 1
- elif dis_k_gi_min < dis_k_gm:
- nb_dis_k_gi2gm[2] += 1
-
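Each nb_* triple above counts [better, equal, worse] outcomes of one pairwise comparison, and the four blocks follow a single pattern. Condensed here for clarity (an illustrative helper, not original code):

```python
import numpy as np

def tally(first, second, counts, repeats_better, repeat):
    # counts is a 3-element list: [first > second, equal, first < second];
    # repeats_better records which repeats improved.
    s = np.sign(first - second)
    if s > 0:
        counts[0] += 1
        repeats_better.append(repeat)
    elif s == 0:
        counts[1] += 1
    else:
        counts[2] += 1
```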
- # save median graphs.
- fname_sm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/set_median.gxl'
- fn_pre_sm_new = dir_output + 'medians/set_median.' + fit_method \
- + '.k' + str(int(k)) + '.y' + str(y) + '.repeat' + str(repeat)
- copyfile(fname_sm, fn_pre_sm_new + '.gxl')
- fname_gm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/gen_median.gxl'
- fn_pre_gm_new = dir_output + 'medians/gen_median.' + fit_method \
- + '.k' + str(int(k)) + '.y' + str(y) + '.repeat' + str(repeat)
- copyfile(fname_gm, fn_pre_gm_new + '.gxl')
- G_best_kernel = Gn_median[idx_dis_k_gi_min].copy()
-# reform_attributes(G_best_kernel)
- fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.' + fit_method \
- + '.k' + str(int(k)) + '.y' + str(y) + '.repeat' + str(repeat)
- saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='default')
-
- # plot median graphs.
-                if ds_name in ('Letter-high', 'Letter-med', 'Letter-low'):
- set_median = loadGXL(fn_pre_sm_new + '.gxl')
- gen_median = loadGXL(fn_pre_gm_new + '.gxl')
- draw_Letter_graph(set_median, fn_pre_sm_new)
- draw_Letter_graph(gen_median, fn_pre_gm_new)
- draw_Letter_graph(G_best_kernel, fn_pre_g_best_kernel)
-
-            # write result summary for each class.
- sod_sm_mean_list.append(np.mean(sod_sm_list))
- sod_gm_mean_list.append(np.mean(sod_gm_list))
- dis_k_sm_mean_list.append(np.mean(dis_k_sm_list))
- dis_k_gm_mean_list.append(np.mean(dis_k_gm_list))
- dis_k_gi_min_mean_list.append(np.mean(dis_k_gi_min_list))
- time_fitting_mean_list.append(np.mean(time_fitting_list))
- time_generating_mean_list.append(np.mean(time_generating_list))
- time_total_mean_list.append(np.mean(time_total_list))
- sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean_list[-1] - sod_sm_mean_list[-1]))
- dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1]))
- dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
- dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
- if save_results:
- f_summary = open(dir_output + fn_output_summary, 'a')
- csv.writer(f_summary).writerow([ds_name, gkernel,
- edit_cost_name, ged_method, attr_distance,
- fit_method, k, y,
- sod_sm_mean_list[-1], sod_gm_mean_list[-1],
- dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1],
- dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean,
- dis_k_gi2sm_mean, dis_k_gi2gm_mean,
- time_fitting_mean_list[-1], time_generating_mean_list[-1],
- time_total_mean_list[-1], nb_sod_sm2gm,
- nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
- repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
- repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])
- f_summary.close()
-
-
-        # write overall result summary across all classes.
-        sod_sm_mean = np.mean(sod_sm_mean_list)
-        sod_gm_mean = np.mean(sod_gm_mean_list)
-        dis_k_sm_mean = np.mean(dis_k_sm_mean_list)
-        dis_k_gm_mean = np.mean(dis_k_gm_mean_list)
-        dis_k_gi_min_mean = np.mean(dis_k_gi_min_mean_list)
-        time_fitting_mean = np.mean(time_fitting_mean_list)
-        time_generating_mean = np.mean(time_generating_mean_list)
-        time_total_mean = np.mean(time_total_mean_list)
- sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
- dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
- dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
- dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
- if save_results:
- f_summary = open(dir_output + fn_output_summary, 'a')
- csv.writer(f_summary).writerow([ds_name, gkernel,
- edit_cost_name, ged_method, attr_distance,
- fit_method, k, 'all',
- sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
- dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
- dis_k_gi2sm_mean, dis_k_gi2gm_mean,
- time_fitting_mean, time_generating_mean, time_total_mean])
- f_summary.close()
-
- print('\ncomplete.')
-
-
-# Draw the current median graph.
-def draw_Letter_graph(graph, file_prefix):
- plt.figure()
- pos = {}
- for n in graph.nodes:
-        pos[n] = np.array([float(graph.nodes[n]['x']), float(graph.nodes[n]['y'])])  # Graph.node was removed in NetworkX 2.4; use graph.nodes
- nx.draw_networkx(graph, pos)
- plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
-# plt.show()
- plt.clf()
-
-
-def compute_gm_for_each_class(Gn, y_all, ds_name, gkernel, parallel='imap_unordered', is_separate=True):
-    # ds_name is needed below to name the saved Gram-matrix files.
-
- if is_separate:
- print('the Gram matrix is computed for each class.')
- y_idx = get_same_item_indices(y_all)
- Kmatrix = []
- run_time = []
- k_dis_data = []
- for i, (y, values) in enumerate(y_idx.items()):
-            print('class %d:' % i)
- Gn_i = [Gn[val] for val in values]
- time0 = time.time()
- Kmatrix.append(compute_kernel(Gn_i, gkernel, None, None, True, parallel=parallel))
- run_time.append(time.time() - time0)
- k_dis_data.append(kernel_distance_matrix(Gn_i, None, None,
- Kmatrix=Kmatrix[i], gkernel=gkernel, verbose=True))
- np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
- Kmatrix=Kmatrix, run_time=run_time, is_separate=is_separate)
- dis_max = np.max([item[1] for item in k_dis_data])
- dis_min = np.min([item[2] for item in k_dis_data])
- dis_mean = np.mean([item[3] for item in k_dis_data])
- print('pair distances - dis_max, dis_min, dis_mean:', dis_max, dis_min,
- dis_mean)
-
- else:
- time0 = time.time()
- Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel=parallel)
- run_time = time.time() - time0
- np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
- Kmatrix=Kmatrix, run_time=run_time, is_separate=is_separate)
- k_dis_data = kernel_distance_matrix(Gn, None, None,
- Kmatrix=Kmatrix, gkernel=gkernel, verbose=True)
- print('the Gram matrix is computed for the whole dataset.')
- print('pair distances - dis_max, dis_min, dis_mean:', k_dis_data[1],
- k_dis_data[2], k_dis_data[3])
-
- print('\nTime to compute Gram matrix for the whole dataset: ', run_time)
-# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean]
- return Kmatrix, run_time, k_dis_data
-
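A hypothetical call (the argument values are examples only, not taken from the original script), reflecting the ds_name parameter in the signature above:

```python
# One Gram matrix per class when is_separate=True; a single matrix for the
# whole dataset otherwise.
Kmatrix, run_time, k_dis_data = compute_gm_for_each_class(
    Gn, y_all, 'Letter-high', 'structuralspkernel',
    parallel='imap_unordered', is_separate=True)
```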
-
-if __name__ == "__main__":
-# #### xp 1: Letter-high, spkernel.
-# # load dataset.
-# print('getting dataset and computing kernel distance matrix first...')
-# ds_name = 'Letter-high'
-# gkernel = 'spkernel'
-# Gn, y_all, graph_dir = get_dataset(ds_name)
-# # remove graphs without edges.
-# Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
-# idx = [G[0] for G in Gn]
-# Gn = [G[1] for G in Gn]
-# y_all = [y_all[i] for i in idx]
-## Gn = Gn[0:50]
-## y_all = y_all[0:50]
-# # compute pair distances.
-# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None,
-# Kmatrix=None, gkernel=gkernel, verbose=True)
-## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0
-# # fitting and computing.
-# fit_methods = ['random', 'expert', 'k-graphs']
-# for fit_method in fit_methods:
-# print('\n-------------------------------------')
-# print('fit method:', fit_method)
-# parameters = {'ds_name': ds_name,
-# 'gkernel': gkernel,
-# 'edit_cost_name': 'LETTER2',
-# 'ged_method': 'mIPFP',
-# 'attr_distance': 'euclidean',
-# 'fit_method': fit_method}
-# xp_fit_method_for_non_symbolic(parameters, save_results=True,
-# initial_solutions=40,
-# Gn_data = [Gn, y_all, graph_dir],
-# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean])
-
-
-# #### xp 2: Letter-high, sspkernel.
-# # load dataset.
-# print('getting dataset and computing kernel distance matrix first...')
-# ds_name = 'Letter-high'
-# gkernel = 'structuralspkernel'
-# Gn, y_all, graph_dir = get_dataset(ds_name)
-## Gn = Gn[0:50]
-## y_all = y_all[0:50]
-# # compute pair distances.
-# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None,
-# Kmatrix=None, gkernel=gkernel, verbose=True)
-## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0
-# # fitting and computing.
-# fit_methods = ['random', 'expert', 'k-graphs']
-# for fit_method in fit_methods:
-# print('\n-------------------------------------')
-# print('fit method:', fit_method)
-# parameters = {'ds_name': ds_name,
-# 'gkernel': gkernel,
-# 'edit_cost_name': 'LETTER2',
-# 'ged_method': 'mIPFP',
-# 'attr_distance': 'euclidean',
-# 'fit_method': fit_method}
-# print('parameters: ', parameters)
-# xp_fit_method_for_non_symbolic(parameters, save_results=True,
-# initial_solutions=40,
-# Gn_data = [Gn, y_all, graph_dir],
-# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean])
-
-
-# #### xp 3: SYNTHETICnew, sspkernel, using NON_SYMBOLIC.
-# gmfile = np.load('results/xp_fit_method/Kmatrix.SYNTHETICnew.structuralspkernel.gm.npz')
-# Kmatrix = gmfile['Kmatrix']
-# run_time = gmfile['run_time']
-# # normalization
-# Kmatrix_diag = Kmatrix.diagonal().copy()
-# for i in range(len(Kmatrix)):
-# for j in range(i, len(Kmatrix)):
-# Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
-# Kmatrix[j][i] = Kmatrix[i][j]
-## np.savez('results/xp_fit_method/Kmatrix.SYNTHETICnew.spkernel.gm',
-## Kmatrix=Kmatrix, run_time=run_time)
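The commented-out loops above perform cosine normalization, K[i, j] /= sqrt(K[i, i] * K[j, j]), so the Gram matrix's diagonal becomes 1. A vectorized equivalent (`normalize_gram` is an illustrative name, not part of the module):

```python
import numpy as np

def normalize_gram(K):
    # Cosine-normalize a symmetric Gram matrix: unit self-similarities.
    d = np.sqrt(np.diag(K))
    return K / np.outer(d, d)
```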
-# # load dataset.
-# print('getting dataset and computing kernel distance matrix first...')
-# ds_name = 'SYNTHETICnew'
-# gkernel = 'structuralspkernel'
-# Gn, y_all, graph_dir = get_dataset(ds_name)
-# # remove graphs without nodes and edges.
-# Gn = [(idx, G) for idx, G in enumerate(Gn) if (nx.number_of_nodes(G) != 0
-# and nx.number_of_edges(G) != 0)]
-# idx = [G[0] for G in Gn]
-# Gn = [G[1] for G in Gn]
-# y_all = [y_all[i] for i in idx]
-## Gn = Gn[0:10]
-## y_all = y_all[0:10]
-# for G in Gn:
-# G.graph['filename'] = 'graph' + str(G.graph['name']) + '.gxl'
-# # compute pair distances.
-# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None,
-# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True)
-## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0
-# # fitting and computing.
-# fit_methods = ['k-graphs', 'random', 'random', 'random']
-# for fit_method in fit_methods:
-# print('\n-------------------------------------')
-# print('fit method:', fit_method)
-# parameters = {'ds_name': ds_name,
-# 'gkernel': gkernel,
-# 'edit_cost_name': 'NON_SYMBOLIC',
-# 'ged_method': 'mIPFP',
-# 'attr_distance': 'euclidean',
-# 'fit_method': fit_method}
-# xp_fit_method_for_non_symbolic(parameters, save_results=True,
-# initial_solutions=1,
-# Gn_data = [Gn, y_all, graph_dir],
-# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean],
-# Kmatrix=Kmatrix)
-
-
-# ### xp 4: SYNTHETICnew, spkernel, using NON_SYMBOLIC.
-# gmfile = np.load('results/xp_fit_method/Kmatrix.SYNTHETICnew.spkernel.gm.npz')
-# Kmatrix = gmfile['Kmatrix']
-# # normalization
-# Kmatrix_diag = Kmatrix.diagonal().copy()
-# for i in range(len(Kmatrix)):
-# for j in range(i, len(Kmatrix)):
-# Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
-# Kmatrix[j][i] = Kmatrix[i][j]
-# run_time = 21821.35
-# np.savez('results/xp_fit_method/Kmatrix.SYNTHETICnew.spkernel.gm',
-# Kmatrix=Kmatrix, run_time=run_time)
-#
-# # load dataset.
-# print('getting dataset and computing kernel distance matrix first...')
-# ds_name = 'SYNTHETICnew'
-# gkernel = 'spkernel'
-# Gn, y_all, graph_dir = get_dataset(ds_name)
-## # remove graphs without nodes and edges.
-## Gn = [(idx, G) for idx, G in enumerate(Gn) if (nx.number_of_node(G) != 0
-## and nx.number_of_edges(G) != 0)]
-## idx = [G[0] for G in Gn]
-## Gn = [G[1] for G in Gn]
-## y_all = [y_all[i] for i in idx]
-## Gn = Gn[0:5]
-## y_all = y_all[0:5]
-# for G in Gn:
-# G.graph['filename'] = 'graph' + str(G.graph['name']) + '.gxl'
-#
-# # compute/read Gram matrix and pair distances.
-## Kmatrix = compute_kernel(Gn, gkernel, None, None, True)
-## np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
-## Kmatrix=Kmatrix)
-# gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz')
-# Kmatrix = gmfile['Kmatrix']
-# run_time = gmfile['run_time']
-## Kmatrix = Kmatrix[[0,1,2,3,4],:]
-## Kmatrix = Kmatrix[:,[0,1,2,3,4]]
-# print('\nTime to compute Gram matrix for the whole dataset: ', run_time)
-# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None,
-# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True)
-## Kmatrix = np.zeros((len(Gn), len(Gn)))
-## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0
-#
-# # fitting and computing.
-# fit_methods = ['k-graphs', 'random', 'random', 'random']
-# for fit_method in fit_methods:
-# print('\n-------------------------------------')
-# print('fit method:', fit_method)
-# parameters = {'ds_name': ds_name,
-# 'gkernel': gkernel,
-# 'edit_cost_name': 'NON_SYMBOLIC',
-# 'ged_method': 'mIPFP',
-# 'attr_distance': 'euclidean',
-# 'fit_method': fit_method}
-# xp_fit_method_for_non_symbolic(parameters, save_results=True,
-# initial_solutions=1,
-# Gn_data=[Gn, y_all, graph_dir],
-# k_dis_data=[dis_mat, dis_max, dis_min, dis_mean],
-# Kmatrix=Kmatrix)
-
-
-# #### xp 5: Fingerprint, sspkernel, using LETTER2, only node attrs.
-# # load dataset.
-# print('getting dataset and computing kernel distance matrix first...')
-# ds_name = 'Fingerprint'
-# gkernel = 'structuralspkernel'
-# Gn, y_all, graph_dir = get_dataset(ds_name)
-# # remove graphs without nodes and edges.
-# Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_nodes(G) != 0]
-## and nx.number_of_edges(G) != 0)]
-# idx = [G[0] for G in Gn]
-# Gn = [G[1] for G in Gn]
-# y_all = [y_all[i] for i in idx]
-# y_idx = get_same_item_indices(y_all)
-# # remove unused labels.
-# for G in Gn:
-# G.graph['edge_attrs'] = []
-# for edge in G.edges:
-# del G.edges[edge]['attributes']
-# del G.edges[edge]['orient']
-# del G.edges[edge]['angle']
-## Gn = Gn[805:815]
-## y_all = y_all[805:815]
-# for G in Gn:
-# G.graph['filename'] = 'graph' + str(G.graph['name']) + '.gxl'
-#
-# # compute/read Gram matrix and pair distances.
-## Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered')
-## np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
-## Kmatrix=Kmatrix)
-# gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz')
-# Kmatrix = gmfile['Kmatrix']
-## run_time = gmfile['run_time']
-## Kmatrix = Kmatrix[[0,1,2,3,4],:]
-## Kmatrix = Kmatrix[:,[0,1,2,3,4]]
-## print('\nTime to compute Gram matrix for the whole dataset: ', run_time)
-# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None,
-# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True)
-## Kmatrix = np.zeros((len(Gn), len(Gn)))
-## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0
-#
-# # fitting and computing.
-# fit_methods = ['k-graphs', 'random', 'random', 'random']
-# for fit_method in fit_methods:
-# print('\n-------------------------------------')
-# print('fit method:', fit_method)
-# parameters = {'ds_name': ds_name,
-# 'gkernel': gkernel,
-# 'edit_cost_name': 'LETTER2',
-# 'ged_method': 'mIPFP',
-# 'attr_distance': 'euclidean',
-# 'fit_method': fit_method,
-# 'init_ecc': [1,1,1,1,1]} # [0.525, 0.525, 0.001, 0.125, 0.125]}
-# xp_fit_method_for_non_symbolic(parameters, save_results=True,
-# initial_solutions=40,
-# Gn_data = [Gn, y_all, graph_dir],
-# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean],
-# Kmatrix=Kmatrix)
-
-
-# #### xp 6: Letter-med, sspkernel.
-# # load dataset.
-# print('getting dataset and computing kernel distance matrix first...')
-# ds_name = 'Letter-med'
-# gkernel = 'structuralspkernel'
-# Gn, y_all, graph_dir = get_dataset(ds_name)
-## Gn = Gn[0:50]
-## y_all = y_all[0:50]
-#
-# # compute/read Gram matrix and pair distances.
-# Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered')
-# np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
-# Kmatrix=Kmatrix)
-## gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz')
-## Kmatrix = gmfile['Kmatrix']
-## run_time = gmfile['run_time']
-## Kmatrix = Kmatrix[[0,1,2,3,4],:]
-## Kmatrix = Kmatrix[:,[0,1,2,3,4]]
-## print('\nTime to compute Gram matrix for the whole dataset: ', run_time)
-# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None,
-# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True)
-## Kmatrix = np.zeros((len(Gn), len(Gn)))
-## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0
-#
-# # fitting and computing.
-# fit_methods = ['k-graphs', 'expert', 'random', 'random', 'random']
-# for fit_method in fit_methods:
-# print('\n-------------------------------------')
-# print('fit method:', fit_method)
-# parameters = {'ds_name': ds_name,
-# 'gkernel': gkernel,
-# 'edit_cost_name': 'LETTER2',
-# 'ged_method': 'mIPFP',
-# 'attr_distance': 'euclidean',
-# 'fit_method': fit_method,
-# 'init_ecc': [0.525, 0.525, 0.75, 0.475, 0.475]}
-# print('parameters: ', parameters)
-# xp_fit_method_for_non_symbolic(parameters, save_results=True,
-# initial_solutions=40,
-# Gn_data = [Gn, y_all, graph_dir],
-# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean],
-# Kmatrix=Kmatrix)
-
-
-# #### xp 7: Letter-low, sspkernel.
-# # load dataset.
-# print('getting dataset and computing kernel distance matrix first...')
-# ds_name = 'Letter-low'
-# gkernel = 'structuralspkernel'
-# Gn, y_all, graph_dir = get_dataset(ds_name)
-## Gn = Gn[0:50]
-## y_all = y_all[0:50]
-#
-# # compute/read Gram matrix and pair distances.
-# Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered')
-# np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
-# Kmatrix=Kmatrix)
-## gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz')
-## Kmatrix = gmfile['Kmatrix']
-## run_time = gmfile['run_time']
-## Kmatrix = Kmatrix[[0,1,2,3,4],:]
-## Kmatrix = Kmatrix[:,[0,1,2,3,4]]
-## print('\nTime to compute Gram matrix for the whole dataset: ', run_time)
-# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None,
-# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True)
-## Kmatrix = np.zeros((len(Gn), len(Gn)))
-## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0
-#
-# # fitting and computing.
-# fit_methods = ['k-graphs', 'expert', 'random', 'random', 'random']
-# for fit_method in fit_methods:
-# print('\n-------------------------------------')
-# print('fit method:', fit_method)
-# parameters = {'ds_name': ds_name,
-# 'gkernel': gkernel,
-# 'edit_cost_name': 'LETTER2',
-# 'ged_method': 'mIPFP',
-# 'attr_distance': 'euclidean',
-# 'fit_method': fit_method,
-# 'init_ecc': [0.075, 0.075, 0.25, 0.075, 0.075]}
-# print('parameters: ', parameters)
-# xp_fit_method_for_non_symbolic(parameters, save_results=True,
-# initial_solutions=40,
-# Gn_data = [Gn, y_all, graph_dir],
-# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean],
-# Kmatrix=Kmatrix)
-
-
-# #### xp 8: Letter-med, spkernel.
-# # load dataset.
-# print('getting dataset and computing kernel distance matrix first...')
-# ds_name = 'Letter-med'
-# gkernel = 'spkernel'
-# Gn, y_all, graph_dir = get_dataset(ds_name)
-# # remove graphs without nodes and edges.
-# Gn = [(idx, G) for idx, G in enumerate(Gn) if (nx.number_of_nodes(G) != 0
-# and nx.number_of_edges(G) != 0)]
-# idx = [G[0] for G in Gn]
-# Gn = [G[1] for G in Gn]
-# y_all = [y_all[i] for i in idx]
-## Gn = Gn[0:50]
-## y_all = y_all[0:50]
-#
-# # compute/read Gram matrix and pair distances.
-# Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered')
-# np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
-# Kmatrix=Kmatrix)
-## gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz')
-## Kmatrix = gmfile['Kmatrix']
-## run_time = gmfile['run_time']
-## Kmatrix = Kmatrix[[0,1,2,3,4],:]
-## Kmatrix = Kmatrix[:,[0,1,2,3,4]]
-## print('\nTime to compute Gram matrix for the whole dataset: ', run_time)
-# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None,
-# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True)
-## Kmatrix = np.zeros((len(Gn), len(Gn)))
-## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0
-#
-# # fitting and computing.
-# fit_methods = ['k-graphs', 'expert', 'random', 'random', 'random']
-# for fit_method in fit_methods:
-# print('\n-------------------------------------')
-# print('fit method:', fit_method)
-# parameters = {'ds_name': ds_name,
-# 'gkernel': gkernel,
-# 'edit_cost_name': 'LETTER2',
-# 'ged_method': 'mIPFP',
-# 'attr_distance': 'euclidean',
-# 'fit_method': fit_method,
-# 'init_ecc': [0.525, 0.525, 0.75, 0.475, 0.475]}
-# print('parameters: ', parameters)
-# xp_fit_method_for_non_symbolic(parameters, save_results=True,
-# initial_solutions=40,
-# Gn_data = [Gn, y_all, graph_dir],
-# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean],
-# Kmatrix=Kmatrix)
-
-
-# #### xp 9: Letter-low, spkernel.
-# # load dataset.
-# print('getting dataset and computing kernel distance matrix first...')
-# ds_name = 'Letter-low'
-# gkernel = 'spkernel'
-# Gn, y_all, graph_dir = get_dataset(ds_name)
-# # remove graphs without nodes and edges.
-# Gn = [(idx, G) for idx, G in enumerate(Gn) if (nx.number_of_nodes(G) != 0
-# and nx.number_of_edges(G) != 0)]
-# idx = [G[0] for G in Gn]
-# Gn = [G[1] for G in Gn]
-# y_all = [y_all[i] for i in idx]
-## Gn = Gn[0:50]
-## y_all = y_all[0:50]
-#
-# # compute/read Gram matrix and pair distances.
-# Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered')
-# np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
-# Kmatrix=Kmatrix)
-## gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz')
-## Kmatrix = gmfile['Kmatrix']
-## run_time = gmfile['run_time']
-## Kmatrix = Kmatrix[[0,1,2,3,4],:]
-## Kmatrix = Kmatrix[:,[0,1,2,3,4]]
-## print('\nTime to compute Gram matrix for the whole dataset: ', run_time)
-# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None,
-# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True)
-## Kmatrix = np.zeros((len(Gn), len(Gn)))
-## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0
-#
-# # fitting and computing.
-# fit_methods = ['k-graphs', 'expert', 'random', 'random', 'random']
-# for fit_method in fit_methods:
-# print('\n-------------------------------------')
-# print('fit method:', fit_method)
-# parameters = {'ds_name': ds_name,
-# 'gkernel': gkernel,
-# 'edit_cost_name': 'LETTER2',
-# 'ged_method': 'mIPFP',
-# 'attr_distance': 'euclidean',
-# 'fit_method': fit_method,
-# 'init_ecc': [0.075, 0.075, 0.25, 0.075, 0.075]}
-# print('parameters: ', parameters)
-# xp_fit_method_for_non_symbolic(parameters, save_results=True,
-# initial_solutions=40,
-# Gn_data = [Gn, y_all, graph_dir],
-# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean],
-# Kmatrix=Kmatrix)
-
-
- #### xp 5: COIL-DEL, sspkernel, using LETTER2, only node attrs.
- # load dataset.
- print('getting dataset and computing kernel distance matrix first...')
- ds_name = 'COIL-DEL'
- gkernel = 'structuralspkernel'
- Gn, y_all, graph_dir = get_dataset(ds_name)
- # remove graphs without nodes and edges.
- Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_nodes(G) != 0]
-# and nx.number_of_edges(G) != 0)]
- idx = [G[0] for G in Gn]
- Gn = [G[1] for G in Gn]
- y_all = [y_all[i] for i in idx]
- # remove unused labels.
- for G in Gn:
- G.graph['edge_labels'] = []
- for edge in G.edges:
- del G.edges[edge]['bond_type']
- del G.edges[edge]['valence']
-# Gn = Gn[805:815]
-# y_all = y_all[805:815]
- for G in Gn:
- G.graph['filename'] = 'graph' + str(G.graph['name']) + '.gxl'
-
- # compute/read Gram matrix and pair distances.
- is_separate = True
- Kmatrix, run_time, k_dis_data = compute_gm_for_each_class(Gn,
- y_all,
- gkernel,
- parallel='imap_unordered',
- is_separate=is_separate)
-# Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered')
-# np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
-# Kmatrix=Kmatrix)
-# gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz')
-# Kmatrix = gmfile['Kmatrix']
-# run_time = gmfile['run_time']
-# Kmatrix = Kmatrix[[0,1,2,3,4],:]
-# Kmatrix = Kmatrix[:,[0,1,2,3,4]]
-# print('\nTime to compute Gram matrix for the whole dataset: ', run_time)
-# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None,
-# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True)
-# Kmatrix = np.zeros((len(Gn), len(Gn)))
-# dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0
-
- # fitting and computing.
- fit_methods = ['k-graphs', 'random', 'random', 'random']
- for fit_method in fit_methods:
- print('\n-------------------------------------')
- print('fit method:', fit_method)
- parameters = {'ds_name': ds_name,
- 'gkernel': gkernel,
- 'edit_cost_name': 'LETTER2',
- 'ged_method': 'mIPFP',
- 'attr_distance': 'euclidean',
- 'fit_method': fit_method,
-                  'init_ecc': [3, 3, 1, 3, 3]} # alternative: [0.525, 0.525, 0.001, 0.125, 0.125]
- xp_fit_method_for_non_symbolic(parameters, save_results=True,
- initial_solutions=40,
- Gn_data=[Gn, y_all, graph_dir],
- k_dis_data=k_dis_data,
- Kmatrix=Kmatrix,
- is_separate=is_separate)
\ No newline at end of file
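
Editor's note: throughout these deleted scripts, `kernel_distance_matrix` turns a Gram matrix into pairwise distances in the kernel's feature space, returning `(dis_mat, dis_max, dis_min, dis_mean)`. For readers of this patch, here is a minimal NumPy sketch of that computation; the function name and the return convention merely mirror the deleted code above, and this is not gklearn's actual implementation.

```python
import numpy as np

def kernel_induced_distances(K):
    """Pairwise distances induced by a Gram matrix K:
    d(i, j) = sqrt(K[i, i] + K[j, j] - 2 * K[i, j])."""
    diag = np.diag(K)
    d2 = diag[:, None] + diag[None, :] - 2.0 * K
    d2 = np.maximum(d2, 0.0)  # clamp tiny negatives caused by round-off
    dis_mat = np.sqrt(d2)
    off_diag = dis_mat[~np.eye(len(K), dtype=bool)]  # ignore self-distances
    return dis_mat, off_diag.max(), off_diag.min(), off_diag.mean()

# toy usage on a PSD matrix built from random feature vectors:
X = np.random.default_rng(0).normal(size=(5, 3))
dis_mat, dis_max, dis_min, dis_mean = kernel_induced_distances(X @ X.T)
```
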
diff --git a/gklearn/preimage/xp_letter_h.py b/gklearn/preimage/xp_letter_h.py
deleted file mode 100644
index 1e16fcf..0000000
--- a/gklearn/preimage/xp_letter_h.py
+++ /dev/null
@@ -1,476 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Tue Jan 14 15:39:29 2020
-
-@author: ljia
-"""
-import os  # used below to build graph_dir and GXL output paths
-import numpy as np
-import random
-import csv
-from shutil import copyfile
-import networkx as nx
-import matplotlib.pyplot as plt
-
-from gklearn.utils.graphfiles import loadDataset, loadGXL, saveGXL
-from gklearn.preimage.test_k_closest_graphs import median_on_k_closest_graphs, reform_attributes
-from gklearn.preimage.utils import get_same_item_indices, kernel_distance_matrix
-from gklearn.preimage.find_best_k import getRelations
-
-
-def xp_letter_h_LETTER2_cost():
- ds = {'dataset': 'cpp_ext/data/collections/Letter.xml',
-        'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'} # non-symbolic node attrs (x, y)
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir'])
-
- dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None, Kmatrix=None, gkernel='structuralspkernel')
- for G in Gn:
- reform_attributes(G)
-# ds = {'name': 'Letter-high',
-# 'dataset': '../datasets/Letter-high/Letter-high_A.txt'} # node/edge symb
-# Gn, y_all = loadDataset(ds['dataset'])
-# Gn = Gn[0:50]
- gkernel = 'structuralspkernel'
- node_label = None
- edge_label = None
- ds_name = 'letter-h'
- dir_output = 'results/xp_letter_h/'
- save_results = True
- cost = 'LETTER2'
-
- repeats = 1
-# k_list = range(2, 11)
- k_list = [150]
- fit_method = 'k-graphs'
- # get indices by classes.
- y_idx = get_same_item_indices(y_all)
-
- if save_results:
- # create result files.
- fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
- f_detail = open(dir_output + fn_output_detail, 'a')
- csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k',
- 'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
- 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
- 'dis_k gi -> GM', 'median set'])
- f_detail.close()
- fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
- f_summary = open(dir_output + fn_output_summary, 'a')
- csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k',
- 'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
- 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
- 'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM',
- '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM',
- 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
- 'repeats better dis_k gi -> GM'])
- f_summary.close()
-
- random.seed(1)
- rdn_seed_list = random.sample(range(0, repeats * 100), repeats)
-
- for k in k_list:
- print('\n--------- k =', k, '----------')
-
- sod_sm_mean_list = []
- sod_gm_mean_list = []
- dis_k_sm_mean_list = []
- dis_k_gm_mean_list = []
- dis_k_gi_min_mean_list = []
-# nb_sod_sm2gm = [0, 0, 0]
-# nb_dis_k_sm2gm = [0, 0, 0]
-# nb_dis_k_gi2sm = [0, 0, 0]
-# nb_dis_k_gi2gm = [0, 0, 0]
-# repeats_better_sod_sm2gm = []
-# repeats_better_dis_k_sm2gm = []
-# repeats_better_dis_k_gi2sm = []
-# repeats_better_dis_k_gi2gm = []
-
- for i, (y, values) in enumerate(y_idx.items()):
- print('\ny =', y)
-# y = 'F'
-# values = y_idx[y]
-# values = values[0:10]
-
- k = len(values)
-
- sod_sm_list = []
- sod_gm_list = []
- dis_k_sm_list = []
- dis_k_gm_list = []
- dis_k_gi_min_list = []
- nb_sod_sm2gm = [0, 0, 0]
- nb_dis_k_sm2gm = [0, 0, 0]
- nb_dis_k_gi2sm = [0, 0, 0]
- nb_dis_k_gi2gm = [0, 0, 0]
- repeats_better_sod_sm2gm = []
- repeats_better_dis_k_sm2gm = []
- repeats_better_dis_k_gi2sm = []
- repeats_better_dis_k_gi2gm = []
-
- for repeat in range(repeats):
- print('\nrepeat =', repeat)
- random.seed(rdn_seed_list[repeat])
- median_set_idx_idx = random.sample(range(0, len(values)), k)
- median_set_idx = [values[idx] for idx in median_set_idx_idx]
- print('median set: ', median_set_idx)
- Gn_median = [Gn[g] for g in values]
-
- sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min \
- = median_on_k_closest_graphs(Gn_median, node_label, edge_label,
- gkernel, k, fit_method=fit_method, graph_dir=ds['graph_dir'],
- edit_costs=None, group_min=median_set_idx_idx,
- dataset='Letter', cost=cost, parallel=False)
-
- # write result detail.
- sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
- dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
- dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
- dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
- if save_results:
- f_detail = open(dir_output + fn_output_detail, 'a')
- csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k,
- y, repeat,
- sod_sm, sod_gm, dis_k_sm, dis_k_gm,
- dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
- dis_k_gi2gm, median_set_idx])
- f_detail.close()
-
- # compute result summary.
- sod_sm_list.append(sod_sm)
- sod_gm_list.append(sod_gm)
- dis_k_sm_list.append(dis_k_sm)
- dis_k_gm_list.append(dis_k_gm)
- dis_k_gi_min_list.append(dis_k_gi_min)
- # # SOD SM -> GM
- if sod_sm > sod_gm:
- nb_sod_sm2gm[0] += 1
- repeats_better_sod_sm2gm.append(repeat)
- elif sod_sm == sod_gm:
- nb_sod_sm2gm[1] += 1
- elif sod_sm < sod_gm:
- nb_sod_sm2gm[2] += 1
- # # dis_k SM -> GM
- if dis_k_sm > dis_k_gm:
- nb_dis_k_sm2gm[0] += 1
- repeats_better_dis_k_sm2gm.append(repeat)
- elif dis_k_sm == dis_k_gm:
- nb_dis_k_sm2gm[1] += 1
- elif dis_k_sm < dis_k_gm:
- nb_dis_k_sm2gm[2] += 1
- # # dis_k gi -> SM
- if dis_k_gi_min > dis_k_sm:
- nb_dis_k_gi2sm[0] += 1
- repeats_better_dis_k_gi2sm.append(repeat)
- elif dis_k_gi_min == dis_k_sm:
- nb_dis_k_gi2sm[1] += 1
- elif dis_k_gi_min < dis_k_sm:
- nb_dis_k_gi2sm[2] += 1
- # # dis_k gi -> GM
- if dis_k_gi_min > dis_k_gm:
- nb_dis_k_gi2gm[0] += 1
- repeats_better_dis_k_gi2gm.append(repeat)
- elif dis_k_gi_min == dis_k_gm:
- nb_dis_k_gi2gm[1] += 1
- elif dis_k_gi_min < dis_k_gm:
- nb_dis_k_gi2gm[2] += 1
-
- # save median graphs.
- fname_sm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/set_median.gxl'
- fn_pre_sm_new = dir_output + 'medians/set_median.' + fit_method \
- + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
- copyfile(fname_sm, fn_pre_sm_new + '.gxl')
- fname_gm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/gen_median.gxl'
- fn_pre_gm_new = dir_output + 'medians/gen_median.' + fit_method \
- + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
- copyfile(fname_gm, fn_pre_gm_new + '.gxl')
- G_best_kernel = Gn_median[idx_dis_k_gi_min].copy()
- reform_attributes(G_best_kernel)
- fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.' + fit_method \
- + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
- saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='gedlib-letter')
-
- # plot median graphs.
- set_median = loadGXL(fn_pre_sm_new + '.gxl')
- gen_median = loadGXL(fn_pre_gm_new + '.gxl')
- draw_Letter_graph(set_median, fn_pre_sm_new)
- draw_Letter_graph(gen_median, fn_pre_gm_new)
- draw_Letter_graph(G_best_kernel, fn_pre_g_best_kernel)
-
- # write result summary for each letter.
- sod_sm_mean_list.append(np.mean(sod_sm_list))
- sod_gm_mean_list.append(np.mean(sod_gm_list))
- dis_k_sm_mean_list.append(np.mean(dis_k_sm_list))
- dis_k_gm_mean_list.append(np.mean(dis_k_gm_list))
- dis_k_gi_min_mean_list.append(np.mean(dis_k_gi_min_list))
- sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean_list[-1] - sod_sm_mean_list[-1]))
- dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1]))
- dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
- dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
- if save_results:
- f_summary = open(dir_output + fn_output_summary, 'a')
- csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, y,
- sod_sm_mean_list[-1], sod_gm_mean_list[-1],
- dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1],
- dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean,
- dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm,
- nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
- repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
- repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])
- f_summary.close()
-
-
- # write overall result summary across all letters.
- sod_sm_mean = np.mean(sod_sm_mean_list)
- sod_gm_mean = np.mean(sod_gm_mean_list)
- dis_k_sm_mean = np.mean(dis_k_sm_mean_list)
- dis_k_gm_mean = np.mean(dis_k_gm_mean_list)
- dis_k_gi_min_mean = np.mean(dis_k_gi_min_mean_list)  # mean over per-letter means, as for the other metrics
- sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
- dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
- dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
- dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
- if save_results:
- f_summary = open(dir_output + fn_output_summary, 'a')
- csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, 'all',
- sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
- dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
- dis_k_gi2sm_mean, dis_k_gi2gm_mean])
- f_summary.close()
-
- print('\ncomplete.')
-
-
-def xp_letter_h():
- ds = {'dataset': 'cpp_ext/data/collections/Letter.xml',
-        'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'} # non-symbolic node attrs (x, y)
- Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir'])
- for G in Gn:
- reform_attributes(G)
-# ds = {'name': 'Letter-high',
-# 'dataset': '../datasets/Letter-high/Letter-high_A.txt'} # node/edge symb
-# Gn, y_all = loadDataset(ds['dataset'])
-# Gn = Gn[0:50]
- gkernel = 'structuralspkernel'
- node_label = None
- edge_label = None
- ds_name = 'letter-h'
- dir_output = 'results/xp_letter_h/'
- save_results = False
-
- repeats = 1
-# k_list = range(2, 11)
- k_list = [150]
- fit_method = 'k-graphs'
- # get indices by classes.
- y_idx = get_same_item_indices(y_all)
-
- if save_results:
- # create result files.
- fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
- f_detail = open(dir_output + fn_output_detail, 'a')
- csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k',
- 'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
- 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
- 'dis_k gi -> GM', 'median set'])
- f_detail.close()
- fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
- f_summary = open(dir_output + fn_output_summary, 'a')
- csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k',
- 'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
- 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
- 'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM',
- '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM',
- 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
- 'repeats better dis_k gi -> GM'])
- f_summary.close()
-
- random.seed(1)
- rdn_seed_list = random.sample(range(0, repeats * 100), repeats)
-
- for k in k_list:
- print('\n--------- k =', k, '----------')
-
- sod_sm_mean_list = []
- sod_gm_mean_list = []
- dis_k_sm_mean_list = []
- dis_k_gm_mean_list = []
- dis_k_gi_min_mean_list = []
-# nb_sod_sm2gm = [0, 0, 0]
-# nb_dis_k_sm2gm = [0, 0, 0]
-# nb_dis_k_gi2sm = [0, 0, 0]
-# nb_dis_k_gi2gm = [0, 0, 0]
-# repeats_better_sod_sm2gm = []
-# repeats_better_dis_k_sm2gm = []
-# repeats_better_dis_k_gi2sm = []
-# repeats_better_dis_k_gi2gm = []
-
- for i, (y, values) in enumerate(y_idx.items()):
- print('\ny =', y)
-# y = 'N'
-# values = y_idx[y]
-# values = values[0:10]
-
- k = len(values)
-
- sod_sm_list = []
- sod_gm_list = []
- dis_k_sm_list = []
- dis_k_gm_list = []
- dis_k_gi_min_list = []
- nb_sod_sm2gm = [0, 0, 0]
- nb_dis_k_sm2gm = [0, 0, 0]
- nb_dis_k_gi2sm = [0, 0, 0]
- nb_dis_k_gi2gm = [0, 0, 0]
- repeats_better_sod_sm2gm = []
- repeats_better_dis_k_sm2gm = []
- repeats_better_dis_k_gi2sm = []
- repeats_better_dis_k_gi2gm = []
-
- for repeat in range(repeats):
- print('\nrepeat =', repeat)
- random.seed(rdn_seed_list[repeat])
- median_set_idx_idx = random.sample(range(0, len(values)), k)
- median_set_idx = [values[idx] for idx in median_set_idx_idx]
- print('median set: ', median_set_idx)
- Gn_median = [Gn[g] for g in values]
-
- sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min \
- = median_on_k_closest_graphs(Gn_median, node_label, edge_label,
- gkernel, k, fit_method=fit_method, graph_dir=ds['graph_dir'],
- edit_costs=None, group_min=median_set_idx_idx,
- dataset='Letter', parallel=False)
-
- # write result detail.
- sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
- dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
- dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
- dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
- if save_results:
- f_detail = open(dir_output + fn_output_detail, 'a')
- csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k,
- y, repeat,
- sod_sm, sod_gm, dis_k_sm, dis_k_gm,
- dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
- dis_k_gi2gm, median_set_idx])
- f_detail.close()
-
- # compute result summary.
- sod_sm_list.append(sod_sm)
- sod_gm_list.append(sod_gm)
- dis_k_sm_list.append(dis_k_sm)
- dis_k_gm_list.append(dis_k_gm)
- dis_k_gi_min_list.append(dis_k_gi_min)
- # # SOD SM -> GM
- if sod_sm > sod_gm:
- nb_sod_sm2gm[0] += 1
- repeats_better_sod_sm2gm.append(repeat)
- elif sod_sm == sod_gm:
- nb_sod_sm2gm[1] += 1
- elif sod_sm < sod_gm:
- nb_sod_sm2gm[2] += 1
- # # dis_k SM -> GM
- if dis_k_sm > dis_k_gm:
- nb_dis_k_sm2gm[0] += 1
- repeats_better_dis_k_sm2gm.append(repeat)
- elif dis_k_sm == dis_k_gm:
- nb_dis_k_sm2gm[1] += 1
- elif dis_k_sm < dis_k_gm:
- nb_dis_k_sm2gm[2] += 1
- # # dis_k gi -> SM
- if dis_k_gi_min > dis_k_sm:
- nb_dis_k_gi2sm[0] += 1
- repeats_better_dis_k_gi2sm.append(repeat)
- elif dis_k_gi_min == dis_k_sm:
- nb_dis_k_gi2sm[1] += 1
- elif dis_k_gi_min < dis_k_sm:
- nb_dis_k_gi2sm[2] += 1
- # # dis_k gi -> GM
- if dis_k_gi_min > dis_k_gm:
- nb_dis_k_gi2gm[0] += 1
- repeats_better_dis_k_gi2gm.append(repeat)
- elif dis_k_gi_min == dis_k_gm:
- nb_dis_k_gi2gm[1] += 1
- elif dis_k_gi_min < dis_k_gm:
- nb_dis_k_gi2gm[2] += 1
-
- # save median graphs.
- fname_sm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/set_median.gxl'
- fn_pre_sm_new = dir_output + 'medians/set_median.' + fit_method \
- + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
- copyfile(fname_sm, fn_pre_sm_new + '.gxl')
- fname_gm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/gen_median.gxl'
- fn_pre_gm_new = dir_output + 'medians/gen_median.' + fit_method \
- + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
- copyfile(fname_gm, fn_pre_gm_new + '.gxl')
- G_best_kernel = Gn_median[idx_dis_k_gi_min].copy()
- reform_attributes(G_best_kernel)
- fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.' + fit_method \
- + '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
- saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='gedlib-letter')
-
- # plot median graphs.
- set_median = loadGXL(fn_pre_sm_new + '.gxl')
- gen_median = loadGXL(fn_pre_gm_new + '.gxl')
- draw_Letter_graph(set_median, fn_pre_sm_new)
- draw_Letter_graph(gen_median, fn_pre_gm_new)
- draw_Letter_graph(G_best_kernel, fn_pre_g_best_kernel)
-
- # write result summary for each letter.
- sod_sm_mean_list.append(np.mean(sod_sm_list))
- sod_gm_mean_list.append(np.mean(sod_gm_list))
- dis_k_sm_mean_list.append(np.mean(dis_k_sm_list))
- dis_k_gm_mean_list.append(np.mean(dis_k_gm_list))
- dis_k_gi_min_mean_list.append(np.mean(dis_k_gi_min_list))
- sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean_list[-1] - sod_sm_mean_list[-1]))
- dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1]))
- dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
- dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
- if save_results:
- f_summary = open(dir_output + fn_output_summary, 'a')
- csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, y,
- sod_sm_mean_list[-1], sod_gm_mean_list[-1],
- dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1],
- dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean,
- dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm,
- nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
- repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
- repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])
- f_summary.close()
-
-
- # write overall result summary across all letters.
- sod_sm_mean = np.mean(sod_sm_mean_list)
- sod_gm_mean = np.mean(sod_gm_mean_list)
- dis_k_sm_mean = np.mean(dis_k_sm_mean_list)
- dis_k_gm_mean = np.mean(dis_k_gm_mean_list)
- dis_k_gi_min_mean = np.mean(dis_k_gi_min_mean_list)  # mean over per-letter means, as for the other metrics
- sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
- dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
- dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
- dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
- if save_results:
- f_summary = open(dir_output + fn_output_summary, 'a')
- csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, 'all',
- sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
- dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
- dis_k_gi2sm_mean, dis_k_gi2gm_mean])
- f_summary.close()
-
- print('\ncomplete.')
-
-
-# Draw the current median graph.
-def draw_Letter_graph(graph, file_prefix):
- plt.figure()
- pos = {}
- for n in graph.nodes:
-  pos[n] = np.array([float(graph.nodes[n]['x']), float(graph.nodes[n]['y'])])  # graph.nodes: NetworkX 2.x API
- nx.draw_networkx(graph, pos)
- plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
-# plt.show()
- plt.close()  # close the figure rather than clearing it, so repeated calls don't accumulate open figures
-
-
-if __name__ == "__main__":
-# xp_letter_h()
- xp_letter_h_LETTER2_cost()
\ No newline at end of file
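
Editor's note: the `getRelations(np.sign(...))` pattern that recurs in these scripts records whether one quantity improved on another (the 'SOD SM -> GM'-style columns). `getRelations` lives in find_best_k.py, which this patch also deletes without showing its body, so the mapping below is an inferred stand-in: judging from the `nb_*` counters, a negative sign means the arrow's target achieved the smaller (better) value.

```python
import numpy as np

def get_relations(sign):
    # hypothetical stand-in for find_best_k.getRelations; the original body
    # is not visible in this patch, so these labels are assumptions.
    if sign == -1:
        return 'better'
    elif sign == 0:
        return 'same'
    else:
        return 'worse'

# e.g. 'SOD SM -> GM': did the generalized median improve on the set median?
sod_sm, sod_gm = 12.5, 11.0
print(get_relations(np.sign(sod_gm - sod_sm)))  # 'better', since sod_gm < sod_sm
```
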
diff --git a/gklearn/preimage/xp_monoterpenoides.py b/gklearn/preimage/xp_monoterpenoides.py
deleted file mode 100644
index 2270471..0000000
--- a/gklearn/preimage/xp_monoterpenoides.py
+++ /dev/null
@@ -1,249 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Thu Jan 16 11:03:11 2020
-
-@author: ljia
-"""
-
-import os
-import numpy as np
-import random
-import csv
-from shutil import copyfile
-import networkx as nx
-import matplotlib.pyplot as plt
-
-from gklearn.utils.graphfiles import loadDataset, loadGXL, saveGXL
-from gklearn.preimage.test_k_closest_graphs import median_on_k_closest_graphs, reform_attributes
-from gklearn.preimage.utils import get_same_item_indices
-from gklearn.preimage.find_best_k import getRelations
-
-def xp_monoterpenoides():
- ds = {'dataset': '../../datasets/monoterpenoides/dataset_10+.ds',
-       'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/../../datasets/monoterpenoides/'} # node/edge symb; note the leading '/', since dirname() has no trailing separator
- Gn, y_all = loadDataset(ds['dataset'])
-# ds = {'name': 'Letter-high',
-# 'dataset': '../datasets/Letter-high/Letter-high_A.txt'} # node/edge symb
-# Gn, y_all = loadDataset(ds['dataset'])
-# Gn = Gn[0:50]
- gkernel = 'treeletkernel'
- node_label = 'atom'
- edge_label = 'bond_type'
- ds_name = 'monoterpenoides'
- dir_output = 'results/xp_monoterpenoides/'
-
- repeats = 1
-# k_list = range(2, 11)
- k_list = [0]
- fit_method = 'k-graphs'
- # get indices by classes.
- y_idx = get_same_item_indices(y_all)
-
- # create result files.
- fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
- f_detail = open(dir_output + fn_output_detail, 'a')
- csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k',
- 'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
- 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
- 'dis_k gi -> GM', 'median set'])
- f_detail.close()
- fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
- f_summary = open(dir_output + fn_output_summary, 'a')
- csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k',
- 'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
- 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
- 'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM',
- '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM',
- 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
- 'repeats better dis_k gi -> GM'])
- f_summary.close()
-
- random.seed(1)
- rdn_seed_list = random.sample(range(0, repeats * 100), repeats)
-
- for k in k_list:
- print('\n--------- k =', k, '----------')
-
- sod_sm_mean_list = []
- sod_gm_mean_list = []
- dis_k_sm_mean_list = []
- dis_k_gm_mean_list = []
- dis_k_gi_min_mean_list = []
-# nb_sod_sm2gm = [0, 0, 0]
-# nb_dis_k_sm2gm = [0, 0, 0]
-# nb_dis_k_gi2sm = [0, 0, 0]
-# nb_dis_k_gi2gm = [0, 0, 0]
-# repeats_better_sod_sm2gm = []
-# repeats_better_dis_k_sm2gm = []
-# repeats_better_dis_k_gi2sm = []
-# repeats_better_dis_k_gi2gm = []
-
- for i, (y, values) in enumerate(y_idx.items()):
- print('\ny =', y)
-# y = 'I'
-# values = y_idx[y]
-
- k = len(values)
-# k = kkk
-
- sod_sm_list = []
- sod_gm_list = []
- dis_k_sm_list = []
- dis_k_gm_list = []
- dis_k_gi_min_list = []
- nb_sod_sm2gm = [0, 0, 0]
- nb_dis_k_sm2gm = [0, 0, 0]
- nb_dis_k_gi2sm = [0, 0, 0]
- nb_dis_k_gi2gm = [0, 0, 0]
- repeats_better_sod_sm2gm = []
- repeats_better_dis_k_sm2gm = []
- repeats_better_dis_k_gi2sm = []
- repeats_better_dis_k_gi2gm = []
-
- for repeat in range(repeats):
- print('\nrepeat =', repeat)
- random.seed(rdn_seed_list[repeat])
- median_set_idx_idx = random.sample(range(0, len(values)), k)
- median_set_idx = [values[idx] for idx in median_set_idx_idx]
- print('median set: ', median_set_idx)
- Gn_median = [Gn[g] for g in values]
-
- sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min \
- = median_on_k_closest_graphs(Gn_median, node_label, edge_label,
- gkernel, k, fit_method=fit_method, graph_dir=ds['graph_dir'],
- edit_costs=None, group_min=median_set_idx_idx,
- dataset=ds_name, parallel=False)
-
- # write result detail.
- sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
- dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
- dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
- dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
- f_detail = open(dir_output + fn_output_detail, 'a')
- csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k,
- y, repeat,
- sod_sm, sod_gm, dis_k_sm, dis_k_gm,
- dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
- dis_k_gi2gm, median_set_idx])
- f_detail.close()
-
- # compute result summary.
- sod_sm_list.append(sod_sm)
- sod_gm_list.append(sod_gm)
- dis_k_sm_list.append(dis_k_sm)
- dis_k_gm_list.append(dis_k_gm)
- dis_k_gi_min_list.append(dis_k_gi_min)
- # # SOD SM -> GM
- if sod_sm > sod_gm:
- nb_sod_sm2gm[0] += 1
- repeats_better_sod_sm2gm.append(repeat)
- elif sod_sm == sod_gm:
- nb_sod_sm2gm[1] += 1
- elif sod_sm < sod_gm:
- nb_sod_sm2gm[2] += 1
- # # dis_k SM -> GM
- if dis_k_sm > dis_k_gm:
- nb_dis_k_sm2gm[0] += 1
- repeats_better_dis_k_sm2gm.append(repeat)
- elif dis_k_sm == dis_k_gm:
- nb_dis_k_sm2gm[1] += 1
- elif dis_k_sm < dis_k_gm:
- nb_dis_k_sm2gm[2] += 1
- # # dis_k gi -> SM
- if dis_k_gi_min > dis_k_sm:
- nb_dis_k_gi2sm[0] += 1
- repeats_better_dis_k_gi2sm.append(repeat)
- elif dis_k_gi_min == dis_k_sm:
- nb_dis_k_gi2sm[1] += 1
- elif dis_k_gi_min < dis_k_sm:
- nb_dis_k_gi2sm[2] += 1
- # # dis_k gi -> GM
- if dis_k_gi_min > dis_k_gm:
- nb_dis_k_gi2gm[0] += 1
- repeats_better_dis_k_gi2gm.append(repeat)
- elif dis_k_gi_min == dis_k_gm:
- nb_dis_k_gi2gm[1] += 1
- elif dis_k_gi_min < dis_k_gm:
- nb_dis_k_gi2gm[2] += 1
-
- # save median graphs.
- fname_sm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/set_median.gxl'
- fn_pre_sm_new = dir_output + 'medians/set_median.' + fit_method \
- + '.k' + str(int(k)) + '.y' + str(int(y)) + '.repeat' + str(repeat)
- copyfile(fname_sm, fn_pre_sm_new + '.gxl')
- fname_gm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/gen_median.gxl'
- fn_pre_gm_new = dir_output + 'medians/gen_median.' + fit_method \
- + '.k' + str(int(k)) + '.y' + str(int(y)) + '.repeat' + str(repeat)
- copyfile(fname_gm, fn_pre_gm_new + '.gxl')
- G_best_kernel = Gn_median[idx_dis_k_gi_min].copy()
-# reform_attributes(G_best_kernel)
- fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.' + fit_method \
- + '.k' + str(int(k)) + '.y' + str(int(y)) + '.repeat' + str(repeat)
- saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='gedlib')
-
-# # plot median graphs.
-# set_median = loadGXL(fn_pre_sm_new + '.gxl')
-# gen_median = loadGXL(fn_pre_gm_new + '.gxl')
-# draw_Letter_graph(set_median, fn_pre_sm_new)
-# draw_Letter_graph(gen_median, fn_pre_gm_new)
-# draw_Letter_graph(G_best_kernel, fn_pre_g_best_kernel)
-
-   # write result summary for each class.
- sod_sm_mean_list.append(np.mean(sod_sm_list))
- sod_gm_mean_list.append(np.mean(sod_gm_list))
- dis_k_sm_mean_list.append(np.mean(dis_k_sm_list))
- dis_k_gm_mean_list.append(np.mean(dis_k_gm_list))
- dis_k_gi_min_mean_list.append(np.mean(dis_k_gi_min_list))
- sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean_list[-1] - sod_sm_mean_list[-1]))
- dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1]))
- dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
- dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
- f_summary = open(dir_output + fn_output_summary, 'a')
- csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, y,
- sod_sm_mean_list[-1], sod_gm_mean_list[-1],
- dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1],
- dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean,
- dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm,
- nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
- repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
- repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])
- f_summary.close()
-
-
- # write overall result summary across all classes.
- sod_sm_mean = np.mean(sod_sm_mean_list)
- sod_gm_mean = np.mean(sod_gm_mean_list)
- dis_k_sm_mean = np.mean(dis_k_sm_mean_list)
- dis_k_gm_mean = np.mean(dis_k_gm_mean_list)
- dis_k_gi_min_mean = np.mean(dis_k_gi_min_mean_list)  # mean over per-class means, as for the other metrics
- sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
- dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
- dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
- dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
- f_summary = open(dir_output + fn_output_summary, 'a')
- csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, 'all',
- sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
- dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
- dis_k_gi2sm_mean, dis_k_gi2gm_mean])
- f_summary.close()
-
-
- print('\ncomplete.')
-
-
-# Draw the current median graph.
-def draw_Letter_graph(graph, file_prefix):
- plt.figure()
- pos = {}
- for n in graph.nodes:
-  pos[n] = np.array([float(graph.nodes[n]['x']), float(graph.nodes[n]['y'])])  # graph.nodes: NetworkX 2.x API
- nx.draw_networkx(graph, pos)
- plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
-# plt.show()
- plt.close()  # close the figure rather than clearing it, so repeated calls don't accumulate open figures
-
-
-if __name__ == "__main__":
- xp_monoterpenoides()
\ No newline at end of file
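
Editor's note: all three deleted scripts share the same per-class driver: group dataset indices by target with `get_same_item_indices`, reset `k` to the class size, and sample a median set with a per-repeat seed. Below is a self-contained sketch of that loop skeleton; `group_indices_by_target` is a hypothetical stand-in for the gklearn helper, whose assumed behavior is noted in the comments.

```python
import random
from collections import defaultdict

def group_indices_by_target(y_all):
    # assumed behavior of gklearn.preimage.utils.get_same_item_indices:
    # map each target value to the list of dataset indices that carry it.
    y_idx = defaultdict(list)
    for i, y in enumerate(y_all):
        y_idx[y].append(i)
    return dict(y_idx)

# per-class median-set selection, mirroring the deleted experiment loops:
y_all = ['A', 'B', 'A', 'B', 'A']
random.seed(1)
for y, values in group_indices_by_target(y_all).items():
    k = len(values)  # the scripts reset k to the class size
    median_set_idx_idx = random.sample(range(len(values)), k)
    median_set_idx = [values[i] for i in median_set_idx_idx]
    print(y, median_set_idx)
```
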