diff --git a/gklearn/preimage/common_types.py b/gklearn/preimage/common_types.py
new file mode 100644
index 0000000..2face25
--- /dev/null
+++ b/gklearn/preimage/common_types.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Mar 19 18:17:38 2020
+
+@author: ljia
+"""
+
+from enum import Enum, auto
+
+class AlgorithmState(Enum):
+ """can be used to specify the state of an algorithm.
+ """
+ CALLED = auto # The algorithm has been called.
+ INITIALIZED = auto # The algorithm has been initialized.
+ CONVERGED = auto # The algorithm has converged.
+ TERMINATED = auto # The algorithm has terminated.
\ No newline at end of file
diff --git a/gklearn/preimage/cpp2python.py b/gklearn/preimage/cpp2python.py
new file mode 100644
index 0000000..9d63026
--- /dev/null
+++ b/gklearn/preimage/cpp2python.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Mar 20 11:09:04 2020
+
+@author: ljia
+"""
+import re
+
+def convert_function(cpp_code):
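+ """Roughly convert a C++ code snippet to Python through a chain of
+ heuristic string replacements. The output is only a starting point and
+ usually still needs manual fixing.
+ """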
+# f_cpp = open('cpp_code.cpp', 'r')
+# # f_cpp = open('cpp_ext/src/median_graph_estimator.ipp', 'r')
+# cpp_code = f_cpp.read()
+ python_code = cpp_code.replace('else if (', 'elif ')
+ python_code = python_code.replace('if (', 'if ')
+ python_code = python_code.replace('else {', 'else:')
+ python_code = python_code.replace(') {', ':')
+ python_code = python_code.replace(';\n', '\n')
+ python_code = re.sub('\n(.*)}\n', '\n\n', python_code)
+ # python_code = python_code.replace('}\n', '')
+ python_code = python_code.replace('throw', 'raise')
+ python_code = python_code.replace('error', 'Exception')
+ python_code = python_code.replace('"', '\'')
+ python_code = python_code.replace('\\\'', '"')
+ python_code = python_code.replace('try {', 'try:')
+ python_code = python_code.replace('true', 'True')
+ python_code = python_code.replace('false', 'False')
+ python_code = python_code.replace('catch (...', 'except')
+ # python_code = re.sub('std::string\(\'(.*)\'\)', '$1', python_code)
+
+ return python_code
+
+
+
+# # python_code = python_code.replace('}\n', '')
+
+
+
+
+# python_code = python_code.replace('option.first', 'opt_name')
+# python_code = python_code.replace('option.second', 'opt_val')
+# python_code = python_code.replace('ged::Error', 'Exception')
+# python_code = python_code.replace('std::string(\'Invalid argument "\')', '\'Invalid argument "\'')
+
+
+# f_cpp.close()
+# f_python = open('python_code.py', 'w')
+# f_python.write(python_code)
+# f_python.close()
+
+
+def convert_function_comment(cpp_fun_cmt, param_types):
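+ """Convert a Doxygen-style C++ function comment (@brief/@param/@note)
+ into a numpydoc-style Python docstring. param_types gives the Python
+ type name of each @param, in order.
+ """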
+ cpp_fun_cmt = cpp_fun_cmt.replace('\t', '')
+ cpp_fun_cmt = cpp_fun_cmt.replace('\n * ', ' ')
+ # split the input comment according to key words.
+ param_split = None
+ note = None
+ cmt_split = cpp_fun_cmt.split('@brief')[1]
+ brief = cmt_split
+ if '@param' in cmt_split:
+ cmt_split = cmt_split.split('@param')
+ brief = cmt_split[0]
+ param_split = cmt_split[1:]
+ # the '@note' may follow either the last '@param' block or the brief
+ # itself (when there is no '@param').
+ last_part = cmt_split[-1] if isinstance(cmt_split, list) else cmt_split
+ if '@note' in last_part:
+  note_split = last_part.split('@note')
+  if param_split is not None:
+   param_split.pop()
+   param_split.append(note_split[0])
+  else:
+   brief = note_split[0]
+  note = note_split[1]
+
+ # get parameters.
+ if param_split is not None:
+ for idx, param in enumerate(param_split):
+ _, param_name, param_desc = param.split(' ', 2)
+ param_name = function_comment_strip(param_name, ' *\n\t/')
+ param_desc = function_comment_strip(param_desc, ' *\n\t/')
+ param_split[idx] = (param_name, param_desc)
+
+ # strip comments.
+ brief = function_comment_strip(brief, ' *\n\t/')
+ if note is not None:
+ note = function_comment_strip(note, ' *\n\t/')
+
+ # construct the Python function comment.
+ python_fun_cmt = '"""'
+ python_fun_cmt += brief + '\n'
+ if param_split is not None and len(param_split) > 0:
+ python_fun_cmt += '\nParameters\n----------'
+ for idx, param in enumerate(param_split):
+ python_fun_cmt += '\n' + param[0] + ' : ' + param_types[idx]
+ python_fun_cmt += '\n\t' + param[1] + '\n'
+ if note is not None:
+ python_fun_cmt += '\nNote\n----\n' + note + '\n'
+ python_fun_cmt += '"""'
+
+ return python_fun_cmt
+
+
+def function_comment_strip(comment, bad_chars):
+ # strip decoration characters from both ends (equivalent to the
+ # original head/tail removal loop, but also safe on empty strings).
+ return comment.strip(bad_chars)
+
+
+if __name__ == '__main__':
+# python_code = convert_function("""
+# if (print_to_stdout_ == 2) {
+# std::cout << "\n===========================================================\n";
+# std::cout << "Block gradient descent for initial median " << median_pos + 1 << " of " << medians.size() << ".\n";
+# std::cout << "-----------------------------------------------------------\n";
+# }
+# """)
+
+
+ python_fun_cmt = convert_function_comment("""
+ /*!
+ * @brief Returns the sum of distances.
+ * @param[in] state The state of the estimator.
+ * @return The sum of distances of the median when the estimator was in the state @p state during the last call to run().
+ */
+ """, ['string', 'string'])
\ No newline at end of file
diff --git a/gklearn/preimage/find_best_k.py b/gklearn/preimage/find_best_k.py
new file mode 100644
index 0000000..df38d32
--- /dev/null
+++ b/gklearn/preimage/find_best_k.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Jan 9 11:54:32 2020
+
+@author: ljia
+"""
+import numpy as np
+import random
+import csv
+import os
+
+from gklearn.utils.graphfiles import loadDataset
+from gklearn.preimage.test_k_closest_graphs import median_on_k_closest_graphs
+
+def find_best_k():
+ ds = {'name': 'monoterpenoides',
+ 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
+ Gn, y_all = loadDataset(ds['dataset'])
+# Gn = Gn[0:50]
+ gkernel = 'treeletkernel'
+ node_label = 'atom'
+ edge_label = 'bond_type'
+ ds_name = 'mono'
+ dir_output = 'results/test_find_best_k/'
+ # make sure the output directory exists; the result files below are
+ # opened in append mode.
+ os.makedirs(dir_output, exist_ok=True)
+
+ repeats = 50
+ k_list = range(2, 11)
+ fit_method = 'k-graphs'
+ # fitted on the whole dataset - treelet - mono
+ edit_costs = [0.1268873773592978, 0.004084633224249829, 0.0897581955378986, 0.15328856114451297, 0.3109956881625734, 0.0]
+
+ # create result files.
+ fn_output_detail = 'results_detail.' + fit_method + '.csv'
+ f_detail = open(dir_output + fn_output_detail, 'a')
+ csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k',
+ 'repeat', 'median set', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
+ 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
+ 'dis_k gi -> GM'])
+ f_detail.close()
+ fn_output_summary = 'results_summary.csv'
+ f_summary = open(dir_output + fn_output_summary, 'a')
+ csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k',
+ 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
+ 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
+ 'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM',
+ '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM',
+ 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
+ 'repeats better dis_k gi -> GM'])
+ f_summary.close()
+
+ random.seed(1)
+ rdn_seed_list = random.sample(range(0, repeats * 100), repeats)
+
+ for k in k_list:
+ print('\n--------- k =', k, '----------')
+
+ sod_sm_list = []
+ sod_gm_list = []
+ dis_k_sm_list = []
+ dis_k_gm_list = []
+ dis_k_gi_min_list = []
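+  # counters of [better, same, worse] outcomes of each comparison over
+  # all repeats (e.g. SOD of the generated median vs. the set median).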
+ nb_sod_sm2gm = [0, 0, 0]
+ nb_dis_k_sm2gm = [0, 0, 0]
+ nb_dis_k_gi2sm = [0, 0, 0]
+ nb_dis_k_gi2gm = [0, 0, 0]
+ repeats_better_sod_sm2gm = []
+ repeats_better_dis_k_sm2gm = []
+ repeats_better_dis_k_gi2sm = []
+ repeats_better_dis_k_gi2gm = []
+
+
+ for repeat in range(repeats):
+ print('\nrepeat =', repeat)
+ random.seed(rdn_seed_list[repeat])
+ median_set_idx = random.sample(range(0, len(Gn)), k)
+ print('median set: ', median_set_idx)
+
+ sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \
+ = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k,
+ fit_method='k-graphs',
+ edit_costs=edit_costs,
+ group_min=median_set_idx,
+ parallel=False)
+
+ # write result detail.
+ sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
+ dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
+ dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
+ dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
+ f_detail = open(dir_output + fn_output_detail, 'a')
+ csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k, repeat,
+ median_set_idx, sod_sm, sod_gm, dis_k_sm, dis_k_gm,
+ dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
+ dis_k_gi2gm])
+ f_detail.close()
+
+ # compute result summary.
+ sod_sm_list.append(sod_sm)
+ sod_gm_list.append(sod_gm)
+ dis_k_sm_list.append(dis_k_sm)
+ dis_k_gm_list.append(dis_k_gm)
+ dis_k_gi_min_list.append(dis_k_gi_min)
+ # # SOD SM -> GM
+ if sod_sm > sod_gm:
+ nb_sod_sm2gm[0] += 1
+ repeats_better_sod_sm2gm.append(repeat)
+ elif sod_sm == sod_gm:
+ nb_sod_sm2gm[1] += 1
+ elif sod_sm < sod_gm:
+ nb_sod_sm2gm[2] += 1
+ # # dis_k SM -> GM
+ if dis_k_sm > dis_k_gm:
+ nb_dis_k_sm2gm[0] += 1
+ repeats_better_dis_k_sm2gm.append(repeat)
+ elif dis_k_sm == dis_k_gm:
+ nb_dis_k_sm2gm[1] += 1
+ elif dis_k_sm < dis_k_gm:
+ nb_dis_k_sm2gm[2] += 1
+ # # dis_k gi -> SM
+ if dis_k_gi_min > dis_k_sm:
+ nb_dis_k_gi2sm[0] += 1
+ repeats_better_dis_k_gi2sm.append(repeat)
+ elif dis_k_gi_min == dis_k_sm:
+ nb_dis_k_gi2sm[1] += 1
+ elif dis_k_gi_min < dis_k_sm:
+ nb_dis_k_gi2sm[2] += 1
+ # # dis_k gi -> GM
+ if dis_k_gi_min > dis_k_gm:
+ nb_dis_k_gi2gm[0] += 1
+ repeats_better_dis_k_gi2gm.append(repeat)
+ elif dis_k_gi_min == dis_k_gm:
+ nb_dis_k_gi2gm[1] += 1
+ elif dis_k_gi_min < dis_k_gm:
+ nb_dis_k_gi2gm[2] += 1
+
+ # write result summary.
+ sod_sm_mean = np.mean(sod_sm_list)
+ sod_gm_mean = np.mean(sod_gm_list)
+ dis_k_sm_mean = np.mean(dis_k_sm_list)
+ dis_k_gm_mean = np.mean(dis_k_gm_list)
+ dis_k_gi_min_mean = np.mean(dis_k_gi_min_list)
+ sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
+ dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
+ dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
+ dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
+ f_summary = open(dir_output + fn_output_summary, 'a')
+ csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k,
+ sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
+ dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
+ dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm,
+ nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
+ repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
+ repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])
+ f_summary.close()
+
+ print('\ncomplete.')
+ return
+
+
+def getRelations(sign):
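+ """Map the sign of a difference (latter - former) to a relation string.
+ """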
+ if sign == -1:
+ return 'better'
+ elif sign == 0:
+ return 'same'
+ elif sign == 1:
+ return 'worse'
+
+
+if __name__ == '__main__':
+ find_best_k()
\ No newline at end of file
diff --git a/gklearn/preimage/fitDistance.py b/gklearn/preimage/fitDistance.py
new file mode 100644
index 0000000..234f7fc
--- /dev/null
+++ b/gklearn/preimage/fitDistance.py
@@ -0,0 +1,430 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Oct 16 14:20:06 2019
+
+@author: ljia
+"""
+import numpy as np
+from tqdm import tqdm
+from itertools import combinations_with_replacement, combinations
+import multiprocessing
+from multiprocessing import Pool
+from functools import partial
+import time
+import random
+import sys
+
+from scipy import optimize
+from scipy.optimize import minimize
+import cvxpy as cp
+
+from gklearn.preimage.ged import GED, get_nb_edit_operations, get_nb_edit_operations_letter, get_nb_edit_operations_nonsymbolic
+from gklearn.preimage.utils import kernel_distance_matrix
+
+def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max,
+ params_ged={'lib': 'gedlibpy', 'cost': 'CONSTANT',
+ 'method': 'IPFP', 'stabilizer': None},
+ init_costs=[3, 3, 1, 3, 3, 1],
+ dataset='monoterpenoides', Kmatrix=None,
+ parallel=True):
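+ """Fit edit costs such that GEDs approximate the kernel-induced
+ distances between the graphs in Gn: alternate between computing GEDs
+ (and edit-operation counts) under the current costs, and re-fitting
+ the costs by constrained least squares (see update_costs).
+ """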
+# dataset = dataset.lower()
+
+ # c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them.
+# random.seed(1)
+# cost_rdm = random.sample(range(1, 10), 6)
+# init_costs = cost_rdm + [0]
+# init_costs = cost_rdm
+# init_costs = [3, 3, 1, 3, 3, 1]
+# init_costs = [i * 0.01 for i in cost_rdm] + [0]
+# init_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
+# init_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
+# init_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0]
+# idx_cost_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
+
+ # compute distances in feature space.
+ dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label,
+ Kmatrix=Kmatrix, gkernel=gkernel)
+ dis_k_vec = []
+ for i in range(len(dis_k_mat)):
+# for j in range(i, len(dis_k_mat)):
+ for j in range(i + 1, len(dis_k_mat)):
+ dis_k_vec.append(dis_k_mat[i, j])
+ dis_k_vec = np.array(dis_k_vec)
+
+ # init ged.
+ print('\ninitial:')
+ time0 = time.time()
+ params_ged['dataset'] = dataset
+ params_ged['edit_cost_constant'] = init_costs
+ ged_vec_init, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
+ parallel=parallel)
+ residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))]
+ time_list = [time.time() - time0]
+ edit_cost_list = [init_costs]
+ nb_cost_mat = np.array(n_edit_operations)
+ nb_cost_mat_list = [nb_cost_mat]
+ print('edit_costs:', init_costs)
+ print('residual_list:', residual_list)
+
+ for itr in range(itr_max):
+ print('\niteration', itr)
+ time0 = time.time()
+ # "fit" geds to distances in feature space by tuning edit costs using the
+ # Least Squares Method.
+ np.savez('results/xp_fit_method/fit_data_debug' + str(itr) + '.gm',
+ nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec,
+ n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init,
+ ged_mat=ged_mat)
+ edit_costs_new, residual = update_costs(nb_cost_mat, dis_k_vec,
+ dataset=dataset, cost=params_ged['cost'])
+ for i in range(len(edit_costs_new)):
+ if -1e-9 <= edit_costs_new[i] <= 1e-9:
+ edit_costs_new[i] = 0
+ if edit_costs_new[i] < 0:
+ raise ValueError('The edit cost is negative.')
+# for i in range(len(edit_costs_new)):
+# if edit_costs_new[i] < 0:
+# edit_costs_new[i] = 0
+
+ # compute new GEDs and numbers of edit operations.
+ params_ged['edit_cost_constant'] = edit_costs_new # np.array([edit_costs_new[0], edit_costs_new[1], 0.75])
+ ged_vec, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
+ parallel=parallel)
+ residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec))))
+ time_list.append(time.time() - time0)
+ edit_cost_list.append(edit_costs_new)
+ nb_cost_mat = np.array(n_edit_operations)
+ nb_cost_mat_list.append(nb_cost_mat)
+ print('edit_costs:', edit_costs_new)
+ print('residual_list:', residual_list)
+
+ return edit_costs_new, residual_list, edit_cost_list, dis_k_mat, ged_mat, \
+ time_list, nb_cost_mat_list
+
+
+def compute_geds(Gn, params_ged, parallel=False):
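+ """Compute pairwise GEDs and per-pair edit-operation counts over Gn,
+ either in parallel over all pairs or sequentially.
+ """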
+ edit_cost_name = params_ged['cost']
+ if edit_cost_name == 'LETTER' or edit_cost_name == 'LETTER2':
+ get_nb_eo = get_nb_edit_operations_letter
+ elif edit_cost_name == 'NON_SYMBOLIC':
+ get_nb_eo = get_nb_edit_operations_nonsymbolic
+ else:
+ get_nb_eo = get_nb_edit_operations
+ ged_mat = np.zeros((len(Gn), len(Gn)))
+ if parallel:
+# print('parallel')
+# len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
+ len_itr = int(len(Gn) * (len(Gn) - 1) / 2)
+ ged_vec = [0 for i in range(len_itr)]
+ n_edit_operations = [0 for i in range(len_itr)]
+# itr = combinations_with_replacement(range(0, len(Gn)), 2)
+ itr = combinations(range(0, len(Gn)), 2)
+ n_jobs = multiprocessing.cpu_count()
+ if len_itr < 100 * n_jobs:
+ chunksize = int(len_itr / n_jobs) + 1
+ else:
+ chunksize = 100
+ def init_worker(gn_toshare):
+ global G_gn
+ G_gn = gn_toshare
+ do_partial = partial(_wrapper_compute_ged_parallel, params_ged, get_nb_eo)
+ pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,))
+ iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
+ desc='computing GEDs', file=sys.stdout)
+# iterator = pool.imap_unordered(do_partial, itr, chunksize)
+ for i, j, dis, n_eo_tmp in iterator:
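+   # map the pair (i, j), i < j, to its linear index in the
+   # combinations(range(len(Gn)), 2) ordering.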
+ idx_itr = int(len(Gn) * i + j - (i + 1) * (i + 2) / 2)
+ ged_vec[idx_itr] = dis
+ ged_mat[i][j] = dis
+ ged_mat[j][i] = dis
+ n_edit_operations[idx_itr] = n_eo_tmp
+# print('\n-------------------------------------------')
+# print(i, j, idx_itr, dis)
+ pool.close()
+ pool.join()
+
+ else:
+ ged_vec = []
+ n_edit_operations = []
+ for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
+# for i in range(len(Gn)):
+ for j in range(i + 1, len(Gn)):
+ dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], **params_ged)
+ ged_vec.append(dis)
+ ged_mat[i][j] = dis
+ ged_mat[j][i] = dis
+ n_eo_tmp = get_nb_eo(Gn[i], Gn[j], pi_forward, pi_backward)
+ n_edit_operations.append(n_eo_tmp)
+
+ return ged_vec, ged_mat, n_edit_operations
+
+
+def _wrapper_compute_ged_parallel(params_ged, get_nb_eo, itr):
+ i = itr[0]
+ j = itr[1]
+ dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], params_ged, get_nb_eo)
+ return i, j, dis, n_eo_tmp
+
+
+def _compute_ged_parallel(g1, g2, params_ged, get_nb_eo):
+ dis, pi_forward, pi_backward = GED(g1, g2, **params_ged)
+ n_eo_tmp = get_nb_eo(g1, g2, pi_forward, pi_backward) # [0,0,0,0,0,0]
+ return dis, n_eo_tmp
+
+
+def update_costs(nb_cost_mat, dis_k_vec, dataset='monoterpenoides',
+ cost='CONSTANT', rw_constraints='inequality'):
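+ """Fit the edit cost vector x by constrained least squares: minimize
+ ||nb_cost_mat @ x - dis_k_vec||_2 subject to non-negativity and,
+ depending on the cost model, triangle-like constraints such as
+ c_vs <= c_vi + c_vr.
+ """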
+# if dataset == 'Letter-high':
+ if cost == 'LETTER':
+  # @todo: not implemented; the methods below are all commented out, so
+  # edit_costs_new and residual are undefined if this branch is taken.
+  pass
+# # method 1: set alpha automatically, just tune c_vir and c_eir by
+# # LMS using cvxpy.
+# alpha = 0.5
+# coeff = 100 # np.max(alpha * nb_cost_mat[:,4] / dis_k_vec)
+## if np.count_nonzero(nb_cost_mat[:,4]) == 0:
+## alpha = 0.75
+## else:
+## alpha = np.min([dis_k_vec / c_vs for c_vs in nb_cost_mat[:,4] if c_vs != 0])
+## alpha = alpha * 0.99
+# param_vir = alpha * (nb_cost_mat[:,0] + nb_cost_mat[:,1])
+# param_eir = (1 - alpha) * (nb_cost_mat[:,4] + nb_cost_mat[:,5])
+# nb_cost_mat_new = np.column_stack((param_vir, param_eir))
+# dis_new = coeff * dis_k_vec - alpha * nb_cost_mat[:,3]
+#
+# x = cp.Variable(nb_cost_mat_new.shape[1])
+# cost = cp.sum_squares(nb_cost_mat_new * x - dis_new)
+# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
+# prob = cp.Problem(cp.Minimize(cost), constraints)
+# prob.solve()
+# edit_costs_new = x.value
+# edit_costs_new = np.array([edit_costs_new[0], edit_costs_new[1], alpha])
+# residual = np.sqrt(prob.value)
+
+# # method 2: tune c_vir, c_eir and alpha by nonlinear programming by
+# # scipy.optimize.minimize.
+# w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1]
+# w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5]
+# w2 = nb_cost_mat[:,3]
+# w3 = dis_k_vec
+# func_min = lambda x: np.sum((w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \
+# + w2 * x[2] - w3 * x[3]) ** 2)
+# bounds = ((0, None), (0., None), (0.5, 0.5), (0, None))
+# res = minimize(func_min, [0.9, 1.7, 0.75, 10], bounds=bounds)
+# edit_costs_new = res.x[0:3]
+# residual = res.fun
+
+ # method 3: tune c_vir, c_eir and alpha by nonlinear programming using cvxpy.
+
+
+# # method 4: tune c_vir, c_eir and alpha by QP function
+# # scipy.optimize.least_squares. An initial guess is required.
+# w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1]
+# w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5]
+# w2 = nb_cost_mat[:,3]
+# w3 = dis_k_vec
+# func = lambda x: (w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \
+# + w2 * x[2] - w3 * x[3]) ** 2
+# res = optimize.root(func, [0.9, 1.7, 0.75, 100])
+# edit_costs_new = res.x
+# residual = None
+ elif cost == 'LETTER2':
+# # 1. if c_vi != c_vr, c_ei != c_er.
+# nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
+# x = cp.Variable(nb_cost_mat_new.shape[1])
+# cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
+## # 1.1 no constraints.
+## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
+# # 1.2 c_vs <= c_vi + c_vr.
+# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
+# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
+## # 2. if c_vi == c_vr, c_ei == c_er.
+## nb_cost_mat_new = nb_cost_mat[:,[0,3,4]]
+## nb_cost_mat_new[:,0] += nb_cost_mat[:,1]
+## nb_cost_mat_new[:,2] += nb_cost_mat[:,5]
+## x = cp.Variable(nb_cost_mat_new.shape[1])
+## cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
+## # 2.1 no constraints.
+## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
+### # 2.2 c_vs <= c_vi + c_vr.
+### constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
+### np.array([2.0, -1.0, 0.0]).T@x >= 0.0]
+#
+# prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+# prob.solve()
+# edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]]
+# edit_costs_new = np.array(edit_costs_new)
+# residual = np.sqrt(prob.value)
+ if rw_constraints == 'inequality':
+ # c_vs <= c_vi + c_vr.
+ nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
+ x = cp.Variable(nb_cost_mat_new.shape[1])
+ cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
+ constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])],
+ np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
+ prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+ try:
+ prob.solve(verbose=True)
+ except MemoryError as error0:
+ print('\nUsing solver "OSQP" caused a memory error.')
+ print('the original error message is\n', error0)
+ print('solver status: ', prob.status)
+ print('trying solver "CVXOPT" instead...\n')
+ try:
+ prob.solve(solver=cp.CVXOPT, verbose=True)
+ except Exception as error1:
+     print('\nAn error occurred when using solver "CVXOPT".')
+     print('the original error message is\n', error1)
+     print('solver status: ', prob.status)
+     print('trying solver "MOSEK" instead. Note that this solver is commercial and a license is required.\n')
+ prob.solve(solver=cp.MOSEK, verbose=True)
+ else:
+ print('solver status: ', prob.status)
+ else:
+ print('solver status: ', prob.status)
+ print()
+ edit_costs_new = x.value
+ residual = np.sqrt(prob.value)
+ elif rw_constraints == '2constraints':
+ # c_vs <= c_vi + c_vr and c_vi == c_vr, c_ei == c_er.
+ nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
+ x = cp.Variable(nb_cost_mat_new.shape[1])
+ cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
+ constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
+ np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0,
+ np.array([1.0, -1.0, 0.0, 0.0, 0.0]).T@x == 0.0,
+ np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
+ prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+ prob.solve()
+ edit_costs_new = x.value
+ residual = np.sqrt(prob.value)
+ elif rw_constraints == 'no-constraint':
+ # no constraint.
+ nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
+ x = cp.Variable(nb_cost_mat_new.shape[1])
+ cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
+ constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
+ prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+ prob.solve()
+ edit_costs_new = x.value
+ residual = np.sqrt(prob.value)
+# elif method == 'inequality_modified':
+# # c_vs <= c_vi + c_vr.
+# nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
+# x = cp.Variable(nb_cost_mat_new.shape[1])
+# cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
+# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
+# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
+# prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+# prob.solve()
+# # use same costs for insertion and removal rather than the fitted costs.
+# edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]]
+# edit_costs_new = np.array(edit_costs_new)
+# residual = np.sqrt(prob.value)
+ elif cost == 'NON_SYMBOLIC':
+ is_n_attr = np.count_nonzero(nb_cost_mat[:,2])
+ is_e_attr = np.count_nonzero(nb_cost_mat[:,5])
+
+ if dataset == 'SYNTHETICnew':
+# nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]]
+ nb_cost_mat_new = nb_cost_mat[:,[2,3,4]]
+ x = cp.Variable(nb_cost_mat_new.shape[1])
+ cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
+# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
+# np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
+# constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])]]
+ constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])],
+ np.array([0.0, 1.0, -1.0]).T@x == 0.0]
+ prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+ prob.solve()
+# print(x.value)
+ edit_costs_new = np.concatenate((np.array([0.0, 0.0]), x.value,
+ np.array([0.0])))
+ residual = np.sqrt(prob.value)
+
+ elif rw_constraints == 'inequality':
+ # c_vs <= c_vi + c_vr.
+ if is_n_attr and is_e_attr:
+ nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]]
+ x = cp.Variable(nb_cost_mat_new.shape[1])
+ cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
+ constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
+ np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
+ np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
+ prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+ prob.solve()
+ edit_costs_new = x.value
+ residual = np.sqrt(prob.value)
+ elif is_n_attr and not is_e_attr:
+ nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]]
+ x = cp.Variable(nb_cost_mat_new.shape[1])
+ cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
+ constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])],
+ np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
+ prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+ prob.solve()
+ print(x.value)
+ edit_costs_new = np.concatenate((x.value, np.array([0.0])))
+ residual = np.sqrt(prob.value)
+ elif not is_n_attr and is_e_attr:
+ nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
+ x = cp.Variable(nb_cost_mat_new.shape[1])
+ cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
+ constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
+ np.array([0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
+ prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+ prob.solve()
+ edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:]))
+ residual = np.sqrt(prob.value)
+ else:
+ nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]]
+ x = cp.Variable(nb_cost_mat_new.shape[1])
+ cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
+ constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
+ prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+ prob.solve()
+ edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]),
+ x.value[2:], np.array([0.0])))
+ residual = np.sqrt(prob.value)
+ else:
+# # method 1: simple least square method.
+# edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
+# rcond=None)
+
+# # method 2: least square method with x_i >= 0.
+# edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec)
+
+ # method 3: solve as a quadratic program with constraints.
+# P = np.dot(nb_cost_mat.T, nb_cost_mat)
+# q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat)
+# G = -1 * np.identity(nb_cost_mat.shape[1])
+# h = np.array([0 for i in range(nb_cost_mat.shape[1])])
+# A = np.array([1 for i in range(nb_cost_mat.shape[1])])
+# b = 1
+# x = cp.Variable(nb_cost_mat.shape[1])
+# prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q_T@x),
+# [G@x <= h])
+# prob.solve()
+# edit_costs_new = x.value
+# residual = prob.value - np.dot(dis_k_vec.T, dis_k_vec)
+
+# G = -1 * np.identity(nb_cost_mat.shape[1])
+# h = np.array([0 for i in range(nb_cost_mat.shape[1])])
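+  # constrained least squares: costs are non-negative and satisfy the
+  # triangle-like constraints c_vs <= c_vi + c_vr and c_es <= c_ei + c_er.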
+ x = cp.Variable(nb_cost_mat.shape[1])
+ cost_fun = cp.sum_squares(nb_cost_mat * x - dis_k_vec)
+ constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])],
+ # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
+ np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
+ np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
+ prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+ prob.solve()
+ edit_costs_new = x.value
+ residual = np.sqrt(prob.value)
+
+ # method 4:
+
+ return edit_costs_new, residual
+
+
+if __name__ == '__main__':
+ print('check test_fitDistance.py')
\ No newline at end of file
diff --git a/gklearn/preimage/ged.py b/gklearn/preimage/ged.py
new file mode 100644
index 0000000..a66baaf
--- /dev/null
+++ b/gklearn/preimage/ged.py
@@ -0,0 +1,467 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Oct 17 18:44:59 2019
+
+@author: ljia
+"""
+import numpy as np
+import networkx as nx
+from tqdm import tqdm
+import sys
+import multiprocessing
+from multiprocessing import Pool
+from functools import partial
+
+#from gedlibpy_linlin import librariesImport, gedlibpy
+from gklearn.gedlib import librariesImport, gedlibpy
+
+def GED(g1, g2, dataset='monoterpenoides', lib='gedlibpy', cost='CHEM_1', method='IPFP',
+ edit_cost_constant=[], algo_options='', stabilizer='min', repeat=50):
+ """
+ Compute GED for 2 graphs.
+ """
+
+# dataset = dataset.lower()
+
+ if lib == 'gedlibpy':
+ gedlibpy.restart_env()
+ gedlibpy.add_nx_graph(convertGraph(g1, cost), "")
+ gedlibpy.add_nx_graph(convertGraph(g2, cost), "")
+
+ listID = gedlibpy.get_all_graph_ids()
+ gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant)
+ gedlibpy.init()
+ gedlibpy.set_method(method, algo_options)
+ gedlibpy.init_method()
+
+ g = listID[0]
+ h = listID[1]
+ if stabilizer is None:
+ gedlibpy.run_method(g, h)
+ pi_forward = gedlibpy.get_forward_map(g, h)
+ pi_backward = gedlibpy.get_backward_map(g, h)
+ upper = gedlibpy.get_upper_bound(g, h)
+ lower = gedlibpy.get_lower_bound(g, h)
+ elif stabilizer == 'mean':
+ # @todo: to be finished...
+ upper_list = [np.inf] * repeat
+ for itr in range(repeat):
+ gedlibpy.run_method(g, h)
+ upper_list[itr] = gedlibpy.get_upper_bound(g, h)
+ pi_forward = gedlibpy.get_forward_map(g, h)
+ pi_backward = gedlibpy.get_backward_map(g, h)
+ lower = gedlibpy.get_lower_bound(g, h)
+ upper = np.mean(upper_list)
+ elif stabilizer == 'median':
+ if repeat % 2 == 0:
+ repeat += 1
+ upper_list = [np.inf] * repeat
+ pi_forward_list = [0] * repeat
+ pi_backward_list = [0] * repeat
+ for itr in range(repeat):
+ gedlibpy.run_method(g, h)
+ upper_list[itr] = gedlibpy.get_upper_bound(g, h)
+ pi_forward_list[itr] = gedlibpy.get_forward_map(g, h)
+ pi_backward_list[itr] = gedlibpy.get_backward_map(g, h)
+ lower = gedlibpy.get_lower_bound(g, h)
+ upper = np.median(upper_list)
+ idx_median = upper_list.index(upper)
+ pi_forward = pi_forward_list[idx_median]
+ pi_backward = pi_backward_list[idx_median]
+ elif stabilizer == 'min':
+ upper = np.inf
+ for itr in range(repeat):
+ gedlibpy.run_method(g, h)
+ upper_tmp = gedlibpy.get_upper_bound(g, h)
+ if upper_tmp < upper:
+ upper = upper_tmp
+ pi_forward = gedlibpy.get_forward_map(g, h)
+ pi_backward = gedlibpy.get_backward_map(g, h)
+ lower = gedlibpy.get_lower_bound(g, h)
+ if upper == 0:
+ break
+ elif stabilizer == 'max':
+ upper = 0
+ for itr in range(repeat):
+ gedlibpy.run_method(g, h)
+ upper_tmp = gedlibpy.get_upper_bound(g, h)
+ if upper_tmp > upper:
+ upper = upper_tmp
+ pi_forward = gedlibpy.get_forward_map(g, h)
+ pi_backward = gedlibpy.get_backward_map(g, h)
+ lower = gedlibpy.get_lower_bound(g, h)
+ elif stabilizer == 'gaussian':
+   # @todo: to be finished. `upper` is undefined if this branch is taken.
+   pass
+
+ dis = upper
+
+ elif lib == 'gedlib-bash':
+ import time
+ import random
+ import os
+ from gklearn.utils.graphfiles import saveDataset
+
+ tmp_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/'
+ if not os.path.exists(tmp_dir):
+ os.makedirs(tmp_dir)
+  fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, int(1e9)))
+ xparams = {'method': 'gedlib', 'graph_dir': fn_collection}
+ saveDataset([g1, g2], ['dummy', 'dummy'], gformat='gxl', group='xml',
+ filename=fn_collection, xparams=xparams)
+
+ command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n'
+ command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n'
+ command += 'export LD_LIBRARY_PATH\n'
+ command += 'cd \'' + os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/bin\'\n'
+ command += './ged_for_python_bash monoterpenoides ' + fn_collection \
+ + ' \'' + algo_options + '\' '
+ for ec in edit_cost_constant:
+ command += str(ec) + ' '
+# output = os.system(command)
+ stream = os.popen(command)
+ output = stream.readlines()
+# print(output)
+
+ dis = float(output[0].strip())
+ runtime = float(output[1].strip())
+ size_forward = int(output[2].strip())
+ pi_forward = [int(item.strip()) for item in output[3:3+size_forward]]
+ pi_backward = [int(item.strip()) for item in output[3+size_forward:]]
+
+# print(dis)
+# print(runtime)
+# print(size_forward)
+# print(pi_forward)
+# print(pi_backward)
+
+
+ # translate the maps back to node labels (removed/inserted nodes map to np.inf).
+ nodes1 = [n for n in g1.nodes()]
+ nodes2 = [n for n in g2.nodes()]
+ nb1 = nx.number_of_nodes(g1)
+ nb2 = nx.number_of_nodes(g2)
+ pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
+ pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
+# print(pi_forward)
+
+
+ return dis, pi_forward, pi_backward
+
+
+def convertGraph(G, cost):
+ """Convert a graph to the proper NetworkX format that can be
+ recognized by library gedlibpy.
+ """
+ G_new = nx.Graph()
+ if cost == 'LETTER' or cost == 'LETTER2':
+ for nd, attrs in G.nodes(data=True):
+ G_new.add_node(str(nd), x=str(attrs['attributes'][0]),
+ y=str(attrs['attributes'][1]))
+ for nd1, nd2, attrs in G.edges(data=True):
+ G_new.add_edge(str(nd1), str(nd2))
+ elif cost == 'NON_SYMBOLIC':
+ for nd, attrs in G.nodes(data=True):
+ G_new.add_node(str(nd))
+ for a_name in G.graph['node_attrs']:
+ G_new.nodes[str(nd)][a_name] = str(attrs[a_name])
+ for nd1, nd2, attrs in G.edges(data=True):
+ G_new.add_edge(str(nd1), str(nd2))
+ for a_name in G.graph['edge_attrs']:
+ G_new.edges[str(nd1), str(nd2)][a_name] = str(attrs[a_name])
+ else:
+ for nd, attrs in G.nodes(data=True):
+ G_new.add_node(str(nd), chem=attrs['atom'])
+ for nd1, nd2, attrs in G.edges(data=True):
+ G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
+# G_new.add_edge(str(nd1), str(nd2))
+
+ return G_new
+
+
+def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP',
+ edit_cost_constant=[], stabilizer='min', repeat=50):
+ """
+ Compute GEDs for a group of graphs.
+ """
+ if lib == 'gedlibpy':
+ def convertGraph(G):
+ """Convert a graph to the proper NetworkX format that can be
+ recognized by library gedlibpy.
+ """
+ G_new = nx.Graph()
+ for nd, attrs in G.nodes(data=True):
+ G_new.add_node(str(nd), chem=attrs['atom'])
+ for nd1, nd2, attrs in G.edges(data=True):
+# G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
+ G_new.add_edge(str(nd1), str(nd2))
+
+ return G_new
+
+ gedlibpy.restart_env()
+ gedlibpy.add_nx_graph(convertGraph(g1), "")
+ gedlibpy.add_nx_graph(convertGraph(g2), "")
+
+ listID = gedlibpy.get_all_graph_ids()
+ gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant)
+ gedlibpy.init()
+ gedlibpy.set_method(method, "")
+ gedlibpy.init_method()
+
+ g = listID[0]
+ h = listID[1]
+ if stabilizer is None:
+ gedlibpy.run_method(g, h)
+ pi_forward = gedlibpy.get_forward_map(g, h)
+ pi_backward = gedlibpy.get_backward_map(g, h)
+ upper = gedlibpy.get_upper_bound(g, h)
+ lower = gedlibpy.get_lower_bound(g, h)
+ elif stabilizer == 'min':
+ upper = np.inf
+ for itr in range(repeat):
+ gedlibpy.run_method(g, h)
+ upper_tmp = gedlibpy.get_upper_bound(g, h)
+ if upper_tmp < upper:
+ upper = upper_tmp
+ pi_forward = gedlibpy.get_forward_map(g, h)
+ pi_backward = gedlibpy.get_backward_map(g, h)
+ lower = gedlibpy.get_lower_bound(g, h)
+ if upper == 0:
+ break
+
+ dis = upper
+
+ # translate the maps back to node labels (removed/inserted nodes map to np.inf).
+ nodes1 = [n for n in g1.nodes()]
+ nodes2 = [n for n in g2.nodes()]
+ nb1 = nx.number_of_nodes(g1)
+ nb2 = nx.number_of_nodes(g2)
+ pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
+ pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
+
+ return dis, pi_forward, pi_backward
+
+
+def ged_median(Gn, Gn_median, verbose=False, params_ged={'lib': 'gedlibpy',
+ 'cost': 'CHEM_1', 'method': 'IPFP', 'edit_cost_constant': [],
+ 'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1',
+ 'stabilizer': None}, parallel=False):
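+ """Compute, for each graph in Gn, the sum of GEDs to all graphs in
+ Gn_median, together with the forward node maps.
+ """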
+ if parallel:
+ len_itr = int(len(Gn))
+ pi_forward_list = [[] for i in range(len_itr)]
+ dis_list = [0 for i in range(len_itr)]
+
+ itr = range(0, len_itr)
+ n_jobs = multiprocessing.cpu_count()
+ if len_itr < 100 * n_jobs:
+ chunksize = int(len_itr / n_jobs) + 1
+ else:
+ chunksize = 100
+ def init_worker(gn_toshare, gn_median_toshare):
+ global G_gn, G_gn_median
+ G_gn = gn_toshare
+ G_gn_median = gn_median_toshare
+ do_partial = partial(_compute_ged_median, params_ged)
+ pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn, Gn_median))
+ if verbose:
+ iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
+ desc='computing GEDs', file=sys.stdout)
+ else:
+ iterator = pool.imap_unordered(do_partial, itr, chunksize)
+ for i, dis_sum, pi_forward in iterator:
+ pi_forward_list[i] = pi_forward
+ dis_list[i] = dis_sum
+# print('\n-------------------------------------------')
+# print(i, j, idx_itr, dis)
+ pool.close()
+ pool.join()
+
+ else:
+ dis_list = []
+ pi_forward_list = []
+ for idx, G in tqdm(enumerate(Gn), desc='computing median distances',
+ file=sys.stdout) if verbose else enumerate(Gn):
+ dis_sum = 0
+ pi_forward_list.append([])
+ for G_p in Gn_median:
+ dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p,
+ **params_ged)
+ pi_forward_list[idx].append(pi_tmp_forward)
+ dis_sum += dis_tmp
+ dis_list.append(dis_sum)
+
+ return dis_list, pi_forward_list
+
+
+def _compute_ged_median(params_ged, itr):
+# print(itr)
+ dis_sum = 0
+ pi_forward = []
+ for G_p in G_gn_median:
+ dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G_gn[itr], G_p,
+ **params_ged)
+ pi_forward.append(pi_tmp_forward)
+ dis_sum += dis_tmp
+
+ return itr, dis_sum, pi_forward
+
+
+def get_nb_edit_operations(g1, g2, forward_map, backward_map):
+ """Compute the number of each edit operations.
+ """
+ n_vi = 0
+ n_vr = 0
+ n_vs = 0
+ n_ei = 0
+ n_er = 0
+ n_es = 0
+
+ nodes1 = [n for n in g1.nodes()]
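+ # forward_map[i] is the node of g2 that the i-th node of g1 maps to,
+ # or np.inf if that node is removed; backward_map is the reverse.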
+ for i, map_i in enumerate(forward_map):
+ if map_i == np.inf:
+ n_vr += 1
+  elif g1.nodes[nodes1[i]]['atom'] != g2.nodes[map_i]['atom']:
+ n_vs += 1
+ for map_i in backward_map:
+ if map_i == np.inf:
+ n_vi += 1
+
+# idx_nodes1 = range(0, len(node1))
+
+ edges1 = [e for e in g1.edges()]
+ nb_edges2_cnted = 0
+ for n1, n2 in edges1:
+ idx1 = nodes1.index(n1)
+ idx2 = nodes1.index(n2)
+ # one of the nodes is removed, thus the edge is removed.
+ if forward_map[idx1] == np.inf or forward_map[idx2] == np.inf:
+ n_er += 1
+ # corresponding edge is in g2.
+ elif (forward_map[idx1], forward_map[idx2]) in g2.edges():
+ nb_edges2_cnted += 1
+ # edge labels are different.
+ if g2.edges[((forward_map[idx1], forward_map[idx2]))]['bond_type'] \
+ != g1.edges[(n1, n2)]['bond_type']:
+ n_es += 1
+ elif (forward_map[idx2], forward_map[idx1]) in g2.edges():
+ nb_edges2_cnted += 1
+ # edge labels are different.
+ if g2.edges[((forward_map[idx2], forward_map[idx1]))]['bond_type'] \
+ != g1.edges[(n1, n2)]['bond_type']:
+ n_es += 1
+ # corresponding nodes are in g2, however the edge is removed.
+ else:
+ n_er += 1
+ n_ei = nx.number_of_edges(g2) - nb_edges2_cnted
+
+ return n_vi, n_vr, n_vs, n_ei, n_er, n_es
+
+
+def get_nb_edit_operations_letter(g1, g2, forward_map, backward_map):
+ """Compute the number of each edit operations.
+ """
+ n_vi = 0
+ n_vr = 0
+ n_vs = 0
+ sod_vs = 0
+ n_ei = 0
+ n_er = 0
+
+ nodes1 = [n for n in g1.nodes()]
+ for i, map_i in enumerate(forward_map):
+ if map_i == np.inf:
+ n_vr += 1
+ else:
+ n_vs += 1
+ diff_x = float(g1.nodes[nodes1[i]]['x']) - float(g2.nodes[map_i]['x'])
+ diff_y = float(g1.nodes[nodes1[i]]['y']) - float(g2.nodes[map_i]['y'])
+ sod_vs += np.sqrt(np.square(diff_x) + np.square(diff_y))
+ for map_i in backward_map:
+ if map_i == np.inf:
+ n_vi += 1
+
+# idx_nodes1 = range(0, len(node1))
+
+ edges1 = [e for e in g1.edges()]
+ nb_edges2_cnted = 0
+ for n1, n2 in edges1:
+ idx1 = nodes1.index(n1)
+ idx2 = nodes1.index(n2)
+ # one of the nodes is removed, thus the edge is removed.
+ if forward_map[idx1] == np.inf or forward_map[idx2] == np.inf:
+ n_er += 1
+ # corresponding edge is in g2. Edge label is not considered.
+ elif (forward_map[idx1], forward_map[idx2]) in g2.edges() or \
+ (forward_map[idx2], forward_map[idx1]) in g2.edges():
+ nb_edges2_cnted += 1
+ # corresponding nodes are in g2, however the edge is removed.
+ else:
+ n_er += 1
+ n_ei = nx.number_of_edges(g2) - nb_edges2_cnted
+
+ return n_vi, n_vr, n_vs, sod_vs, n_ei, n_er
+
+
+def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map):
+ """Compute the number of each edit operations.
+ """
+ n_vi = 0
+ n_vr = 0
+ n_vs = 0
+ sod_vs = 0
+ n_ei = 0
+ n_er = 0
+ n_es = 0
+ sod_es = 0
+
+ nodes1 = [n for n in g1.nodes()]
+ for i, map_i in enumerate(forward_map):
+ if map_i == np.inf:
+ n_vr += 1
+ else:
+ n_vs += 1
+ sum_squares = 0
+ for a_name in g1.graph['node_attrs']:
+ diff = float(g1.nodes[nodes1[i]][a_name]) - float(g2.nodes[map_i][a_name])
+ sum_squares += np.square(diff)
+ sod_vs += np.sqrt(sum_squares)
+ for map_i in backward_map:
+ if map_i == np.inf:
+ n_vi += 1
+
+# idx_nodes1 = range(0, len(node1))
+
+ edges1 = [e for e in g1.edges()]
+ for n1, n2 in edges1:
+ idx1 = nodes1.index(n1)
+ idx2 = nodes1.index(n2)
+ n1_g2 = forward_map[idx1]
+ n2_g2 = forward_map[idx2]
+ # one of the nodes is removed, thus the edge is removed.
+ if n1_g2 == np.inf or n2_g2 == np.inf:
+ n_er += 1
+ # corresponding edge is in g2.
+ elif (n1_g2, n2_g2) in g2.edges():
+ n_es += 1
+ sum_squares = 0
+ for a_name in g1.graph['edge_attrs']:
+    diff = float(g1.edges[n1, n2][a_name]) - float(g2.edges[n1_g2, n2_g2][a_name])
+ sum_squares += np.square(diff)
+ sod_es += np.sqrt(sum_squares)
+ elif (n2_g2, n1_g2) in g2.edges():
+ n_es += 1
+ sum_squares = 0
+ for a_name in g1.graph['edge_attrs']:
+    diff = float(g1.edges[n2, n1][a_name]) - float(g2.edges[n2_g2, n1_g2][a_name])
+ sum_squares += np.square(diff)
+ sod_es += np.sqrt(sum_squares)
+ # corresponding nodes are in g2, however the edge is removed.
+ else:
+ n_er += 1
+ n_ei = nx.number_of_edges(g2) - n_es
+
+ return n_vi, n_vr, sod_vs, n_ei, n_er, sod_es
+
+
+if __name__ == '__main__':
+ print('check test_ged.py')
\ No newline at end of file
diff --git a/gklearn/preimage/iam.py b/gklearn/preimage/iam.py
new file mode 100644
index 0000000..f3e2165
--- /dev/null
+++ b/gklearn/preimage/iam.py
@@ -0,0 +1,775 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Apr 26 11:49:12 2019
+
+Iterative alternate minimizations using GED.
+@author: ljia
+"""
+import numpy as np
+import random
+import networkx as nx
+from tqdm import tqdm
+
+from gklearn.utils.graphdataset import get_dataset_attributes
+from gklearn.utils.utils import graph_isIdentical, get_node_labels, get_edge_labels
+from gklearn.preimage.ged import GED, ged_median
+
+
+def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
+ epsilon=0.001, node_label='atom', edge_label='bond_type',
+ connected=False, removeNodes=True, allBestInit=False, allBestNodes=False,
+ allBestEdges=False, allBestOutput=False,
+ params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP',
+ 'edit_cost_constant': [], 'stabilizer': None,
+ 'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'}):
+ """See my name, then you know what I do.
+ """
+# Gn_median = Gn_median[0:10]
+# Gn_median = [nx.convert_node_labels_to_integers(g) for g in Gn_median]
+ node_ir = np.inf # represents node removal/insertion in the node maps.
+ label_r = 'thanksdanny' # the label marking a removed node. # @todo: make sure this label cannot collide with a real label.
+ ds_attrs = get_dataset_attributes(Gn_median + Gn_candidate,
+ attr_names=['edge_labeled', 'node_attr_dim', 'edge_attr_dim'],
+ edge_label=edge_label)
+ node_label_set = get_node_labels(Gn_median, node_label)
+ edge_label_set = get_edge_labels(Gn_median, edge_label)
+
+
+ def generate_graph(G, pi_p_forward):
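+  """One IAM update step: re-estimate the node labels, edge labels and
+  adjacencies of G from the current node maps pi_p_forward, then
+  re-evaluate the GEDs of the generated graphs to the median set.
+  """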
+ G_new_list = [G.copy()] # all "best" graphs generated in this iteration.
+# nx.draw_networkx(G)
+# import matplotlib.pyplot as plt
+# plt.show()
+# print(pi_p_forward)
+
+ # update vertex labels.
+ # pre-compute h_i0 for each label.
+# for label in get_node_labels(Gn, node_label):
+# print(label)
+# for nd in G.nodes(data=True):
+# pass
+ if not ds_attrs['node_attr_dim']: # labels are symbolic
+ for ndi, (nd, _) in enumerate(G.nodes(data=True)):
+ h_i0_list = []
+ label_list = []
+ for label in node_label_set:
+ h_i0 = 0
+ for idx, g in enumerate(Gn_median):
+ pi_i = pi_p_forward[idx][ndi]
+ if pi_i != node_ir and g.nodes[pi_i][node_label] == label:
+ h_i0 += 1
+ h_i0_list.append(h_i0)
+ label_list.append(label)
+ # case when the node is to be removed.
+ if removeNodes:
+ h_i0_remove = 0 # @todo: maybe this can be added to the node_label_set above.
+ for idx, g in enumerate(Gn_median):
+ pi_i = pi_p_forward[idx][ndi]
+ if pi_i == node_ir:
+ h_i0_remove += 1
+ h_i0_list.append(h_i0_remove)
+ label_list.append(label_r)
+ # get the best labels.
+ idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
+ if allBestNodes: # choose all best graphs.
+ nlabel_best = [label_list[idx] for idx in idx_max]
+ # generate "best" graphs with regard to "best" node labels.
+ G_new_list_nd = []
+ for g in G_new_list: # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now.
+ for nl in nlabel_best:
+ g_tmp = g.copy()
+ if nl == label_r:
+ g_tmp.remove_node(nd)
+ else:
+ g_tmp.nodes[nd][node_label] = nl
+ G_new_list_nd.append(g_tmp)
+ # nx.draw_networkx(g_tmp)
+ # import matplotlib.pyplot as plt
+ # plt.show()
+ # print(g_tmp.nodes(data=True))
+ # print(g_tmp.edges(data=True))
+ G_new_list = [ggg.copy() for ggg in G_new_list_nd]
+ else:
+ # choose one of the best randomly.
+ idx_rdm = random.randint(0, len(idx_max) - 1)
+ best_label = label_list[idx_max[idx_rdm]]
+ h_i0_max = h_i0_list[idx_max[idx_rdm]]
+
+ g_new = G_new_list[0]
+ if best_label == label_r:
+ g_new.remove_node(nd)
+ else:
+ g_new.nodes[nd][node_label] = best_label
+ G_new_list = [g_new]
+ else: # labels are non-symbolic
+ for ndi, (nd, _) in enumerate(G.nodes(data=True)):
+ Si_norm = 0
+ phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
+ for idx, g in enumerate(Gn_median):
+ pi_i = pi_p_forward[idx][ndi]
+ if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0?
+ Si_norm += 1
+ phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
+ phi_i_bar /= Si_norm
+ G_new_list[0].nodes[nd]['attributes'] = phi_i_bar
+
+# for g in G_new_list:
+# import matplotlib.pyplot as plt
+# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
+# plt.show()
+# print(g.nodes(data=True))
+# print(g.edges(data=True))
+
+ # update edge labels and adjacency matrix.
+ if ds_attrs['edge_labeled']:
+ G_new_list_edge = []
+ for g_new in G_new_list:
+ nd_list = [n for n in g_new.nodes()]
+ g_tmp_list = [g_new.copy()]
+ for nd1i in range(nx.number_of_nodes(g_new)):
+ nd1 = nd_list[nd1i]# @todo: not just edges, but all pairs of nodes
+ for nd2i in range(nd1i + 1, nx.number_of_nodes(g_new)):
+ nd2 = nd_list[nd2i]
+# for nd1, nd2, _ in g_new.edges(data=True):
+ h_ij0_list = []
+ label_list = []
+ for label in edge_label_set:
+ h_ij0 = 0
+ for idx, g in enumerate(Gn_median):
+ pi_i = pi_p_forward[idx][nd1i]
+ pi_j = pi_p_forward[idx][nd2i]
+ h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
+ g.has_edge(pi_i, pi_j) and
+ g.edges[pi_i, pi_j][edge_label] == label)
+ h_ij0 += h_ij0_p
+ h_ij0_list.append(h_ij0)
+ label_list.append(label)
+
+ # get the best labels.
+ idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
+ if allBestEdges: # choose all best graphs.
+ elabel_best = [label_list[idx] for idx in idx_max]
+ h_ij0_max = [h_ij0_list[idx] for idx in idx_max]
+ # generate "best" graphs with regard to "best" node labels.
+ G_new_list_ed = []
+ for g_tmp in g_tmp_list: # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now.
+ for idxl, el in enumerate(elabel_best):
+ g_tmp_copy = g_tmp.copy()
+ # check whether a_ij is 0 or 1.
+ sij_norm = 0
+ for idx, g in enumerate(Gn_median):
+ pi_i = pi_p_forward[idx][nd1i]
+ pi_j = pi_p_forward[idx][nd2i]
+ if g.has_node(pi_i) and g.has_node(pi_j) and \
+ g.has_edge(pi_i, pi_j):
+ sij_norm += 1
+ if h_ij0_max[idxl] > len(Gn_median) * c_er / c_es + \
+ sij_norm * (1 - (c_er + c_ei) / c_es):
+ if not g_tmp_copy.has_edge(nd1, nd2):
+ g_tmp_copy.add_edge(nd1, nd2)
+ g_tmp_copy.edges[nd1, nd2][edge_label] = elabel_best[idxl]
+ else:
+ if g_tmp_copy.has_edge(nd1, nd2):
+ g_tmp_copy.remove_edge(nd1, nd2)
+ G_new_list_ed.append(g_tmp_copy)
+ g_tmp_list = [ggg.copy() for ggg in G_new_list_ed]
+ else: # choose one of the best randomly.
+ idx_rdm = random.randint(0, len(idx_max) - 1)
+ best_label = label_list[idx_max[idx_rdm]]
+ h_ij0_max = h_ij0_list[idx_max[idx_rdm]]
+
+ # check whether a_ij is 0 or 1.
+ sij_norm = 0
+ for idx, g in enumerate(Gn_median):
+ pi_i = pi_p_forward[idx][nd1i]
+ pi_j = pi_p_forward[idx][nd2i]
+ if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
+ sij_norm += 1
+ if h_ij0_max > len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
+ if not g_new.has_edge(nd1, nd2):
+ g_new.add_edge(nd1, nd2)
+ g_new.edges[nd1, nd2][edge_label] = best_label
+ else:
+# elif h_ij0_max < len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
+ if g_new.has_edge(nd1, nd2):
+ g_new.remove_edge(nd1, nd2)
+ g_tmp_list = [g_new]
+ G_new_list_edge += g_tmp_list
+ G_new_list = [ggg.copy() for ggg in G_new_list_edge]
+
+
+ else: # if edges are unlabeled
+ # @todo: is this even right? G or g_tmp? check if the new one is right
+ # @todo: works only for undirected graphs.
+
+ for g_tmp in G_new_list:
+ nd_list = [n for n in g_tmp.nodes()]
+ for nd1i in range(nx.number_of_nodes(g_tmp)):
+ nd1 = nd_list[nd1i]
+ for nd2i in range(nd1i + 1, nx.number_of_nodes(g_tmp)):
+ nd2 = nd_list[nd2i]
+ sij_norm = 0
+ for idx, g in enumerate(Gn_median):
+ pi_i = pi_p_forward[idx][nd1i]
+ pi_j = pi_p_forward[idx][nd2i]
+ if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
+ sij_norm += 1
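+      # majority vote: add the edge only if it occurs in enough of the
+      # mapped median graphs to outweigh the removal/insertion costs.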
+ if sij_norm > len(Gn_median) * c_er / (c_er + c_ei):
+ # @todo: should we consider if nd1 and nd2 in g_tmp?
+ # or just add the edge anyway?
+ if g_tmp.has_node(nd1) and g_tmp.has_node(nd2) \
+ and not g_tmp.has_edge(nd1, nd2):
+ g_tmp.add_edge(nd1, nd2)
+ else: # @todo: which to use?
+# elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei):
+ if g_tmp.has_edge(nd1, nd2):
+ g_tmp.remove_edge(nd1, nd2)
+ # do not change anything when equal.
+
+# for i, g in enumerate(G_new_list):
+# import matplotlib.pyplot as plt
+# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
+## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG")
+# plt.show()
+# print(g.nodes(data=True))
+# print(g.edges(data=True))
+
+# # find the best graph generated in this iteration and update pi_p.
+ # @todo: should we update all graphs generated or just the best ones?
+ dis_list, pi_forward_list = ged_median(G_new_list, Gn_median,
+ params_ged=params_ged)
+ # @todo: should we remove the identical and connectivity check?
+ # Don't know which is faster.
+ if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
+ G_new_list, idx_list = remove_duplicates(G_new_list)
+ pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
+ dis_list = [dis_list[idx] for idx in idx_list]
+# if connected == True:
+# G_new_list, idx_list = remove_disconnected(G_new_list)
+# pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
+# idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
+# dis_min = dis_list[idx_min_tmp_list[0]]
+# pi_forward_list = [pi_forward_list[idx] for idx in idx_min_list]
+# G_new_list = [G_new_list[idx] for idx in idx_min_list]
+
+# for g in G_new_list:
+# import matplotlib.pyplot as plt
+# nx.draw_networkx(g)
+# plt.show()
+# print(g.nodes(data=True))
+# print(g.edges(data=True))
+
+ return G_new_list, pi_forward_list, dis_list
+
+
+ def best_median_graphs(Gn_candidate, pi_all_forward, dis_all):
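+  """Pick the graphs (and their node maps) with the smallest summed
+  distance to the median set.
+  """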
+ idx_min_list = np.argwhere(dis_all == np.min(dis_all)).flatten().tolist()
+ dis_min = dis_all[idx_min_list[0]]
+ pi_forward_min_list = [pi_all_forward[idx] for idx in idx_min_list]
+ G_min_list = [Gn_candidate[idx] for idx in idx_min_list]
+ return G_min_list, pi_forward_min_list, dis_min
+
+
+ def iteration_proc(G, pi_p_forward, cur_sod):
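+  """Alternate between regenerating the candidate graphs and updating
+  their node maps until the SOD converges (difference <= epsilon) or
+  ite_max iterations are reached.
+  """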
+ G_list = [G]
+ pi_forward_list = [pi_p_forward]
+ old_sod = cur_sod * 2
+ sod_list = [cur_sod]
+ dis_list = [cur_sod]
+ # iterations.
+ itr = 0
+ # @todo: what if difference == 0?
+# while itr < ite_max and (np.abs(old_sod - cur_sod) > epsilon or
+# np.abs(old_sod - cur_sod) == 0):
+ while itr < ite_max and np.abs(old_sod - cur_sod) > epsilon:
+# while itr < ite_max:
+# for itr in range(0, 5): # the convergence condition?
+ print('itr_iam is', itr)
+ G_new_list = []
+ pi_forward_new_list = []
+ dis_new_list = []
+ for idx, g in enumerate(G_list):
+# label_set = get_node_labels(Gn_median + [g], node_label)
+ G_tmp_list, pi_forward_tmp_list, dis_tmp_list = generate_graph(
+ g, pi_forward_list[idx])
+ G_new_list += G_tmp_list
+ pi_forward_new_list += pi_forward_tmp_list
+ dis_new_list += dis_tmp_list
+ # @todo: need to remove duplicates here?
+ G_list = [ggg.copy() for ggg in G_new_list]
+ pi_forward_list = [pitem.copy() for pitem in pi_forward_new_list]
+ dis_list = dis_new_list[:]
+
+ old_sod = cur_sod
+ cur_sod = np.min(dis_list)
+ sod_list.append(cur_sod)
+
+ itr += 1
+
+ # @todo: do we return all graphs or the best ones?
+ # get the best ones of the generated graphs.
+ G_list, pi_forward_list, dis_min = best_median_graphs(
+ G_list, pi_forward_list, dis_list)
+
+ if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
+ G_list, idx_list = remove_duplicates(G_list)
+ pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
+# dis_list = [dis_list[idx] for idx in idx_list]
+
+# import matplotlib.pyplot as plt
+# for g in G_list:
+# nx.draw_networkx(g)
+# plt.show()
+# print(g.nodes(data=True))
+# print(g.edges(data=True))
+
+ print('\nsods:', sod_list, '\n')
+
+ return G_list, pi_forward_list, dis_min, sod_list
+
+
+ def remove_duplicates(Gn):
+ """Remove duplicate graphs from list.
+ """
+ Gn_new = []
+ idx_list = []
+ for idx, g in enumerate(Gn):
+ dupl = False
+ for g_new in Gn_new:
+ if graph_isIdentical(g_new, g):
+ dupl = True
+ break
+ if not dupl:
+ Gn_new.append(g)
+ idx_list.append(idx)
+ return Gn_new, idx_list
+
+
+ def remove_disconnected(Gn):
+ """Remove disconnected graphs from list.
+ """
+ Gn_new = []
+ idx_list = []
+ for idx, g in enumerate(Gn):
+ if nx.is_connected(g):
+ Gn_new.append(g)
+ idx_list.append(idx)
+ return Gn_new, idx_list
+
+
+ ###########################################################################
+
+ # phase 1: initialize.
+ # compute set-median.
+ dis_min = np.inf
+ dis_list, pi_forward_all = ged_median(Gn_candidate, Gn_median,
+ params_ged=params_ged, parallel=True)
+ print('finished computing GEDs.')
+ # find all smallest distances.
+ if allBestInit: # try all best init graphs.
+ idx_min_list = range(len(dis_list))
+ dis_min = dis_list
+ else:
+ idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
+ dis_min = [dis_list[idx_min_list[0]]] * len(idx_min_list)
+ idx_min_rdm = random.randint(0, len(idx_min_list) - 1)
+ idx_min_list = [idx_min_list[idx_min_rdm]]
+ sod_set_median = np.min(dis_min)
+
+
+ # phase 2: iteration.
+ G_list = []
+ dis_list = []
+ pi_forward_list = []
+ G_set_median_list = []
+# sod_list = []
+ for idx_tmp, idx_min in enumerate(idx_min_list):
+# print('idx_min is', idx_min)
+ G = Gn_candidate[idx_min].copy()
+ G_set_median_list.append(G.copy())
+ # list of edit operations.
+ pi_p_forward = pi_forward_all[idx_min]
+# pi_p_backward = pi_all_backward[idx_min]
+ Gi_list, pi_i_forward_list, dis_i_min, sod_list = iteration_proc(G,
+ pi_p_forward, dis_min[idx_tmp])
+ G_list += Gi_list
+ dis_list += [dis_i_min] * len(Gi_list)
+ pi_forward_list += pi_i_forward_list
+
+
+ if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
+ G_list, idx_list = remove_duplicates(G_list)
+ dis_list = [dis_list[idx] for idx in idx_list]
+ pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
+ if connected:
+ G_list_con, idx_list = remove_disconnected(G_list)
+  # if there are no connected graphs at all, keep the disconnected ones.
+ if len(G_list_con) > 0: # @todo: ??????????????????????????
+ G_list = G_list_con
+ dis_list = [dis_list[idx] for idx in idx_list]
+ pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
+
+# import matplotlib.pyplot as plt
+# for g in G_list:
+# nx.draw_networkx(g)
+# plt.show()
+# print(g.nodes(data=True))
+# print(g.edges(data=True))
+
+ # get the best median graphs
+ G_gen_median_list, pi_forward_min_list, sod_gen_median = best_median_graphs(
+ G_list, pi_forward_list, dis_list)
+# for g in G_gen_median_list:
+# nx.draw_networkx(g)
+# plt.show()
+# print(g.nodes(data=True))
+# print(g.edges(data=True))
+
+ if not allBestOutput:
+ # randomly choose one graph.
+ idx_rdm = random.randint(0, len(G_gen_median_list) - 1)
+ G_gen_median_list = [G_gen_median_list[idx_rdm]]
+
+ return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median
+
+
+def iam_bash(Gn_names, edit_cost_constant, cost='CONSTANT', initial_solutions=1,
+ dataset='monoterpenoides',
+ graph_dir=''):
+	"""Compute the IAM median graphs via the C++ implementation (GEDLIB), invoked through bash.
+ """
+ import os
+ import time
+
+ def createCollectionFile(Gn_names, y, filename):
+ """Create collection file.
+ """
+ dirname_ds = os.path.dirname(filename)
+ if dirname_ds != '':
+ dirname_ds += '/'
+			if not os.path.exists(dirname_ds):
+ os.makedirs(dirname_ds)
+
+		with open(filename + '.xml', 'w') as fgroup:
+			# assumption: GEDLIB's standard GraphCollection XML format.
+			fgroup.write("<?xml version=\"1.0\"?>")
+			fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">")
+			fgroup.write("\n<GraphCollection>")
+			for idx, fname in enumerate(Gn_names):
+				fgroup.write("\n\t<graph file=\"" + fname + "\" class=\"" + str(y[idx]) + "\"/>")
+			fgroup.write("\n</GraphCollection>")
+
+ tmp_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/'
+	fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, int(1e9)))
+ createCollectionFile(Gn_names, ['dummy'] * len(Gn_names), fn_collection)
+# fn_collection = tmp_dir + 'collection_for_debug'
+# graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/gxl'
+
+# if dataset == 'Letter-high' or dataset == 'Fingerprint':
+# dataset = 'letter'
+ command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/Linlin/gedlib\'\n'
+ command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n'
+ command += 'export LD_LIBRARY_PATH\n'
+ command += 'cd \'' + os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/bin\'\n'
+ command += './iam_for_python_bash ' + dataset + ' ' + fn_collection \
+ + ' \'' + graph_dir + '\' ' + ' ' + cost + ' ' + str(initial_solutions) + ' '
+ if edit_cost_constant is None:
+ command += 'None'
+ else:
+ for ec in edit_cost_constant:
+ command += str(ec) + ' '
+# output = os.system(command)
+ stream = os.popen(command)
+
+ output = stream.readlines()
+# print(output)
+ sod_sm = float(output[0].strip())
+ sod_gm = float(output[1].strip())
+
+ fname_sm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/set_median.gxl'
+ fname_gm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/gen_median.gxl'
+
+ return sod_sm, sod_gm, fname_sm, fname_gm
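+
+# A minimal usage sketch for iam_bash (hypothetical variables; assumes the
+# compiled gedlib binary and the GXL files referenced above exist):
+# fnames = [g.graph['filename'] for g in Gn]
+# sod_sm, sod_gm, fname_sm, fname_gm = iam_bash(fnames, [3, 3, 1, 3, 3, 1],
+#                                               graph_dir='datasets/monoterpenoides/')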
+
+
+
+###############################################################################
+# Old implementations.
+
+def iam(Gn, c_ei=3, c_er=3, c_es=1, node_label='atom', edge_label='bond_type',
+ connected=True):
+	"""Compute the median graph of Gn by iterative alternate minimization (IAM).
+ """
+# Gn = Gn[0:10]
+ Gn = [nx.convert_node_labels_to_integers(g) for g in Gn]
+
+	# phase 1: initialize.
+ # compute set-median.
+ dis_min = np.inf
+ pi_p = []
+ pi_all = []
+ for idx1, G_p in enumerate(Gn):
+ dist_sum = 0
+ pi_all.append([])
+ for idx2, G_p_prime in enumerate(Gn):
+ dist_tmp, pi_tmp, _ = GED(G_p, G_p_prime)
+ pi_all[idx1].append(pi_tmp)
+ dist_sum += dist_tmp
+ if dist_sum < dis_min:
+ dis_min = dist_sum
+ G = G_p.copy()
+ idx_min = idx1
+ # list of edit operations.
+ pi_p = pi_all[idx_min]
+
+ # phase 2: iteration.
+ ds_attrs = get_dataset_attributes(Gn, attr_names=['edge_labeled', 'node_attr_dim'],
+ edge_label=edge_label)
+ for itr in range(0, 10): # @todo: the convergence condition?
+ G_new = G.copy()
+ # update vertex labels.
+ # pre-compute h_i0 for each label.
+# for label in get_node_labels(Gn, node_label):
+# print(label)
+# for nd in G.nodes(data=True):
+# pass
+ if not ds_attrs['node_attr_dim']: # labels are symbolic
+ for nd, _ in G.nodes(data=True):
+ h_i0_list = []
+ label_list = []
+ for label in get_node_labels(Gn, node_label):
+ h_i0 = 0
+ for idx, g in enumerate(Gn):
+ pi_i = pi_p[idx][nd]
+ if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
+ h_i0 += 1
+ h_i0_list.append(h_i0)
+ label_list.append(label)
+ # choose one of the best randomly.
+ idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
+ idx_rdm = random.randint(0, len(idx_max) - 1)
+ G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]]
+ else: # labels are non-symbolic
+ for nd, _ in G.nodes(data=True):
+ Si_norm = 0
+ phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
+ for idx, g in enumerate(Gn):
+ pi_i = pi_p[idx][nd]
+ if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0?
+ Si_norm += 1
+ phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
+ phi_i_bar /= Si_norm
+ G_new.nodes[nd]['attributes'] = phi_i_bar
+
+ # update edge labels and adjacency matrix.
+ if ds_attrs['edge_labeled']:
+ for nd1, nd2, _ in G.edges(data=True):
+ h_ij0_list = []
+ label_list = []
+ for label in get_edge_labels(Gn, edge_label):
+ h_ij0 = 0
+ for idx, g in enumerate(Gn):
+ pi_i = pi_p[idx][nd1]
+ pi_j = pi_p[idx][nd2]
+ h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
+ g.has_edge(pi_i, pi_j) and
+ g.edges[pi_i, pi_j][edge_label] == label)
+ h_ij0 += h_ij0_p
+ h_ij0_list.append(h_ij0)
+ label_list.append(label)
+ # choose one of the best randomly.
+ idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
+ h_ij0_max = h_ij0_list[idx_max[0]]
+ idx_rdm = random.randint(0, len(idx_max) - 1)
+ best_label = label_list[idx_max[idx_rdm]]
+
+ # check whether a_ij is 0 or 1.
+ sij_norm = 0
+ for idx, g in enumerate(Gn):
+ pi_i = pi_p[idx][nd1]
+ pi_j = pi_p[idx][nd2]
+ if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
+ sij_norm += 1
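+					# Labeled case: add (nd1, nd2) with the best label iff its support
+					# h_ij0_max passes the cost threshold below, which trades off edge
+					# substitutions (c_es) against insertions (c_ei) and removals (c_er).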
+ if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
+ if not G_new.has_edge(nd1, nd2):
+ G_new.add_edge(nd1, nd2)
+ G_new.edges[nd1, nd2][edge_label] = best_label
+ else:
+ if G_new.has_edge(nd1, nd2):
+ G_new.remove_edge(nd1, nd2)
+ else: # if edges are unlabeled
+ for nd1, nd2, _ in G.edges(data=True):
+ sij_norm = 0
+ for idx, g in enumerate(Gn):
+ pi_i = pi_p[idx][nd1]
+ pi_j = pi_p[idx][nd2]
+ if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
+ sij_norm += 1
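+					# Unlabeled case: keep the edge iff dropping it (one insertion, c_ei,
+					# per graph that has it) would cost more than keeping it (one removal,
+					# c_er, per graph that lacks it); this reduces to the threshold below.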
+ if sij_norm > len(Gn) * c_er / (c_er + c_ei):
+ if not G_new.has_edge(nd1, nd2):
+ G_new.add_edge(nd1, nd2)
+ else:
+ if G_new.has_edge(nd1, nd2):
+ G_new.remove_edge(nd1, nd2)
+
+ G = G_new.copy()
+
+ # update pi_p
+ pi_p = []
+ for idx1, G_p in enumerate(Gn):
+ dist_tmp, pi_tmp, _ = GED(G, G_p)
+ pi_p.append(pi_tmp)
+
+ return G
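+
+# A minimal usage sketch (assumes Gn is a list of labeled networkx graphs, as
+# loaded in the __main__ block at the bottom of this file):
+# G_median = iam(Gn, c_ei=3, c_er=3, c_es=1)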
+
+# --------------------------- These are tests --------------------------------#
+
+def test_iam_with_more_graphs_as_init(Gn, G_candidate, c_ei=3, c_er=3, c_es=1,
+ node_label='atom', edge_label='bond_type'):
+	"""Compute the median graph of Gn by IAM, trying the graphs in G_candidate as initializations.
+ """
+# Gn = Gn[0:10]
+ Gn = [nx.convert_node_labels_to_integers(g) for g in Gn]
+
+	# phase 1: initialize.
+ # compute set-median.
+ dis_min = np.inf
+# pi_p = []
+ pi_all_forward = []
+ pi_all_backward = []
+ for idx1, G_p in tqdm(enumerate(G_candidate), desc='computing GEDs', file=sys.stdout):
+ dist_sum = 0
+ pi_all_forward.append([])
+ pi_all_backward.append([])
+ for idx2, G_p_prime in enumerate(Gn):
+ dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G_p, G_p_prime)
+ pi_all_forward[idx1].append(pi_tmp_forward)
+ pi_all_backward[idx1].append(pi_tmp_backward)
+ dist_sum += dist_tmp
+ if dist_sum <= dis_min:
+ dis_min = dist_sum
+ G = G_p.copy()
+ idx_min = idx1
+ # list of edit operations.
+ pi_p_forward = pi_all_forward[idx_min]
+ pi_p_backward = pi_all_backward[idx_min]
+
+ # phase 2: iteration.
+ ds_attrs = get_dataset_attributes(Gn + [G], attr_names=['edge_labeled', 'node_attr_dim'],
+ edge_label=edge_label)
+ label_set = get_node_labels(Gn + [G], node_label)
+ for itr in range(0, 10): # @todo: the convergence condition?
+ G_new = G.copy()
+ # update vertex labels.
+ # pre-compute h_i0 for each label.
+# for label in get_node_labels(Gn, node_label):
+# print(label)
+# for nd in G.nodes(data=True):
+# pass
+ if not ds_attrs['node_attr_dim']: # labels are symbolic
+ for nd in G.nodes():
+ h_i0_list = []
+ label_list = []
+ for label in label_set:
+ h_i0 = 0
+ for idx, g in enumerate(Gn):
+ pi_i = pi_p_forward[idx][nd]
+ if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
+ h_i0 += 1
+ h_i0_list.append(h_i0)
+ label_list.append(label)
+ # choose one of the best randomly.
+ idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
+ idx_rdm = random.randint(0, len(idx_max) - 1)
+ G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]]
+ else: # labels are non-symbolic
+ for nd in G.nodes():
+ Si_norm = 0
+ phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
+ for idx, g in enumerate(Gn):
+ pi_i = pi_p_forward[idx][nd]
+ if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0?
+ Si_norm += 1
+ phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
+ phi_i_bar /= Si_norm
+ G_new.nodes[nd]['attributes'] = phi_i_bar
+
+ # update edge labels and adjacency matrix.
+ if ds_attrs['edge_labeled']:
+ for nd1, nd2, _ in G.edges(data=True):
+ h_ij0_list = []
+ label_list = []
+ for label in get_edge_labels(Gn, edge_label):
+ h_ij0 = 0
+ for idx, g in enumerate(Gn):
+ pi_i = pi_p_forward[idx][nd1]
+ pi_j = pi_p_forward[idx][nd2]
+ h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
+ g.has_edge(pi_i, pi_j) and
+ g.edges[pi_i, pi_j][edge_label] == label)
+ h_ij0 += h_ij0_p
+ h_ij0_list.append(h_ij0)
+ label_list.append(label)
+ # choose one of the best randomly.
+ idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
+ h_ij0_max = h_ij0_list[idx_max[0]]
+ idx_rdm = random.randint(0, len(idx_max) - 1)
+ best_label = label_list[idx_max[idx_rdm]]
+
+ # check whether a_ij is 0 or 1.
+ sij_norm = 0
+ for idx, g in enumerate(Gn):
+ pi_i = pi_p_forward[idx][nd1]
+ pi_j = pi_p_forward[idx][nd2]
+ if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
+ sij_norm += 1
+ if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
+ if not G_new.has_edge(nd1, nd2):
+ G_new.add_edge(nd1, nd2)
+ G_new.edges[nd1, nd2][edge_label] = best_label
+ else:
+ if G_new.has_edge(nd1, nd2):
+ G_new.remove_edge(nd1, nd2)
+ else: # if edges are unlabeled
+ # @todo: works only for undirected graphs.
+ for nd1 in range(nx.number_of_nodes(G)):
+ for nd2 in range(nd1 + 1, nx.number_of_nodes(G)):
+ sij_norm = 0
+ for idx, g in enumerate(Gn):
+ pi_i = pi_p_forward[idx][nd1]
+ pi_j = pi_p_forward[idx][nd2]
+ if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
+ sij_norm += 1
+ if sij_norm > len(Gn) * c_er / (c_er + c_ei):
+ if not G_new.has_edge(nd1, nd2):
+ G_new.add_edge(nd1, nd2)
+ elif sij_norm < len(Gn) * c_er / (c_er + c_ei):
+ if G_new.has_edge(nd1, nd2):
+ G_new.remove_edge(nd1, nd2)
+ # do not change anything when equal.
+
+ G = G_new.copy()
+
+ # update pi_p
+ pi_p_forward = []
+ for G_p in Gn:
+ dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p)
+ pi_p_forward.append(pi_tmp_forward)
+
+ return G
+
+
+###############################################################################
+
+if __name__ == '__main__':
+ from gklearn.utils.graphfiles import loadDataset
+ ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
+ 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb
+# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
+# 'extra_params': {}} # node nsymb
+# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
+# 'extra_params': {}}
+ Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
+
+ iam(Gn)
\ No newline at end of file
diff --git a/gklearn/preimage/knn.py b/gklearn/preimage/knn.py
new file mode 100644
index 0000000..c179287
--- /dev/null
+++ b/gklearn/preimage/knn.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jan 10 13:22:04 2020
+
+@author: ljia
+"""
+import numpy as np
+#import matplotlib.pyplot as plt
+from tqdm import tqdm
+import random
+#import csv
+from shutil import copyfile
+import os
+import sys
+
+from gklearn.preimage.iam import iam_bash
+from gklearn.utils.graphfiles import loadDataset, loadGXL
+from gklearn.preimage.ged import GED
+from gklearn.preimage.utils import get_same_item_indices
+
+def test_knn():
+ ds = {'name': 'monoterpenoides',
+ 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
+ Gn, y_all = loadDataset(ds['dataset'])
+# Gn = Gn[0:50]
+# gkernel = 'treeletkernel'
+# node_label = 'atom'
+# edge_label = 'bond_type'
+# ds_name = 'mono'
+ dir_output = 'results/knn/'
+	graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/../../datasets/monoterpenoides/'
+
+ k_nn = 1
+ percent = 0.1
+ repeats = 50
+ edit_cost_constant = [3, 3, 1, 3, 3, 1]
+
+ # get indices by classes.
+ y_idx = get_same_item_indices(y_all)
+	accuracy_sm_list = []
+	accuracy_gm_list = []
+	sod_sm_list = []
+	sod_gm_list = []
+ for repeat in range(0, repeats):
+ print('\n---------------------------------')
+ print('repeat =', repeat)
+
+ random.seed(repeat)
+ set_median_list = []
+ gen_median_list = []
+ train_y_set = []
+ for y, values in y_idx.items():
+ print('\ny =', y)
+ size_median_set = int(len(values) * percent)
+ median_set_idx = random.sample(values, size_median_set)
+ print('median set: ', median_set_idx)
+
+ # compute set median and gen median using IAM (C++ through bash).
+ # Gn_median = [Gn[idx] for idx in median_set_idx]
+ group_fnames = [Gn[g].graph['filename'] for g in median_set_idx]
+ sod_sm, sod_gm, fname_sm, fname_gm = iam_bash(group_fnames, edit_cost_constant,
+ graph_dir=graph_dir)
+ print('sod_sm, sod_gm:', sod_sm, sod_gm)
+ sod_sm_list.append(sod_sm)
+ sod_gm_list.append(sod_gm)
+ fname_sm_new = dir_output + 'medians/set_median.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl'
+ copyfile(fname_sm, fname_sm_new)
+ fname_gm_new = dir_output + 'medians/gen_median.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl'
+ copyfile(fname_gm, fname_gm_new)
+ set_median_list.append(loadGXL(fname_sm_new))
+ gen_median_list.append(loadGXL(fname_gm_new))
+ train_y_set.append(int(y))
+
+ print(sod_sm, sod_gm)
+
+ # do 1-nn.
+ test_y_set = [int(y) for y in y_all]
+ accuracy_sm = knn(set_median_list, train_y_set, Gn, test_y_set, k=k_nn, distance='ged')
+		accuracy_gm = knn(gen_median_list, train_y_set, Gn, test_y_set, k=k_nn, distance='ged')
+ accuracy_sm_list.append(accuracy_sm)
+ accuracy_gm_list.append(accuracy_gm)
+ print('current accuracy sm and gm:', accuracy_sm, accuracy_gm)
+
+ # output
+ accuracy_sm_mean = np.mean(accuracy_sm_list)
+ accuracy_gm_mean = np.mean(accuracy_gm_list)
+ print('\ntotal average accuracy sm and gm:', accuracy_sm_mean, accuracy_gm_mean)
+
+
+def knn(train_set, train_y_set, test_set, test_y_set, k=1, distance='ged'):
+ if k == 1 and distance == 'ged':
+ algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
+ params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
+ 'algo_options': algo_options, 'stabilizer': None}
+ accuracy = 0
+ for idx_test, g_test in tqdm(enumerate(test_set), desc='computing 1-nn',
+ file=sys.stdout):
+ dis = np.inf
+ for idx_train, g_train in enumerate(train_set):
+ dis_cur, _, _ = GED(g_test, g_train, **params_ged)
+ if dis_cur < dis:
+ dis = dis_cur
+ test_y_cur = train_y_set[idx_train]
+ if test_y_cur == test_y_set[idx_test]:
+ accuracy += 1
+ accuracy = accuracy / len(test_set)
+
+ return accuracy
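+
+# A minimal usage sketch (hypothetical variables; assumes medians and Gn are
+# lists of networkx graphs and the y lists hold integer class labels):
+# acc = knn(medians, median_y, Gn, test_y, k=1, distance='ged')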
+
+
+
+if __name__ == '__main__':
+ test_knn()
\ No newline at end of file
diff --git a/gklearn/preimage/libs.py b/gklearn/preimage/libs.py
new file mode 100644
index 0000000..76005c6
--- /dev/null
+++ b/gklearn/preimage/libs.py
@@ -0,0 +1,6 @@
+import sys
+import pathlib
+
+# insert gedlibpy library.
+sys.path.insert(0, "../../../")
+from gedlibpy import librariesImport, gedlibpy
diff --git a/gklearn/preimage/median.py b/gklearn/preimage/median.py
new file mode 100644
index 0000000..1c5bb0f
--- /dev/null
+++ b/gklearn/preimage/median.py
@@ -0,0 +1,218 @@
+import sys
+sys.path.insert(0, "../")
+#import pathlib
+import numpy as np
+import networkx as nx
+import time
+
+from gedlibpy import librariesImport, gedlibpy
+#import script
+sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/")
+import gklearn
+from gklearn.utils.graphfiles import loadDataset
+
+def replace_graph_in_env(script, graph, old_id, label='median'):
+ """
+ Replace a graph in script
+
+	If old_id is -1, add a new graph to the environment.
+
+ """
+ if(old_id > -1):
+ script.PyClearGraph(old_id)
+ new_id = script.PyAddGraph(label)
+ for i in graph.nodes():
+		script.PyAddNode(new_id, str(i), graph.node[i]) # !! strings are required by gedlib
+ for e in graph.edges:
+ script.PyAddEdge(new_id, str(e[0]),str(e[1]), {})
+ script.PyInitEnv()
+ script.PySetMethod("IPFP", "")
+ script.PyInitMethod()
+
+ return new_id
+
+# Draw the current median (Letter dataset).
+def draw_Letter_graph(graph, savepath=''):
+ import numpy as np
+ import networkx as nx
+ import matplotlib.pyplot as plt
+ plt.figure()
+ pos = {}
+ for n in graph.nodes:
+ pos[n] = np.array([float(graph.node[n]['attributes'][0]),
+ float(graph.node[n]['attributes'][1])])
+ nx.draw_networkx(graph, pos)
+ if savepath != '':
+ plt.savefig(savepath + str(time.time()) + '.eps', format='eps', dpi=300)
+ plt.show()
+ plt.clf()
+
+#compute new mappings
+def update_mappings(script,median_id,listID):
+ med_distances = {}
+ med_mappings = {}
+ sod = 0
+ for i in range(0,len(listID)):
+ script.PyRunMethod(median_id,listID[i])
+ med_distances[i] = script.PyGetUpperBound(median_id,listID[i])
+ med_mappings[i] = script.PyGetForwardMap(median_id,listID[i])
+ sod += med_distances[i]
+ return med_distances, med_mappings, sod
+
+def calcul_Sij(all_mappings, all_graphs,i,j):
+ s_ij = 0
+ for k in range(0,len(all_mappings)):
+ cur_graph = all_graphs[k]
+ cur_mapping = all_mappings[k]
+ size_graph = cur_graph.order()
+ if ((cur_mapping[i] < size_graph) and
+ (cur_mapping[j] < size_graph) and
+ (cur_graph.has_edge(cur_mapping[i], cur_mapping[j]) == True)):
+ s_ij += 1
+
+ return s_ij
+
+# def update_median_nodes_L1(median,listIdSet,median_id,dataset, mappings):
+# from scipy.stats.mstats import gmean
+
+# for i in median.nodes():
+# for k in listIdSet:
+# vectors = [] #np.zeros((len(listIdSet),2))
+# if(k != median_id):
+# phi_i = mappings[k][i]
+# if(phi_i < dataset[k].order()):
+# vectors.append([float(dataset[k].node[phi_i]['x']),float(dataset[k].node[phi_i]['y'])])
+
+# new_labels = gmean(vectors)
+# median.node[i]['x'] = str(new_labels[0])
+# median.node[i]['y'] = str(new_labels[1])
+# return median
+
+def update_median_nodes(median,dataset,mappings):
+ #update node attributes
+ for i in median.nodes():
+ nb_sub=0
+ mean_label = {'x' : 0, 'y' : 0}
+ for k in range(0,len(mappings)):
+ phi_i = mappings[k][i]
+ if ( phi_i < dataset[k].order() ):
+ nb_sub += 1
+ mean_label['x'] += 0.75*float(dataset[k].node[phi_i]['x'])
+ mean_label['y'] += 0.75*float(dataset[k].node[phi_i]['y'])
+ median.node[i]['x'] = str((1/0.75)*(mean_label['x']/nb_sub))
+ median.node[i]['y'] = str((1/0.75)*(mean_label['y']/nb_sub))
+ return median
+
+def update_median_edges(dataset, mappings, median, cei=0.425,cer=0.425):
+#for letter high, ceir = 1.7, alpha = 0.75
+ size_dataset = len(dataset)
+ ratio_cei_cer = cer/(cei + cer)
+ threshold = size_dataset*ratio_cei_cer
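+	# an edge is kept iff it appears in more than cer / (cei + cer) of the
+	# mapped graphs; with the default cei == cer this is a majority vote.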
+ order_graph_median = median.order()
+ for i in range(0,order_graph_median):
+ for j in range(i+1,order_graph_median):
+ s_ij = calcul_Sij(mappings,dataset,i,j)
+ if(s_ij > threshold):
+ median.add_edge(i,j)
+ else:
+ if(median.has_edge(i,j)):
+ median.remove_edge(i,j)
+ return median
+
+
+
+def compute_median(script, listID, dataset, verbose=False):
+	"""Compute a graph median of a dataset according to an environment.
+
+	Parameters
+	----------
+	script : an initialized gedlib environment
+	listID (list): a list of graph IDs in script; encodes the dataset
+	dataset (list): the corresponding graphs in networkx format. We assume that
+		graph listID[i] corresponds to dataset[i]
+
+	Returns
+	-------
+	The median graph (networkx), its SOD, the list of SODs per iteration, and
+	the set-median graph
+	"""
+ print(len(listID))
+ median_set_index, median_set_sod = compute_median_set(script, listID)
+ print(median_set_index)
+ print(median_set_sod)
+ sods = []
+	# Add the median to the environment.
+ set_median = dataset[median_set_index].copy()
+ median = dataset[median_set_index].copy()
+ cur_med_id = replace_graph_in_env(script,median,-1)
+ med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID)
+ sods.append(cur_sod)
+ if(verbose):
+ print(cur_sod)
+ ite_max = 50
+ old_sod = cur_sod * 2
+ ite = 0
+ epsilon = 0.001
+
+	while((ite < ite_max) and (np.abs(old_sod - cur_sod) > epsilon)):
+		old_sod = cur_sod # remember the previous SOD for the convergence test.
+		median = update_median_nodes(median, dataset, med_mappings)
+ median = update_median_edges(dataset,med_mappings,median)
+
+ cur_med_id = replace_graph_in_env(script,median,cur_med_id)
+ med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID)
+
+
+ sods.append(cur_sod)
+ if(verbose):
+ print(cur_sod)
+ ite += 1
+	return median, cur_sod, sods, set_median
+
+
+def compute_median_set(script,listID):
+	"""Return the index (w.r.t. listID) of the set-median, and its SOD."""
+	# Compute the set-median.
+ N=len(listID)
+ map_id_to_index = {}
+ map_index_to_id = {}
+ for i in range(0,len(listID)):
+ map_id_to_index[listID[i]] = i
+ map_index_to_id[i] = listID[i]
+
+ distances = np.zeros((N,N))
+ for i in listID:
+ for j in listID:
+ script.PyRunMethod(i,j)
+ distances[map_id_to_index[i],map_id_to_index[j]] = script.PyGetUpperBound(i,j)
+
+ median_set_index = np.argmin(np.sum(distances,0))
+ sod = np.min(np.sum(distances,0))
+
+ return median_set_index, sod
+
+if __name__ == "__main__":
+	# Load the dataset.
+	script = gedlibpy # assumption: the imported gedlibpy module exposes the Py* API used in this file.
+	script.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml')
+ script.PySetEditCost("LETTER")
+ script.PyInitEnv()
+ script.PySetMethod("IPFP", "")
+ script.PyInitMethod()
+
+ dataset,my_y = gklearn.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl")
+
+ listID = script.PyGetAllGraphIds()
+	median, sod, sods, set_median = compute_median(script, listID, dataset, verbose=True)
+
+ print(sod)
+ draw_Letter_graph(median)
+
+
+#if __name__ == '__main__':
+# # test draw_Letter_graph
+# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
+# 'extra_params': {}} # node nsymb
+# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
+# print(y_all)
+# for g in Gn:
+# draw_Letter_graph(g)
\ No newline at end of file
diff --git a/gklearn/preimage/median_benoit.py b/gklearn/preimage/median_benoit.py
new file mode 100644
index 0000000..6712196
--- /dev/null
+++ b/gklearn/preimage/median_benoit.py
@@ -0,0 +1,201 @@
+import sys
+import pathlib
+import numpy as np
+import networkx as nx
+
+import librariesImport
+import script
+sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/")
+import gklearn
+
+def replace_graph_in_env(script, graph, old_id, label='median'):
+ """
+ Replace a graph in script
+
+	If old_id is -1, add a new graph to the environment.
+
+ """
+ if(old_id > -1):
+ script.PyClearGraph(old_id)
+ new_id = script.PyAddGraph(label)
+ for i in graph.nodes():
+		script.PyAddNode(new_id, str(i), graph.node[i]) # !! strings are required by gedlib
+ for e in graph.edges:
+ script.PyAddEdge(new_id, str(e[0]),str(e[1]), {})
+ script.PyInitEnv()
+ script.PySetMethod("IPFP", "")
+ script.PyInitMethod()
+
+ return new_id
+
+# Draw the current median (Letter dataset).
+def draw_Letter_graph(graph):
+ import numpy as np
+ import networkx as nx
+ import matplotlib.pyplot as plt
+ plt.figure()
+ pos = {}
+ for n in graph.nodes:
+ pos[n] = np.array([float(graph.node[n]['x']),float(graph.node[n]['y'])])
+ nx.draw_networkx(graph,pos)
+ plt.show()
+
+#compute new mappings
+def update_mappings(script,median_id,listID):
+ med_distances = {}
+ med_mappings = {}
+ sod = 0
+ for i in range(0,len(listID)):
+ script.PyRunMethod(median_id,listID[i])
+ med_distances[i] = script.PyGetUpperBound(median_id,listID[i])
+ med_mappings[i] = script.PyGetForwardMap(median_id,listID[i])
+ sod += med_distances[i]
+ return med_distances, med_mappings, sod
+
+def calcul_Sij(all_mappings, all_graphs,i,j):
+ s_ij = 0
+ for k in range(0,len(all_mappings)):
+ cur_graph = all_graphs[k]
+ cur_mapping = all_mappings[k]
+ size_graph = cur_graph.order()
+ if ((cur_mapping[i] < size_graph) and
+ (cur_mapping[j] < size_graph) and
+ (cur_graph.has_edge(cur_mapping[i], cur_mapping[j]) == True)):
+ s_ij += 1
+
+ return s_ij
+
+# def update_median_nodes_L1(median,listIdSet,median_id,dataset, mappings):
+# from scipy.stats.mstats import gmean
+
+# for i in median.nodes():
+# for k in listIdSet:
+# vectors = [] #np.zeros((len(listIdSet),2))
+# if(k != median_id):
+# phi_i = mappings[k][i]
+# if(phi_i < dataset[k].order()):
+# vectors.append([float(dataset[k].node[phi_i]['x']),float(dataset[k].node[phi_i]['y'])])
+
+# new_labels = gmean(vectors)
+# median.node[i]['x'] = str(new_labels[0])
+# median.node[i]['y'] = str(new_labels[1])
+# return median
+
+def update_median_nodes(median,dataset,mappings):
+ #update node attributes
+ for i in median.nodes():
+ nb_sub=0
+ mean_label = {'x' : 0, 'y' : 0}
+ for k in range(0,len(mappings)):
+ phi_i = mappings[k][i]
+ if ( phi_i < dataset[k].order() ):
+ nb_sub += 1
+ mean_label['x'] += 0.75*float(dataset[k].node[phi_i]['x'])
+ mean_label['y'] += 0.75*float(dataset[k].node[phi_i]['y'])
+ median.node[i]['x'] = str((1/0.75)*(mean_label['x']/nb_sub))
+ median.node[i]['y'] = str((1/0.75)*(mean_label['y']/nb_sub))
+ return median
+
+def update_median_edges(dataset, mappings, median, cei=0.425,cer=0.425):
+#for letter high, ceir = 1.7, alpha = 0.75
+ size_dataset = len(dataset)
+ ratio_cei_cer = cer/(cei + cer)
+ threshold = size_dataset*ratio_cei_cer
+ order_graph_median = median.order()
+ for i in range(0,order_graph_median):
+ for j in range(i+1,order_graph_median):
+ s_ij = calcul_Sij(mappings,dataset,i,j)
+ if(s_ij > threshold):
+ median.add_edge(i,j)
+ else:
+ if(median.has_edge(i,j)):
+ median.remove_edge(i,j)
+ return median
+
+
+
+def compute_median(script, listID, dataset, verbose=False):
+	"""Compute a graph median of a dataset according to an environment.
+
+	Parameters
+	----------
+	script : an initialized gedlib environment
+	listID (list): a list of graph IDs in script; encodes the dataset
+	dataset (list): the corresponding graphs in networkx format. We assume that
+		graph listID[i] corresponds to dataset[i]
+
+	Returns
+	-------
+	The median graph (networkx), its SOD, the list of SODs per iteration, and
+	the set-median graph
+	"""
+ print(len(listID))
+ median_set_index, median_set_sod = compute_median_set(script, listID)
+ print(median_set_index)
+ print(median_set_sod)
+ sods = []
+	# Add the median to the environment.
+ set_median = dataset[median_set_index].copy()
+ median = dataset[median_set_index].copy()
+ cur_med_id = replace_graph_in_env(script,median,-1)
+ med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID)
+ sods.append(cur_sod)
+ if(verbose):
+ print(cur_sod)
+ ite_max = 50
+ old_sod = cur_sod * 2
+ ite = 0
+ epsilon = 0.001
+
+	while((ite < ite_max) and (np.abs(old_sod - cur_sod) > epsilon)):
+		old_sod = cur_sod # remember the previous SOD for the convergence test.
+		median = update_median_nodes(median, dataset, med_mappings)
+ median = update_median_edges(dataset,med_mappings,median)
+
+ cur_med_id = replace_graph_in_env(script,median,cur_med_id)
+ med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID)
+
+
+ sods.append(cur_sod)
+ if(verbose):
+ print(cur_sod)
+ ite += 1
+	return median, cur_sod, sods, set_median
+
+
+def compute_median_set(script,listID):
+	"""Return the index (w.r.t. listID) of the set-median, and its SOD."""
+	# Compute the set-median.
+ N=len(listID)
+ map_id_to_index = {}
+ map_index_to_id = {}
+ for i in range(0,len(listID)):
+ map_id_to_index[listID[i]] = i
+ map_index_to_id[i] = listID[i]
+
+ distances = np.zeros((N,N))
+ for i in listID:
+ for j in listID:
+ script.PyRunMethod(i,j)
+ distances[map_id_to_index[i],map_id_to_index[j]] = script.PyGetUpperBound(i,j)
+
+ median_set_index = np.argmin(np.sum(distances,0))
+ sod = np.min(np.sum(distances,0))
+
+ return median_set_index, sod
+
+if __name__ == "__main__":
+	# Load the dataset.
+ script.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml')
+ script.PySetEditCost("LETTER")
+ script.PyInitEnv()
+ script.PySetMethod("IPFP", "")
+ script.PyInitMethod()
+
+ dataset,my_y = gklearn.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl")
+
+ listID = script.PyGetAllGraphIds()
+	median, sod, sods, set_median = compute_median(script, listID, dataset, verbose=True)
+
+ print(sod)
+ draw_Letter_graph(median)
diff --git a/gklearn/preimage/median_graph_estimator.py b/gklearn/preimage/median_graph_estimator.py
new file mode 100644
index 0000000..b70cc61
--- /dev/null
+++ b/gklearn/preimage/median_graph_estimator.py
@@ -0,0 +1,826 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Mar 16 18:04:55 2020
+
+@author: ljia
+"""
+import numpy as np
+from gklearn.preimage.common_types import AlgorithmState
+from gklearn.preimage import misc
+from gklearn.preimage.timer import Timer
+from gklearn.utils.utils import graph_isIdentical
+import time
+from tqdm import tqdm
+import sys
+import networkx as nx
+
+
+class MedianGraphEstimator(object):
+
+ def __init__(self, ged_env, constant_node_costs):
+ """Constructor.
+
+ Parameters
+ ----------
+ ged_env : gklearn.gedlib.gedlibpy.GEDEnv
+ Initialized GED environment. The edit costs must be set by the user.
+
+ constant_node_costs : Boolean
+ Set to True if the node relabeling costs are constant.
+ """
+		if ged_env is None:
+			raise Exception('The GED environment pointer passed to the constructor of MedianGraphEstimator is null.')
+		elif not ged_env.is_initialized():
+			raise Exception('The GED environment is uninitialized. Call gedlibpy.GEDEnv.init() before passing it to the constructor of MedianGraphEstimator.')
+		self.__ged_env = ged_env
+ self.__init_method = 'BRANCH_FAST'
+ self.__init_options = ''
+ self.__descent_method = 'BRANCH_FAST'
+ self.__descent_options = ''
+ self.__refine_method = 'IPFP'
+ self.__refine_options = ''
+ self.__constant_node_costs = constant_node_costs
+ self.__labeled_nodes = (ged_env.get_num_node_labels() > 1)
+ self.__node_del_cost = ged_env.get_node_del_cost(ged_env.get_node_label(1))
+ self.__node_ins_cost = ged_env.get_node_ins_cost(ged_env.get_node_label(1))
+ self.__labeled_edges = (ged_env.get_num_edge_labels() > 1)
+ self.__edge_del_cost = ged_env.get_edge_del_cost(ged_env.get_edge_label(1))
+ self.__edge_ins_cost = ged_env.get_edge_ins_cost(ged_env.get_edge_label(1))
+ self.__init_type = 'RANDOM'
+ self.__num_random_inits = 10
+ self.__desired_num_random_inits = 10
+ self.__use_real_randomness = True
+ self.__seed = 0
+ self.__refine = True
+ self.__time_limit_in_sec = 0
+ self.__epsilon = 0.0001
+ self.__max_itrs = 100
+ self.__max_itrs_without_update = 3
+ self.__num_inits_increase_order = 10
+ self.__init_type_increase_order = 'K-MEANS++'
+ self.__max_itrs_increase_order = 10
+ self.__print_to_stdout = 2
+ self.__median_id = np.inf # @todo: check
+ self.__median_node_id_prefix = '' # @todo: check
+ self.__node_maps_from_median = {}
+ self.__sum_of_distances = 0
+ self.__best_init_sum_of_distances = np.inf
+ self.__converged_sum_of_distances = np.inf
+ self.__runtime = None
+ self.__runtime_initialized = None
+ self.__runtime_converged = None
+ self.__itrs = [] # @todo: check: {} ?
+ self.__num_decrease_order = 0
+ self.__num_increase_order = 0
+ self.__num_converged_descents = 0
+ self.__state = AlgorithmState.TERMINATED
+
+
+
+ def set_options(self, options):
+ """Sets the options of the estimator.
+
+ Parameters
+ ----------
+ options : string
+ String that specifies with which options to run the estimator.
+ """
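+		# Example (hypothetical): self.set_options('--init-type MEDOID --refine FALSE --seed 42')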
+ self.__set_default_options()
+ options_map = misc.options_string_to_options_map(options)
+ for opt_name, opt_val in options_map.items():
+ if opt_name == 'init-type':
+ self.__init_type = opt_val
+ if opt_val != 'MEDOID' and opt_val != 'RANDOM' and opt_val != 'MIN' and opt_val != 'MAX' and opt_val != 'MEAN':
+ raise Exception('Invalid argument ' + opt_val + ' for option init-type. Usage: options = "[--init-type RANDOM|MEDOID|EMPTY|MIN|MAX|MEAN] [...]"')
+ elif opt_name == 'random-inits':
+ try:
+ self.__num_random_inits = int(opt_val)
+ self.__desired_num_random_inits = self.__num_random_inits
+ except:
+					raise Exception('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits <convertible to int greater 0>]"')
+
+ if self.__num_random_inits <= 0:
+					raise Exception('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits <convertible to int greater 0>]"')
+
+ elif opt_name == 'randomness':
+ if opt_val == 'PSEUDO':
+ self.__use_real_randomness = False
+
+ elif opt_val == 'REAL':
+ self.__use_real_randomness = True
+
+ else:
+ raise Exception('Invalid argument "' + opt_val + '" for option randomness. Usage: options = "[--randomness REAL|PSEUDO] [...]"')
+
+ elif opt_name == 'stdout':
+ if opt_val == '0':
+ self.__print_to_stdout = 0
+
+ elif opt_val == '1':
+ self.__print_to_stdout = 1
+
+ elif opt_val == '2':
+ self.__print_to_stdout = 2
+
+ else:
+ raise Exception('Invalid argument "' + opt_val + '" for option stdout. Usage: options = "[--stdout 0|1|2] [...]"')
+
+ elif opt_name == 'refine':
+ if opt_val == 'TRUE':
+ self.__refine = True
+
+ elif opt_val == 'FALSE':
+ self.__refine = False
+
+ else:
+ raise Exception('Invalid argument "' + opt_val + '" for option refine. Usage: options = "[--refine TRUE|FALSE] [...]"')
+
+ elif opt_name == 'time-limit':
+ try:
+ self.__time_limit_in_sec = float(opt_val)
+
+ except:
+					raise Exception('Invalid argument "' + opt_val + '" for option time-limit. Usage: options = "[--time-limit <convertible to double>] [...]"')
+
+ elif opt_name == 'max-itrs':
+ try:
+ self.__max_itrs = int(opt_val)
+
+ except:
+					raise Exception('Invalid argument "' + opt_val + '" for option max-itrs. Usage: options = "[--max-itrs <convertible to int>] [...]"')
+
+ elif opt_name == 'max-itrs-without-update':
+ try:
+ self.__max_itrs_without_update = int(opt_val)
+
+ except:
+					raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-without-update. Usage: options = "[--max-itrs-without-update <convertible to int>] [...]"')
+
+ elif opt_name == 'seed':
+ try:
+ self.__seed = int(opt_val)
+
+ except:
+					raise Exception('Invalid argument "' + opt_val + '" for option seed. Usage: options = "[--seed <convertible to int greater equal 0>] [...]"')
+
+ elif opt_name == 'epsilon':
+ try:
+ self.__epsilon = float(opt_val)
+
+ except:
+					raise Exception('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon <convertible to double greater 0>] [...]"')
+
+ if self.__epsilon <= 0:
+					raise Exception('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon <convertible to double greater 0>] [...]"')
+
+ elif opt_name == 'inits-increase-order':
+ try:
+ self.__num_inits_increase_order = int(opt_val)
+
+ except:
+					raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order <convertible to int greater 0>]"')
+
+ if self.__num_inits_increase_order <= 0:
+					raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order <convertible to int greater 0>]"')
+
+ elif opt_name == 'init-type-increase-order':
+ self.__init_type_increase_order = opt_val
+ if opt_val != 'CLUSTERS' and opt_val != 'K-MEANS++':
+ raise Exception('Invalid argument ' + opt_val + ' for option init-type-increase-order. Usage: options = "[--init-type-increase-order CLUSTERS|K-MEANS++] [...]"')
+
+ elif opt_name == 'max-itrs-increase-order':
+ try:
+ self.__max_itrs_increase_order = int(opt_val)
+
+ except:
+					raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-increase-order. Usage: options = "[--max-itrs-increase-order <convertible to int>] [...]"')
+
+ else:
+				valid_options = '[--init-type <arg>] [--random-inits <arg>] [--randomness <arg>] [--seed <arg>] [--stdout <arg>] '
+				valid_options += '[--time-limit <arg>] [--max-itrs <arg>] [--epsilon <arg>] '
+				valid_options += '[--inits-increase-order <arg>] [--init-type-increase-order <arg>] [--max-itrs-increase-order <arg>]'
+ raise Exception('Invalid option "' + opt_name + '". Usage: options = "' + valid_options + '"')
+
+
+ def set_init_method(self, init_method, init_options=''):
+ """Selects method to be used for computing the initial medoid graph.
+
+ Parameters
+ ----------
+ init_method : string
+ The selected method. Default: ged::Options::GEDMethod::BRANCH_UNIFORM.
+
+ init_options : string
+ The options for the selected method. Default: "".
+
+ Notes
+ -----
+ Has no effect unless "--init-type MEDOID" is passed to set_options().
+ """
+		self.__init_method = init_method
+		self.__init_options = init_options
+
+
+ def set_descent_method(self, descent_method, descent_options=''):
+		"""Selects method to be used for block gradient descent.
+
+ Parameters
+ ----------
+ descent_method : string
+ The selected method. Default: ged::Options::GEDMethod::BRANCH_FAST.
+
+ descent_options : string
+ The options for the selected method. Default: "".
+		"""
+		self.__descent_method = descent_method
+		self.__descent_options = descent_options
+
+
+ def set_refine_method(self, refine_method, refine_options):
+ """Selects method to be used for improving the sum of distances and the node maps for the converged median.
+
+ Parameters
+ ----------
+ refine_method : string
+ The selected method. Default: "IPFP".
+
+ refine_options : string
+ The options for the selected method. Default: "".
+
+ Notes
+ -----
+ Has no effect if "--refine FALSE" is passed to set_options().
+ """
+ self.__refine_method = refine_method
+ self.__refine_options = refine_options
+
+
+ def run(self, graph_ids, set_median_id, gen_median_id):
+ """Computes a generalized median graph.
+
+ Parameters
+ ----------
+ graph_ids : list[integer]
+ The IDs of the graphs for which the median should be computed. Must have been added to the environment passed to the constructor.
+
+ set_median_id : integer
+ The ID of the computed set-median. A dummy graph with this ID must have been added to the environment passed to the constructor. Upon termination, the computed median can be obtained via gklearn.gedlib.gedlibpy.GEDEnv.get_graph().
+
+
+ gen_median_id : integer
+ The ID of the computed generalized median. Upon termination, the computed median can be obtained via gklearn.gedlib.gedlibpy.GEDEnv.get_graph().
+ """
+ # Sanity checks.
+ if len(graph_ids) == 0:
+ raise Exception('Empty vector of graph IDs, unable to compute median.')
+ all_graphs_empty = True
+ for graph_id in graph_ids:
+ if self.__ged_env.get_graph_num_nodes(graph_id) > 0:
+ self.__median_node_id_prefix = self.__ged_env.get_original_node_ids(graph_id)[0]
+ all_graphs_empty = False
+ break
+ if all_graphs_empty:
+ raise Exception('All graphs in the collection are empty.')
+
+ # Start timer and record start time.
+ start = time.time()
+ timer = Timer(self.__time_limit_in_sec)
+ self.__median_id = gen_median_id
+ self.__state = AlgorithmState.TERMINATED
+
+ # Get ExchangeGraph representations of the input graphs.
+ graphs = {}
+ for graph_id in graph_ids:
+ # @todo: get_nx_graph() function may need to be modified according to the coming code.
+ graphs[graph_id] = self.__ged_env.get_nx_graph(graph_id, True, True, False)
+# print(self.__ged_env.get_graph_internal_id(0))
+# print(graphs[0].graph)
+# print(graphs[0].nodes(data=True))
+# print(graphs[0].edges(data=True))
+# print(nx.adjacency_matrix(graphs[0]))
+
+
+ # Construct initial medians.
+ medians = []
+ self.__construct_initial_medians(graph_ids, timer, medians)
+ end_init = time.time()
+ self.__runtime_initialized = end_init - start
+# print(medians[0].graph)
+# print(medians[0].nodes(data=True))
+# print(medians[0].edges(data=True))
+# print(nx.adjacency_matrix(medians[0]))
+
+ # Reset information about iterations and number of times the median decreases and increases.
+ self.__itrs = [0] * len(medians)
+ self.__num_decrease_order = 0
+ self.__num_increase_order = 0
+ self.__num_converged_descents = 0
+
+ # Initialize the best median.
+ best_sum_of_distances = np.inf
+ self.__best_init_sum_of_distances = np.inf
+ node_maps_from_best_median = {}
+
+ # Run block gradient descent from all initial medians.
+ self.__ged_env.set_method(self.__descent_method, self.__descent_options)
+ for median_pos in range(0, len(medians)):
+
+ # Terminate if the timer has expired and at least one SOD has been computed.
+ if timer.expired() and median_pos > 0:
+ break
+
+ # Print information about current iteration.
+ if self.__print_to_stdout == 2:
+ print('\n===========================================================')
+ print('Block gradient descent for initial median', str(median_pos + 1), 'of', str(len(medians)), '.')
+ print('-----------------------------------------------------------')
+
+ # Get reference to the median.
+ median = medians[median_pos]
+
+ # Load initial median into the environment.
+ self.__ged_env.load_nx_graph(median, gen_median_id)
+ self.__ged_env.init(self.__ged_env.get_init_type())
+
+ # Print information about current iteration.
+ if self.__print_to_stdout == 2:
+ progress = tqdm(desc='\rComputing initial node maps', total=len(graph_ids), file=sys.stdout)
+
+ # Compute node maps and sum of distances for initial median.
+ self.__sum_of_distances = 0
+ self.__node_maps_from_median.clear() # @todo
+ for graph_id in graph_ids:
+ self.__ged_env.run_method(gen_median_id, graph_id)
+ self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(gen_median_id, graph_id)
+# print(self.__node_maps_from_median[graph_id])
+ self.__sum_of_distances += self.__ged_env.get_induced_cost(gen_median_id, graph_id) # @todo: the C++ implementation for this function in GedLibBind.ipp re-call get_node_map() once more, this is not neccessary.
+# print(self.__sum_of_distances)
+ # Print information about current iteration.
+ if self.__print_to_stdout == 2:
+ progress.update(1)
+
+ self.__best_init_sum_of_distances = min(self.__best_init_sum_of_distances, self.__sum_of_distances)
+ self.__ged_env.load_nx_graph(median, set_median_id)
+# print(self.__best_init_sum_of_distances)
+
+ # Print information about current iteration.
+ if self.__print_to_stdout == 2:
+ print('\n')
+
+ # Run block gradient descent from initial median.
+ converged = False
+ itrs_without_update = 0
+ while not self.__termination_criterion_met(converged, timer, self.__itrs[median_pos], itrs_without_update):
+
+ # Print information about current iteration.
+ if self.__print_to_stdout == 2:
+ print('\n===========================================================')
+ print('Iteration', str(self.__itrs[median_pos] + 1), 'for initial median', str(median_pos + 1), 'of', str(len(medians)), '.')
+ print('-----------------------------------------------------------')
+
+ # Initialize flags that tell us what happened in the iteration.
+ median_modified = False
+ node_maps_modified = False
+ decreased_order = False
+ increased_order = False
+
+ # Update the median. # @todo!!!!!!!!!!!!!!!!!!!!!!
+ median_modified = self.__update_median(graphs, median)
+ if not median_modified or self.__itrs[median_pos] == 0:
+ decreased_order = False
+ if not decreased_order or self.__itrs[median_pos] == 0:
+ increased_order = False
+
+ # Update the number of iterations without update of the median.
+ if median_modified or decreased_order or increased_order:
+ itrs_without_update = 0
+ else:
+ itrs_without_update += 1
+
+ # Print information about current iteration.
+ if self.__print_to_stdout == 2:
+ print('Loading median to environment: ... ', end='')
+
+ # Load the median into the environment.
+ # @todo: should this function use the original node label?
+ self.__ged_env.load_nx_graph(median, gen_median_id)
+ self.__ged_env.init(self.__ged_env.get_init_type())
+
+ # Print information about current iteration.
+ if self.__print_to_stdout == 2:
+ print('done.')
+
+ # Print information about current iteration.
+ if self.__print_to_stdout == 2:
+ print('Updating induced costs: ... ', end='')
+
+ # Compute induced costs of the old node maps w.r.t. the updated median.
+ for graph_id in graph_ids:
+# print(self.__ged_env.get_induced_cost(gen_median_id, graph_id))
+ # @todo: watch out if compute_induced_cost is correct, this may influence: increase/decrease order, induced_cost() in the following code.!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ self.__ged_env.compute_induced_cost(gen_median_id, graph_id)
+# print('---------------------------------------')
+# print(self.__ged_env.get_induced_cost(gen_median_id, graph_id))
+
+ # Print information about current iteration.
+ if self.__print_to_stdout == 2:
+ print('done.')
+
+ # Update the node maps.
+ node_maps_modified = self.__update_node_maps() # @todo
+
+ # Update the order of the median if no improvement can be found with the current order.
+
+ # Update the sum of distances.
+ old_sum_of_distances = self.__sum_of_distances
+ self.__sum_of_distances = 0
+ for graph_id in self.__node_maps_from_median:
+ self.__sum_of_distances += self.__ged_env.get_induced_cost(gen_median_id, graph_id) # @todo: see above.
+
+ # Print information about current iteration.
+ if self.__print_to_stdout == 2:
+ print('Old local SOD: ', old_sum_of_distances)
+ print('New local SOD: ', self.__sum_of_distances)
+ print('Best converged SOD: ', best_sum_of_distances)
+ print('Modified median: ', median_modified)
+ print('Modified node maps: ', node_maps_modified)
+ print('Decreased order: ', decreased_order)
+ print('Increased order: ', increased_order)
+ print('===========================================================\n')
+
+ converged = not (median_modified or node_maps_modified or decreased_order or increased_order)
+
+ self.__itrs[median_pos] += 1
+
+ # Update the best median.
+			if self.__sum_of_distances < best_sum_of_distances: # compare to the best converged SOD; otherwise best_median may never be assigned.
+ best_sum_of_distances = self.__sum_of_distances
+ node_maps_from_best_median = self.__node_maps_from_median
+ best_median = median
+
+ # Update the number of converged descents.
+ if converged:
+ self.__num_converged_descents += 1
+
+ # Store the best encountered median.
+ self.__sum_of_distances = best_sum_of_distances
+ self.__node_maps_from_median = node_maps_from_best_median
+ self.__ged_env.load_nx_graph(best_median, gen_median_id)
+ self.__ged_env.init(self.__ged_env.get_init_type())
+ end_descent = time.time()
+ self.__runtime_converged = end_descent - start
+
+ # Refine the sum of distances and the node maps for the converged median.
+ self.__converged_sum_of_distances = self.__sum_of_distances
+ if self.__refine:
+ self.__improve_sum_of_distances(timer) # @todo
+
+ # Record end time, set runtime and reset the number of initial medians.
+ end = time.time()
+ self.__runtime = end - start
+ self.__num_random_inits = self.__desired_num_random_inits
+
+ # Print global information.
+ if self.__print_to_stdout != 0:
+ print('\n===========================================================')
+ print('Finished computation of generalized median graph.')
+ print('-----------------------------------------------------------')
+ print('Best SOD after initialization: ', self.__best_init_sum_of_distances)
+ print('Converged SOD: ', self.__converged_sum_of_distances)
+ if self.__refine:
+ print('Refined SOD: ', self.__sum_of_distances)
+ print('Overall runtime: ', self.__runtime)
+ print('Runtime of initialization: ', self.__runtime_initialized)
+ print('Runtime of block gradient descent: ', self.__runtime_converged - self.__runtime_initialized)
+ if self.__refine:
+ print('Runtime of refinement: ', self.__runtime - self.__runtime_converged)
+ print('Number of initial medians: ', len(medians))
+ total_itr = 0
+ num_started_descents = 0
+ for itr in self.__itrs:
+ total_itr += itr
+ if itr > 0:
+ num_started_descents += 1
+ print('Size of graph collection: ', len(graph_ids))
+ print('Number of started descents: ', num_started_descents)
+ print('Number of converged descents: ', self.__num_converged_descents)
+ print('Overall number of iterations: ', total_itr)
+ print('Overall number of times the order decreased: ', self.__num_decrease_order)
+ print('Overall number of times the order increased: ', self.__num_increase_order)
+ print('===========================================================\n')
+
+
+ def get_sum_of_distances(self, state=''):
+ """Returns the sum of distances.
+
+ Parameters
+ ----------
+ state : string
+ The state of the estimator. Can be 'initialized' or 'converged'. Default: ""
+
+ Returns
+ -------
+ float
+ The sum of distances (SOD) of the median when the estimator was in the state `state` during the last call to run(). If `state` is not given, the converged SOD (without refinement) or refined SOD (with refinement) is returned.
+ """
+ if not self.__median_available():
+ raise Exception('No median has been computed. Call run() before calling get_sum_of_distances().')
+ if state == 'initialized':
+ return self.__best_init_sum_of_distances
+ if state == 'converged':
+ return self.__converged_sum_of_distances
+ return self.__sum_of_distances
+
+
+ def __set_default_options(self):
+ self.__init_type = 'RANDOM'
+ self.__num_random_inits = 10
+ self.__desired_num_random_inits = 10
+ self.__use_real_randomness = True
+ self.__seed = 0
+ self.__refine = True
+ self.__time_limit_in_sec = 0
+ self.__epsilon = 0.0001
+ self.__max_itrs = 100
+ self.__max_itrs_without_update = 3
+ self.__num_inits_increase_order = 10
+ self.__init_type_increase_order = 'K-MEANS++'
+ self.__max_itrs_increase_order = 10
+ self.__print_to_stdout = 2
+
+
+ def __construct_initial_medians(self, graph_ids, timer, initial_medians):
+ # Print information about current iteration.
+ if self.__print_to_stdout == 2:
+ print('\n===========================================================')
+ print('Constructing initial median(s).')
+ print('-----------------------------------------------------------')
+
+ # Compute or sample the initial median(s).
+ initial_medians.clear()
+ if self.__init_type == 'MEDOID':
+ self.__compute_medoid(graph_ids, timer, initial_medians)
+ elif self.__init_type == 'MAX':
+ pass # @todo
+# compute_max_order_graph_(graph_ids, initial_medians)
+ elif self.__init_type == 'MIN':
+ pass # @todo
+# compute_min_order_graph_(graph_ids, initial_medians)
+ elif self.__init_type == 'MEAN':
+ pass # @todo
+# compute_mean_order_graph_(graph_ids, initial_medians)
+ else:
+ pass # @todo
+# sample_initial_medians_(graph_ids, initial_medians)
+
+ # Print information about current iteration.
+ if self.__print_to_stdout == 2:
+ print('===========================================================')
+
+
+ def __compute_medoid(self, graph_ids, timer, initial_medians):
+ # Use method selected for initialization phase.
+ self.__ged_env.set_method(self.__init_method, self.__init_options)
+
+ # Print information about current iteration.
+ if self.__print_to_stdout == 2:
+ progress = tqdm(desc='\rComputing medoid', total=len(graph_ids), file=sys.stdout)
+
+ # Compute the medoid.
+ medoid_id = graph_ids[0]
+ best_sum_of_distances = np.inf
+ for g_id in graph_ids:
+ if timer.expired():
+ self.__state = AlgorithmState.CALLED
+ break
+ sum_of_distances = 0
+ for h_id in graph_ids:
+ self.__ged_env.run_method(g_id, h_id)
+ sum_of_distances += self.__ged_env.get_upper_bound(g_id, h_id)
+ if sum_of_distances < best_sum_of_distances:
+ best_sum_of_distances = sum_of_distances
+ medoid_id = g_id
+
+ # Print information about current iteration.
+ if self.__print_to_stdout == 2:
+ progress.update(1)
+ initial_medians.append(self.__ged_env.get_nx_graph(medoid_id, True, True, False)) # @todo
+
+ # Print information about current iteration.
+ if self.__print_to_stdout == 2:
+ print('\n')
+
+
+ def __termination_criterion_met(self, converged, timer, itr, itrs_without_update):
+ if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False):
+ if self.__state == AlgorithmState.TERMINATED:
+ self.__state = AlgorithmState.INITIALIZED
+ return True
+ return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False)
+
+
+ def __update_median(self, graphs, median):
+ # Print information about current iteration.
+ if self.__print_to_stdout == 2:
+ print('Updating median: ', end='')
+
+ # Store copy of the old median.
+ old_median = median.copy() # @todo: this is just a shallow copy.
+
+ # Update the node labels.
+ if self.__labeled_nodes:
+ self.__update_node_labels(graphs, median)
+
+ # Update the edges and their labels.
+ self.__update_edges(graphs, median)
+
+ # Print information about current iteration.
+ if self.__print_to_stdout == 2:
+ print('done.')
+
+ return not self.__are_graphs_equal(median, old_median)
+
+
+ def __update_node_labels(self, graphs, median):
+
+ # Print information about current iteration.
+ if self.__print_to_stdout == 2:
+ print('nodes ... ', end='')
+
+ # Iterate through all nodes of the median.
+ for i in range(0, nx.number_of_nodes(median)):
+# print('i: ', i)
+ # Collect the labels of the substituted nodes.
+ node_labels = []
+ for graph_id, graph in graphs.items():
+# print('graph_id: ', graph_id)
+# print(self.__node_maps_from_median[graph_id])
+ k = self.__get_node_image_from_map(self.__node_maps_from_median[graph_id], i)
+# print('k: ', k)
+ if k != np.inf:
+ node_labels.append(graph.nodes[k])
+
+ # Compute the median label and update the median.
+ if len(node_labels) > 0:
+ median_label = self.__ged_env.get_median_node_label(node_labels)
+ if self.__ged_env.get_node_rel_cost(median.nodes[i], median_label) > self.__epsilon:
+ nx.set_node_attributes(median, {i: median_label})
+
+
+ def __update_edges(self, graphs, median):
+ # Print information about current iteration.
+ if self.__print_to_stdout == 2:
+ print('edges ... ', end='')
+
+ # Clear the adjacency lists of the median and reset number of edges to 0.
+ median_edges = list(median.edges)
+ for (head, tail) in median_edges:
+ median.remove_edge(head, tail)
+
+ # @todo: what if edge is not labeled?
+ # Iterate through all possible edges (i,j) of the median.
+ for i in range(0, nx.number_of_nodes(median)):
+ for j in range(i + 1, nx.number_of_nodes(median)):
+
+ # Collect the labels of the edges to which (i,j) is mapped by the node maps.
+ edge_labels = []
+ for graph_id, graph in graphs.items():
+ k = self.__get_node_image_from_map(self.__node_maps_from_median[graph_id], i)
+ l = self.__get_node_image_from_map(self.__node_maps_from_median[graph_id], j)
+ if k != np.inf and l != np.inf:
+ if graph.has_edge(k, l):
+ edge_labels.append(graph.edges[(k, l)])
+
+ # Compute the median edge label and the overall edge relabeling cost.
+ rel_cost = 0
+ median_label = self.__ged_env.get_edge_label(1)
+ if median.has_edge(i, j):
+ median_label = median.edges[(i, j)]
+ if self.__labeled_edges and len(edge_labels) > 0:
+					new_median_label = self.__ged_env.get_median_edge_label(edge_labels) # assumption: the GED environment exposes get_median_edge_label, mirroring get_median_node_label above.
+ if self.__ged_env.get_edge_rel_cost(median_label, new_median_label) > self.__epsilon:
+ median_label = new_median_label
+ for edge_label in edge_labels:
+ rel_cost += self.__ged_env.get_edge_rel_cost(median_label, edge_label)
+
+ # Update the median.
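+				# Keep (i, j) iff relabeling the mapped edges to the median label costs
+				# less than the alternative of inserting/deleting the edge, which is what
+				# the threshold below encodes.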
+ if rel_cost < (self.__edge_ins_cost + self.__edge_del_cost) * len(edge_labels) - self.__edge_del_cost * len(graphs):
+ median.add_edge(i, j, **median_label)
+ else:
+ if median.has_edge(i, j):
+ median.remove_edge(i, j)
+
+
+ def __update_node_maps(self):
+ # Print information about current iteration.
+ if self.__print_to_stdout == 2:
+ progress = tqdm(desc='\rUpdating node maps', total=len(self.__node_maps_from_median), file=sys.stdout)
+
+ # Update the node maps.
+ node_maps_were_modified = False
+ for graph_id in self.__node_maps_from_median:
+ self.__ged_env.run_method(self.__median_id, graph_id)
+ if self.__ged_env.get_upper_bound(self.__median_id, graph_id) < self.__ged_env.get_induced_cost(self.__median_id, graph_id) - self.__epsilon: # @todo: see above.
+ self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__median_id, graph_id) # @todo: node_map may not assigned.
+ node_maps_were_modified = True
+ # Print information about current iteration.
+ if self.__print_to_stdout == 2:
+ progress.update(1)
+
+ # Print information about current iteration.
+ if self.__print_to_stdout == 2:
+ print('\n')
+
+ # Return true if the node maps were modified.
+ return node_maps_were_modified
+
+
+ def __improve_sum_of_distances(self, timer):
+ pass
+
+
+ def __median_available(self):
+ return self.__median_id != np.inf
+
+
+ def __get_node_image_from_map(self, node_map, node):
+		"""
+		Return the ID of the image of `node` in `node_map`.
+
+		Parameters
+		----------
+		node_map : list[tuple(int, int)]
+			The node map in which the image of `node` is looked up.
+
+		node : int
+			The node whose image is returned.
+
+		Raises
+		------
+		Exception
+			If the node with ID `node` is not contained in the source nodes of the node map.
+
+		Returns
+		-------
+		int
+			ID of the image of `node`; `np.inf` if `node` is mapped to a dummy node (i.e., deleted).
+
+		Notes
+		-----
+		This function is not implemented in the `ged::MedianGraphEstimator` class of the `GEDLIB` library. Instead it is a Python implementation of the `ged::NodeMap::image` function.
+		"""
+		if node < len(node_map):
+			return node_map[node][1] if node_map[node][1] < len(node_map) else np.inf
+		else:
+			raise Exception('The node with ID ' + str(node) + ' is not contained in the source nodes of the node map.')
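+
+	# A minimal illustration of the node-map convention assumed above (not part
+	# of GEDLIB): a node map is a list of (source, image) pairs, and an image
+	# index >= len(node_map) encodes a dummy node, i.e., a deletion:
+	#
+	#   node_map = [(0, 1), (1, 0), (2, 7)]            # hypothetical 3-node map
+	#   self.__get_node_image_from_map(node_map, 0)  # -> 1
+	#   self.__get_node_image_from_map(node_map, 2)  # -> np.inf (7 >= 3: deleted)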
+
+
+ def __are_graphs_equal(self, g1, g2):
+ """
+ Check if the two graphs are equal.
+
+ Parameters
+ ----------
+ g1 : NetworkX graph object
+ Graph 1 to be compared.
+
+ g2 : NetworkX graph object
+ Graph 2 to be compared.
+
+ Returns
+ -------
+ bool
+			True if the two graphs are equal.
+
+ Notes
+ -----
+		This is not an isomorphism check. Here the two graphs are considered equal if and only if their `original_node_ids`, nodes, all node labels, edges and all edge labels are equal. This function is specifically designed for the class `MedianGraphEstimator` and should not be used elsewhere.
+ """
+		# Check original node IDs.
+		if g1.graph['original_node_ids'] != g2.graph['original_node_ids']:
+			return False
+		# Check nodes.
+		nlist1 = list(g1.nodes(data=True))
+		nlist2 = list(g2.nodes(data=True))
+		if nlist1 != nlist2:
+			return False
+		# Check edges.
+		elist1 = list(g1.edges(data=True))
+		elist2 = list(g2.edges(data=True))
+		if elist1 != elist2:
+			return False
+
+ return True
+
+
+	def compute_my_cost(g, h, node_map): # @todo: unfinished stub; currently always returns 0.
+		cost = 0.0
+		for node in g.nodes:
+			cost += 0
+		return cost
+
\ No newline at end of file
diff --git a/gklearn/preimage/median_linlin.py b/gklearn/preimage/median_linlin.py
new file mode 100644
index 0000000..6139558
--- /dev/null
+++ b/gklearn/preimage/median_linlin.py
@@ -0,0 +1,215 @@
+import sys
+import pathlib
+import numpy as np
+import networkx as nx
+
+from gedlibpy import librariesImport, gedlibpy
+sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/")
+import gklearn
+
+def replace_graph_in_env(script, graph, old_id, label='median'):
+ """
+	Replace a graph in the environment of `script`.
+
+	If `old_id` is -1, add a new graph to the environment.
+
+ """
+ if(old_id > -1):
+ script.PyClearGraph(old_id)
+ new_id = script.PyAddGraph(label)
+ for i in graph.nodes():
+		script.PyAddNode(new_id, str(i), graph.nodes[i]) # !! strings are required by gedlib
+ for e in graph.edges:
+ script.PyAddEdge(new_id, str(e[0]),str(e[1]), {})
+ script.PyInitEnv()
+ script.PySetMethod("IPFP", "")
+ script.PyInitMethod()
+
+ return new_id
+
+# Draw the current median (for Letter graphs).
+def draw_Letter_graph(graph):
+	import matplotlib.pyplot as plt
+	plt.figure()
+	pos = {}
+	for n in graph.nodes:
+		pos[n] = np.array([float(graph.nodes[n]['x']), float(graph.nodes[n]['y'])])
+ nx.draw_networkx(graph,pos)
+ plt.show()
+
+# Compute new node maps between the median and each dataset graph.
+def update_mappings(script,median_id,listID):
+ med_distances = {}
+ med_mappings = {}
+ sod = 0
+ for i in range(0,len(listID)):
+ script.PyRunMethod(median_id,listID[i])
+ med_distances[i] = script.PyGetUpperBound(median_id,listID[i])
+ med_mappings[i] = script.PyGetForwardMap(median_id,listID[i])
+ sod += med_distances[i]
+ return med_distances, med_mappings, sod
+
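+# calcul_Sij counts how many of the dataset graphs contain an edge between the
+# images of median nodes i and j under the given node maps.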
+def calcul_Sij(all_mappings, all_graphs,i,j):
+ s_ij = 0
+ for k in range(0,len(all_mappings)):
+ cur_graph = all_graphs[k]
+ cur_mapping = all_mappings[k]
+ size_graph = cur_graph.order()
+		if ((cur_mapping[i] < size_graph) and
+			(cur_mapping[j] < size_graph) and
+			cur_graph.has_edge(cur_mapping[i], cur_mapping[j])):
+ s_ij += 1
+
+ return s_ij
+
+# def update_median_nodes_L1(median,listIdSet,median_id,dataset, mappings):
+# from scipy.stats.mstats import gmean
+
+# for i in median.nodes():
+# for k in listIdSet:
+# vectors = [] #np.zeros((len(listIdSet),2))
+# if(k != median_id):
+# phi_i = mappings[k][i]
+# if(phi_i < dataset[k].order()):
+# vectors.append([float(dataset[k].node[phi_i]['x']),float(dataset[k].node[phi_i]['y'])])
+
+# new_labels = gmean(vectors)
+# median.node[i]['x'] = str(new_labels[0])
+# median.node[i]['y'] = str(new_labels[1])
+# return median
+
+def update_median_nodes(median, dataset, mappings):
+	# Update node attributes.
+ for i in median.nodes():
+ nb_sub=0
+ mean_label = {'x' : 0, 'y' : 0}
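+		# Note: the 0.75 weights below cancel with the (1/0.75) factor applied at
+		# the end, so this computes the plain arithmetic mean of the substituted
+		# labels (0.75 is the alpha weight for Letter graphs, cf. update_median_edges).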
+ for k in range(0,len(mappings)):
+ phi_i = mappings[k][i]
+ if ( phi_i < dataset[k].order() ):
+ nb_sub += 1
+				mean_label['x'] += 0.75 * float(dataset[k].nodes[phi_i]['x'])
+				mean_label['y'] += 0.75 * float(dataset[k].nodes[phi_i]['y'])
+		median.nodes[i]['x'] = str((1 / 0.75) * (mean_label['x'] / nb_sub))
+		median.nodes[i]['y'] = str((1 / 0.75) * (mean_label['y'] / nb_sub))
+ return median
+
+def update_median_edges(dataset, mappings, median, cei=0.425, cer=0.425):
+	# For Letter high, ceir = 1.7, alpha = 0.75.
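+	# Majority rule (assuming cei is the edge insertion cost and cer the edge
+	# removal cost): an edge (i, j) is kept iff it is substituted in more than
+	# size_dataset * cer / (cei + cer) graphs, i.e., iff deleting it from the
+	# graphs lacking it is cheaper than inserting it into the graphs having it.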
+ size_dataset = len(dataset)
+ ratio_cei_cer = cer/(cei + cer)
+ threshold = size_dataset*ratio_cei_cer
+ order_graph_median = median.order()
+ for i in range(0,order_graph_median):
+ for j in range(i+1,order_graph_median):
+ s_ij = calcul_Sij(mappings,dataset,i,j)
+ if(s_ij > threshold):
+ median.add_edge(i,j)
+ else:
+ if(median.has_edge(i,j)):
+ median.remove_edge(i,j)
+ return median
+
+
+
+def compute_median(script, listID, dataset, verbose=False):
+	"""Compute a median graph of a dataset according to an environment.
+
+	Parameters
+	----------
+	script
+		An initialized gedlib environment.
+	listID : list
+		A list of graph IDs in `script`; encodes the dataset.
+	dataset : list
+		The corresponding graphs in NetworkX format. We assume that graph
+		`listID[i]` corresponds to `dataset[i]`.
+
+	Returns
+	-------
+	The median as a NetworkX graph, its SOD, the list of SODs per iteration,
+	and the set median.
+	"""
+	if(verbose):
+		print(len(listID))
+	median_set_index, median_set_sod = compute_median_set(script, listID)
+	if(verbose):
+		print(median_set_index)
+		print(median_set_sod)
+ sods = []
+	# Add the median to the environment.
+ set_median = dataset[median_set_index].copy()
+ median = dataset[median_set_index].copy()
+ cur_med_id = replace_graph_in_env(script,median,-1)
+ med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID)
+ sods.append(cur_sod)
+ if(verbose):
+ print(cur_sod)
+ ite_max = 50
+ old_sod = cur_sod * 2
+ ite = 0
+ epsilon = 0.001
+
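+	# Block coordinate descent: alternately update the median's node labels and
+	# edges, then re-optimize the node maps, until the SOD stabilizes.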
+ while((ite < ite_max) and (np.abs(old_sod - cur_sod) > epsilon )):
+ median = update_median_nodes(median,dataset, med_mappings)
+ median = update_median_edges(dataset,med_mappings,median)
+
+ cur_med_id = replace_graph_in_env(script,median,cur_med_id)
+ med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID)
+
+
+ sods.append(cur_sod)
+ if(verbose):
+ print(cur_sod)
+ ite += 1
+ return median, cur_sod, sods, set_median
+
+
+
+def compute_median_set(script,listID):
+	"""Return the index into `listID` of the set median, i.e., the graph
+	minimizing the sum of distances (SOD) to all graphs in the set, and that SOD.
+	"""
+	# Compute the median set.
+ N=len(listID)
+ map_id_to_index = {}
+ map_index_to_id = {}
+ for i in range(0,len(listID)):
+ map_id_to_index[listID[i]] = i
+ map_index_to_id[i] = listID[i]
+
+ distances = np.zeros((N,N))
+ for i in listID:
+ for j in listID:
+ script.PyRunMethod(i,j)
+ distances[map_id_to_index[i],map_id_to_index[j]] = script.PyGetUpperBound(i,j)
+
+	sum_distances = np.sum(distances, 0)
+	median_set_index = np.argmin(sum_distances)
+	sod = np.min(sum_distances)
+
+ return median_set_index, sod
+
+def _convertGraph(G):
+ """Convert a graph to the proper NetworkX format that can be
+	recognized by the gedlibpy library.
+ """
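+	# Assumes molecule-style attributes: node attribute 'atom' is mapped to
+	# 'chem' and edge attribute 'bond_type' to 'valence' (e.g., MUTAG-like data).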
+ G_new = nx.Graph()
+ for nd, attrs in G.nodes(data=True):
+ G_new.add_node(str(nd), chem=attrs['atom'])
+# G_new.add_node(str(nd), x=str(attrs['attributes'][0]),
+# y=str(attrs['attributes'][1]))
+ for nd1, nd2, attrs in G.edges(data=True):
+ G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
+# G_new.add_edge(str(nd1), str(nd2))
+
+ return G_new
+
+if __name__ == "__main__":
+	# Load the dataset.
+ gedlibpy.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml')
+ gedlibpy.PySetEditCost("LETTER")
+ gedlibpy.PyInitEnv()
+ gedlibpy.PySetMethod("IPFP", "")
+ gedlibpy.PyInitMethod()
+
+ dataset,my_y = gklearn.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl")
+
+ listID = gedlibpy.PyGetAllGraphIds()
+	median, cur_sod, sods, set_median = compute_median(gedlibpy, listID, dataset, verbose=True)
+
+	print(cur_sod)
+ draw_Letter_graph(median)
diff --git a/gklearn/preimage/median_preimage_generator.py b/gklearn/preimage/median_preimage_generator.py
new file mode 100644
index 0000000..dfbaef2
--- /dev/null
+++ b/gklearn/preimage/median_preimage_generator.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Mar 26 18:27:22 2020
+
+@author: ljia
+"""
+from gklearn.preimage.preimage_generator import PreimageGenerator
+# from gklearn.utils.dataset import Dataset
+
+class MedianPreimageGenerator(PreimageGenerator):
+
+ def __init__(self, mge, dataset):
+ self.__mge = mge
+ self.__dataset = dataset
\ No newline at end of file
diff --git a/gklearn/preimage/misc.py b/gklearn/preimage/misc.py
new file mode 100644
index 0000000..18682c8
--- /dev/null
+++ b/gklearn/preimage/misc.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Mar 19 18:13:56 2020
+
+@author: ljia
+"""
+
+def options_string_to_options_map(options_string):
+ """Transforms an options string into an options map.
+
+ Parameters
+ ----------
+ options_string : string
+ Options string of the form "[--