From dcff21cda7827a46f9e5c15b71364cb240d56598 Mon Sep 17 00:00:00 2001
From: jajupmochi
Date: Fri, 27 Mar 2020 10:46:00 +0100
Subject: [PATCH] Revert "clear repo: remove preimage."

This reverts commit 5fe81a932b96b647d773939175784bce5f703413.
---
 gklearn/preimage/common_types.py                |  17 +
 gklearn/preimage/cpp2python.py                  | 134 ++++
 gklearn/preimage/find_best_k.py                 | 170 +++++
 gklearn/preimage/fitDistance.py                 | 430 +++++++++++
 gklearn/preimage/ged.py                         | 467 ++++++++++++
 gklearn/preimage/iam.py                         | 775 +++++++++++++++++++
 gklearn/preimage/knn.py                         | 114 +++
 gklearn/preimage/libs.py                        |   6 +
 gklearn/preimage/median.py                      | 218 ++++++
 gklearn/preimage/median_benoit.py               | 201 +++++
 gklearn/preimage/median_graph_estimator.py      | 826 ++++++++++++++++++++
 gklearn/preimage/median_linlin.py               | 215 ++++++
 gklearn/preimage/median_preimage_generator.py   |  15 +
 gklearn/preimage/misc.py                        | 108 +++
 gklearn/preimage/pathfrequency.py               | 201 +++++
 gklearn/preimage/preimage_generator.py          |  12 +
 gklearn/preimage/preimage_iam.py                | 705 +++++++++++++++++
 gklearn/preimage/preimage_random.py             | 309 ++++++++
 gklearn/preimage/python_code.py                 | 122 +++
 gklearn/preimage/test.py                        |  83 ++
 gklearn/preimage/test_fitDistance.py            | 648 ++++++++++++++++
 gklearn/preimage/test_ged.py                    | 520 +++++++++++++
 gklearn/preimage/test_iam.py                    | 964 ++++++++++++++++++++++++
 gklearn/preimage/test_k_closest_graphs.py       | 462 ++++++++++++
 gklearn/preimage/test_median_graph_estimator.py |  91 +++
 gklearn/preimage/test_others.py                 | 686 +++++++++++++++++
 gklearn/preimage/test_preimage_iam.py           | 620 +++++++++++++++
 gklearn/preimage/test_preimage_mix.py           | 539 +++++++++++++
 gklearn/preimage/test_preimage_random.py        | 398 ++++++++++
 gklearn/preimage/timer.py                       |  40 +
 gklearn/preimage/utils.py                       | 151 ++++
 gklearn/preimage/visualization.py               | 585 ++++++++++++++
 gklearn/preimage/xp_fit_method.py               | 935 +++++++++++++++++++++++
 gklearn/preimage/xp_letter_h.py                 | 476 ++++++++++++
 gklearn/preimage/xp_monoterpenoides.py          | 249 ++++++
 35 files changed, 12492 insertions(+)
 create mode 100644 gklearn/preimage/common_types.py
 create mode 100644 gklearn/preimage/cpp2python.py
 create mode 100644 gklearn/preimage/find_best_k.py
 create mode 100644 gklearn/preimage/fitDistance.py
 create mode 100644 gklearn/preimage/ged.py
 create mode 100644 gklearn/preimage/iam.py
 create mode 100644 gklearn/preimage/knn.py
 create mode 100644 gklearn/preimage/libs.py
 create mode 100644 gklearn/preimage/median.py
 create mode 100644 gklearn/preimage/median_benoit.py
 create mode 100644 gklearn/preimage/median_graph_estimator.py
 create mode 100644 gklearn/preimage/median_linlin.py
 create mode 100644 gklearn/preimage/median_preimage_generator.py
 create mode 100644 gklearn/preimage/misc.py
 create mode 100644 gklearn/preimage/pathfrequency.py
 create mode 100644 gklearn/preimage/preimage_generator.py
 create mode 100644 gklearn/preimage/preimage_iam.py
 create mode 100644 gklearn/preimage/preimage_random.py
 create mode 100644 gklearn/preimage/python_code.py
 create mode 100644 gklearn/preimage/test.py
 create mode 100644 gklearn/preimage/test_fitDistance.py
 create mode 100644 gklearn/preimage/test_ged.py
 create mode 100644 gklearn/preimage/test_iam.py
 create mode 100644 gklearn/preimage/test_k_closest_graphs.py
 create mode 100644 gklearn/preimage/test_median_graph_estimator.py
 create mode 100644 gklearn/preimage/test_others.py
 create mode 100644 gklearn/preimage/test_preimage_iam.py
 create mode 100644 gklearn/preimage/test_preimage_mix.py
 create mode 100644 gklearn/preimage/test_preimage_random.py
 create mode 100644
gklearn/preimage/timer.py create mode 100644 gklearn/preimage/utils.py create mode 100644 gklearn/preimage/visualization.py create mode 100644 gklearn/preimage/xp_fit_method.py create mode 100644 gklearn/preimage/xp_letter_h.py create mode 100644 gklearn/preimage/xp_monoterpenoides.py diff --git a/gklearn/preimage/common_types.py b/gklearn/preimage/common_types.py new file mode 100644 index 0000000..2face25 --- /dev/null +++ b/gklearn/preimage/common_types.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Mar 19 18:17:38 2020 + +@author: ljia +""" + +from enum import Enum, auto + +class AlgorithmState(Enum): + """can be used to specify the state of an algorithm. + """ + CALLED = auto # The algorithm has been called. + INITIALIZED = auto # The algorithm has been initialized. + CONVERGED = auto # The algorithm has converged. + TERMINATED = auto # The algorithm has terminated. \ No newline at end of file diff --git a/gklearn/preimage/cpp2python.py b/gklearn/preimage/cpp2python.py new file mode 100644 index 0000000..9d63026 --- /dev/null +++ b/gklearn/preimage/cpp2python.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Mar 20 11:09:04 2020 + +@author: ljia +""" +import re + +def convert_function(cpp_code): +# f_cpp = open('cpp_code.cpp', 'r') +# # f_cpp = open('cpp_ext/src/median_graph_estimator.ipp', 'r') +# cpp_code = f_cpp.read() + python_code = cpp_code.replace('else if (', 'elif ') + python_code = python_code.replace('if (', 'if ') + python_code = python_code.replace('else {', 'else:') + python_code = python_code.replace(') {', ':') + python_code = python_code.replace(';\n', '\n') + python_code = re.sub('\n(.*)}\n', '\n\n', python_code) + # python_code = python_code.replace('}\n', '') + python_code = python_code.replace('throw', 'raise') + python_code = python_code.replace('error', 'Exception') + python_code = python_code.replace('"', '\'') + python_code = python_code.replace('\\\'', '"') + python_code = python_code.replace('try {', 'try:') + python_code = python_code.replace('true', 'True') + python_code = python_code.replace('false', 'False') + python_code = python_code.replace('catch (...', 'except') + # python_code = re.sub('std::string\(\'(.*)\'\)', '$1', python_code) + + return python_code + + + +# # python_code = python_code.replace('}\n', '') + + + + +# python_code = python_code.replace('option.first', 'opt_name') +# python_code = python_code.replace('option.second', 'opt_val') +# python_code = python_code.replace('ged::Error', 'Exception') +# python_code = python_code.replace('std::string(\'Invalid argument "\')', '\'Invalid argument "\'') + + +# f_cpp.close() +# f_python = open('python_code.py', 'w') +# f_python.write(python_code) +# f_python.close() + + +def convert_function_comment(cpp_fun_cmt, param_types): + cpp_fun_cmt = cpp_fun_cmt.replace('\t', '') + cpp_fun_cmt = cpp_fun_cmt.replace('\n * ', ' ') + # split the input comment according to key words. + param_split = None + note = None + cmt_split = cpp_fun_cmt.split('@brief')[1] + brief = cmt_split + if '@param' in cmt_split: + cmt_split = cmt_split.split('@param') + brief = cmt_split[0] + param_split = cmt_split[1:] + if '@note' in cmt_split[-1]: + note_split = cmt_split[-1].split('@note') + if param_split is not None: + param_split.pop() + param_split.append(note_split[0]) + else: + brief = note_split[0] + note = note_split[1] + + # get parameters. 
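# A minimal, self-contained sketch (not from the patch) of the replace-chain
# idea behind convert_function() above: chained textual substitutions rather
# than a real parser, so the output is only approximately valid Python.
import re
snippet = 'if (a == b) {\n    return true;\n}\n'
out = snippet.replace('if (', 'if ').replace(') {', ':')
out = out.replace('true', 'True').replace(';\n', '\n')
out = re.sub('\n(.*)}\n', '\n\n', out)   # drops the dangling closing brace line
print(out)   # prints "if a == b:" and "    return True"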
+ if param_split is not None: + for idx, param in enumerate(param_split): + _, param_name, param_desc = param.split(' ', 2) + param_name = function_comment_strip(param_name, ' *\n\t/') + param_desc = function_comment_strip(param_desc, ' *\n\t/') + param_split[idx] = (param_name, param_desc) + + # strip comments. + brief = function_comment_strip(brief, ' *\n\t/') + if note is not None: + note = function_comment_strip(note, ' *\n\t/') + + # construct the Python function comment. + python_fun_cmt = '"""' + python_fun_cmt += brief + '\n' + if param_split is not None and len(param_split) > 0: + python_fun_cmt += '\nParameters\n----------' + for idx, param in enumerate(param_split): + python_fun_cmt += '\n' + param[0] + ' : ' + param_types[idx] + python_fun_cmt += '\n\t' + param[1] + '\n' + if note is not None: + python_fun_cmt += '\nNote\n----\n' + note + '\n' + python_fun_cmt += '"""' + + return python_fun_cmt + + +def function_comment_strip(comment, bad_chars): + head_removed, tail_removed = False, False + while not head_removed or not tail_removed: + if comment[0] in bad_chars: + comment = comment[1:] + head_removed = False + else: + head_removed = True + if comment[-1] in bad_chars: + comment = comment[:-1] + tail_removed = False + else: + tail_removed = True + + return comment + + +if __name__ == '__main__': +# python_code = convert_function(""" +# if (print_to_stdout_ == 2) { +# std::cout << "\n===========================================================\n"; +# std::cout << "Block gradient descent for initial median " << median_pos + 1 << " of " << medians.size() << ".\n"; +# std::cout << "-----------------------------------------------------------\n"; +# } +# """) + + + python_fun_cmt = convert_function_comment(""" + /*! + * @brief Returns the sum of distances. + * @param[in] state The state of the estimator. + * @return The sum of distances of the median when the estimator was in the state @p state during the last call to run(). + */ + """, ['string', 'string']) \ No newline at end of file diff --git a/gklearn/preimage/find_best_k.py b/gklearn/preimage/find_best_k.py new file mode 100644 index 0000000..df38d32 --- /dev/null +++ b/gklearn/preimage/find_best_k.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Jan 9 11:54:32 2020 + +@author: ljia +""" +import numpy as np +import random +import csv + +from gklearn.utils.graphfiles import loadDataset +from gklearn.preimage.test_k_closest_graphs import median_on_k_closest_graphs + +def find_best_k(): + ds = {'name': 'monoterpenoides', + 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb + Gn, y_all = loadDataset(ds['dataset']) +# Gn = Gn[0:50] + gkernel = 'treeletkernel' + node_label = 'atom' + edge_label = 'bond_type' + ds_name = 'mono' + dir_output = 'results/test_find_best_k/' + + repeats = 50 + k_list = range(2, 11) + fit_method = 'k-graphs' + # fitted on the whole dataset - treelet - mono + edit_costs = [0.1268873773592978, 0.004084633224249829, 0.0897581955378986, 0.15328856114451297, 0.3109956881625734, 0.0] + + # create result files. + fn_output_detail = 'results_detail.' 
+ fit_method + '.csv' + f_detail = open(dir_output + fn_output_detail, 'a') + csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k', + 'repeat', 'median set', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', + 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', + 'dis_k gi -> GM']) + f_detail.close() + fn_output_summary = 'results_summary.csv' + f_summary = open(dir_output + fn_output_summary, 'a') + csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k', + 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', + 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', + 'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM', + '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM', + 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM', + 'repeats better dis_k gi -> GM']) + f_summary.close() + + random.seed(1) + rdn_seed_list = random.sample(range(0, repeats * 100), repeats) + + for k in k_list: + print('\n--------- k =', k, '----------') + + sod_sm_list = [] + sod_gm_list = [] + dis_k_sm_list = [] + dis_k_gm_list = [] + dis_k_gi_min_list = [] + nb_sod_sm2gm = [0, 0, 0] + nb_dis_k_sm2gm = [0, 0, 0] + nb_dis_k_gi2sm = [0, 0, 0] + nb_dis_k_gi2gm = [0, 0, 0] + repeats_better_sod_sm2gm = [] + repeats_better_dis_k_sm2gm = [] + repeats_better_dis_k_gi2sm = [] + repeats_better_dis_k_gi2gm = [] + + + for repeat in range(repeats): + print('\nrepeat =', repeat) + random.seed(rdn_seed_list[repeat]) + median_set_idx = random.sample(range(0, len(Gn)), k) + print('median set: ', median_set_idx) + + sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \ + = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, + fit_method='k-graphs', + edit_costs=edit_costs, + group_min=median_set_idx, + parallel=False) + + # write result detail. + sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm)) + dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm)) + dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min)) + dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min)) + f_detail = open(dir_output + fn_output_detail, 'a') + csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k, repeat, + median_set_idx, sod_sm, sod_gm, dis_k_sm, dis_k_gm, + dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm, + dis_k_gi2gm]) + f_detail.close() + + # compute result summary. + sod_sm_list.append(sod_sm) + sod_gm_list.append(sod_gm) + dis_k_sm_list.append(dis_k_sm) + dis_k_gm_list.append(dis_k_gm) + dis_k_gi_min_list.append(dis_k_gi_min) + # # SOD SM -> GM + if sod_sm > sod_gm: + nb_sod_sm2gm[0] += 1 + repeats_better_sod_sm2gm.append(repeat) + elif sod_sm == sod_gm: + nb_sod_sm2gm[1] += 1 + elif sod_sm < sod_gm: + nb_sod_sm2gm[2] += 1 + # # dis_k SM -> GM + if dis_k_sm > dis_k_gm: + nb_dis_k_sm2gm[0] += 1 + repeats_better_dis_k_sm2gm.append(repeat) + elif dis_k_sm == dis_k_gm: + nb_dis_k_sm2gm[1] += 1 + elif dis_k_sm < dis_k_gm: + nb_dis_k_sm2gm[2] += 1 + # # dis_k gi -> SM + if dis_k_gi_min > dis_k_sm: + nb_dis_k_gi2sm[0] += 1 + repeats_better_dis_k_gi2sm.append(repeat) + elif dis_k_gi_min == dis_k_sm: + nb_dis_k_gi2sm[1] += 1 + elif dis_k_gi_min < dis_k_sm: + nb_dis_k_gi2sm[2] += 1 + # # dis_k gi -> GM + if dis_k_gi_min > dis_k_gm: + nb_dis_k_gi2gm[0] += 1 + repeats_better_dis_k_gi2gm.append(repeat) + elif dis_k_gi_min == dis_k_gm: + nb_dis_k_gi2gm[1] += 1 + elif dis_k_gi_min < dis_k_gm: + nb_dis_k_gi2gm[2] += 1 + + # write result summary. 
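# How the sign-based verdicts in these result rows are read (illustrative
# sketch): each comparison column comes from getRelations(np.sign(new - old)),
# defined at the bottom of this file, so a smaller new value reads as 'better'.
import numpy as np
verdict = {-1: 'better', 0: 'same', 1: 'worse'}
sod_sm, sod_gm = 3.2, 2.8                          # made-up SOD values
print(verdict[int(np.sign(sod_gm - sod_sm))])      # -> 'better'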
+ sod_sm_mean = np.mean(sod_sm_list) + sod_gm_mean = np.mean(sod_gm_list) + dis_k_sm_mean = np.mean(dis_k_sm_list) + dis_k_gm_mean = np.mean(dis_k_gm_list) + dis_k_gi_min_mean = np.mean(dis_k_gi_min_list) + sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean)) + dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean)) + dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean)) + dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean)) + f_summary = open(dir_output + fn_output_summary, 'a') + csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, + sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean, + dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean, + dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm, + nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm, + repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm, + repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm]) + f_summary.close() + + print('\ncomplete.') + return + + +def getRelations(sign): + if sign == -1: + return 'better' + elif sign == 0: + return 'same' + elif sign == 1: + return 'worse' + + +if __name__ == '__main__': + find_best_k() \ No newline at end of file diff --git a/gklearn/preimage/fitDistance.py b/gklearn/preimage/fitDistance.py new file mode 100644 index 0000000..234f7fc --- /dev/null +++ b/gklearn/preimage/fitDistance.py @@ -0,0 +1,430 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Oct 16 14:20:06 2019 + +@author: ljia +""" +import numpy as np +from tqdm import tqdm +from itertools import combinations_with_replacement, combinations +import multiprocessing +from multiprocessing import Pool +from functools import partial +import time +import random +import sys + +from scipy import optimize +from scipy.optimize import minimize +import cvxpy as cp + +from gklearn.preimage.ged import GED, get_nb_edit_operations, get_nb_edit_operations_letter, get_nb_edit_operations_nonsymbolic +from gklearn.preimage.utils import kernel_distance_matrix + +def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, + params_ged={'lib': 'gedlibpy', 'cost': 'CONSTANT', + 'method': 'IPFP', 'stabilizer': None}, + init_costs=[3, 3, 1, 3, 3, 1], + dataset='monoterpenoides', Kmatrix=None, + parallel=True): +# dataset = dataset.lower() + + # c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them. +# random.seed(1) +# cost_rdm = random.sample(range(1, 10), 6) +# init_costs = cost_rdm + [0] +# init_costs = cost_rdm +# init_costs = [3, 3, 1, 3, 3, 1] +# init_costs = [i * 0.01 for i in cost_rdm] + [0] +# init_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0] +# init_costs = [0, 0, 0.9544, 0.026, 0.0196, 0] +# init_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0] +# idx_cost_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0] + + # compute distances in feature space. + dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, + Kmatrix=Kmatrix, gkernel=gkernel) + dis_k_vec = [] + for i in range(len(dis_k_mat)): +# for j in range(i, len(dis_k_mat)): + for j in range(i + 1, len(dis_k_mat)): + dis_k_vec.append(dis_k_mat[i, j]) + dis_k_vec = np.array(dis_k_vec) + + # init ged. 
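# The double loop above keeps only the strict upper triangle of the symmetric
# kernel-distance matrix; the same condensed vector with plain numpy
# (illustrative sketch):
import numpy as np
D = np.array([[0., 1., 2.],
              [1., 0., 3.],
              [2., 3., 0.]])
dis_k_vec_demo = D[np.triu_indices_from(D, k=1)]   # array([1., 2., 3.]): pairs (0,1), (0,2), (1,2)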
+ print('\ninitial:') + time0 = time.time() + params_ged['dataset'] = dataset + params_ged['edit_cost_constant'] = init_costs + ged_vec_init, ged_mat, n_edit_operations = compute_geds(Gn, params_ged, + parallel=parallel) + residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))] + time_list = [time.time() - time0] + edit_cost_list = [init_costs] + nb_cost_mat = np.array(n_edit_operations) + nb_cost_mat_list = [nb_cost_mat] + print('edit_costs:', init_costs) + print('residual_list:', residual_list) + + for itr in range(itr_max): + print('\niteration', itr) + time0 = time.time() + # "fit" geds to distances in feature space by tuning edit costs using the + # Least Squares Method. + np.savez('results/xp_fit_method/fit_data_debug' + str(itr) + '.gm', + nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec, + n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init, + ged_mat=ged_mat) + edit_costs_new, residual = update_costs(nb_cost_mat, dis_k_vec, + dataset=dataset, cost=params_ged['cost']) + for i in range(len(edit_costs_new)): + if -1e-9 <= edit_costs_new[i] <= 1e-9: + edit_costs_new[i] = 0 + if edit_costs_new[i] < 0: + raise ValueError('The edit cost is negative.') +# for i in range(len(edit_costs_new)): +# if edit_costs_new[i] < 0: +# edit_costs_new[i] = 0 + + # compute new GEDs and numbers of edit operations. + params_ged['edit_cost_constant'] = edit_costs_new # np.array([edit_costs_new[0], edit_costs_new[1], 0.75]) + ged_vec, ged_mat, n_edit_operations = compute_geds(Gn, params_ged, + parallel=parallel) + residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec)))) + time_list.append(time.time() - time0) + edit_cost_list.append(edit_costs_new) + nb_cost_mat = np.array(n_edit_operations) + nb_cost_mat_list.append(nb_cost_mat) + print('edit_costs:', edit_costs_new) + print('residual_list:', residual_list) + + return edit_costs_new, residual_list, edit_cost_list, dis_k_mat, ged_mat, \ + time_list, nb_cost_mat_list + + +def compute_geds(Gn, params_ged, parallel=False): + edit_cost_name = params_ged['cost'] + if edit_cost_name == 'LETTER' or edit_cost_name == 'LETTER2': + get_nb_eo = get_nb_edit_operations_letter + elif edit_cost_name == 'NON_SYMBOLIC': + get_nb_eo = get_nb_edit_operations_nonsymbolic + else: + get_nb_eo = get_nb_edit_operations + ged_mat = np.zeros((len(Gn), len(Gn))) + if parallel: +# print('parallel') +# len_itr = int(len(Gn) * (len(Gn) + 1) / 2) + len_itr = int(len(Gn) * (len(Gn) - 1) / 2) + ged_vec = [0 for i in range(len_itr)] + n_edit_operations = [0 for i in range(len_itr)] +# itr = combinations_with_replacement(range(0, len(Gn)), 2) + itr = combinations(range(0, len(Gn)), 2) + n_jobs = multiprocessing.cpu_count() + if len_itr < 100 * n_jobs: + chunksize = int(len_itr / n_jobs) + 1 + else: + chunksize = 100 + def init_worker(gn_toshare): + global G_gn + G_gn = gn_toshare + do_partial = partial(_wrapper_compute_ged_parallel, params_ged, get_nb_eo) + pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,)) + iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize), + desc='computing GEDs', file=sys.stdout) +# iterator = pool.imap_unordered(do_partial, itr, chunksize) + for i, j, dis, n_eo_tmp in iterator: + idx_itr = int(len(Gn) * i + j - (i + 1) * (i + 2) / 2) + ged_vec[idx_itr] = dis + ged_mat[i][j] = dis + ged_mat[j][i] = dis + n_edit_operations[idx_itr] = n_eo_tmp +# print('\n-------------------------------------------') +# print(i, j, idx_itr, dis) + pool.close() + pool.join() + + else: + ged_vec = [] + 
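# Quick check (sketch) of the condensed-index formula used in the parallel
# branch above: pair (i, j) with i < j lands at n*i + j - (i+1)*(i+2)/2, which
# enumerates exactly the n*(n-1)/2 strict upper-triangle pairs in order.
n = 5
flat = 0
for i in range(n):
    for j in range(i + 1, n):
        assert int(n * i + j - (i + 1) * (i + 2) / 2) == flat
        flat += 1
assert flat == n * (n - 1) // 2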
n_edit_operations = [] + for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout): +# for i in range(len(Gn)): + for j in range(i + 1, len(Gn)): + dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], **params_ged) + ged_vec.append(dis) + ged_mat[i][j] = dis + ged_mat[j][i] = dis + n_eo_tmp = get_nb_eo(Gn[i], Gn[j], pi_forward, pi_backward) + n_edit_operations.append(n_eo_tmp) + + return ged_vec, ged_mat, n_edit_operations + + +def _wrapper_compute_ged_parallel(params_ged, get_nb_eo, itr): + i = itr[0] + j = itr[1] + dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], params_ged, get_nb_eo) + return i, j, dis, n_eo_tmp + + +def _compute_ged_parallel(g1, g2, params_ged, get_nb_eo): + dis, pi_forward, pi_backward = GED(g1, g2, **params_ged) + n_eo_tmp = get_nb_eo(g1, g2, pi_forward, pi_backward) # [0,0,0,0,0,0] + return dis, n_eo_tmp + + +def update_costs(nb_cost_mat, dis_k_vec, dataset='monoterpenoides', + cost='CONSTANT', rw_constraints='inequality'): +# if dataset == 'Letter-high': + if cost == 'LETTER': + pass +# # method 1: set alpha automatically, just tune c_vir and c_eir by +# # LMS using cvxpy. +# alpha = 0.5 +# coeff = 100 # np.max(alpha * nb_cost_mat[:,4] / dis_k_vec) +## if np.count_nonzero(nb_cost_mat[:,4]) == 0: +## alpha = 0.75 +## else: +## alpha = np.min([dis_k_vec / c_vs for c_vs in nb_cost_mat[:,4] if c_vs != 0]) +## alpha = alpha * 0.99 +# param_vir = alpha * (nb_cost_mat[:,0] + nb_cost_mat[:,1]) +# param_eir = (1 - alpha) * (nb_cost_mat[:,4] + nb_cost_mat[:,5]) +# nb_cost_mat_new = np.column_stack((param_vir, param_eir)) +# dis_new = coeff * dis_k_vec - alpha * nb_cost_mat[:,3] +# +# x = cp.Variable(nb_cost_mat_new.shape[1]) +# cost = cp.sum_squares(nb_cost_mat_new * x - dis_new) +# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] +# prob = cp.Problem(cp.Minimize(cost), constraints) +# prob.solve() +# edit_costs_new = x.value +# edit_costs_new = np.array([edit_costs_new[0], edit_costs_new[1], alpha]) +# residual = np.sqrt(prob.value) + +# # method 2: tune c_vir, c_eir and alpha by nonlinear programming by +# # scipy.optimize.minimize. +# w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1] +# w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5] +# w2 = nb_cost_mat[:,3] +# w3 = dis_k_vec +# func_min = lambda x: np.sum((w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \ +# + w2 * x[2] - w3 * x[3]) ** 2) +# bounds = ((0, None), (0., None), (0.5, 0.5), (0, None)) +# res = minimize(func_min, [0.9, 1.7, 0.75, 10], bounds=bounds) +# edit_costs_new = res.x[0:3] +# residual = res.fun + + # method 3: tune c_vir, c_eir and alpha by nonlinear programming using cvxpy. + + +# # method 4: tune c_vir, c_eir and alpha by QP function +# # scipy.optimize.least_squares. An initial guess is required. +# w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1] +# w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5] +# w2 = nb_cost_mat[:,3] +# w3 = dis_k_vec +# func = lambda x: (w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \ +# + w2 * x[2] - w3 * x[3]) ** 2 +# res = optimize.root(func, [0.9, 1.7, 0.75, 100]) +# edit_costs_new = res.x +# residual = None + elif cost == 'LETTER2': +# # 1. if c_vi != c_vr, c_ei != c_er. +# nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] +# x = cp.Variable(nb_cost_mat_new.shape[1]) +# cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) +## # 1.1 no constraints. +## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] +# # 1.2 c_vs <= c_vi + c_vr. +# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], +# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] +## # 2. 
if c_vi == c_vr, c_ei == c_er. +## nb_cost_mat_new = nb_cost_mat[:,[0,3,4]] +## nb_cost_mat_new[:,0] += nb_cost_mat[:,1] +## nb_cost_mat_new[:,2] += nb_cost_mat[:,5] +## x = cp.Variable(nb_cost_mat_new.shape[1]) +## cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) +## # 2.1 no constraints. +## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] +### # 2.2 c_vs <= c_vi + c_vr. +### constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], +### np.array([2.0, -1.0, 0.0]).T@x >= 0.0] +# +# prob = cp.Problem(cp.Minimize(cost_fun), constraints) +# prob.solve() +# edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]] +# edit_costs_new = np.array(edit_costs_new) +# residual = np.sqrt(prob.value) + if rw_constraints == 'inequality': + # c_vs <= c_vi + c_vr. + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) + constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])], + np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + try: + prob.solve(verbose=True) + except MemoryError as error0: + print('\nUsing solver "OSQP" caused a memory error.') + print('the original error message is\n', error0) + print('solver status: ', prob.status) + print('trying solver "CVXOPT" instead...\n') + try: + prob.solve(solver=cp.CVXOPT, verbose=True) + except Exception as error1: + print('\nAn error occured when using solver "CVXOPT".') + print('the original error message is\n', error1) + print('solver status: ', prob.status) + print('trying solver "MOSEK" instead. Notice this solver is commercial and a lisence is required.\n') + prob.solve(solver=cp.MOSEK, verbose=True) + else: + print('solver status: ', prob.status) + else: + print('solver status: ', prob.status) + print() + edit_costs_new = x.value + residual = np.sqrt(prob.value) + elif rw_constraints == '2constraints': + # c_vs <= c_vi + c_vr and c_vi == c_vr, c_ei == c_er. + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], + np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0, + np.array([1.0, -1.0, 0.0, 0.0, 0.0]).T@x == 0.0, + np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + prob.solve() + edit_costs_new = x.value + residual = np.sqrt(prob.value) + elif rw_constraints == 'no-constraint': + # no constraint. + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + prob.solve() + edit_costs_new = x.value + residual = np.sqrt(prob.value) +# elif method == 'inequality_modified': +# # c_vs <= c_vi + c_vr. +# nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] +# x = cp.Variable(nb_cost_mat_new.shape[1]) +# cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) +# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], +# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] +# prob = cp.Problem(cp.Minimize(cost_fun), constraints) +# prob.solve() +# # use same costs for insertion and removal rather than the fitted costs. 
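# The active fitting step above, reduced to a self-contained sketch with
# made-up operation counts: a least-squares fit of the five LETTER2 costs
# (c_vi, c_vr, c_vs, c_ei, c_er) under nonnegativity and c_vs <= c_vi + c_vr.
# Note the `@` operator; `nb_cost_mat_new * x` is the older cvxpy spelling.
import numpy as np
import cvxpy as cp
N = np.array([[2., 1., 1., 0., 1.],
              [1., 0., 2., 1., 0.],
              [0., 2., 1., 1., 1.]])     # rows: graph pairs; columns: op counts
d = np.array([4.0, 3.5, 5.0])            # target kernel distances
x = cp.Variable(N.shape[1])
prob = cp.Problem(cp.Minimize(cp.sum_squares(N @ x - d)),
                  [x >= 0.01, np.array([1., 1., -1., 0., 0.]) @ x >= 0.0])
prob.solve()
edit_costs_demo, residual_demo = x.value, np.sqrt(prob.value)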
+# edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]] +# edit_costs_new = np.array(edit_costs_new) +# residual = np.sqrt(prob.value) + elif cost == 'NON_SYMBOLIC': + is_n_attr = np.count_nonzero(nb_cost_mat[:,2]) + is_e_attr = np.count_nonzero(nb_cost_mat[:,5]) + + if dataset == 'SYNTHETICnew': +# nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]] + nb_cost_mat_new = nb_cost_mat[:,[2,3,4]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) +# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], +# np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0] +# constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])]] + constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])], + np.array([0.0, 1.0, -1.0]).T@x == 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + prob.solve() +# print(x.value) + edit_costs_new = np.concatenate((np.array([0.0, 0.0]), x.value, + np.array([0.0]))) + residual = np.sqrt(prob.value) + + elif rw_constraints == 'inequality': + # c_vs <= c_vi + c_vr. + if is_n_attr and is_e_attr: + nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], + np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, + np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + prob.solve() + edit_costs_new = x.value + residual = np.sqrt(prob.value) + elif is_n_attr and not is_e_attr: + nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) + constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])], + np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + prob.solve() + print(x.value) + edit_costs_new = np.concatenate((x.value, np.array([0.0]))) + residual = np.sqrt(prob.value) + elif not is_n_attr and is_e_attr: + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], + np.array([0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + prob.solve() + edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:])) + residual = np.sqrt(prob.value) + else: + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + prob.solve() + edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), + x.value[2:], np.array([0.0]))) + residual = np.sqrt(prob.value) + else: +# # method 1: simple least square method. +# edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec, +# rcond=None) + +# # method 2: least square method with x_i >= 0. +# edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec) + + # method 3: solve as a quadratic program with constraints. 
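# Zero-padding sketch: when a dataset has no node (or edge) attributes, the
# corresponding substitution column is dropped before fitting and a zero cost
# is re-inserted afterwards, the same np.concatenate pattern used in the
# branches above (values here are illustrative only).
import numpy as np
fitted = np.array([0.8, 0.7, 0.3, 0.2])   # made-up c_vi, c_vr, c_ei, c_er
full_costs = np.concatenate((fitted[0:2], np.array([0.0]), fitted[2:], np.array([0.0])))
# -> [c_vi, c_vr, c_vs = 0, c_ei, c_er, c_es = 0]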
+# P = np.dot(nb_cost_mat.T, nb_cost_mat) +# q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat) +# G = -1 * np.identity(nb_cost_mat.shape[1]) +# h = np.array([0 for i in range(nb_cost_mat.shape[1])]) +# A = np.array([1 for i in range(nb_cost_mat.shape[1])]) +# b = 1 +# x = cp.Variable(nb_cost_mat.shape[1]) +# prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q_T@x), +# [G@x <= h]) +# prob.solve() +# edit_costs_new = x.value +# residual = prob.value - np.dot(dis_k_vec.T, dis_k_vec) + +# G = -1 * np.identity(nb_cost_mat.shape[1]) +# h = np.array([0 for i in range(nb_cost_mat.shape[1])]) + x = cp.Variable(nb_cost_mat.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat * x - dis_k_vec) + constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])], + # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] + np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, + np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + prob.solve() + edit_costs_new = x.value + residual = np.sqrt(prob.value) + + # method 4: + + return edit_costs_new, residual + + +if __name__ == '__main__': + print('check test_fitDistance.py') \ No newline at end of file diff --git a/gklearn/preimage/ged.py b/gklearn/preimage/ged.py new file mode 100644 index 0000000..a66baaf --- /dev/null +++ b/gklearn/preimage/ged.py @@ -0,0 +1,467 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Oct 17 18:44:59 2019 + +@author: ljia +""" +import numpy as np +import networkx as nx +from tqdm import tqdm +import sys +import multiprocessing +from multiprocessing import Pool +from functools import partial + +#from gedlibpy_linlin import librariesImport, gedlibpy +from gklearn.gedlib import librariesImport, gedlibpy + +def GED(g1, g2, dataset='monoterpenoides', lib='gedlibpy', cost='CHEM_1', method='IPFP', + edit_cost_constant=[], algo_options='', stabilizer='min', repeat=50): + """ + Compute GED for 2 graphs. + """ + +# dataset = dataset.lower() + + if lib == 'gedlibpy': + gedlibpy.restart_env() + gedlibpy.add_nx_graph(convertGraph(g1, cost), "") + gedlibpy.add_nx_graph(convertGraph(g2, cost), "") + + listID = gedlibpy.get_all_graph_ids() + gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant) + gedlibpy.init() + gedlibpy.set_method(method, algo_options) + gedlibpy.init_method() + + g = listID[0] + h = listID[1] + if stabilizer is None: + gedlibpy.run_method(g, h) + pi_forward = gedlibpy.get_forward_map(g, h) + pi_backward = gedlibpy.get_backward_map(g, h) + upper = gedlibpy.get_upper_bound(g, h) + lower = gedlibpy.get_lower_bound(g, h) + elif stabilizer == 'mean': + # @todo: to be finished... 
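# The gedlibpy call sequence used throughout GED(), condensed into one helper
# (a sketch assuming the same gedlibpy API as the code above; the stabilizer
# branches only differ in how often this is run and which bound is kept):
def ged_upper_bound_once(g1, g2, cost='CHEM_1', method='IPFP', edit_cost_constant=[]):
    gedlibpy.restart_env()
    gedlibpy.add_nx_graph(convertGraph(g1, cost), "")
    gedlibpy.add_nx_graph(convertGraph(g2, cost), "")
    g, h = gedlibpy.get_all_graph_ids()
    gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant)
    gedlibpy.init()
    gedlibpy.set_method(method, '')
    gedlibpy.init_method()
    gedlibpy.run_method(g, h)
    return gedlibpy.get_upper_bound(g, h)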
+ upper_list = [np.inf] * repeat + for itr in range(repeat): + gedlibpy.run_method(g, h) + upper_list[itr] = gedlibpy.get_upper_bound(g, h) + pi_forward = gedlibpy.get_forward_map(g, h) + pi_backward = gedlibpy.get_backward_map(g, h) + lower = gedlibpy.get_lower_bound(g, h) + upper = np.mean(upper_list) + elif stabilizer == 'median': + if repeat % 2 == 0: + repeat += 1 + upper_list = [np.inf] * repeat + pi_forward_list = [0] * repeat + pi_backward_list = [0] * repeat + for itr in range(repeat): + gedlibpy.run_method(g, h) + upper_list[itr] = gedlibpy.get_upper_bound(g, h) + pi_forward_list[itr] = gedlibpy.get_forward_map(g, h) + pi_backward_list[itr] = gedlibpy.get_backward_map(g, h) + lower = gedlibpy.get_lower_bound(g, h) + upper = np.median(upper_list) + idx_median = upper_list.index(upper) + pi_forward = pi_forward_list[idx_median] + pi_backward = pi_backward_list[idx_median] + elif stabilizer == 'min': + upper = np.inf + for itr in range(repeat): + gedlibpy.run_method(g, h) + upper_tmp = gedlibpy.get_upper_bound(g, h) + if upper_tmp < upper: + upper = upper_tmp + pi_forward = gedlibpy.get_forward_map(g, h) + pi_backward = gedlibpy.get_backward_map(g, h) + lower = gedlibpy.get_lower_bound(g, h) + if upper == 0: + break + elif stabilizer == 'max': + upper = 0 + for itr in range(repeat): + gedlibpy.run_method(g, h) + upper_tmp = gedlibpy.get_upper_bound(g, h) + if upper_tmp > upper: + upper = upper_tmp + pi_forward = gedlibpy.get_forward_map(g, h) + pi_backward = gedlibpy.get_backward_map(g, h) + lower = gedlibpy.get_lower_bound(g, h) + elif stabilizer == 'gaussian': + pass + + dis = upper + + elif lib == 'gedlib-bash': + import time + import random + import os + from gklearn.utils.graphfiles import saveDataset + + tmp_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/' + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir) + fn_collection = tmp_dir + 'collection.' 
+ str(time.time()) + str(random.randint(0, 1e9)) + xparams = {'method': 'gedlib', 'graph_dir': fn_collection} + saveDataset([g1, g2], ['dummy', 'dummy'], gformat='gxl', group='xml', + filename=fn_collection, xparams=xparams) + + command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n' + command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n' + command += 'export LD_LIBRARY_PATH\n' + command += 'cd \'' + os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/bin\'\n' + command += './ged_for_python_bash monoterpenoides ' + fn_collection \ + + ' \'' + algo_options + '\' ' + for ec in edit_cost_constant: + command += str(ec) + ' ' +# output = os.system(command) + stream = os.popen(command) + output = stream.readlines() +# print(output) + + dis = float(output[0].strip()) + runtime = float(output[1].strip()) + size_forward = int(output[2].strip()) + pi_forward = [int(item.strip()) for item in output[3:3+size_forward]] + pi_backward = [int(item.strip()) for item in output[3+size_forward:]] + +# print(dis) +# print(runtime) +# print(size_forward) +# print(pi_forward) +# print(pi_backward) + + + # make the map label correct (label remove map as np.inf) + nodes1 = [n for n in g1.nodes()] + nodes2 = [n for n in g2.nodes()] + nb1 = nx.number_of_nodes(g1) + nb2 = nx.number_of_nodes(g2) + pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward] + pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward] +# print(pi_forward) + + + return dis, pi_forward, pi_backward + + +def convertGraph(G, cost): + """Convert a graph to the proper NetworkX format that can be + recognized by library gedlibpy. + """ + G_new = nx.Graph() + if cost == 'LETTER' or cost == 'LETTER2': + for nd, attrs in G.nodes(data=True): + G_new.add_node(str(nd), x=str(attrs['attributes'][0]), + y=str(attrs['attributes'][1])) + for nd1, nd2, attrs in G.edges(data=True): + G_new.add_edge(str(nd1), str(nd2)) + elif cost == 'NON_SYMBOLIC': + for nd, attrs in G.nodes(data=True): + G_new.add_node(str(nd)) + for a_name in G.graph['node_attrs']: + G_new.nodes[str(nd)][a_name] = str(attrs[a_name]) + for nd1, nd2, attrs in G.edges(data=True): + G_new.add_edge(str(nd1), str(nd2)) + for a_name in G.graph['edge_attrs']: + G_new.edges[str(nd1), str(nd2)][a_name] = str(attrs[a_name]) + else: + for nd, attrs in G.nodes(data=True): + G_new.add_node(str(nd), chem=attrs['atom']) + for nd1, nd2, attrs in G.edges(data=True): + G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) +# G_new.add_edge(str(nd1), str(nd2)) + + return G_new + + +def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP', + edit_cost_constant=[], stabilizer='min', repeat=50): + """ + Compute GEDs for a group of graphs. + """ + if lib == 'gedlibpy': + def convertGraph(G): + """Convert a graph to the proper NetworkX format that can be + recognized by library gedlibpy. 
+ """ + G_new = nx.Graph() + for nd, attrs in G.nodes(data=True): + G_new.add_node(str(nd), chem=attrs['atom']) + for nd1, nd2, attrs in G.edges(data=True): +# G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) + G_new.add_edge(str(nd1), str(nd2)) + + return G_new + + gedlibpy.restart_env() + gedlibpy.add_nx_graph(convertGraph(g1), "") + gedlibpy.add_nx_graph(convertGraph(g2), "") + + listID = gedlibpy.get_all_graph_ids() + gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant) + gedlibpy.init() + gedlibpy.set_method(method, "") + gedlibpy.init_method() + + g = listID[0] + h = listID[1] + if stabilizer is None: + gedlibpy.run_method(g, h) + pi_forward = gedlibpy.get_forward_map(g, h) + pi_backward = gedlibpy.get_backward_map(g, h) + upper = gedlibpy.get_upper_bound(g, h) + lower = gedlibpy.get_lower_bound(g, h) + elif stabilizer == 'min': + upper = np.inf + for itr in range(repeat): + gedlibpy.run_method(g, h) + upper_tmp = gedlibpy.get_upper_bound(g, h) + if upper_tmp < upper: + upper = upper_tmp + pi_forward = gedlibpy.get_forward_map(g, h) + pi_backward = gedlibpy.get_backward_map(g, h) + lower = gedlibpy.get_lower_bound(g, h) + if upper == 0: + break + + dis = upper + + # make the map label correct (label remove map as np.inf) + nodes1 = [n for n in g1.nodes()] + nodes2 = [n for n in g2.nodes()] + nb1 = nx.number_of_nodes(g1) + nb2 = nx.number_of_nodes(g2) + pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward] + pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward] + + return dis, pi_forward, pi_backward + + +def ged_median(Gn, Gn_median, verbose=False, params_ged={'lib': 'gedlibpy', + 'cost': 'CHEM_1', 'method': 'IPFP', 'edit_cost_constant': [], + 'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1', + 'stabilizer': None}, parallel=False): + if parallel: + len_itr = int(len(Gn)) + pi_forward_list = [[] for i in range(len_itr)] + dis_list = [0 for i in range(len_itr)] + + itr = range(0, len_itr) + n_jobs = multiprocessing.cpu_count() + if len_itr < 100 * n_jobs: + chunksize = int(len_itr / n_jobs) + 1 + else: + chunksize = 100 + def init_worker(gn_toshare, gn_median_toshare): + global G_gn, G_gn_median + G_gn = gn_toshare + G_gn_median = gn_median_toshare + do_partial = partial(_compute_ged_median, params_ged) + pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn, Gn_median)) + if verbose: + iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize), + desc='computing GEDs', file=sys.stdout) + else: + iterator = pool.imap_unordered(do_partial, itr, chunksize) + for i, dis_sum, pi_forward in iterator: + pi_forward_list[i] = pi_forward + dis_list[i] = dis_sum +# print('\n-------------------------------------------') +# print(i, j, idx_itr, dis) + pool.close() + pool.join() + + else: + dis_list = [] + pi_forward_list = [] + for idx, G in tqdm(enumerate(Gn), desc='computing median distances', + file=sys.stdout) if verbose else enumerate(Gn): + dis_sum = 0 + pi_forward_list.append([]) + for G_p in Gn_median: + dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p, + **params_ged) + pi_forward_list[idx].append(pi_tmp_forward) + dis_sum += dis_tmp + dis_list.append(dis_sum) + + return dis_list, pi_forward_list + + +def _compute_ged_median(params_ged, itr): +# print(itr) + dis_sum = 0 + pi_forward = [] + for G_p in G_gn_median: + dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G_gn[itr], G_p, + **params_ged) + pi_forward.append(pi_tmp_forward) + dis_sum += dis_tmp + + return 
itr, dis_sum, pi_forward + + +def get_nb_edit_operations(g1, g2, forward_map, backward_map): + """Compute the number of each edit operations. + """ + n_vi = 0 + n_vr = 0 + n_vs = 0 + n_ei = 0 + n_er = 0 + n_es = 0 + + nodes1 = [n for n in g1.nodes()] + for i, map_i in enumerate(forward_map): + if map_i == np.inf: + n_vr += 1 + elif g1.node[nodes1[i]]['atom'] != g2.node[map_i]['atom']: + n_vs += 1 + for map_i in backward_map: + if map_i == np.inf: + n_vi += 1 + +# idx_nodes1 = range(0, len(node1)) + + edges1 = [e for e in g1.edges()] + nb_edges2_cnted = 0 + for n1, n2 in edges1: + idx1 = nodes1.index(n1) + idx2 = nodes1.index(n2) + # one of the nodes is removed, thus the edge is removed. + if forward_map[idx1] == np.inf or forward_map[idx2] == np.inf: + n_er += 1 + # corresponding edge is in g2. + elif (forward_map[idx1], forward_map[idx2]) in g2.edges(): + nb_edges2_cnted += 1 + # edge labels are different. + if g2.edges[((forward_map[idx1], forward_map[idx2]))]['bond_type'] \ + != g1.edges[(n1, n2)]['bond_type']: + n_es += 1 + elif (forward_map[idx2], forward_map[idx1]) in g2.edges(): + nb_edges2_cnted += 1 + # edge labels are different. + if g2.edges[((forward_map[idx2], forward_map[idx1]))]['bond_type'] \ + != g1.edges[(n1, n2)]['bond_type']: + n_es += 1 + # corresponding nodes are in g2, however the edge is removed. + else: + n_er += 1 + n_ei = nx.number_of_edges(g2) - nb_edges2_cnted + + return n_vi, n_vr, n_vs, n_ei, n_er, n_es + + +def get_nb_edit_operations_letter(g1, g2, forward_map, backward_map): + """Compute the number of each edit operations. + """ + n_vi = 0 + n_vr = 0 + n_vs = 0 + sod_vs = 0 + n_ei = 0 + n_er = 0 + + nodes1 = [n for n in g1.nodes()] + for i, map_i in enumerate(forward_map): + if map_i == np.inf: + n_vr += 1 + else: + n_vs += 1 + diff_x = float(g1.nodes[nodes1[i]]['x']) - float(g2.nodes[map_i]['x']) + diff_y = float(g1.nodes[nodes1[i]]['y']) - float(g2.nodes[map_i]['y']) + sod_vs += np.sqrt(np.square(diff_x) + np.square(diff_y)) + for map_i in backward_map: + if map_i == np.inf: + n_vi += 1 + +# idx_nodes1 = range(0, len(node1)) + + edges1 = [e for e in g1.edges()] + nb_edges2_cnted = 0 + for n1, n2 in edges1: + idx1 = nodes1.index(n1) + idx2 = nodes1.index(n2) + # one of the nodes is removed, thus the edge is removed. + if forward_map[idx1] == np.inf or forward_map[idx2] == np.inf: + n_er += 1 + # corresponding edge is in g2. Edge label is not considered. + elif (forward_map[idx1], forward_map[idx2]) in g2.edges() or \ + (forward_map[idx2], forward_map[idx1]) in g2.edges(): + nb_edges2_cnted += 1 + # corresponding nodes are in g2, however the edge is removed. + else: + n_er += 1 + n_ei = nx.number_of_edges(g2) - nb_edges2_cnted + + return n_vi, n_vr, n_vs, sod_vs, n_ei, n_er + + +def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map): + """Compute the number of each edit operations. 
+ """ + n_vi = 0 + n_vr = 0 + n_vs = 0 + sod_vs = 0 + n_ei = 0 + n_er = 0 + n_es = 0 + sod_es = 0 + + nodes1 = [n for n in g1.nodes()] + for i, map_i in enumerate(forward_map): + if map_i == np.inf: + n_vr += 1 + else: + n_vs += 1 + sum_squares = 0 + for a_name in g1.graph['node_attrs']: + diff = float(g1.nodes[nodes1[i]][a_name]) - float(g2.nodes[map_i][a_name]) + sum_squares += np.square(diff) + sod_vs += np.sqrt(sum_squares) + for map_i in backward_map: + if map_i == np.inf: + n_vi += 1 + +# idx_nodes1 = range(0, len(node1)) + + edges1 = [e for e in g1.edges()] + for n1, n2 in edges1: + idx1 = nodes1.index(n1) + idx2 = nodes1.index(n2) + n1_g2 = forward_map[idx1] + n2_g2 = forward_map[idx2] + # one of the nodes is removed, thus the edge is removed. + if n1_g2 == np.inf or n2_g2 == np.inf: + n_er += 1 + # corresponding edge is in g2. + elif (n1_g2, n2_g2) in g2.edges(): + n_es += 1 + sum_squares = 0 + for a_name in g1.graph['edge_attrs']: + diff = float(g1.edges[n1, n2][a_name]) - float(g2.nodes[n1_g2, n2_g2][a_name]) + sum_squares += np.square(diff) + sod_es += np.sqrt(sum_squares) + elif (n2_g2, n1_g2) in g2.edges(): + n_es += 1 + sum_squares = 0 + for a_name in g1.graph['edge_attrs']: + diff = float(g1.edges[n2, n1][a_name]) - float(g2.nodes[n2_g2, n1_g2][a_name]) + sum_squares += np.square(diff) + sod_es += np.sqrt(sum_squares) + # corresponding nodes are in g2, however the edge is removed. + else: + n_er += 1 + n_ei = nx.number_of_edges(g2) - n_es + + return n_vi, n_vr, sod_vs, n_ei, n_er, sod_es + + +if __name__ == '__main__': + print('check test_ged.py') \ No newline at end of file diff --git a/gklearn/preimage/iam.py b/gklearn/preimage/iam.py new file mode 100644 index 0000000..f3e2165 --- /dev/null +++ b/gklearn/preimage/iam.py @@ -0,0 +1,775 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Apr 26 11:49:12 2019 + +Iterative alternate minimizations using GED. +@author: ljia +""" +import numpy as np +import random +import networkx as nx +from tqdm import tqdm + +from gklearn.utils.graphdataset import get_dataset_attributes +from gklearn.utils.utils import graph_isIdentical, get_node_labels, get_edge_labels +from gklearn.preimage.ged import GED, ged_median + + +def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, + epsilon=0.001, node_label='atom', edge_label='bond_type', + connected=False, removeNodes=True, allBestInit=False, allBestNodes=False, + allBestEdges=False, allBestOutput=False, + params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP', + 'edit_cost_constant': [], 'stabilizer': None, + 'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'}): + """See my name, then you know what I do. + """ +# Gn_median = Gn_median[0:10] +# Gn_median = [nx.convert_node_labels_to_integers(g) for g in Gn_median] + node_ir = np.inf # corresponding to the node remove and insertion. + label_r = 'thanksdanny' # the label for node remove. # @todo: make this label unrepeatable. + ds_attrs = get_dataset_attributes(Gn_median + Gn_candidate, + attr_names=['edge_labeled', 'node_attr_dim', 'edge_attr_dim'], + edge_label=edge_label) + node_label_set = get_node_labels(Gn_median, node_label) + edge_label_set = get_edge_labels(Gn_median, edge_label) + + + def generate_graph(G, pi_p_forward): + G_new_list = [G.copy()] # all "best" graphs generated in this iteration. +# nx.draw_networkx(G) +# import matplotlib.pyplot as plt +# plt.show() +# print(pi_p_forward) + + # update vertex labels. 
+ # pre-compute h_i0 for each label. +# for label in get_node_labels(Gn, node_label): +# print(label) +# for nd in G.nodes(data=True): +# pass + if not ds_attrs['node_attr_dim']: # labels are symbolic + for ndi, (nd, _) in enumerate(G.nodes(data=True)): + h_i0_list = [] + label_list = [] + for label in node_label_set: + h_i0 = 0 + for idx, g in enumerate(Gn_median): + pi_i = pi_p_forward[idx][ndi] + if pi_i != node_ir and g.nodes[pi_i][node_label] == label: + h_i0 += 1 + h_i0_list.append(h_i0) + label_list.append(label) + # case when the node is to be removed. + if removeNodes: + h_i0_remove = 0 # @todo: maybe this can be added to the node_label_set above. + for idx, g in enumerate(Gn_median): + pi_i = pi_p_forward[idx][ndi] + if pi_i == node_ir: + h_i0_remove += 1 + h_i0_list.append(h_i0_remove) + label_list.append(label_r) + # get the best labels. + idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist() + if allBestNodes: # choose all best graphs. + nlabel_best = [label_list[idx] for idx in idx_max] + # generate "best" graphs with regard to "best" node labels. + G_new_list_nd = [] + for g in G_new_list: # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now. + for nl in nlabel_best: + g_tmp = g.copy() + if nl == label_r: + g_tmp.remove_node(nd) + else: + g_tmp.nodes[nd][node_label] = nl + G_new_list_nd.append(g_tmp) + # nx.draw_networkx(g_tmp) + # import matplotlib.pyplot as plt + # plt.show() + # print(g_tmp.nodes(data=True)) + # print(g_tmp.edges(data=True)) + G_new_list = [ggg.copy() for ggg in G_new_list_nd] + else: + # choose one of the best randomly. + idx_rdm = random.randint(0, len(idx_max) - 1) + best_label = label_list[idx_max[idx_rdm]] + h_i0_max = h_i0_list[idx_max[idx_rdm]] + + g_new = G_new_list[0] + if best_label == label_r: + g_new.remove_node(nd) + else: + g_new.nodes[nd][node_label] = best_label + G_new_list = [g_new] + else: # labels are non-symbolic + for ndi, (nd, _) in enumerate(G.nodes(data=True)): + Si_norm = 0 + phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])]) + for idx, g in enumerate(Gn_median): + pi_i = pi_p_forward[idx][ndi] + if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0? + Si_norm += 1 + phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']]) + phi_i_bar /= Si_norm + G_new_list[0].nodes[nd]['attributes'] = phi_i_bar + +# for g in G_new_list: +# import matplotlib.pyplot as plt +# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True) +# plt.show() +# print(g.nodes(data=True)) +# print(g.edges(data=True)) + + # update edge labels and adjacency matrix. + if ds_attrs['edge_labeled']: + G_new_list_edge = [] + for g_new in G_new_list: + nd_list = [n for n in g_new.nodes()] + g_tmp_list = [g_new.copy()] + for nd1i in range(nx.number_of_nodes(g_new)): + nd1 = nd_list[nd1i]# @todo: not just edges, but all pairs of nodes + for nd2i in range(nd1i + 1, nx.number_of_nodes(g_new)): + nd2 = nd_list[nd2i] +# for nd1, nd2, _ in g_new.edges(data=True): + h_ij0_list = [] + label_list = [] + for label in edge_label_set: + h_ij0 = 0 + for idx, g in enumerate(Gn_median): + pi_i = pi_p_forward[idx][nd1i] + pi_j = pi_p_forward[idx][nd2i] + h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and + g.has_edge(pi_i, pi_j) and + g.edges[pi_i, pi_j][edge_label] == label) + h_ij0 += h_ij0_p + h_ij0_list.append(h_ij0) + label_list.append(label) + + # get the best labels. 
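# The decision applied below keeps edge (nd1, nd2) iff its best label's vote
# satisfies h_ij0 > m*c_er/c_es + s_ij*(1 - (c_er + c_ei)/c_es), with m median
# graphs and s_ij the number of them in which both endpoint images are
# adjacent. Worked numbers (illustrative):
m, c_ei, c_er, c_es = 10, 3.0, 3.0, 1.0
h_ij0, s_ij = 8, 9
keep_edge = h_ij0 > m * c_er / c_es + s_ij * (1 - (c_er + c_ei) / c_es)
# threshold = 30 + 9 * (1 - 6) = -15, so keep_edge is True and the edge stays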
+ idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist() + if allBestEdges: # choose all best graphs. + elabel_best = [label_list[idx] for idx in idx_max] + h_ij0_max = [h_ij0_list[idx] for idx in idx_max] + # generate "best" graphs with regard to "best" node labels. + G_new_list_ed = [] + for g_tmp in g_tmp_list: # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now. + for idxl, el in enumerate(elabel_best): + g_tmp_copy = g_tmp.copy() + # check whether a_ij is 0 or 1. + sij_norm = 0 + for idx, g in enumerate(Gn_median): + pi_i = pi_p_forward[idx][nd1i] + pi_j = pi_p_forward[idx][nd2i] + if g.has_node(pi_i) and g.has_node(pi_j) and \ + g.has_edge(pi_i, pi_j): + sij_norm += 1 + if h_ij0_max[idxl] > len(Gn_median) * c_er / c_es + \ + sij_norm * (1 - (c_er + c_ei) / c_es): + if not g_tmp_copy.has_edge(nd1, nd2): + g_tmp_copy.add_edge(nd1, nd2) + g_tmp_copy.edges[nd1, nd2][edge_label] = elabel_best[idxl] + else: + if g_tmp_copy.has_edge(nd1, nd2): + g_tmp_copy.remove_edge(nd1, nd2) + G_new_list_ed.append(g_tmp_copy) + g_tmp_list = [ggg.copy() for ggg in G_new_list_ed] + else: # choose one of the best randomly. + idx_rdm = random.randint(0, len(idx_max) - 1) + best_label = label_list[idx_max[idx_rdm]] + h_ij0_max = h_ij0_list[idx_max[idx_rdm]] + + # check whether a_ij is 0 or 1. + sij_norm = 0 + for idx, g in enumerate(Gn_median): + pi_i = pi_p_forward[idx][nd1i] + pi_j = pi_p_forward[idx][nd2i] + if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): + sij_norm += 1 + if h_ij0_max > len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es): + if not g_new.has_edge(nd1, nd2): + g_new.add_edge(nd1, nd2) + g_new.edges[nd1, nd2][edge_label] = best_label + else: +# elif h_ij0_max < len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es): + if g_new.has_edge(nd1, nd2): + g_new.remove_edge(nd1, nd2) + g_tmp_list = [g_new] + G_new_list_edge += g_tmp_list + G_new_list = [ggg.copy() for ggg in G_new_list_edge] + + + else: # if edges are unlabeled + # @todo: is this even right? G or g_tmp? check if the new one is right + # @todo: works only for undirected graphs. + + for g_tmp in G_new_list: + nd_list = [n for n in g_tmp.nodes()] + for nd1i in range(nx.number_of_nodes(g_tmp)): + nd1 = nd_list[nd1i] + for nd2i in range(nd1i + 1, nx.number_of_nodes(g_tmp)): + nd2 = nd_list[nd2i] + sij_norm = 0 + for idx, g in enumerate(Gn_median): + pi_i = pi_p_forward[idx][nd1i] + pi_j = pi_p_forward[idx][nd2i] + if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): + sij_norm += 1 + if sij_norm > len(Gn_median) * c_er / (c_er + c_ei): + # @todo: should we consider if nd1 and nd2 in g_tmp? + # or just add the edge anyway? + if g_tmp.has_node(nd1) and g_tmp.has_node(nd2) \ + and not g_tmp.has_edge(nd1, nd2): + g_tmp.add_edge(nd1, nd2) + else: # @todo: which to use? +# elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei): + if g_tmp.has_edge(nd1, nd2): + g_tmp.remove_edge(nd1, nd2) + # do not change anything when equal. + +# for i, g in enumerate(G_new_list): +# import matplotlib.pyplot as plt +# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True) +## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG") +# plt.show() +# print(g.nodes(data=True)) +# print(g.edges(data=True)) + +# # find the best graph generated in this iteration and update pi_p. + # @todo: should we update all graphs generated or just the best ones? 
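# The surrounding alternation is driven by iteration_proc() below, which stops
# once the sum of distances (SOD) changes by less than epsilon; its stopping
# logic, reduced to a standalone sketch:
def iterate_until_stable(step, state, sod, ite_max=50, epsilon=0.001):
    old_sod, itr = sod * 2, 0              # seed so the first test passes
    while itr < ite_max and abs(old_sod - sod) > epsilon:
        state, sod_new = step(state)
        old_sod, sod = sod, sod_new
        itr += 1
    return state, sod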
+ dis_list, pi_forward_list = ged_median(G_new_list, Gn_median, + params_ged=params_ged) + # @todo: should we remove the identical and connectivity check? + # Don't know which is faster. + if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0: + G_new_list, idx_list = remove_duplicates(G_new_list) + pi_forward_list = [pi_forward_list[idx] for idx in idx_list] + dis_list = [dis_list[idx] for idx in idx_list] +# if connected == True: +# G_new_list, idx_list = remove_disconnected(G_new_list) +# pi_forward_list = [pi_forward_list[idx] for idx in idx_list] +# idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist() +# dis_min = dis_list[idx_min_tmp_list[0]] +# pi_forward_list = [pi_forward_list[idx] for idx in idx_min_list] +# G_new_list = [G_new_list[idx] for idx in idx_min_list] + +# for g in G_new_list: +# import matplotlib.pyplot as plt +# nx.draw_networkx(g) +# plt.show() +# print(g.nodes(data=True)) +# print(g.edges(data=True)) + + return G_new_list, pi_forward_list, dis_list + + + def best_median_graphs(Gn_candidate, pi_all_forward, dis_all): + idx_min_list = np.argwhere(dis_all == np.min(dis_all)).flatten().tolist() + dis_min = dis_all[idx_min_list[0]] + pi_forward_min_list = [pi_all_forward[idx] for idx in idx_min_list] + G_min_list = [Gn_candidate[idx] for idx in idx_min_list] + return G_min_list, pi_forward_min_list, dis_min + + + def iteration_proc(G, pi_p_forward, cur_sod): + G_list = [G] + pi_forward_list = [pi_p_forward] + old_sod = cur_sod * 2 + sod_list = [cur_sod] + dis_list = [cur_sod] + # iterations. + itr = 0 + # @todo: what if difference == 0? +# while itr < ite_max and (np.abs(old_sod - cur_sod) > epsilon or +# np.abs(old_sod - cur_sod) == 0): + while itr < ite_max and np.abs(old_sod - cur_sod) > epsilon: +# while itr < ite_max: +# for itr in range(0, 5): # the convergence condition? + print('itr_iam is', itr) + G_new_list = [] + pi_forward_new_list = [] + dis_new_list = [] + for idx, g in enumerate(G_list): +# label_set = get_node_labels(Gn_median + [g], node_label) + G_tmp_list, pi_forward_tmp_list, dis_tmp_list = generate_graph( + g, pi_forward_list[idx]) + G_new_list += G_tmp_list + pi_forward_new_list += pi_forward_tmp_list + dis_new_list += dis_tmp_list + # @todo: need to remove duplicates here? + G_list = [ggg.copy() for ggg in G_new_list] + pi_forward_list = [pitem.copy() for pitem in pi_forward_new_list] + dis_list = dis_new_list[:] + + old_sod = cur_sod + cur_sod = np.min(dis_list) + sod_list.append(cur_sod) + + itr += 1 + + # @todo: do we return all graphs or the best ones? + # get the best ones of the generated graphs. + G_list, pi_forward_list, dis_min = best_median_graphs( + G_list, pi_forward_list, dis_list) + + if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0: + G_list, idx_list = remove_duplicates(G_list) + pi_forward_list = [pi_forward_list[idx] for idx in idx_list] +# dis_list = [dis_list[idx] for idx in idx_list] + +# import matplotlib.pyplot as plt +# for g in G_list: +# nx.draw_networkx(g) +# plt.show() +# print(g.nodes(data=True)) +# print(g.edges(data=True)) + + print('\nsods:', sod_list, '\n') + + return G_list, pi_forward_list, dis_min, sod_list + + + def remove_duplicates(Gn): + """Remove duplicate graphs from list. 
+ """ + Gn_new = [] + idx_list = [] + for idx, g in enumerate(Gn): + dupl = False + for g_new in Gn_new: + if graph_isIdentical(g_new, g): + dupl = True + break + if not dupl: + Gn_new.append(g) + idx_list.append(idx) + return Gn_new, idx_list + + + def remove_disconnected(Gn): + """Remove disconnected graphs from list. + """ + Gn_new = [] + idx_list = [] + for idx, g in enumerate(Gn): + if nx.is_connected(g): + Gn_new.append(g) + idx_list.append(idx) + return Gn_new, idx_list + + + ########################################################################### + + # phase 1: initilize. + # compute set-median. + dis_min = np.inf + dis_list, pi_forward_all = ged_median(Gn_candidate, Gn_median, + params_ged=params_ged, parallel=True) + print('finish computing GEDs.') + # find all smallest distances. + if allBestInit: # try all best init graphs. + idx_min_list = range(len(dis_list)) + dis_min = dis_list + else: + idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist() + dis_min = [dis_list[idx_min_list[0]]] * len(idx_min_list) + idx_min_rdm = random.randint(0, len(idx_min_list) - 1) + idx_min_list = [idx_min_list[idx_min_rdm]] + sod_set_median = np.min(dis_min) + + + # phase 2: iteration. + G_list = [] + dis_list = [] + pi_forward_list = [] + G_set_median_list = [] +# sod_list = [] + for idx_tmp, idx_min in enumerate(idx_min_list): +# print('idx_min is', idx_min) + G = Gn_candidate[idx_min].copy() + G_set_median_list.append(G.copy()) + # list of edit operations. + pi_p_forward = pi_forward_all[idx_min] +# pi_p_backward = pi_all_backward[idx_min] + Gi_list, pi_i_forward_list, dis_i_min, sod_list = iteration_proc(G, + pi_p_forward, dis_min[idx_tmp]) + G_list += Gi_list + dis_list += [dis_i_min] * len(Gi_list) + pi_forward_list += pi_i_forward_list + + + if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0: + G_list, idx_list = remove_duplicates(G_list) + dis_list = [dis_list[idx] for idx in idx_list] + pi_forward_list = [pi_forward_list[idx] for idx in idx_list] + if connected == True: + G_list_con, idx_list = remove_disconnected(G_list) + # if there is no connected graphs at all, then remain the disconnected ones. + if len(G_list_con) > 0: # @todo: ?????????????????????????? + G_list = G_list_con + dis_list = [dis_list[idx] for idx in idx_list] + pi_forward_list = [pi_forward_list[idx] for idx in idx_list] + +# import matplotlib.pyplot as plt +# for g in G_list: +# nx.draw_networkx(g) +# plt.show() +# print(g.nodes(data=True)) +# print(g.edges(data=True)) + + # get the best median graphs + G_gen_median_list, pi_forward_min_list, sod_gen_median = best_median_graphs( + G_list, pi_forward_list, dis_list) +# for g in G_gen_median_list: +# nx.draw_networkx(g) +# plt.show() +# print(g.nodes(data=True)) +# print(g.edges(data=True)) + + if not allBestOutput: + # randomly choose one graph. + idx_rdm = random.randint(0, len(G_gen_median_list) - 1) + G_gen_median_list = [G_gen_median_list[idx_rdm]] + + return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median + + +def iam_bash(Gn_names, edit_cost_constant, cost='CONSTANT', initial_solutions=1, + dataset='monoterpenoides', + graph_dir=''): + """Compute the iam by c++ implementation (gedlib) through bash. + """ + import os + import time + + def createCollectionFile(Gn_names, y, filename): + """Create collection file. 
+ """ + dirname_ds = os.path.dirname(filename) + if dirname_ds != '': + dirname_ds += '/' + if not os.path.exists(dirname_ds) : + os.makedirs(dirname_ds) + + with open(filename + '.xml', 'w') as fgroup: + fgroup.write("") + fgroup.write("\n") + fgroup.write("\n") + for idx, fname in enumerate(Gn_names): + fgroup.write("\n\t") + fgroup.write("\n") + fgroup.close() + + tmp_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/' + fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, 1e9)) + createCollectionFile(Gn_names, ['dummy'] * len(Gn_names), fn_collection) +# fn_collection = tmp_dir + 'collection_for_debug' +# graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/gxl' + +# if dataset == 'Letter-high' or dataset == 'Fingerprint': +# dataset = 'letter' + command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/Linlin/gedlib\'\n' + command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n' + command += 'export LD_LIBRARY_PATH\n' + command += 'cd \'' + os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/bin\'\n' + command += './iam_for_python_bash ' + dataset + ' ' + fn_collection \ + + ' \'' + graph_dir + '\' ' + ' ' + cost + ' ' + str(initial_solutions) + ' ' + if edit_cost_constant is None: + command += 'None' + else: + for ec in edit_cost_constant: + command += str(ec) + ' ' +# output = os.system(command) + stream = os.popen(command) + + output = stream.readlines() +# print(output) + sod_sm = float(output[0].strip()) + sod_gm = float(output[1].strip()) + + fname_sm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/set_median.gxl' + fname_gm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/gen_median.gxl' + + return sod_sm, sod_gm, fname_sm, fname_gm + + + +############################################################################### +# Old implementations. + +def iam(Gn, c_ei=3, c_er=3, c_es=1, node_label='atom', edge_label='bond_type', + connected=True): + """See my name, then you know what I do. + """ +# Gn = Gn[0:10] + Gn = [nx.convert_node_labels_to_integers(g) for g in Gn] + + # phase 1: initilize. + # compute set-median. + dis_min = np.inf + pi_p = [] + pi_all = [] + for idx1, G_p in enumerate(Gn): + dist_sum = 0 + pi_all.append([]) + for idx2, G_p_prime in enumerate(Gn): + dist_tmp, pi_tmp, _ = GED(G_p, G_p_prime) + pi_all[idx1].append(pi_tmp) + dist_sum += dist_tmp + if dist_sum < dis_min: + dis_min = dist_sum + G = G_p.copy() + idx_min = idx1 + # list of edit operations. + pi_p = pi_all[idx_min] + + # phase 2: iteration. + ds_attrs = get_dataset_attributes(Gn, attr_names=['edge_labeled', 'node_attr_dim'], + edge_label=edge_label) + for itr in range(0, 10): # @todo: the convergence condition? + G_new = G.copy() + # update vertex labels. + # pre-compute h_i0 for each label. +# for label in get_node_labels(Gn, node_label): +# print(label) +# for nd in G.nodes(data=True): +# pass + if not ds_attrs['node_attr_dim']: # labels are symbolic + for nd, _ in G.nodes(data=True): + h_i0_list = [] + label_list = [] + for label in get_node_labels(Gn, node_label): + h_i0 = 0 + for idx, g in enumerate(Gn): + pi_i = pi_p[idx][nd] + if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label: + h_i0 += 1 + h_i0_list.append(h_i0) + label_list.append(label) + # choose one of the best randomly. 
+ idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist() + idx_rdm = random.randint(0, len(idx_max) - 1) + G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]] + else: # labels are non-symbolic + for nd, _ in G.nodes(data=True): + Si_norm = 0 + phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])]) + for idx, g in enumerate(Gn): + pi_i = pi_p[idx][nd] + if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0? + Si_norm += 1 + phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']]) + phi_i_bar /= Si_norm + G_new.nodes[nd]['attributes'] = phi_i_bar + + # update edge labels and adjacency matrix. + if ds_attrs['edge_labeled']: + for nd1, nd2, _ in G.edges(data=True): + h_ij0_list = [] + label_list = [] + for label in get_edge_labels(Gn, edge_label): + h_ij0 = 0 + for idx, g in enumerate(Gn): + pi_i = pi_p[idx][nd1] + pi_j = pi_p[idx][nd2] + h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and + g.has_edge(pi_i, pi_j) and + g.edges[pi_i, pi_j][edge_label] == label) + h_ij0 += h_ij0_p + h_ij0_list.append(h_ij0) + label_list.append(label) + # choose one of the best randomly. + idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist() + h_ij0_max = h_ij0_list[idx_max[0]] + idx_rdm = random.randint(0, len(idx_max) - 1) + best_label = label_list[idx_max[idx_rdm]] + + # check whether a_ij is 0 or 1. + sij_norm = 0 + for idx, g in enumerate(Gn): + pi_i = pi_p[idx][nd1] + pi_j = pi_p[idx][nd2] + if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): + sij_norm += 1 + if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es): + if not G_new.has_edge(nd1, nd2): + G_new.add_edge(nd1, nd2) + G_new.edges[nd1, nd2][edge_label] = best_label + else: + if G_new.has_edge(nd1, nd2): + G_new.remove_edge(nd1, nd2) + else: # if edges are unlabeled + for nd1, nd2, _ in G.edges(data=True): + sij_norm = 0 + for idx, g in enumerate(Gn): + pi_i = pi_p[idx][nd1] + pi_j = pi_p[idx][nd2] + if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): + sij_norm += 1 + if sij_norm > len(Gn) * c_er / (c_er + c_ei): + if not G_new.has_edge(nd1, nd2): + G_new.add_edge(nd1, nd2) + else: + if G_new.has_edge(nd1, nd2): + G_new.remove_edge(nd1, nd2) + + G = G_new.copy() + + # update pi_p + pi_p = [] + for idx1, G_p in enumerate(Gn): + dist_tmp, pi_tmp, _ = GED(G, G_p) + pi_p.append(pi_tmp) + + return G + +# --------------------------- These are tests --------------------------------# + +def test_iam_with_more_graphs_as_init(Gn, G_candidate, c_ei=3, c_er=3, c_es=1, + node_label='atom', edge_label='bond_type'): + """See my name, then you know what I do. + """ +# Gn = Gn[0:10] + Gn = [nx.convert_node_labels_to_integers(g) for g in Gn] + + # phase 1: initilize. + # compute set-median. + dis_min = np.inf +# pi_p = [] + pi_all_forward = [] + pi_all_backward = [] + for idx1, G_p in tqdm(enumerate(G_candidate), desc='computing GEDs', file=sys.stdout): + dist_sum = 0 + pi_all_forward.append([]) + pi_all_backward.append([]) + for idx2, G_p_prime in enumerate(Gn): + dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G_p, G_p_prime) + pi_all_forward[idx1].append(pi_tmp_forward) + pi_all_backward[idx1].append(pi_tmp_backward) + dist_sum += dist_tmp + if dist_sum <= dis_min: + dis_min = dist_sum + G = G_p.copy() + idx_min = idx1 + # list of edit operations. + pi_p_forward = pi_all_forward[idx_min] + pi_p_backward = pi_all_backward[idx_min] + + # phase 2: iteration. 
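Phase 1 above selects the set median: the candidate whose summed GED to all graphs in the collection is smallest. Reduced to its core, the selection looks like this (a sketch; `ged` stands in for the GED() wrapper and here returns only a distance):

def set_median_index(candidates, graphs, ged):
    # Return the index of the candidate minimizing the sum of edit
    # distances to every graph in the collection ("set median").
    sums = [sum(ged(c, g) for g in graphs) for c in candidates]
    return min(range(len(sums)), key=sums.__getitem__)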
+ ds_attrs = get_dataset_attributes(Gn + [G], attr_names=['edge_labeled', 'node_attr_dim'], + edge_label=edge_label) + label_set = get_node_labels(Gn + [G], node_label) + for itr in range(0, 10): # @todo: the convergence condition? + G_new = G.copy() + # update vertex labels. + # pre-compute h_i0 for each label. +# for label in get_node_labels(Gn, node_label): +# print(label) +# for nd in G.nodes(data=True): +# pass + if not ds_attrs['node_attr_dim']: # labels are symbolic + for nd in G.nodes(): + h_i0_list = [] + label_list = [] + for label in label_set: + h_i0 = 0 + for idx, g in enumerate(Gn): + pi_i = pi_p_forward[idx][nd] + if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label: + h_i0 += 1 + h_i0_list.append(h_i0) + label_list.append(label) + # choose one of the best randomly. + idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist() + idx_rdm = random.randint(0, len(idx_max) - 1) + G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]] + else: # labels are non-symbolic + for nd in G.nodes(): + Si_norm = 0 + phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])]) + for idx, g in enumerate(Gn): + pi_i = pi_p_forward[idx][nd] + if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0? + Si_norm += 1 + phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']]) + phi_i_bar /= Si_norm + G_new.nodes[nd]['attributes'] = phi_i_bar + + # update edge labels and adjacency matrix. + if ds_attrs['edge_labeled']: + for nd1, nd2, _ in G.edges(data=True): + h_ij0_list = [] + label_list = [] + for label in get_edge_labels(Gn, edge_label): + h_ij0 = 0 + for idx, g in enumerate(Gn): + pi_i = pi_p_forward[idx][nd1] + pi_j = pi_p_forward[idx][nd2] + h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and + g.has_edge(pi_i, pi_j) and + g.edges[pi_i, pi_j][edge_label] == label) + h_ij0 += h_ij0_p + h_ij0_list.append(h_ij0) + label_list.append(label) + # choose one of the best randomly. + idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist() + h_ij0_max = h_ij0_list[idx_max[0]] + idx_rdm = random.randint(0, len(idx_max) - 1) + best_label = label_list[idx_max[idx_rdm]] + + # check whether a_ij is 0 or 1. + sij_norm = 0 + for idx, g in enumerate(Gn): + pi_i = pi_p_forward[idx][nd1] + pi_j = pi_p_forward[idx][nd2] + if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): + sij_norm += 1 + if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es): + if not G_new.has_edge(nd1, nd2): + G_new.add_edge(nd1, nd2) + G_new.edges[nd1, nd2][edge_label] = best_label + else: + if G_new.has_edge(nd1, nd2): + G_new.remove_edge(nd1, nd2) + else: # if edges are unlabeled + # @todo: works only for undirected graphs. + for nd1 in range(nx.number_of_nodes(G)): + for nd2 in range(nd1 + 1, nx.number_of_nodes(G)): + sij_norm = 0 + for idx, g in enumerate(Gn): + pi_i = pi_p_forward[idx][nd1] + pi_j = pi_p_forward[idx][nd2] + if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): + sij_norm += 1 + if sij_norm > len(Gn) * c_er / (c_er + c_ei): + if not G_new.has_edge(nd1, nd2): + G_new.add_edge(nd1, nd2) + elif sij_norm < len(Gn) * c_er / (c_er + c_ei): + if G_new.has_edge(nd1, nd2): + G_new.remove_edge(nd1, nd2) + # do not change anything when equal. 
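For unlabeled edges the update above degenerates to a frequency vote: an edge survives iff it is mapped onto an existing edge in more than c_er/(c_er + c_ei) of the graphs, which is a plain majority when c_er equals c_ei. A hedged one-liner with a toy check:

def keep_unlabeled_edge(sij_norm, n_graphs, c_ei, c_er):
    # Strict inequality: on a tie the caller leaves the edge unchanged.
    return sij_norm > n_graphs * c_er / (c_er + c_ei)

assert keep_unlabeled_edge(6, 10, 3, 3)       # 6 > 10 * 0.5
assert not keep_unlabeled_edge(5, 10, 3, 3)   # a tie is not enough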
+
+        G = G_new.copy()
+
+        # update pi_p
+        pi_p_forward = []
+        for G_p in Gn:
+            dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p)
+            pi_p_forward.append(pi_tmp_forward)
+
+    return G
+
+
+###############################################################################
+
+if __name__ == '__main__':
+    from gklearn.utils.graphfiles import loadDataset
+    ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
+          'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}  # node/edge symb
+#    ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
+#          'extra_params': {}}  # node nsymb
+#    ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
+#          'extra_params': {}}
+    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
+
+    iam(Gn)
\ No newline at end of file
diff --git a/gklearn/preimage/knn.py b/gklearn/preimage/knn.py
new file mode 100644
index 0000000..c179287
--- /dev/null
+++ b/gklearn/preimage/knn.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jan 10 13:22:04 2020
+
+@author: ljia
+"""
+import sys
+import numpy as np
+#import matplotlib.pyplot as plt
+from tqdm import tqdm
+import random
+#import csv
+from shutil import copyfile
+import os
+
+from gklearn.preimage.iam import iam_bash
+from gklearn.utils.graphfiles import loadDataset, loadGXL
+from gklearn.preimage.ged import GED
+from gklearn.preimage.utils import get_same_item_indices
+
+def test_knn():
+    ds = {'name': 'monoterpenoides',
+          'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}  # node/edge symb
+    Gn, y_all = loadDataset(ds['dataset'])
+#    Gn = Gn[0:50]
+#    gkernel = 'treeletkernel'
+#    node_label = 'atom'
+#    edge_label = 'bond_type'
+#    ds_name = 'mono'
+    dir_output = 'results/knn/'
+    graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/../../datasets/monoterpenoides/'
+
+    k_nn = 1
+    percent = 0.1
+    repeats = 50
+    edit_cost_constant = [3, 3, 1, 3, 3, 1]
+
+    # get indices by classes.
+    y_idx = get_same_item_indices(y_all)
+    accuracy_sm_list = []
+    accuracy_gm_list = []
+    sod_sm_list = []
+    sod_gm_list = []
+    for repeat in range(0, repeats):
+        print('\n---------------------------------')
+        print('repeat =', repeat)
+
+        random.seed(repeat)
+        set_median_list = []
+        gen_median_list = []
+        train_y_set = []
+        for y, values in y_idx.items():
+            print('\ny =', y)
+            size_median_set = int(len(values) * percent)
+            median_set_idx = random.sample(values, size_median_set)
+            print('median set: ', median_set_idx)
+
+            # compute set median and gen median using IAM (C++ through bash).
+#            Gn_median = [Gn[idx] for idx in median_set_idx]
+            group_fnames = [Gn[g].graph['filename'] for g in median_set_idx]
+            sod_sm, sod_gm, fname_sm, fname_gm = iam_bash(group_fnames, edit_cost_constant,
+                                                          graph_dir=graph_dir)
+            print('sod_sm, sod_gm:', sod_sm, sod_gm)
+            sod_sm_list.append(sod_sm)
+            sod_gm_list.append(sod_gm)
+            fname_sm_new = dir_output + 'medians/set_median.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl'
+            copyfile(fname_sm, fname_sm_new)
+            fname_gm_new = dir_output + 'medians/gen_median.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl'
+            copyfile(fname_gm, fname_gm_new)
+            set_median_list.append(loadGXL(fname_sm_new))
+            gen_median_list.append(loadGXL(fname_gm_new))
+            train_y_set.append(int(y))
+
+        print(sod_sm, sod_gm)
+
+        # do 1-nn.
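The knn() helper defined just below hard-codes the k = 1 case. A general majority-vote variant over GED distances could look like this (a sketch; `ged_distance` is an assumed callable returning a single scalar, not part of the restored module):

import heapq
from collections import Counter

def knn_predict(g_test, train_set, train_y, ged_distance, k=3):
    # Rank training graphs by edit distance and vote among the k nearest.
    dists = [(ged_distance(g_test, g), y) for g, y in zip(train_set, train_y)]
    nearest = heapq.nsmallest(k, dists, key=lambda t: t[0])
    return Counter(y for _, y in nearest).most_common(1)[0][0]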
+        test_y_set = [int(y) for y in y_all]
+        accuracy_sm = knn(set_median_list, train_y_set, Gn, test_y_set, k=k_nn, distance='ged')
+        accuracy_gm = knn(gen_median_list, train_y_set, Gn, test_y_set, k=k_nn, distance='ged')
+        accuracy_sm_list.append(accuracy_sm)
+        accuracy_gm_list.append(accuracy_gm)
+        print('current accuracy sm and gm:', accuracy_sm, accuracy_gm)
+
+    # output
+    accuracy_sm_mean = np.mean(accuracy_sm_list)
+    accuracy_gm_mean = np.mean(accuracy_gm_list)
+    print('\ntotal average accuracy sm and gm:', accuracy_sm_mean, accuracy_gm_mean)
+
+
+def knn(train_set, train_y_set, test_set, test_y_set, k=1, distance='ged'):
+    if k == 1 and distance == 'ged':
+        algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
+        params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
+                      'algo_options': algo_options, 'stabilizer': None}
+        accuracy = 0
+        for idx_test, g_test in tqdm(enumerate(test_set), desc='computing 1-nn',
+                                     file=sys.stdout):
+            dis = np.inf
+            for idx_train, g_train in enumerate(train_set):
+                dis_cur, _, _ = GED(g_test, g_train, **params_ged)
+                if dis_cur < dis:
+                    dis = dis_cur
+                    test_y_cur = train_y_set[idx_train]
+            if test_y_cur == test_y_set[idx_test]:
+                accuracy += 1
+        accuracy = accuracy / len(test_set)
+
+    return accuracy
+
+
+
+if __name__ == '__main__':
+    test_knn()
\ No newline at end of file
diff --git a/gklearn/preimage/libs.py b/gklearn/preimage/libs.py
new file mode 100644
index 0000000..76005c6
--- /dev/null
+++ b/gklearn/preimage/libs.py
@@ -0,0 +1,6 @@
+import sys
+import pathlib
+
+# insert gedlibpy library.
+sys.path.insert(0, "../../../")
+from gedlibpy import librariesImport, gedlibpy
diff --git a/gklearn/preimage/median.py b/gklearn/preimage/median.py
new file mode 100644
index 0000000..1c5bb0f
--- /dev/null
+++ b/gklearn/preimage/median.py
@@ -0,0 +1,218 @@
+import sys
+sys.path.insert(0, "../")
+#import pathlib
+import numpy as np
+import networkx as nx
+import time
+
+from gedlibpy import librariesImport, gedlibpy
+#import script
+sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/")
+import gklearn
+from gklearn.utils.graphfiles import loadDataset
+
+def replace_graph_in_env(script, graph, old_id, label='median'):
+    """
+    Replace a graph in script.
+
+    If old_id is -1, add a new graph to the environment.
+
+    """
+    if(old_id > -1):
+        script.PyClearGraph(old_id)
+    new_id = script.PyAddGraph(label)
+    for i in graph.nodes():
+        script.PyAddNode(new_id, str(i), graph.node[i])  # !!
strings are required bt gedlib + for e in graph.edges: + script.PyAddEdge(new_id, str(e[0]),str(e[1]), {}) + script.PyInitEnv() + script.PySetMethod("IPFP", "") + script.PyInitMethod() + + return new_id + +#Dessin median courrant +def draw_Letter_graph(graph, savepath=''): + import numpy as np + import networkx as nx + import matplotlib.pyplot as plt + plt.figure() + pos = {} + for n in graph.nodes: + pos[n] = np.array([float(graph.node[n]['attributes'][0]), + float(graph.node[n]['attributes'][1])]) + nx.draw_networkx(graph, pos) + if savepath != '': + plt.savefig(savepath + str(time.time()) + '.eps', format='eps', dpi=300) + plt.show() + plt.clf() + +#compute new mappings +def update_mappings(script,median_id,listID): + med_distances = {} + med_mappings = {} + sod = 0 + for i in range(0,len(listID)): + script.PyRunMethod(median_id,listID[i]) + med_distances[i] = script.PyGetUpperBound(median_id,listID[i]) + med_mappings[i] = script.PyGetForwardMap(median_id,listID[i]) + sod += med_distances[i] + return med_distances, med_mappings, sod + +def calcul_Sij(all_mappings, all_graphs,i,j): + s_ij = 0 + for k in range(0,len(all_mappings)): + cur_graph = all_graphs[k] + cur_mapping = all_mappings[k] + size_graph = cur_graph.order() + if ((cur_mapping[i] < size_graph) and + (cur_mapping[j] < size_graph) and + (cur_graph.has_edge(cur_mapping[i], cur_mapping[j]) == True)): + s_ij += 1 + + return s_ij + +# def update_median_nodes_L1(median,listIdSet,median_id,dataset, mappings): +# from scipy.stats.mstats import gmean + +# for i in median.nodes(): +# for k in listIdSet: +# vectors = [] #np.zeros((len(listIdSet),2)) +# if(k != median_id): +# phi_i = mappings[k][i] +# if(phi_i < dataset[k].order()): +# vectors.append([float(dataset[k].node[phi_i]['x']),float(dataset[k].node[phi_i]['y'])]) + +# new_labels = gmean(vectors) +# median.node[i]['x'] = str(new_labels[0]) +# median.node[i]['y'] = str(new_labels[1]) +# return median + +def update_median_nodes(median,dataset,mappings): + #update node attributes + for i in median.nodes(): + nb_sub=0 + mean_label = {'x' : 0, 'y' : 0} + for k in range(0,len(mappings)): + phi_i = mappings[k][i] + if ( phi_i < dataset[k].order() ): + nb_sub += 1 + mean_label['x'] += 0.75*float(dataset[k].node[phi_i]['x']) + mean_label['y'] += 0.75*float(dataset[k].node[phi_i]['y']) + median.node[i]['x'] = str((1/0.75)*(mean_label['x']/nb_sub)) + median.node[i]['y'] = str((1/0.75)*(mean_label['y']/nb_sub)) + return median + +def update_median_edges(dataset, mappings, median, cei=0.425,cer=0.425): +#for letter high, ceir = 1.7, alpha = 0.75 + size_dataset = len(dataset) + ratio_cei_cer = cer/(cei + cer) + threshold = size_dataset*ratio_cei_cer + order_graph_median = median.order() + for i in range(0,order_graph_median): + for j in range(i+1,order_graph_median): + s_ij = calcul_Sij(mappings,dataset,i,j) + if(s_ij > threshold): + median.add_edge(i,j) + else: + if(median.has_edge(i,j)): + median.remove_edge(i,j) + return median + + + +def compute_median(script, listID, dataset,verbose=False): + """Compute a graph median of a dataset according to an environment + + Parameters + + script : An gedlib initialized environnement + listID (list): a list of ID in script: encodes the dataset + dataset (list): corresponding graphs in networkX format. 
We assume that graph + listID[i] corresponds to dataset[i] + + Returns: + A networkX graph, which is the median, with corresponding sod + """ + print(len(listID)) + median_set_index, median_set_sod = compute_median_set(script, listID) + print(median_set_index) + print(median_set_sod) + sods = [] + #Ajout median dans environnement + set_median = dataset[median_set_index].copy() + median = dataset[median_set_index].copy() + cur_med_id = replace_graph_in_env(script,median,-1) + med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID) + sods.append(cur_sod) + if(verbose): + print(cur_sod) + ite_max = 50 + old_sod = cur_sod * 2 + ite = 0 + epsilon = 0.001 + + best_median + while((ite < ite_max) and (np.abs(old_sod - cur_sod) > epsilon )): + median = update_median_nodes(median,dataset, med_mappings) + median = update_median_edges(dataset,med_mappings,median) + + cur_med_id = replace_graph_in_env(script,median,cur_med_id) + med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID) + + + sods.append(cur_sod) + if(verbose): + print(cur_sod) + ite += 1 + return median, cur_sod, sods, set_median + + draw_Letter_graph(median) + + +def compute_median_set(script,listID): + 'Returns the id in listID corresponding to median set' + #Calcul median set + N=len(listID) + map_id_to_index = {} + map_index_to_id = {} + for i in range(0,len(listID)): + map_id_to_index[listID[i]] = i + map_index_to_id[i] = listID[i] + + distances = np.zeros((N,N)) + for i in listID: + for j in listID: + script.PyRunMethod(i,j) + distances[map_id_to_index[i],map_id_to_index[j]] = script.PyGetUpperBound(i,j) + + median_set_index = np.argmin(np.sum(distances,0)) + sod = np.min(np.sum(distances,0)) + + return median_set_index, sod + +if __name__ == "__main__": + #Chargement du dataset + script.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml') + script.PySetEditCost("LETTER") + script.PyInitEnv() + script.PySetMethod("IPFP", "") + script.PyInitMethod() + + dataset,my_y = gklearn.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl") + + listID = script.PyGetAllGraphIds() + median, sod = compute_median(script,listID,dataset,verbose=True) + + print(sod) + draw_Letter_graph(median) + + +#if __name__ == '__main__': +# # test draw_Letter_graph +# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt', +# 'extra_params': {}} # node nsymb +# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) +# print(y_all) +# for g in Gn: +# draw_Letter_graph(g) \ No newline at end of file diff --git a/gklearn/preimage/median_benoit.py b/gklearn/preimage/median_benoit.py new file mode 100644 index 0000000..6712196 --- /dev/null +++ b/gklearn/preimage/median_benoit.py @@ -0,0 +1,201 @@ +import sys +import pathlib +import numpy as np +import networkx as nx + +import librariesImport +import script +sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/") +import gklearn + +def replace_graph_in_env(script, graph, old_id, label='median'): + """ + Replace a graph in script + + If old_id is -1, add a new graph to the environnemt + + """ + if(old_id > -1): + script.PyClearGraph(old_id) + new_id = script.PyAddGraph(label) + for i in graph.nodes(): + script.PyAddNode(new_id,str(i),graph.node[i]) # !! 
strings are required bt gedlib + for e in graph.edges: + script.PyAddEdge(new_id, str(e[0]),str(e[1]), {}) + script.PyInitEnv() + script.PySetMethod("IPFP", "") + script.PyInitMethod() + + return new_id + +#Dessin median courrant +def draw_Letter_graph(graph): + import numpy as np + import networkx as nx + import matplotlib.pyplot as plt + plt.figure() + pos = {} + for n in graph.nodes: + pos[n] = np.array([float(graph.node[n]['x']),float(graph.node[n]['y'])]) + nx.draw_networkx(graph,pos) + plt.show() + +#compute new mappings +def update_mappings(script,median_id,listID): + med_distances = {} + med_mappings = {} + sod = 0 + for i in range(0,len(listID)): + script.PyRunMethod(median_id,listID[i]) + med_distances[i] = script.PyGetUpperBound(median_id,listID[i]) + med_mappings[i] = script.PyGetForwardMap(median_id,listID[i]) + sod += med_distances[i] + return med_distances, med_mappings, sod + +def calcul_Sij(all_mappings, all_graphs,i,j): + s_ij = 0 + for k in range(0,len(all_mappings)): + cur_graph = all_graphs[k] + cur_mapping = all_mappings[k] + size_graph = cur_graph.order() + if ((cur_mapping[i] < size_graph) and + (cur_mapping[j] < size_graph) and + (cur_graph.has_edge(cur_mapping[i], cur_mapping[j]) == True)): + s_ij += 1 + + return s_ij + +# def update_median_nodes_L1(median,listIdSet,median_id,dataset, mappings): +# from scipy.stats.mstats import gmean + +# for i in median.nodes(): +# for k in listIdSet: +# vectors = [] #np.zeros((len(listIdSet),2)) +# if(k != median_id): +# phi_i = mappings[k][i] +# if(phi_i < dataset[k].order()): +# vectors.append([float(dataset[k].node[phi_i]['x']),float(dataset[k].node[phi_i]['y'])]) + +# new_labels = gmean(vectors) +# median.node[i]['x'] = str(new_labels[0]) +# median.node[i]['y'] = str(new_labels[1]) +# return median + +def update_median_nodes(median,dataset,mappings): + #update node attributes + for i in median.nodes(): + nb_sub=0 + mean_label = {'x' : 0, 'y' : 0} + for k in range(0,len(mappings)): + phi_i = mappings[k][i] + if ( phi_i < dataset[k].order() ): + nb_sub += 1 + mean_label['x'] += 0.75*float(dataset[k].node[phi_i]['x']) + mean_label['y'] += 0.75*float(dataset[k].node[phi_i]['y']) + median.node[i]['x'] = str((1/0.75)*(mean_label['x']/nb_sub)) + median.node[i]['y'] = str((1/0.75)*(mean_label['y']/nb_sub)) + return median + +def update_median_edges(dataset, mappings, median, cei=0.425,cer=0.425): +#for letter high, ceir = 1.7, alpha = 0.75 + size_dataset = len(dataset) + ratio_cei_cer = cer/(cei + cer) + threshold = size_dataset*ratio_cei_cer + order_graph_median = median.order() + for i in range(0,order_graph_median): + for j in range(i+1,order_graph_median): + s_ij = calcul_Sij(mappings,dataset,i,j) + if(s_ij > threshold): + median.add_edge(i,j) + else: + if(median.has_edge(i,j)): + median.remove_edge(i,j) + return median + + + +def compute_median(script, listID, dataset,verbose=False): + """Compute a graph median of a dataset according to an environment + + Parameters + + script : An gedlib initialized environnement + listID (list): a list of ID in script: encodes the dataset + dataset (list): corresponding graphs in networkX format. 
We assume that graph + listID[i] corresponds to dataset[i] + + Returns: + A networkX graph, which is the median, with corresponding sod + """ + print(len(listID)) + median_set_index, median_set_sod = compute_median_set(script, listID) + print(median_set_index) + print(median_set_sod) + sods = [] + #Ajout median dans environnement + set_median = dataset[median_set_index].copy() + median = dataset[median_set_index].copy() + cur_med_id = replace_graph_in_env(script,median,-1) + med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID) + sods.append(cur_sod) + if(verbose): + print(cur_sod) + ite_max = 50 + old_sod = cur_sod * 2 + ite = 0 + epsilon = 0.001 + + best_median + while((ite < ite_max) and (np.abs(old_sod - cur_sod) > epsilon )): + median = update_median_nodes(median,dataset, med_mappings) + median = update_median_edges(dataset,med_mappings,median) + + cur_med_id = replace_graph_in_env(script,median,cur_med_id) + med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID) + + + sods.append(cur_sod) + if(verbose): + print(cur_sod) + ite += 1 + return median, cur_sod, sods, set_median + + draw_Letter_graph(median) + + +def compute_median_set(script,listID): + 'Returns the id in listID corresponding to median set' + #Calcul median set + N=len(listID) + map_id_to_index = {} + map_index_to_id = {} + for i in range(0,len(listID)): + map_id_to_index[listID[i]] = i + map_index_to_id[i] = listID[i] + + distances = np.zeros((N,N)) + for i in listID: + for j in listID: + script.PyRunMethod(i,j) + distances[map_id_to_index[i],map_id_to_index[j]] = script.PyGetUpperBound(i,j) + + median_set_index = np.argmin(np.sum(distances,0)) + sod = np.min(np.sum(distances,0)) + + return median_set_index, sod + +if __name__ == "__main__": + #Chargement du dataset + script.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml') + script.PySetEditCost("LETTER") + script.PyInitEnv() + script.PySetMethod("IPFP", "") + script.PyInitMethod() + + dataset,my_y = gklearn.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl") + + listID = script.PyGetAllGraphIds() + median, sod = compute_median(script,listID,dataset,verbose=True) + + print(sod) + draw_Letter_graph(median) diff --git a/gklearn/preimage/median_graph_estimator.py b/gklearn/preimage/median_graph_estimator.py new file mode 100644 index 0000000..b70cc61 --- /dev/null +++ b/gklearn/preimage/median_graph_estimator.py @@ -0,0 +1,826 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Mar 16 18:04:55 2020 + +@author: ljia +""" +import numpy as np +from gklearn.preimage.common_types import AlgorithmState +from gklearn.preimage import misc +from gklearn.preimage.timer import Timer +from gklearn.utils.utils import graph_isIdentical +import time +from tqdm import tqdm +import sys +import networkx as nx + + +class MedianGraphEstimator(object): + + def __init__(self, ged_env, constant_node_costs): + """Constructor. + + Parameters + ---------- + ged_env : gklearn.gedlib.gedlibpy.GEDEnv + Initialized GED environment. The edit costs must be set by the user. + + constant_node_costs : Boolean + Set to True if the node relabeling costs are constant. 
+ """ + self.__ged_env = ged_env + self.__init_method = 'BRANCH_FAST' + self.__init_options = '' + self.__descent_method = 'BRANCH_FAST' + self.__descent_options = '' + self.__refine_method = 'IPFP' + self.__refine_options = '' + self.__constant_node_costs = constant_node_costs + self.__labeled_nodes = (ged_env.get_num_node_labels() > 1) + self.__node_del_cost = ged_env.get_node_del_cost(ged_env.get_node_label(1)) + self.__node_ins_cost = ged_env.get_node_ins_cost(ged_env.get_node_label(1)) + self.__labeled_edges = (ged_env.get_num_edge_labels() > 1) + self.__edge_del_cost = ged_env.get_edge_del_cost(ged_env.get_edge_label(1)) + self.__edge_ins_cost = ged_env.get_edge_ins_cost(ged_env.get_edge_label(1)) + self.__init_type = 'RANDOM' + self.__num_random_inits = 10 + self.__desired_num_random_inits = 10 + self.__use_real_randomness = True + self.__seed = 0 + self.__refine = True + self.__time_limit_in_sec = 0 + self.__epsilon = 0.0001 + self.__max_itrs = 100 + self.__max_itrs_without_update = 3 + self.__num_inits_increase_order = 10 + self.__init_type_increase_order = 'K-MEANS++' + self.__max_itrs_increase_order = 10 + self.__print_to_stdout = 2 + self.__median_id = np.inf # @todo: check + self.__median_node_id_prefix = '' # @todo: check + self.__node_maps_from_median = {} + self.__sum_of_distances = 0 + self.__best_init_sum_of_distances = np.inf + self.__converged_sum_of_distances = np.inf + self.__runtime = None + self.__runtime_initialized = None + self.__runtime_converged = None + self.__itrs = [] # @todo: check: {} ? + self.__num_decrease_order = 0 + self.__num_increase_order = 0 + self.__num_converged_descents = 0 + self.__state = AlgorithmState.TERMINATED + + if ged_env is None: + raise Exception('The GED environment pointer passed to the constructor of MedianGraphEstimator is null.') + elif not ged_env.is_initialized(): + raise Exception('The GED environment is uninitialized. Call gedlibpy.GEDEnv.init() before passing it to the constructor of MedianGraphEstimator.') + + + def set_options(self, options): + """Sets the options of the estimator. + + Parameters + ---------- + options : string + String that specifies with which options to run the estimator. + """ + self.__set_default_options() + options_map = misc.options_string_to_options_map(options) + for opt_name, opt_val in options_map.items(): + if opt_name == 'init-type': + self.__init_type = opt_val + if opt_val != 'MEDOID' and opt_val != 'RANDOM' and opt_val != 'MIN' and opt_val != 'MAX' and opt_val != 'MEAN': + raise Exception('Invalid argument ' + opt_val + ' for option init-type. Usage: options = "[--init-type RANDOM|MEDOID|EMPTY|MIN|MAX|MEAN] [...]"') + elif opt_name == 'random-inits': + try: + self.__num_random_inits = int(opt_val) + self.__desired_num_random_inits = self.__num_random_inits + except: + raise Exception('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits ]"') + + if self.__num_random_inits <= 0: + raise Exception('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits ]"') + + elif opt_name == 'randomness': + if opt_val == 'PSEUDO': + self.__use_real_randomness = False + + elif opt_val == 'REAL': + self.__use_real_randomness = True + + else: + raise Exception('Invalid argument "' + opt_val + '" for option randomness. 
Usage: options = "[--randomness REAL|PSEUDO] [...]"') + + elif opt_name == 'stdout': + if opt_val == '0': + self.__print_to_stdout = 0 + + elif opt_val == '1': + self.__print_to_stdout = 1 + + elif opt_val == '2': + self.__print_to_stdout = 2 + + else: + raise Exception('Invalid argument "' + opt_val + '" for option stdout. Usage: options = "[--stdout 0|1|2] [...]"') + + elif opt_name == 'refine': + if opt_val == 'TRUE': + self.__refine = True + + elif opt_val == 'FALSE': + self.__refine = False + + else: + raise Exception('Invalid argument "' + opt_val + '" for option refine. Usage: options = "[--refine TRUE|FALSE] [...]"') + + elif opt_name == 'time-limit': + try: + self.__time_limit_in_sec = float(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option time-limit. Usage: options = "[--time-limit ] [...]') + + elif opt_name == 'max-itrs': + try: + self.__max_itrs = int(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option max-itrs. Usage: options = "[--max-itrs ] [...]') + + elif opt_name == 'max-itrs-without-update': + try: + self.__max_itrs_without_update = int(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-without-update. Usage: options = "[--max-itrs-without-update ] [...]') + + elif opt_name == 'seed': + try: + self.__seed = int(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option seed. Usage: options = "[--seed ] [...]') + + elif opt_name == 'epsilon': + try: + self.__epsilon = float(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon ] [...]') + + if self.__epsilon <= 0: + raise Exception('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon ] [...]') + + elif opt_name == 'inits-increase-order': + try: + self.__num_inits_increase_order = int(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order ]"') + + if self.__num_inits_increase_order <= 0: + raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order ]"') + + elif opt_name == 'init-type-increase-order': + self.__init_type_increase_order = opt_val + if opt_val != 'CLUSTERS' and opt_val != 'K-MEANS++': + raise Exception('Invalid argument ' + opt_val + ' for option init-type-increase-order. Usage: options = "[--init-type-increase-order CLUSTERS|K-MEANS++] [...]"') + + elif opt_name == 'max-itrs-increase-order': + try: + self.__max_itrs_increase_order = int(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-increase-order. Usage: options = "[--max-itrs-increase-order ] [...]') + + else: + valid_options = '[--init-type ] [--random-inits ] [--randomness ] [--seed ] [--stdout ] ' + valid_options += '[--time-limit ] [--max-itrs ] [--epsilon ] ' + valid_options += '[--inits-increase-order ] [--init-type-increase-order ] [--max-itrs-increase-order ]' + raise Exception('Invalid option "' + opt_name + '". Usage: options = "' + valid_options + '"') + + + def set_init_method(self, init_method, init_options=''): + """Selects method to be used for computing the initial medoid graph. + + Parameters + ---------- + init_method : string + The selected method. Default: ged::Options::GEDMethod::BRANCH_UNIFORM. + + init_options : string + The options for the selected method. Default: "". 
+ + Notes + ----- + Has no effect unless "--init-type MEDOID" is passed to set_options(). + """ + self.__init_method = init_method; + self.__init_options = init_options; + + + def set_descent_method(self, descent_method, descent_options=''): + """Selects method to be used for block gradient descent.. + + Parameters + ---------- + descent_method : string + The selected method. Default: ged::Options::GEDMethod::BRANCH_FAST. + + descent_options : string + The options for the selected method. Default: "". + + Notes + ----- + Has no effect unless "--init-type MEDOID" is passed to set_options(). + """ + self.__descent_method = descent_method; + self.__descent_options = descent_options; + + + def set_refine_method(self, refine_method, refine_options): + """Selects method to be used for improving the sum of distances and the node maps for the converged median. + + Parameters + ---------- + refine_method : string + The selected method. Default: "IPFP". + + refine_options : string + The options for the selected method. Default: "". + + Notes + ----- + Has no effect if "--refine FALSE" is passed to set_options(). + """ + self.__refine_method = refine_method + self.__refine_options = refine_options + + + def run(self, graph_ids, set_median_id, gen_median_id): + """Computes a generalized median graph. + + Parameters + ---------- + graph_ids : list[integer] + The IDs of the graphs for which the median should be computed. Must have been added to the environment passed to the constructor. + + set_median_id : integer + The ID of the computed set-median. A dummy graph with this ID must have been added to the environment passed to the constructor. Upon termination, the computed median can be obtained via gklearn.gedlib.gedlibpy.GEDEnv.get_graph(). + + + gen_median_id : integer + The ID of the computed generalized median. Upon termination, the computed median can be obtained via gklearn.gedlib.gedlibpy.GEDEnv.get_graph(). + """ + # Sanity checks. + if len(graph_ids) == 0: + raise Exception('Empty vector of graph IDs, unable to compute median.') + all_graphs_empty = True + for graph_id in graph_ids: + if self.__ged_env.get_graph_num_nodes(graph_id) > 0: + self.__median_node_id_prefix = self.__ged_env.get_original_node_ids(graph_id)[0] + all_graphs_empty = False + break + if all_graphs_empty: + raise Exception('All graphs in the collection are empty.') + + # Start timer and record start time. + start = time.time() + timer = Timer(self.__time_limit_in_sec) + self.__median_id = gen_median_id + self.__state = AlgorithmState.TERMINATED + + # Get ExchangeGraph representations of the input graphs. + graphs = {} + for graph_id in graph_ids: + # @todo: get_nx_graph() function may need to be modified according to the coming code. + graphs[graph_id] = self.__ged_env.get_nx_graph(graph_id, True, True, False) +# print(self.__ged_env.get_graph_internal_id(0)) +# print(graphs[0].graph) +# print(graphs[0].nodes(data=True)) +# print(graphs[0].edges(data=True)) +# print(nx.adjacency_matrix(graphs[0])) + + + # Construct initial medians. + medians = [] + self.__construct_initial_medians(graph_ids, timer, medians) + end_init = time.time() + self.__runtime_initialized = end_init - start +# print(medians[0].graph) +# print(medians[0].nodes(data=True)) +# print(medians[0].edges(data=True)) +# print(nx.adjacency_matrix(medians[0])) + + # Reset information about iterations and number of times the median decreases and increases. 
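Stripped of logging and bookkeeping, the block gradient descent that run() performs below alternates two updates until neither changes anything. A schematic sketch (the helper names are placeholders, not the estimator's API):

def block_gradient_descent(median, graphs, update_median, update_node_maps,
                           max_itrs=100):
    # Alternate between re-estimating the median for the current node maps
    # and re-estimating the node maps for the updated median.
    for itr in range(max_itrs):
        median_modified = update_median(median, graphs)
        maps_modified = update_node_maps(median, graphs)
        if not (median_modified or maps_modified):
            break  # converged: a fixed point of both updates.
    return median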
+ self.__itrs = [0] * len(medians) + self.__num_decrease_order = 0 + self.__num_increase_order = 0 + self.__num_converged_descents = 0 + + # Initialize the best median. + best_sum_of_distances = np.inf + self.__best_init_sum_of_distances = np.inf + node_maps_from_best_median = {} + + # Run block gradient descent from all initial medians. + self.__ged_env.set_method(self.__descent_method, self.__descent_options) + for median_pos in range(0, len(medians)): + + # Terminate if the timer has expired and at least one SOD has been computed. + if timer.expired() and median_pos > 0: + break + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('\n===========================================================') + print('Block gradient descent for initial median', str(median_pos + 1), 'of', str(len(medians)), '.') + print('-----------------------------------------------------------') + + # Get reference to the median. + median = medians[median_pos] + + # Load initial median into the environment. + self.__ged_env.load_nx_graph(median, gen_median_id) + self.__ged_env.init(self.__ged_env.get_init_type()) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + progress = tqdm(desc='\rComputing initial node maps', total=len(graph_ids), file=sys.stdout) + + # Compute node maps and sum of distances for initial median. + self.__sum_of_distances = 0 + self.__node_maps_from_median.clear() # @todo + for graph_id in graph_ids: + self.__ged_env.run_method(gen_median_id, graph_id) + self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(gen_median_id, graph_id) +# print(self.__node_maps_from_median[graph_id]) + self.__sum_of_distances += self.__ged_env.get_induced_cost(gen_median_id, graph_id) # @todo: the C++ implementation for this function in GedLibBind.ipp re-call get_node_map() once more, this is not neccessary. +# print(self.__sum_of_distances) + # Print information about current iteration. + if self.__print_to_stdout == 2: + progress.update(1) + + self.__best_init_sum_of_distances = min(self.__best_init_sum_of_distances, self.__sum_of_distances) + self.__ged_env.load_nx_graph(median, set_median_id) +# print(self.__best_init_sum_of_distances) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('\n') + + # Run block gradient descent from initial median. + converged = False + itrs_without_update = 0 + while not self.__termination_criterion_met(converged, timer, self.__itrs[median_pos], itrs_without_update): + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('\n===========================================================') + print('Iteration', str(self.__itrs[median_pos] + 1), 'for initial median', str(median_pos + 1), 'of', str(len(medians)), '.') + print('-----------------------------------------------------------') + + # Initialize flags that tell us what happened in the iteration. + median_modified = False + node_maps_modified = False + decreased_order = False + increased_order = False + + # Update the median. # @todo!!!!!!!!!!!!!!!!!!!!!! + median_modified = self.__update_median(graphs, median) + if not median_modified or self.__itrs[median_pos] == 0: + decreased_order = False + if not decreased_order or self.__itrs[median_pos] == 0: + increased_order = False + + # Update the number of iterations without update of the median. 
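Throughout the loop above, an update only counts as a modification when it improves the relevant cost by more than the configured epsilon, which keeps the descent from cycling on numerical noise. The guard in isolation (an illustrative sketch):

def is_real_improvement(new_cost, old_cost, epsilon=0.0001):
    # Mirrors the epsilon guard used when deciding whether an update counts.
    return new_cost < old_cost - epsilon

assert is_real_improvement(9.99, 10.0)
assert not is_real_improvement(10.0 - 1e-6, 10.0)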
+ if median_modified or decreased_order or increased_order: + itrs_without_update = 0 + else: + itrs_without_update += 1 + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('Loading median to environment: ... ', end='') + + # Load the median into the environment. + # @todo: should this function use the original node label? + self.__ged_env.load_nx_graph(median, gen_median_id) + self.__ged_env.init(self.__ged_env.get_init_type()) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('done.') + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('Updating induced costs: ... ', end='') + + # Compute induced costs of the old node maps w.r.t. the updated median. + for graph_id in graph_ids: +# print(self.__ged_env.get_induced_cost(gen_median_id, graph_id)) + # @todo: watch out if compute_induced_cost is correct, this may influence: increase/decrease order, induced_cost() in the following code.!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + self.__ged_env.compute_induced_cost(gen_median_id, graph_id) +# print('---------------------------------------') +# print(self.__ged_env.get_induced_cost(gen_median_id, graph_id)) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('done.') + + # Update the node maps. + node_maps_modified = self.__update_node_maps() # @todo + + # Update the order of the median if no improvement can be found with the current order. + + # Update the sum of distances. + old_sum_of_distances = self.__sum_of_distances + self.__sum_of_distances = 0 + for graph_id in self.__node_maps_from_median: + self.__sum_of_distances += self.__ged_env.get_induced_cost(gen_median_id, graph_id) # @todo: see above. + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('Old local SOD: ', old_sum_of_distances) + print('New local SOD: ', self.__sum_of_distances) + print('Best converged SOD: ', best_sum_of_distances) + print('Modified median: ', median_modified) + print('Modified node maps: ', node_maps_modified) + print('Decreased order: ', decreased_order) + print('Increased order: ', increased_order) + print('===========================================================\n') + + converged = not (median_modified or node_maps_modified or decreased_order or increased_order) + + self.__itrs[median_pos] += 1 + + # Update the best median. + if self.__sum_of_distances < self.__best_init_sum_of_distances: + best_sum_of_distances = self.__sum_of_distances + node_maps_from_best_median = self.__node_maps_from_median + best_median = median + + # Update the number of converged descents. + if converged: + self.__num_converged_descents += 1 + + # Store the best encountered median. + self.__sum_of_distances = best_sum_of_distances + self.__node_maps_from_median = node_maps_from_best_median + self.__ged_env.load_nx_graph(best_median, gen_median_id) + self.__ged_env.init(self.__ged_env.get_init_type()) + end_descent = time.time() + self.__runtime_converged = end_descent - start + + # Refine the sum of distances and the node maps for the converged median. + self.__converged_sum_of_distances = self.__sum_of_distances + if self.__refine: + self.__improve_sum_of_distances(timer) # @todo + + # Record end time, set runtime and reset the number of initial medians. + end = time.time() + self.__runtime = end - start + self.__num_random_inits = self.__desired_num_random_inits + + # Print global information. 
+ if self.__print_to_stdout != 0: + print('\n===========================================================') + print('Finished computation of generalized median graph.') + print('-----------------------------------------------------------') + print('Best SOD after initialization: ', self.__best_init_sum_of_distances) + print('Converged SOD: ', self.__converged_sum_of_distances) + if self.__refine: + print('Refined SOD: ', self.__sum_of_distances) + print('Overall runtime: ', self.__runtime) + print('Runtime of initialization: ', self.__runtime_initialized) + print('Runtime of block gradient descent: ', self.__runtime_converged - self.__runtime_initialized) + if self.__refine: + print('Runtime of refinement: ', self.__runtime - self.__runtime_converged) + print('Number of initial medians: ', len(medians)) + total_itr = 0 + num_started_descents = 0 + for itr in self.__itrs: + total_itr += itr + if itr > 0: + num_started_descents += 1 + print('Size of graph collection: ', len(graph_ids)) + print('Number of started descents: ', num_started_descents) + print('Number of converged descents: ', self.__num_converged_descents) + print('Overall number of iterations: ', total_itr) + print('Overall number of times the order decreased: ', self.__num_decrease_order) + print('Overall number of times the order increased: ', self.__num_increase_order) + print('===========================================================\n') + + + def get_sum_of_distances(self, state=''): + """Returns the sum of distances. + + Parameters + ---------- + state : string + The state of the estimator. Can be 'initialized' or 'converged'. Default: "" + + Returns + ------- + float + The sum of distances (SOD) of the median when the estimator was in the state `state` during the last call to run(). If `state` is not given, the converged SOD (without refinement) or refined SOD (with refinement) is returned. + """ + if not self.__median_available(): + raise Exception('No median has been computed. Call run() before calling get_sum_of_distances().') + if state == 'initialized': + return self.__best_init_sum_of_distances + if state == 'converged': + return self.__converged_sum_of_distances + return self.__sum_of_distances + + + def __set_default_options(self): + self.__init_type = 'RANDOM' + self.__num_random_inits = 10 + self.__desired_num_random_inits = 10 + self.__use_real_randomness = True + self.__seed = 0 + self.__refine = True + self.__time_limit_in_sec = 0 + self.__epsilon = 0.0001 + self.__max_itrs = 100 + self.__max_itrs_without_update = 3 + self.__num_inits_increase_order = 10 + self.__init_type_increase_order = 'K-MEANS++' + self.__max_itrs_increase_order = 10 + self.__print_to_stdout = 2 + + + def __construct_initial_medians(self, graph_ids, timer, initial_medians): + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('\n===========================================================') + print('Constructing initial median(s).') + print('-----------------------------------------------------------') + + # Compute or sample the initial median(s). 
+ initial_medians.clear() + if self.__init_type == 'MEDOID': + self.__compute_medoid(graph_ids, timer, initial_medians) + elif self.__init_type == 'MAX': + pass # @todo +# compute_max_order_graph_(graph_ids, initial_medians) + elif self.__init_type == 'MIN': + pass # @todo +# compute_min_order_graph_(graph_ids, initial_medians) + elif self.__init_type == 'MEAN': + pass # @todo +# compute_mean_order_graph_(graph_ids, initial_medians) + else: + pass # @todo +# sample_initial_medians_(graph_ids, initial_medians) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('===========================================================') + + + def __compute_medoid(self, graph_ids, timer, initial_medians): + # Use method selected for initialization phase. + self.__ged_env.set_method(self.__init_method, self.__init_options) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + progress = tqdm(desc='\rComputing medoid', total=len(graph_ids), file=sys.stdout) + + # Compute the medoid. + medoid_id = graph_ids[0] + best_sum_of_distances = np.inf + for g_id in graph_ids: + if timer.expired(): + self.__state = AlgorithmState.CALLED + break + sum_of_distances = 0 + for h_id in graph_ids: + self.__ged_env.run_method(g_id, h_id) + sum_of_distances += self.__ged_env.get_upper_bound(g_id, h_id) + if sum_of_distances < best_sum_of_distances: + best_sum_of_distances = sum_of_distances + medoid_id = g_id + + # Print information about current iteration. + if self.__print_to_stdout == 2: + progress.update(1) + initial_medians.append(self.__ged_env.get_nx_graph(medoid_id, True, True, False)) # @todo + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('\n') + + + def __termination_criterion_met(self, converged, timer, itr, itrs_without_update): + if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False): + if self.__state == AlgorithmState.TERMINATED: + self.__state = AlgorithmState.INITIALIZED + return True + return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False) + + + def __update_median(self, graphs, median): + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('Updating median: ', end='') + + # Store copy of the old median. + old_median = median.copy() # @todo: this is just a shallow copy. + + # Update the node labels. + if self.__labeled_nodes: + self.__update_node_labels(graphs, median) + + # Update the edges and their labels. + self.__update_edges(graphs, median) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('done.') + + return not self.__are_graphs_equal(median, old_median) + + + def __update_node_labels(self, graphs, median): + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('nodes ... ', end='') + + # Iterate through all nodes of the median. + for i in range(0, nx.number_of_nodes(median)): +# print('i: ', i) + # Collect the labels of the substituted nodes. + node_labels = [] + for graph_id, graph in graphs.items(): +# print('graph_id: ', graph_id) +# print(self.__node_maps_from_median[graph_id]) + k = self.__get_node_image_from_map(self.__node_maps_from_median[graph_id], i) +# print('k: ', k) + if k != np.inf: + node_labels.append(graph.nodes[k]) + + # Compute the median label and update the median. 
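When node relabeling costs are constant, the median label computed in the step just below is simply the most frequent label among the mapped nodes. A hedged stand-in for the environment's median-label query under that assumption (not the gedlibpy implementation):

from collections import Counter

def median_node_label_constant_costs(node_labels):
    # node_labels: list of label dicts collected from the mapped nodes.
    # With constant substitution costs the cost-minimizing label is the mode.
    frozen = [tuple(sorted(l.items())) for l in node_labels]
    return dict(Counter(frozen).most_common(1)[0][0])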
+ if len(node_labels) > 0: + median_label = self.__ged_env.get_median_node_label(node_labels) + if self.__ged_env.get_node_rel_cost(median.nodes[i], median_label) > self.__epsilon: + nx.set_node_attributes(median, {i: median_label}) + + + def __update_edges(self, graphs, median): + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('edges ... ', end='') + + # Clear the adjacency lists of the median and reset number of edges to 0. + median_edges = list(median.edges) + for (head, tail) in median_edges: + median.remove_edge(head, tail) + + # @todo: what if edge is not labeled? + # Iterate through all possible edges (i,j) of the median. + for i in range(0, nx.number_of_nodes(median)): + for j in range(i + 1, nx.number_of_nodes(median)): + + # Collect the labels of the edges to which (i,j) is mapped by the node maps. + edge_labels = [] + for graph_id, graph in graphs.items(): + k = self.__get_node_image_from_map(self.__node_maps_from_median[graph_id], i) + l = self.__get_node_image_from_map(self.__node_maps_from_median[graph_id], j) + if k != np.inf and l != np.inf: + if graph.has_edge(k, l): + edge_labels.append(graph.edges[(k, l)]) + + # Compute the median edge label and the overall edge relabeling cost. + rel_cost = 0 + median_label = self.__ged_env.get_edge_label(1) + if median.has_edge(i, j): + median_label = median.edges[(i, j)] + if self.__labeled_edges and len(edge_labels) > 0: + new_median_label = self.__ged_env.median_edge_label(edge_labels) + if self.__ged_env.get_edge_rel_cost(median_label, new_median_label) > self.__epsilon: + median_label = new_median_label + for edge_label in edge_labels: + rel_cost += self.__ged_env.get_edge_rel_cost(median_label, edge_label) + + # Update the median. + if rel_cost < (self.__edge_ins_cost + self.__edge_del_cost) * len(edge_labels) - self.__edge_del_cost * len(graphs): + median.add_edge(i, j, **median_label) + else: + if median.has_edge(i, j): + median.remove_edge(i, j) + + + def __update_node_maps(self): + # Print information about current iteration. + if self.__print_to_stdout == 2: + progress = tqdm(desc='\rUpdating node maps', total=len(self.__node_maps_from_median), file=sys.stdout) + + # Update the node maps. + node_maps_were_modified = False + for graph_id in self.__node_maps_from_median: + self.__ged_env.run_method(self.__median_id, graph_id) + if self.__ged_env.get_upper_bound(self.__median_id, graph_id) < self.__ged_env.get_induced_cost(self.__median_id, graph_id) - self.__epsilon: # @todo: see above. + self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__median_id, graph_id) # @todo: node_map may not assigned. + node_maps_were_modified = True + # Print information about current iteration. + if self.__print_to_stdout == 2: + progress.update(1) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('\n') + + # Return true if the node maps were modified. + return node_maps_were_modified + + + def __improve_sum_of_distances(self, timer): + pass + + + def __median_available(self): + return self.__median_id != np.inf + + + def __get_node_image_from_map(self, node_map, node): + """ + Return ID of the node mapping of `node` in `node_map`. + + Parameters + ---------- + node_map : list[tuple(int, int)] + List of node maps where the mapping node is found. + + node : int + The mapping node of this node is returned + + Raises + ------ + Exception + If the node with ID `node` is not contained in the source nodes of the node map. 
+    def __get_node_image_from_map(self, node_map, node):
+        """
+        Return the ID of the image of `node` in `node_map`.
+
+        Parameters
+        ----------
+        node_map : list[tuple(int, int)]
+            The node map in which the image of `node` is looked up.
+
+        node : int
+            The node whose image is returned.
+
+        Raises
+        ------
+        Exception
+            If the node with ID `node` is not contained in the source nodes of the node map.
+
+        Returns
+        -------
+        int
+            ID of the image of `node`.
+
+        Notes
+        -----
+        This function is not implemented in the `ged::MedianGraphEstimator` class of the `GEDLIB` library. Instead it is a Python implementation of the `ged::NodeMap::image` function.
+        """
+        if node < len(node_map):
+            return node_map[node][1] if node_map[node][1] < len(node_map) else np.inf
+        else:
+            raise Exception('The node with ID ' + str(node) + ' is not contained in the source nodes of the node map.')
+
+
+    def __are_graphs_equal(self, g1, g2):
+        """
+        Check if two graphs are equal.
+
+        Parameters
+        ----------
+        g1 : NetworkX graph object
+            Graph 1 to be compared.
+
+        g2 : NetworkX graph object
+            Graph 2 to be compared.
+
+        Returns
+        -------
+        bool
+            True if the two graphs are equal.
+
+        Notes
+        -----
+        This is not an isomorphism check. Here the two graphs are equal if and only if their original_node_ids, nodes, all node labels, edges and all edge labels are equal. This function is specifically designed for class `MedianGraphEstimator` and should not be used elsewhere.
+        """
+        # Check the original node IDs.
+        if not g1.graph['original_node_ids'] == g2.graph['original_node_ids']:
+            return False
+        # Check the nodes.
+        nlist1 = [n for n in g1.nodes(data=True)]
+        nlist2 = [n for n in g2.nodes(data=True)]
+        if not nlist1 == nlist2:
+            return False
+        # Check the edges.
+        elist1 = [e for e in g1.edges(data=True)]
+        elist2 = [e for e in g2.edges(data=True)]
+        if not elist1 == elist2:
+            return False
+
+        return True
+
+
+def compute_my_cost(g, h, node_map):
+    cost = 0.0
+    for node in g.nodes:
+        cost += 0 # @todo: incomplete stub; the per-node cost is not computed yet.
+    return cost
\ No newline at end of file
diff --git a/gklearn/preimage/median_linlin.py b/gklearn/preimage/median_linlin.py
new file mode 100644
index 0000000..6139558
--- /dev/null
+++ b/gklearn/preimage/median_linlin.py
@@ -0,0 +1,215 @@
+import sys
+import pathlib
+import numpy as np
+import networkx as nx
+
+from gedlibpy import librariesImport, gedlibpy
+sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/")
+import gklearn
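+
+# Editor's sketch (illustration only; it reuses only gedlibpy calls that appear
+# elsewhere in this module): once the environment has been initialized with
+# PyInitEnv / PySetMethod / PyInitMethod, the GED between two graphs registered
+# under g_id and h_id can be queried as follows.
+def _example_ged_upper_bound(script, g_id, h_id):
+    script.PyRunMethod(g_id, h_id)
+    return script.PyGetUpperBound(g_id, h_id)
+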
+def replace_graph_in_env(script, graph, old_id, label='median'):
+    """
+    Replace a graph in the environment.
+
+    If `old_id` is -1, add a new graph to the environment.
+    """
+    if old_id > -1:
+        script.PyClearGraph(old_id)
+    new_id = script.PyAddGraph(label)
+    for i in graph.nodes():
+        script.PyAddNode(new_id, str(i), graph.node[i]) # !! strings are required by gedlib
+    for e in graph.edges:
+        script.PyAddEdge(new_id, str(e[0]), str(e[1]), {})
+    script.PyInitEnv()
+    script.PySetMethod("IPFP", "")
+    script.PyInitMethod()
+
+    return new_id
+
+# Draw the current median (Letter graphs, with 'x'/'y' node coordinates).
+def draw_Letter_graph(graph):
+    import numpy as np
+    import networkx as nx
+    import matplotlib.pyplot as plt
+    plt.figure()
+    pos = {}
+    for n in graph.nodes:
+        pos[n] = np.array([float(graph.node[n]['x']), float(graph.node[n]['y'])])
+    nx.draw_networkx(graph, pos)
+    plt.show()
+
+# Compute new mappings between the median and every graph of the dataset.
+def update_mappings(script, median_id, listID):
+    med_distances = {}
+    med_mappings = {}
+    sod = 0
+    for i in range(0, len(listID)):
+        script.PyRunMethod(median_id, listID[i])
+        med_distances[i] = script.PyGetUpperBound(median_id, listID[i])
+        med_mappings[i] = script.PyGetForwardMap(median_id, listID[i])
+        sod += med_distances[i]
+    return med_distances, med_mappings, sod
+
+def calcul_Sij(all_mappings, all_graphs, i, j):
+    # Count how many graphs have an edge between the images of nodes i and j.
+    s_ij = 0
+    for k in range(0, len(all_mappings)):
+        cur_graph = all_graphs[k]
+        cur_mapping = all_mappings[k]
+        size_graph = cur_graph.order()
+        if (cur_mapping[i] < size_graph and
+            cur_mapping[j] < size_graph and
+            cur_graph.has_edge(cur_mapping[i], cur_mapping[j])):
+            s_ij += 1
+
+    return s_ij
+
+# def update_median_nodes_L1(median, listIdSet, median_id, dataset, mappings):
+#     from scipy.stats.mstats import gmean
+
+#     for i in median.nodes():
+#         for k in listIdSet:
+#             vectors = [] # np.zeros((len(listIdSet), 2))
+#             if(k != median_id):
+#                 phi_i = mappings[k][i]
+#                 if(phi_i < dataset[k].order()):
+#                     vectors.append([float(dataset[k].node[phi_i]['x']), float(dataset[k].node[phi_i]['y'])])
+
+#         new_labels = gmean(vectors)
+#         median.node[i]['x'] = str(new_labels[0])
+#         median.node[i]['y'] = str(new_labels[1])
+#     return median
+
+def update_median_nodes(median, dataset, mappings):
+    # Update the node attributes.
+    for i in median.nodes():
+        nb_sub = 0
+        mean_label = {'x': 0, 'y': 0}
+        for k in range(0, len(mappings)):
+            phi_i = mappings[k][i]
+            if phi_i < dataset[k].order():
+                nb_sub += 1
+                mean_label['x'] += 0.75 * float(dataset[k].node[phi_i]['x'])
+                mean_label['y'] += 0.75 * float(dataset[k].node[phi_i]['y'])
+        median.node[i]['x'] = str((1 / 0.75) * (mean_label['x'] / nb_sub))
+        median.node[i]['y'] = str((1 / 0.75) * (mean_label['y'] / nb_sub))
+    return median
+
+def update_median_edges(dataset, mappings, median, cei=0.425, cer=0.425):
+    # For Letter HIGH, ceir = 1.7, alpha = 0.75.
+    size_dataset = len(dataset)
+    ratio_cei_cer = cer / (cei + cer)
+    threshold = size_dataset * ratio_cei_cer
+    order_graph_median = median.order()
+    for i in range(0, order_graph_median):
+        for j in range(i + 1, order_graph_median):
+            s_ij = calcul_Sij(mappings, dataset, i, j)
+            if s_ij > threshold:
+                median.add_edge(i, j)
+            else:
+                if median.has_edge(i, j):
+                    median.remove_edge(i, j)
+    return median
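+
+# Editor's worked example (illustration only): with the default costs
+# cei = cer = 0.425, the threshold used by update_median_edges is
+# len(dataset) * cer / (cei + cer) = len(dataset) / 2, so the rule reduces to
+# a majority vote: edge (i, j) is kept iff more than half of the node maps
+# send it onto an existing edge.
+def _edge_threshold_example(size_dataset=10, cei=0.425, cer=0.425):
+    threshold = size_dataset * cer / (cei + cer) # == 5.0 for the defaults
+    return threshold # s_ij = 6 adds the edge (6 > 5); s_ij = 5 removes it
+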
+def compute_median(script, listID, dataset, verbose=False):
+    """Compute a graph median of a dataset according to an environment.
+
+    Parameters
+    ----------
+    script : a gedlib environment, already initialized
+    listID (list) : list of graph IDs in `script`; encodes the dataset
+    dataset (list) : the corresponding graphs in NetworkX format. We assume
+        that graph listID[i] corresponds to dataset[i].
+
+    Returns
+    -------
+    The median graph (NetworkX), its SOD, the SOD history over the
+    iterations, and the set median the search started from.
+    """
+    print(len(listID))
+    median_set_index, median_set_sod = compute_median_set(script, listID)
+    print(median_set_index)
+    print(median_set_sod)
+    sods = []
+    # Add the median to the environment.
+    set_median = dataset[median_set_index].copy()
+    median = dataset[median_set_index].copy()
+    cur_med_id = replace_graph_in_env(script, median, -1)
+    med_distances, med_mappings, cur_sod = update_mappings(script, cur_med_id, listID)
+    sods.append(cur_sod)
+    if verbose:
+        print(cur_sod)
+    ite_max = 50
+    old_sod = cur_sod * 2
+    ite = 0
+    epsilon = 0.001
+
+    while ite < ite_max and np.abs(old_sod - cur_sod) > epsilon:
+        old_sod = cur_sod
+        median = update_median_nodes(median, dataset, med_mappings)
+        median = update_median_edges(dataset, med_mappings, median)
+
+        cur_med_id = replace_graph_in_env(script, median, cur_med_id)
+        med_distances, med_mappings, cur_sod = update_mappings(script, cur_med_id, listID)
+
+        sods.append(cur_sod)
+        if verbose:
+            print(cur_sod)
+        ite += 1
+    return median, cur_sod, sods, set_median
+
+
+def compute_median_set(script, listID):
+    """Return the index (in listID) of the set median and its sum of distances (SOD)."""
+    # Compute the set median.
+    N = len(listID)
+    map_id_to_index = {}
+    map_index_to_id = {}
+    for i in range(0, len(listID)):
+        map_id_to_index[listID[i]] = i
+        map_index_to_id[i] = listID[i]
+
+    distances = np.zeros((N, N))
+    for i in listID:
+        for j in listID:
+            script.PyRunMethod(i, j)
+            distances[map_id_to_index[i], map_id_to_index[j]] = script.PyGetUpperBound(i, j)
+
+    median_set_index = np.argmin(np.sum(distances, 0))
+    sod = np.min(np.sum(distances, 0))
+
+    return median_set_index, sod
+
+def _convertGraph(G):
+    """Convert a graph to the proper NetworkX format that can be
+    recognized by the library gedlibpy.
+    """
+    G_new = nx.Graph()
+    for nd, attrs in G.nodes(data=True):
+        G_new.add_node(str(nd), chem=attrs['atom'])
+#        G_new.add_node(str(nd), x=str(attrs['attributes'][0]),
+#                       y=str(attrs['attributes'][1]))
+    for nd1, nd2, attrs in G.edges(data=True):
+        G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
+#        G_new.add_edge(str(nd1), str(nd2))
+
+    return G_new
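+
+# Editor's usage sketch (hypothetical toy input): _convertGraph expects the
+# node attribute 'atom' and the edge attribute 'bond_type', as produced e.g.
+# by gklearn.utils.graphfiles.loadDataset on molecule datasets.
+def _convert_graph_example():
+    g = nx.Graph()
+    g.add_node(0, atom='C')
+    g.add_node(1, atom='O')
+    g.add_edge(0, 1, bond_type='1')
+    return _convertGraph(g) # nodes '0', '1' carry 'chem'; the edge carries 'valence'
+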
+ """ + G_new = nx.Graph() + for nd, attrs in G.nodes(data=True): + G_new.add_node(str(nd), chem=attrs['atom']) +# G_new.add_node(str(nd), x=str(attrs['attributes'][0]), +# y=str(attrs['attributes'][1])) + for nd1, nd2, attrs in G.edges(data=True): + G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) +# G_new.add_edge(str(nd1), str(nd2)) + + return G_new + +if __name__ == "__main__": + #Chargement du dataset + gedlibpy.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml') + gedlibpy.PySetEditCost("LETTER") + gedlibpy.PyInitEnv() + gedlibpy.PySetMethod("IPFP", "") + gedlibpy.PyInitMethod() + + dataset,my_y = gklearn.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl") + + listID = gedlibpy.PyGetAllGraphIds() + median, sod = compute_median(gedlibpy,listID,dataset,verbose=True) + + print(sod) + draw_Letter_graph(median) diff --git a/gklearn/preimage/median_preimage_generator.py b/gklearn/preimage/median_preimage_generator.py new file mode 100644 index 0000000..dfbaef2 --- /dev/null +++ b/gklearn/preimage/median_preimage_generator.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Mar 26 18:27:22 2020 + +@author: ljia +""" +from gklearn.preimage.preimage_generator import PreimageGenerator +# from gklearn.utils.dataset import Dataset + +class MedianPreimageGenerator(PreimageGenerator): + + def __init__(self, mge, dataset): + self.__mge = mge + self.__dataset = dataset \ No newline at end of file diff --git a/gklearn/preimage/misc.py b/gklearn/preimage/misc.py new file mode 100644 index 0000000..18682c8 --- /dev/null +++ b/gklearn/preimage/misc.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Mar 19 18:13:56 2020 + +@author: ljia +""" + +def options_string_to_options_map(options_string): + """Transforms an options string into an options map. + + Parameters + ---------- + options_string : string + Options string of the form "[--