diff --git a/notebooks/run_marginalizedkernel.py b/notebooks/run_marginalizedkernel.py index df1c66b..cd7bf73 100644 --- a/notebooks/run_marginalizedkernel.py +++ b/notebooks/run_marginalizedkernel.py @@ -28,6 +28,7 @@ dslist = [ # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb +# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb # # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # # node/edge symb @@ -57,7 +58,7 @@ estimator = marginalizedkernel #param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.3, 3), # 'n_iteration': np.linspace(1, 1, 1), param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.9, 9), - 'n_iteration': np.linspace(5, 20, 4), + 'n_iteration': np.linspace(1, 19, 7), 'remove_totters': [False]} param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, {'alpha': np.logspace(-10, 10, num=41, base=10)}] diff --git a/notebooks/run_spkernel.py b/notebooks/run_spkernel.py index fcae61f..0698d2a 100644 --- a/notebooks/run_spkernel.py +++ b/notebooks/run_spkernel.py @@ -24,6 +24,9 @@ dslist = [ # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb +# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge +# {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt'}, +# # node nsymb symb # # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # # node/edge symb diff --git a/notebooks/run_structuralspkernel.py b/notebooks/run_structuralspkernel.py index 071cd3c..223d832 100644 --- a/notebooks/run_structuralspkernel.py +++ b/notebooks/run_structuralspkernel.py @@ -30,6 +30,8 @@ dslist = [ # {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, # # node symb/nsymb # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb +# {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt'}, +# # node nsymb symb # # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # # node/edge symb diff --git a/notebooks/run_treeletkernel.py b/notebooks/run_treeletkernel.py index 25c83d5..b4631fc 100644 --- a/notebooks/run_treeletkernel.py +++ b/notebooks/run_treeletkernel.py @@ -26,6 +26,7 @@ dslist = [ {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, +# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb # node symb/nsymb # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb # {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, diff --git a/notebooks/run_untilhpathkernel.py b/notebooks/run_untilhpathkernel.py index 6210708..3127ea5 100644 --- a/notebooks/run_untilhpathkernel.py +++ b/notebooks/run_untilhpathkernel.py @@ -27,7 +27,8 @@ dslist = [ {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb 
{'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb - {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb + {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb +# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb # # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # # node/edge symb @@ -54,11 +55,11 @@ dslist = [ # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, ] estimator = untilhpathkernel -param_grid_precomputed = {'depth': np.linspace(3, 10, 8), # [2], - 'k_func': [None]} # ['MinMax', 'tanimoto'], -#param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2], -# 'k_func': ['MinMax'], # ['MinMax', 'tanimoto'], -# 'compute_method': ['trie']} # ['MinMax']} +#param_grid_precomputed = {'depth': np.linspace(3, 10, 8), # [2], +# 'k_func': [None]} # ['MinMax', 'tanimoto'], +param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2], + 'k_func': ['MinMax', 'tanimoto'], # ['MinMax'], # + 'compute_method': ['trie']} # ['MinMax']} param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, {'alpha': np.logspace(-10, 10, num=41, base=10)}] diff --git a/notebooks/run_weisfeilerlehmankernel.py b/notebooks/run_weisfeilerlehmankernel.py index 423da8b..ed03adc 100644 --- a/notebooks/run_weisfeilerlehmankernel.py +++ b/notebooks/run_weisfeilerlehmankernel.py @@ -30,6 +30,8 @@ dslist = [ {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb +# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb + # # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # # node/edge symb diff --git a/preimage/fitDistance.py b/preimage/fitDistance.py index 5268014..f07c3f2 100644 --- a/preimage/fitDistance.py +++ b/preimage/fitDistance.py @@ -7,7 +7,7 @@ Created on Wed Oct 16 14:20:06 2019 """ import numpy as np from tqdm import tqdm -from itertools import combinations_with_replacement +from itertools import combinations_with_replacement, combinations import multiprocessing from multiprocessing import Pool from functools import partial @@ -22,110 +22,88 @@ import sys from ged import GED, get_nb_edit_operations from utils import kernel_distance_matrix -def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, - fitkernel=None, gamma=1.0): +def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, k=4, + params_ged={'lib': 'gedlibpy', 'cost': 'CONSTANT', + 'method': 'IPFP', 'stabilizer': None}, + init_costs=[3, 3, 1, 3, 3, 1], + parallel=True): # c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them. 
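+    # A note on the cost model used throughout this function: the 'CONSTANT'
+    # edit cost of gedlibpy takes the six constants in the order
+    # [c_vi, c_vr, c_vs, c_ei, c_er, c_es] (node insertion, removal and
+    # substitution, then the same for edges). For a fixed set of edit
+    # operations the GED is linear in these constants:
+    #     ged(g1, g2) = n_vi * c_vi + n_vr * c_vr + n_vs * c_vs
+    #                   + n_ei * c_ei + n_er * c_er + n_es * c_es,
+    # with the counts n_* returned by get_nb_edit_operations. This linearity
+    # is what makes the least-squares cost update in the loop below possible.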
#    random.seed(1)
-    cost_rdm = random.sample(range(1, 10), 6)
-#    edit_costs = cost_rdm + [0]
-    edit_costs = cost_rdm
-#    edit_costs = [i * 0.01 for i in cost_rdm] + [0]
-#    edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
-#    edit_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
-#    edit_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0]
-    idx_cost_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
+#    cost_rdm = random.sample(range(1, 10), 6)
+#    init_costs = cost_rdm + [0]
+#    init_costs = cost_rdm
+#    init_costs = [3, 3, 1, 3, 3, 1]  # the init_costs argument is used as-is.
+#    init_costs = [i * 0.01 for i in cost_rdm] + [0]
+#    init_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
+#    init_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
+#    init_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0]
+#    idx_cost_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
 
     # compute distances in feature space.
-    coef_dk = 1
     dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, gkernel=gkernel)
     dis_k_vec = []
     for i in range(len(dis_k_mat)):
-        for j in range(i, len(dis_k_mat)):
+#        for j in range(i, len(dis_k_mat)):
+        for j in range(i + 1, len(dis_k_mat)):
             dis_k_vec.append(dis_k_mat[i, j])
     dis_k_vec = np.array(dis_k_vec)
 
-    if fitkernel == None:
-        dis_k_vec_ajusted = dis_k_vec
-    elif fitkernel == 'gaussian':
-        coef_dk = 1 / np.max(dis_k_vec)
-        idx_dk_nonzeros = np.where(dis_k_vec != 0)[0]
-        # remove 0's and constraint d_k between 0 and 1.
-        dis_k_vec = dis_k_vec[idx_dk_nonzeros] * coef_dk
-        dis_k_vec_ajusted = np.sqrt(-np.log(dis_k_vec) / gamma)
-
-    residual_list = []
-    edit_cost_list = []
-    time_list = []
-    nb_cost_mat_list = []
-
+    # init ged.
+    print('\ninitial:')
+    time0 = time.time()
+    params_ged['edit_cost_constant'] = init_costs
+    ged_vec_init, ged_mat, n_edit_operations = compute_geds(Gn, params_ged, 
+                                                            parallel=parallel)
+    residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))]
+    time_list = [time.time() - time0]
+    edit_cost_list = [init_costs]
+    nb_cost_mat = np.array(n_edit_operations)
+    nb_cost_mat_list = [nb_cost_mat]
+    print('edit_costs:', init_costs)
+    print('residual_list:', residual_list)
 
     for itr in range(itr_max):
         print('\niteration', itr)
         time0 = time.time()
-        # compute GEDs and numbers of edit operations.
-        edit_cost_constant = [i for i in edit_costs]
-        edit_cost_list.append(edit_cost_constant)
-
-        ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_cost_constant, 
-                                                           idx_cost_nonzeros, parallel=True)
-
-        if fitkernel == None:
-            residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
-        elif fitkernel == 'gaussian':
-            ged_all = np.array(ged_all)[idx_dk_nonzeros]
-            residual = np.sqrt(np.sum(np.square(
-                    np.exp(-gamma * ged_all ** 2) / coef_dk - dis_k_vec)))
-        residual_list.append(residual)
-
         # "fit" geds to distances in feature space by tuning edit costs using the
         # Least Squares Method.
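+        # A toy instance of this step (illustrative numbers only), using the
+        # plain least-squares variant ("method 1" in update_costs): with two
+        # graph pairs, nb_cost_mat is 2 x 6 and the costs solve
+        #     nb_cost_mat = np.array([[2., 1., 0., 3., 0., 1.],
+        #                             [1., 0., 2., 0., 2., 1.]])  # op counts
+        #     dis_k_vec = np.array([4.5, 3.8])  # kernel distances to fit
+        #     costs, res, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
+        #                                        rcond=None)
+        # update_costs below adds positivity and triangle-like constraints.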
- nb_cost_mat = np.array(n_edit_operations).T - if fitkernel == 'gaussian': - nb_cost_mat = nb_cost_mat[idx_dk_nonzeros] - nb_cost_mat_list.append(nb_cost_mat) - edit_costs_new, residual = compute_better_costs(nb_cost_mat, dis_k_vec_ajusted) - - print('pseudo residual:', residual) + edit_costs_new, residual = update_costs(nb_cost_mat, dis_k_vec) for i in range(len(edit_costs_new)): if edit_costs_new[i] < 0: if edit_costs_new[i] > -1e-9: edit_costs_new[i] = 0 else: raise ValueError('The edit cost is negative.') - - for idx, item in enumerate(idx_cost_nonzeros): - edit_costs[item] = edit_costs_new[idx] - +# for i in range(len(edit_costs_new)): +# if edit_costs_new[i] < 0: +# edit_costs_new[i] = 0 + + # compute new GEDs and numbers of edit operations. + params_ged['edit_cost_constant'] = edit_costs_new + ged_vec, ged_mat, n_edit_operations = compute_geds(Gn, params_ged, + parallel=parallel) + residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec)))) time_list.append(time.time() - time0) - - print('edit_costs:', edit_costs) + edit_cost_list.append(edit_costs_new) + nb_cost_mat = np.array(n_edit_operations) + nb_cost_mat_list.append(nb_cost_mat) + print('edit_costs:', edit_costs_new) print('residual_list:', residual_list) - - print() - edit_cost_list.append(edit_costs) - ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_costs, - idx_cost_nonzeros, parallel=True) - if fitkernel == 0: - residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec))) - elif fitkernel == 'gaussian': - ged_all = np.array(ged_all)[idx_dk_nonzeros] - residual = np.sqrt(np.sum(np.square( - np.exp(-gamma * ged_all ** 2) / coef_dk - dis_k_vec))) - residual_list.append(residual) - nb_cost_mat_list.append(np.array(n_edit_operations).T) - return edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, \ - time_list, nb_cost_mat_list, coef_dk + return edit_costs_new, residual_list, edit_cost_list, dis_k_mat, ged_mat, \ + time_list, nb_cost_mat_list -def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False): +def compute_geds(Gn, params_ged, parallel=False): ged_mat = np.zeros((len(Gn), len(Gn))) if parallel: # print('parallel') - len_itr = int(len(Gn) * (len(Gn) + 1) / 2) - ged_all = [0 for i in range(len_itr)] - n_edit_operations = [[0 for i in range(len_itr)] for j in - range(len(idx_nonzeros))] - - itr = combinations_with_replacement(range(0, len(Gn)), 2) +# len_itr = int(len(Gn) * (len(Gn) + 1) / 2) + len_itr = int(len(Gn) * (len(Gn) - 1) / 2) + ged_vec = [0 for i in range(len_itr)] + n_edit_operations = [0 for i in range(len_itr)] +# itr = combinations_with_replacement(range(0, len(Gn)), 2) + itr = combinations(range(0, len(Gn)), 2) n_jobs = multiprocessing.cpu_count() if len_itr < 100 * n_jobs: chunksize = int(len_itr / n_jobs) + 1 @@ -134,68 +112,52 @@ def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False): def init_worker(gn_toshare): global G_gn G_gn = gn_toshare - do_partial = partial(_wrapper_compute_ged_parallel, edit_cost_constant, - idx_nonzeros) + do_partial = partial(_wrapper_compute_ged_parallel, params_ged) pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,)) iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize), desc='computing GEDs', file=sys.stdout) # iterator = pool.imap_unordered(do_partial, itr, chunksize) for i, j, dis, n_eo_tmp in iterator: - idx_itr = int(len(Gn) * i + j - i * (i + 1) / 2) - ged_all[idx_itr] = dis + idx_itr = int(len(Gn) * i + j - (i + 1) * (i + 2) / 2) + ged_vec[idx_itr] = dis 
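+            # idx_itr maps the pair (i, j), j > i, coming from
+            # combinations(range(len(Gn)), 2) to its flat position in ged_vec;
+            # e.g. for len(Gn) = 4 the pairs (0,1), (0,2), (0,3), (1,2),
+            # (1,3), (2,3) map to 0..5 (check i=1, j=2:
+            # int(4*1 + 2 - 2*3/2) = 3). The old formula matched
+            # combinations_with_replacement, which also included i == j.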
ged_mat[i][j] = dis ged_mat[j][i] = dis - for idx, item in enumerate(idx_nonzeros): - n_edit_operations[idx][idx_itr] = n_eo_tmp[item] + n_edit_operations[idx_itr] = n_eo_tmp # print('\n-------------------------------------------') # print(i, j, idx_itr, dis) pool.close() pool.join() else: - ged_all = [] - n_edit_operations = [[] for i in range(len(idx_nonzeros))] + ged_vec = [] + n_edit_operations = [] for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout): # for i in range(len(Gn)): - for j in range(i, len(Gn)): -# time0 = time.time() - dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], lib='gedlibpy', - cost='CONSTANT', method='IPFP', - edit_cost_constant=edit_cost_constant, stabilizer='min', - repeat=50) -# time1 = time.time() - time0 -# time0 = time.time() - ged_all.append(dis) + for j in range(i + 1, len(Gn)): + dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], **params_ged) + ged_vec.append(dis) ged_mat[i][j] = dis ged_mat[j][i] = dis n_eo_tmp = get_nb_edit_operations(Gn[i], Gn[j], pi_forward, pi_backward) - for idx, item in enumerate(idx_nonzeros): - n_edit_operations[idx].append(n_eo_tmp[item]) -# time2 = time.time() - time0 -# print(time1, time2, time1 / time2) + n_edit_operations.append(n_eo_tmp) - return ged_all, ged_mat, n_edit_operations + return ged_vec, ged_mat, n_edit_operations -def _wrapper_compute_ged_parallel(edit_cost_constant, idx_nonzeros, itr): +def _wrapper_compute_ged_parallel(params_ged, itr): i = itr[0] j = itr[1] - dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], edit_cost_constant, - idx_nonzeros) + dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], params_ged) return i, j, dis, n_eo_tmp -def _compute_ged_parallel(g1, g2, edit_cost_constant, idx_nonzeros): - dis, pi_forward, pi_backward = GED(g1, g2, lib='gedlibpy', - cost='CONSTANT', method='IPFP', - edit_cost_constant=edit_cost_constant, stabilizer='min', - repeat=50) - n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward) - +def _compute_ged_parallel(g1, g2, params_ged): + dis, pi_forward, pi_backward = GED(g1, g2, **params_ged) + n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward) return dis, n_eo_tmp -def compute_better_costs(nb_cost_mat, dis_k_vec): +def update_costs(nb_cost_mat, dis_k_vec): # # method 1: simple least square method. # edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec, # rcond=None) @@ -203,7 +165,7 @@ def compute_better_costs(nb_cost_mat, dis_k_vec): # # method 2: least square method with x_i >= 0. # edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec) - # method 3: solve as a quadratic program with constraints: x_i >= 0, sum(x) = 1. + # method 3: solve as a quadratic program with constraints. 
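+    # The program solved below with cvxpy is
+    #     min_c  || nb_cost_mat @ c - dis_k_vec ||_2^2
+    #     s.t.   c >= 0.0001,
+    #            c_vi + c_vr - c_vs >= 0,
+    #            c_ei + c_er - c_es >= 0,
+    # i.e. substituting a node (edge) may never cost more than removing it and
+    # inserting the new one, the "c_vs <= c_vi + c_vr, c_es <= c_ei + c_er"
+    # constraints referred to in the tests.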
# P = np.dot(nb_cost_mat.T, nb_cost_mat) # q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat) # G = -1 * np.identity(nb_cost_mat.shape[1]) @@ -221,7 +183,7 @@ def compute_better_costs(nb_cost_mat, dis_k_vec): # h = np.array([0 for i in range(nb_cost_mat.shape[1])]) x = cp.Variable(nb_cost_mat.shape[1]) cost = cp.sum_squares(nb_cost_mat * x - dis_k_vec) - constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])], + constraints = [x >= [0.0001 for i in range(nb_cost_mat.shape[1])], # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] diff --git a/preimage/ged.py b/preimage/ged.py index eaa7294..073fae6 100644 --- a/preimage/ged.py +++ b/preimage/ged.py @@ -13,29 +13,30 @@ import multiprocessing from multiprocessing import Pool from functools import partial -from gedlibpy import librariesImport, gedlibpy +from gedlibpy_linlin import librariesImport, gedlibpy def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP', - edit_cost_constant=[], stabilizer='min', repeat=50): + edit_cost_constant=[], algo_options='', stabilizer='min', repeat=50): """ Compute GED for 2 graphs. """ - if lib == 'gedlibpy': - def convertGraph(G): - """Convert a graph to the proper NetworkX format that can be - recognized by library gedlibpy. - """ - G_new = nx.Graph() - for nd, attrs in G.nodes(data=True): - G_new.add_node(str(nd), chem=attrs['atom']) + def convertGraph(G): + """Convert a graph to the proper NetworkX format that can be + recognized by library gedlibpy. + """ + G_new = nx.Graph() + for nd, attrs in G.nodes(data=True): + G_new.add_node(str(nd), chem=attrs['atom']) # G_new.add_node(str(nd), x=str(attrs['attributes'][0]), # y=str(attrs['attributes'][1])) - for nd1, nd2, attrs in G.edges(data=True): - G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) -# G_new.add_edge(str(nd1), str(nd2)) - - return G_new + for nd1, nd2, attrs in G.edges(data=True): +# G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) + G_new.add_edge(str(nd1), str(nd2)) + + return G_new + + if lib == 'gedlibpy': gedlibpy.restart_env() gedlibpy.add_nx_graph(convertGraph(g1), "") gedlibpy.add_nx_graph(convertGraph(g2), "") @@ -43,12 +44,12 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP', listID = gedlibpy.get_all_graph_ids() gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant) gedlibpy.init() - gedlibpy.set_method(method, "") + gedlibpy.set_method(method, algo_options) gedlibpy.init_method() g = listID[0] h = listID[1] - if stabilizer == None: + if stabilizer is None: gedlibpy.run_method(g, h) pi_forward = gedlibpy.get_forward_map(g, h) pi_backward = gedlibpy.get_backward_map(g, h) @@ -107,13 +108,57 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP', dis = upper - # make the map label correct (label remove map as np.inf) - nodes1 = [n for n in g1.nodes()] - nodes2 = [n for n in g2.nodes()] - nb1 = nx.number_of_nodes(g1) - nb2 = nx.number_of_nodes(g2) - pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward] - pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward] + elif lib == 'gedlib-bash': + import time + import random + import sys + import os + sys.path.insert(0, "../") + from pygraph.utils.graphfiles import saveDataset + + tmp_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/' + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir) + fn_collection = tmp_dir + 'collection.' 
+ str(time.time()) + str(random.randint(0, 1e9)) + xparams = {'method': 'gedlib', 'graph_dir': fn_collection} + saveDataset([g1, g2], ['dummy', 'dummy'], gformat='gxl', group='xml', + filename=fn_collection, xparams=xparams) + + command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n' + command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n' + command += 'export LD_LIBRARY_PATH\n' + command += 'cd \'/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/bin\'\n' + command += './ged_for_python_bash monoterpenoides ' + fn_collection \ + + ' \'' + algo_options + '\' ' + for ec in edit_cost_constant: + command += str(ec) + ' ' +# output = os.system(command) + stream = os.popen(command) + output = stream.readlines() +# print(output) + + dis = float(output[0].strip()) + runtime = float(output[1].strip()) + size_forward = int(output[2].strip()) + pi_forward = [int(item.strip()) for item in output[3:3+size_forward]] + pi_backward = [int(item.strip()) for item in output[3+size_forward:]] + +# print(dis) +# print(runtime) +# print(size_forward) +# print(pi_forward) +# print(pi_backward) + + + # make the map label correct (label remove map as np.inf) + nodes1 = [n for n in g1.nodes()] + nodes2 = [n for n in g2.nodes()] + nb1 = nx.number_of_nodes(g1) + nb2 = nx.number_of_nodes(g2) + pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward] + pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward] +# print(pi_forward) + return dis, pi_forward, pi_backward @@ -149,7 +194,7 @@ def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP', g = listID[0] h = listID[1] - if stabilizer == None: + if stabilizer is None: gedlibpy.run_method(g, h) pi_forward = gedlibpy.get_forward_map(g, h) pi_backward = gedlibpy.get_backward_map(g, h) @@ -183,7 +228,8 @@ def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP', def ged_median(Gn, Gn_median, verbose=False, params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP', 'edit_cost_constant': [], - 'stabilizer': 'min', 'repeat': 50}, parallel=False): + 'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1', + 'stabilizer': None}, parallel=False): if parallel: len_itr = int(len(Gn)) pi_forward_list = [[] for i in range(len_itr)] diff --git a/preimage/iam.py b/preimage/iam.py index fa38582..0a63b98 100644 --- a/preimage/iam.py +++ b/preimage/iam.py @@ -23,7 +23,8 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, connected=False, removeNodes=True, allBestInit=False, allBestNodes=False, allBestEdges=False, allBestOutput=False, params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP', - 'edit_cost_constant': [], 'stabilizer': 'min', 'repeat': 50}): + 'edit_cost_constant': [], 'stabilizer': None, + 'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'}): """See my name, then you know what I do. """ # Gn_median = Gn_median[0:10] @@ -435,6 +436,62 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median +def iam_bash(Gn_names, edit_cost_constant, dataset='monoterpenoides', + graph_dir='/media/ljia/DATA/research-repo/codes/Linlin/py-graph/datasets/monoterpenoides/'): + """Compute the iam by c++ implementation (gedlib) through bash. + """ + import os + import time + + def createCollectionFile(Gn_names, y, filename): + """Create collection file. 
+ """ + dirname_ds = os.path.dirname(filename) + if dirname_ds != '': + dirname_ds += '/' + if not os.path.exists(dirname_ds) : + os.makedirs(dirname_ds) + + with open(filename + '.xml', 'w') as fgroup: + fgroup.write("") + fgroup.write("\n") + fgroup.write("\n") + for idx, fname in enumerate(Gn_names): + fgroup.write("\n\t") + fgroup.write("\n") + fgroup.close() + + tmp_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/' + fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, 1e9)) + createCollectionFile(Gn_names, ['dummy'] * len(Gn_names), fn_collection) +# graph_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl' + + + command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n' + command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n' + command += 'export LD_LIBRARY_PATH\n' + command += 'cd \'/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/bin\'\n' + command += './iam_for_python_bash ' + dataset + ' ' + fn_collection \ + + ' \'' + graph_dir + '\' ' + if edit_cost_constant is None: + command += 'None' + else: + for ec in edit_cost_constant: + command += str(ec) + ' ' +# output = os.system(command) + stream = os.popen(command) + + output = stream.readlines() +# print(output) + sod_sm = float(output[0].strip()) + sod_gm= float(output[1].strip()) + + fname_sm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/set_median.gxl' + fname_gm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/gen_median.gxl' + + return sod_sm, sod_gm, fname_sm, fname_gm + + ############################################################################### # Old implementations. diff --git a/preimage/test_fitDistance.py b/preimage/test_fitDistance.py index f2de5ae..2f2907d 100644 --- a/preimage/test_fitDistance.py +++ b/preimage/test_fitDistance.py @@ -16,6 +16,319 @@ from utils import remove_edges from fitDistance import fit_GED_to_kernel_distance from utils import normalize_distance_matrix + +def median_paper_clcpc_python_best(): + """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with + python invoking the c++ code by bash command (with updated library). 
+ """ +# ds = {'name': 'monoterpenoides', +# 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb +# _, y_all = loadDataset(ds['dataset']) + gkernel = 'untilhpathkernel' + node_label = 'atom' + edge_label = 'bond_type' + itr_max = 6 + algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' + params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP', + 'algo_options': algo_options, 'stabilizer': None} + + y_all = ['3', '1', '4', '6', '7', '8', '9', '2'] + repeats = 50 + collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/' + graph_dir = collection_path + 'gxl/' + + fn_edit_costs_output = 'results/median_paper/edit_costs_output.python_init40.k10.txt' + + for y in y_all: + for repeat in range(repeats): + edit_costs_output_file = open(fn_edit_costs_output, 'a') + collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml' + Gn, _ = loadDataset(collection_file, extra_params=graph_dir) + edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \ + nb_cost_mat_list = fit_GED_to_kernel_distance(Gn, node_label, edge_label, + gkernel, itr_max, params_ged=params_ged, + parallel=True) + total_time = np.sum(time_list) +# print('\nedit_costs:', edit_costs) +# print('\nresidual_list:', residual_list) +# print('\nedit_cost_list:', edit_cost_list) +# print('\ndistance matrix in kernel space:', dis_k_mat) +# print('\nged matrix:', ged_mat) +# print('\ntotal time:', total_time) +# print('\nnb_cost_mat:', nb_cost_mat_list[-1]) + np.savez('results/median_paper/fit_distance.clcpc.python_init40.monot.elabeled.uhpkernel.y' + + y + '.repeat' + str(repeat) + '.k10..gm', + edit_costs=edit_costs, + residual_list=residual_list, edit_cost_list=edit_cost_list, + dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, + total_time=total_time, nb_cost_mat_list=nb_cost_mat_list) + + for ec in edit_costs: + edit_costs_output_file.write(str(ec) + ' ') + edit_costs_output_file.write('\n') + edit_costs_output_file.close() + + +# # normalized distance matrices. 
+# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz') +# edit_costs = gmfile['edit_costs'] +# residual_list = gmfile['residual_list'] +# edit_cost_list = gmfile['edit_cost_list'] +# dis_k_mat = gmfile['dis_k_mat'] +# ged_mat = gmfile['ged_mat'] +# total_time = gmfile['total_time'] +# nb_cost_mat_list = gmfile['nb_cost_mat_list'] + + nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat) + print(nb_consistent, nb_inconsistent, ratio_consistent) + +# norm_dis_k_mat = normalize_distance_matrix(dis_k_mat) +# plt.imshow(norm_dis_k_mat) +# plt.colorbar() +# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y' +# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300) +# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y' +# + y + '.repeat' + str(repeat) + '.png', format='png') +# # plt.show() +# plt.clf() +# +# norm_ged_mat = normalize_distance_matrix(ged_mat) +# plt.imshow(norm_ged_mat) +# plt.colorbar() +# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y' +# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300) +# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y' +# + y + '.repeat' + str(repeat) + '.png', format='png') +# # plt.show() +# plt.clf() +# +# norm_diff = norm_ged_mat - norm_dis_k_mat +# plt.imshow(norm_diff) +# plt.colorbar() +# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y' +# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300) +# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y' +# + y + '.repeat' + str(repeat) + '.png', format='png') +# # plt.show() +# plt.clf() +# # draw_count_bar(norm_diff) + + +def median_paper_clcpc_python_bash_cpp(): + """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with + python invoking the c++ code by bash command (with updated library). 
+ """ +# ds = {'name': 'monoterpenoides', +# 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb +# _, y_all = loadDataset(ds['dataset']) + gkernel = 'untilhpathkernel' + node_label = 'atom' + edge_label = 'bond_type' + itr_max = 20 + algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5' + params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP', + 'algo_options': algo_options} + + y_all = ['3', '1', '4', '6', '7', '8', '9', '2'] + repeats = 50 + collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/' + graph_dir = collection_path + 'gxl/' + + fn_edit_costs_output = 'results/median_paper/edit_costs_output.txt' + + for y in y_all: + for repeat in range(repeats): + edit_costs_output_file = open(fn_edit_costs_output, 'a') + collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml' + Gn, _ = loadDataset(collection_file, extra_params=graph_dir) + edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \ + nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label, + gkernel, itr_max, params_ged=params_ged, + parallel=False) + total_time = np.sum(time_list) +# print('\nedit_costs:', edit_costs) +# print('\nresidual_list:', residual_list) +# print('\nedit_cost_list:', edit_cost_list) +# print('\ndistance matrix in kernel space:', dis_k_mat) +# print('\nged matrix:', ged_mat) +# print('\ntotal time:', total_time) +# print('\nnb_cost_mat:', nb_cost_mat_list[-1]) + np.savez('results/median_paper/fit_distance.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' + + y + '.repeat' + str(repeat) + '.gm', + edit_costs=edit_costs, + residual_list=residual_list, edit_cost_list=edit_cost_list, + dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, + total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, + coef_dk=coef_dk) + + for ec in edit_costs: + edit_costs_output_file.write(str(ec) + ' ') + edit_costs_output_file.write('\n') + edit_costs_output_file.close() + + +# # normalized distance matrices. 
+# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz') +# edit_costs = gmfile['edit_costs'] +# residual_list = gmfile['residual_list'] +# edit_cost_list = gmfile['edit_cost_list'] +# dis_k_mat = gmfile['dis_k_mat'] +# ged_mat = gmfile['ged_mat'] +# total_time = gmfile['total_time'] +# nb_cost_mat_list = gmfile['nb_cost_mat_list'] +# coef_dk = gmfile['coef_dk'] + + nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat) + print(nb_consistent, nb_inconsistent, ratio_consistent) + +# norm_dis_k_mat = normalize_distance_matrix(dis_k_mat) +# plt.imshow(norm_dis_k_mat) +# plt.colorbar() +# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' +# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300) +# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' +# + y + '.repeat' + str(repeat) + '.png', format='png') +# # plt.show() +# plt.clf() +# +# norm_ged_mat = normalize_distance_matrix(ged_mat) +# plt.imshow(norm_ged_mat) +# plt.colorbar() +# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' +# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300) +# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' +# + y + '.repeat' + str(repeat) + '.png', format='png') +# # plt.show() +# plt.clf() +# +# norm_diff = norm_ged_mat - norm_dis_k_mat +# plt.imshow(norm_diff) +# plt.colorbar() +# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' +# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300) +# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y' +# + y + '.repeat' + str(repeat) + '.png', format='png') +# # plt.show() +# plt.clf() +# # draw_count_bar(norm_diff) + + + + + +def test_cs_leq_ci_plus_cr_python_bash_cpp(): + """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with + python invoking the c++ code by bash command (with updated library). 
+ """ + ds = {'name': 'monoterpenoides', + 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb + Gn, y_all = loadDataset(ds['dataset']) +# Gn = Gn[0:10] + gkernel = 'untilhpathkernel' + node_label = 'atom' + edge_label = 'bond_type' + itr_max = 10 + algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5' + params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP', + 'algo_options': algo_options} + edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \ + nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label, + gkernel, itr_max, params_ged=params_ged, + parallel=False) + total_time = np.sum(time_list) + print('\nedit_costs:', edit_costs) + print('\nresidual_list:', residual_list) + print('\nedit_cost_list:', edit_cost_list) + print('\ndistance matrix in kernel space:', dis_k_mat) + print('\nged matrix:', ged_mat) + print('\ntotal time:', total_time) + print('\nnb_cost_mat:', nb_cost_mat_list[-1]) + np.savez('results/fit_distance.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel.gm', + edit_costs=edit_costs, + residual_list=residual_list, edit_cost_list=edit_cost_list, + dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, + total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, + coef_dk=coef_dk) + +# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', +# 'extra_params': {}} # node/edge symb +# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) +## Gn = Gn[0:10] +## remove_edges(Gn) +# gkernel = 'untilhpathkernel' +# node_label = 'atom' +# edge_label = 'bond_type' +# itr_max = 10 +# edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \ +# nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label, +# gkernel, itr_max) +# total_time = np.sum(time_list) +# print('\nedit_costs:', edit_costs) +# print('\nresidual_list:', residual_list) +# print('\nedit_cost_list:', edit_cost_list) +# print('\ndistance matrix in kernel space:', dis_k_mat) +# print('\nged matrix:', ged_mat) +# print('\ntotal time:', total_time) +# print('\nnb_cost_mat:', nb_cost_mat_list[-1]) +# np.savez('results/fit_distance.cs_leq_ci_plus_cr.mutag.elabeled.uhpkernel.gm', +# edit_costs=edit_costs, +# residual_list=residual_list, edit_cost_list=edit_cost_list, +# dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, +# total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk) + + +# # normalized distance matrices. 
+# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.monot.elabeled.uhpkernel.gm.npz') +# edit_costs = gmfile['edit_costs'] +# residual_list = gmfile['residual_list'] +# edit_cost_list = gmfile['edit_cost_list'] +# dis_k_mat = gmfile['dis_k_mat'] +# ged_mat = gmfile['ged_mat'] +# total_time = gmfile['total_time'] +# nb_cost_mat_list = gmfile['nb_cost_mat_list'] +# coef_dk = gmfile['coef_dk'] + + nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat) + print(nb_consistent, nb_inconsistent, ratio_consistent) + +# dis_k_sub = pairwise_substitution(dis_k_mat) +# ged_sub = pairwise_substitution(ged_mat) +# np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.gm', +# dis_k_sub=dis_k_sub, ged_sub=ged_sub) + + + norm_dis_k_mat = normalize_distance_matrix(dis_k_mat) + plt.imshow(norm_dis_k_mat) + plt.colorbar() + plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel' + + '.eps', format='eps', dpi=300) + plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel' + + '.png', format='png') +# plt.show() + plt.clf() + + norm_ged_mat = normalize_distance_matrix(ged_mat) + plt.imshow(norm_ged_mat) + plt.colorbar() + plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel' + + '.eps', format='eps', dpi=300) + plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel' + + '.png', format='png') +# plt.show() + plt.clf() + + norm_diff = norm_ged_mat - norm_dis_k_mat + plt.imshow(norm_diff) + plt.colorbar() + plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel' + + '.eps', format='eps', dpi=300) + plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel' + + '.png', format='png') +# plt.show() + plt.clf() +# draw_count_bar(norm_diff) + + def test_anycosts(): ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', 'extra_params': {}} # node/edge symb @@ -295,8 +608,12 @@ def draw_count_bar(norm_diff): if __name__ == '__main__': # test_anycosts() - test_cs_leq_ci_plus_cr() +# test_cs_leq_ci_plus_cr() # test_unfitted() +# test_cs_leq_ci_plus_cr_python_bash_cpp() +# median_paper_clcpc_python_bash_cpp() + median_paper_clcpc_python_best() + # x = np.array([[1,2,3],[4,5,6],[7,8,9]]) # xx = pairwise_substitution(x) \ No newline at end of file diff --git a/preimage/test_iam.py b/preimage/test_iam.py index 82d5446..5d286cc 100644 --- a/preimage/test_iam.py +++ b/preimage/test_iam.py @@ -22,6 +22,130 @@ from iam import iam_upgraded from utils import remove_edges, compute_kernel, get_same_item_indices, dis_gstar #from ged import ged_median + +def test_iam_monoterpenoides_with_init40(): + gkernel = 'untilhpathkernel' + node_label = 'atom' + edge_label = 'bond_type' + # unfitted edit costs. 
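+    # The six constants below are assembled into edit_cost_constant in the
+    # order [c_vi, c_vr, c_vs, c_ei, c_er, c_es] expected by gedlibpy's
+    # 'CONSTANT' edit cost; "unfitted" means these are plain defaults rather
+    # than costs fitted to kernel distances (cf. fitDistance.py).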
+ c_vi = 3 + c_vr = 3 + c_vs = 1 + c_ei = 3 + c_er = 3 + c_es = 1 + ite_max_iam = 50 + epsilon_iam = 0.0001 + removeNodes = False + connected_iam = False + # parameters for IAM function +# ged_cost = 'CONSTANT' + ged_cost = 'CONSTANT' + ged_method = 'IPFP' + edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] + ged_stabilizer = None +# ged_repeat = 50 + algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' + params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, + 'edit_cost_constant': edit_cost_constant, + 'algo_options': algo_options, + 'stabilizer': ged_stabilizer} + + + collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/' + graph_dir = collection_path + 'gxl/' + y_all = ['3', '1', '4', '6', '7', '8', '9', '2'] + repeats = 50 + + # classify graphs according to classes. + time_list = [] + dis_ks_min_list = [] + dis_ks_set_median_list = [] + sod_gs_list = [] + g_best = [] + sod_set_median_list = [] + sod_list_list = [] + for y in y_all: + print('\n-------------------------------------------------------') + print('class of y:', y) + + time_list.append([]) + dis_ks_min_list.append([]) + dis_ks_set_median_list.append([]) + sod_gs_list.append([]) + g_best.append([]) + sod_set_median_list.append([]) + + for repeat in range(repeats): + # load median set. + collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml' + Gn_median, _ = loadDataset(collection_file, extra_params=graph_dir) + Gn_candidate = [g.copy() for g in Gn_median] + + time0 = time.time() + G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \ + = iam_upgraded(Gn_median, + Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, + epsilon=epsilon_iam, node_label=node_label, edge_label=edge_label, + connected=connected_iam, removeNodes=removeNodes, + params_ged=params_ged) + time_total = time.time() - time0 + print('\ntime: ', time_total) + time_list[-1].append(time_total) + g_best[-1].append(G_gen_median_list[0]) + sod_set_median_list[-1].append(sod_set_median) + print('\nsmallest sod of the set median:', sod_set_median) + sod_gs_list[-1].append(sod_gen_median) + print('\nsmallest sod in graph space:', sod_gen_median) + sod_list_list.append(sod_list) + +# # show the best graph and save it to file. 
+# print('one of the possible corresponding pre-images is') +# nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'), +# with_labels=True) +## plt.show() +# # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) + +## plt.savefig('results/iam/paper_compare/monoter_y' + str(y_class) + +## '_repeat' + str(repeat) + '_' + str(time.time()) + +## '.png', format="PNG") +# plt.clf() +# # print(G_gen_median_list[0].nodes(data=True)) +# # print(G_gen_median_list[0].edges(data=True)) + + print('\nsods of the set median for this class:', sod_set_median_list[-1]) + print('\nsods in graph space for this class:', sod_gs_list[-1]) +# print('\ndistance in kernel space of set median for this class:', +# dis_ks_set_median_list[-1]) +# print('\nsmallest distances in kernel space for this class:', +# dis_ks_min_list[-1]) + print('\ntimes for this class:', time_list[-1]) + + sod_set_median_list[-1] = np.mean(sod_set_median_list[-1]) + sod_gs_list[-1] = np.mean(sod_gs_list[-1]) +# dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1]) +# dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1]) + time_list[-1] = np.mean(time_list[-1]) + + print() + print('\nmean sods of the set median for each class:', sod_set_median_list) + print('\nmean sods in graph space for each class:', sod_gs_list) +# print('\ndistances in kernel space of set median for each class:', +# dis_ks_set_median_list) +# print('\nmean smallest distances in kernel space for each class:', +# dis_ks_min_list) + print('\nmean times for each class:', time_list) + + print('\nmean sods of the set median of all:', np.mean(sod_set_median_list)) + print('\nmean sods in graph space of all:', np.mean(sod_gs_list)) +# print('\nmean distances in kernel space of set median of all:', +# np.mean(dis_ks_set_median_list)) +# print('\nmean smallest distances in kernel space of all:', +# np.mean(dis_ks_min_list)) + print('\nmean times of all:', np.mean(time_list)) + + + + def test_iam_monoterpenoides(): ds = {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb @@ -834,9 +958,10 @@ if __name__ == '__main__': # tests on different numbers of median-sets. 
# test_iam_median_nb() # test_iam_letter_h() - test_iam_monoterpenoides() +# test_iam_monoterpenoides() # test_iam_mutag() # test_iam_fitdistance() # print("test log") + test_iam_monoterpenoides_with_init40() diff --git a/preimage/utils.py b/preimage/utils.py index 99c63c0..51d4edf 100644 --- a/preimage/utils.py +++ b/preimage/utils.py @@ -17,8 +17,10 @@ from pygraph.kernels.marginalizedKernel import marginalizedkernel from pygraph.kernels.untilHPathKernel import untilhpathkernel from pygraph.kernels.spKernel import spkernel import functools -from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct +from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct, polynomialkernel from pygraph.kernels.structuralspKernel import structuralspkernel +from pygraph.kernels.treeletKernel import treeletkernel +from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel def remove_edges(Gn): @@ -46,18 +48,29 @@ def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose): n_jobs=multiprocessing.cpu_count(), verbose=verbose) elif graph_kernel == 'untilhpathkernel': Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label, - depth=10, k_func='MinMax', compute_method='trie', + depth=7, k_func='MinMax', compute_method='trie', n_jobs=multiprocessing.cpu_count(), verbose=verbose) elif graph_kernel == 'spkernel': mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) - Kmatrix, _, _ = spkernel(Gn, node_label='atom', node_kernels= + Kmatrix, _, _ = spkernel(Gn, node_label=node_label, node_kernels= {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}, n_jobs=multiprocessing.cpu_count(), verbose=verbose) elif graph_kernel == 'structuralspkernel': mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) - Kmatrix, _ = structuralspkernel(Gn, node_label='atom', node_kernels= + Kmatrix, _ = structuralspkernel(Gn, node_label=node_label, node_kernels= {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}, n_jobs=multiprocessing.cpu_count(), verbose=verbose) + elif graph_kernel == 'treeletkernel': +# pkernel = functools.partial(polynomialkernel, d=2, c=1e5) + pkernel = functools.partial(gaussiankernel, gamma=1e-6) + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + Kmatrix, _ = treeletkernel(Gn, node_label=node_label, edge_label=edge_label, + sub_kernel=pkernel, + n_jobs=multiprocessing.cpu_count(), verbose=verbose) + elif graph_kernel == 'weisfeilerlehmankernel': + Kmatrix, _ = weisfeilerlehmankernel(Gn, node_label=node_label, edge_label=edge_label, + height=4, base_kernel='subtree', + n_jobs=multiprocessing.cpu_count(), verbose=verbose) # normalization Kmatrix_diag = Kmatrix.diagonal().copy() @@ -79,7 +92,7 @@ def gram2distances(Kmatrix): def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None, gkernel=None): dis_mat = np.empty((len(Gn), len(Gn))) - if Kmatrix == None: + if Kmatrix is None: Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True) for i in range(len(Gn)): for j in range(i, len(Gn)): @@ -109,6 +122,21 @@ def get_same_item_indices(ls): return idx_dict +def k_nearest_neighbors_to_median_in_kernel_space(Gn, Kmatrix=None, gkernel=None, + node_label=None, edge_label=None): + dis_k_all = [] # distance between g_star and each graph. 
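+    # In kernel space, the squared distance from a graph g to the implicit
+    # median g_bar = sum_i alpha_i * phi(g_i) expands as
+    #     d(g, g_bar)^2 = k(g, g) - 2 * sum_i alpha_i * k(g, g_i) + term3,
+    # where term3 = sum_i sum_j alpha_i * alpha_j * k(g_i, g_j) does not
+    # depend on g; dis_gstar evaluates this expansion, so term3 is
+    # precomputed once below.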
+    alpha = [1 / len(Gn)] * len(Gn)
+    if Kmatrix is None:
+        Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True)
+    idx_gi = list(range(len(Gn)))  # the median is taken over the whole set Gn.
+    term3 = 0
+    for i1, a1 in enumerate(alpha):
+        for i2, a2 in enumerate(alpha):
+            term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
+    for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
+        dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
+        dis_k_all.append(dtemp)
+    # sort by increasing distance to the median; the caller keeps the first k.
+    sort_idx = np.argsort(dis_k_all)
+    return sort_idx, dis_k_all
+
 
 def normalize_distance_matrix(D):
     max_value = np.amax(D)
     min_value = np.amin(D)
diff --git a/pygraph/utils/graphfiles.py b/pygraph/utils/graphfiles.py
index f5daeda..48583dd 100644
--- a/pygraph/utils/graphfiles.py
+++ b/pygraph/utils/graphfiles.py
@@ -124,21 +124,21 @@ def saveGXL(graph, filename, method='benoit'):
         # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22
         # pass
         gxl_file = open(filename, 'w')
-        gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
+        gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
         gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n")
-        gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n")
+        gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n")
         gxl_file.write("<graph id=\"no_id\" edgeids=\"false\" edgemode=\"undirected\">\n")
         for v, attrs in graph.nodes(data=True):
             gxl_file.write("<node id=\"_" + str(v) + "\">")
-            gxl_file.write("<attr name=\"chem\"><int>" + str(attrs['atom']) + "</int></attr>")
+            gxl_file.write("<attr name=\"chem\"><int>" + str(attrs['chem']) + "</int></attr>")
             gxl_file.write("</node>\n")
         for v1, v2, attrs in graph.edges(data=True):
             gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">")
-#            gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['bond_type']) + "</int></attr>")
-            gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>")
+            gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['valence']) + "</int></attr>")
+#            gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>")
             gxl_file.write("</edge>\n")
         gxl_file.write("</graph>\n")
-        gxl_file.write("</gxl>\n")
+        gxl_file.write("</gxl>")
         gxl_file.close()
     elif method == 'gedlib-letter':
         # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22
         # 
@@ -147,15 +147,15 @@ def saveGXL(graph, filename, method='benoit'):
         gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
         gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n")
         gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n")
-        gxl_file.write("<graph id=\"no_id\" edgeids=\"false\" edgemode=\"undirected\">")
+        gxl_file.write("<graph id=\"no_id\" edgeids=\"false\" edgemode=\"undirected\">\n")
         for v, attrs in graph.nodes(data=True):
             gxl_file.write("<node id=\"_" + str(v) + "\">")
             gxl_file.write("<attr name=\"x\"><float>" + str(attrs['attributes'][0]) + "</float></attr>")
             gxl_file.write("<attr name=\"y\"><float>" + str(attrs['attributes'][1]) + "</float></attr>")
-            gxl_file.write("</node>")
+            gxl_file.write("</node>\n")
         for v1, v2, attrs in graph.edges(data=True):
-            gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">")
-            gxl_file.write("</edge>")
+            gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">\n")
+            gxl_file.write("</edge>\n")
         gxl_file.write("</graph></gxl>")
         gxl_file.close()
@@ -466,12 +466,15 @@ def loadDataset(filename, filename_y=None, extra_params=None):
 
 def loadFromXML(filename, extra_params):
     import xml.etree.ElementTree as ET
-    dirname_dataset = dirname(filename)
+    if extra_params:
+        dirname_dataset = extra_params
+    else:
+        dirname_dataset = dirname(filename)
     tree = ET.parse(filename)
     root = tree.getroot()
     data = []
     y = []
-    for graph in root.iter('print'):
+    for graph in root.iter('graph'):
         mol_filename = graph.attrib['file']
         mol_class = graph.attrib['class']
         data.append(loadGXL(dirname_dataset + '/' + mol_filename))
@@ -541,15 +544,22 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None
         dirname_ds += '/'
         if not os.path.exists(dirname_ds) :
             os.makedirs(dirname_ds)
+
+    if xparams is not None and 'graph_dir' in xparams:
+        graph_dir = xparams['graph_dir'] + '/'
+        if not os.path.exists(graph_dir):
+            os.makedirs(graph_dir)
+    else:
+        graph_dir = dirname_ds
 
     if group == 'xml' and gformat == 'gxl':
         with open(filename + '.xml', 'w') as fgroup:
             fgroup.write("<?xml version=\"1.0\"?>")
-            fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">")
+            fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">")
             fgroup.write("\n<GraphCollection>")
             for idx, g in enumerate(Gn):
                 fname_tmp = "graph" + str(idx) + ".gxl"
-                saveGXL(g, dirname_ds + fname_tmp, method=xparams['method'])
+                saveGXL(g, graph_dir + fname_tmp, method=xparams['method'])
                 fgroup.write("\n\t<graph file=\"" + fname_tmp + "\" class=\"" + str(y[idx]) + "\"/>")
             fgroup.write("\n</GraphCollection>")
             fgroup.close()
@@ -558,18 +568,18 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None
 
 
 if __name__ == '__main__':
 #    ### Load dataset from .ds file.
 #    # .ct files.
-    ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds',
-          'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'}
-    Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y'])
-#    ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'}  # node symb
-#    Gn, y = loadDataset(ds['dataset'])
-#    ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'}  # node/edge symb
-#    Gn, y = loadDataset(ds['dataset'])
-#    ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'}  # unlabeled
-#    Gn, y = loadDataset(ds['dataset'])
-    print(Gn[1].nodes(data=True))
-    print(Gn[1].edges(data=True))
-    print(y[1])
+#    ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds',
+#          'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'}
+#    Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y'])
+##    ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'}  # node symb
+##    Gn, y = loadDataset(ds['dataset'])
+##    ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'}  # node/edge symb
+##    Gn, y = loadDataset(ds['dataset'])
+##    ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'}  # unlabeled
+##    Gn, y = loadDataset(ds['dataset'])
+#    print(Gn[1].nodes(data=True))
+#    print(Gn[1].edges(data=True))
+#    print(y[1])
 
 #    # .gxl file.
 #    ds = {'name': 'monoterpenoides',
@@ -579,6 +589,33 @@ if __name__ == '__main__':
 #    print(Gn[1].edges(data=True))
 #    print(y[1])
 
+    ### Convert graph from one format to another.
+    # .gxl file.
+    import networkx as nx
+    ds = {'name': 'monoterpenoides',
+          'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'}  # node/edge symb
+    Gn, y = loadDataset(ds['dataset'])
+    y = [int(i) for i in y]
+    print(Gn[1].nodes(data=True))
+    print(Gn[1].edges(data=True))
+    print(y[1])
+    # Convert a graph to the proper NetworkX format that can be recognized by library gedlib.
+    Gn_new = []
+    for G in Gn:
+        G_new = nx.Graph()
+        for nd, attrs in G.nodes(data=True):
+            G_new.add_node(str(nd), chem=attrs['atom'])
+        for nd1, nd2, attrs in G.edges(data=True):
+            G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
+#            G_new.add_edge(str(nd1), str(nd2))
+        Gn_new.append(G_new)
+    print(Gn_new[1].nodes(data=True))
+    print(Gn_new[1].edges(data=True))
+    print(Gn_new[1])
+    filename = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl/monoterpenoides'
+    xparams = {'method': 'gedlib'}
+    saveDataset(Gn_new, y, gformat='gxl', group='xml', filename=filename, xparams=xparams)
+
 #    ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat',
 #          'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}  # node/edge symb
 #    Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
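+    # A round-trip sketch for the conversion above (illustrative only, reusing
+    # the variables defined above): the xml group file written by saveDataset
+    # can be read back through loadDataset, which routes .xml files to
+    # loadFromXML and takes extra_params as the directory holding the .gxl
+    # graphs.
+#    Gn_back, y_back = loadDataset(filename + '.xml',
+#                                  extra_params=os.path.dirname(filename))
+#    print(Gn_back[1].nodes(data=True))  # should show the 'chem' node labels.
+#    assert len(Gn_back) == len(Gn_new)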