diff --git a/notebooks/run_spkernel.py b/notebooks/run_spkernel.py
index 0f98b3e..fcae61f 100644
--- a/notebooks/run_spkernel.py
+++ b/notebooks/run_spkernel.py
@@ -74,6 +74,7 @@ for ds in dslist:
             extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
             ds_name=ds['name'],
             n_jobs=multiprocessing.cpu_count(),
+#            n_jobs=7,
             read_gm_from_file=False,
             verbose=True)
     print()
diff --git a/preimage/fitDistance.py b/preimage/fitDistance.py
index 81ef711..5268014 100644
--- a/preimage/fitDistance.py
+++ b/preimage/fitDistance.py
@@ -18,31 +18,44 @@ from scipy import optimize
 import cvxpy as cp
 
 import sys
-sys.path.insert(0, "../")
+#sys.path.insert(0, "../")
 from ged import GED, get_nb_edit_operations
 from utils import kernel_distance_matrix
 
-def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
+def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max,
+                               fitkernel=None, gamma=1.0):
     # c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them.
-    random.seed(1)
-    cost_rdm = random.sample(range(1, 10), 5)
-    edit_costs = cost_rdm + [0]
+#    random.seed(1)
+    cost_rdm = random.sample(range(1, 10), 6)
+#    edit_costs = cost_rdm + [0]
+    edit_costs = cost_rdm
+#    edit_costs = [i * 0.01 for i in cost_rdm] + [0]
 #    edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
 #    edit_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
 #    edit_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0]
-    idx_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
+    idx_cost_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
 
     # compute distances in feature space.
-    dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, gkernel=gkernel)
+    coef_dk = 1
+    dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, gkernel=gkernel)
     dis_k_vec = []
     for i in range(len(dis_k_mat)):
         for j in range(i, len(dis_k_mat)):
            dis_k_vec.append(dis_k_mat[i, j])
     dis_k_vec = np.array(dis_k_vec)
+    if fitkernel == None:
+        dis_k_vec_adjusted = dis_k_vec
+    elif fitkernel == 'gaussian':
+        coef_dk = 1 / np.max(dis_k_vec)
+        idx_dk_nonzeros = np.where(dis_k_vec != 0)[0]
+        # remove 0's and constrain d_k to (0, 1].
+        dis_k_vec = dis_k_vec[idx_dk_nonzeros] * coef_dk
+        dis_k_vec_adjusted = np.sqrt(-np.log(dis_k_vec) / gamma)
 
     residual_list = []
     edit_cost_list = []
     time_list = []
+    nb_cost_mat_list = []
 
     for itr in range(itr_max):
         print('\niteration', itr)
@@ -52,15 +65,23 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
         edit_cost_list.append(edit_cost_constant)
         ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_cost_constant,
-                                                           idx_nonzeros, parallel=True)
+                                                           idx_cost_nonzeros, parallel=True)
 
-        residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
+        if fitkernel == None:
+            residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
+        elif fitkernel == 'gaussian':
+            ged_all = np.array(ged_all)[idx_dk_nonzeros]
+            residual = np.sqrt(np.sum(np.square(
+                    np.exp(-gamma * ged_all ** 2) / coef_dk - dis_k_vec)))
         residual_list.append(residual)
 
         # "fit" geds to distances in feature space by tuning edit costs using the
         # Least Squares Method.
         nb_cost_mat = np.array(n_edit_operations).T
-        edit_costs_new, residual = compute_better_costs(nb_cost_mat, dis_k_vec)
+        if fitkernel == 'gaussian':
+            nb_cost_mat = nb_cost_mat[idx_dk_nonzeros]
+        nb_cost_mat_list.append(nb_cost_mat)
+        edit_costs_new, residual = compute_better_costs(nb_cost_mat, dis_k_vec_adjusted)
         print('pseudo residual:', residual)
 
         for i in range(len(edit_costs_new)):
@@ -70,7 +91,7 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
             else:
                 raise ValueError('The edit cost is negative.')
 
-        for idx, item in enumerate(idx_nonzeros):
+        for idx, item in enumerate(idx_cost_nonzeros):
             edit_costs[item] = edit_costs_new[idx]
 
         time_list.append(time.time() - time0)
@@ -78,14 +99,21 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
         print('edit_costs:', edit_costs)
         print('residual_list:', residual_list)
-        
+        print()
 
     edit_cost_list.append(edit_costs)
     ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_costs,
-                                                       idx_nonzeros, parallel=True)
-    residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
+                                                       idx_cost_nonzeros, parallel=True)
+    if fitkernel == None:
+        residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
+    elif fitkernel == 'gaussian':
+        ged_all = np.array(ged_all)[idx_dk_nonzeros]
+        residual = np.sqrt(np.sum(np.square(
+                np.exp(-gamma * ged_all ** 2) / coef_dk - dis_k_vec)))
    residual_list.append(residual)
+    nb_cost_mat_list.append(np.array(n_edit_operations).T)
 
-    return edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list
+    return edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, \
+        time_list, nb_cost_mat_list, coef_dk
 
 
 def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False):
@@ -193,7 +221,10 @@ def compute_better_costs(nb_cost_mat, dis_k_vec):
 #    h = np.array([0 for i in range(nb_cost_mat.shape[1])])
     x = cp.Variable(nb_cost_mat.shape[1])
     cost = cp.sum_squares(nb_cost_mat * x - dis_k_vec)
-    constraints = [x >= [0 for i in range(nb_cost_mat.shape[1])]]
+    constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])],
+#                   np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
+                   np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
+                   np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
     prob = cp.Problem(cp.Minimize(cost), constraints)
     prob.solve()
     edit_costs_new = x.value
diff --git a/preimage/ged.py b/preimage/ged.py
index 49b4403..eaa7294 100644
--- a/preimage/ged.py
+++ b/preimage/ged.py
@@ -9,11 +9,14 @@ import numpy as np
 import networkx as nx
 from tqdm import tqdm
 import sys
+import multiprocessing
+from multiprocessing import Pool
+from functools import partial
 
 from gedlibpy import librariesImport, gedlibpy
 
 def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP',
-        edit_cost_constant=[], saveGXL='benoit', stabilizer='min', repeat=50):
+        edit_cost_constant=[], stabilizer='min', repeat=50):
     """
     Compute GED for 2 graphs.
""" @@ -25,9 +28,11 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP', G_new = nx.Graph() for nd, attrs in G.nodes(data=True): G_new.add_node(str(nd), chem=attrs['atom']) +# G_new.add_node(str(nd), x=str(attrs['attributes'][0]), +# y=str(attrs['attributes'][1])) for nd1, nd2, attrs in G.edges(data=True): -# G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) - G_new.add_edge(str(nd1), str(nd2)) + G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) +# G_new.add_edge(str(nd1), str(nd2)) return G_new @@ -49,6 +54,32 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP', pi_backward = gedlibpy.get_backward_map(g, h) upper = gedlibpy.get_upper_bound(g, h) lower = gedlibpy.get_lower_bound(g, h) + elif stabilizer == 'mean': + # @todo: to be finished... + upper_list = [np.inf] * repeat + for itr in range(repeat): + gedlibpy.run_method(g, h) + upper_list[itr] = gedlibpy.get_upper_bound(g, h) + pi_forward = gedlibpy.get_forward_map(g, h) + pi_backward = gedlibpy.get_backward_map(g, h) + lower = gedlibpy.get_lower_bound(g, h) + upper = np.mean(upper_list) + elif stabilizer == 'median': + if repeat % 2 == 0: + repeat += 1 + upper_list = [np.inf] * repeat + pi_forward_list = [0] * repeat + pi_backward_list = [0] * repeat + for itr in range(repeat): + gedlibpy.run_method(g, h) + upper_list[itr] = gedlibpy.get_upper_bound(g, h) + pi_forward_list[itr] = gedlibpy.get_forward_map(g, h) + pi_backward_list[itr] = gedlibpy.get_backward_map(g, h) + lower = gedlibpy.get_lower_bound(g, h) + upper = np.median(upper_list) + idx_median = upper_list.index(upper) + pi_forward = pi_forward_list[idx_median] + pi_backward = pi_backward_list[idx_median] elif stabilizer == 'min': upper = np.inf for itr in range(repeat): @@ -61,6 +92,18 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP', lower = gedlibpy.get_lower_bound(g, h) if upper == 0: break + elif stabilizer == 'max': + upper = 0 + for itr in range(repeat): + gedlibpy.run_method(g, h) + upper_tmp = gedlibpy.get_upper_bound(g, h) + if upper_tmp > upper: + upper = upper_tmp + pi_forward = gedlibpy.get_forward_map(g, h) + pi_backward = gedlibpy.get_backward_map(g, h) + lower = gedlibpy.get_lower_bound(g, h) + elif stabilizer == 'gaussian': + pass dis = upper @@ -138,23 +181,69 @@ def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP', return dis, pi_forward, pi_backward -def ged_median(Gn, Gn_median, measure='ged', verbose=False, - ged_cost='CHEM_1', ged_method='IPFP', saveGXL='benoit'): - dis_list = [] - pi_forward_list = [] - for idx, G in tqdm(enumerate(Gn), desc='computing median distances', - file=sys.stdout) if verbose else enumerate(Gn): - dis_sum = 0 - pi_forward_list.append([]) - for G_p in Gn_median: - dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p, - cost=ged_cost, method=ged_method, saveGXL=saveGXL) - pi_forward_list[idx].append(pi_tmp_forward) - dis_sum += dis_tmp - dis_list.append(dis_sum) +def ged_median(Gn, Gn_median, verbose=False, params_ged={'lib': 'gedlibpy', + 'cost': 'CHEM_1', 'method': 'IPFP', 'edit_cost_constant': [], + 'stabilizer': 'min', 'repeat': 50}, parallel=False): + if parallel: + len_itr = int(len(Gn)) + pi_forward_list = [[] for i in range(len_itr)] + dis_list = [0 for i in range(len_itr)] + + itr = range(0, len_itr) + n_jobs = multiprocessing.cpu_count() + if len_itr < 100 * n_jobs: + chunksize = int(len_itr / n_jobs) + 1 + else: + chunksize = 100 + def init_worker(gn_toshare, gn_median_toshare): + global G_gn, G_gn_median + G_gn = gn_toshare + G_gn_median = 
gn_median_toshare
+        do_partial = partial(_compute_ged_median, params_ged)
+        pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn, Gn_median))
+        if verbose:
+            iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
+                            desc='computing GEDs', file=sys.stdout)
+        else:
+            iterator = pool.imap_unordered(do_partial, itr, chunksize)
+        for i, dis_sum, pi_forward in iterator:
+            pi_forward_list[i] = pi_forward
+            dis_list[i] = dis_sum
+#            print('\n-------------------------------------------')
+#            print(i, j, idx_itr, dis)
+        pool.close()
+        pool.join()
+        
+    else:
+        dis_list = []
+        pi_forward_list = []
+        for idx, G in tqdm(enumerate(Gn), desc='computing median distances',
+                           file=sys.stdout) if verbose else enumerate(Gn):
+            dis_sum = 0
+            pi_forward_list.append([])
+            for G_p in Gn_median:
+                dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p,
+                                                               **params_ged)
+                pi_forward_list[idx].append(pi_tmp_forward)
+                dis_sum += dis_tmp
+            dis_list.append(dis_sum)
+    
     return dis_list, pi_forward_list
 
 
+def _compute_ged_median(params_ged, itr):
+#    print(itr)
+    dis_sum = 0
+    pi_forward = []
+    for G_p in G_gn_median:
+        dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G_gn[itr], G_p,
+                                                       **params_ged)
+        pi_forward.append(pi_tmp_forward)
+        dis_sum += dis_tmp
+        
+    return itr, dis_sum, pi_forward
+    
+
 def get_nb_edit_operations(g1, g2, forward_map, backward_map):
     """Compute the number of each edit operations.
     """
diff --git a/preimage/iam.py b/preimage/iam.py
index a7b01b9..fa38582 100644
--- a/preimage/iam.py
+++ b/preimage/iam.py
@@ -22,20 +22,22 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
                  epsilon=0.001, node_label='atom', edge_label='bond_type',
                  connected=False, removeNodes=True, allBestInit=False,
                  allBestNodes=False, allBestEdges=False, allBestOutput=False,
-                 params_ged={'ged_cost': 'CHEM_1', 'ged_method': 'IPFP', 'saveGXL': 'benoit'}):
+                 params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP',
+                             'edit_cost_constant': [], 'stabilizer': 'min', 'repeat': 50}):
     """See my name, then you know what I do.
     """
 #    Gn_median = Gn_median[0:10]
 #    Gn_median = [nx.convert_node_labels_to_integers(g) for g in Gn_median]
-    if removeNodes:
-        node_ir = np.inf # corresponding to the node remove and insertion.
-        label_r = 'thanksdanny' # the label for node remove. # @todo: make this label unrepeatable.
+    node_ir = np.inf # corresponding to node removal and insertion.
+    label_r = 'thanksdanny' # the label for node removal. # @todo: make this label unrepeatable.
     ds_attrs = get_dataset_attributes(Gn_median + Gn_candidate,
                                       attr_names=['edge_labeled', 'node_attr_dim', 'edge_attr_dim'],
                                       edge_label=edge_label)
+    node_label_set = get_node_labels(Gn_median, node_label)
+    edge_label_set = get_edge_labels(Gn_median, edge_label)
 
-    def generate_graph(G, pi_p_forward, label_set):
+    def generate_graph(G, pi_p_forward):
         G_new_list = [G.copy()] # all "best" graphs generated in this iteration.
 #        nx.draw_networkx(G)
 #        import matplotlib.pyplot as plt
@@ -52,7 +54,7 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
         for ndi, (nd, _) in enumerate(G.nodes(data=True)):
             h_i0_list = []
             label_list = []
-            for label in label_set:
+            for label in node_label_set:
                 h_i0 = 0
                 for idx, g in enumerate(Gn_median):
                     pi_i = pi_p_forward[idx][ndi]
@@ -62,7 +64,7 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
                 label_list.append(label)
             # case when the node is to be removed.
             if removeNodes:
-                h_i0_remove = 0 # @todo: maybe this can be added to the label_set above.
+ h_i0_remove = 0 # @todo: maybe this can be added to the node_label_set above. for idx, g in enumerate(Gn_median): pi_i = pi_p_forward[idx][ndi] if pi_i == node_ir: @@ -91,11 +93,10 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, G_new_list = [ggg.copy() for ggg in G_new_list_nd] else: # choose one of the best randomly. - h_ij0_max = h_i0_list[idx_max[0]] idx_rdm = random.randint(0, len(idx_max) - 1) best_label = label_list[idx_max[idx_rdm]] - - # check whether a_ij is 0 or 1. + h_i0_max = h_i0_list[idx_max[idx_rdm]] + g_new = G_new_list[0] if best_label == label_r: g_new.remove_node(nd) @@ -134,8 +135,7 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, # for nd1, nd2, _ in g_new.edges(data=True): h_ij0_list = [] label_list = [] - # @todo: compute edge label set before. - for label in get_edge_labels(Gn_median, edge_label): + for label in edge_label_set: h_ij0 = 0 for idx, g in enumerate(Gn_median): pi_i = pi_p_forward[idx][nd1i] @@ -176,9 +176,9 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, G_new_list_ed.append(g_tmp_copy) g_tmp_list = [ggg.copy() for ggg in G_new_list_ed] else: # choose one of the best randomly. - h_ij0_max = h_ij0_list[idx_max[0]] idx_rdm = random.randint(0, len(idx_max) - 1) best_label = label_list[idx_max[idx_rdm]] + h_ij0_max = h_ij0_list[idx_max[idx_rdm]] # check whether a_ij is 0 or 1. sij_norm = 0 @@ -192,6 +192,7 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, g_new.add_edge(nd1, nd2) g_new.edges[nd1, nd2][edge_label] = best_label else: +# elif h_ij0_max < len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es): if g_new.has_edge(nd1, nd2): g_new.remove_edge(nd1, nd2) g_tmp_list = [g_new] @@ -221,8 +222,8 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, if g_tmp.has_node(nd1) and g_tmp.has_node(nd2) \ and not g_tmp.has_edge(nd1, nd2): g_tmp.add_edge(nd1, nd2) -# else: # @todo: which to use? - elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei): + else: # @todo: which to use? +# elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei): if g_tmp.has_edge(nd1, nd2): g_tmp.remove_edge(nd1, nd2) # do not change anything when equal. @@ -238,7 +239,7 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, # # find the best graph generated in this iteration and update pi_p. # @todo: should we update all graphs generated or just the best ones? dis_list, pi_forward_list = ged_median(G_new_list, Gn_median, - **params_ged) + params_ged=params_ged) # @todo: should we remove the identical and connectivity check? # Don't know which is faster. if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0: @@ -283,15 +284,16 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, # while itr < ite_max and (np.abs(old_sod - cur_sod) > epsilon or # np.abs(old_sod - cur_sod) == 0): while itr < ite_max and np.abs(old_sod - cur_sod) > epsilon: +# while itr < ite_max: # for itr in range(0, 5): # the convergence condition? 
print('itr_iam is', itr) G_new_list = [] pi_forward_new_list = [] dis_new_list = [] for idx, g in enumerate(G_list): - label_set = get_node_labels(Gn_median + [g], node_label) +# label_set = get_node_labels(Gn_median + [g], node_label) G_tmp_list, pi_forward_tmp_list, dis_tmp_list = generate_graph( - g, pi_forward_list[idx], label_set) + g, pi_forward_list[idx]) G_new_list += G_tmp_list pi_forward_new_list += pi_forward_tmp_list dis_new_list += dis_tmp_list @@ -325,7 +327,7 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, print('\nsods:', sod_list, '\n') - return G_list, pi_forward_list, dis_min + return G_list, pi_forward_list, dis_min, sod_list def remove_duplicates(Gn): @@ -363,7 +365,8 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, # compute set-median. dis_min = np.inf dis_list, pi_forward_all = ged_median(Gn_candidate, Gn_median, - **params_ged) + params_ged=params_ged, parallel=True) + print('finish computing GEDs.') # find all smallest distances. if allBestInit: # try all best init graphs. idx_min_list = range(len(dis_list)) @@ -371,19 +374,26 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, else: idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist() dis_min = [dis_list[idx_min_list[0]]] * len(idx_min_list) + idx_min_rdm = random.randint(0, len(idx_min_list) - 1) + idx_min_list = [idx_min_list[idx_min_rdm]] + sod_set_median = np.min(dis_min) # phase 2: iteration. G_list = [] dis_list = [] pi_forward_list = [] + G_set_median_list = [] +# sod_list = [] for idx_tmp, idx_min in enumerate(idx_min_list): # print('idx_min is', idx_min) G = Gn_candidate[idx_min].copy() + G_set_median_list.append(G.copy()) # list of edit operations. pi_p_forward = pi_forward_all[idx_min] # pi_p_backward = pi_all_backward[idx_min] - Gi_list, pi_i_forward_list, dis_i_min = iteration_proc(G, pi_p_forward, dis_min[idx_tmp]) + Gi_list, pi_i_forward_list, dis_i_min, sod_list = iteration_proc(G, + pi_p_forward, dis_min[idx_tmp]) G_list += Gi_list dis_list += [dis_i_min] * len(Gi_list) pi_forward_list += pi_i_forward_list @@ -409,9 +419,9 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, # print(g.edges(data=True)) # get the best median graphs - G_min_list, pi_forward_min_list, dis_min = best_median_graphs( + G_gen_median_list, pi_forward_min_list, sod_gen_median = best_median_graphs( G_list, pi_forward_list, dis_list) -# for g in G_min_list: +# for g in G_gen_median_list: # nx.draw_networkx(g) # plt.show() # print(g.nodes(data=True)) @@ -419,10 +429,10 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, if not allBestOutput: # randomly choose one graph. 
-        idx_rdm = random.randint(0, len(G_min_list) - 1)
-        G_min_list = [G_min_list[idx_rdm]]
+        idx_rdm = random.randint(0, len(G_gen_median_list) - 1)
+        G_gen_median_list = [G_gen_median_list[idx_rdm]]
 
-    return G_min_list, dis_min
+    return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median
diff --git a/preimage/median.py b/preimage/median.py
index 2844043..ed3e6cd 100644
--- a/preimage/median.py
+++ b/preimage/median.py
@@ -5,8 +5,8 @@ import numpy as np
 import networkx as nx
 import time
 
-import librariesImport
-import script
+from gedlibpy import librariesImport, gedlibpy
+#import script
 sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/")
 import pygraph
 from pygraph.utils.graphfiles import loadDataset
diff --git a/preimage/preimage_iam.py b/preimage/preimage_iam.py
index ff16955..bf79d0e 100644
--- a/preimage/preimage_iam.py
+++ b/preimage/preimage_iam.py
@@ -27,8 +27,9 @@ def preimage_iam(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max,
                  params_iam={'c_ei': 1, 'c_er': 1, 'c_es': 1,
                              'ite_max': 50, 'epsilon': 0.001,
                              'removeNodes': True, 'connected': False},
-                 params_ged={'ged_cost': 'CHEM_1', 'ged_method': 'IPFP',
-                             'saveGXL': 'benoit'}):
+                 params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP',
+                             'edit_cost_constant': [], 'stabilizer': 'min',
+                             'repeat': 50}):
     """This function constructs graph pre-image by the iterative pre-image
     framework in reference [1], algorithm 1, where the step of generating new
     graphs randomly is replaced by the IAM algorithm in reference [2].
@@ -91,12 +92,12 @@ def preimage_iam(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max,
             ghat_new_list = []
             for g_tmp in Gk:
                 Gn_nearest_init = [g_tmp.copy()]
-                ghat_new_list_tmp, _ = iam_upgraded(Gn_nearest_median,
+                ghat_new_list_tmp, _, _, _, _ = iam_upgraded(Gn_nearest_median,
                     Gn_nearest_init, params_ged=params_ged, **params_iam)
                 ghat_new_list += ghat_new_list_tmp
         else: # only the best graph in D_k is used to initialize IAM.
             Gn_nearest_init = [g.copy() for g in Gk]
-            ghat_new_list, _ = iam_upgraded(Gn_nearest_median, Gn_nearest_init,
+            ghat_new_list, _, _, _, _ = iam_upgraded(Gn_nearest_median, Gn_nearest_init,
                 params_ged=params_ged, **params_iam)
 
 #        for g in g_tmp_list:
@@ -181,8 +182,9 @@ def preimage_iam_random_mix(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max,
                  params_iam={'c_ei': 1, 'c_er': 1, 'c_es': 1,
                              'ite_max': 50, 'epsilon': 0.001,
                              'removeNodes': True, 'connected': False},
-                 params_ged={'ged_cost': 'CHEM_1', 'ged_method': 'IPFP',
-                             'saveGXL': 'benoit'}):
+                 params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1',
+                             'method': 'IPFP', 'edit_cost_constant': [],
+                             'stabilizer': 'min', 'repeat': 50}):
     """This function constructs graph pre-image by the iterative pre-image
     framework in reference [1], algorithm 1, where new graphs are generated
     randomly and by the IAM algorithm in reference [2].
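
Note on the fitting step above: the constrained least squares solved in compute_better_costs
(preimage/fitDistance.py) treats the six edit costs as variables, with each row of nb_cost_mat
counting the edit operations applied to one graph pair and dis_k_vec holding the target
kernel-space distances. A minimal, self-contained sketch of that optimization follows; the
operation counts and targets here are made-up stand-ins for the real nb_cost_mat and dis_k_vec:

    import numpy as np
    import cvxpy as cp

    rng = np.random.default_rng(0)
    nb_cost_mat = rng.random((10, 6))  # toy data: one row per graph pair, one column per
                                       # operation count (c_vi, c_vr, c_vs, c_ei, c_er, c_es)
    dis_k_vec = rng.random(10)         # toy target distances in kernel feature space

    x = cp.Variable(nb_cost_mat.shape[1])
    cost = cp.sum_squares(nb_cost_mat @ x - dis_k_vec)
    constraints = [x >= 0.01,                                      # keep every cost strictly positive
                   np.array([1., 1., -1., 0., 0., 0.]) @ x >= 0.,  # c_vs <= c_vi + c_vr
                   np.array([0., 0., 0., 1., 1., -1.]) @ x >= 0.]  # c_es <= c_ei + c_er
    prob = cp.Problem(cp.Minimize(cost), constraints)
    prob.solve()
    print(prob.status, x.value)        # fitted edit costs

The two linear constraints are the conditions the patch adds: substituting a node or edge label
should never cost more than removing and re-inserting it.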
diff --git a/preimage/test_fitDistance.py b/preimage/test_fitDistance.py index 4ee26b6..bc941d4 100644 --- a/preimage/test_fitDistance.py +++ b/preimage/test_fitDistance.py @@ -7,7 +7,10 @@ Created on Thu Oct 24 11:50:56 2019 """ from matplotlib import pyplot as plt import numpy as np +from tqdm import tqdm +import sys +sys.path.insert(0, "../") from pygraph.utils.graphfiles import loadDataset from utils import remove_edges from fitDistance import fit_GED_to_kernel_distance @@ -21,21 +24,22 @@ def test_anycosts(): remove_edges(Gn) gkernel = 'marginalizedkernel' itr_max = 10 - edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list = \ - fit_GED_to_kernel_distance(Gn, gkernel, itr_max) + edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \ + nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, gkernel, itr_max) total_time = np.sum(time_list) print('\nedit_costs:', edit_costs) print('\nresidual_list:', residual_list) print('\nedit_cost_list:', edit_cost_list) print('\ndistance matrix in kernel space:', dis_k_mat) print('\nged matrix:', ged_mat) - print('total time:', total_time) + print('\ntotal time:', total_time) + print('\nnb_cost_mat:', nb_cost_mat_list[-1]) np.savez('results/fit_distance.any_costs.gm', edit_costs=edit_costs, residual_list=residual_list, edit_cost_list=edit_cost_list, dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, - total_time=total_time) + total_time=total_time, nb_cost_mat_list=nb_cost_mat_list) - # normalized distance matrices. +# # normalized distance matrices. # gmfile = np.load('results/fit_distance.any_costs.gm.npz') # edit_costs = gmfile['edit_costs'] # residual_list = gmfile['residual_list'] @@ -43,72 +47,256 @@ def test_anycosts(): # dis_k_mat = gmfile['dis_k_mat'] # ged_mat = gmfile['ged_mat'] # total_time = gmfile['total_time'] +## nb_cost_mat_list = gmfile['nb_cost_mat_list'] norm_dis_k_mat = normalize_distance_matrix(dis_k_mat) plt.imshow(norm_dis_k_mat) plt.colorbar() plt.savefig('results/norm_dis_k_mat.any_costs' + '.eps', format='eps', dpi=300) -# plt.savefig('results/norm_dis_k_mat.any_costs' + '.jpg', format='jpg') +# plt.savefig('results/norm_dis_k_mat.any_costs' + '.png', format='png') # plt.show() plt.clf() + norm_ged_mat = normalize_distance_matrix(ged_mat) plt.imshow(norm_ged_mat) plt.colorbar() plt.savefig('results/norm_ged_mat.any_costs' + '.eps', format='eps', dpi=300) -# plt.savefig('results/norm_ged_mat.any_costs' + '.jpg', format='jpg') +# plt.savefig('results/norm_ged_mat.any_costs' + '.png', format='png') +# plt.show() + plt.clf() + + norm_diff = norm_ged_mat - norm_dis_k_mat + plt.imshow(norm_diff) + plt.colorbar() + plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.eps', format='eps', dpi=300) +# plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.png', format='png') # plt.show() plt.clf() +# draw_count_bar(norm_diff) def test_cs_leq_ci_plus_cr(): """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er """ - ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', - 'extra_params': {}} # node/edge symb - Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) + ds = {'name': 'monoterpenoides', + 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb + Gn, y_all = loadDataset(ds['dataset']) # Gn = Gn[0:10] - remove_edges(Gn) - gkernel = 'marginalizedkernel' + gkernel = 'untilhpathkernel' + node_label = 'atom' + edge_label = 'bond_type' itr_max = 10 - edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list = \ - 
fit_GED_to_kernel_distance(Gn, gkernel, itr_max) + edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \ + nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label, + gkernel, itr_max, + fitkernel='gaussian') total_time = np.sum(time_list) print('\nedit_costs:', edit_costs) print('\nresidual_list:', residual_list) print('\nedit_cost_list:', edit_cost_list) print('\ndistance matrix in kernel space:', dis_k_mat) print('\nged matrix:', ged_mat) - print('total time:', total_time) - np.savez('results/fit_distance.cs_leq_ci_plus_cr.gm', edit_costs=edit_costs, + print('\ntotal time:', total_time) + print('\nnb_cost_mat:', nb_cost_mat_list[-1]) + np.savez('results/fit_distance.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel.gm', + edit_costs=edit_costs, residual_list=residual_list, edit_cost_list=edit_cost_list, dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, - total_time=total_time) + total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, + coef_dk=coef_dk) + +# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', +# 'extra_params': {}} # node/edge symb +# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) +## Gn = Gn[0:10] +## remove_edges(Gn) +# gkernel = 'untilhpathkernel' +# node_label = 'atom' +# edge_label = 'bond_type' +# itr_max = 10 +# edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \ +# nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label, +# gkernel, itr_max) +# total_time = np.sum(time_list) +# print('\nedit_costs:', edit_costs) +# print('\nresidual_list:', residual_list) +# print('\nedit_cost_list:', edit_cost_list) +# print('\ndistance matrix in kernel space:', dis_k_mat) +# print('\nged matrix:', ged_mat) +# print('\ntotal time:', total_time) +# print('\nnb_cost_mat:', nb_cost_mat_list[-1]) +# np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.mutag.elabeled.uhpkernel.gm', +# edit_costs=edit_costs, +# residual_list=residual_list, edit_cost_list=edit_cost_list, +# dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, +# total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk) + + +# # normalized distance matrices. 
+# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz') +# edit_costs = gmfile['edit_costs'] +# residual_list = gmfile['residual_list'] +# edit_cost_list = gmfile['edit_cost_list'] +# dis_k_mat = gmfile['dis_k_mat'] +# ged_mat = gmfile['ged_mat'] +# total_time = gmfile['total_time'] +# nb_cost_mat_list = gmfile['nb_cost_mat_list'] +# coef_dk = gmfile['coef_dk'] + + nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat) + print(nb_consistent, nb_inconsistent, ratio_consistent) + +# dis_k_sub = pairwise_substitution(dis_k_mat) +# ged_sub = pairwise_substitution(ged_mat) +# np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.cost_leq_1en2.gm', +# dis_k_sub=dis_k_sub, ged_sub=ged_sub) + + + norm_dis_k_mat = normalize_distance_matrix(dis_k_mat) + plt.imshow(norm_dis_k_mat) + plt.colorbar() + plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel' + + '.eps', format='eps', dpi=300) + plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel' + + '.png', format='png') +# plt.show() + plt.clf() + + norm_ged_mat = normalize_distance_matrix(ged_mat) + plt.imshow(norm_ged_mat) + plt.colorbar() + plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel' + + '.eps', format='eps', dpi=300) + plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel' + + '.png', format='png') +# plt.show() + plt.clf() + + norm_diff = norm_ged_mat - norm_dis_k_mat + plt.imshow(norm_diff) + plt.colorbar() + plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel' + + '.eps', format='eps', dpi=300) + plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel' + + '.png', format='png') +# plt.show() + plt.clf() +# draw_count_bar(norm_diff) + + +def test_unfitted(): + """unfitted. + """ + from fitDistance import compute_geds + from utils import kernel_distance_matrix + ds = {'name': 'monoterpenoides', + 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb + Gn, y_all = loadDataset(ds['dataset']) +# Gn = Gn[0:10] + gkernel = 'untilhpathkernel' + node_label = 'atom' + edge_label = 'bond_type' + + +# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', +# 'extra_params': {}} # node/edge symb +# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) +## Gn = Gn[0:10] +## remove_edges(Gn) +# gkernel = 'marginalizedkernel' + + dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, gkernel=gkernel) + ged_all, ged_mat, n_edit_operations = compute_geds(Gn, [3, 3, 1, 3, 3, 1], + [0, 1, 2, 3, 4, 5], parallel=True) + print('\ndistance matrix in kernel space:', dis_k_mat) + print('\nged matrix:', ged_mat) +# np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.gm', edit_costs=edit_costs, +# residual_list=residual_list, edit_cost_list=edit_cost_list, +# dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list, +# total_time=total_time, nb_cost_mat_list=nb_cost_mat_list) # normalized distance matrices. 
-#    gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.gm.npz')
+#    gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en3.gm.npz')
 #    edit_costs = gmfile['edit_costs']
 #    residual_list = gmfile['residual_list']
 #    edit_cost_list = gmfile['edit_cost_list']
 #    dis_k_mat = gmfile['dis_k_mat']
 #    ged_mat = gmfile['ged_mat']
 #    total_time = gmfile['total_time']
+#    nb_cost_mat_list = gmfile['nb_cost_mat_list']
+    
+    nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
+    print(nb_consistent, nb_inconsistent, ratio_consistent)
     
     norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
     plt.imshow(norm_dis_k_mat)
     plt.colorbar()
-    plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr' + '.eps', format='eps', dpi=300)
-#    plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr' + '.jpg', format='jpg')
+    plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
+    plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.png', format='png')
 #    plt.show()
     plt.clf()
+    
     norm_ged_mat = normalize_distance_matrix(ged_mat)
     plt.imshow(norm_ged_mat)
     plt.colorbar()
-    plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr' + '.eps', format='eps', dpi=300)
-#    plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr' + '.jpg', format='jpg')
+    plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
+    plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.png', format='png')
+#    plt.show()
+    plt.clf()
+    
+    norm_diff = norm_ged_mat - norm_dis_k_mat
+    plt.imshow(norm_diff)
+    plt.colorbar()
+    plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
+    plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.png', format='png')
 #    plt.show()
     plt.clf()
+    draw_count_bar(norm_diff)
+    
+
+def pairwise_substitution_consistence(mat1, mat2):
+    """Count how often the pairwise orderings of two symmetric distance matrices agree.
+    """
+    nb_consistent = 0
+    nb_inconsistent = 0
+    # the matrices are considered symmetric.
+    upper_tri1 = mat1[np.triu_indices_from(mat1)]
+    upper_tri2 = mat2[np.triu_indices_from(mat2)]
+    for i in tqdm(range(len(upper_tri1)), desc='computing consistency', file=sys.stdout):
+        for j in range(i, len(upper_tri1)):
+            if np.sign(upper_tri1[i] - upper_tri1[j]) == np.sign(upper_tri2[i] - upper_tri2[j]):
+                nb_consistent += 1
+            else:
+                nb_inconsistent += 1
+    return nb_consistent, nb_inconsistent, nb_consistent / (nb_consistent + nb_inconsistent)
+
+
+def pairwise_substitution(mat):
+    # the matrix is considered symmetric.
+ upper_tri = mat[np.triu_indices_from(mat)] + sub_list = [] + for i in tqdm(range(len(upper_tri)), desc='computing', file=sys.stdout): + for j in range(i, len(upper_tri)): + sub_list.append(upper_tri[i] - upper_tri[j]) + return sub_list + + +def draw_count_bar(norm_diff): + import pandas + from collections import Counter, OrderedDict + norm_diff_cnt = norm_diff.flatten() + norm_diff_cnt = norm_diff_cnt * 10 + norm_diff_cnt = np.floor(norm_diff_cnt) + norm_diff_cnt = Counter(norm_diff_cnt) + norm_diff_cnt = OrderedDict(sorted(norm_diff_cnt.items())) + df = pandas.DataFrame.from_dict(norm_diff_cnt, orient='index') + df.plot(kind='bar') if __name__ == '__main__': - test_anycosts() - test_cs_leq_ci_plus_cr() \ No newline at end of file +# test_anycosts() +# test_cs_leq_ci_plus_cr() + test_unfitted() + +# x = np.array([[1,2,3],[4,5,6],[7,8,9]]) +# xx = pairwise_substitution(x) \ No newline at end of file diff --git a/preimage/test_iam.py b/preimage/test_iam.py index ee51e4b..82d5446 100644 --- a/preimage/test_iam.py +++ b/preimage/test_iam.py @@ -17,9 +17,363 @@ import random import sys sys.path.insert(0, "../") from pygraph.utils.graphfiles import loadDataset +#from pygraph.utils.logger2file import * from iam import iam_upgraded -from utils import remove_edges, compute_kernel, get_same_item_indices -from ged import ged_median +from utils import remove_edges, compute_kernel, get_same_item_indices, dis_gstar +#from ged import ged_median + +def test_iam_monoterpenoides(): + ds = {'name': 'monoterpenoides', + 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb + Gn, y_all = loadDataset(ds['dataset']) +# Gn = Gn[0:50] + gkernel = 'untilhpathkernel' + node_label = 'atom' + edge_label = 'bond_type' + + # parameters for GED function from the IAM paper. + # fitted edit costs (Gaussian). + c_vi = 0.03620133402089074 + c_vr = 0.0417574590207099 + c_vs = 0.009992282328587499 + c_ei = 0.08293120042342755 + c_er = 0.09512220476358019 + c_es = 0.09222529696841467 +# # fitted edit costs (linear combinations). +# c_vi = 0.1749684054238749 +# c_vr = 0.0734054228711457 +# c_vs = 0.05017781726016715 +# c_ei = 0.1869431164806936 +# c_er = 0.32055856948274 +# c_es = 0.2569469379247611 +# # unfitted edit costs. +# c_vi = 3 +# c_vr = 3 +# c_vs = 1 +# c_ei = 3 +# c_er = 3 +# c_es = 1 + ite_max_iam = 50 + epsilon_iam = 0.001 + removeNodes = False + connected_iam = False + # parameters for IAM function +# ged_cost = 'CONSTANT' + ged_cost = 'CONSTANT' + ged_method = 'IPFP' + edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] +# edit_cost_constant = [] + ged_stabilizer = 'min' + ged_repeat = 50 + params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, + 'edit_cost_constant': edit_cost_constant, + 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} + + # classify graphs according to letters. 
+ time_list = [] + dis_ks_min_list = [] + dis_ks_set_median_list = [] + sod_gs_list = [] + g_best = [] + sod_set_median_list = [] + sod_list_list = [] + idx_dict = get_same_item_indices(y_all) + for y_class in idx_dict: + print('\n-------------------------------------------------------') + print('class of y:', y_class) + Gn_class = [Gn[i].copy() for i in idx_dict[y_class]] + + time_list.append([]) + dis_ks_min_list.append([]) + dis_ks_set_median_list.append([]) + sod_gs_list.append([]) + g_best.append([]) + sod_set_median_list.append([]) + + for repeat in range(50): + idx_rdm = random.sample(range(len(Gn_class)), 10) + print('graphs chosen:', idx_rdm) + Gn_median = [Gn_class[idx].copy() for idx in idx_rdm] + Gn_candidate = [g.copy() for g in Gn_median] + + alpha_range = [1 / len(Gn_median)] * len(Gn_median) + time0 = time.time() + G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \ + = iam_upgraded(Gn_median, + Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, + epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes, + params_ged=params_ged) + time_total = time.time() - time0 + print('\ntime: ', time_total) + time_list[-1].append(time_total) + g_best[-1].append(G_gen_median_list[0]) + sod_set_median_list[-1].append(sod_set_median) + print('\nsmallest sod of the set median:', sod_set_median) + sod_gs_list[-1].append(sod_gen_median) + print('\nsmallest sod in graph space:', sod_gen_median) + sod_list_list.append(sod_list) + + # show the best graph and save it to file. + print('one of the possible corresponding pre-images is') + nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'), + with_labels=True) +# plt.show() + # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) + +# plt.savefig('results/iam/paper_compare/monoter_y' + str(y_class) + +# '_repeat' + str(repeat) + '_' + str(time.time()) + +# '.png', format="PNG") + plt.clf() + # print(G_gen_median_list[0].nodes(data=True)) + # print(G_gen_median_list[0].edges(data=True)) + + + # compute distance between \psi and the set median graph. + knew_set_median = compute_kernel(G_set_median_list + Gn_median, + gkernel, node_label, edge_label, False) + dhat_new_set_median_list = [] + for idx, g_tmp in enumerate(G_set_median_list): + # @todo: the term3 below could use the one at the beginning of the function. + dhat_new_set_median_list.append(dis_gstar(idx, range(len(G_set_median_list), + len(G_set_median_list) + len(Gn_median) + 1), + alpha_range, knew_set_median, withterm3=False)) + + print('\ndistance in kernel space of set median: ', dhat_new_set_median_list[0]) + dis_ks_set_median_list[-1].append(dhat_new_set_median_list[0]) + + + # compute distance between \psi and the new generated graphs. + knew = compute_kernel(G_gen_median_list + Gn_median, gkernel, node_label, + edge_label, False) + dhat_new_list = [] + for idx, g_tmp in enumerate(G_gen_median_list): + # @todo: the term3 below could use the one at the beginning of the function. 
+ dhat_new_list.append(dis_gstar(idx, range(len(G_gen_median_list), + len(G_gen_median_list) + len(Gn_median) + 1), + alpha_range, knew, withterm3=False)) + + print('\nsmallest distance in kernel space: ', dhat_new_list[0]) + dis_ks_min_list[-1].append(dhat_new_list[0]) + + + print('\nsods of the set median for this class:', sod_set_median_list[-1]) + print('\nsods in graph space for this class:', sod_gs_list[-1]) + print('\ndistance in kernel space of set median for this class:', + dis_ks_set_median_list[-1]) + print('\nsmallest distances in kernel space for this class:', + dis_ks_min_list[-1]) + print('\ntimes for this class:', time_list[-1]) + + sod_set_median_list[-1] = np.mean(sod_set_median_list[-1]) + sod_gs_list[-1] = np.mean(sod_gs_list[-1]) + dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1]) + dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1]) + time_list[-1] = np.mean(time_list[-1]) + + print() + print('\nmean sods of the set median for each class:', sod_set_median_list) + print('\nmean sods in graph space for each class:', sod_gs_list) + print('\ndistances in kernel space of set median for each class:', + dis_ks_set_median_list) + print('\nmean smallest distances in kernel space for each class:', + dis_ks_min_list) + print('\nmean times for each class:', time_list) + + print('\nmean sods of the set median of all:', np.mean(sod_set_median_list)) + print('\nmean sods in graph space of all:', np.mean(sod_gs_list)) + print('\nmean distances in kernel space of set median of all:', + np.mean(dis_ks_set_median_list)) + print('\nmean smallest distances in kernel space of all:', + np.mean(dis_ks_min_list)) + print('\nmean times of all:', np.mean(time_list)) + + nb_better_sods = 0 + nb_worse_sods = 0 + nb_same_sods = 0 + for sods in sod_list_list: + if sods[0] > sods[-1]: + nb_better_sods += 1 + elif sods[0] < sods[-1]: + nb_worse_sods += 1 + else: + nb_same_sods += 1 + print('\n In', str(len(sod_list_list)), 'sod lists,', str(nb_better_sods), + 'are getting better,', str(nb_worse_sods), 'are getting worse,', + str(nb_same_sods), 'are not changed; ', str(nb_better_sods / len(sod_list_list)), + 'sods are improved.') + + +def test_iam_mutag(): + ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', + 'extra_params': {}} # node/edge symb + Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) +# Gn = Gn[0:50] + gkernel = 'untilhpathkernel' + node_label = 'atom' + edge_label = 'bond_type' + + # parameters for GED function from the IAM paper. + # fitted edit costs. + c_vi = 0.03523843108436513 + c_vr = 0.03347339739350128 + c_vs = 0.06871290673612238 + c_ei = 0.08591999846720685 + c_er = 0.07962086440894103 + c_es = 0.08596855855478233 + # unfitted edit costs. +# c_vi = 3 +# c_vr = 3 +# c_vs = 1 +# c_ei = 3 +# c_er = 3 +# c_es = 1 + ite_max_iam = 50 + epsilon_iam = 0.001 + removeNodes = False + connected_iam = False + # parameters for IAM function +# ged_cost = 'CONSTANT' + ged_cost = 'CONSTANT' + ged_method = 'IPFP' + edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] +# edit_cost_constant = [] + ged_stabilizer = 'min' + ged_repeat = 50 + params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, + 'edit_cost_constant': edit_cost_constant, + 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} + + # classify graphs according to letters. 
+ time_list = [] + dis_ks_min_list = [] + dis_ks_set_median_list = [] + sod_gs_list = [] + g_best = [] + sod_set_median_list = [] + sod_list_list = [] + idx_dict = get_same_item_indices(y_all) + for y_class in idx_dict: + print('\n-------------------------------------------------------') + print('class of y:', y_class) + Gn_class = [Gn[i].copy() for i in idx_dict[y_class]] + + time_list.append([]) + dis_ks_min_list.append([]) + dis_ks_set_median_list.append([]) + sod_gs_list.append([]) + g_best.append([]) + sod_set_median_list.append([]) + + for repeat in range(50): + idx_rdm = random.sample(range(len(Gn_class)), 10) + print('graphs chosen:', idx_rdm) + Gn_median = [Gn_class[idx].copy() for idx in idx_rdm] + Gn_candidate = [g.copy() for g in Gn_median] + + alpha_range = [1 / len(Gn_median)] * len(Gn_median) + time0 = time.time() + G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \ + = iam_upgraded(Gn_median, + Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam, + epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes, + params_ged=params_ged) + time_total = time.time() - time0 + print('\ntime: ', time_total) + time_list[-1].append(time_total) + g_best[-1].append(G_gen_median_list[0]) + sod_set_median_list[-1].append(sod_set_median) + print('\nsmallest sod of the set median:', sod_set_median) + sod_gs_list[-1].append(sod_gen_median) + print('\nsmallest sod in graph space:', sod_gen_median) + sod_list_list.append(sod_list) + + # show the best graph and save it to file. + print('one of the possible corresponding pre-images is') + nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'), + with_labels=True) +# plt.show() + # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) + +# plt.savefig('results/iam/paper_compare/mutag_y' + str(y_class) + +# '_repeat' + str(repeat) + '_' + str(time.time()) + +# '.png', format="PNG") + plt.clf() + # print(G_gen_median_list[0].nodes(data=True)) + # print(G_gen_median_list[0].edges(data=True)) + + + # compute distance between \psi and the set median graph. + knew_set_median = compute_kernel(G_set_median_list + Gn_median, + gkernel, node_label, edge_label, False) + dhat_new_set_median_list = [] + for idx, g_tmp in enumerate(G_set_median_list): + # @todo: the term3 below could use the one at the beginning of the function. + dhat_new_set_median_list.append(dis_gstar(idx, range(len(G_set_median_list), + len(G_set_median_list) + len(Gn_median) + 1), + alpha_range, knew_set_median, withterm3=False)) + + print('\ndistance in kernel space of set median: ', dhat_new_set_median_list[0]) + dis_ks_set_median_list[-1].append(dhat_new_set_median_list[0]) + + + # compute distance between \psi and the new generated graphs. + knew = compute_kernel(G_gen_median_list + Gn_median, gkernel, node_label, + edge_label, False) + dhat_new_list = [] + for idx, g_tmp in enumerate(G_gen_median_list): + # @todo: the term3 below could use the one at the beginning of the function. 
+ dhat_new_list.append(dis_gstar(idx, range(len(G_gen_median_list), + len(G_gen_median_list) + len(Gn_median) + 1), + alpha_range, knew, withterm3=False)) + + print('\nsmallest distance in kernel space: ', dhat_new_list[0]) + dis_ks_min_list[-1].append(dhat_new_list[0]) + + + print('\nsods of the set median for this class:', sod_set_median_list[-1]) + print('\nsods in graph space for this class:', sod_gs_list[-1]) + print('\ndistance in kernel space of set median for this class:', + dis_ks_set_median_list[-1]) + print('\nsmallest distances in kernel space for this class:', + dis_ks_min_list[-1]) + print('\ntimes for this class:', time_list[-1]) + + sod_set_median_list[-1] = np.mean(sod_set_median_list[-1]) + sod_gs_list[-1] = np.mean(sod_gs_list[-1]) + dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1]) + dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1]) + time_list[-1] = np.mean(time_list[-1]) + + print() + print('\nmean sods of the set median for each class:', sod_set_median_list) + print('\nmean sods in graph space for each class:', sod_gs_list) + print('\ndistances in kernel space of set median for each class:', + dis_ks_set_median_list) + print('\nmean smallest distances in kernel space for each class:', + dis_ks_min_list) + print('\nmean times for each class:', time_list) + + print('\nmean sods of the set median of all:', np.mean(sod_set_median_list)) + print('\nmean sods in graph space of all:', np.mean(sod_gs_list)) + print('\nmean distances in kernel space of set median of all:', + np.mean(dis_ks_set_median_list)) + print('\nmean smallest distances in kernel space of all:', + np.mean(dis_ks_min_list)) + print('\nmean times of all:', np.mean(time_list)) + + nb_better_sods = 0 + nb_worse_sods = 0 + nb_same_sods = 0 + for sods in sod_list_list: + if sods[0] > sods[-1]: + nb_better_sods += 1 + elif sods[0] < sods[-1]: + nb_worse_sods += 1 + else: + nb_same_sods += 1 + print('\n In', str(len(sod_list_list)), 'sod lists,', str(nb_better_sods), + 'are getting better,', str(nb_worse_sods), 'are getting worse,', + str(nb_same_sods), 'are not changed; ', str(nb_better_sods / len(sod_list_list)), + 'sods are improved.') + ############################################################################### # tests on different numbers of median-sets. @@ -33,46 +387,352 @@ def test_iam_median_nb(): remove_edges(Gn) gkernel = 'marginalizedkernel' -# lmbda = 0.03 # termination probalility -# r_max = 10 # iteration limit for pre-image. 
-#    alpha_range = np.linspace(0.5, 0.5, 1)
-#    k = 5 # k nearest neighbors
-#    epsilon = 1e-6
-#    InitIAMWithAllDk = True
+    lmbda = 0.03 # termination probability
+#    # parameters for GED function
+#    c_vi = 0.037
+#    c_vr = 0.038
+#    c_vs = 0.075
+#    c_ei = 0.001
+#    c_er = 0.001
+#    c_es = 0.0
+#    ite_max_iam = 50
+#    epsilon_iam = 0.001
+#    removeNodes = False
+#    connected_iam = False
+#    # parameters for IAM function
+#    ged_cost = 'CONSTANT'
+#    ged_method = 'IPFP'
+#    edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
+#    ged_stabilizer = 'min'
+#    ged_repeat = 50
+#    params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
+#                  'edit_cost_constant': edit_cost_constant,
+#                  'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
+    
     # parameters for GED function
-    ged_cost='CHEM_1'
-    ged_method='IPFP'
-    saveGXL='gedlib'
-    # parameters for IAM function
-    c_ei=1
-    c_er=1
-    c_es=1
+    c_vi = 4
+    c_vr = 4
+    c_vs = 2
+    c_ei = 1
+    c_er = 1
+    c_es = 1
     ite_max_iam = 50
     epsilon_iam = 0.001
     removeNodes = False
     connected_iam = False
-    
-    # number of graphs; we what to compute the median of these graphs.
-    nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
+    # parameters for IAM function
+    ged_cost = 'CHEM_1'
+    ged_method = 'IPFP'
+    edit_cost_constant = []
+    ged_stabilizer = 'min'
+    ged_repeat = 50
+    params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
+                  'edit_cost_constant': edit_cost_constant,
+                  'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
     
     # find out all the graphs classified to positive group 1.
     idx_dict = get_same_item_indices(y_all)
     Gn = [Gn[i] for i in idx_dict[1]]
     
+    # number of graphs; we want to compute the median of these graphs.
+#    nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
+    nb_median_range = [len(Gn)]
+    
 #    # compute Gram matrix.
 #    time0 = time.time()
 #    km = compute_kernel(Gn, gkernel, True)
 #    time_km = time.time() - time0
 #    # write Gram matrix to file.
 #    np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
+    
+    time_list = []
+    dis_ks_min_list = []
+    sod_gs_list = []
+#    sod_gs_min_list = []
+#    nb_updated_list = []
+#    nb_updated_k_list = []
+    g_best = []
+    for nb_median in nb_median_range:
+        print('\n-------------------------------------------------------')
+        print('number of median graphs =', nb_median)
+        random.seed(1)
+        idx_rdm = random.sample(range(len(Gn)), nb_median)
+        print('graphs chosen:', idx_rdm)
+        Gn_median = [Gn[idx].copy() for idx in idx_rdm]
+        Gn_candidate = [g.copy() for g in Gn]
+        
+#        for g in Gn_median:
+#            nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
+##            plt.savefig("results/preimage_mix/mutag.png", format="PNG")
+#            plt.show()
+#            plt.clf()
+        
+        ###################################################################
+#        gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
+#        km_tmp = gmfile['gm']
+#        time_km = gmfile['gmtime']
+#        # modify mixed gram matrix.
+#        km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
+#        for i in range(len(Gn)):
+#            for j in range(i, len(Gn)):
+#                km[i, j] = km_tmp[i, j]
+#                km[j, i] = km[i, j]
+#        for i in range(len(Gn)):
+#            for j, idx in enumerate(idx_rdm):
+#                km[i, len(Gn) + j] = km[i, idx]
+#                km[len(Gn) + j, i] = km[i, idx]
+#        for i, idx1 in enumerate(idx_rdm):
+#            for j, idx2 in enumerate(idx_rdm):
+#                km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]
+        
+        ###################################################################
+        alpha_range = [1 / nb_median] * nb_median
+        time0 = time.time()
+        ghat_new_list, sod_min, _, _, _ = iam_upgraded(Gn_median, Gn_candidate,
+            c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
+            epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes,
+            params_ged=params_ged)
+        
+        time_total = time.time() - time0
+        print('\ntime: ', time_total)
+        time_list.append(time_total)
+        
+        # compute distance between \psi and the new generated graphs.
+        knew = compute_kernel(ghat_new_list + Gn_median, gkernel, False)
+        dhat_new_list = []
+        for idx, g_tmp in enumerate(ghat_new_list):
+            # @todo: the term3 below could use the one at the beginning of the function.
+            dhat_new_list.append(dis_gstar(idx, range(len(ghat_new_list),
+                len(ghat_new_list) + len(Gn_median) + 1),
+                alpha_range, knew, withterm3=False))
+        
+        print('\nsmallest distance in kernel space: ', dhat_new_list[0])
+        dis_ks_min_list.append(dhat_new_list[0])
+        g_best.append(ghat_new_list[0])
+        
+        # show the best graph and save it to file.
+#        print('the shortest distance is', dhat)
+        print('one of the possible corresponding pre-images is')
+        nx.draw(ghat_new_list[0], labels=nx.get_node_attributes(ghat_new_list[0], 'atom'),
+                with_labels=True)
+        plt.show()
+#        plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) +
+        plt.savefig('results/iam/mutag_median_unfit2.nb' + str(nb_median) +
+                    '.png', format="PNG")
+        plt.clf()
+#        print(ghat_list[0].nodes(data=True))
+#        print(ghat_list[0].edges(data=True))
+        
+        sod_gs_list.append(sod_min)
+#        sod_gs_min_list.append(np.min(sod_min))
+        print('\nsmallest sod in graph space: ', sod_min)
+        
+    print('\nsods in graph space: ', sod_gs_list)
+#    print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list)
+    print('\nsmallest distance in kernel space for each set of median graphs: ',
+          dis_ks_min_list)
+#    print('\nnumber of updates of the best graph for each set of median graphs by IAM: ',
+#          nb_updated_list)
+#    print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ',
+#          nb_updated_k_list)
+    print('\ntimes:', time_list)
+    
+
+def test_iam_letter_h():
+    from median import draw_Letter_graph
+    ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
+          'extra_params': {}} # node nsymb
+#    ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt',
+#          'extra_params': {}} # node nsymb
+#    Gn = Gn[0:50]
+    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
+    gkernel = 'structuralspkernel'
+    
+    # parameters for GED function from the IAM paper.
+    c_vi = 3
+    c_vr = 3
+    c_vs = 1
+    c_ei = 3
+    c_er = 3
+    c_es = 1
+    ite_max_iam = 50
+    epsilon_iam = 0.001
+    removeNodes = False
+    connected_iam = False
+    # parameters for IAM function
+#    ged_cost = 'CONSTANT'
+    ged_cost = 'LETTER'
+    ged_method = 'IPFP'
+#    edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
+    edit_cost_constant = []
+    ged_stabilizer = 'min'
+    ged_repeat = 50
+    params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
+                  'edit_cost_constant': edit_cost_constant,
+                  'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
+    
+    # classify graphs according to letters.
+    time_list = []
+    dis_ks_min_list = []
+    sod_gs_list = []
+    g_best = []
+    sod_set_median_list = []
+    idx_dict = get_same_item_indices(y_all)
+    for letter in idx_dict:
+        print('\n-------------------------------------------------------')
+        print('letter', letter)
+        Gn_let = [Gn[i].copy() for i in idx_dict[letter]]
+        
+        time_list.append([])
+        dis_ks_min_list.append([])
+        sod_gs_list.append([])
+        g_best.append([])
+        sod_set_median_list.append([])
+        
+        for repeat in range(50):
+            idx_rdm = random.sample(range(len(Gn_let)), 50)
+            print('graphs chosen:', idx_rdm)
+            Gn_median = [Gn_let[idx].copy() for idx in idx_rdm]
+            Gn_candidate = [g.copy() for g in Gn_median]
+            
+            alpha_range = [1 / len(Gn_median)] * len(Gn_median)
+            time0 = time.time()
+            ghat_new_list, sod_min, _, _, sod_set_median = iam_upgraded(Gn_median,
+                Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
+                epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes,
+                params_ged=params_ged)
+            time_total = time.time() - time0
+            print('\ntime: ', time_total)
+            time_list[-1].append(time_total)
+            g_best[-1].append(ghat_new_list[0])
+            sod_set_median_list[-1].append(sod_set_median)
+            print('\nsmallest sod of the set median:', sod_set_median)
+            sod_gs_list[-1].append(sod_min)
+            print('\nsmallest sod in graph space:', sod_min)
+            
+            # show the best graph and save it to file.
+            print('one of the possible corresponding pre-images is')
+            draw_Letter_graph(ghat_new_list[0], savepath='results/iam/paper_compare/')
+            
+            # compute distance between \psi and the new generated graphs.
+            knew = compute_kernel(ghat_new_list + Gn_median, gkernel, False)
+            dhat_new_list = []
+            for idx, g_tmp in enumerate(ghat_new_list):
+                # @todo: the term3 below could use the one at the beginning of the function.
+ dhat_new_list.append(dis_gstar(idx, range(len(ghat_new_list), + len(ghat_new_list) + len(Gn_median) + 1), + alpha_range, knew, withterm3=False)) + + print('\nsmallest distance in kernel space: ', dhat_new_list[0]) + dis_ks_min_list[-1].append(dhat_new_list[0]) + + print('\nsods of the set median for this letter:', sod_set_median_list[-1]) + print('\nsods in graph space for this letter:', sod_gs_list[-1]) + print('\nsmallest distances in kernel space for this letter:', + dis_ks_min_list[-1]) + print('\ntimes for this letter:', time_list[-1]) + + sod_set_median_list[-1] = np.mean(sod_set_median_list[-1]) + sod_gs_list[-1] = np.mean(sod_gs_list[-1]) + dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1]) + time_list[-1] = np.mean(time_list[-1]) + print('\nmean sods of the set median for each letter:', sod_set_median_list) + print('\nmean sods in graph space for each letter:', sod_gs_list) + print('\nmean smallest distances in kernel space for each letter:', + dis_ks_min_list) + print('\nmean times for each letter:', time_list) + + print('\nmean sods of the set median of all:', np.mean(sod_set_median_list)) + print('\nmean sods in graph space of all:', np.mean(sod_gs_list)) + print('\nmean smallest distances in kernel space of all:', + np.mean(dis_ks_min_list)) + print('\nmean times of all:', np.mean(time_list)) + + + + + + + + + +def test_iam_fitdistance(): + + ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', + 'extra_params': {}} # node/edge symb + Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) +# Gn = Gn[0:50] +# remove_edges(Gn) + gkernel = 'marginalizedkernel' + node_label = 'atom' + edge_label = 'bond_type' + +# lmbda = 0.03 # termination probalility +# # parameters for GED function +# c_vi = 0.037 +# c_vr = 0.038 +# c_vs = 0.075 +# c_ei = 0.001 +# c_er = 0.001 +# c_es = 0.0 +# ite_max_iam = 50 +# epsilon_iam = 0.001 +# removeNodes = False +# connected_iam = False +# # parameters for IAM function +# ged_cost = 'CONSTANT' +# ged_method = 'IPFP' +# edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es] +# ged_stabilizer = 'min' +# ged_repeat = 50 +# params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, +# 'edit_cost_constant': edit_cost_constant, +# 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} + + # parameters for GED function + c_vi = 4 + c_vr = 4 + c_vs = 2 + c_ei = 1 + c_er = 1 + c_es = 1 + ite_max_iam = 50 + epsilon_iam = 0.001 + removeNodes = False + connected_iam = False + # parameters for IAM function + ged_cost = 'CHEM_1' + ged_method = 'IPFP' + edit_cost_constant = [] + ged_stabilizer = 'min' + ged_repeat = 50 + params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method, + 'edit_cost_constant': edit_cost_constant, + 'stabilizer': ged_stabilizer, 'repeat': ged_repeat} + + # find out all the graphs classified to positive group 1. + idx_dict = get_same_item_indices(y_all) + Gn = [Gn[i] for i in idx_dict[1]] + + # number of graphs; we what to compute the median of these graphs. +# nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100] + nb_median_range = [10] + +# # compute Gram matrix. +# time0 = time.time() +# km = compute_kernel(Gn, gkernel, True) +# time_km = time.time() - time0 +# # write Gram matrix to file. 
+# # compute Gram matrix.
+# time0 = time.time()
+# km = compute_kernel(Gn, gkernel, node_label, edge_label, True)
+# time_km = time.time() - time0
+# # write Gram matrix to file.
+# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
 time_list = []
 dis_ks_min_list = []
+ dis_ks_gen_median_list = []
 sod_gs_list = []
- sod_gs_min_list = []
- nb_updated_list = []
- nb_updated_k_list = []
+# sod_gs_min_list = []
+# nb_updated_list = []
+# nb_updated_k_list = []
 g_best = []
 for nb_median in nb_median_range:
 print('\n-------------------------------------------------------')
@@ -90,72 +750,80 @@ def test_iam_median_nb():
# plt.clf()
 ###################################################################
- gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
- km_tmp = gmfile['gm']
- time_km = gmfile['gmtime']
- # modify mixed gram matrix.
- km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
- for i in range(len(Gn)):
- for j in range(i, len(Gn)):
- km[i, j] = km_tmp[i, j]
- km[j, i] = km[i, j]
- for i in range(len(Gn)):
- for j, idx in enumerate(idx_rdm):
- km[i, len(Gn) + j] = km[i, idx]
- km[len(Gn) + j, i] = km[i, idx]
- for i, idx1 in enumerate(idx_rdm):
- for j, idx2 in enumerate(idx_rdm):
- km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]
+# gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
+# km_tmp = gmfile['gm']
+# time_km = gmfile['gmtime']
+# # modify mixed gram matrix.
+# km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
+# for i in range(len(Gn)):
+# for j in range(i, len(Gn)):
+# km[i, j] = km_tmp[i, j]
+# km[j, i] = km[i, j]
+# for i in range(len(Gn)):
+# for j, idx in enumerate(idx_rdm):
+# km[i, len(Gn) + j] = km[i, idx]
+# km[len(Gn) + j, i] = km[i, idx]
+# for i, idx1 in enumerate(idx_rdm):
+# for j, idx2 in enumerate(idx_rdm):
+# km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]
 ###################################################################
 alpha_range = [1 / nb_median] * nb_median
 time0 = time.time()
- ghat_new_list, dis_min = iam_upgraded(Gn_median, Gn_candidate,
- c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
- epsilon=epsilon_iam, removeNodes=removeNodes,
- connected=connected_iam,
- params_ged={'ged_cost': ged_cost, 'ged_method': ged_method,
- 'saveGXL': saveGXL})
+ G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \
+ = iam_upgraded(Gn_median, Gn_candidate,
+ c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
+ epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes,
+ params_ged=params_ged)
 time_total = time.time() - time0
 print('\ntime: ', time_total)
 time_list.append(time_total)
- print('\nsmallest distance in kernel space: ', dhat)
- dis_ks_min_list.append(dhat)
- g_best.append(ghat_list)
- print('\nnumber of updates of the best graph: ', nb_updated)
- nb_updated_list.append(nb_updated)
- print('\nnumber of updates of k nearest graphs: ', nb_updated_k)
- nb_updated_k_list.append(nb_updated_k)
+
+ # compute distance between \psi and the new generated graphs.
+ knew = compute_kernel(G_gen_median_list + Gn_median, gkernel, node_label,
+ edge_label, False)
+ dhat_new_list = []
+ for idx, g_tmp in enumerate(G_gen_median_list):
+ # @todo: the term3 below could use the one at the beginning of the function.
+ dhat_new_list.append(dis_gstar(idx, range(len(G_gen_median_list),
+ len(G_gen_median_list) + len(Gn_median) + 1),
+ alpha_range, knew, withterm3=False))
+
+ print('\nsmallest distance in kernel space: ', dhat_new_list[0])
+ dis_ks_min_list.append(dhat_new_list[0])
+ g_best.append(G_gen_median_list[0])
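The block commented out above splices kernel values for the nb_median sampled graphs onto a Gram matrix precomputed over Gn, rather than recomputing any kernels; since the sampled graphs are copies of graphs in Gn, every needed entry already exists. A compact sketch of the same idea (build_mixed_gram is an illustrative name, not part of the patch):

import numpy as np

def build_mixed_gram(km_all, idx_rdm):
    # km_all: precomputed Gram matrix over the full graph list Gn.
    # idx_rdm: indices (within Gn) of the sampled median graphs.
    n = km_all.shape[0]
    m = len(idx_rdm)
    km = np.zeros((n + m, n + m))
    km[:n, :n] = km_all                            # original block
    km[:n, n:] = km_all[:, idx_rdm]                # Gn vs. sampled graphs
    km[n:, :n] = km_all[idx_rdm, :]                # sampled graphs vs. Gn
    km[n:, n:] = km_all[np.ix_(idx_rdm, idx_rdm)]  # sampled vs. sampled
    return km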
 # show the best graph and save it to file.
- print('the shortest distance is', dhat)
+# print('the shortest distance is', dhat)
 print('one of the possible corresponding pre-images is')
- nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'),
+ nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'),
 with_labels=True)
 plt.show()
- plt.savefig('results/preimage_iam/mutag_median_nb' + str(nb_median) +
- '.png', format="PNG")
+# plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) +
+# plt.savefig('results/iam/mutag_median_unfit2.nb' + str(nb_median) +
+# '.png', format="PNG")
 plt.clf()
# print(ghat_list[0].nodes(data=True))
# print(ghat_list[0].edges(data=True))
- # compute the corresponding sod in graph space.
- sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, ged_cost=ged_cost,
- ged_method=ged_method, saveGXL=saveGXL)
- sod_gs_list.append(sod_tmp)
- sod_gs_min_list.append(np.min(sod_tmp))
- print('\nsmallest sod in graph space: ', np.min(sod_tmp))
+ sod_gs_list.append(sod_gen_median)
+# sod_gs_min_list.append(np.min(sod_gen_median))
+ print('\nsmallest sod in graph space: ', sod_gen_median)
+ print('\nsmallest sod of set median in graph space: ', sod_set_median)
 print('\nsods in graph space: ', sod_gs_list)
- print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list)
+# print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list)
 print('\nsmallest distance in kernel space for each set of median graphs: ',
 dis_ks_min_list)
- print('\nnumber of updates of the best graph for each set of median graphs by IAM: ',
- nb_updated_list)
- print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ',
- nb_updated_k_list)
+# print('\nnumber of updates of the best graph for each set of median graphs by IAM: ',
+# nb_updated_list)
+# print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ',
+# nb_updated_k_list)
 print('\ntimes:', time_list)
+
+
+
 ###############################################################################
@@ -164,4 +832,11 @@ def test_iam_median_nb():
 if __name__ == '__main__':
 ###############################################################################
 # tests on different numbers of median-sets.
- test_iam_median_nb()
\ No newline at end of file
+# test_iam_median_nb()
+# test_iam_letter_h()
+ test_iam_monoterpenoides()
+# test_iam_mutag()
+
+# test_iam_fitdistance()
+# print("test log")
+
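Throughout this commit the ad-hoc (ged_cost, ged_method, saveGXL) arguments are replaced by a single params_ged dict whose keys mirror the keyword arguments of GED in preimage/ged.py, so a receiver can unpack it directly onto the call. A minimal sketch (whether ged_median forwards the dict exactly this way is not shown in the patch; the cost values are the ones set in the next hunk):

from ged import GED

def ged_with_params(g1, g2, params_ged):
    # params_ged keys match GED's keyword arguments (lib, cost, method,
    # edit_cost_constant, stabilizer, repeat), so the dict unpacks directly.
    return GED(g1, g2, **params_ged)

params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
              'edit_cost_constant': [4, 4, 2, 1, 1, 1],
              'stabilizer': 'min', 'repeat': 50}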
diff --git a/preimage/test_preimage_iam.py b/preimage/test_preimage_iam.py
index 936ce35..34973fd 100644
--- a/preimage/test_preimage_iam.py
+++ b/preimage/test_preimage_iam.py
@@ -192,26 +192,42 @@ def test_preimage_iam_median_nb():
 gkernel = 'marginalizedkernel'
 lmbda = 0.03 # termination probability
- r_max = 10 # iteration limit for pre-image.
+ r_max = 3 # iteration limit for pre-image.
# alpha_range = np.linspace(0.5, 0.5, 1)
 k = 5 # k nearest neighbors
 epsilon = 1e-6
 InitIAMWithAllDk = True
- # parameters for GED function
- ged_cost='CHEM_1'
- ged_method='IPFP'
- saveGXL='gedlib'
 # parameters for IAM function
- c_ei=1
- c_er=1
- c_es=1
+# c_vi = 0.037
+# c_vr = 0.038
+# c_vs = 0.075
+# c_ei = 0.001
+# c_er = 0.001
+# c_es = 0.0
+ c_vi = 4
+ c_vr = 4
+ c_vs = 2
+ c_ei = 1
+ c_er = 1
+ c_es = 1
 ite_max_iam = 50
 epsilon_iam = 0.001
 removeNodes = True
 connected_iam = False
+ # parameters for GED function
+# ged_cost='CHEM_1'
+ ged_cost = 'CONSTANT'
+ ged_method = 'IPFP'
+ edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
+ ged_stabilizer = 'min'
+ ged_repeat = 50
+ params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
+ 'edit_cost_constant': edit_cost_constant,
+ 'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
 # number of graphs; we want to compute the median of these graphs.
- nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
+# nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
+ nb_median_range = [2]
 # find all the graphs classified into positive group 1.
 idx_dict = get_same_item_indices(y_all)
@@ -274,8 +290,7 @@ def test_preimage_iam_median_nb():
 params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es,
 'ite_max': ite_max_iam, 'epsilon': epsilon_iam,
 'removeNodes': removeNodes, 'connected': connected_iam},
- params_ged={'ged_cost': ged_cost, 'ged_method': ged_method,
- 'saveGXL': saveGXL})
+ params_ged=params_ged)
 time_total = time.time() - time0 + time_km
 print('\ntime: ', time_total)
@@ -293,16 +308,15 @@ def test_preimage_iam_median_nb():
 print('one of the possible corresponding pre-images is')
 nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'),
 with_labels=True)
-# plt.show()
- plt.savefig('results/preimage_iam/mutag_median_nb' + str(nb_median) +
- '.png', format="PNG")
+ plt.show()
+# plt.savefig('results/preimage_iam/mutag_median_cs.001_nb' + str(nb_median) +
+# '.png', format="PNG")
 plt.clf()
# print(ghat_list[0].nodes(data=True))
# print(ghat_list[0].edges(data=True))
 # compute the corresponding sod in graph space.
- sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, ged_cost=ged_cost,
- ged_method=ged_method, saveGXL=saveGXL)
+ sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, params_ged=params_ged)
 sod_gs_list.append(sod_tmp)
 sod_gs_min_list.append(np.min(sod_tmp))
 print('\nsmallest sod in graph space: ', np.min(sod_tmp))
diff --git a/preimage/utils.py b/preimage/utils.py
index 70cb6f5..99c63c0 100644
--- a/preimage/utils.py
+++ b/preimage/utils.py
@@ -39,13 +39,13 @@ def dis_gstar(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
 return np.sqrt(term1 - term2 + term3)
-def compute_kernel(Gn, graph_kernel, verbose):
+def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose):
 if graph_kernel == 'marginalizedkernel':
- Kmatrix, _ = marginalizedkernel(Gn, node_label='atom', edge_label=None,
+ Kmatrix, _ = marginalizedkernel(Gn, node_label=node_label, edge_label=edge_label,
 p_quit=0.03, n_iteration=10, remove_totters=False,
 n_jobs=multiprocessing.cpu_count(), verbose=verbose)
 elif graph_kernel == 'untilhpathkernel':
- Kmatrix, _ = untilhpathkernel(Gn, node_label='atom', edge_label=None,
+ Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label,
 depth=10, k_func='MinMax', compute_method='trie',
 n_jobs=multiprocessing.cpu_count(), verbose=verbose)
 elif graph_kernel == 'spkernel':
@@ -77,10 +77,10 @@ def gram2distances(Kmatrix):
 return dmatrix
-def kernel_distance_matrix(Gn, Kmatrix=None, gkernel=None):
+def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None, gkernel=None):
 dis_mat = np.empty((len(Gn), len(Gn)))
 if Kmatrix == None:
- Kmatrix = compute_kernel(Gn, gkernel, True)
+ Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True)
 for i in range(len(Gn)):
 for j in range(i, len(Gn)):
 dis = Kmatrix[i, i] + Kmatrix[j, j] - 2 * Kmatrix[i, j]
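With the widened signatures above, call sites now choose the label names instead of relying on the old hard-coded 'atom'/None. A hypothetical call for the MUTAG settings used in these tests (the wrapper name kernel_distances and the unpacked return names are illustrative; kernel_distance_matrix returns four values, as its callers unpack):

from utils import compute_kernel, kernel_distance_matrix

def kernel_distances(Gn):
    # Gram matrix over Gn with explicit node/edge label names.
    km = compute_kernel(Gn, 'marginalizedkernel', 'atom', 'bond_type', False)
    # d(i, j) = sqrt(k_ii + k_jj - 2 * k_ij); passing Kmatrix=km reuses the
    # Gram matrix instead of recomputing the kernel.
    dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(
        Gn, 'atom', 'bond_type', Kmatrix=km, gkernel='marginalizedkernel')
    return dis_mat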
""" import networkx as nx from os.path import basename @@ -35,22 +38,15 @@ def loadCT(filename): for i in range(0, nb_nodes): tmp = content[i + 2].split(" ") tmp = [x for x in tmp if x != ''] - g.add_node(i, atom=tmp[3], label=tmp[3]) + g.add_node(i, atom=tmp[3].strip(), + label=[item.strip() for item in tmp[3:]], + attributes=[item.strip() for item in tmp[0:3]]) for i in range(0, nb_edges): tmp = content[i + g.number_of_nodes() + 2].split(" ") tmp = [x for x in tmp if x != ''] - g.add_edge( - int(tmp[0]) - 1, - int(tmp[1]) - 1, - bond_type=tmp[3].strip(), - label=tmp[3].strip()) - - -# for i in range(0, nb_edges): -# tmp = content[i + g.number_of_nodes() + 2] -# tmp = [tmp[i:i+3] for i in range(0, len(tmp), 3)] -# g.add_edge(int(tmp[0]) - 1, int(tmp[1]) - 1, -# bond_type=tmp[3].strip(), label=tmp[3].strip()) + g.add_edge(int(tmp[0]) - 1, int(tmp[1]) - 1, + bond_type=tmp[2].strip(), + label=[item.strip() for item in tmp[2:]]) return g @@ -71,6 +67,7 @@ def loadGXL(filename): labels[attr.attrib['name']] = attr[0].text if 'chem' in labels: labels['label'] = labels['chem'] + labels['atom'] = labels['chem'] g.add_node(index, **labels) index += 1 @@ -80,6 +77,7 @@ def loadGXL(filename): labels[attr.attrib['name']] = attr[0].text if 'valence' in labels: labels['label'] = labels['valence'] + labels['bond_type'] = labels['valence'] g.add_edge(dic[edge.attrib['from']], dic[edge.attrib['to']], **labels) return g @@ -392,7 +390,7 @@ def loadDataset(filename, filename_y=None, extra_params=None): Notes ----- This function supports following graph dataset formats: - 'ds': load data from .ct file. See comments of function loadCT for a example. + 'ds': load data from .ds file. See comments of function loadFromDS for a example. 'cxl': load data from Graph eXchange Language file (.cxl file). See http://www.gupro.de/GXL/Introduction/background.html, 2019 for detail. 'sdf': load data from structured data file (.sdf file). See @@ -406,45 +404,24 @@ def loadDataset(filename, filename_y=None, extra_params=None): 2019 for details. Note here filename is the name of either .txt file in the dataset directory. """ - from os.path import dirname, splitext - - dirname_dataset = dirname(filename) extension = splitext(filename)[1][1:] - data = [] - y = [] if extension == "ds": - content = open(filename).read().splitlines() - if filename_y is None or filename_y == '': - for i in range(0, len(content)): - tmp = content[i].split(' ') - # remove the '#'s in file names - data.append( - loadCT(dirname_dataset + '/' + tmp[0].replace('#', '', 1))) - y.append(float(tmp[1])) - else: # y in a seperate file - for i in range(0, len(content)): - tmp = content[i] - # remove the '#'s in file names - data.append( - loadCT(dirname_dataset + '/' + tmp.replace('#', '', 1))) - content_y = open(filename_y).read().splitlines() - # assume entries in filename and filename_y have the same order. 
@@ -392,7 +390,7 @@ def loadDataset(filename, filename_y=None, extra_params=None):
 Notes
 -----
 This function supports the following graph dataset formats:
- 'ds': load data from .ct file. See comments of function loadCT for a example.
+ 'ds': load data from .ds file. See comments of function loadFromDS for an example.
 'cxl': load data from Graph eXchange Language file (.cxl file). See
 http://www.gupro.de/GXL/Introduction/background.html, 2019 for detail.
 'sdf': load data from structured data file (.sdf file). See
@@ -406,45 +404,24 @@ def loadDataset(filename, filename_y=None, extra_params=None):
 2019 for details. Note here filename is the name of any .txt file in
 the dataset directory.
 """
- from os.path import dirname, splitext
-
- dirname_dataset = dirname(filename)
 extension = splitext(filename)[1][1:]
- data = []
- y = []
 if extension == "ds":
- content = open(filename).read().splitlines()
- if filename_y is None or filename_y == '':
- for i in range(0, len(content)):
- tmp = content[i].split(' ')
- # remove the '#'s in file names
- data.append(
- loadCT(dirname_dataset + '/' + tmp[0].replace('#', '', 1)))
- y.append(float(tmp[1]))
- else: # y in a seperate file
- for i in range(0, len(content)):
- tmp = content[i]
- # remove the '#'s in file names
- data.append(
- loadCT(dirname_dataset + '/' + tmp.replace('#', '', 1)))
- content_y = open(filename_y).read().splitlines()
- # assume entries in filename and filename_y have the same order.
- for item in content_y:
- tmp = item.split(' ')
- # assume the 3rd entry in a line is y (for Alkane dataset)
- y.append(float(tmp[2]))
+ data, y = loadFromDS(filename, filename_y)
 elif extension == "cxl":
 import xml.etree.ElementTree as ET
-
+
+ dirname_dataset = dirname(filename)
 tree = ET.parse(filename)
 root = tree.getroot()
 data = []
 y = []
- for graph in root.iter('print'):
+ for graph in root.iter('graph'):
 mol_filename = graph.attrib['file']
 mol_class = graph.attrib['class']
 data.append(loadGXL(dirname_dataset + '/' + mol_filename))
 y.append(mol_class)
+ elif extension == 'xml':
+ data, y = loadFromXML(filename, extra_params)
 elif extension == "sdf":
 import numpy as np
 from tqdm import tqdm
@@ -471,6 +448,7 @@ def loadDataset(filename, filename_y=None, extra_params=None):
 elif extension == "mat":
 data, y = loadMAT(filename, extra_params)
 elif extension == 'txt':
+ dirname_dataset = dirname(filename)
 data, y = loadTXT(dirname_dataset)
# print(len(y))
# print(y)
@@ -485,6 +463,75 @@ def loadDataset(filename, filename_y=None, extra_params=None):
 return data, y
+
+
+def loadFromXML(filename, extra_params):
+ import xml.etree.ElementTree as ET
+
+ dirname_dataset = dirname(filename)
+ tree = ET.parse(filename)
+ root = tree.getroot()
+ data = []
+ y = []
+ for graph in root.iter('print'):
+ mol_filename = graph.attrib['file']
+ mol_class = graph.attrib['class']
+ data.append(loadGXL(dirname_dataset + '/' + mol_filename))
+ y.append(mol_class)
+
+ return data, y
+
+
+def loadFromDS(filename, filename_y):
+ """Load data from .ds file.
+ Possible graph formats include:
+ '.ct': see function loadCT for details.
+ '.gxl': see function loadGXL for details.
+ Note the graph format is detected automatically from the extensions of
+ the graph files listed in the .ds file.
+ """
+ dirname_dataset = dirname(filename)
+ data = []
+ y = []
+ content = open(filename).read().splitlines()
+ extension = splitext(content[0].split(' ')[0])[1][1:]
+ if filename_y is None or filename_y == '':
+ if extension == 'ct':
+ for i in range(0, len(content)):
+ tmp = content[i].split(' ')
+ # remove the '#'s in file names
+ data.append(
+ loadCT(dirname_dataset + '/' + tmp[0].replace('#', '', 1)))
+ y.append(float(tmp[1]))
+ elif extension == 'gxl':
+ for i in range(0, len(content)):
+ tmp = content[i].split(' ')
+ # remove the '#'s in file names
+ data.append(
+ loadGXL(dirname_dataset + '/' + tmp[0].replace('#', '', 1)))
+ y.append(float(tmp[1]))
+ else: # y is in a separate file
+ if extension == 'ct':
+ for i in range(0, len(content)):
+ tmp = content[i]
+ # remove the '#'s in file names
+ data.append(
+ loadCT(dirname_dataset + '/' + tmp.replace('#', '', 1)))
+ elif extension == 'gxl':
+ for i in range(0, len(content)):
+ tmp = content[i]
+ # remove the '#'s in file names
+ data.append(
+ loadGXL(dirname_dataset + '/' + tmp.replace('#', '', 1)))
+
+ content_y = open(filename_y).read().splitlines()
+ # assume entries in filename and filename_y have the same order.
+ for item in content_y:
+ tmp = item.split(' ')
+ # assume the 3rd entry in a line is y (for the Alkane dataset)
+ y.append(float(tmp[2]))
+
+ return data, y
+
+
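loadFromDS dispatches on the extension of the first graph file named in the .ds file; each line pairs a (possibly '#'-prefixed) graph file with an optional target value. A hypothetical layout and call (the numbers are invented; the Alkane paths mirror the demo in the __main__ block below):

# dataset.ds -- one graph per line, target after the file name:
#
#   #mol1.ct 170.5
#   #mol2.ct 189.2
#
# When targets live in a separate file, .ds lines name only the graph files,
# and the third field of each line of the y-file is read as the target:
Gn, y = loadFromDS('../../datasets/Alkane/dataset.ds',
                   '../../datasets/Alkane/dataset_boiling_point_names.txt')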
""" @@ -509,7 +556,30 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None if __name__ == '__main__': - ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat', - 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb - Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params']) - saveDataset(Gn, y, group='xml', filename='temp/temp') \ No newline at end of file +# ### Load dataset from .ds file. +# # .ct files. + ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds', + 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'} + Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y']) +# ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb +# Gn, y = loadDataset(ds['dataset']) +# ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb +# Gn, y = loadDataset(ds['dataset']) +# ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled +# Gn, y = loadDataset(ds['dataset']) + print(Gn[1].nodes(data=True)) + print(Gn[1].edges(data=True)) + print(y[1]) + +# # .gxl file. +# ds = {'name': 'monoterpenoides', +# 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb +# Gn, y = loadDataset(ds['dataset']) +# print(Gn[1].nodes(data=True)) +# print(Gn[1].edges(data=True)) +# print(y[1]) + +# ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat', +# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb +# Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params']) +# saveDataset(Gn, y, group='xml', filename='temp/temp') \ No newline at end of file