@@ -15,24 +15,28 @@ import time | |||||
import random | import random | ||||
from scipy import optimize | from scipy import optimize | ||||
from scipy.optimize import minimize | |||||
import cvxpy as cp | import cvxpy as cp | ||||
import sys | import sys | ||||
#sys.path.insert(0, "../") | |||||
from ged import GED, get_nb_edit_operations | |||||
from utils import kernel_distance_matrix | |||||
sys.path.insert(0, "../") | |||||
from preimage.ged import GED, get_nb_edit_operations, get_nb_edit_operations_letter | |||||
from preimage.utils import kernel_distance_matrix | |||||
def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, k=4, | |||||
def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, | |||||
params_ged={'lib': 'gedlibpy', 'cost': 'CONSTANT', | params_ged={'lib': 'gedlibpy', 'cost': 'CONSTANT', | ||||
'method': 'IPFP', 'stabilizer': None}, | 'method': 'IPFP', 'stabilizer': None}, | ||||
init_costs=[3, 3, 1, 3, 3, 1], | init_costs=[3, 3, 1, 3, 3, 1], | ||||
dataset='monoterpenoides', | |||||
parallel=True): | parallel=True): | ||||
dataset = dataset.lower() | |||||
# c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them. | # c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them. | ||||
# random.seed(1) | # random.seed(1) | ||||
# cost_rdm = random.sample(range(1, 10), 6) | # cost_rdm = random.sample(range(1, 10), 6) | ||||
# init_costs = cost_rdm + [0] | # init_costs = cost_rdm + [0] | ||||
# init_costs = cost_rdm | # init_costs = cost_rdm | ||||
init_costs = [3, 3, 1, 3, 3, 1] | |||||
# init_costs = [3, 3, 1, 3, 3, 1] | |||||
# init_costs = [i * 0.01 for i in cost_rdm] + [0] | # init_costs = [i * 0.01 for i in cost_rdm] + [0] | ||||
# init_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0] | # init_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0] | ||||
# init_costs = [0, 0, 0.9544, 0.026, 0.0196, 0] | # init_costs = [0, 0, 0.9544, 0.026, 0.0196, 0] | ||||
@@ -51,8 +55,10 @@ def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, k=4 | |||||
# init ged. | # init ged. | ||||
print('\ninitial:') | print('\ninitial:') | ||||
time0 = time.time() | time0 = time.time() | ||||
params_ged['dataset'] = dataset | |||||
params_ged['edit_cost_constant'] = init_costs | params_ged['edit_cost_constant'] = init_costs | ||||
ged_vec_init, ged_mat, n_edit_operations = compute_geds(Gn, params_ged, | ged_vec_init, ged_mat, n_edit_operations = compute_geds(Gn, params_ged, | ||||
dataset, | |||||
parallel=parallel) | parallel=parallel) | ||||
residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))] | residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))] | ||||
time_list = [time.time() - time0] | time_list = [time.time() - time0] | ||||
@@ -67,20 +73,21 @@ def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, k=4 | |||||
time0 = time.time() | time0 = time.time() | ||||
# "fit" geds to distances in feature space by tuning edit costs using the | # "fit" geds to distances in feature space by tuning edit costs using the | ||||
# Least Squares Method. | # Least Squares Method. | ||||
edit_costs_new, residual = update_costs(nb_cost_mat, dis_k_vec) | |||||
edit_costs_new, residual = update_costs(nb_cost_mat, dis_k_vec, | |||||
dataset=dataset, cost=params_ged['cost']) | |||||
for i in range(len(edit_costs_new)): | for i in range(len(edit_costs_new)): | ||||
if -1e-9 <= edit_costs_new[i] <= 1e-9: | |||||
edit_costs_new[i] = 0 | |||||
if edit_costs_new[i] < 0: | if edit_costs_new[i] < 0: | ||||
if edit_costs_new[i] > -1e-9: | |||||
edit_costs_new[i] = 0 | |||||
else: | |||||
raise ValueError('The edit cost is negative.') | |||||
raise ValueError('The edit cost is negative.') | |||||
# for i in range(len(edit_costs_new)): | # for i in range(len(edit_costs_new)): | ||||
# if edit_costs_new[i] < 0: | # if edit_costs_new[i] < 0: | ||||
# edit_costs_new[i] = 0 | # edit_costs_new[i] = 0 | ||||
# compute new GEDs and numbers of edit operations. | # compute new GEDs and numbers of edit operations. | ||||
params_ged['edit_cost_constant'] = edit_costs_new | |||||
ged_vec, ged_mat, n_edit_operations = compute_geds(Gn, params_ged, | |||||
params_ged['edit_cost_constant'] = edit_costs_new # np.array([edit_costs_new[0], edit_costs_new[1], 0.75]) | |||||
ged_vec, ged_mat, n_edit_operations = compute_geds(Gn, params_ged, | |||||
dataset, | |||||
parallel=parallel) | parallel=parallel) | ||||
residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec)))) | residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec)))) | ||||
time_list.append(time.time() - time0) | time_list.append(time.time() - time0) | ||||
@@ -94,7 +101,8 @@ def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, k=4 | |||||
time_list, nb_cost_mat_list | time_list, nb_cost_mat_list | ||||
def compute_geds(Gn, params_ged, parallel=False): | |||||
def compute_geds(Gn, params_ged, dataset, parallel=False): | |||||
get_nb_eo = get_nb_edit_operations_letter if dataset == 'letter' else get_nb_edit_operations | |||||
ged_mat = np.zeros((len(Gn), len(Gn))) | ged_mat = np.zeros((len(Gn), len(Gn))) | ||||
if parallel: | if parallel: | ||||
# print('parallel') | # print('parallel') | ||||
@@ -112,7 +120,7 @@ def compute_geds(Gn, params_ged, parallel=False): | |||||
def init_worker(gn_toshare):
    """Pool initializer: publish the graph list as a module-level global.

    Stores ``gn_toshare`` in the global ``G_gn`` so that the parallel GED
    wrapper can index graphs by position instead of receiving them with
    every task.
    """
    global G_gn
    G_gn = gn_toshare
do_partial = partial(_wrapper_compute_ged_parallel, params_ged) | |||||
do_partial = partial(_wrapper_compute_ged_parallel, params_ged, get_nb_eo) | |||||
pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,)) | pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,)) | ||||
iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize), | iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize), | ||||
desc='computing GEDs', file=sys.stdout) | desc='computing GEDs', file=sys.stdout) | ||||
@@ -138,26 +146,146 @@ def compute_geds(Gn, params_ged, parallel=False): | |||||
ged_vec.append(dis) | ged_vec.append(dis) | ||||
ged_mat[i][j] = dis | ged_mat[i][j] = dis | ||||
ged_mat[j][i] = dis | ged_mat[j][i] = dis | ||||
n_eo_tmp = get_nb_edit_operations(Gn[i], Gn[j], pi_forward, pi_backward) | |||||
n_eo_tmp = get_nb_eo(Gn[i], Gn[j], pi_forward, pi_backward) | |||||
n_edit_operations.append(n_eo_tmp) | n_edit_operations.append(n_eo_tmp) | ||||
return ged_vec, ged_mat, n_edit_operations | return ged_vec, ged_mat, n_edit_operations | ||||
def _wrapper_compute_ged_parallel(params_ged, get_nb_eo, itr):
    """Compute the GED for one pair of graphs inside a worker process.

    ``params_ged`` and ``get_nb_eo`` are bound with ``functools.partial`` by
    the caller; ``itr`` is the per-task item whose first two elements are the
    indices of the two graphs in the shared global list ``G_gn``.

    Returns the tuple ``(i, j, dis, n_eo_tmp)`` so the caller can place the
    result even though tasks complete out of order.
    """
    i = itr[0]
    j = itr[1]
    # G_gn is the module-level graph list set by the pool initializer.
    dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], params_ged,
                                          get_nb_eo)
    return i, j, dis, n_eo_tmp
def _compute_ged_parallel(g1, g2, params_ged, get_nb_eo):
    """Compute the GED between ``g1`` and ``g2`` and count edit operations.

    ``params_ged`` is expanded as keyword arguments to ``GED``. ``get_nb_eo``
    is the callable that turns the forward/backward node maps returned by
    ``GED`` into the numbers of edit operations.

    Returns ``(dis, n_eo_tmp)``: the distance and whatever ``get_nb_eo``
    returns for this pair.
    """
    dis, pi_forward, pi_backward = GED(g1, g2, **params_ged)
    n_eo_tmp = get_nb_eo(g1, g2, pi_forward, pi_backward)
    return dis, n_eo_tmp
def update_costs(nb_cost_mat, dis_k_vec): | |||||
def update_costs(nb_cost_mat, dis_k_vec, dataset='monoterpenoides', | |||||
cost='CONSTANT', rw_constraints='2constraints'): | |||||
if dataset.lower() == 'letter': | |||||
if cost == 'LETTER': | |||||
pass | |||||
# # method 1: set alpha automatically, just tune c_vir and c_eir by | |||||
# # LMS using cvxpy. | |||||
# alpha = 0.5 | |||||
# coeff = 100 # np.max(alpha * nb_cost_mat[:,4] / dis_k_vec) | |||||
## if np.count_nonzero(nb_cost_mat[:,4]) == 0: | |||||
## alpha = 0.75 | |||||
## else: | |||||
## alpha = np.min([dis_k_vec / c_vs for c_vs in nb_cost_mat[:,4] if c_vs != 0]) | |||||
## alpha = alpha * 0.99 | |||||
# param_vir = alpha * (nb_cost_mat[:,0] + nb_cost_mat[:,1]) | |||||
# param_eir = (1 - alpha) * (nb_cost_mat[:,4] + nb_cost_mat[:,5]) | |||||
# nb_cost_mat_new = np.column_stack((param_vir, param_eir)) | |||||
# dis_new = coeff * dis_k_vec - alpha * nb_cost_mat[:,3] | |||||
# | |||||
# x = cp.Variable(nb_cost_mat_new.shape[1]) | |||||
# cost = cp.sum_squares(nb_cost_mat_new * x - dis_new) | |||||
# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] | |||||
# prob = cp.Problem(cp.Minimize(cost), constraints) | |||||
# prob.solve() | |||||
# edit_costs_new = x.value | |||||
# edit_costs_new = np.array([edit_costs_new[0], edit_costs_new[1], alpha]) | |||||
# residual = np.sqrt(prob.value) | |||||
# # method 2: tune c_vir, c_eir and alpha by nonlinear programming by | |||||
# # scipy.optimize.minimize. | |||||
# w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1] | |||||
# w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5] | |||||
# w2 = nb_cost_mat[:,3] | |||||
# w3 = dis_k_vec | |||||
# func_min = lambda x: np.sum((w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \ | |||||
# + w2 * x[2] - w3 * x[3]) ** 2) | |||||
# bounds = ((0, None), (0., None), (0.5, 0.5), (0, None)) | |||||
# res = minimize(func_min, [0.9, 1.7, 0.75, 10], bounds=bounds) | |||||
# edit_costs_new = res.x[0:3] | |||||
# residual = res.fun | |||||
# method 3: tune c_vir, c_eir and alpha by nonlinear programming using cvxpy. | |||||
# # method 4: tune c_vir, c_eir and alpha by QP function | |||||
# # scipy.optimize.least_squares. An initial guess is required. | |||||
# w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1] | |||||
# w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5] | |||||
# w2 = nb_cost_mat[:,3] | |||||
# w3 = dis_k_vec | |||||
# func = lambda x: (w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \ | |||||
# + w2 * x[2] - w3 * x[3]) ** 2 | |||||
# res = optimize.root(func, [0.9, 1.7, 0.75, 100]) | |||||
# edit_costs_new = res.x | |||||
# residual = None | |||||
elif cost == 'LETTER2': | |||||
# # 1. if c_vi != c_vr, c_ei != c_er. | |||||
# nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] | |||||
# x = cp.Variable(nb_cost_mat_new.shape[1]) | |||||
# cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) | |||||
## # 1.1 no constraints. | |||||
## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] | |||||
# # 1.2 c_vs <= c_vi + c_vr. | |||||
# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], | |||||
# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] | |||||
## # 2. if c_vi == c_vr, c_ei == c_er. | |||||
## nb_cost_mat_new = nb_cost_mat[:,[0,3,4]] | |||||
## nb_cost_mat_new[:,0] += nb_cost_mat[:,1] | |||||
## nb_cost_mat_new[:,2] += nb_cost_mat[:,5] | |||||
## x = cp.Variable(nb_cost_mat_new.shape[1]) | |||||
## cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) | |||||
## # 2.1 no constraints. | |||||
## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] | |||||
### # 2.2 c_vs <= c_vi + c_vr. | |||||
### constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], | |||||
### np.array([2.0, -1.0, 0.0]).T@x >= 0.0] | |||||
# | |||||
# prob = cp.Problem(cp.Minimize(cost_fun), constraints) | |||||
# prob.solve() | |||||
# edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]] | |||||
# edit_costs_new = np.array(edit_costs_new) | |||||
# residual = np.sqrt(prob.value) | |||||
if rw_constraints == 'inequality': | |||||
# c_vs <= c_vi + c_vr. | |||||
nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] | |||||
x = cp.Variable(nb_cost_mat_new.shape[1]) | |||||
cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) | |||||
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], | |||||
np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] | |||||
prob = cp.Problem(cp.Minimize(cost_fun), constraints) | |||||
prob.solve() | |||||
edit_costs_new = x.value | |||||
residual = np.sqrt(prob.value) | |||||
elif rw_constraints == '2constraints': | |||||
# c_vs <= c_vi + c_vr and c_vi == c_vr, c_ei == c_er. | |||||
nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] | |||||
x = cp.Variable(nb_cost_mat_new.shape[1]) | |||||
cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) | |||||
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], | |||||
np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0, | |||||
np.array([1.0, -1.0, 0.0, 0.0, 0.0]).T@x == 0.0, | |||||
np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0] | |||||
prob = cp.Problem(cp.Minimize(cost_fun), constraints) | |||||
prob.solve() | |||||
edit_costs_new = x.value | |||||
residual = np.sqrt(prob.value) | |||||
# elif method == 'inequality_modified': | |||||
# # c_vs <= c_vi + c_vr. | |||||
# nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] | |||||
# x = cp.Variable(nb_cost_mat_new.shape[1]) | |||||
# cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) | |||||
# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], | |||||
# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] | |||||
# prob = cp.Problem(cp.Minimize(cost_fun), constraints) | |||||
# prob.solve() | |||||
# # use same costs for insertion and removal rather than the fitted costs. | |||||
# edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]] | |||||
# edit_costs_new = np.array(edit_costs_new) | |||||
# residual = np.sqrt(prob.value) | |||||
else: | |||||
# # method 1: simple least square method. | # # method 1: simple least square method. | ||||
# edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec, | # edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec, | ||||
# rcond=None) | # rcond=None) | ||||
@@ -181,16 +309,16 @@ def update_costs(nb_cost_mat, dis_k_vec): | |||||
# G = -1 * np.identity(nb_cost_mat.shape[1]) | # G = -1 * np.identity(nb_cost_mat.shape[1]) | ||||
# h = np.array([0 for i in range(nb_cost_mat.shape[1])]) | # h = np.array([0 for i in range(nb_cost_mat.shape[1])]) | ||||
x = cp.Variable(nb_cost_mat.shape[1]) | |||||
cost = cp.sum_squares(nb_cost_mat * x - dis_k_vec) | |||||
constraints = [x >= [0.0001 for i in range(nb_cost_mat.shape[1])], | |||||
# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] | |||||
np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, | |||||
np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] | |||||
prob = cp.Problem(cp.Minimize(cost), constraints) | |||||
prob.solve() | |||||
edit_costs_new = x.value | |||||
residual = np.sqrt(prob.value) | |||||
x = cp.Variable(nb_cost_mat.shape[1]) | |||||
cost_fun = cp.sum_squares(nb_cost_mat * x - dis_k_vec) | |||||
constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])], | |||||
# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] | |||||
np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, | |||||
np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] | |||||
prob = cp.Problem(cp.Minimize(cost_fun), constraints) | |||||
prob.solve() | |||||
edit_costs_new = x.value | |||||
residual = np.sqrt(prob.value) | |||||
# method 4: | # method 4: | ||||
@@ -13,33 +13,46 @@ import multiprocessing | |||||
from multiprocessing import Pool | from multiprocessing import Pool | ||||
from functools import partial | from functools import partial | ||||
from gedlibpy_linlin import librariesImport, gedlibpy | |||||
#from gedlibpy_linlin import librariesImport, gedlibpy | |||||
from libs import * | |||||
def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP', | |||||
def GED(g1, g2, dataset='monoterpenoides', lib='gedlibpy', cost='CHEM_1', method='IPFP', | |||||
edit_cost_constant=[], algo_options='', stabilizer='min', repeat=50): | edit_cost_constant=[], algo_options='', stabilizer='min', repeat=50): | ||||
""" | """ | ||||
Compute GED for 2 graphs. | Compute GED for 2 graphs. | ||||
""" | """ | ||||
def convertGraph(G, dataset):
    """Convert a graph to the proper NetworkX format that can be
    recognized by library gedlibpy.

    Node identifiers are converted to strings. For ``dataset == 'letter'``
    the first two entries of each node's ``'attributes'`` are stored as the
    string attributes ``x`` and ``y`` and edges carry no attributes. For any
    other dataset (including ``'monoterpenoides'``) nodes get ``chem`` from
    ``'atom'`` and edges get ``valence`` from ``'bond_type'``.
    """
    G_new = nx.Graph()
    if dataset == 'letter':
        for nd, attrs in G.nodes(data=True):
            G_new.add_node(str(nd), x=str(attrs['attributes'][0]),
                           y=str(attrs['attributes'][1]))
        for nd1, nd2, attrs in G.edges(data=True):
            G_new.add_edge(str(nd1), str(nd2))
    else:
        # 'monoterpenoides' and every unrecognised dataset share the same
        # symbolic node/edge labelling.
        for nd, attrs in G.nodes(data=True):
            G_new.add_node(str(nd), chem=attrs['atom'])
        for nd1, nd2, attrs in G.edges(data=True):
            G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
    return G_new
dataset = dataset.lower() | |||||
if lib == 'gedlibpy': | if lib == 'gedlibpy': | ||||
gedlibpy.restart_env() | gedlibpy.restart_env() | ||||
gedlibpy.add_nx_graph(convertGraph(g1), "") | |||||
gedlibpy.add_nx_graph(convertGraph(g2), "") | |||||
gedlibpy.add_nx_graph(convertGraph(g1, dataset), "") | |||||
gedlibpy.add_nx_graph(convertGraph(g2, dataset), "") | |||||
listID = gedlibpy.get_all_graph_ids() | listID = gedlibpy.get_all_graph_ids() | ||||
gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant) | gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant) | ||||
@@ -320,6 +333,60 @@ def get_nb_edit_operations(g1, g2, forward_map, backward_map): | |||||
# one of the nodes is removed, thus the edge is removed. | # one of the nodes is removed, thus the edge is removed. | ||||
if forward_map[idx1] == np.inf or forward_map[idx2] == np.inf: | if forward_map[idx1] == np.inf or forward_map[idx2] == np.inf: | ||||
n_er += 1 | n_er += 1 | ||||
# corresponding edge is in g2. | |||||
elif (forward_map[idx1], forward_map[idx2]) in g2.edges(): | |||||
nb_edges2_cnted += 1 | |||||
# edge labels are different. | |||||
if g2.edges[((forward_map[idx1], forward_map[idx2]))]['bond_type'] \ | |||||
!= g1.edges[(n1, n2)]['bond_type']: | |||||
n_es += 1 | |||||
elif (forward_map[idx2], forward_map[idx1]) in g2.edges(): | |||||
nb_edges2_cnted += 1 | |||||
# edge labels are different. | |||||
if g2.edges[((forward_map[idx2], forward_map[idx1]))]['bond_type'] \ | |||||
!= g1.edges[(n1, n2)]['bond_type']: | |||||
n_es += 1 | |||||
# corresponding nodes are in g2, however the edge is removed. | |||||
else: | |||||
n_er += 1 | |||||
n_ei = nx.number_of_edges(g2) - nb_edges2_cnted | |||||
return n_vi, n_vr, n_vs, n_ei, n_er, n_es | |||||
def get_nb_edit_operations_letter(g1, g2, forward_map, backward_map): | |||||
"""Compute the number of each edit operations. | |||||
""" | |||||
n_vi = 0 | |||||
n_vr = 0 | |||||
n_vs = 0 | |||||
sod_vs = 0 | |||||
n_ei = 0 | |||||
n_er = 0 | |||||
nodes1 = [n for n in g1.nodes()] | |||||
for i, map_i in enumerate(forward_map): | |||||
if map_i == np.inf: | |||||
n_vr += 1 | |||||
else: | |||||
n_vs += 1 | |||||
diff_x = float(g1.nodes[i]['x']) - float(g2.nodes[map_i]['x']) | |||||
diff_y = float(g1.nodes[i]['y']) - float(g2.nodes[map_i]['y']) | |||||
sod_vs += np.sqrt(np.square(diff_x) + np.square(diff_y)) | |||||
for map_i in backward_map: | |||||
if map_i == np.inf: | |||||
n_vi += 1 | |||||
# idx_nodes1 = range(0, len(node1)) | |||||
edges1 = [e for e in g1.edges()] | |||||
nb_edges2_cnted = 0 | |||||
for n1, n2 in edges1: | |||||
idx1 = nodes1.index(n1) | |||||
idx2 = nodes1.index(n2) | |||||
# one of the nodes is removed, thus the edge is removed. | |||||
if forward_map[idx1] == np.inf or forward_map[idx2] == np.inf: | |||||
n_er += 1 | |||||
# corresponding edge is in g2. Edge label is not considered. | # corresponding edge is in g2. Edge label is not considered. | ||||
elif (forward_map[idx1], forward_map[idx2]) in g2.edges() or \ | elif (forward_map[idx1], forward_map[idx2]) in g2.edges() or \ | ||||
(forward_map[idx2], forward_map[idx1]) in g2.edges(): | (forward_map[idx2], forward_map[idx1]) in g2.edges(): | ||||
@@ -329,4 +396,8 @@ def get_nb_edit_operations(g1, g2, forward_map, backward_map): | |||||
n_er += 1 | n_er += 1 | ||||
n_ei = nx.number_of_edges(g2) - nb_edges2_cnted | n_ei = nx.number_of_edges(g2) - nb_edges2_cnted | ||||
return n_vi, n_vr, n_vs, n_ei, n_er, n_es | |||||
return n_vi, n_vr, n_vs, sod_vs, n_ei, n_er | |||||
# This module is not meant to be run directly; point the user to the tests.
if __name__ == '__main__':
    print('check test_ged.py')
@@ -436,7 +436,8 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, | |||||
return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median | return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median | ||||
def iam_bash(Gn_names, edit_cost_constant, dataset='monoterpenoides', | |||||
def iam_bash(Gn_names, edit_cost_constant, cost='CONSTANT', | |||||
dataset='monoterpenoides', | |||||
graph_dir='/media/ljia/DATA/research-repo/codes/Linlin/py-graph/datasets/monoterpenoides/'): | graph_dir='/media/ljia/DATA/research-repo/codes/Linlin/py-graph/datasets/monoterpenoides/'): | ||||
"""Compute the iam by c++ implementation (gedlib) through bash. | """Compute the iam by c++ implementation (gedlib) through bash. | ||||
""" | """ | ||||
@@ -467,12 +468,12 @@ def iam_bash(Gn_names, edit_cost_constant, dataset='monoterpenoides', | |||||
# graph_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl' | # graph_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl' | ||||
command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n' | |||||
command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/Linlin/gedlib\'\n' | |||||
command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n' | command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n' | ||||
command += 'export LD_LIBRARY_PATH\n' | command += 'export LD_LIBRARY_PATH\n' | ||||
command += 'cd \'/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/bin\'\n' | command += 'cd \'/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/bin\'\n' | ||||
command += './iam_for_python_bash ' + dataset + ' ' + fn_collection \ | command += './iam_for_python_bash ' + dataset + ' ' + fn_collection \ | ||||
+ ' \'' + graph_dir + '\' ' | |||||
+ ' \'' + graph_dir + '\' ' + ' ' + cost + ' ' | |||||
if edit_cost_constant is None: | if edit_cost_constant is None: | ||||
command += 'None' | command += 'None' | ||||
else: | else: | ||||
@@ -484,7 +485,7 @@ def iam_bash(Gn_names, edit_cost_constant, dataset='monoterpenoides', | |||||
output = stream.readlines() | output = stream.readlines() | ||||
# print(output) | # print(output) | ||||
sod_sm = float(output[0].strip()) | sod_sm = float(output[0].strip()) | ||||
sod_gm= float(output[1].strip()) | |||||
sod_gm = float(output[1].strip()) | |||||
fname_sm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/set_median.gxl' | fname_sm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/set_median.gxl' | ||||
fname_gm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/gen_median.gxl' | fname_gm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/gen_median.gxl' | ||||
@@ -31,8 +31,9 @@ from fitDistance import fit_GED_to_kernel_distance | |||||
def median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, fit_method, | def median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, fit_method, | ||||
graph_dir='/media/ljia/DATA/research-repo/codes/Linlin/py-graph/datasets/monoterpenoides/', | graph_dir='/media/ljia/DATA/research-repo/codes/Linlin/py-graph/datasets/monoterpenoides/', | ||||
edit_costs=None, group_min=None, dataset='monoterpenoides', | edit_costs=None, group_min=None, dataset='monoterpenoides', | ||||
parallel=True): | |||||
cost='CONSTANT', parallel=True): | |||||
dataset = dataset.lower() | |||||
# # compute distances in kernel space. | # # compute distances in kernel space. | ||||
# dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, | # dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, | ||||
# Kmatrix=None, gkernel=gkernel) | # Kmatrix=None, gkernel=gkernel) | ||||
@@ -50,32 +51,53 @@ def median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, fit_metho | |||||
# group_min = (12, 13, 22, 29) # closest w.r.t path kernel | # group_min = (12, 13, 22, 29) # closest w.r.t path kernel | ||||
# group_min = (77, 85, 160, 171) # closest w.r.t ged | # group_min = (77, 85, 160, 171) # closest w.r.t ged | ||||
# group_min = (0,1,2,3,4,5,6,7,8,9,10,11) # closest w.r.t treelet kernel | # group_min = (0,1,2,3,4,5,6,7,8,9,10,11) # closest w.r.t treelet kernel | ||||
Gn_median = [Gn[g].copy() for g in group_min] | Gn_median = [Gn[g].copy() for g in group_min] | ||||
# fit edit costs. | # fit edit costs. | ||||
if fit_method == 'random': # random | if fit_method == 'random': # random | ||||
edit_cost_constant = random.sample(range(1, 10), 6) | |||||
if cost == 'LETTER': | |||||
edit_cost_constant = random.sample(range(1, 10), 3) | |||||
edit_cost_constant = [item * 0.1 for item in edit_cost_constant] | |||||
elif cost == 'LETTER2': | |||||
random.seed(time.time()) | |||||
edit_cost_constant = random.sample(range(1, 10), 5) | |||||
# edit_cost_constant = [item * 0.1 for item in edit_cost_constant] | |||||
else: | |||||
edit_cost_constant = random.sample(range(1, 10), 6) | |||||
print('edit costs used:', edit_cost_constant) | print('edit costs used:', edit_cost_constant) | ||||
elif fit_method == 'expert': # expert | elif fit_method == 'expert': # expert | ||||
edit_cost_constant = [3, 3, 1, 3, 3, 1] | edit_cost_constant = [3, 3, 1, 3, 3, 1] | ||||
elif fit_method == 'k-graphs': | elif fit_method == 'k-graphs': | ||||
itr_max = 6 | itr_max = 6 | ||||
algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' | |||||
params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP', | |||||
'algo_options': algo_options, 'stabilizer': None} | |||||
if cost == 'LETTER': | |||||
init_costs = [0.9, 1.7, 0.75] | |||||
elif cost == 'LETTER2': | |||||
init_costs = [0.675, 0.675, 0.75, 0.425, 0.425] | |||||
else: | |||||
init_costs = [3, 3, 1, 3, 3, 1] | |||||
algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' | |||||
params_ged = {'lib': 'gedlibpy', 'cost': cost, 'method': 'IPFP', | |||||
'algo_options': algo_options, 'stabilizer': None} | |||||
# fit on k-graph subset | # fit on k-graph subset | ||||
edit_cost_constant, _, _, _, _, _, _ = fit_GED_to_kernel_distance(Gn_median, | edit_cost_constant, _, _, _, _, _, _ = fit_GED_to_kernel_distance(Gn_median, | ||||
node_label, edge_label, gkernel, itr_max, params_ged=params_ged, parallel=True) | |||||
node_label, edge_label, gkernel, itr_max, params_ged=params_ged, | |||||
init_costs=init_costs, dataset=dataset, parallel=True) | |||||
elif fit_method == 'whole-dataset': | elif fit_method == 'whole-dataset': | ||||
itr_max = 6 | itr_max = 6 | ||||
algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' | |||||
params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP', | |||||
if cost == 'LETTER': | |||||
init_costs = [0.9, 1.7, 0.75] | |||||
elif cost == 'LETTER2': | |||||
init_costs = [0.675, 0.675, 0.75, 0.425, 0.425] | |||||
else: | |||||
init_costs = [3, 3, 1, 3, 3, 1] | |||||
algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' | |||||
params_ged = {'lib': 'gedlibpy', 'cost': cost, 'method': 'IPFP', | |||||
'algo_options': algo_options, 'stabilizer': None} | 'algo_options': algo_options, 'stabilizer': None} | ||||
# fit on all subset | # fit on all subset | ||||
edit_cost_constant, _, _, _, _, _, _ = fit_GED_to_kernel_distance(Gn, | edit_cost_constant, _, _, _, _, _, _ = fit_GED_to_kernel_distance(Gn, | ||||
node_label, edge_label, gkernel, itr_max, params_ged=params_ged, parallel=True) | |||||
node_label, edge_label, gkernel, itr_max, params_ged=params_ged, | |||||
init_costs=init_costs, dataset=dataset, parallel=True) | |||||
elif fit_method == 'precomputed': | elif fit_method == 'precomputed': | ||||
edit_cost_constant = edit_costs | edit_cost_constant = edit_costs | ||||
@@ -83,14 +105,17 @@ def median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, fit_metho | |||||
# compute set median and gen median using IAM (C++ through bash). | # compute set median and gen median using IAM (C++ through bash). | ||||
group_fnames = [Gn[g].graph['filename'] for g in group_min] | group_fnames = [Gn[g].graph['filename'] for g in group_min] | ||||
sod_sm, sod_gm, fname_sm, fname_gm = iam_bash(group_fnames, edit_cost_constant, | sod_sm, sod_gm, fname_sm, fname_gm = iam_bash(group_fnames, edit_cost_constant, | ||||
graph_dir=graph_dir, dataset=dataset) | |||||
cost=cost, graph_dir=graph_dir, | |||||
dataset=dataset) | |||||
# compute distances in kernel space. | # compute distances in kernel space. | ||||
Gn_median = [Gn[g].copy() for g in group_min] | Gn_median = [Gn[g].copy() for g in group_min] | ||||
set_median = loadGXL(fname_sm) | set_median = loadGXL(fname_sm) | ||||
gen_median = loadGXL(fname_gm) | gen_median = loadGXL(fname_gm) | ||||
if dataset == 'Letter': | |||||
# print(gen_median.nodes(data=True)) | |||||
# print(gen_median.edges(data=True)) | |||||
if dataset == 'letter': | |||||
for g in Gn_median: | for g in Gn_median: | ||||
reform_attributes(g) | reform_attributes(g) | ||||
reform_attributes(set_median) | reform_attributes(set_median) | ||||
@@ -98,16 +123,19 @@ def median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, fit_metho | |||||
# compute distance in kernel space for set median. | # compute distance in kernel space for set median. | ||||
Kmatrix_sm = compute_kernel([set_median] + Gn_median, gkernel, | Kmatrix_sm = compute_kernel([set_median] + Gn_median, gkernel, | ||||
None if dataset == 'Letter' else 'chem', | |||||
None if dataset == 'Letter' else 'valence', | |||||
None if dataset == 'letter' else 'chem', | |||||
None if dataset == 'letter' else 'valence', | |||||
False) | False) | ||||
dis_k_sm = dis_gstar(0, range(1, 1+len(Gn_median)), | dis_k_sm = dis_gstar(0, range(1, 1+len(Gn_median)), | ||||
[1 / len(Gn_median)] * len(Gn_median), Kmatrix_sm, withterm3=False) | [1 / len(Gn_median)] * len(Gn_median), Kmatrix_sm, withterm3=False) | ||||
# print(gen_median.nodes(data=True)) | |||||
# print(gen_median.edges(data=True)) | |||||
# print(set_median.nodes(data=True)) | |||||
# print(set_median.edges(data=True)) | |||||
# compute distance in kernel space for generalized median. | # compute distance in kernel space for generalized median. | ||||
Kmatrix_gm = compute_kernel([gen_median] + Gn_median, gkernel, | Kmatrix_gm = compute_kernel([gen_median] + Gn_median, gkernel, | ||||
None if dataset == 'Letter' else 'chem', | |||||
None if dataset == 'Letter' else 'valence', | |||||
None if dataset == 'letter' else 'chem', | |||||
None if dataset == 'letter' else 'valence', | |||||
False) | False) | ||||
dis_k_gm = dis_gstar(0, range(1, 1+len(Gn_median)), | dis_k_gm = dis_gstar(0, range(1, 1+len(Gn_median)), | ||||
[1 / len(Gn_median)] * len(Gn_median), Kmatrix_gm, withterm3=False) | [1 / len(Gn_median)] * len(Gn_median), Kmatrix_gm, withterm3=False) | ||||
@@ -61,8 +61,8 @@ def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose): | |||||
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}, | {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}, | ||||
n_jobs=multiprocessing.cpu_count(), verbose=verbose) | n_jobs=multiprocessing.cpu_count(), verbose=verbose) | ||||
elif graph_kernel == 'treeletkernel': | elif graph_kernel == 'treeletkernel': | ||||
# pkernel = functools.partial(polynomialkernel, d=2, c=1e5) | |||||
pkernel = functools.partial(gaussiankernel, gamma=1e-6) | |||||
pkernel = functools.partial(polynomialkernel, d=2, c=1e5) | |||||
# pkernel = functools.partial(gaussiankernel, gamma=1e-6) | |||||
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | ||||
Kmatrix, _ = treeletkernel(Gn, node_label=node_label, edge_label=edge_label, | Kmatrix, _ = treeletkernel(Gn, node_label=node_label, edge_label=edge_label, | ||||
sub_kernel=pkernel, | sub_kernel=pkernel, | ||||
@@ -19,11 +19,13 @@ from preimage.test_k_closest_graphs import median_on_k_closest_graphs, reform_at | |||||
from preimage.utils import get_same_item_indices | from preimage.utils import get_same_item_indices | ||||
from preimage.find_best_k import getRelations | from preimage.find_best_k import getRelations | ||||
def xp_letter_h(): | |||||
ds = {'name': 'Letter-high', | |||||
'dataset': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/collections/Letter.xml', | |||||
def xp_letter_h_LETTER2_cost(): | |||||
ds = {'dataset': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/collections/Letter.xml', | |||||
'graph_dir': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/datasets/Letter/HIGH/'} # node/edge symb | 'graph_dir': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/datasets/Letter/HIGH/'} # node/edge symb | ||||
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir']) | Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir']) | ||||
for G in Gn: | |||||
reform_attributes(G) | |||||
# ds = {'name': 'Letter-high', | # ds = {'name': 'Letter-high', | ||||
# 'dataset': '../datasets/Letter-high/Letter-high_A.txt'} # node/edge symb | # 'dataset': '../datasets/Letter-high/Letter-high_A.txt'} # node/edge symb | ||||
# Gn, y_all = loadDataset(ds['dataset']) | # Gn, y_all = loadDataset(ds['dataset']) | ||||
@@ -33,32 +35,35 @@ def xp_letter_h(): | |||||
edge_label = None | edge_label = None | ||||
ds_name = 'letter-h' | ds_name = 'letter-h' | ||||
dir_output = 'results/xp_letter_h/' | dir_output = 'results/xp_letter_h/' | ||||
save_results = True | |||||
cost = 'LETTER2' | |||||
repeats = 1 | repeats = 1 | ||||
# k_list = range(2, 11) | # k_list = range(2, 11) | ||||
k_list = [150] | k_list = [150] | ||||
fit_method = 'precomputed' | |||||
fit_method = 'k-graphs' | |||||
# get indices by classes. | # get indices by classes. | ||||
y_idx = get_same_item_indices(y_all) | y_idx = get_same_item_indices(y_all) | ||||
# create result files. | |||||
fn_output_detail = 'results_detail.' + fit_method + '.csv' | |||||
f_detail = open(dir_output + fn_output_detail, 'a') | |||||
csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k', | |||||
'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', | |||||
'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', | |||||
'dis_k gi -> GM', 'median set']) | |||||
f_detail.close() | |||||
fn_output_summary = 'results_summary.csv' | |||||
f_summary = open(dir_output + fn_output_summary, 'a') | |||||
csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k', | |||||
'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', | |||||
'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', | |||||
'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM', | |||||
'# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM', | |||||
'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM', | |||||
'repeats better dis_k gi -> GM']) | |||||
f_summary.close() | |||||
if save_results: | |||||
# create result files. | |||||
fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' | |||||
f_detail = open(dir_output + fn_output_detail, 'a') | |||||
csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k', | |||||
'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', | |||||
'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', | |||||
'dis_k gi -> GM', 'median set']) | |||||
f_detail.close() | |||||
fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' | |||||
f_summary = open(dir_output + fn_output_summary, 'a') | |||||
csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k', | |||||
'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', | |||||
'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', | |||||
'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM', | |||||
'# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM', | |||||
'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM', | |||||
'repeats better dis_k gi -> GM']) | |||||
f_summary.close() | |||||
random.seed(1) | random.seed(1) | ||||
rdn_seed_list = random.sample(range(0, repeats * 100), repeats) | rdn_seed_list = random.sample(range(0, repeats * 100), repeats) | ||||
@@ -82,11 +87,11 @@ def xp_letter_h(): | |||||
for i, (y, values) in enumerate(y_idx.items()): | for i, (y, values) in enumerate(y_idx.items()): | ||||
print('\ny =', y) | print('\ny =', y) | ||||
# y = 'I' | |||||
# y = 'F' | |||||
# values = y_idx[y] | # values = y_idx[y] | ||||
# values = values[0:10] | |||||
# k = len(values) | |||||
# k = kkk | |||||
k = len(values) | |||||
sod_sm_list = [] | sod_sm_list = [] | ||||
sod_gm_list = [] | sod_gm_list = [] | ||||
@@ -114,20 +119,21 @@ def xp_letter_h(): | |||||
= median_on_k_closest_graphs(Gn_median, node_label, edge_label, | = median_on_k_closest_graphs(Gn_median, node_label, edge_label, | ||||
gkernel, k, fit_method=fit_method, graph_dir=ds['graph_dir'], | gkernel, k, fit_method=fit_method, graph_dir=ds['graph_dir'], | ||||
edit_costs=None, group_min=median_set_idx_idx, | edit_costs=None, group_min=median_set_idx_idx, | ||||
dataset='Letter', parallel=False) | |||||
dataset='Letter', cost=cost, parallel=False) | |||||
# write result detail. | # write result detail. | ||||
sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm)) | sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm)) | ||||
dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm)) | dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm)) | ||||
dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min)) | dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min)) | ||||
dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min)) | dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min)) | ||||
f_detail = open(dir_output + fn_output_detail, 'a') | |||||
csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k, | |||||
y, repeat, | |||||
sod_sm, sod_gm, dis_k_sm, dis_k_gm, | |||||
dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm, | |||||
dis_k_gi2gm, median_set_idx]) | |||||
f_detail.close() | |||||
if save_results: | |||||
f_detail = open(dir_output + fn_output_detail, 'a') | |||||
csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k, | |||||
y, repeat, | |||||
sod_sm, sod_gm, dis_k_sm, dis_k_gm, | |||||
dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm, | |||||
dis_k_gi2gm, median_set_idx]) | |||||
f_detail.close() | |||||
# compute result summary. | # compute result summary. | ||||
sod_sm_list.append(sod_sm) | sod_sm_list.append(sod_sm) | ||||
@@ -170,14 +176,17 @@ def xp_letter_h(): | |||||
# save median graphs. | # save median graphs. | ||||
fname_sm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/set_median.gxl' | fname_sm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/set_median.gxl' | ||||
fn_pre_sm_new = dir_output + 'medians/set_median.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) | |||||
fn_pre_sm_new = dir_output + 'medians/set_median.' + fit_method \ | |||||
+ '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) | |||||
copyfile(fname_sm, fn_pre_sm_new + '.gxl') | copyfile(fname_sm, fn_pre_sm_new + '.gxl') | ||||
fname_gm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/gen_median.gxl' | fname_gm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/gen_median.gxl' | ||||
fn_pre_gm_new = dir_output + 'medians/gen_median.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) | |||||
fn_pre_gm_new = dir_output + 'medians/gen_median.' + fit_method \ | |||||
+ '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) | |||||
copyfile(fname_gm, fn_pre_gm_new + '.gxl') | copyfile(fname_gm, fn_pre_gm_new + '.gxl') | ||||
G_best_kernel = Gn_median[idx_dis_k_gi_min].copy() | G_best_kernel = Gn_median[idx_dis_k_gi_min].copy() | ||||
reform_attributes(G_best_kernel) | reform_attributes(G_best_kernel) | ||||
fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) | |||||
fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.' + fit_method \ | |||||
+ '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat) | |||||
saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='gedlib-letter') | saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='gedlib-letter') | ||||
# plot median graphs. | # plot median graphs. | ||||
@@ -197,16 +206,17 @@ def xp_letter_h(): | |||||
dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1])) | dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1])) | ||||
dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1])) | dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1])) | ||||
dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1])) | dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1])) | ||||
f_summary = open(dir_output + fn_output_summary, 'a') | |||||
csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, y, | |||||
sod_sm_mean_list[-1], sod_gm_mean_list[-1], | |||||
dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1], | |||||
dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean, | |||||
dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm, | |||||
nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm, | |||||
repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm, | |||||
repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm]) | |||||
f_summary.close() | |||||
if save_results: | |||||
f_summary = open(dir_output + fn_output_summary, 'a') | |||||
csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, y, | |||||
sod_sm_mean_list[-1], sod_gm_mean_list[-1], | |||||
dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1], | |||||
dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean, | |||||
dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm, | |||||
nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm, | |||||
repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm, | |||||
repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm]) | |||||
f_summary.close() | |||||
# write result summary for each letter. | # write result summary for each letter. | ||||
@@ -219,13 +229,232 @@ def xp_letter_h(): | |||||
dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean)) | dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean)) | ||||
dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean)) | dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean)) | ||||
dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean)) | dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean)) | ||||
if save_results: | |||||
f_summary = open(dir_output + fn_output_summary, 'a') | |||||
csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, 'all', | |||||
sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean, | |||||
dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean, | |||||
dis_k_gi2sm_mean, dis_k_gi2gm_mean]) | |||||
f_summary.close() | |||||
print('\ncomplete.') | |||||
def xp_letter_h():
    """Run the set-median / generalized-median experiment on Letter (HIGH).

    The graphs are loaded with ``loadDataset`` and normalised in place with
    ``reform_attributes``.  For every class label, all graphs of that class
    are handed to ``median_on_k_closest_graphs``; the returned SODs and
    kernel-space distances are compared, tallied per repeat, averaged per
    class and finally averaged over all classes.

    Side effects:
        * copies the set/generalized median ``.gxl`` files produced in the
          fixed gedlib temp directory into ``dir_output + 'medians/'``,
          saves the best graph with ``saveGXL`` and draws all three graphs
          with ``draw_Letter_graph`` (always, regardless of
          ``save_results``);
        * when ``save_results`` is true, appends header and result rows to
          the detail and summary CSV files under ``dir_output``.

    Returns:
        None.
    """
    ds = {'dataset': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/collections/Letter.xml',
          'graph_dir': '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/data/datasets/Letter/HIGH/'}  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir'])
    for G in Gn:
        reform_attributes(G)

    gkernel = 'structuralspkernel'
    node_label = None
    edge_label = None
    ds_name = 'letter-h'
    dir_output = 'results/xp_letter_h/'
    save_results = False

    repeats = 1
    k_list = [150]
    fit_method = 'k-graphs'
    # get indices by classes.
    y_idx = get_same_item_indices(y_all)

    fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
    fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'

    def _append_row(fn_output, row):
        # Append one CSV row; the context manager closes the file even if
        # writing the row raises.
        with open(dir_output + fn_output, 'a') as f_out:
            csv.writer(f_out).writerow(row)

    def _tally(counts, better_repeats, before, after, repeat):
        # counts = [#improved, #equal, #worse]; a repeat counts as improved
        # when the value decreased from `before` to `after`.  If none of the
        # comparisons holds (e.g. NaN), nothing is counted.
        if before > after:
            counts[0] += 1
            better_repeats.append(repeat)
        elif before == after:
            counts[1] += 1
        elif before < after:
            counts[2] += 1

    if save_results:
        # create result files.
        _append_row(fn_output_detail,
                    ['dataset', 'graph kernel', 'fit method', 'k',
                     'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
                     'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
                     'dis_k gi -> GM', 'median set'])
        _append_row(fn_output_summary,
                    ['dataset', 'graph kernel', 'fit method', 'k',
                     'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
                     'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
                     'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM',
                     '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM',
                     'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
                     'repeats better dis_k gi -> GM'])

    # One reproducible seed per repeat.
    random.seed(1)
    rdn_seed_list = random.sample(range(0, repeats * 100), repeats)

    for k in k_list:
        print('\n--------- k =', k, '----------')

        sod_sm_mean_list = []
        sod_gm_mean_list = []
        dis_k_sm_mean_list = []
        dis_k_gm_mean_list = []
        dis_k_gi_min_mean_list = []

        for i, (y, values) in enumerate(y_idx.items()):
            print('\ny =', y)

            # NOTE: k is rebound to the class size here, so every graph of
            # the class enters the median set and the value taken from
            # k_list is only used for the banner printed above.
            k = len(values)

            sod_sm_list = []
            sod_gm_list = []
            dis_k_sm_list = []
            dis_k_gm_list = []
            dis_k_gi_min_list = []
            nb_sod_sm2gm = [0, 0, 0]
            nb_dis_k_sm2gm = [0, 0, 0]
            nb_dis_k_gi2sm = [0, 0, 0]
            nb_dis_k_gi2gm = [0, 0, 0]
            repeats_better_sod_sm2gm = []
            repeats_better_dis_k_sm2gm = []
            repeats_better_dis_k_gi2sm = []
            repeats_better_dis_k_gi2gm = []

            for repeat in range(repeats):
                print('\nrepeat =', repeat)
                random.seed(rdn_seed_list[repeat])
                median_set_idx_idx = random.sample(range(0, len(values)), k)
                median_set_idx = [values[idx] for idx in median_set_idx_idx]
                print('median set: ', median_set_idx)
                Gn_median = [Gn[g] for g in values]

                sod_sm, sod_gm, dis_k_sm, dis_k_gm, _, dis_k_gi_min, idx_dis_k_gi_min \
                    = median_on_k_closest_graphs(Gn_median, node_label, edge_label,
                        gkernel, k, fit_method=fit_method, graph_dir=ds['graph_dir'],
                        edit_costs=None, group_min=median_set_idx_idx,
                        dataset='Letter', parallel=False)

                # write result detail.
                sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
                dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
                dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
                dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
                if save_results:
                    _append_row(fn_output_detail,
                                [ds_name, gkernel, fit_method, k,
                                 y, repeat,
                                 sod_sm, sod_gm, dis_k_sm, dis_k_gm,
                                 dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
                                 dis_k_gi2gm, median_set_idx])

                # compute result summary.
                sod_sm_list.append(sod_sm)
                sod_gm_list.append(sod_gm)
                dis_k_sm_list.append(dis_k_sm)
                dis_k_gm_list.append(dis_k_gm)
                dis_k_gi_min_list.append(dis_k_gi_min)
                # # SOD SM -> GM
                _tally(nb_sod_sm2gm, repeats_better_sod_sm2gm,
                       sod_sm, sod_gm, repeat)
                # # dis_k SM -> GM
                _tally(nb_dis_k_sm2gm, repeats_better_dis_k_sm2gm,
                       dis_k_sm, dis_k_gm, repeat)
                # # dis_k gi -> SM
                _tally(nb_dis_k_gi2sm, repeats_better_dis_k_gi2sm,
                       dis_k_gi_min, dis_k_sm, repeat)
                # # dis_k gi -> GM
                _tally(nb_dis_k_gi2gm, repeats_better_dis_k_gi2gm,
                       dis_k_gi_min, dis_k_gm, repeat)

                # save median graphs.
                file_suffix = '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
                fname_sm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/set_median.gxl'
                fn_pre_sm_new = dir_output + 'medians/set_median.' + fit_method \
                    + file_suffix
                copyfile(fname_sm, fn_pre_sm_new + '.gxl')
                fname_gm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/gen_median.gxl'
                fn_pre_gm_new = dir_output + 'medians/gen_median.' + fit_method \
                    + file_suffix
                copyfile(fname_gm, fn_pre_gm_new + '.gxl')
                G_best_kernel = Gn_median[idx_dis_k_gi_min].copy()
                reform_attributes(G_best_kernel)
                fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.' + fit_method \
                    + file_suffix
                saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='gedlib-letter')

                # plot median graphs.
                set_median = loadGXL(fn_pre_sm_new + '.gxl')
                gen_median = loadGXL(fn_pre_gm_new + '.gxl')
                draw_Letter_graph(set_median, fn_pre_sm_new)
                draw_Letter_graph(gen_median, fn_pre_gm_new)
                draw_Letter_graph(G_best_kernel, fn_pre_g_best_kernel)

            # write result summary for each letter.
            sod_sm_mean_list.append(np.mean(sod_sm_list))
            sod_gm_mean_list.append(np.mean(sod_gm_list))
            dis_k_sm_mean_list.append(np.mean(dis_k_sm_list))
            dis_k_gm_mean_list.append(np.mean(dis_k_gm_list))
            dis_k_gi_min_mean_list.append(np.mean(dis_k_gi_min_list))
            sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean_list[-1] - sod_sm_mean_list[-1]))
            dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1]))
            dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
            dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
            if save_results:
                _append_row(fn_output_summary,
                            [ds_name, gkernel, fit_method, k, y,
                             sod_sm_mean_list[-1], sod_gm_mean_list[-1],
                             dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1],
                             dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean,
                             dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm,
                             nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
                             repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
                             repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])

        # write result summary over all letters.
        sod_sm_mean = np.mean(sod_sm_mean_list)
        sod_gm_mean = np.mean(sod_gm_mean_list)
        dis_k_sm_mean = np.mean(dis_k_sm_mean_list)
        dis_k_gm_mean = np.mean(dis_k_gm_mean_list)
        # Average the per-class means like the four values above; averaging
        # dis_k_gi_min_list would only reflect the last class processed.
        dis_k_gi_min_mean = np.mean(dis_k_gi_min_mean_list)
        sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
        dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
        dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
        dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
        if save_results:
            _append_row(fn_output_summary,
                        [ds_name, gkernel, fit_method, k, 'all',
                         sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
                         dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
                         dis_k_gi2sm_mean, dis_k_gi2gm_mean])

    print('\ncomplete.')
@@ -243,4 +472,5 @@ def draw_Letter_graph(graph, file_prefix): | |||||
if __name__ == "__main__": | if __name__ == "__main__": | ||||
xp_letter_h() | |||||
# xp_letter_h() | |||||
xp_letter_h_LETTER2_cost() |