@@ -28,6 +28,7 @@ dslist = [ | |||||
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb | # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb | ||||
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | ||||
# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | ||||
# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb | |||||
# | # | ||||
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | ||||
# # node/edge symb | # # node/edge symb | ||||
@@ -57,7 +58,7 @@ estimator = marginalizedkernel | |||||
#param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.3, 3), | #param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.3, 3), | ||||
# 'n_iteration': np.linspace(1, 1, 1), | # 'n_iteration': np.linspace(1, 1, 1), | ||||
param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.9, 9), | param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.9, 9), | ||||
'n_iteration': np.linspace(5, 20, 4), | |||||
'n_iteration': np.linspace(1, 19, 7), | |||||
'remove_totters': [False]} | 'remove_totters': [False]} | ||||
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | ||||
{'alpha': np.logspace(-10, 10, num=41, base=10)}] | {'alpha': np.logspace(-10, 10, num=41, base=10)}] | ||||
@@ -24,6 +24,9 @@ dslist = [ | |||||
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb | # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb | ||||
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | ||||
# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | ||||
# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge | |||||
# {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt'}, | |||||
# # node nsymb symb | |||||
# | # | ||||
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | ||||
# # node/edge symb | # # node/edge symb | ||||
@@ -30,6 +30,8 @@ dslist = [ | |||||
# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | # {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | ||||
# # node symb/nsymb | # # node symb/nsymb | ||||
# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | ||||
# {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt'}, | |||||
# # node nsymb symb | |||||
# | # | ||||
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | ||||
# # node/edge symb | # # node/edge symb | ||||
@@ -26,6 +26,7 @@ dslist = [ | |||||
{'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | ||||
{'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb | {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb | ||||
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | ||||
# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb | |||||
# node symb/nsymb | # node symb/nsymb | ||||
# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | # {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | ||||
# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | # {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | ||||
@@ -27,7 +27,8 @@ dslist = [ | |||||
{'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb | {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb | ||||
{'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb | {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb | ||||
{'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | ||||
{'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | |||||
{'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | |||||
# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb | |||||
# | # | ||||
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | ||||
# # node/edge symb | # # node/edge symb | ||||
@@ -54,11 +55,11 @@ dslist = [ | |||||
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, | # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, | ||||
] | ] | ||||
estimator = untilhpathkernel | estimator = untilhpathkernel | ||||
param_grid_precomputed = {'depth': np.linspace(3, 10, 8), # [2], | |||||
'k_func': [None]} # ['MinMax', 'tanimoto'], | |||||
#param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2], | |||||
# 'k_func': ['MinMax'], # ['MinMax', 'tanimoto'], | |||||
# 'compute_method': ['trie']} # ['MinMax']} | |||||
#param_grid_precomputed = {'depth': np.linspace(3, 10, 8), # [2], | |||||
# 'k_func': [None]} # ['MinMax', 'tanimoto'], | |||||
param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2], | |||||
'k_func': ['MinMax', 'tanimoto'], # ['MinMax'], # | |||||
'compute_method': ['trie']} # ['MinMax']} | |||||
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | ||||
{'alpha': np.logspace(-10, 10, num=41, base=10)}] | {'alpha': np.logspace(-10, 10, num=41, base=10)}] | ||||
@@ -30,6 +30,8 @@ dslist = [ | |||||
{'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb | {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb | ||||
{'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb | ||||
{'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb | ||||
# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb | |||||
# | # | ||||
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | ||||
# # node/edge symb | # # node/edge symb | ||||
@@ -7,7 +7,7 @@ Created on Wed Oct 16 14:20:06 2019 | |||||
""" | """ | ||||
import numpy as np | import numpy as np | ||||
from tqdm import tqdm | from tqdm import tqdm | ||||
from itertools import combinations_with_replacement | |||||
from itertools import combinations_with_replacement, combinations | |||||
import multiprocessing | import multiprocessing | ||||
from multiprocessing import Pool | from multiprocessing import Pool | ||||
from functools import partial | from functools import partial | ||||
@@ -22,110 +22,88 @@ import sys | |||||
from ged import GED, get_nb_edit_operations | from ged import GED, get_nb_edit_operations | ||||
from utils import kernel_distance_matrix | from utils import kernel_distance_matrix | ||||
def _ged_residual(ged_vec, dis_k_vec):
    """Return the root of the summed squared gaps between GEDs and kernel distances."""
    return np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec)))


def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, k=4,
                               params_ged={'lib': 'gedlibpy', 'cost': 'CONSTANT',
                                           'method': 'IPFP', 'stabilizer': None},
                               init_costs=[3, 3, 1, 3, 3, 1],
                               parallel=True):
    """Fit constant edit costs so that GEDs approximate kernel-space distances.

    The pairwise distances between the graphs of ``Gn`` in kernel space are
    computed once. Starting from ``init_costs``, each iteration derives new
    edit costs from the current numbers of edit operations (``update_costs``)
    and then recomputes the GEDs with those costs (``compute_geds``).

    Parameters
    ----------
    Gn : list
        Graphs handed to ``kernel_distance_matrix`` and ``compute_geds``.
    node_label, edge_label, gkernel
        Forwarded unchanged to ``kernel_distance_matrix``.
    itr_max : int
        Number of cost-update iterations.
    k : int
        Not used by this function; kept for interface compatibility.
    params_ged : dict
        Options forwarded to ``compute_geds``. A copy is made and its
        ``'edit_cost_constant'`` entry is set by this function, so neither the
        caller's dict nor the shared default is modified.
    init_costs : list
        Initial edit costs, in the order c_vi, c_vr, c_vs, c_ei, c_er, c_es
        (or parts of them).
    parallel : bool
        Forwarded to ``compute_geds``.

    Returns
    -------
    tuple
        ``(edit_costs_new, residual_list, edit_cost_list, dis_k_mat, ged_mat,
        time_list, nb_cost_mat_list)``. The lists hold one entry for the
        initial costs plus one per iteration. With ``itr_max == 0`` the
        returned costs are the initial ones.

    Raises
    ------
    ValueError
        If an updated edit cost is negative beyond the -1e-9 tolerance.
    """
    # Private copies: the defaults are module-level objects shared by every
    # call, and 'edit_cost_constant' is written into params_ged below.
    params_ged = dict(params_ged)
    init_costs = list(init_costs)

    # Compute distances in feature space; keep the strict upper triangle only
    # (i < j), in the same pair order that compute_geds uses.
    dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label,
                                                gkernel=gkernel)
    nb_graphs = len(dis_k_mat)
    dis_k_vec = np.array([dis_k_mat[i, j]
                          for i in range(nb_graphs)
                          for j in range(i + 1, nb_graphs)])

    # Initial GEDs with the starting costs.
    print('\ninitial:')
    time0 = time.time()
    params_ged['edit_cost_constant'] = init_costs
    ged_vec_init, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
                                                            parallel=parallel)
    residual_list = [_ged_residual(ged_vec_init, dis_k_vec)]
    time_list = [time.time() - time0]
    edit_cost_list = [init_costs]
    nb_cost_mat = np.array(n_edit_operations)
    nb_cost_mat_list = [nb_cost_mat]
    print('edit_costs:', init_costs)
    print('residual_list:', residual_list)

    # Defined up front so that itr_max == 0 still returns a valid cost vector.
    edit_costs_new = init_costs

    for itr in range(itr_max):
        print('\niteration', itr)
        time0 = time.time()

        # "Fit" GEDs to distances in feature space by tuning the edit costs.
        edit_costs_new, _ = update_costs(nb_cost_mat, dis_k_vec)

        # Tiny negative values are treated as numerical noise and clamped to 0;
        # anything more negative is an error.
        for idx, cost in enumerate(edit_costs_new):
            if cost < 0:
                if cost <= -1e-9:
                    raise ValueError('The edit cost is negative.')
                edit_costs_new[idx] = 0

        # Compute new GEDs and numbers of edit operations.
        params_ged['edit_cost_constant'] = edit_costs_new
        ged_vec, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
                                                           parallel=parallel)
        residual_list.append(_ged_residual(ged_vec, dis_k_vec))
        time_list.append(time.time() - time0)
        edit_cost_list.append(edit_costs_new)
        nb_cost_mat = np.array(n_edit_operations)
        nb_cost_mat_list.append(nb_cost_mat)
        print('edit_costs:', edit_costs_new)
        print('residual_list:', residual_list)

    return edit_costs_new, residual_list, edit_cost_list, dis_k_mat, ged_mat, \
        time_list, nb_cost_mat_list
def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False): | |||||
def compute_geds(Gn, params_ged, parallel=False): | |||||
ged_mat = np.zeros((len(Gn), len(Gn))) | ged_mat = np.zeros((len(Gn), len(Gn))) | ||||
if parallel: | if parallel: | ||||
# print('parallel') | # print('parallel') | ||||
len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | |||||
ged_all = [0 for i in range(len_itr)] | |||||
n_edit_operations = [[0 for i in range(len_itr)] for j in | |||||
range(len(idx_nonzeros))] | |||||
itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||||
# len_itr = int(len(Gn) * (len(Gn) + 1) / 2) | |||||
len_itr = int(len(Gn) * (len(Gn) - 1) / 2) | |||||
ged_vec = [0 for i in range(len_itr)] | |||||
n_edit_operations = [0 for i in range(len_itr)] | |||||
# itr = combinations_with_replacement(range(0, len(Gn)), 2) | |||||
itr = combinations(range(0, len(Gn)), 2) | |||||
n_jobs = multiprocessing.cpu_count() | n_jobs = multiprocessing.cpu_count() | ||||
if len_itr < 100 * n_jobs: | if len_itr < 100 * n_jobs: | ||||
chunksize = int(len_itr / n_jobs) + 1 | chunksize = int(len_itr / n_jobs) + 1 | ||||
@@ -134,68 +112,52 @@ def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False): | |||||
def init_worker(gn_toshare): | def init_worker(gn_toshare): | ||||
global G_gn | global G_gn | ||||
G_gn = gn_toshare | G_gn = gn_toshare | ||||
do_partial = partial(_wrapper_compute_ged_parallel, edit_cost_constant, | |||||
idx_nonzeros) | |||||
do_partial = partial(_wrapper_compute_ged_parallel, params_ged) | |||||
pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,)) | pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,)) | ||||
iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize), | iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize), | ||||
desc='computing GEDs', file=sys.stdout) | desc='computing GEDs', file=sys.stdout) | ||||
# iterator = pool.imap_unordered(do_partial, itr, chunksize) | # iterator = pool.imap_unordered(do_partial, itr, chunksize) | ||||
for i, j, dis, n_eo_tmp in iterator: | for i, j, dis, n_eo_tmp in iterator: | ||||
idx_itr = int(len(Gn) * i + j - i * (i + 1) / 2) | |||||
ged_all[idx_itr] = dis | |||||
idx_itr = int(len(Gn) * i + j - (i + 1) * (i + 2) / 2) | |||||
ged_vec[idx_itr] = dis | |||||
ged_mat[i][j] = dis | ged_mat[i][j] = dis | ||||
ged_mat[j][i] = dis | ged_mat[j][i] = dis | ||||
for idx, item in enumerate(idx_nonzeros): | |||||
n_edit_operations[idx][idx_itr] = n_eo_tmp[item] | |||||
n_edit_operations[idx_itr] = n_eo_tmp | |||||
# print('\n-------------------------------------------') | # print('\n-------------------------------------------') | ||||
# print(i, j, idx_itr, dis) | # print(i, j, idx_itr, dis) | ||||
pool.close() | pool.close() | ||||
pool.join() | pool.join() | ||||
else: | else: | ||||
ged_all = [] | |||||
n_edit_operations = [[] for i in range(len(idx_nonzeros))] | |||||
ged_vec = [] | |||||
n_edit_operations = [] | |||||
for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout): | for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout): | ||||
# for i in range(len(Gn)): | # for i in range(len(Gn)): | ||||
for j in range(i, len(Gn)): | |||||
# time0 = time.time() | |||||
dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], lib='gedlibpy', | |||||
cost='CONSTANT', method='IPFP', | |||||
edit_cost_constant=edit_cost_constant, stabilizer='min', | |||||
repeat=50) | |||||
# time1 = time.time() - time0 | |||||
# time0 = time.time() | |||||
ged_all.append(dis) | |||||
for j in range(i + 1, len(Gn)): | |||||
dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], **params_ged) | |||||
ged_vec.append(dis) | |||||
ged_mat[i][j] = dis | ged_mat[i][j] = dis | ||||
ged_mat[j][i] = dis | ged_mat[j][i] = dis | ||||
n_eo_tmp = get_nb_edit_operations(Gn[i], Gn[j], pi_forward, pi_backward) | n_eo_tmp = get_nb_edit_operations(Gn[i], Gn[j], pi_forward, pi_backward) | ||||
for idx, item in enumerate(idx_nonzeros): | |||||
n_edit_operations[idx].append(n_eo_tmp[item]) | |||||
# time2 = time.time() - time0 | |||||
# print(time1, time2, time1 / time2) | |||||
n_edit_operations.append(n_eo_tmp) | |||||
return ged_all, ged_mat, n_edit_operations | |||||
return ged_vec, ged_mat, n_edit_operations | |||||
def _wrapper_compute_ged_parallel(params_ged, itr):
    """Pool worker: compute the GED for the graph pair indexed by ``itr``.

    ``itr`` holds the two indices of the pair; the graphs are read from the
    worker-global ``G_gn``. Returns ``(i, j, dis, n_eo_tmp)`` so the caller can
    place the unordered results.
    """
    first, second = itr[0], itr[1]
    distance, nb_edit_ops = _compute_ged_parallel(G_gn[first], G_gn[second],
                                                  params_ged)
    return first, second, distance, nb_edit_ops
def _compute_ged_parallel(g1, g2, params_ged):
    """Compute the GED between ``g1`` and ``g2`` and count the edit operations.

    ``params_ged`` is unpacked as keyword arguments to ``GED``. The forward and
    backward maps it returns are passed to ``get_nb_edit_operations``.
    Returns ``(dis, n_eo_tmp)``.
    """
    ged_result = GED(g1, g2, **params_ged)
    distance, forward_map, backward_map = ged_result
    nb_edit_ops = get_nb_edit_operations(g1, g2, forward_map, backward_map)
    return distance, nb_edit_ops
def compute_better_costs(nb_cost_mat, dis_k_vec): | |||||
def update_costs(nb_cost_mat, dis_k_vec): | |||||
# # method 1: simple least square method. | # # method 1: simple least square method. | ||||
# edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec, | # edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec, | ||||
# rcond=None) | # rcond=None) | ||||
@@ -203,7 +165,7 @@ def compute_better_costs(nb_cost_mat, dis_k_vec): | |||||
# # method 2: least square method with x_i >= 0. | # # method 2: least square method with x_i >= 0. | ||||
# edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec) | # edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec) | ||||
# method 3: solve as a quadratic program with constraints: x_i >= 0, sum(x) = 1. | |||||
# method 3: solve as a quadratic program with constraints. | |||||
# P = np.dot(nb_cost_mat.T, nb_cost_mat) | # P = np.dot(nb_cost_mat.T, nb_cost_mat) | ||||
# q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat) | # q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat) | ||||
# G = -1 * np.identity(nb_cost_mat.shape[1]) | # G = -1 * np.identity(nb_cost_mat.shape[1]) | ||||
@@ -221,7 +183,7 @@ def compute_better_costs(nb_cost_mat, dis_k_vec): | |||||
# h = np.array([0 for i in range(nb_cost_mat.shape[1])]) | # h = np.array([0 for i in range(nb_cost_mat.shape[1])]) | ||||
x = cp.Variable(nb_cost_mat.shape[1]) | x = cp.Variable(nb_cost_mat.shape[1]) | ||||
cost = cp.sum_squares(nb_cost_mat * x - dis_k_vec) | cost = cp.sum_squares(nb_cost_mat * x - dis_k_vec) | ||||
constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])], | |||||
constraints = [x >= [0.0001 for i in range(nb_cost_mat.shape[1])], | |||||
# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] | # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] | ||||
np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, | np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, | ||||
np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] | np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] | ||||
@@ -13,29 +13,30 @@ import multiprocessing | |||||
from multiprocessing import Pool | from multiprocessing import Pool | ||||
from functools import partial | from functools import partial | ||||
from gedlibpy import librariesImport, gedlibpy | |||||
from gedlibpy_linlin import librariesImport, gedlibpy | |||||
def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP', | def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP', | ||||
edit_cost_constant=[], stabilizer='min', repeat=50): | |||||
edit_cost_constant=[], algo_options='', stabilizer='min', repeat=50): | |||||
""" | """ | ||||
Compute GED for 2 graphs. | Compute GED for 2 graphs. | ||||
""" | """ | ||||
if lib == 'gedlibpy': | |||||
def convertGraph(G): | |||||
"""Convert a graph to the proper NetworkX format that can be | |||||
recognized by library gedlibpy. | |||||
""" | |||||
G_new = nx.Graph() | |||||
for nd, attrs in G.nodes(data=True): | |||||
G_new.add_node(str(nd), chem=attrs['atom']) | |||||
def convertGraph(G): | |||||
"""Convert a graph to the proper NetworkX format that can be | |||||
recognized by library gedlibpy. | |||||
""" | |||||
G_new = nx.Graph() | |||||
for nd, attrs in G.nodes(data=True): | |||||
G_new.add_node(str(nd), chem=attrs['atom']) | |||||
# G_new.add_node(str(nd), x=str(attrs['attributes'][0]), | # G_new.add_node(str(nd), x=str(attrs['attributes'][0]), | ||||
# y=str(attrs['attributes'][1])) | # y=str(attrs['attributes'][1])) | ||||
for nd1, nd2, attrs in G.edges(data=True): | |||||
G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) | |||||
# G_new.add_edge(str(nd1), str(nd2)) | |||||
return G_new | |||||
for nd1, nd2, attrs in G.edges(data=True): | |||||
# G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) | |||||
G_new.add_edge(str(nd1), str(nd2)) | |||||
return G_new | |||||
if lib == 'gedlibpy': | |||||
gedlibpy.restart_env() | gedlibpy.restart_env() | ||||
gedlibpy.add_nx_graph(convertGraph(g1), "") | gedlibpy.add_nx_graph(convertGraph(g1), "") | ||||
gedlibpy.add_nx_graph(convertGraph(g2), "") | gedlibpy.add_nx_graph(convertGraph(g2), "") | ||||
@@ -43,12 +44,12 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP', | |||||
listID = gedlibpy.get_all_graph_ids() | listID = gedlibpy.get_all_graph_ids() | ||||
gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant) | gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant) | ||||
gedlibpy.init() | gedlibpy.init() | ||||
gedlibpy.set_method(method, "") | |||||
gedlibpy.set_method(method, algo_options) | |||||
gedlibpy.init_method() | gedlibpy.init_method() | ||||
g = listID[0] | g = listID[0] | ||||
h = listID[1] | h = listID[1] | ||||
if stabilizer == None: | |||||
if stabilizer is None: | |||||
gedlibpy.run_method(g, h) | gedlibpy.run_method(g, h) | ||||
pi_forward = gedlibpy.get_forward_map(g, h) | pi_forward = gedlibpy.get_forward_map(g, h) | ||||
pi_backward = gedlibpy.get_backward_map(g, h) | pi_backward = gedlibpy.get_backward_map(g, h) | ||||
@@ -107,13 +108,57 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP', | |||||
dis = upper | dis = upper | ||||
# Translate the returned indices into node labels; an index outside the other graph's node range is stored as np.inf.
nodes1 = [n for n in g1.nodes()] | |||||
nodes2 = [n for n in g2.nodes()] | |||||
nb1 = nx.number_of_nodes(g1) | |||||
nb2 = nx.number_of_nodes(g2) | |||||
pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward] | |||||
pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward] | |||||
elif lib == 'gedlib-bash': | |||||
import time | |||||
import random | |||||
import sys | |||||
import os | |||||
sys.path.insert(0, "../") | |||||
from pygraph.utils.graphfiles import saveDataset | |||||
tmp_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/' | |||||
if not os.path.exists(tmp_dir): | |||||
os.makedirs(tmp_dir) | |||||
fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, 1e9)) | |||||
xparams = {'method': 'gedlib', 'graph_dir': fn_collection} | |||||
saveDataset([g1, g2], ['dummy', 'dummy'], gformat='gxl', group='xml', | |||||
filename=fn_collection, xparams=xparams) | |||||
command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n' | |||||
command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n' | |||||
command += 'export LD_LIBRARY_PATH\n' | |||||
command += 'cd \'/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/bin\'\n' | |||||
command += './ged_for_python_bash monoterpenoides ' + fn_collection \ | |||||
+ ' \'' + algo_options + '\' ' | |||||
for ec in edit_cost_constant: | |||||
command += str(ec) + ' ' | |||||
# output = os.system(command) | |||||
stream = os.popen(command) | |||||
output = stream.readlines() | |||||
# print(output) | |||||
dis = float(output[0].strip()) | |||||
runtime = float(output[1].strip()) | |||||
size_forward = int(output[2].strip()) | |||||
pi_forward = [int(item.strip()) for item in output[3:3+size_forward]] | |||||
pi_backward = [int(item.strip()) for item in output[3+size_forward:]] | |||||
# print(dis) | |||||
# print(runtime) | |||||
# print(size_forward) | |||||
# print(pi_forward) | |||||
# print(pi_backward) | |||||
# Translate the returned indices into node labels; an index outside the other graph's node range is stored as np.inf.
nodes1 = [n for n in g1.nodes()] | |||||
nodes2 = [n for n in g2.nodes()] | |||||
nb1 = nx.number_of_nodes(g1) | |||||
nb2 = nx.number_of_nodes(g2) | |||||
pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward] | |||||
pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward] | |||||
# print(pi_forward) | |||||
return dis, pi_forward, pi_backward | return dis, pi_forward, pi_backward | ||||
@@ -149,7 +194,7 @@ def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP', | |||||
g = listID[0] | g = listID[0] | ||||
h = listID[1] | h = listID[1] | ||||
if stabilizer == None: | |||||
if stabilizer is None: | |||||
gedlibpy.run_method(g, h) | gedlibpy.run_method(g, h) | ||||
pi_forward = gedlibpy.get_forward_map(g, h) | pi_forward = gedlibpy.get_forward_map(g, h) | ||||
pi_backward = gedlibpy.get_backward_map(g, h) | pi_backward = gedlibpy.get_backward_map(g, h) | ||||
@@ -183,7 +228,8 @@ def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP', | |||||
def ged_median(Gn, Gn_median, verbose=False, params_ged={'lib': 'gedlibpy', | def ged_median(Gn, Gn_median, verbose=False, params_ged={'lib': 'gedlibpy', | ||||
'cost': 'CHEM_1', 'method': 'IPFP', 'edit_cost_constant': [], | 'cost': 'CHEM_1', 'method': 'IPFP', 'edit_cost_constant': [], | ||||
'stabilizer': 'min', 'repeat': 50}, parallel=False): | |||||
'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1', | |||||
'stabilizer': None}, parallel=False): | |||||
if parallel: | if parallel: | ||||
len_itr = int(len(Gn)) | len_itr = int(len(Gn)) | ||||
pi_forward_list = [[] for i in range(len_itr)] | pi_forward_list = [[] for i in range(len_itr)] | ||||
@@ -23,7 +23,8 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, | |||||
connected=False, removeNodes=True, allBestInit=False, allBestNodes=False, | connected=False, removeNodes=True, allBestInit=False, allBestNodes=False, | ||||
allBestEdges=False, allBestOutput=False, | allBestEdges=False, allBestOutput=False, | ||||
params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP', | params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP', | ||||
'edit_cost_constant': [], 'stabilizer': 'min', 'repeat': 50}): | |||||
'edit_cost_constant': [], 'stabilizer': None, | |||||
'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'}): | |||||
"""See my name, then you know what I do. | """See my name, then you know what I do. | ||||
""" | """ | ||||
# Gn_median = Gn_median[0:10] | # Gn_median = Gn_median[0:10] | ||||
@@ -435,6 +436,62 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, | |||||
return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median | return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median | ||||
def iam_bash(Gn_names, edit_cost_constant, dataset='monoterpenoides',
             graph_dir='/media/ljia/DATA/research-repo/codes/Linlin/py-graph/datasets/monoterpenoides/'):
    """Compute the IAM medians with the C++ implementation (gedlib) through bash.

    A temporary GraphCollection XML file listing ``Gn_names`` is written, the
    external ``iam_for_python_bash`` binary is run on it, and the first two
    lines of its standard output are parsed as floats.

    Parameters
    ----------
    Gn_names : list of str
        Graph file names written into the collection file.
    edit_cost_constant : iterable or None
        Edit cost constants appended to the command line, separated by
        spaces. ``None`` passes the literal word ``None`` to the binary.
    dataset : str
        Dataset name passed as the first argument of the binary.
    graph_dir : str
        Directory passed to the binary as the location of the graph files.

    Returns
    -------
    tuple
        ``(sod_sm, sod_gm, fname_sm, fname_gm)``: the two floats read from the
        binary's output followed by the fixed paths ``set_median.gxl`` and
        ``gen_median.gxl`` inside the temporary output directory.

    Raises
    ------
    IndexError, ValueError
        If the binary prints fewer than two lines or non-numeric output
        (e.g. because it failed to run).
    """
    import os
    import random
    import shlex
    import time

    def createCollectionFile(Gn_names, y, filename):
        """Write ``filename + '.xml'`` listing each graph file with its class."""
        dirname_ds = os.path.dirname(filename)
        # Only create a directory when the file name actually has one;
        # os.makedirs('') would raise.
        if dirname_ds:
            os.makedirs(dirname_ds, exist_ok=True)
        lines = ["<?xml version=\"1.0\"?>",
                 "<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">",
                 "<GraphCollection>"]
        for idx, fname in enumerate(Gn_names):
            lines.append("\t<graph file=\"" + fname + "\" class=\"" + str(y[idx]) + "\"/>")
        lines.append("</GraphCollection>")
        with open(filename + '.xml', 'w') as fgroup:
            fgroup.write("\n".join(lines))

    tmp_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/'
    # Timestamp + random suffix keeps concurrent runs from sharing a file.
    # The upper bound must be an int: randint rejects floats on Python >= 3.12.
    fn_collection = tmp_dir + 'collection.' + str(time.time()) \
        + str(random.randint(0, 10 ** 9))
    # The binary does not use the class labels here, so dummies are written.
    createCollectionFile(Gn_names, ['dummy'] * len(Gn_names), fn_collection)

    command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n'
    command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n'
    command += 'export LD_LIBRARY_PATH\n'
    command += 'cd \'/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/bin\'\n'
    # Quote caller-supplied values so spaces or shell metacharacters in them
    # cannot split or alter the command.
    command += './iam_for_python_bash ' + shlex.quote(dataset) + ' ' \
        + shlex.quote(fn_collection) + ' ' + shlex.quote(graph_dir) + ' '
    if edit_cost_constant is None:
        command += 'None'
    else:
        command += ' '.join(str(ec) for ec in edit_cost_constant)

    # Close the pipe deterministically once the output has been read.
    with os.popen(command) as stream:
        output = stream.readlines()

    sod_sm = float(output[0].strip())
    sod_gm = float(output[1].strip())

    fname_sm = tmp_dir + 'set_median.gxl'
    fname_gm = tmp_dir + 'gen_median.gxl'

    return sod_sm, sod_gm, fname_sm, fname_gm
############################################################################### | ############################################################################### | ||||
# Old implementations. | # Old implementations. | ||||
@@ -16,6 +16,319 @@ from utils import remove_edges | |||||
from fitDistance import fit_GED_to_kernel_distance | from fitDistance import fit_GED_to_kernel_distance | ||||
from utils import normalize_distance_matrix | from utils import normalize_distance_matrix | ||||
def median_paper_clcpc_python_best():
    """Fit edit costs to kernel distances on the monoterpenoides median sets.

    For every class in ``y_all`` and every repeat, the matching graph
    collection is loaded, ``fit_GED_to_kernel_distance`` is run with the
    'gedlibpy' GED settings (IPFP, 40 initial solutions), the results are
    saved to an ``.npz`` file, the fitted edit costs are appended as one line
    to a shared text file, and the pairwise substitution consistency between
    the kernel-distance and GED matrices is printed.

    The original note on this experiment states the constraints
    c_vs <= c_vi + c_vr and c_es <= c_ei + c_er; they are presumably enforced
    inside ``fit_GED_to_kernel_distance`` (not visible here).
    """
    gkernel = 'untilhpathkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    itr_max = 6
    algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
    params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
                  'algo_options': algo_options, 'stabilizer': None}

    y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
    repeats = 50
    collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
    graph_dir = collection_path + 'gxl/'

    fn_edit_costs_output = 'results/median_paper/edit_costs_output.python_init40.k10.txt'

    for y in y_all:
        for repeat in range(repeats):
            collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
            Gn, _ = loadDataset(collection_file, extra_params=graph_dir)

            # NOTE(review): seven values are unpacked here, while the sibling
            # experiments in this file unpack an eighth (coef_dk) from the
            # same function -- confirm which arity is current.
            edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
                nb_cost_mat_list = fit_GED_to_kernel_distance(
                    Gn, node_label, edge_label, gkernel, itr_max,
                    params_ged=params_ged, parallel=True)
            total_time = np.sum(time_list)

            np.savez('results/median_paper/fit_distance.clcpc.python_init40.monot.elabeled.uhpkernel.y'
                     + y + '.repeat' + str(repeat) + '.k10..gm',
                     edit_costs=edit_costs,
                     residual_list=residual_list, edit_cost_list=edit_cost_list,
                     dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
                     total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)

            # Append this run's costs as one space-separated line. The file is
            # opened only for the write so it is always closed, even if a
            # later step raises.
            with open(fn_edit_costs_output, 'a') as edit_costs_output_file:
                edit_costs_output_file.write(
                    ''.join(str(ec) + ' ' for ec in edit_costs) + '\n')

            nb_consistent, nb_inconsistent, ratio_consistent = \
                pairwise_substitution_consistence(dis_k_mat, ged_mat)
            print(nb_consistent, nb_inconsistent, ratio_consistent)
def median_paper_clcpc_python_bash_cpp():
    """Fit edit costs to kernel distances using the bash-invoked C++ GED.

    For every class in ``y_all`` and every repeat, the matching graph
    collection is loaded, ``fit_GED_to_kernel_distance`` is run with the
    'gedlib-bash' GED settings (IPFP, 10 initial solutions), the results are
    saved to an ``.npz`` file, the fitted edit costs are appended as one line
    to a shared text file, and the pairwise substitution consistency between
    the kernel-distance and GED matrices is printed.

    The original note on this experiment states the constraints
    c_vs <= c_vi + c_vr and c_es <= c_ei + c_er; they are presumably enforced
    inside ``fit_GED_to_kernel_distance`` (not visible here).
    """
    gkernel = 'untilhpathkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    itr_max = 20
    algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
    params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP',
                  'algo_options': algo_options}

    y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
    repeats = 50
    collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
    graph_dir = collection_path + 'gxl/'

    fn_edit_costs_output = 'results/median_paper/edit_costs_output.txt'

    for y in y_all:
        for repeat in range(repeats):
            collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
            Gn, _ = loadDataset(collection_file, extra_params=graph_dir)

            edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
                nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(
                    Gn, node_label, edge_label, gkernel, itr_max,
                    params_ged=params_ged, parallel=False)
            total_time = np.sum(time_list)

            np.savez('results/median_paper/fit_distance.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
                     + y + '.repeat' + str(repeat) + '.gm',
                     edit_costs=edit_costs,
                     residual_list=residual_list, edit_cost_list=edit_cost_list,
                     dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
                     total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
                     coef_dk=coef_dk)

            # Append this run's costs as one space-separated line. The file is
            # opened only for the write so it is always closed, even if a
            # later step raises.
            with open(fn_edit_costs_output, 'a') as edit_costs_output_file:
                edit_costs_output_file.write(
                    ''.join(str(ec) + ' ' for ec in edit_costs) + '\n')

            nb_consistent, nb_inconsistent, ratio_consistent = \
                pairwise_substitution_consistence(dis_k_mat, ged_mat)
            print(nb_consistent, nb_inconsistent, ratio_consistent)
def _save_heatmap_eps_png(matrix, path_stem):
    """Draw `matrix` as an image with a colorbar; save <path_stem>.eps and .png."""
    plt.imshow(matrix)
    plt.colorbar()
    plt.savefig(path_stem + '.eps', format='eps', dpi=300)
    plt.savefig(path_stem + '.png', format='png')
    # Clear the figure so the next heatmap starts from an empty canvas.
    plt.clf()


def test_cs_leq_ci_plus_cr_python_bash_cpp():
    """Fit edit costs on the full monoterpenoides dataset via bash-invoked C++ GED.

    Runs ``fit_GED_to_kernel_distance`` with the 'gedlib-bash' settings,
    prints and saves the fitted quantities, prints the pairwise substitution
    consistency of the kernel-distance and GED matrices, and saves heatmaps
    (EPS and PNG) of the normalized kernel-distance matrix, the normalized
    GED matrix, and their difference under ``results/``.

    The original note on this experiment states the constraints
    c_vs <= c_vi + c_vr and c_es <= c_ei + c_er; they are presumably enforced
    inside ``fit_GED_to_kernel_distance`` (not visible here).
    """
    ds = {'name': 'monoterpenoides',
          'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'])
    gkernel = 'untilhpathkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    itr_max = 10
    algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
    params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP',
                  'algo_options': algo_options}

    edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
        nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(
            Gn, node_label, edge_label, gkernel, itr_max,
            params_ged=params_ged, parallel=False)
    total_time = np.sum(time_list)

    print('\nedit_costs:', edit_costs)
    print('\nresidual_list:', residual_list)
    print('\nedit_cost_list:', edit_cost_list)
    print('\ndistance matrix in kernel space:', dis_k_mat)
    print('\nged matrix:', ged_mat)
    print('\ntotal time:', total_time)
    print('\nnb_cost_mat:', nb_cost_mat_list[-1])

    # To re-plot without refitting, load this file with np.load and reuse
    # its dis_k_mat / ged_mat arrays.
    np.savez('results/fit_distance.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel.gm',
             edit_costs=edit_costs,
             residual_list=residual_list, edit_cost_list=edit_cost_list,
             dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
             total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
             coef_dk=coef_dk)

    nb_consistent, nb_inconsistent, ratio_consistent = \
        pairwise_substitution_consistence(dis_k_mat, ged_mat)
    print(nb_consistent, nb_inconsistent, ratio_consistent)

    norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
    _save_heatmap_eps_png(
        norm_dis_k_mat,
        'results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel')

    norm_ged_mat = normalize_distance_matrix(ged_mat)
    _save_heatmap_eps_png(
        norm_ged_mat,
        'results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel')

    norm_diff = norm_ged_mat - norm_dis_k_mat
    _save_heatmap_eps_png(
        norm_diff,
        'results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel')
def test_anycosts(): | def test_anycosts(): | ||||
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', | ||||
'extra_params': {}} # node/edge symb | 'extra_params': {}} # node/edge symb | ||||
@@ -295,8 +608,12 @@ def draw_count_bar(norm_diff): | |||||
if __name__ == '__main__': | if __name__ == '__main__': | ||||
# test_anycosts() | # test_anycosts() | ||||
test_cs_leq_ci_plus_cr() | |||||
# test_cs_leq_ci_plus_cr() | |||||
# test_unfitted() | # test_unfitted() | ||||
# test_cs_leq_ci_plus_cr_python_bash_cpp() | |||||
# median_paper_clcpc_python_bash_cpp() | |||||
median_paper_clcpc_python_best() | |||||
# x = np.array([[1,2,3],[4,5,6],[7,8,9]]) | # x = np.array([[1,2,3],[4,5,6],[7,8,9]]) | ||||
# xx = pairwise_substitution(x) | # xx = pairwise_substitution(x) |
@@ -22,6 +22,130 @@ from iam import iam_upgraded | |||||
from utils import remove_edges, compute_kernel, get_same_item_indices, dis_gstar | from utils import remove_edges, compute_kernel, get_same_item_indices, dis_gstar | ||||
#from ged import ged_median | #from ged import ged_median | ||||
def test_iam_monoterpenoides_with_init40():
    """Run IAM on every monoterpenoides median set with 40 initial solutions.

    For each class in ``y_all`` and each repeat, the median set is loaded,
    ``iam_upgraded`` is run with unfitted constant edit costs, and the run
    time plus the sums of distances (SOD) of the set median and of the
    generalized median are recorded. Per-class values, per-class means and
    overall means are printed.
    """
    gkernel = 'untilhpathkernel'
    node_label = 'atom'
    edge_label = 'bond_type'

    # Unfitted edit costs.
    c_vi, c_vr, c_vs = 3, 3, 1
    c_ei, c_er, c_es = 3, 3, 1

    # IAM settings.
    ite_max_iam = 50
    epsilon_iam = 0.0001
    removeNodes = False
    connected_iam = False

    # GED settings handed to IAM.
    params_ged = {
        'lib': 'gedlibpy',
        'cost': 'CONSTANT',
        'method': 'IPFP',
        'edit_cost_constant': [c_vi, c_vr, c_vs, c_ei, c_er, c_es],
        'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1',
        'stabilizer': None,
    }

    collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
    graph_dir = collection_path + 'gxl/'
    y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
    repeats = 50

    # One entry per class; the first three end up holding per-class means.
    time_list = []
    sod_gs_list = []
    sod_set_median_list = []
    g_best = []
    sod_list_list = []
    # Placeholders for the kernel-space statistics, which are not computed.
    dis_ks_min_list = []
    dis_ks_set_median_list = []

    for y in y_all:
        print('\n-------------------------------------------------------')
        print('class of y:', y)

        times_y = []
        sods_set_median_y = []
        sods_gs_y = []
        best_graphs_y = []
        g_best.append(best_graphs_y)
        dis_ks_min_list.append([])
        dis_ks_set_median_list.append([])

        for repeat in range(repeats):
            # Load the median set; the candidate set is a copy of it.
            collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
            Gn_median, _ = loadDataset(collection_file, extra_params=graph_dir)
            Gn_candidate = [g.copy() for g in Gn_median]

            started = time.time()
            iam_result = iam_upgraded(
                Gn_median, Gn_candidate,
                c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
                epsilon=epsilon_iam, node_label=node_label, edge_label=edge_label,
                connected=connected_iam, removeNodes=removeNodes,
                params_ged=params_ged)
            G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \
                = iam_result
            time_total = time.time() - started

            print('\ntime: ', time_total)
            times_y.append(time_total)
            best_graphs_y.append(G_gen_median_list[0])
            sods_set_median_y.append(sod_set_median)
            print('\nsmallest sod of the set median:', sod_set_median)
            sods_gs_y.append(sod_gen_median)
            print('\nsmallest sod in graph space:', sod_gen_median)
            sod_list_list.append(sod_list)

        print('\nsods of the set median for this class:', sods_set_median_y)
        print('\nsods in graph space for this class:', sods_gs_y)
        print('\ntimes for this class:', times_y)

        # Keep only the per-class means for the final summary.
        sod_set_median_list.append(np.mean(sods_set_median_y))
        sod_gs_list.append(np.mean(sods_gs_y))
        time_list.append(np.mean(times_y))

    print()
    print('\nmean sods of the set median for each class:', sod_set_median_list)
    print('\nmean sods in graph space for each class:', sod_gs_list)
    print('\nmean times for each class:', time_list)

    print('\nmean sods of the set median of all:', np.mean(sod_set_median_list))
    print('\nmean sods in graph space of all:', np.mean(sod_gs_list))
    print('\nmean times of all:', np.mean(time_list))
def test_iam_monoterpenoides(): | def test_iam_monoterpenoides(): | ||||
ds = {'name': 'monoterpenoides', | ds = {'name': 'monoterpenoides', | ||||
'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | ||||
@@ -834,9 +958,10 @@ if __name__ == '__main__': | |||||
# tests on different numbers of median-sets. | # tests on different numbers of median-sets. | ||||
# test_iam_median_nb() | # test_iam_median_nb() | ||||
# test_iam_letter_h() | # test_iam_letter_h() | ||||
test_iam_monoterpenoides() | |||||
# test_iam_monoterpenoides() | |||||
# test_iam_mutag() | # test_iam_mutag() | ||||
# test_iam_fitdistance() | # test_iam_fitdistance() | ||||
# print("test log") | # print("test log") | ||||
test_iam_monoterpenoides_with_init40() |
@@ -17,8 +17,10 @@ from pygraph.kernels.marginalizedKernel import marginalizedkernel | |||||
from pygraph.kernels.untilHPathKernel import untilhpathkernel | from pygraph.kernels.untilHPathKernel import untilhpathkernel | ||||
from pygraph.kernels.spKernel import spkernel | from pygraph.kernels.spKernel import spkernel | ||||
import functools | import functools | ||||
from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct | |||||
from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct, polynomialkernel | |||||
from pygraph.kernels.structuralspKernel import structuralspkernel | from pygraph.kernels.structuralspKernel import structuralspkernel | ||||
from pygraph.kernels.treeletKernel import treeletkernel | |||||
from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel | |||||
def remove_edges(Gn): | def remove_edges(Gn): | ||||
@@ -46,18 +48,29 @@ def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose): | |||||
n_jobs=multiprocessing.cpu_count(), verbose=verbose) | n_jobs=multiprocessing.cpu_count(), verbose=verbose) | ||||
elif graph_kernel == 'untilhpathkernel': | elif graph_kernel == 'untilhpathkernel': | ||||
Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label, | Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label, | ||||
depth=10, k_func='MinMax', compute_method='trie', | |||||
depth=7, k_func='MinMax', compute_method='trie', | |||||
n_jobs=multiprocessing.cpu_count(), verbose=verbose) | n_jobs=multiprocessing.cpu_count(), verbose=verbose) | ||||
elif graph_kernel == 'spkernel': | elif graph_kernel == 'spkernel': | ||||
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | ||||
Kmatrix, _, _ = spkernel(Gn, node_label='atom', node_kernels= | |||||
Kmatrix, _, _ = spkernel(Gn, node_label=node_label, node_kernels= | |||||
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}, | {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}, | ||||
n_jobs=multiprocessing.cpu_count(), verbose=verbose) | n_jobs=multiprocessing.cpu_count(), verbose=verbose) | ||||
elif graph_kernel == 'structuralspkernel': | elif graph_kernel == 'structuralspkernel': | ||||
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | ||||
Kmatrix, _ = structuralspkernel(Gn, node_label='atom', node_kernels= | |||||
Kmatrix, _ = structuralspkernel(Gn, node_label=node_label, node_kernels= | |||||
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}, | {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}, | ||||
n_jobs=multiprocessing.cpu_count(), verbose=verbose) | n_jobs=multiprocessing.cpu_count(), verbose=verbose) | ||||
elif graph_kernel == 'treeletkernel': | |||||
# pkernel = functools.partial(polynomialkernel, d=2, c=1e5) | |||||
pkernel = functools.partial(gaussiankernel, gamma=1e-6) | |||||
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | |||||
Kmatrix, _ = treeletkernel(Gn, node_label=node_label, edge_label=edge_label, | |||||
sub_kernel=pkernel, | |||||
n_jobs=multiprocessing.cpu_count(), verbose=verbose) | |||||
elif graph_kernel == 'weisfeilerlehmankernel': | |||||
Kmatrix, _ = weisfeilerlehmankernel(Gn, node_label=node_label, edge_label=edge_label, | |||||
height=4, base_kernel='subtree', | |||||
n_jobs=multiprocessing.cpu_count(), verbose=verbose) | |||||
# normalization | # normalization | ||||
Kmatrix_diag = Kmatrix.diagonal().copy() | Kmatrix_diag = Kmatrix.diagonal().copy() | ||||
@@ -79,7 +92,7 @@ def gram2distances(Kmatrix): | |||||
def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None, gkernel=None): | def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None, gkernel=None): | ||||
dis_mat = np.empty((len(Gn), len(Gn))) | dis_mat = np.empty((len(Gn), len(Gn))) | ||||
if Kmatrix == None: | |||||
if Kmatrix is None: | |||||
Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True) | Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True) | ||||
for i in range(len(Gn)): | for i in range(len(Gn)): | ||||
for j in range(i, len(Gn)): | for j in range(i, len(Gn)): | ||||
@@ -109,6 +122,21 @@ def get_same_item_indices(ls): | |||||
return idx_dict | return idx_dict | ||||
def k_nearest_neighbors_to_median_in_kernel_space(Gn, Kmatrix=None, gkernel=None,
                                                  node_label=None, edge_label=None):
    """Compute the kernel-space distance from each graph to the median of Gn.

    The median is the uniformly weighted combination of all graphs in ``Gn``
    (every weight is ``1 / len(Gn)``). Distances are computed by
    ``dis_gstar`` from the Gram matrix.

    Parameters
    ----------
    Gn : list
        The graphs. Assumed to be in the same order as the rows/columns of
        ``Kmatrix`` -- TODO confirm against callers.
    Kmatrix : 2-D array, optional
        Precomputed Gram matrix over ``Gn``, indexed as ``Kmatrix[i, j]``.
        When ``None`` it is computed with ``compute_kernel``.
    gkernel, node_label, edge_label : optional
        Forwarded to ``compute_kernel`` when ``Kmatrix`` is ``None``.

    Returns
    -------
    list
        One distance per graph, in the order of ``Gn``. Empty when ``Gn`` is
        empty. No neighbour selection is performed here; callers can sort
        this list to pick the nearest graphs.
    """
    nb_graphs = len(Gn)
    if nb_graphs == 0:
        # Nothing to measure; also avoids dividing by zero below.
        return []

    # Every graph of Gn belongs to the median set, so its indices in the
    # Gram matrix are simply 0..n-1.
    idx_gi = list(range(nb_graphs))
    alpha = [1 / nb_graphs] * nb_graphs
    if Kmatrix is None:
        Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True)

    # The alpha-weighted sum over all pairs is the same for every graph,
    # so it is computed once and passed to dis_gstar.
    term3 = 0
    for i1, a1 in enumerate(alpha):
        for i2, a2 in enumerate(alpha):
            term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]

    dis_k_all = []  # distance between g_star and each graph.
    for ig, _ in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
        dis_k_all.append(dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3))

    return dis_k_all
def normalize_distance_matrix(D): | def normalize_distance_matrix(D): | ||||
max_value = np.amax(D) | max_value = np.amax(D) | ||||
min_value = np.amin(D) | min_value = np.amin(D) |
@@ -124,21 +124,21 @@ def saveGXL(graph, filename, method='benoit'): | |||||
# reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 | # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 | ||||
# pass | # pass | ||||
gxl_file = open(filename, 'w') | gxl_file = open(filename, 'w') | ||||
gxl_file.write("<?xml version=\"1.0\"?>\n") | |||||
gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") | |||||
gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n") | gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n") | ||||
gxl_file.write("<gxl>\n") | |||||
gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n") | |||||
gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"true\" edgemode=\"undirected\">\n") | gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"true\" edgemode=\"undirected\">\n") | ||||
for v, attrs in graph.nodes(data=True): | for v, attrs in graph.nodes(data=True): | ||||
gxl_file.write("<node id=\"_" + str(v) + "\">") | gxl_file.write("<node id=\"_" + str(v) + "\">") | ||||
gxl_file.write("<attr name=\"" + "chem" + "\"><int>" + str(attrs['atom']) + "</int></attr>") | |||||
gxl_file.write("<attr name=\"" + "chem" + "\"><int>" + str(attrs['chem']) + "</int></attr>") | |||||
gxl_file.write("</node>\n") | gxl_file.write("</node>\n") | ||||
for v1, v2, attrs in graph.edges(data=True): | for v1, v2, attrs in graph.edges(data=True): | ||||
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">") | gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">") | ||||
# gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['bond_type']) + "</int></attr>") | |||||
gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>") | |||||
gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['valence']) + "</int></attr>") | |||||
# gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>") | |||||
gxl_file.write("</edge>\n") | gxl_file.write("</edge>\n") | ||||
gxl_file.write("</graph>\n") | gxl_file.write("</graph>\n") | ||||
gxl_file.write("</gxl>\n") | |||||
gxl_file.write("</gxl>") | |||||
gxl_file.close() | gxl_file.close() | ||||
elif method == 'gedlib-letter': | elif method == 'gedlib-letter': | ||||
# reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 | # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 | ||||
@@ -147,15 +147,15 @@ def saveGXL(graph, filename, method='benoit'): | |||||
gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") | gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") | ||||
gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n") | gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n") | ||||
gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n") | gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n") | ||||
gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">") | |||||
gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">\n") | |||||
for v, attrs in graph.nodes(data=True): | for v, attrs in graph.nodes(data=True): | ||||
gxl_file.write("<node id=\"_" + str(v) + "\">") | gxl_file.write("<node id=\"_" + str(v) + "\">") | ||||
gxl_file.write("<attr name=\"x\"><float>" + str(attrs['attributes'][0]) + "</float></attr>") | gxl_file.write("<attr name=\"x\"><float>" + str(attrs['attributes'][0]) + "</float></attr>") | ||||
gxl_file.write("<attr name=\"y\"><float>" + str(attrs['attributes'][1]) + "</float></attr>") | gxl_file.write("<attr name=\"y\"><float>" + str(attrs['attributes'][1]) + "</float></attr>") | ||||
gxl_file.write("</node>") | |||||
gxl_file.write("</node>\n") | |||||
for v1, v2, attrs in graph.edges(data=True): | for v1, v2, attrs in graph.edges(data=True): | ||||
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\"/>") | |||||
gxl_file.write("</graph>") | |||||
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\"/>\n") | |||||
gxl_file.write("</graph>\n") | |||||
gxl_file.write("</gxl>") | gxl_file.write("</gxl>") | ||||
gxl_file.close() | gxl_file.close() | ||||
@@ -466,12 +466,15 @@ def loadDataset(filename, filename_y=None, extra_params=None): | |||||
def loadFromXML(filename, extra_params): | def loadFromXML(filename, extra_params): | ||||
import xml.etree.ElementTree as ET | import xml.etree.ElementTree as ET | ||||
dirname_dataset = dirname(filename) | |||||
if extra_params: | |||||
dirname_dataset = extra_params | |||||
else: | |||||
dirname_dataset = dirname(filename) | |||||
tree = ET.parse(filename) | tree = ET.parse(filename) | ||||
root = tree.getroot() | root = tree.getroot() | ||||
data = [] | data = [] | ||||
y = [] | y = [] | ||||
for graph in root.iter('print'): | |||||
for graph in root.iter('graph'): | |||||
mol_filename = graph.attrib['file'] | mol_filename = graph.attrib['file'] | ||||
mol_class = graph.attrib['class'] | mol_class = graph.attrib['class'] | ||||
data.append(loadGXL(dirname_dataset + '/' + mol_filename)) | data.append(loadGXL(dirname_dataset + '/' + mol_filename)) | ||||
@@ -541,15 +544,22 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None | |||||
dirname_ds += '/' | dirname_ds += '/' | ||||
if not os.path.exists(dirname_ds) : | if not os.path.exists(dirname_ds) : | ||||
os.makedirs(dirname_ds) | os.makedirs(dirname_ds) | ||||
if 'graph_dir' in xparams: | |||||
graph_dir = xparams['graph_dir'] + '/' | |||||
if not os.path.exists(graph_dir): | |||||
os.makedirs(graph_dir) | |||||
else: | |||||
graph_dir = dirname_ds | |||||
if group == 'xml' and gformat == 'gxl': | if group == 'xml' and gformat == 'gxl': | ||||
with open(filename + '.xml', 'w') as fgroup: | with open(filename + '.xml', 'w') as fgroup: | ||||
fgroup.write("<?xml version=\"1.0\"?>") | fgroup.write("<?xml version=\"1.0\"?>") | ||||
fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"https://dbblumenthal.github.io/gedlib/GraphCollection_8dtd_source.html\">") | |||||
fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">") | |||||
fgroup.write("\n<GraphCollection>") | fgroup.write("\n<GraphCollection>") | ||||
for idx, g in enumerate(Gn): | for idx, g in enumerate(Gn): | ||||
fname_tmp = "graph" + str(idx) + ".gxl" | fname_tmp = "graph" + str(idx) + ".gxl" | ||||
saveGXL(g, dirname_ds + fname_tmp, method=xparams['method']) | |||||
saveGXL(g, graph_dir + fname_tmp, method=xparams['method']) | |||||
fgroup.write("\n\t<graph file=\"" + fname_tmp + "\" class=\"" + str(y[idx]) + "\"/>") | fgroup.write("\n\t<graph file=\"" + fname_tmp + "\" class=\"" + str(y[idx]) + "\"/>") | ||||
fgroup.write("\n</GraphCollection>") | fgroup.write("\n</GraphCollection>") | ||||
fgroup.close() | fgroup.close() | ||||
@@ -558,18 +568,18 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None | |||||
if __name__ == '__main__': | if __name__ == '__main__': | ||||
# ### Load dataset from .ds file. | # ### Load dataset from .ds file. | ||||
# # .ct files. | # # .ct files. | ||||
ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds', | |||||
'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'} | |||||
Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y']) | |||||
# ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb | |||||
# Gn, y = loadDataset(ds['dataset']) | |||||
# ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb | |||||
# Gn, y = loadDataset(ds['dataset']) | |||||
# ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled | |||||
# Gn, y = loadDataset(ds['dataset']) | |||||
print(Gn[1].nodes(data=True)) | |||||
print(Gn[1].edges(data=True)) | |||||
print(y[1]) | |||||
# ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds', | |||||
# 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'} | |||||
# Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y']) | |||||
## ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb | |||||
## Gn, y = loadDataset(ds['dataset']) | |||||
## ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb | |||||
## Gn, y = loadDataset(ds['dataset']) | |||||
## ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled | |||||
## Gn, y = loadDataset(ds['dataset']) | |||||
# print(Gn[1].nodes(data=True)) | |||||
# print(Gn[1].edges(data=True)) | |||||
# print(y[1]) | |||||
# # .gxl file. | # # .gxl file. | ||||
# ds = {'name': 'monoterpenoides', | # ds = {'name': 'monoterpenoides', | ||||
@@ -579,6 +589,33 @@ if __name__ == '__main__': | |||||
# print(Gn[1].edges(data=True)) | # print(Gn[1].edges(data=True)) | ||||
# print(y[1]) | # print(y[1]) | ||||
### Convert graphs from one format to another.
# .gxl file.
# Demo: load the monoterpenoides dataset (listed in a .ds file), relabel the
# graphs the way the 'gedlib' writer of saveGXL expects, and save them as a
# GXL collection indexed by an .xml group file.
import networkx as nx
ds = {'name': 'monoterpenoides',
      'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'}  # node/edge symb
Gn, y = loadDataset(ds['dataset'])
y = [int(i) for i in y]  # targets are written out as integer class labels
print(Gn[1].nodes(data=True))
print(Gn[1].edges(data=True))
print(y[1])

# Convert each graph to the NetworkX layout that the gedlib library
# recognizes: string node ids, node label stored under 'chem' and edge label
# stored under 'valence'.
# NOTE(review): assumes every loaded node has an 'atom' attribute and every
# edge a 'bond_type' attribute -- confirm against the loader for this dataset.
Gn_new = []
for G in Gn:
    G_new = nx.Graph()
    # Carry over graph-level attributes: saveGXL uses graph.graph['name'] as
    # the GXL graph id, and a fresh nx.Graph() has an empty attribute dict.
    G_new.graph.update(G.graph)
    for nd, attrs in G.nodes(data=True):
        G_new.add_node(str(nd), chem=attrs['atom'])
    for nd1, nd2, attrs in G.edges(data=True):
        G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
    Gn_new.append(G_new)
print(Gn_new[1].nodes(data=True))
print(Gn_new[1].edges(data=True))
print(Gn_new[1])

filename = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl/monoterpenoides'
xparams = {'method': 'gedlib'}
# Save the converted graphs: the 'gedlib' writer reads the 'chem' and
# 'valence' attributes, which exist only on Gn_new, not on the loaded Gn.
saveDataset(Gn_new, y, gformat='gxl', group='xml', filename=filename, xparams=xparams)
# ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat', | # ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat', | ||||
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb | # 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb | ||||
# Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | # Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params']) |