
update preimage/ged.py

v0.1
jajupmochi 5 years ago
commit 1a34c9f18e
13 changed files with 759 additions and 177 deletions
1. +2 -1  notebooks/run_marginalizedkernel.py
2. +3 -0  notebooks/run_spkernel.py
3. +2 -0  notebooks/run_structuralspkernel.py
4. +1 -0  notebooks/run_treeletkernel.py
5. +7 -6  notebooks/run_untilhpathkernel.py
6. +2 -0  notebooks/run_weisfeilerlehmankernel.py
7. +72 -110  preimage/fitDistance.py
8. +72 -26  preimage/ged.py
9. +58 -1  preimage/iam.py
10. +318 -1  preimage/test_fitDistance.py
11. +126 -1  preimage/test_iam.py
12. +33 -5  preimage/utils.py
13. +63 -26  pygraph/utils/graphfiles.py

+2 -1  notebooks/run_marginalizedkernel.py

@@ -28,6 +28,7 @@ dslist = [
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb
#
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb
@@ -57,7 +58,7 @@ estimator = marginalizedkernel
#param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.3, 3),
# 'n_iteration': np.linspace(1, 1, 1),
param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.9, 9),
'n_iteration': np.linspace(5, 20, 4),
'n_iteration': np.linspace(1, 19, 7),
'remove_totters': [False]}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]


+3 -0  notebooks/run_spkernel.py

@@ -24,6 +24,9 @@ dslist = [
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge
# {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt'},
# # node nsymb symb
#
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb


+2 -0  notebooks/run_structuralspkernel.py

@@ -30,6 +30,8 @@ dslist = [
# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# # node symb/nsymb
# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
# {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt'},
# # node nsymb symb
#
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb


+1 -0  notebooks/run_treeletkernel.py

@@ -26,6 +26,7 @@ dslist = [
{'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
{'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb
# node symb/nsymb
# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},


+7 -6  notebooks/run_untilhpathkernel.py

@@ -27,7 +27,8 @@ dslist = [
{'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
{'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
{'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
{'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
{'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb
#
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb
@@ -54,11 +55,11 @@ dslist = [
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
estimator = untilhpathkernel
param_grid_precomputed = {'depth': np.linspace(3, 10, 8), # [2],
'k_func': [None]} # ['MinMax', 'tanimoto'],
#param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2],
# 'k_func': ['MinMax'], # ['MinMax', 'tanimoto'],
# 'compute_method': ['trie']} # ['MinMax']}
#param_grid_precomputed = {'depth': np.linspace(3, 10, 8), # [2],
# 'k_func': [None]} # ['MinMax', 'tanimoto'],
param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2],
'k_func': ['MinMax', 'tanimoto'], # ['MinMax'], #
'compute_method': ['trie']} # ['MinMax']}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]




+2 -0  notebooks/run_weisfeilerlehmankernel.py

@@ -30,6 +30,8 @@ dslist = [
{'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
{'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
{'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb

#
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb


+72 -110  preimage/fitDistance.py

@@ -7,7 +7,7 @@ Created on Wed Oct 16 14:20:06 2019
""" """
import numpy as np import numpy as np
from tqdm import tqdm from tqdm import tqdm
from itertools import combinations_with_replacement
from itertools import combinations_with_replacement, combinations
import multiprocessing
from multiprocessing import Pool
from functools import partial
@@ -22,110 +22,88 @@ import sys
from ged import GED, get_nb_edit_operations
from utils import kernel_distance_matrix


def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max,
fitkernel=None, gamma=1.0):
def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, k=4,
params_ged={'lib': 'gedlibpy', 'cost': 'CONSTANT',
'method': 'IPFP', 'stabilizer': None},
init_costs=[3, 3, 1, 3, 3, 1],
parallel=True):
# c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them.
# random.seed(1)
cost_rdm = random.sample(range(1, 10), 6)
# edit_costs = cost_rdm + [0]
edit_costs = cost_rdm
# edit_costs = [i * 0.01 for i in cost_rdm] + [0]
# edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
# edit_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
# edit_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0]
idx_cost_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
# cost_rdm = random.sample(range(1, 10), 6)
# init_costs = cost_rdm + [0]
# init_costs = cost_rdm
init_costs = [3, 3, 1, 3, 3, 1]
# init_costs = [i * 0.01 for i in cost_rdm] + [0]
# init_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
# init_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
# init_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0]
# idx_cost_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
# compute distances in feature space.
coef_dk = 1
dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, gkernel=gkernel)
dis_k_vec = []
for i in range(len(dis_k_mat)):
for j in range(i, len(dis_k_mat)):
# for j in range(i, len(dis_k_mat)):
for j in range(i + 1, len(dis_k_mat)):
dis_k_vec.append(dis_k_mat[i, j])
dis_k_vec = np.array(dis_k_vec)
if fitkernel == None:
dis_k_vec_ajusted = dis_k_vec
elif fitkernel == 'gaussian':
coef_dk = 1 / np.max(dis_k_vec)
idx_dk_nonzeros = np.where(dis_k_vec != 0)[0]
# remove 0's and constraint d_k between 0 and 1.
dis_k_vec = dis_k_vec[idx_dk_nonzeros] * coef_dk
dis_k_vec_ajusted = np.sqrt(-np.log(dis_k_vec) / gamma)
residual_list = []
edit_cost_list = []
time_list = []
nb_cost_mat_list = []
# init ged.
print('\ninitial:')
time0 = time.time()
params_ged['edit_cost_constant'] = init_costs
ged_vec_init, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
parallel=parallel)
residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))]
time_list = [time.time() - time0]
edit_cost_list = [init_costs]
nb_cost_mat = np.array(n_edit_operations)
nb_cost_mat_list = [nb_cost_mat]
print('edit_costs:', init_costs)
print('residual_list:', residual_list)
for itr in range(itr_max):
print('\niteration', itr)
time0 = time.time()
# compute GEDs and numbers of edit operations.
edit_cost_constant = [i for i in edit_costs]
edit_cost_list.append(edit_cost_constant)
ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_cost_constant,
idx_cost_nonzeros, parallel=True)
if fitkernel == None:
residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
elif fitkernel == 'gaussian':
ged_all = np.array(ged_all)[idx_dk_nonzeros]
residual = np.sqrt(np.sum(np.square(
np.exp(-gamma * ged_all ** 2) / coef_dk - dis_k_vec)))
residual_list.append(residual)
# "fit" geds to distances in feature space by tuning edit costs using the # "fit" geds to distances in feature space by tuning edit costs using the
# Least Squares Method. # Least Squares Method.
nb_cost_mat = np.array(n_edit_operations).T
if fitkernel == 'gaussian':
nb_cost_mat = nb_cost_mat[idx_dk_nonzeros]
nb_cost_mat_list.append(nb_cost_mat)
edit_costs_new, residual = compute_better_costs(nb_cost_mat, dis_k_vec_ajusted)

print('pseudo residual:', residual)
edit_costs_new, residual = update_costs(nb_cost_mat, dis_k_vec)
for i in range(len(edit_costs_new)):
if edit_costs_new[i] < 0:
if edit_costs_new[i] > -1e-9:
edit_costs_new[i] = 0
else:
raise ValueError('The edit cost is negative.')
for idx, item in enumerate(idx_cost_nonzeros):
edit_costs[item] = edit_costs_new[idx]
# for i in range(len(edit_costs_new)):
# if edit_costs_new[i] < 0:
# edit_costs_new[i] = 0

# compute new GEDs and numbers of edit operations.
params_ged['edit_cost_constant'] = edit_costs_new
ged_vec, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
parallel=parallel)
residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec))))
time_list.append(time.time() - time0)
print('edit_costs:', edit_costs)
edit_cost_list.append(edit_costs_new)
nb_cost_mat = np.array(n_edit_operations)
nb_cost_mat_list.append(nb_cost_mat)
print('edit_costs:', edit_costs_new)
print('residual_list:', residual_list)
print()
edit_cost_list.append(edit_costs)
ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_costs,
idx_cost_nonzeros, parallel=True)
if fitkernel == 0:
residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
elif fitkernel == 'gaussian':
ged_all = np.array(ged_all)[idx_dk_nonzeros]
residual = np.sqrt(np.sum(np.square(
np.exp(-gamma * ged_all ** 2) / coef_dk - dis_k_vec)))
residual_list.append(residual)
nb_cost_mat_list.append(np.array(n_edit_operations).T)
return edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, \
time_list, nb_cost_mat_list, coef_dk
return edit_costs_new, residual_list, edit_cost_list, dis_k_mat, ged_mat, \
time_list, nb_cost_mat_list




def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False):
def compute_geds(Gn, params_ged, parallel=False):
ged_mat = np.zeros((len(Gn), len(Gn)))
if parallel:
# print('parallel')
len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
ged_all = [0 for i in range(len_itr)]
n_edit_operations = [[0 for i in range(len_itr)] for j in
range(len(idx_nonzeros))]
itr = combinations_with_replacement(range(0, len(Gn)), 2)
# len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
len_itr = int(len(Gn) * (len(Gn) - 1) / 2)
ged_vec = [0 for i in range(len_itr)]
n_edit_operations = [0 for i in range(len_itr)]
# itr = combinations_with_replacement(range(0, len(Gn)), 2)
itr = combinations(range(0, len(Gn)), 2)
n_jobs = multiprocessing.cpu_count()
if len_itr < 100 * n_jobs:
chunksize = int(len_itr / n_jobs) + 1
@@ -134,68 +112,52 @@ def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False):
def init_worker(gn_toshare):
global G_gn
G_gn = gn_toshare
do_partial = partial(_wrapper_compute_ged_parallel, edit_cost_constant,
idx_nonzeros)
do_partial = partial(_wrapper_compute_ged_parallel, params_ged)
pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,))
iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
desc='computing GEDs', file=sys.stdout)
# iterator = pool.imap_unordered(do_partial, itr, chunksize)
for i, j, dis, n_eo_tmp in iterator:
idx_itr = int(len(Gn) * i + j - i * (i + 1) / 2)
ged_all[idx_itr] = dis
idx_itr = int(len(Gn) * i + j - (i + 1) * (i + 2) / 2)
ged_vec[idx_itr] = dis
ged_mat[i][j] = dis
ged_mat[j][i] = dis
for idx, item in enumerate(idx_nonzeros):
n_edit_operations[idx][idx_itr] = n_eo_tmp[item]
n_edit_operations[idx_itr] = n_eo_tmp
# print('\n-------------------------------------------')
# print(i, j, idx_itr, dis)
pool.close()
pool.join()
else:
ged_all = []
n_edit_operations = [[] for i in range(len(idx_nonzeros))]
ged_vec = []
n_edit_operations = []
for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
# for i in range(len(Gn)):
for j in range(i, len(Gn)):
# time0 = time.time()
dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], lib='gedlibpy',
cost='CONSTANT', method='IPFP',
edit_cost_constant=edit_cost_constant, stabilizer='min',
repeat=50)
# time1 = time.time() - time0
# time0 = time.time()
ged_all.append(dis)
for j in range(i + 1, len(Gn)):
dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], **params_ged)
ged_vec.append(dis)
ged_mat[i][j] = dis
ged_mat[j][i] = dis
n_eo_tmp = get_nb_edit_operations(Gn[i], Gn[j], pi_forward, pi_backward)
for idx, item in enumerate(idx_nonzeros):
n_edit_operations[idx].append(n_eo_tmp[item])
# time2 = time.time() - time0
# print(time1, time2, time1 / time2)
n_edit_operations.append(n_eo_tmp)
return ged_all, ged_mat, n_edit_operations
return ged_vec, ged_mat, n_edit_operations
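
Note that switching from combinations_with_replacement to combinations also changes the closed-form position of a pair (i, j) in ged_vec. A quick standalone check of the new mapping (illustrative, not part of the commit):

from itertools import combinations

n = 6  # any number of graphs
for idx_ref, (i, j) in enumerate(combinations(range(n), 2)):
    # formula used in the parallel branch of compute_geds above
    idx_itr = int(n * i + j - (i + 1) * (i + 2) / 2)
    assert idx_itr == idx_ref  # matches the enumeration order of combinations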


def _wrapper_compute_ged_parallel(edit_cost_constant, idx_nonzeros, itr):
def _wrapper_compute_ged_parallel(params_ged, itr):
i = itr[0]
j = itr[1]
dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], edit_cost_constant,
idx_nonzeros)
dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], params_ged)
return i, j, dis, n_eo_tmp




def _compute_ged_parallel(g1, g2, edit_cost_constant, idx_nonzeros):
dis, pi_forward, pi_backward = GED(g1, g2, lib='gedlibpy',
cost='CONSTANT', method='IPFP',
edit_cost_constant=edit_cost_constant, stabilizer='min',
repeat=50)
n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward)
def _compute_ged_parallel(g1, g2, params_ged):
dis, pi_forward, pi_backward = GED(g1, g2, **params_ged)
n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward)
return dis, n_eo_tmp




def compute_better_costs(nb_cost_mat, dis_k_vec):
def update_costs(nb_cost_mat, dis_k_vec):
# # method 1: simple least square method.
# edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
# rcond=None)
@@ -203,7 +165,7 @@ def compute_better_costs(nb_cost_mat, dis_k_vec):
# # method 2: least square method with x_i >= 0.
# edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec)
# method 3: solve as a quadratic program with constraints: x_i >= 0, sum(x) = 1.
# method 3: solve as a quadratic program with constraints.
# P = np.dot(nb_cost_mat.T, nb_cost_mat)
# q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat)
# G = -1 * np.identity(nb_cost_mat.shape[1])
@@ -221,7 +183,7 @@ def compute_better_costs(nb_cost_mat, dis_k_vec):
# h = np.array([0 for i in range(nb_cost_mat.shape[1])])
x = cp.Variable(nb_cost_mat.shape[1])
cost = cp.sum_squares(nb_cost_mat * x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])],
constraints = [x >= [0.0001 for i in range(nb_cost_mat.shape[1])],
# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
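
update_costs (method 3) thus fits the six edit costs as a constrained least-squares problem: minimize ||nb_cost_mat @ x - dis_k_vec||^2 subject to x >= 0.0001 and the triangle-like constraints c_vi + c_vr >= c_vs and c_ei + c_er >= c_es (cost order: c_vi, c_vr, c_vs, c_ei, c_er, c_es). A minimal self-contained sketch with synthetic data, assuming cvxpy is installed; the residual convention (root of the optimal objective) is an assumption here:

import numpy as np
import cvxpy as cp

# synthetic stand-ins: 10 graph pairs, 6 edit-operation counts per pair
nb_cost_mat = np.random.randint(0, 5, size=(10, 6)).astype(float)
dis_k_vec = np.random.rand(10)

x = cp.Variable(nb_cost_mat.shape[1])
cost = cp.sum_squares(nb_cost_mat @ x - dis_k_vec)
constraints = [x >= 0.0001,  # keep all six costs strictly positive
               np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]) @ x >= 0.0,  # c_vi + c_vr >= c_vs
               np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]) @ x >= 0.0]  # c_ei + c_er >= c_es
prob = cp.Problem(cp.Minimize(cost), constraints)
prob.solve()
edit_costs_new, residual = x.value, np.sqrt(prob.value)
print(edit_costs_new, residual)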


+72 -26  preimage/ged.py

@@ -13,29 +13,30 @@ import multiprocessing
from multiprocessing import Pool
from functools import partial


from gedlibpy import librariesImport, gedlibpy
from gedlibpy_linlin import librariesImport, gedlibpy


def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP',
edit_cost_constant=[], stabilizer='min', repeat=50):
edit_cost_constant=[], algo_options='', stabilizer='min', repeat=50):
""" """
Compute GED for 2 graphs. Compute GED for 2 graphs.
""" """
if lib == 'gedlibpy':
def convertGraph(G):
"""Convert a graph to the proper NetworkX format that can be
recognized by library gedlibpy.
"""
G_new = nx.Graph()
for nd, attrs in G.nodes(data=True):
G_new.add_node(str(nd), chem=attrs['atom'])
def convertGraph(G):
"""Convert a graph to the proper NetworkX format that can be
recognized by library gedlibpy.
"""
G_new = nx.Graph()
for nd, attrs in G.nodes(data=True):
G_new.add_node(str(nd), chem=attrs['atom'])
# G_new.add_node(str(nd), x=str(attrs['attributes'][0]),
# y=str(attrs['attributes'][1]))
for nd1, nd2, attrs in G.edges(data=True):
G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
# G_new.add_edge(str(nd1), str(nd2))
return G_new
for nd1, nd2, attrs in G.edges(data=True):
# G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
G_new.add_edge(str(nd1), str(nd2))
return G_new
if lib == 'gedlibpy':
gedlibpy.restart_env()
gedlibpy.add_nx_graph(convertGraph(g1), "")
gedlibpy.add_nx_graph(convertGraph(g2), "")
@@ -43,12 +44,12 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP',
listID = gedlibpy.get_all_graph_ids()
gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant)
gedlibpy.init()
gedlibpy.set_method(method, "")
gedlibpy.set_method(method, algo_options)
gedlibpy.init_method()


g = listID[0]
h = listID[1]
if stabilizer == None:
if stabilizer is None:
gedlibpy.run_method(g, h)
pi_forward = gedlibpy.get_forward_map(g, h)
pi_backward = gedlibpy.get_backward_map(g, h)
@@ -107,13 +108,57 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP',
dis = upper
# make the map label correct (label remove map as np.inf)
nodes1 = [n for n in g1.nodes()]
nodes2 = [n for n in g2.nodes()]
nb1 = nx.number_of_nodes(g1)
nb2 = nx.number_of_nodes(g2)
pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
elif lib == 'gedlib-bash':
import time
import random
import sys
import os
sys.path.insert(0, "../")
from pygraph.utils.graphfiles import saveDataset
tmp_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/'
if not os.path.exists(tmp_dir):
os.makedirs(tmp_dir)
fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, 1e9))
xparams = {'method': 'gedlib', 'graph_dir': fn_collection}
saveDataset([g1, g2], ['dummy', 'dummy'], gformat='gxl', group='xml',
filename=fn_collection, xparams=xparams)
command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n'
command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n'
command += 'export LD_LIBRARY_PATH\n'
command += 'cd \'/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/bin\'\n'
command += './ged_for_python_bash monoterpenoides ' + fn_collection \
+ ' \'' + algo_options + '\' '
for ec in edit_cost_constant:
command += str(ec) + ' '
# output = os.system(command)
stream = os.popen(command)
output = stream.readlines()
# print(output)
dis = float(output[0].strip())
runtime = float(output[1].strip())
size_forward = int(output[2].strip())
pi_forward = [int(item.strip()) for item in output[3:3+size_forward]]
pi_backward = [int(item.strip()) for item in output[3+size_forward:]]

# print(dis)
# print(runtime)
# print(size_forward)
# print(pi_forward)
# print(pi_backward)
# make the map label correct (label remove map as np.inf)
nodes1 = [n for n in g1.nodes()]
nodes2 = [n for n in g2.nodes()]
nb1 = nx.number_of_nodes(g1)
nb2 = nx.number_of_nodes(g2)
pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
# print(pi_forward)
return dis, pi_forward, pi_backward
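
The relabelling step above turns gedlib's integer node maps into maps over the original NetworkX node ids, with np.inf marking a node that has no image (a removal or insertion). A tiny illustration with made-up values:

import numpy as np

nodes2 = ['a', 'b', 'c']    # node ids of g2
nb2 = len(nodes2)
pi_forward_raw = [2, 0, 3]  # raw gedlib indices; an index >= nb2 means "no image"
pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward_raw]
# -> ['c', 'a', inf]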


@@ -149,7 +194,7 @@ def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP',


g = listID[0]
h = listID[1]
if stabilizer == None:
if stabilizer is None:
gedlibpy.run_method(g, h)
pi_forward = gedlibpy.get_forward_map(g, h)
pi_backward = gedlibpy.get_backward_map(g, h)
@@ -183,7 +228,8 @@ def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP',


def ged_median(Gn, Gn_median, verbose=False, params_ged={'lib': 'gedlibpy',
'cost': 'CHEM_1', 'method': 'IPFP', 'edit_cost_constant': [],
'stabilizer': 'min', 'repeat': 50}, parallel=False):
'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1',
'stabilizer': None}, parallel=False):
if parallel:
len_itr = int(len(Gn))
pi_forward_list = [[] for i in range(len_itr)]


+58 -1  preimage/iam.py

@@ -23,7 +23,8 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
connected=False, removeNodes=True, allBestInit=False, allBestNodes=False,
allBestEdges=False, allBestOutput=False,
params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP',
'edit_cost_constant': [], 'stabilizer': 'min', 'repeat': 50}):
'edit_cost_constant': [], 'stabilizer': None,
'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'}):
"""See my name, then you know what I do. """See my name, then you know what I do.
""" """
# Gn_median = Gn_median[0:10] # Gn_median = Gn_median[0:10]
@@ -435,6 +436,62 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median




def iam_bash(Gn_names, edit_cost_constant, dataset='monoterpenoides',
graph_dir='/media/ljia/DATA/research-repo/codes/Linlin/py-graph/datasets/monoterpenoides/'):
"""Compute the iam by c++ implementation (gedlib) through bash.
"""
import os
import time

def createCollectionFile(Gn_names, y, filename):
"""Create collection file.
"""
dirname_ds = os.path.dirname(filename)
if dirname_ds != '':
dirname_ds += '/'
if not os.path.exists(dirname_ds) :
os.makedirs(dirname_ds)
with open(filename + '.xml', 'w') as fgroup:
fgroup.write("<?xml version=\"1.0\"?>")
fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">")
fgroup.write("\n<GraphCollection>")
for idx, fname in enumerate(Gn_names):
fgroup.write("\n\t<graph file=\"" + fname + "\" class=\"" + str(y[idx]) + "\"/>")
fgroup.write("\n</GraphCollection>")
fgroup.close()

tmp_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/'
fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, 1e9))
createCollectionFile(Gn_names, ['dummy'] * len(Gn_names), fn_collection)
# graph_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl'
command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n'
command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n'
command += 'export LD_LIBRARY_PATH\n'
command += 'cd \'/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/bin\'\n'
command += './iam_for_python_bash ' + dataset + ' ' + fn_collection \
+ ' \'' + graph_dir + '\' '
if edit_cost_constant is None:
command += 'None'
else:
for ec in edit_cost_constant:
command += str(ec) + ' '
# output = os.system(command)
stream = os.popen(command)

output = stream.readlines()
# print(output)
sod_sm = float(output[0].strip())
sod_gm= float(output[1].strip())
fname_sm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/set_median.gxl'
fname_gm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/gen_median.gxl'
return sod_sm, sod_gm, fname_sm, fname_gm
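
For reference, the collection file written by createCollectionFile has this shape (reconstructed from the write calls above; file names and classes are placeholders):

<?xml version="1.0"?>
<!DOCTYPE GraphCollection SYSTEM "http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd">
<GraphCollection>
	<graph file="graph0.gxl" class="dummy"/>
	<graph file="graph1.gxl" class="dummy"/>
</GraphCollection>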




###############################################################################
# Old implementations.


+318 -1  preimage/test_fitDistance.py

@@ -16,6 +16,319 @@ from utils import remove_edges
from fitDistance import fit_GED_to_kernel_distance
from utils import normalize_distance_matrix



def median_paper_clcpc_python_best():
"""c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
python invoking the c++ code by bash command (with updated library).
"""
# ds = {'name': 'monoterpenoides',
# 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
# _, y_all = loadDataset(ds['dataset'])
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'
itr_max = 6
algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
'algo_options': algo_options, 'stabilizer': None}
y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
repeats = 50
collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
graph_dir = collection_path + 'gxl/'
fn_edit_costs_output = 'results/median_paper/edit_costs_output.python_init40.k10.txt'

for y in y_all:
for repeat in range(repeats):
edit_costs_output_file = open(fn_edit_costs_output, 'a')
collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
nb_cost_mat_list = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
gkernel, itr_max, params_ged=params_ged,
parallel=True)
total_time = np.sum(time_list)
# print('\nedit_costs:', edit_costs)
# print('\nresidual_list:', residual_list)
# print('\nedit_cost_list:', edit_cost_list)
# print('\ndistance matrix in kernel space:', dis_k_mat)
# print('\nged matrix:', ged_mat)
# print('\ntotal time:', total_time)
# print('\nnb_cost_mat:', nb_cost_mat_list[-1])
np.savez('results/median_paper/fit_distance.clcpc.python_init40.monot.elabeled.uhpkernel.y'
+ y + '.repeat' + str(repeat) + '.k10..gm',
edit_costs=edit_costs,
residual_list=residual_list, edit_cost_list=edit_cost_list,
dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
for ec in edit_costs:
edit_costs_output_file.write(str(ec) + ' ')
edit_costs_output_file.write('\n')
edit_costs_output_file.close()
# # normalized distance matrices.
# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
# edit_costs = gmfile['edit_costs']
# residual_list = gmfile['residual_list']
# edit_cost_list = gmfile['edit_cost_list']
# dis_k_mat = gmfile['dis_k_mat']
# ged_mat = gmfile['ged_mat']
# total_time = gmfile['total_time']
# nb_cost_mat_list = gmfile['nb_cost_mat_list']
nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
print(nb_consistent, nb_inconsistent, ratio_consistent)
# norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
# plt.imshow(norm_dis_k_mat)
# plt.colorbar()
# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.png', format='png')
# # plt.show()
# plt.clf()
#
# norm_ged_mat = normalize_distance_matrix(ged_mat)
# plt.imshow(norm_ged_mat)
# plt.colorbar()
# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.png', format='png')
# # plt.show()
# plt.clf()
#
# norm_diff = norm_ged_mat - norm_dis_k_mat
# plt.imshow(norm_diff)
# plt.colorbar()
# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.png', format='png')
# # plt.show()
# plt.clf()
# # draw_count_bar(norm_diff)


def median_paper_clcpc_python_bash_cpp():
"""c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
python invoking the c++ code by bash command (with updated library).
"""
# ds = {'name': 'monoterpenoides',
# 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
# _, y_all = loadDataset(ds['dataset'])
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'
itr_max = 20
algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP',
'algo_options': algo_options}
y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
repeats = 50
collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
graph_dir = collection_path + 'gxl/'
fn_edit_costs_output = 'results/median_paper/edit_costs_output.txt'

for y in y_all:
for repeat in range(repeats):
edit_costs_output_file = open(fn_edit_costs_output, 'a')
collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
gkernel, itr_max, params_ged=params_ged,
parallel=False)
total_time = np.sum(time_list)
# print('\nedit_costs:', edit_costs)
# print('\nresidual_list:', residual_list)
# print('\nedit_cost_list:', edit_cost_list)
# print('\ndistance matrix in kernel space:', dis_k_mat)
# print('\nged matrix:', ged_mat)
# print('\ntotal time:', total_time)
# print('\nnb_cost_mat:', nb_cost_mat_list[-1])
np.savez('results/median_paper/fit_distance.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
+ y + '.repeat' + str(repeat) + '.gm',
edit_costs=edit_costs,
residual_list=residual_list, edit_cost_list=edit_cost_list,
dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
coef_dk=coef_dk)
for ec in edit_costs:
edit_costs_output_file.write(str(ec) + ' ')
edit_costs_output_file.write('\n')
edit_costs_output_file.close()
# # normalized distance matrices.
# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
# edit_costs = gmfile['edit_costs']
# residual_list = gmfile['residual_list']
# edit_cost_list = gmfile['edit_cost_list']
# dis_k_mat = gmfile['dis_k_mat']
# ged_mat = gmfile['ged_mat']
# total_time = gmfile['total_time']
# nb_cost_mat_list = gmfile['nb_cost_mat_list']
# coef_dk = gmfile['coef_dk']
nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
print(nb_consistent, nb_inconsistent, ratio_consistent)
# norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
# plt.imshow(norm_dis_k_mat)
# plt.colorbar()
# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.png', format='png')
# # plt.show()
# plt.clf()
#
# norm_ged_mat = normalize_distance_matrix(ged_mat)
# plt.imshow(norm_ged_mat)
# plt.colorbar()
# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.png', format='png')
# # plt.show()
# plt.clf()
#
# norm_diff = norm_ged_mat - norm_dis_k_mat
# plt.imshow(norm_diff)
# plt.colorbar()
# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.png', format='png')
# # plt.show()
# plt.clf()
# # draw_count_bar(norm_diff)





def test_cs_leq_ci_plus_cr_python_bash_cpp():
"""c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
python invoking the c++ code by bash command (with updated library).
"""
ds = {'name': 'monoterpenoides',
'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'])
# Gn = Gn[0:10]
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'
itr_max = 10
algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP',
'algo_options': algo_options}
edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
gkernel, itr_max, params_ged=params_ged,
parallel=False)
total_time = np.sum(time_list)
print('\nedit_costs:', edit_costs)
print('\nresidual_list:', residual_list)
print('\nedit_cost_list:', edit_cost_list)
print('\ndistance matrix in kernel space:', dis_k_mat)
print('\nged matrix:', ged_mat)
print('\ntotal time:', total_time)
print('\nnb_cost_mat:', nb_cost_mat_list[-1])
np.savez('results/fit_distance.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel.gm',
edit_costs=edit_costs,
residual_list=residual_list, edit_cost_list=edit_cost_list,
dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
coef_dk=coef_dk)
# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
# 'extra_params': {}} # node/edge symb
# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
## Gn = Gn[0:10]
## remove_edges(Gn)
# gkernel = 'untilhpathkernel'
# node_label = 'atom'
# edge_label = 'bond_type'
# itr_max = 10
# edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
# nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
# gkernel, itr_max)
# total_time = np.sum(time_list)
# print('\nedit_costs:', edit_costs)
# print('\nresidual_list:', residual_list)
# print('\nedit_cost_list:', edit_cost_list)
# print('\ndistance matrix in kernel space:', dis_k_mat)
# print('\nged matrix:', ged_mat)
# print('\ntotal time:', total_time)
# print('\nnb_cost_mat:', nb_cost_mat_list[-1])
# np.savez('results/fit_distance.cs_leq_ci_plus_cr.mutag.elabeled.uhpkernel.gm',
# edit_costs=edit_costs,
# residual_list=residual_list, edit_cost_list=edit_cost_list,
# dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
# total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk)
# # normalized distance matrices.
# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.monot.elabeled.uhpkernel.gm.npz')
# edit_costs = gmfile['edit_costs']
# residual_list = gmfile['residual_list']
# edit_cost_list = gmfile['edit_cost_list']
# dis_k_mat = gmfile['dis_k_mat']
# ged_mat = gmfile['ged_mat']
# total_time = gmfile['total_time']
# nb_cost_mat_list = gmfile['nb_cost_mat_list']
# coef_dk = gmfile['coef_dk']
nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
print(nb_consistent, nb_inconsistent, ratio_consistent)
# dis_k_sub = pairwise_substitution(dis_k_mat)
# ged_sub = pairwise_substitution(ged_mat)
# np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.gm',
# dis_k_sub=dis_k_sub, ged_sub=ged_sub)
norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
plt.imshow(norm_dis_k_mat)
plt.colorbar()
plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ '.eps', format='eps', dpi=300)
plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ '.png', format='png')
# plt.show()
plt.clf()
norm_ged_mat = normalize_distance_matrix(ged_mat)
plt.imshow(norm_ged_mat)
plt.colorbar()
plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ '.eps', format='eps', dpi=300)
plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ '.png', format='png')
# plt.show()
plt.clf()
norm_diff = norm_ged_mat - norm_dis_k_mat
plt.imshow(norm_diff)
plt.colorbar()
plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ '.eps', format='eps', dpi=300)
plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ '.png', format='png')
# plt.show()
plt.clf()
# draw_count_bar(norm_diff)


def test_anycosts():
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
@@ -295,8 +608,12 @@ def draw_count_bar(norm_diff):
if __name__ == '__main__':
# test_anycosts()
test_cs_leq_ci_plus_cr()
# test_cs_leq_ci_plus_cr()
# test_unfitted()
# test_cs_leq_ci_plus_cr_python_bash_cpp()
# median_paper_clcpc_python_bash_cpp()
median_paper_clcpc_python_best()

# x = np.array([[1,2,3],[4,5,6],[7,8,9]])
# xx = pairwise_substitution(x)

+126 -1  preimage/test_iam.py

@@ -22,6 +22,130 @@ from iam import iam_upgraded
from utils import remove_edges, compute_kernel, get_same_item_indices, dis_gstar
#from ged import ged_median



def test_iam_monoterpenoides_with_init40():
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'
# unfitted edit costs.
c_vi = 3
c_vr = 3
c_vs = 1
c_ei = 3
c_er = 3
c_es = 1
ite_max_iam = 50
epsilon_iam = 0.0001
removeNodes = False
connected_iam = False
# parameters for IAM function
# ged_cost = 'CONSTANT'
ged_cost = 'CONSTANT'
ged_method = 'IPFP'
edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
ged_stabilizer = None
# ged_repeat = 50
algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
'edit_cost_constant': edit_cost_constant,
'algo_options': algo_options,
'stabilizer': ged_stabilizer}

collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
graph_dir = collection_path + 'gxl/'
y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
repeats = 50
# classify graphs according to classes.
time_list = []
dis_ks_min_list = []
dis_ks_set_median_list = []
sod_gs_list = []
g_best = []
sod_set_median_list = []
sod_list_list = []
for y in y_all:
print('\n-------------------------------------------------------')
print('class of y:', y)
time_list.append([])
dis_ks_min_list.append([])
dis_ks_set_median_list.append([])
sod_gs_list.append([])
g_best.append([])
sod_set_median_list.append([])
for repeat in range(repeats):
# load median set.
collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
Gn_median, _ = loadDataset(collection_file, extra_params=graph_dir)
Gn_candidate = [g.copy() for g in Gn_median]
time0 = time.time()
G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \
= iam_upgraded(Gn_median,
Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
epsilon=epsilon_iam, node_label=node_label, edge_label=edge_label,
connected=connected_iam, removeNodes=removeNodes,
params_ged=params_ged)
time_total = time.time() - time0
print('\ntime: ', time_total)
time_list[-1].append(time_total)
g_best[-1].append(G_gen_median_list[0])
sod_set_median_list[-1].append(sod_set_median)
print('\nsmallest sod of the set median:', sod_set_median)
sod_gs_list[-1].append(sod_gen_median)
print('\nsmallest sod in graph space:', sod_gen_median)
sod_list_list.append(sod_list)
# # show the best graph and save it to file.
# print('one of the possible corresponding pre-images is')
# nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'),
# with_labels=True)
## plt.show()
# # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) +
## plt.savefig('results/iam/paper_compare/monoter_y' + str(y_class) +
## '_repeat' + str(repeat) + '_' + str(time.time()) +
## '.png', format="PNG")
# plt.clf()
# # print(G_gen_median_list[0].nodes(data=True))
# # print(G_gen_median_list[0].edges(data=True))
print('\nsods of the set median for this class:', sod_set_median_list[-1])
print('\nsods in graph space for this class:', sod_gs_list[-1])
# print('\ndistance in kernel space of set median for this class:',
# dis_ks_set_median_list[-1])
# print('\nsmallest distances in kernel space for this class:',
# dis_ks_min_list[-1])
print('\ntimes for this class:', time_list[-1])
sod_set_median_list[-1] = np.mean(sod_set_median_list[-1])
sod_gs_list[-1] = np.mean(sod_gs_list[-1])
# dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1])
# dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1])
time_list[-1] = np.mean(time_list[-1])
print()
print('\nmean sods of the set median for each class:', sod_set_median_list)
print('\nmean sods in graph space for each class:', sod_gs_list)
# print('\ndistances in kernel space of set median for each class:',
# dis_ks_set_median_list)
# print('\nmean smallest distances in kernel space for each class:',
# dis_ks_min_list)
print('\nmean times for each class:', time_list)
print('\nmean sods of the set median of all:', np.mean(sod_set_median_list))
print('\nmean sods in graph space of all:', np.mean(sod_gs_list))
# print('\nmean distances in kernel space of set median of all:',
# np.mean(dis_ks_set_median_list))
# print('\nmean smallest distances in kernel space of all:',
# np.mean(dis_ks_min_list))
print('\nmean times of all:', np.mean(time_list))




def test_iam_monoterpenoides():
ds = {'name': 'monoterpenoides',
'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
@@ -834,9 +958,10 @@ if __name__ == '__main__':
# tests on different numbers of median-sets.
# test_iam_median_nb()
# test_iam_letter_h()
test_iam_monoterpenoides()
# test_iam_monoterpenoides()
# test_iam_mutag()
# test_iam_fitdistance()
# print("test log")
test_iam_monoterpenoides_with_init40()

+33 -5  preimage/utils.py

@@ -17,8 +17,10 @@ from pygraph.kernels.marginalizedKernel import marginalizedkernel
from pygraph.kernels.untilHPathKernel import untilhpathkernel
from pygraph.kernels.spKernel import spkernel
import functools
from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct
from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct, polynomialkernel
from pygraph.kernels.structuralspKernel import structuralspkernel
from pygraph.kernels.treeletKernel import treeletkernel
from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel




def remove_edges(Gn):
@@ -46,18 +48,29 @@ def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose):
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'untilhpathkernel':
Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label,
depth=10, k_func='MinMax', compute_method='trie',
depth=7, k_func='MinMax', compute_method='trie',
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'spkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
Kmatrix, _, _ = spkernel(Gn, node_label='atom', node_kernels=
Kmatrix, _, _ = spkernel(Gn, node_label=node_label, node_kernels=
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel},
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'structuralspkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
Kmatrix, _ = structuralspkernel(Gn, node_label='atom', node_kernels=
Kmatrix, _ = structuralspkernel(Gn, node_label=node_label, node_kernels=
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel},
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'treeletkernel':
# pkernel = functools.partial(polynomialkernel, d=2, c=1e5)
pkernel = functools.partial(gaussiankernel, gamma=1e-6)
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
Kmatrix, _ = treeletkernel(Gn, node_label=node_label, edge_label=edge_label,
sub_kernel=pkernel,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'weisfeilerlehmankernel':
Kmatrix, _ = weisfeilerlehmankernel(Gn, node_label=node_label, edge_label=edge_label,
height=4, base_kernel='subtree',
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
# normalization
Kmatrix_diag = Kmatrix.diagonal().copy()
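
The hunk is truncated here; the diagonal copy is the first step of the usual cosine normalization of the Gram matrix. A sketch of the (unchanged, not shown in this diff) continuation, under that assumption:

import numpy as np

def normalize_gram(Kmatrix):
    # divide each entry by the geometric mean of the corresponding diagonal entries
    Kmatrix_diag = Kmatrix.diagonal().copy()
    for i in range(len(Kmatrix)):
        for j in range(i, len(Kmatrix)):
            Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
            Kmatrix[j][i] = Kmatrix[i][j]
    return Kmatrix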
@@ -79,7 +92,7 @@ def gram2distances(Kmatrix):


def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None, gkernel=None):
dis_mat = np.empty((len(Gn), len(Gn)))
if Kmatrix == None:
if Kmatrix is None:
Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True)
for i in range(len(Gn)):
for j in range(i, len(Gn)):
@@ -109,6 +122,21 @@ def get_same_item_indices(ls):
return idx_dict




def k_nearest_neighbors_to_median_in_kernel_space(Gn, Kmatrix=None, gkernel=None,
node_label=None, edge_label=None):
dis_k_all = [] # distance between g_star and each graph.
alpha = [1 / len(Gn)] * len(Gn)
if Kmatrix is None:
Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True)
term3 = 0
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout):
dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
dis_all.append(dtemp)
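
As committed, this helper is incomplete: idx_gi, Gn_init and dis_all are never defined and nothing is returned. A corrected sketch of the apparent intent, keeping the original names where possible and assuming the "median" is taken over all of Gn, with dis_gstar and compute_kernel from this module:

import sys
from tqdm import tqdm

def k_nearest_neighbors_to_median_in_kernel_space(Gn, Kmatrix=None, gkernel=None,
                                                  node_label=None, edge_label=None):
    if Kmatrix is None:
        Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True)
    idx_gi = list(range(len(Gn)))   # assumed: median built over the whole set
    alpha = [1 / len(Gn)] * len(Gn)
    term3 = 0
    for i1, a1 in enumerate(alpha):
        for i2, a2 in enumerate(alpha):
            term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
    dis_k_all = []                  # distance between g_star and each graph
    for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
        dis_k_all.append(dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3))
    return dis_k_all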


def normalize_distance_matrix(D):
max_value = np.amax(D)
min_value = np.amin(D)

+63 -26  pygraph/utils/graphfiles.py

@@ -124,21 +124,21 @@ def saveGXL(graph, filename, method='benoit'):
# reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22
# pass
gxl_file = open(filename, 'w')
gxl_file.write("<?xml version=\"1.0\"?>\n")
gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n") gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n")
gxl_file.write("<gxl>\n")
gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n")
gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"true\" edgemode=\"undirected\">\n") gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"true\" edgemode=\"undirected\">\n")
for v, attrs in graph.nodes(data=True): for v, attrs in graph.nodes(data=True):
gxl_file.write("<node id=\"_" + str(v) + "\">") gxl_file.write("<node id=\"_" + str(v) + "\">")
gxl_file.write("<attr name=\"" + "chem" + "\"><int>" + str(attrs['atom']) + "</int></attr>")
gxl_file.write("<attr name=\"" + "chem" + "\"><int>" + str(attrs['chem']) + "</int></attr>")
gxl_file.write("</node>\n") gxl_file.write("</node>\n")
for v1, v2, attrs in graph.edges(data=True): for v1, v2, attrs in graph.edges(data=True):
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">") gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">")
# gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['bond_type']) + "</int></attr>")
gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>")
gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['valence']) + "</int></attr>")
# gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>")
gxl_file.write("</edge>\n") gxl_file.write("</edge>\n")
gxl_file.write("</graph>\n") gxl_file.write("</graph>\n")
gxl_file.write("</gxl>\n")
gxl_file.write("</gxl>")
gxl_file.close()
elif method == 'gedlib-letter':
# reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22
@@ -147,15 +147,15 @@ def saveGXL(graph, filename, method='benoit'):
gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n") gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n")
gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n") gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n")
gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">")
gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">\n")
for v, attrs in graph.nodes(data=True):
gxl_file.write("<node id=\"_" + str(v) + "\">")
gxl_file.write("<attr name=\"x\"><float>" + str(attrs['attributes'][0]) + "</float></attr>")
gxl_file.write("<attr name=\"y\"><float>" + str(attrs['attributes'][1]) + "</float></attr>")
gxl_file.write("</node>")
gxl_file.write("</node>\n")
for v1, v2, attrs in graph.edges(data=True):
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\"/>")
gxl_file.write("</graph>")
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\"/>\n")
gxl_file.write("</graph>\n")
gxl_file.write("</gxl>") gxl_file.write("</gxl>")
gxl_file.close() gxl_file.close()
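For a concrete picture of the 'gedlib' branch above, a two-node, one-edge graph named 'mol0' with integer chem/valence attributes should serialize to roughly the following (a sketch of the expected output, not taken from a real run):

expected_gxl = (
    "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
    "<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n"
    "<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n"
    "<graph id=\"mol0\" edgeids=\"true\" edgemode=\"undirected\">\n"
    "<node id=\"_0\"><attr name=\"chem\"><int>6</int></attr></node>\n"
    "<node id=\"_1\"><attr name=\"chem\"><int>8</int></attr></node>\n"
    "<edge from=\"_0\" to=\"_1\"><attr name=\"valence\"><int>1</int></attr></edge>\n"
    "</graph>\n"
    "</gxl>")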


@@ -466,12 +466,15 @@ def loadDataset(filename, filename_y=None, extra_params=None):
def loadFromXML(filename, extra_params):
import xml.etree.ElementTree as ET
dirname_dataset = dirname(filename)
if extra_params:
dirname_dataset = extra_params
else:
dirname_dataset = dirname(filename)
tree = ET.parse(filename)
root = tree.getroot()
data = []
y = []
for graph in root.iter('print'):
for graph in root.iter('graph'):
mol_filename = graph.attrib['file']
mol_class = graph.attrib['class']
data.append(loadGXL(dirname_dataset + '/' + mol_filename))
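The switch from root.iter('print') to root.iter('graph') matters because the collection files written by saveDataset (below) contain <graph> elements, so iterating over 'print' silently yielded no graphs. A hand-written sketch of the kind of collection file loadFromXML is expected to parse:

example_xml = (
    "<?xml version=\"1.0\"?>\n"
    "<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">\n"
    "<GraphCollection>\n"
    "\t<graph file=\"graph0.gxl\" class=\"1\"/>\n"
    "\t<graph file=\"graph1.gxl\" class=\"0\"/>\n"
    "</GraphCollection>")
# Each <graph> element contributes attrib['file'] (passed to loadGXL) and
# attrib['class'] (appended to y).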
@@ -541,15 +544,22 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None
dirname_ds += '/'
if not os.path.exists(dirname_ds) :
os.makedirs(dirname_ds)
if 'graph_dir' in xparams:
graph_dir = xparams['graph_dir'] + '/'
if not os.path.exists(graph_dir):
os.makedirs(graph_dir)
else:
graph_dir = dirname_ds
if group == 'xml' and gformat == 'gxl':
with open(filename + '.xml', 'w') as fgroup:
fgroup.write("<?xml version=\"1.0\"?>")
fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"https://dbblumenthal.github.io/gedlib/GraphCollection_8dtd_source.html\">")
fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">")
fgroup.write("\n<GraphCollection>") fgroup.write("\n<GraphCollection>")
for idx, g in enumerate(Gn): for idx, g in enumerate(Gn):
fname_tmp = "graph" + str(idx) + ".gxl" fname_tmp = "graph" + str(idx) + ".gxl"
saveGXL(g, dirname_ds + fname_tmp, method=xparams['method'])
saveGXL(g, graph_dir + fname_tmp, method=xparams['method'])
fgroup.write("\n\t<graph file=\"" + fname_tmp + "\" class=\"" + str(y[idx]) + "\"/>") fgroup.write("\n\t<graph file=\"" + fname_tmp + "\" class=\"" + str(y[idx]) + "\"/>")
fgroup.write("\n</GraphCollection>") fgroup.write("\n</GraphCollection>")
fgroup.close() fgroup.close()
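The new 'graph_dir' option decouples where the group .xml is written from where the individual .gxl files go; when it is absent, both land in the directory derived from filename, as before. A hypothetical call (paths made up for illustration):

xparams = {'method': 'gedlib', 'graph_dir': '/tmp/mono_gxl'}
saveDataset(Gn, y, gformat='gxl', group='xml', filename='/tmp/monoterpenoides',
            xparams=xparams)
# The group file /tmp/monoterpenoides.xml references each graphN.gxl by bare
# filename, so a loader should be pointed at graph_dir -- which is what the
# new extra_params argument of loadFromXML above allows.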
@@ -558,18 +568,18 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None
if __name__ == '__main__':
# ### Load dataset from .ds file.
# # .ct files.
ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds',
'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'}
Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y'])
# ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb
# Gn, y = loadDataset(ds['dataset'])
# ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb
# Gn, y = loadDataset(ds['dataset'])
# ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled
# Gn, y = loadDataset(ds['dataset'])
print(Gn[1].nodes(data=True))
print(Gn[1].edges(data=True))
print(y[1])
# ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds',
# 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'}
# Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y'])
## ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb
## Gn, y = loadDataset(ds['dataset'])
## ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb
## Gn, y = loadDataset(ds['dataset'])
## ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled
## Gn, y = loadDataset(ds['dataset'])
# print(Gn[1].nodes(data=True))
# print(Gn[1].edges(data=True))
# print(y[1])
# # .gxl file.
# ds = {'name': 'monoterpenoides',
@@ -579,6 +589,33 @@ if __name__ == '__main__':
# print(Gn[1].edges(data=True))
# print(y[1])
### Convert graph from one format to another.
# .gxl file.
import networkx as nx
ds = {'name': 'monoterpenoides',
'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
Gn, y = loadDataset(ds['dataset'])
y = [int(i) for i in y]
print(Gn[1].nodes(data=True))
print(Gn[1].edges(data=True))
print(y[1])
# Convert each graph to a NetworkX format recognized by the gedlib library.
Gn_new = []
for G in Gn:
G_new = nx.Graph()
for nd, attrs in G.nodes(data=True):
G_new.add_node(str(nd), chem=attrs['atom'])
for nd1, nd2, attrs in G.edges(data=True):
G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
# G_new.add_edge(str(nd1), str(nd2))
Gn_new.append(G_new)
print(Gn_new[1].nodes(data=True))
print(Gn_new[1].edges(data=True))
print(Gn_new[1])
filename = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl/monoterpenoides'
xparams = {'method': 'gedlib'}
saveDataset(Gn_new, y, gformat='gxl', group='xml', filename=filename, xparams=xparams)
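A quick round-trip sanity check could follow the save (hypothetical: the exact .gxl path depends on how saveDataset derives its output directory from filename):

from os.path import dirname
g_back = loadGXL(dirname(filename) + '/graph0.gxl')
print(g_back.nodes(data=True)) # expect 'chem' node attributes
print(g_back.edges(data=True)) # expect 'valence' edge attributes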
# ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb
# Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
