diff --git a/notebooks/run_marginalizedkernel.py b/notebooks/run_marginalizedkernel.py
index df1c66b..cd7bf73 100644
--- a/notebooks/run_marginalizedkernel.py
+++ b/notebooks/run_marginalizedkernel.py
@@ -28,6 +28,7 @@ dslist = [
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
+# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb
#
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb
@@ -57,7 +58,7 @@ estimator = marginalizedkernel
#param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.3, 3),
# 'n_iteration': np.linspace(1, 1, 1),
param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.9, 9),
- 'n_iteration': np.linspace(5, 20, 4),
+ 'n_iteration': np.linspace(1, 19, 7),
'remove_totters': [False]}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]
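
A quick sanity check of the grids touched in this hunk; nothing here is part of the patch, it only shows what the np.linspace calls expand to (assumes only numpy):

    import numpy as np

    # old 'n_iteration' grid: 4 evenly spaced values from 5 to 20
    print(np.linspace(5, 20, 4))     # [ 5. 10. 15. 20.]
    # new 'n_iteration' grid: 7 evenly spaced values from 1 to 19
    print(np.linspace(1, 19, 7))     # [ 1.  4.  7. 10. 13. 16. 19.]
    # 'p_quit' grid: 9 values from 0.1 to 0.9
    print(np.linspace(0.1, 0.9, 9))  # [0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9]
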
diff --git a/notebooks/run_spkernel.py b/notebooks/run_spkernel.py
index fcae61f..0698d2a 100644
--- a/notebooks/run_spkernel.py
+++ b/notebooks/run_spkernel.py
@@ -24,6 +24,9 @@ dslist = [
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
+# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb
+# {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt'},
+# # node nsymb symb
#
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb
diff --git a/notebooks/run_structuralspkernel.py b/notebooks/run_structuralspkernel.py
index 071cd3c..223d832 100644
--- a/notebooks/run_structuralspkernel.py
+++ b/notebooks/run_structuralspkernel.py
@@ -30,6 +30,8 @@ dslist = [
# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# # node symb/nsymb
# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
+# {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt'},
+# # node nsymb symb
#
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb
diff --git a/notebooks/run_treeletkernel.py b/notebooks/run_treeletkernel.py
index 25c83d5..b4631fc 100644
--- a/notebooks/run_treeletkernel.py
+++ b/notebooks/run_treeletkernel.py
@@ -26,6 +26,7 @@ dslist = [
{'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
{'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
+# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb
# node symb/nsymb
# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
diff --git a/notebooks/run_untilhpathkernel.py b/notebooks/run_untilhpathkernel.py
index 6210708..3127ea5 100644
--- a/notebooks/run_untilhpathkernel.py
+++ b/notebooks/run_untilhpathkernel.py
@@ -27,7 +27,8 @@ dslist = [
{'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
{'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
{'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
- {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
+ {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
+# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb
#
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb
@@ -54,11 +55,11 @@ dslist = [
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
estimator = untilhpathkernel
-param_grid_precomputed = {'depth': np.linspace(3, 10, 8), # [2],
- 'k_func': [None]} # ['MinMax', 'tanimoto'],
-#param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2],
-# 'k_func': ['MinMax'], # ['MinMax', 'tanimoto'],
-# 'compute_method': ['trie']} # ['MinMax']}
+#param_grid_precomputed = {'depth': np.linspace(3, 10, 8), # [2],
+# 'k_func': [None]} # ['MinMax', 'tanimoto'],
+param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2],
+ 'k_func': ['MinMax', 'tanimoto'], # ['MinMax'], #
+ 'compute_method': ['trie']} # ['MinMax']}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]
diff --git a/notebooks/run_weisfeilerlehmankernel.py b/notebooks/run_weisfeilerlehmankernel.py
index 423da8b..ed03adc 100644
--- a/notebooks/run_weisfeilerlehmankernel.py
+++ b/notebooks/run_weisfeilerlehmankernel.py
@@ -30,6 +30,8 @@ dslist = [
{'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
{'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
{'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
+# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb
+
#
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb
diff --git a/preimage/fitDistance.py b/preimage/fitDistance.py
index 5268014..f07c3f2 100644
--- a/preimage/fitDistance.py
+++ b/preimage/fitDistance.py
@@ -7,7 +7,7 @@ Created on Wed Oct 16 14:20:06 2019
"""
import numpy as np
from tqdm import tqdm
-from itertools import combinations_with_replacement
+from itertools import combinations_with_replacement, combinations
import multiprocessing
from multiprocessing import Pool
from functools import partial
@@ -22,110 +22,88 @@ import sys
from ged import GED, get_nb_edit_operations
from utils import kernel_distance_matrix
-def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max,
- fitkernel=None, gamma=1.0):
+def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, k=4,
+ params_ged={'lib': 'gedlibpy', 'cost': 'CONSTANT',
+ 'method': 'IPFP', 'stabilizer': None},
+ init_costs=[3, 3, 1, 3, 3, 1],
+ parallel=True):
# c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them.
# random.seed(1)
- cost_rdm = random.sample(range(1, 10), 6)
-# edit_costs = cost_rdm + [0]
- edit_costs = cost_rdm
-# edit_costs = [i * 0.01 for i in cost_rdm] + [0]
-# edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
-# edit_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
-# edit_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0]
- idx_cost_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
+# cost_rdm = random.sample(range(1, 10), 6)
+# init_costs = cost_rdm + [0]
+# init_costs = cost_rdm
+#    init_costs = [3, 3, 1, 3, 3, 1]  # commented out: would silently override the init_costs argument
+# init_costs = [i * 0.01 for i in cost_rdm] + [0]
+# init_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
+# init_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
+# init_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0]
+# idx_cost_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
# compute distances in feature space.
- coef_dk = 1
dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, gkernel=gkernel)
dis_k_vec = []
for i in range(len(dis_k_mat)):
- for j in range(i, len(dis_k_mat)):
+# for j in range(i, len(dis_k_mat)):
+ for j in range(i + 1, len(dis_k_mat)):
dis_k_vec.append(dis_k_mat[i, j])
dis_k_vec = np.array(dis_k_vec)
- if fitkernel == None:
- dis_k_vec_ajusted = dis_k_vec
- elif fitkernel == 'gaussian':
- coef_dk = 1 / np.max(dis_k_vec)
- idx_dk_nonzeros = np.where(dis_k_vec != 0)[0]
- # remove 0's and constraint d_k between 0 and 1.
- dis_k_vec = dis_k_vec[idx_dk_nonzeros] * coef_dk
- dis_k_vec_ajusted = np.sqrt(-np.log(dis_k_vec) / gamma)
- residual_list = []
- edit_cost_list = []
- time_list = []
- nb_cost_mat_list = []
+ # init ged.
+ print('\ninitial:')
+ time0 = time.time()
+ params_ged['edit_cost_constant'] = init_costs
+ ged_vec_init, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
+ parallel=parallel)
+ residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))]
+ time_list = [time.time() - time0]
+ edit_cost_list = [init_costs]
+ nb_cost_mat = np.array(n_edit_operations)
+ nb_cost_mat_list = [nb_cost_mat]
+ print('edit_costs:', init_costs)
+ print('residual_list:', residual_list)
for itr in range(itr_max):
print('\niteration', itr)
time0 = time.time()
- # compute GEDs and numbers of edit operations.
- edit_cost_constant = [i for i in edit_costs]
- edit_cost_list.append(edit_cost_constant)
-
- ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_cost_constant,
- idx_cost_nonzeros, parallel=True)
-
- if fitkernel == None:
- residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
- elif fitkernel == 'gaussian':
- ged_all = np.array(ged_all)[idx_dk_nonzeros]
- residual = np.sqrt(np.sum(np.square(
- np.exp(-gamma * ged_all ** 2) / coef_dk - dis_k_vec)))
- residual_list.append(residual)
-
# "fit" geds to distances in feature space by tuning edit costs using the
# Least Squares Method.
- nb_cost_mat = np.array(n_edit_operations).T
- if fitkernel == 'gaussian':
- nb_cost_mat = nb_cost_mat[idx_dk_nonzeros]
- nb_cost_mat_list.append(nb_cost_mat)
- edit_costs_new, residual = compute_better_costs(nb_cost_mat, dis_k_vec_ajusted)
-
- print('pseudo residual:', residual)
+ edit_costs_new, residual = update_costs(nb_cost_mat, dis_k_vec)
for i in range(len(edit_costs_new)):
if edit_costs_new[i] < 0:
if edit_costs_new[i] > -1e-9:
edit_costs_new[i] = 0
else:
raise ValueError('The edit cost is negative.')
-
- for idx, item in enumerate(idx_cost_nonzeros):
- edit_costs[item] = edit_costs_new[idx]
-
+# for i in range(len(edit_costs_new)):
+# if edit_costs_new[i] < 0:
+# edit_costs_new[i] = 0
+
+ # compute new GEDs and numbers of edit operations.
+ params_ged['edit_cost_constant'] = edit_costs_new
+ ged_vec, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
+ parallel=parallel)
+ residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec))))
time_list.append(time.time() - time0)
-
- print('edit_costs:', edit_costs)
+ edit_cost_list.append(edit_costs_new)
+ nb_cost_mat = np.array(n_edit_operations)
+ nb_cost_mat_list.append(nb_cost_mat)
+ print('edit_costs:', edit_costs_new)
print('residual_list:', residual_list)
-
- print()
- edit_cost_list.append(edit_costs)
- ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_costs,
- idx_cost_nonzeros, parallel=True)
- if fitkernel == 0:
- residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
- elif fitkernel == 'gaussian':
- ged_all = np.array(ged_all)[idx_dk_nonzeros]
- residual = np.sqrt(np.sum(np.square(
- np.exp(-gamma * ged_all ** 2) / coef_dk - dis_k_vec)))
- residual_list.append(residual)
- nb_cost_mat_list.append(np.array(n_edit_operations).T)
- return edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, \
- time_list, nb_cost_mat_list, coef_dk
+ return edit_costs_new, residual_list, edit_cost_list, dis_k_mat, ged_mat, \
+ time_list, nb_cost_mat_list
-def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False):
+def compute_geds(Gn, params_ged, parallel=False):
ged_mat = np.zeros((len(Gn), len(Gn)))
if parallel:
# print('parallel')
- len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
- ged_all = [0 for i in range(len_itr)]
- n_edit_operations = [[0 for i in range(len_itr)] for j in
- range(len(idx_nonzeros))]
-
- itr = combinations_with_replacement(range(0, len(Gn)), 2)
+# len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
+ len_itr = int(len(Gn) * (len(Gn) - 1) / 2)
+ ged_vec = [0 for i in range(len_itr)]
+ n_edit_operations = [0 for i in range(len_itr)]
+# itr = combinations_with_replacement(range(0, len(Gn)), 2)
+ itr = combinations(range(0, len(Gn)), 2)
n_jobs = multiprocessing.cpu_count()
if len_itr < 100 * n_jobs:
chunksize = int(len_itr / n_jobs) + 1
@@ -134,68 +112,52 @@ def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False):
def init_worker(gn_toshare):
global G_gn
G_gn = gn_toshare
- do_partial = partial(_wrapper_compute_ged_parallel, edit_cost_constant,
- idx_nonzeros)
+ do_partial = partial(_wrapper_compute_ged_parallel, params_ged)
pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,))
iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
desc='computing GEDs', file=sys.stdout)
# iterator = pool.imap_unordered(do_partial, itr, chunksize)
for i, j, dis, n_eo_tmp in iterator:
- idx_itr = int(len(Gn) * i + j - i * (i + 1) / 2)
- ged_all[idx_itr] = dis
+ idx_itr = int(len(Gn) * i + j - (i + 1) * (i + 2) / 2)
+ ged_vec[idx_itr] = dis
ged_mat[i][j] = dis
ged_mat[j][i] = dis
- for idx, item in enumerate(idx_nonzeros):
- n_edit_operations[idx][idx_itr] = n_eo_tmp[item]
+ n_edit_operations[idx_itr] = n_eo_tmp
# print('\n-------------------------------------------')
# print(i, j, idx_itr, dis)
pool.close()
pool.join()
else:
- ged_all = []
- n_edit_operations = [[] for i in range(len(idx_nonzeros))]
+ ged_vec = []
+ n_edit_operations = []
for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
# for i in range(len(Gn)):
- for j in range(i, len(Gn)):
-# time0 = time.time()
- dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], lib='gedlibpy',
- cost='CONSTANT', method='IPFP',
- edit_cost_constant=edit_cost_constant, stabilizer='min',
- repeat=50)
-# time1 = time.time() - time0
-# time0 = time.time()
- ged_all.append(dis)
+ for j in range(i + 1, len(Gn)):
+ dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], **params_ged)
+ ged_vec.append(dis)
ged_mat[i][j] = dis
ged_mat[j][i] = dis
n_eo_tmp = get_nb_edit_operations(Gn[i], Gn[j], pi_forward, pi_backward)
- for idx, item in enumerate(idx_nonzeros):
- n_edit_operations[idx].append(n_eo_tmp[item])
-# time2 = time.time() - time0
-# print(time1, time2, time1 / time2)
+ n_edit_operations.append(n_eo_tmp)
- return ged_all, ged_mat, n_edit_operations
+ return ged_vec, ged_mat, n_edit_operations
-def _wrapper_compute_ged_parallel(edit_cost_constant, idx_nonzeros, itr):
+def _wrapper_compute_ged_parallel(params_ged, itr):
i = itr[0]
j = itr[1]
- dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], edit_cost_constant,
- idx_nonzeros)
+ dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], params_ged)
return i, j, dis, n_eo_tmp
-def _compute_ged_parallel(g1, g2, edit_cost_constant, idx_nonzeros):
- dis, pi_forward, pi_backward = GED(g1, g2, lib='gedlibpy',
- cost='CONSTANT', method='IPFP',
- edit_cost_constant=edit_cost_constant, stabilizer='min',
- repeat=50)
- n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward)
-
+def _compute_ged_parallel(g1, g2, params_ged):
+ dis, pi_forward, pi_backward = GED(g1, g2, **params_ged)
+ n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward)
return dis, n_eo_tmp
-def compute_better_costs(nb_cost_mat, dis_k_vec):
+def update_costs(nb_cost_mat, dis_k_vec):
# # method 1: simple least square method.
# edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
# rcond=None)
@@ -203,7 +165,7 @@ def compute_better_costs(nb_cost_mat, dis_k_vec):
# # method 2: least square method with x_i >= 0.
# edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec)
- # method 3: solve as a quadratic program with constraints: x_i >= 0, sum(x) = 1.
+ # method 3: solve as a quadratic program with constraints.
# P = np.dot(nb_cost_mat.T, nb_cost_mat)
# q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat)
# G = -1 * np.identity(nb_cost_mat.shape[1])
@@ -221,7 +183,7 @@ def compute_better_costs(nb_cost_mat, dis_k_vec):
# h = np.array([0 for i in range(nb_cost_mat.shape[1])])
x = cp.Variable(nb_cost_mat.shape[1])
cost = cp.sum_squares(nb_cost_mat * x - dis_k_vec)
- constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])],
+ constraints = [x >= [0.0001 for i in range(nb_cost_mat.shape[1])],
# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
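
Two sketches related to the rewritten fitDistance.py, assuming numpy and cvxpy are installed; the toy matrices are random and only illustrate the shapes. The first verifies the condensed pair index used in compute_geds for combinations (i < j); the second reproduces the constrained least-squares step performed by update_costs, where the two linear constraints encode c_vs <= c_vi + c_vr and c_es <= c_ei + c_er:

    import numpy as np
    import cvxpy as cp
    from itertools import combinations

    # 1) condensed index of the pair (i, j), i < j, among combinations(range(n), 2):
    n = 5
    for idx, (i, j) in enumerate(combinations(range(n), 2)):
        assert idx == int(n * i + j - (i + 1) * (i + 2) / 2)

    # 2) least squares with constraints, as in update_costs; each row of
    # nb_cost_mat counts the 6 edit operations (c_vi, c_vr, c_vs, c_ei, c_er, c_es)
    # for one graph pair, and dis_k_vec holds the kernel distances to fit.
    nb_cost_mat = np.random.rand(10, 6)
    dis_k_vec = np.random.rand(10)
    x = cp.Variable(nb_cost_mat.shape[1])
    cost = cp.sum_squares(nb_cost_mat @ x - dis_k_vec)
    constraints = [x >= 0.0001,  # keep all six costs strictly positive
                   np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T @ x >= 0.0,  # c_vs <= c_vi + c_vr
                   np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T @ x >= 0.0]  # c_es <= c_ei + c_er
    prob = cp.Problem(cp.Minimize(cost), constraints)
    prob.solve()
    print(x.value, prob.value)
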
diff --git a/preimage/ged.py b/preimage/ged.py
index eaa7294..073fae6 100644
--- a/preimage/ged.py
+++ b/preimage/ged.py
@@ -13,29 +13,30 @@ import multiprocessing
from multiprocessing import Pool
from functools import partial
-from gedlibpy import librariesImport, gedlibpy
+from gedlibpy_linlin import librariesImport, gedlibpy
def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP',
- edit_cost_constant=[], stabilizer='min', repeat=50):
+ edit_cost_constant=[], algo_options='', stabilizer='min', repeat=50):
"""
Compute GED for 2 graphs.
"""
- if lib == 'gedlibpy':
- def convertGraph(G):
- """Convert a graph to the proper NetworkX format that can be
- recognized by library gedlibpy.
- """
- G_new = nx.Graph()
- for nd, attrs in G.nodes(data=True):
- G_new.add_node(str(nd), chem=attrs['atom'])
+ def convertGraph(G):
+ """Convert a graph to the proper NetworkX format that can be
+ recognized by library gedlibpy.
+ """
+ G_new = nx.Graph()
+ for nd, attrs in G.nodes(data=True):
+ G_new.add_node(str(nd), chem=attrs['atom'])
# G_new.add_node(str(nd), x=str(attrs['attributes'][0]),
# y=str(attrs['attributes'][1]))
- for nd1, nd2, attrs in G.edges(data=True):
- G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
-# G_new.add_edge(str(nd1), str(nd2))
-
- return G_new
+ for nd1, nd2, attrs in G.edges(data=True):
+# G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
+ G_new.add_edge(str(nd1), str(nd2))
+
+ return G_new
+
+ if lib == 'gedlibpy':
gedlibpy.restart_env()
gedlibpy.add_nx_graph(convertGraph(g1), "")
gedlibpy.add_nx_graph(convertGraph(g2), "")
@@ -43,12 +44,12 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP',
listID = gedlibpy.get_all_graph_ids()
gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant)
gedlibpy.init()
- gedlibpy.set_method(method, "")
+ gedlibpy.set_method(method, algo_options)
gedlibpy.init_method()
g = listID[0]
h = listID[1]
- if stabilizer == None:
+ if stabilizer is None:
gedlibpy.run_method(g, h)
pi_forward = gedlibpy.get_forward_map(g, h)
pi_backward = gedlibpy.get_backward_map(g, h)
@@ -107,13 +108,57 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP',
dis = upper
-        # map matched node indices back to node labels (a removed node maps to np.inf)
- nodes1 = [n for n in g1.nodes()]
- nodes2 = [n for n in g2.nodes()]
- nb1 = nx.number_of_nodes(g1)
- nb2 = nx.number_of_nodes(g2)
- pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
- pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
+ elif lib == 'gedlib-bash':
+ import time
+ import random
+ import sys
+ import os
+ sys.path.insert(0, "../")
+ from pygraph.utils.graphfiles import saveDataset
+
+ tmp_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/'
+ if not os.path.exists(tmp_dir):
+ os.makedirs(tmp_dir)
+ fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, 1e9))
+ xparams = {'method': 'gedlib', 'graph_dir': fn_collection}
+ saveDataset([g1, g2], ['dummy', 'dummy'], gformat='gxl', group='xml',
+ filename=fn_collection, xparams=xparams)
+
+ command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n'
+ command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n'
+ command += 'export LD_LIBRARY_PATH\n'
+ command += 'cd \'/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/bin\'\n'
+ command += './ged_for_python_bash monoterpenoides ' + fn_collection \
+ + ' \'' + algo_options + '\' '
+ for ec in edit_cost_constant:
+ command += str(ec) + ' '
+# output = os.system(command)
+ stream = os.popen(command)
+ output = stream.readlines()
+# print(output)
+
+ dis = float(output[0].strip())
+ runtime = float(output[1].strip())
+ size_forward = int(output[2].strip())
+ pi_forward = [int(item.strip()) for item in output[3:3+size_forward]]
+ pi_backward = [int(item.strip()) for item in output[3+size_forward:]]
+
+# print(dis)
+# print(runtime)
+# print(size_forward)
+# print(pi_forward)
+# print(pi_backward)
+
+
+        # map matched node indices back to node labels (a removed node maps to np.inf)
+ nodes1 = [n for n in g1.nodes()]
+ nodes2 = [n for n in g2.nodes()]
+ nb1 = nx.number_of_nodes(g1)
+ nb2 = nx.number_of_nodes(g2)
+ pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
+ pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
+# print(pi_forward)
+
return dis, pi_forward, pi_backward
@@ -149,7 +194,7 @@ def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP',
g = listID[0]
h = listID[1]
- if stabilizer == None:
+ if stabilizer is None:
gedlibpy.run_method(g, h)
pi_forward = gedlibpy.get_forward_map(g, h)
pi_backward = gedlibpy.get_backward_map(g, h)
@@ -183,7 +228,8 @@ def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP',
def ged_median(Gn, Gn_median, verbose=False, params_ged={'lib': 'gedlibpy',
'cost': 'CHEM_1', 'method': 'IPFP', 'edit_cost_constant': [],
- 'stabilizer': 'min', 'repeat': 50}, parallel=False):
+ 'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1',
+ 'stabilizer': None}, parallel=False):
if parallel:
len_itr = int(len(Gn))
pi_forward_list = [[] for i in range(len_itr)]
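
A usage sketch of the refactored GED wrapper, assuming the compiled gedlibpy_linlin binding is importable; the two toy graphs carry the 'atom' node labels that convertGraph expects, and the params_ged dict is unpacked with **params_ged exactly as compute_geds does:

    import networkx as nx
    from ged import GED

    g1 = nx.Graph()
    g1.add_node(0, atom='C'); g1.add_node(1, atom='O')
    g1.add_edge(0, 1, bond_type='1')
    g2 = nx.Graph()
    g2.add_node(0, atom='C'); g2.add_node(1, atom='N')
    g2.add_edge(0, 1, bond_type='1')

    params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
                  'edit_cost_constant': [3, 3, 1, 3, 3, 1],  # c_vi, c_vr, c_vs, c_ei, c_er, c_es
                  'algo_options': '--threads 8 --initial-solutions 40 '
                                  '--ratio-runs-from-initial-solutions 1',
                  'stabilizer': None}
    dis, pi_forward, pi_backward = GED(g1, g2, **params_ged)
    print(dis)         # upper bound on the edit distance
    print(pi_forward)  # node map g1 -> g2 (np.inf marks a removed node)
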
diff --git a/preimage/iam.py b/preimage/iam.py
index fa38582..0a63b98 100644
--- a/preimage/iam.py
+++ b/preimage/iam.py
@@ -23,7 +23,8 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
connected=False, removeNodes=True, allBestInit=False, allBestNodes=False,
allBestEdges=False, allBestOutput=False,
params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP',
- 'edit_cost_constant': [], 'stabilizer': 'min', 'repeat': 50}):
+ 'edit_cost_constant': [], 'stabilizer': None,
+ 'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'}):
"""See my name, then you know what I do.
"""
# Gn_median = Gn_median[0:10]
@@ -435,6 +436,62 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median
+def iam_bash(Gn_names, edit_cost_constant, dataset='monoterpenoides',
+ graph_dir='/media/ljia/DATA/research-repo/codes/Linlin/py-graph/datasets/monoterpenoides/'):
+    """Compute the IAM via the C++ implementation (gedlib), invoked through bash.
+ """
+ import os
+ import time
+
+ def createCollectionFile(Gn_names, y, filename):
+ """Create collection file.
+ """
+ dirname_ds = os.path.dirname(filename)
+ if dirname_ds != '':
+ dirname_ds += '/'
+ if not os.path.exists(dirname_ds) :
+ os.makedirs(dirname_ds)
+
+ with open(filename + '.xml', 'w') as fgroup:
+            fgroup.write("<?xml version=\"1.0\"?>")
+            fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">")
+            fgroup.write("\n<GraphCollection>")
+            for idx, fname in enumerate(Gn_names):
+                fgroup.write("\n\t<graph file=\"" + fname + "\" class=\"" + str(y[idx]) + "\"/>")
+            fgroup.write("\n</GraphCollection>")
+ fgroup.close()
+
+ tmp_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/'
+ fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, 1e9))
+ createCollectionFile(Gn_names, ['dummy'] * len(Gn_names), fn_collection)
+# graph_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl'
+
+
+ command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n'
+ command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n'
+ command += 'export LD_LIBRARY_PATH\n'
+ command += 'cd \'/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/bin\'\n'
+ command += './iam_for_python_bash ' + dataset + ' ' + fn_collection \
+ + ' \'' + graph_dir + '\' '
+ if edit_cost_constant is None:
+ command += 'None'
+ else:
+ for ec in edit_cost_constant:
+ command += str(ec) + ' '
+# output = os.system(command)
+ stream = os.popen(command)
+
+ output = stream.readlines()
+# print(output)
+ sod_sm = float(output[0].strip())
+    sod_gm = float(output[1].strip())
+
+ fname_sm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/set_median.gxl'
+ fname_gm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/gen_median.gxl'
+
+ return sod_sm, sod_gm, fname_sm, fname_gm
+
+
###############################################################################
# Old implementations.
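
A hypothetical call of the new iam_bash, assuming the gedlib bash tools and the hard-coded paths inside the function exist on the machine; the .gxl file names are illustrative and must live under graph_dir:

    from iam import iam_bash

    Gn_names = ['graph0.gxl', 'graph1.gxl', 'graph2.gxl']  # median set; names made up
    edit_cost_constant = [3, 3, 1, 3, 3, 1]  # c_vi, c_vr, c_vs, c_ei, c_er, c_es
    sod_sm, sod_gm, fname_sm, fname_gm = iam_bash(Gn_names, edit_cost_constant)
    print('SOD of the set median:', sod_sm)
    print('SOD of the generalized median:', sod_gm)
    print('medians written to:', fname_sm, 'and', fname_gm)
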
diff --git a/preimage/test_fitDistance.py b/preimage/test_fitDistance.py
index f2de5ae..2f2907d 100644
--- a/preimage/test_fitDistance.py
+++ b/preimage/test_fitDistance.py
@@ -16,6 +16,319 @@ from utils import remove_edges
from fitDistance import fit_GED_to_kernel_distance
from utils import normalize_distance_matrix
+
+def median_paper_clcpc_python_best():
+    """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er, with GED computed directly
+    through the python binding gedlibpy (with the updated library).
+ """
+# ds = {'name': 'monoterpenoides',
+# 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
+# _, y_all = loadDataset(ds['dataset'])
+ gkernel = 'untilhpathkernel'
+ node_label = 'atom'
+ edge_label = 'bond_type'
+ itr_max = 6
+ algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
+ params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
+ 'algo_options': algo_options, 'stabilizer': None}
+
+ y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
+ repeats = 50
+ collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
+ graph_dir = collection_path + 'gxl/'
+
+ fn_edit_costs_output = 'results/median_paper/edit_costs_output.python_init40.k10.txt'
+
+ for y in y_all:
+ for repeat in range(repeats):
+ edit_costs_output_file = open(fn_edit_costs_output, 'a')
+ collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
+ Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
+ edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
+ nb_cost_mat_list = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
+ gkernel, itr_max, params_ged=params_ged,
+ parallel=True)
+ total_time = np.sum(time_list)
+# print('\nedit_costs:', edit_costs)
+# print('\nresidual_list:', residual_list)
+# print('\nedit_cost_list:', edit_cost_list)
+# print('\ndistance matrix in kernel space:', dis_k_mat)
+# print('\nged matrix:', ged_mat)
+# print('\ntotal time:', total_time)
+# print('\nnb_cost_mat:', nb_cost_mat_list[-1])
+ np.savez('results/median_paper/fit_distance.clcpc.python_init40.monot.elabeled.uhpkernel.y'
+ + y + '.repeat' + str(repeat) + '.k10..gm',
+ edit_costs=edit_costs,
+ residual_list=residual_list, edit_cost_list=edit_cost_list,
+ dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
+ total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
+
+ for ec in edit_costs:
+ edit_costs_output_file.write(str(ec) + ' ')
+ edit_costs_output_file.write('\n')
+ edit_costs_output_file.close()
+
+
+# # normalized distance matrices.
+# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
+# edit_costs = gmfile['edit_costs']
+# residual_list = gmfile['residual_list']
+# edit_cost_list = gmfile['edit_cost_list']
+# dis_k_mat = gmfile['dis_k_mat']
+# ged_mat = gmfile['ged_mat']
+# total_time = gmfile['total_time']
+# nb_cost_mat_list = gmfile['nb_cost_mat_list']
+
+ nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
+ print(nb_consistent, nb_inconsistent, ratio_consistent)
+
+# norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
+# plt.imshow(norm_dis_k_mat)
+# plt.colorbar()
+# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
+# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
+# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
+# + y + '.repeat' + str(repeat) + '.png', format='png')
+# # plt.show()
+# plt.clf()
+#
+# norm_ged_mat = normalize_distance_matrix(ged_mat)
+# plt.imshow(norm_ged_mat)
+# plt.colorbar()
+# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
+# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
+# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
+# + y + '.repeat' + str(repeat) + '.png', format='png')
+# # plt.show()
+# plt.clf()
+#
+# norm_diff = norm_ged_mat - norm_dis_k_mat
+# plt.imshow(norm_diff)
+# plt.colorbar()
+# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y'
+# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
+# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y'
+# + y + '.repeat' + str(repeat) + '.png', format='png')
+# # plt.show()
+# plt.clf()
+# # draw_count_bar(norm_diff)
+
+
+def median_paper_clcpc_python_bash_cpp():
+    """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er, with GED computed by python
+    invoking the C++ code through a bash command (with the updated library).
+ """
+# ds = {'name': 'monoterpenoides',
+# 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
+# _, y_all = loadDataset(ds['dataset'])
+ gkernel = 'untilhpathkernel'
+ node_label = 'atom'
+ edge_label = 'bond_type'
+ itr_max = 20
+ algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
+ params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP',
+ 'algo_options': algo_options}
+
+ y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
+ repeats = 50
+ collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
+ graph_dir = collection_path + 'gxl/'
+
+ fn_edit_costs_output = 'results/median_paper/edit_costs_output.txt'
+
+ for y in y_all:
+ for repeat in range(repeats):
+ edit_costs_output_file = open(fn_edit_costs_output, 'a')
+ collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
+ Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
+ edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
+                nb_cost_mat_list = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
+ gkernel, itr_max, params_ged=params_ged,
+ parallel=False)
+ total_time = np.sum(time_list)
+# print('\nedit_costs:', edit_costs)
+# print('\nresidual_list:', residual_list)
+# print('\nedit_cost_list:', edit_cost_list)
+# print('\ndistance matrix in kernel space:', dis_k_mat)
+# print('\nged matrix:', ged_mat)
+# print('\ntotal time:', total_time)
+# print('\nnb_cost_mat:', nb_cost_mat_list[-1])
+ np.savez('results/median_paper/fit_distance.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
+ + y + '.repeat' + str(repeat) + '.gm',
+ edit_costs=edit_costs,
+ residual_list=residual_list, edit_cost_list=edit_cost_list,
+ dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
+                     total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
+
+ for ec in edit_costs:
+ edit_costs_output_file.write(str(ec) + ' ')
+ edit_costs_output_file.write('\n')
+ edit_costs_output_file.close()
+
+
+# # normalized distance matrices.
+# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
+# edit_costs = gmfile['edit_costs']
+# residual_list = gmfile['residual_list']
+# edit_cost_list = gmfile['edit_cost_list']
+# dis_k_mat = gmfile['dis_k_mat']
+# ged_mat = gmfile['ged_mat']
+# total_time = gmfile['total_time']
+# nb_cost_mat_list = gmfile['nb_cost_mat_list']
+# coef_dk = gmfile['coef_dk']
+
+ nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
+ print(nb_consistent, nb_inconsistent, ratio_consistent)
+
+# norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
+# plt.imshow(norm_dis_k_mat)
+# plt.colorbar()
+# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
+# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
+# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
+# + y + '.repeat' + str(repeat) + '.png', format='png')
+# # plt.show()
+# plt.clf()
+#
+# norm_ged_mat = normalize_distance_matrix(ged_mat)
+# plt.imshow(norm_ged_mat)
+# plt.colorbar()
+# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
+# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
+# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
+# + y + '.repeat' + str(repeat) + '.png', format='png')
+# # plt.show()
+# plt.clf()
+#
+# norm_diff = norm_ged_mat - norm_dis_k_mat
+# plt.imshow(norm_diff)
+# plt.colorbar()
+# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
+# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
+# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
+# + y + '.repeat' + str(repeat) + '.png', format='png')
+# # plt.show()
+# plt.clf()
+# # draw_count_bar(norm_diff)
+
+
+
+
+
+def test_cs_leq_ci_plus_cr_python_bash_cpp():
+    """c_vs <= c_vi + c_vr, c_es <= c_ei + c_er, with GED computed by python
+    invoking the C++ code through a bash command (with the updated library).
+ """
+ ds = {'name': 'monoterpenoides',
+ 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
+ Gn, y_all = loadDataset(ds['dataset'])
+# Gn = Gn[0:10]
+ gkernel = 'untilhpathkernel'
+ node_label = 'atom'
+ edge_label = 'bond_type'
+ itr_max = 10
+ algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
+ params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP',
+ 'algo_options': algo_options}
+ edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
+        nb_cost_mat_list = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
+ gkernel, itr_max, params_ged=params_ged,
+ parallel=False)
+ total_time = np.sum(time_list)
+ print('\nedit_costs:', edit_costs)
+ print('\nresidual_list:', residual_list)
+ print('\nedit_cost_list:', edit_cost_list)
+ print('\ndistance matrix in kernel space:', dis_k_mat)
+ print('\nged matrix:', ged_mat)
+ print('\ntotal time:', total_time)
+ print('\nnb_cost_mat:', nb_cost_mat_list[-1])
+ np.savez('results/fit_distance.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel.gm',
+ edit_costs=edit_costs,
+ residual_list=residual_list, edit_cost_list=edit_cost_list,
+ dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
+             total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
+
+# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
+# 'extra_params': {}} # node/edge symb
+# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
+## Gn = Gn[0:10]
+## remove_edges(Gn)
+# gkernel = 'untilhpathkernel'
+# node_label = 'atom'
+# edge_label = 'bond_type'
+# itr_max = 10
+# edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
+# nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
+# gkernel, itr_max)
+# total_time = np.sum(time_list)
+# print('\nedit_costs:', edit_costs)
+# print('\nresidual_list:', residual_list)
+# print('\nedit_cost_list:', edit_cost_list)
+# print('\ndistance matrix in kernel space:', dis_k_mat)
+# print('\nged matrix:', ged_mat)
+# print('\ntotal time:', total_time)
+# print('\nnb_cost_mat:', nb_cost_mat_list[-1])
+# np.savez('results/fit_distance.cs_leq_ci_plus_cr.mutag.elabeled.uhpkernel.gm',
+# edit_costs=edit_costs,
+# residual_list=residual_list, edit_cost_list=edit_cost_list,
+# dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
+# total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk)
+
+
+# # normalized distance matrices.
+# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.monot.elabeled.uhpkernel.gm.npz')
+# edit_costs = gmfile['edit_costs']
+# residual_list = gmfile['residual_list']
+# edit_cost_list = gmfile['edit_cost_list']
+# dis_k_mat = gmfile['dis_k_mat']
+# ged_mat = gmfile['ged_mat']
+# total_time = gmfile['total_time']
+# nb_cost_mat_list = gmfile['nb_cost_mat_list']
+# coef_dk = gmfile['coef_dk']
+
+ nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
+ print(nb_consistent, nb_inconsistent, ratio_consistent)
+
+# dis_k_sub = pairwise_substitution(dis_k_mat)
+# ged_sub = pairwise_substitution(ged_mat)
+# np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.gm',
+# dis_k_sub=dis_k_sub, ged_sub=ged_sub)
+
+
+ norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
+ plt.imshow(norm_dis_k_mat)
+ plt.colorbar()
+ plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ + '.eps', format='eps', dpi=300)
+ plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ + '.png', format='png')
+# plt.show()
+ plt.clf()
+
+ norm_ged_mat = normalize_distance_matrix(ged_mat)
+ plt.imshow(norm_ged_mat)
+ plt.colorbar()
+ plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ + '.eps', format='eps', dpi=300)
+ plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ + '.png', format='png')
+# plt.show()
+ plt.clf()
+
+ norm_diff = norm_ged_mat - norm_dis_k_mat
+ plt.imshow(norm_diff)
+ plt.colorbar()
+ plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ + '.eps', format='eps', dpi=300)
+ plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ + '.png', format='png')
+# plt.show()
+ plt.clf()
+# draw_count_bar(norm_diff)
+
+
def test_anycosts():
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
@@ -295,8 +608,12 @@ def draw_count_bar(norm_diff):
if __name__ == '__main__':
# test_anycosts()
- test_cs_leq_ci_plus_cr()
+# test_cs_leq_ci_plus_cr()
# test_unfitted()
+# test_cs_leq_ci_plus_cr_python_bash_cpp()
+# median_paper_clcpc_python_bash_cpp()
+ median_paper_clcpc_python_best()
+
# x = np.array([[1,2,3],[4,5,6],[7,8,9]])
# xx = pairwise_substitution(x)
\ No newline at end of file
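
The archives written by np.savez above can be reloaded as in the commented blocks; a compact version, where the exact file name is an assumption taken from the savez call in median_paper_clcpc_python_best (y = '3', repeat = 0):

    import numpy as np

    gmfile = np.load('results/median_paper/fit_distance.clcpc.python_init40.'
                     'monot.elabeled.uhpkernel.y3.repeat0.k10..gm.npz')
    edit_costs = gmfile['edit_costs']
    residual_list = gmfile['residual_list']
    dis_k_mat = gmfile['dis_k_mat']
    ged_mat = gmfile['ged_mat']
    print('fitted costs:', edit_costs)
    print('final residual:', residual_list[-1])
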
diff --git a/preimage/test_iam.py b/preimage/test_iam.py
index 82d5446..5d286cc 100644
--- a/preimage/test_iam.py
+++ b/preimage/test_iam.py
@@ -22,6 +22,130 @@ from iam import iam_upgraded
from utils import remove_edges, compute_kernel, get_same_item_indices, dis_gstar
#from ged import ged_median
+
+def test_iam_monoterpenoides_with_init40():
+ gkernel = 'untilhpathkernel'
+ node_label = 'atom'
+ edge_label = 'bond_type'
+ # unfitted edit costs.
+ c_vi = 3
+ c_vr = 3
+ c_vs = 1
+ c_ei = 3
+ c_er = 3
+ c_es = 1
+ ite_max_iam = 50
+ epsilon_iam = 0.0001
+ removeNodes = False
+ connected_iam = False
+ # parameters for IAM function
+# ged_cost = 'CONSTANT'
+ ged_cost = 'CONSTANT'
+ ged_method = 'IPFP'
+ edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
+ ged_stabilizer = None
+# ged_repeat = 50
+ algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
+ params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
+ 'edit_cost_constant': edit_cost_constant,
+ 'algo_options': algo_options,
+ 'stabilizer': ged_stabilizer}
+
+
+ collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
+ graph_dir = collection_path + 'gxl/'
+ y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
+ repeats = 50
+
+    # containers for per-class results.
+ time_list = []
+ dis_ks_min_list = []
+ dis_ks_set_median_list = []
+ sod_gs_list = []
+ g_best = []
+ sod_set_median_list = []
+ sod_list_list = []
+ for y in y_all:
+ print('\n-------------------------------------------------------')
+ print('class of y:', y)
+
+ time_list.append([])
+ dis_ks_min_list.append([])
+ dis_ks_set_median_list.append([])
+ sod_gs_list.append([])
+ g_best.append([])
+ sod_set_median_list.append([])
+
+ for repeat in range(repeats):
+ # load median set.
+ collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
+ Gn_median, _ = loadDataset(collection_file, extra_params=graph_dir)
+ Gn_candidate = [g.copy() for g in Gn_median]
+
+ time0 = time.time()
+ G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \
+ = iam_upgraded(Gn_median,
+ Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
+ epsilon=epsilon_iam, node_label=node_label, edge_label=edge_label,
+ connected=connected_iam, removeNodes=removeNodes,
+ params_ged=params_ged)
+ time_total = time.time() - time0
+ print('\ntime: ', time_total)
+ time_list[-1].append(time_total)
+ g_best[-1].append(G_gen_median_list[0])
+ sod_set_median_list[-1].append(sod_set_median)
+ print('\nsmallest sod of the set median:', sod_set_median)
+ sod_gs_list[-1].append(sod_gen_median)
+ print('\nsmallest sod in graph space:', sod_gen_median)
+ sod_list_list.append(sod_list)
+
+# # show the best graph and save it to file.
+# print('one of the possible corresponding pre-images is')
+# nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'),
+# with_labels=True)
+## plt.show()
+# # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) +
+## plt.savefig('results/iam/paper_compare/monoter_y' + str(y_class) +
+## '_repeat' + str(repeat) + '_' + str(time.time()) +
+## '.png', format="PNG")
+# plt.clf()
+# # print(G_gen_median_list[0].nodes(data=True))
+# # print(G_gen_median_list[0].edges(data=True))
+
+ print('\nsods of the set median for this class:', sod_set_median_list[-1])
+ print('\nsods in graph space for this class:', sod_gs_list[-1])
+# print('\ndistance in kernel space of set median for this class:',
+# dis_ks_set_median_list[-1])
+# print('\nsmallest distances in kernel space for this class:',
+# dis_ks_min_list[-1])
+ print('\ntimes for this class:', time_list[-1])
+
+ sod_set_median_list[-1] = np.mean(sod_set_median_list[-1])
+ sod_gs_list[-1] = np.mean(sod_gs_list[-1])
+# dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1])
+# dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1])
+ time_list[-1] = np.mean(time_list[-1])
+
+ print()
+ print('\nmean sods of the set median for each class:', sod_set_median_list)
+ print('\nmean sods in graph space for each class:', sod_gs_list)
+# print('\ndistances in kernel space of set median for each class:',
+# dis_ks_set_median_list)
+# print('\nmean smallest distances in kernel space for each class:',
+# dis_ks_min_list)
+ print('\nmean times for each class:', time_list)
+
+ print('\nmean sods of the set median of all:', np.mean(sod_set_median_list))
+ print('\nmean sods in graph space of all:', np.mean(sod_gs_list))
+# print('\nmean distances in kernel space of set median of all:',
+# np.mean(dis_ks_set_median_list))
+# print('\nmean smallest distances in kernel space of all:',
+# np.mean(dis_ks_min_list))
+ print('\nmean times of all:', np.mean(time_list))
+
+
+
+
def test_iam_monoterpenoides():
ds = {'name': 'monoterpenoides',
'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
@@ -834,9 +958,10 @@ if __name__ == '__main__':
# tests on different numbers of median-sets.
# test_iam_median_nb()
# test_iam_letter_h()
- test_iam_monoterpenoides()
+# test_iam_monoterpenoides()
# test_iam_mutag()
# test_iam_fitdistance()
# print("test log")
+ test_iam_monoterpenoides_with_init40()
diff --git a/preimage/utils.py b/preimage/utils.py
index 99c63c0..51d4edf 100644
--- a/preimage/utils.py
+++ b/preimage/utils.py
@@ -17,8 +17,10 @@ from pygraph.kernels.marginalizedKernel import marginalizedkernel
from pygraph.kernels.untilHPathKernel import untilhpathkernel
from pygraph.kernels.spKernel import spkernel
import functools
-from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct
+from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct, polynomialkernel
from pygraph.kernels.structuralspKernel import structuralspkernel
+from pygraph.kernels.treeletKernel import treeletkernel
+from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel
def remove_edges(Gn):
@@ -46,18 +48,29 @@ def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose):
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'untilhpathkernel':
Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label,
- depth=10, k_func='MinMax', compute_method='trie',
+ depth=7, k_func='MinMax', compute_method='trie',
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'spkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
- Kmatrix, _, _ = spkernel(Gn, node_label='atom', node_kernels=
+ Kmatrix, _, _ = spkernel(Gn, node_label=node_label, node_kernels=
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel},
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'structuralspkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
- Kmatrix, _ = structuralspkernel(Gn, node_label='atom', node_kernels=
+ Kmatrix, _ = structuralspkernel(Gn, node_label=node_label, node_kernels=
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel},
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
+ elif graph_kernel == 'treeletkernel':
+# pkernel = functools.partial(polynomialkernel, d=2, c=1e5)
+ pkernel = functools.partial(gaussiankernel, gamma=1e-6)
+ mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
+ Kmatrix, _ = treeletkernel(Gn, node_label=node_label, edge_label=edge_label,
+ sub_kernel=pkernel,
+ n_jobs=multiprocessing.cpu_count(), verbose=verbose)
+ elif graph_kernel == 'weisfeilerlehmankernel':
+ Kmatrix, _ = weisfeilerlehmankernel(Gn, node_label=node_label, edge_label=edge_label,
+ height=4, base_kernel='subtree',
+ n_jobs=multiprocessing.cpu_count(), verbose=verbose)
# normalization
Kmatrix_diag = Kmatrix.diagonal().copy()
@@ -79,7 +92,7 @@ def gram2distances(Kmatrix):
def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None, gkernel=None):
dis_mat = np.empty((len(Gn), len(Gn)))
- if Kmatrix == None:
+ if Kmatrix is None:
Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True)
for i in range(len(Gn)):
for j in range(i, len(Gn)):
@@ -109,6 +122,21 @@ def get_same_item_indices(ls):
return idx_dict
+def k_nearest_neighbors_to_median_in_kernel_space(Gn, Kmatrix=None, gkernel=None,
+ node_label=None, edge_label=None):
+    from tqdm import tqdm
+    import sys
+    dis_k_all = [] # distance between the median g_star and each graph.
+    alpha = [1 / len(Gn)] * len(Gn)
+    if Kmatrix is None:
+        Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True)
+    idx_gi = list(range(len(Gn)))  # use all graphs of Gn.
+    term3 = 0
+    for i1, a1 in enumerate(alpha):
+        for i2, a2 in enumerate(alpha):
+            term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
+    for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
+        dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
+        dis_k_all.append(dtemp)
+    return dis_k_all  # picking the k nearest graphs is left to the caller.
+
+
def normalize_distance_matrix(D):
max_value = np.amax(D)
min_value = np.amin(D)
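
For context, kernel_distance_matrix and dis_gstar both build on the kernel-induced metric d(gi, gj) = sqrt(K(gi, gi) + K(gj, gj) - 2 K(gi, gj)); a minimal numpy sketch on a normalized Gram matrix:

    import numpy as np

    def distances_from_gram(Kmatrix):
        # kernel-induced distance: d_ij = sqrt(K_ii + K_jj - 2 * K_ij)
        diag = np.diagonal(Kmatrix)
        d2 = diag[:, None] + diag[None, :] - 2 * Kmatrix
        return np.sqrt(np.maximum(d2, 0))  # clip tiny negatives from rounding

    K = np.array([[1.0, 0.5],
                  [0.5, 1.0]])             # normalized Gram matrix
    print(distances_from_gram(K))          # off-diagonal: sqrt(1 + 1 - 2 * 0.5) = 1.0
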
diff --git a/pygraph/utils/graphfiles.py b/pygraph/utils/graphfiles.py
index f5daeda..48583dd 100644
--- a/pygraph/utils/graphfiles.py
+++ b/pygraph/utils/graphfiles.py
@@ -124,21 +124,21 @@ def saveGXL(graph, filename, method='benoit'):
# reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22
# pass
gxl_file = open(filename, 'w')
-        gxl_file.write("<?xml version=\"1.0\"?>\n")
+        gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
         gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n")
-        gxl_file.write("<gxl>\n")
+        gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n")
         gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"true\" edgemode=\"undirected\">\n")
         for v, attrs in graph.nodes(data=True):
             gxl_file.write("<node id=\"_" + str(v) + "\">")
-            gxl_file.write("<attr name=\"chem\"><int>" + str(attrs['atom']) + "</int></attr>")
+            gxl_file.write("<attr name=\"chem\"><int>" + str(attrs['chem']) + "</int></attr>")
             gxl_file.write("</node>\n")
         for v1, v2, attrs in graph.edges(data=True):
             gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">")
-#            gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['bond_type']) + "</int></attr>")
-            gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>")
+            gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['valence']) + "</int></attr>")
+#            gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>")
             gxl_file.write("</edge>\n")
         gxl_file.write("</graph>\n")
-        gxl_file.write("</gxl>\n")
+        gxl_file.write("</gxl>")
gxl_file.close()
elif method == 'gedlib-letter':
# reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22
@@ -147,15 +147,15 @@ def saveGXL(graph, filename, method='benoit'):
         gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
         gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n")
         gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n")
-        gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">")
+        gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">\n")
         for v, attrs in graph.nodes(data=True):
             gxl_file.write("<node id=\"_" + str(v) + "\">")
             gxl_file.write("<attr name=\"x\"><float>" + str(attrs['attributes'][0]) + "</float></attr>")
             gxl_file.write("<attr name=\"y\"><float>" + str(attrs['attributes'][1]) + "</float></attr>")
-            gxl_file.write("</node>")
+            gxl_file.write("</node>\n")
         for v1, v2, attrs in graph.edges(data=True):
-            gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">")
-            gxl_file.write("</edge>")
+            gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">\n")
+            gxl_file.write("</edge>\n")
         gxl_file.write("</gxl>")
gxl_file.close()
@@ -466,12 +466,15 @@ def loadDataset(filename, filename_y=None, extra_params=None):
def loadFromXML(filename, extra_params):
import xml.etree.ElementTree as ET
- dirname_dataset = dirname(filename)
+ if extra_params:
+ dirname_dataset = extra_params
+ else:
+ dirname_dataset = dirname(filename)
tree = ET.parse(filename)
root = tree.getroot()
data = []
y = []
- for graph in root.iter('print'):
+ for graph in root.iter('graph'):
mol_filename = graph.attrib['file']
mol_class = graph.attrib['class']
data.append(loadGXL(dirname_dataset + '/' + mol_filename))
@@ -541,15 +544,22 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None
dirname_ds += '/'
if not os.path.exists(dirname_ds) :
os.makedirs(dirname_ds)
+
+ if 'graph_dir' in xparams:
+ graph_dir = xparams['graph_dir'] + '/'
+ if not os.path.exists(graph_dir):
+ os.makedirs(graph_dir)
+ else:
+ graph_dir = dirname_ds
if group == 'xml' and gformat == 'gxl':
with open(filename + '.xml', 'w') as fgroup:
             fgroup.write("<?xml version=\"1.0\"?>")
-            fgroup.write("\n<!DOCTYPE GraphCollection>")
+            fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">")
             fgroup.write("\n<GraphCollection>")
for idx, g in enumerate(Gn):
fname_tmp = "graph" + str(idx) + ".gxl"
- saveGXL(g, dirname_ds + fname_tmp, method=xparams['method'])
+ saveGXL(g, graph_dir + fname_tmp, method=xparams['method'])
                 fgroup.write("\n\t<graph file=\"" + fname_tmp + "\" class=\"" + str(y[idx]) + "\"/>")
             fgroup.write("\n</GraphCollection>")
fgroup.close()
@@ -558,18 +568,18 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None
if __name__ == '__main__':
# ### Load dataset from .ds file.
# # .ct files.
- ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds',
- 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'}
- Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y'])
-# ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb
-# Gn, y = loadDataset(ds['dataset'])
-# ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb
-# Gn, y = loadDataset(ds['dataset'])
-# ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled
-# Gn, y = loadDataset(ds['dataset'])
- print(Gn[1].nodes(data=True))
- print(Gn[1].edges(data=True))
- print(y[1])
+# ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds',
+# 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'}
+# Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y'])
+## ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb
+## Gn, y = loadDataset(ds['dataset'])
+## ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb
+## Gn, y = loadDataset(ds['dataset'])
+## ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled
+## Gn, y = loadDataset(ds['dataset'])
+# print(Gn[1].nodes(data=True))
+# print(Gn[1].edges(data=True))
+# print(y[1])
# # .gxl file.
# ds = {'name': 'monoterpenoides',
@@ -579,6 +589,33 @@ if __name__ == '__main__':
# print(Gn[1].edges(data=True))
# print(y[1])
+ ### Convert graph from one format to another.
+ # .gxl file.
+ import networkx as nx
+ ds = {'name': 'monoterpenoides',
+ 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
+ Gn, y = loadDataset(ds['dataset'])
+ y = [int(i) for i in y]
+ print(Gn[1].nodes(data=True))
+ print(Gn[1].edges(data=True))
+ print(y[1])
+ # Convert a graph to the proper NetworkX format that can be recognized by library gedlib.
+ Gn_new = []
+ for G in Gn:
+ G_new = nx.Graph()
+ for nd, attrs in G.nodes(data=True):
+ G_new.add_node(str(nd), chem=attrs['atom'])
+ for nd1, nd2, attrs in G.edges(data=True):
+ G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
+# G_new.add_edge(str(nd1), str(nd2))
+ Gn_new.append(G_new)
+ print(Gn_new[1].nodes(data=True))
+ print(Gn_new[1].edges(data=True))
+ print(Gn_new[1])
+ filename = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl/monoterpenoides'
+ xparams = {'method': 'gedlib'}
+    saveDataset(Gn_new, y, gformat='gxl', group='xml', filename=filename, xparams=xparams)
+
# ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb
# Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
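
For completeness, a minimal sketch of the gedlib-style collection file that the updated loadFromXML parses (root.iter('graph') instead of the GREYC-style root.iter('print')); the two graph entries are illustrative:

    import xml.etree.ElementTree as ET

    collection = """<?xml version="1.0"?>
    <!DOCTYPE GraphCollection SYSTEM "http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd">
    <GraphCollection>
        <graph file="graph0.gxl" class="1"/>
        <graph file="graph1.gxl" class="2"/>
    </GraphCollection>"""

    root = ET.fromstring(collection)
    for graph in root.iter('graph'):  # element name used by the updated loadFromXML
        print(graph.attrib['file'], graph.attrib['class'])
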