
update preimage/ged.py

v0.1
jajupmochi, 5 years ago
commit 1a34c9f18e
13 changed files with 759 additions and 177 deletions:

1. notebooks/run_marginalizedkernel.py (+2, -1)
2. notebooks/run_spkernel.py (+3, -0)
3. notebooks/run_structuralspkernel.py (+2, -0)
4. notebooks/run_treeletkernel.py (+1, -0)
5. notebooks/run_untilhpathkernel.py (+7, -6)
6. notebooks/run_weisfeilerlehmankernel.py (+2, -0)
7. preimage/fitDistance.py (+72, -110)
8. preimage/ged.py (+72, -26)
9. preimage/iam.py (+58, -1)
10. preimage/test_fitDistance.py (+318, -1)
11. preimage/test_iam.py (+126, -1)
12. preimage/utils.py (+33, -5)
13. pygraph/utils/graphfiles.py (+63, -26)

notebooks/run_marginalizedkernel.py (+2, -1)

@@ -28,6 +28,7 @@ dslist = [
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb
#
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb
@@ -57,7 +58,7 @@ estimator = marginalizedkernel
#param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.3, 3),
# 'n_iteration': np.linspace(1, 1, 1),
param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.9, 9),
'n_iteration': np.linspace(5, 20, 4),
'n_iteration': np.linspace(1, 19, 7),
'remove_totters': [False]}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]
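For reference, the updated grid above expands to 9 values of p_quit times 7 values of n_iteration, i.e. 63 precomputed-kernel configurations. A minimal sketch of counting them, assuming sklearn's ParameterGrid (the repo's own model-selection helper is not shown in this diff):

# Sketch: enumerate the precomputed-parameter grid above (assumes sklearn).
import numpy as np
from sklearn.model_selection import ParameterGrid

param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.9, 9),
                          'n_iteration': np.linspace(1, 19, 7),
                          'remove_totters': [False]}
print(len(list(ParameterGrid(param_grid_precomputed))))  # 63 configurations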


notebooks/run_spkernel.py (+3, -0)

@@ -24,6 +24,9 @@ dslist = [
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge
# {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt'},
# # node nsymb symb
#
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb


notebooks/run_structuralspkernel.py (+2, -0)

@@ -30,6 +30,8 @@ dslist = [
# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# # node symb/nsymb
# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
# {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt'},
# # node nsymb symb
#
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb


notebooks/run_treeletkernel.py (+1, -0)

@@ -26,6 +26,7 @@ dslist = [
{'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
{'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb
# node symb/nsymb
# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},


notebooks/run_untilhpathkernel.py (+7, -6)

@@ -27,7 +27,8 @@ dslist = [
{'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
{'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
{'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
{'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
{'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb
#
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb
@@ -54,11 +55,11 @@ dslist = [
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
estimator = untilhpathkernel
param_grid_precomputed = {'depth': np.linspace(3, 10, 8), # [2],
'k_func': [None]} # ['MinMax', 'tanimoto'],
#param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2],
# 'k_func': ['MinMax'], # ['MinMax', 'tanimoto'],
# 'compute_method': ['trie']} # ['MinMax']}
#param_grid_precomputed = {'depth': np.linspace(3, 10, 8), # [2],
# 'k_func': [None]} # ['MinMax', 'tanimoto'],
param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2],
'k_func': ['MinMax', 'tanimoto'], # ['MinMax'], #
'compute_method': ['trie']} # ['MinMax']}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]



notebooks/run_weisfeilerlehmankernel.py (+2, -0)

@@ -30,6 +30,8 @@ dslist = [
{'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
{'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
{'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb

#
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb


preimage/fitDistance.py (+72, -110)

@@ -7,7 +7,7 @@ Created on Wed Oct 16 14:20:06 2019
"""
import numpy as np
from tqdm import tqdm
from itertools import combinations_with_replacement
from itertools import combinations_with_replacement, combinations
import multiprocessing
from multiprocessing import Pool
from functools import partial
@@ -22,110 +22,88 @@ import sys
from ged import GED, get_nb_edit_operations
from utils import kernel_distance_matrix

def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max,
fitkernel=None, gamma=1.0):
def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, k=4,
params_ged={'lib': 'gedlibpy', 'cost': 'CONSTANT',
'method': 'IPFP', 'stabilizer': None},
init_costs=[3, 3, 1, 3, 3, 1],
parallel=True):
# c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them.
# random.seed(1)
cost_rdm = random.sample(range(1, 10), 6)
# edit_costs = cost_rdm + [0]
edit_costs = cost_rdm
# edit_costs = [i * 0.01 for i in cost_rdm] + [0]
# edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
# edit_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
# edit_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0]
idx_cost_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
# cost_rdm = random.sample(range(1, 10), 6)
# init_costs = cost_rdm + [0]
# init_costs = cost_rdm
init_costs = [3, 3, 1, 3, 3, 1]
# init_costs = [i * 0.01 for i in cost_rdm] + [0]
# init_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
# init_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
# init_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0]
# idx_cost_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
# compute distances in feature space.
coef_dk = 1
dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, gkernel=gkernel)
dis_k_vec = []
for i in range(len(dis_k_mat)):
for j in range(i, len(dis_k_mat)):
# for j in range(i, len(dis_k_mat)):
for j in range(i + 1, len(dis_k_mat)):
dis_k_vec.append(dis_k_mat[i, j])
dis_k_vec = np.array(dis_k_vec)
if fitkernel == None:
dis_k_vec_ajusted = dis_k_vec
elif fitkernel == 'gaussian':
coef_dk = 1 / np.max(dis_k_vec)
idx_dk_nonzeros = np.where(dis_k_vec != 0)[0]
# remove 0's and constraint d_k between 0 and 1.
dis_k_vec = dis_k_vec[idx_dk_nonzeros] * coef_dk
dis_k_vec_ajusted = np.sqrt(-np.log(dis_k_vec) / gamma)
residual_list = []
edit_cost_list = []
time_list = []
nb_cost_mat_list = []
# init ged.
print('\ninitial:')
time0 = time.time()
params_ged['edit_cost_constant'] = init_costs
ged_vec_init, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
parallel=parallel)
residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))]
time_list = [time.time() - time0]
edit_cost_list = [init_costs]
nb_cost_mat = np.array(n_edit_operations)
nb_cost_mat_list = [nb_cost_mat]
print('edit_costs:', init_costs)
print('residual_list:', residual_list)
for itr in range(itr_max):
print('\niteration', itr)
time0 = time.time()
# compute GEDs and numbers of edit operations.
edit_cost_constant = [i for i in edit_costs]
edit_cost_list.append(edit_cost_constant)
ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_cost_constant,
idx_cost_nonzeros, parallel=True)
if fitkernel == None:
residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
elif fitkernel == 'gaussian':
ged_all = np.array(ged_all)[idx_dk_nonzeros]
residual = np.sqrt(np.sum(np.square(
np.exp(-gamma * ged_all ** 2) / coef_dk - dis_k_vec)))
residual_list.append(residual)
# "fit" geds to distances in feature space by tuning edit costs using the
# Least Squares Method.
nb_cost_mat = np.array(n_edit_operations).T
if fitkernel == 'gaussian':
nb_cost_mat = nb_cost_mat[idx_dk_nonzeros]
nb_cost_mat_list.append(nb_cost_mat)
edit_costs_new, residual = compute_better_costs(nb_cost_mat, dis_k_vec_ajusted)

print('pseudo residual:', residual)
edit_costs_new, residual = update_costs(nb_cost_mat, dis_k_vec)
for i in range(len(edit_costs_new)):
if edit_costs_new[i] < 0:
if edit_costs_new[i] > -1e-9:
edit_costs_new[i] = 0
else:
raise ValueError('The edit cost is negative.')
for idx, item in enumerate(idx_cost_nonzeros):
edit_costs[item] = edit_costs_new[idx]
# for i in range(len(edit_costs_new)):
# if edit_costs_new[i] < 0:
# edit_costs_new[i] = 0

# compute new GEDs and numbers of edit operations.
params_ged['edit_cost_constant'] = edit_costs_new
ged_vec, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
parallel=parallel)
residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec))))
time_list.append(time.time() - time0)
print('edit_costs:', edit_costs)
edit_cost_list.append(edit_costs_new)
nb_cost_mat = np.array(n_edit_operations)
nb_cost_mat_list.append(nb_cost_mat)
print('edit_costs:', edit_costs_new)
print('residual_list:', residual_list)
print()
edit_cost_list.append(edit_costs)
ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_costs,
idx_cost_nonzeros, parallel=True)
if fitkernel == 0:
residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
elif fitkernel == 'gaussian':
ged_all = np.array(ged_all)[idx_dk_nonzeros]
residual = np.sqrt(np.sum(np.square(
np.exp(-gamma * ged_all ** 2) / coef_dk - dis_k_vec)))
residual_list.append(residual)
nb_cost_mat_list.append(np.array(n_edit_operations).T)
return edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, \
time_list, nb_cost_mat_list, coef_dk
return edit_costs_new, residual_list, edit_cost_list, dis_k_mat, ged_mat, \
time_list, nb_cost_mat_list


def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False):
def compute_geds(Gn, params_ged, parallel=False):
ged_mat = np.zeros((len(Gn), len(Gn)))
if parallel:
# print('parallel')
len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
ged_all = [0 for i in range(len_itr)]
n_edit_operations = [[0 for i in range(len_itr)] for j in
range(len(idx_nonzeros))]
itr = combinations_with_replacement(range(0, len(Gn)), 2)
# len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
len_itr = int(len(Gn) * (len(Gn) - 1) / 2)
ged_vec = [0 for i in range(len_itr)]
n_edit_operations = [0 for i in range(len_itr)]
# itr = combinations_with_replacement(range(0, len(Gn)), 2)
itr = combinations(range(0, len(Gn)), 2)
n_jobs = multiprocessing.cpu_count()
if len_itr < 100 * n_jobs:
chunksize = int(len_itr / n_jobs) + 1
@@ -134,68 +112,52 @@ def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False):
def init_worker(gn_toshare):
global G_gn
G_gn = gn_toshare
do_partial = partial(_wrapper_compute_ged_parallel, edit_cost_constant,
idx_nonzeros)
do_partial = partial(_wrapper_compute_ged_parallel, params_ged)
pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,))
iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
desc='computing GEDs', file=sys.stdout)
# iterator = pool.imap_unordered(do_partial, itr, chunksize)
for i, j, dis, n_eo_tmp in iterator:
idx_itr = int(len(Gn) * i + j - i * (i + 1) / 2)
ged_all[idx_itr] = dis
idx_itr = int(len(Gn) * i + j - (i + 1) * (i + 2) / 2)
ged_vec[idx_itr] = dis
ged_mat[i][j] = dis
ged_mat[j][i] = dis
for idx, item in enumerate(idx_nonzeros):
n_edit_operations[idx][idx_itr] = n_eo_tmp[item]
n_edit_operations[idx_itr] = n_eo_tmp
# print('\n-------------------------------------------')
# print(i, j, idx_itr, dis)
pool.close()
pool.join()
else:
ged_all = []
n_edit_operations = [[] for i in range(len(idx_nonzeros))]
ged_vec = []
n_edit_operations = []
for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
# for i in range(len(Gn)):
for j in range(i, len(Gn)):
# time0 = time.time()
dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], lib='gedlibpy',
cost='CONSTANT', method='IPFP',
edit_cost_constant=edit_cost_constant, stabilizer='min',
repeat=50)
# time1 = time.time() - time0
# time0 = time.time()
ged_all.append(dis)
for j in range(i + 1, len(Gn)):
dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], **params_ged)
ged_vec.append(dis)
ged_mat[i][j] = dis
ged_mat[j][i] = dis
n_eo_tmp = get_nb_edit_operations(Gn[i], Gn[j], pi_forward, pi_backward)
for idx, item in enumerate(idx_nonzeros):
n_edit_operations[idx].append(n_eo_tmp[item])
# time2 = time.time() - time0
# print(time1, time2, time1 / time2)
n_edit_operations.append(n_eo_tmp)
return ged_all, ged_mat, n_edit_operations
return ged_vec, ged_mat, n_edit_operations

def _wrapper_compute_ged_parallel(edit_cost_constant, idx_nonzeros, itr):
def _wrapper_compute_ged_parallel(params_ged, itr):
i = itr[0]
j = itr[1]
dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], edit_cost_constant,
idx_nonzeros)
dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], params_ged)
return i, j, dis, n_eo_tmp


def _compute_ged_parallel(g1, g2, edit_cost_constant, idx_nonzeros):
dis, pi_forward, pi_backward = GED(g1, g2, lib='gedlibpy',
cost='CONSTANT', method='IPFP',
edit_cost_constant=edit_cost_constant, stabilizer='min',
repeat=50)
n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward)
def _compute_ged_parallel(g1, g2, params_ged):
dis, pi_forward, pi_backward = GED(g1, g2, **params_ged)
n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward)
return dis, n_eo_tmp


def compute_better_costs(nb_cost_mat, dis_k_vec):
def update_costs(nb_cost_mat, dis_k_vec):
# # method 1: simple least square method.
# edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
# rcond=None)
@@ -203,7 +165,7 @@ def compute_better_costs(nb_cost_mat, dis_k_vec):
# # method 2: least square method with x_i >= 0.
# edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec)
# method 3: solve as a quadratic program with constraints: x_i >= 0, sum(x) = 1.
# method 3: solve as a quadratic program with constraints.
# P = np.dot(nb_cost_mat.T, nb_cost_mat)
# q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat)
# G = -1 * np.identity(nb_cost_mat.shape[1])
@@ -221,7 +183,7 @@ def compute_better_costs(nb_cost_mat, dis_k_vec):
# h = np.array([0 for i in range(nb_cost_mat.shape[1])])
x = cp.Variable(nb_cost_mat.shape[1])
cost = cp.sum_squares(nb_cost_mat * x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])],
constraints = [x >= [0.0001 for i in range(nb_cost_mat.shape[1])],
# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
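The two constraint rows above encode c_vs <= c_vi + c_vr and c_es <= c_ei + c_er over the cost vector [c_vi, c_vr, c_vs, c_ei, c_er, c_es]. A self-contained sketch of this constrained least-squares update, with made-up toy values for nb_cost_mat (edit-operation counts per graph pair) and dis_k_vec (kernel distances):

import numpy as np
import cvxpy as cp

# toy data: 3 graph pairs, 6 edit-operation counts each (illustrative only)
nb_cost_mat = np.array([[2., 1., 3., 1., 0., 2.],
                        [0., 2., 1., 3., 1., 1.],
                        [1., 0., 2., 2., 2., 0.]])
dis_k_vec = np.array([4.2, 3.1, 2.7])

x = cp.Variable(nb_cost_mat.shape[1])
cost = cp.sum_squares(nb_cost_mat @ x - dis_k_vec)
constraints = [x >= 1e-4,                                      # strictly positive costs
               np.array([1., 1., -1., 0., 0., 0.]) @ x >= 0.,  # c_vs <= c_vi + c_vr
               np.array([0., 0., 0., 1., 1., -1.]) @ x >= 0.]  # c_es <= c_ei + c_er
prob = cp.Problem(cp.Minimize(cost), constraints)
prob.solve()
print(x.value)  # fitted [c_vi, c_vr, c_vs, c_ei, c_er, c_es]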


preimage/ged.py (+72, -26)

@@ -13,29 +13,30 @@ import multiprocessing
from multiprocessing import Pool
from functools import partial

from gedlibpy import librariesImport, gedlibpy
from gedlibpy_linlin import librariesImport, gedlibpy

def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP',
edit_cost_constant=[], stabilizer='min', repeat=50):
edit_cost_constant=[], algo_options='', stabilizer='min', repeat=50):
"""
Compute GED for 2 graphs.
"""
if lib == 'gedlibpy':
def convertGraph(G):
"""Convert a graph to the proper NetworkX format that can be
recognized by library gedlibpy.
"""
G_new = nx.Graph()
for nd, attrs in G.nodes(data=True):
G_new.add_node(str(nd), chem=attrs['atom'])
def convertGraph(G):
"""Convert a graph to the proper NetworkX format that can be
recognized by library gedlibpy.
"""
G_new = nx.Graph()
for nd, attrs in G.nodes(data=True):
G_new.add_node(str(nd), chem=attrs['atom'])
# G_new.add_node(str(nd), x=str(attrs['attributes'][0]),
# y=str(attrs['attributes'][1]))
for nd1, nd2, attrs in G.edges(data=True):
G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
# G_new.add_edge(str(nd1), str(nd2))
return G_new
for nd1, nd2, attrs in G.edges(data=True):
# G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
G_new.add_edge(str(nd1), str(nd2))
return G_new
if lib == 'gedlibpy':
gedlibpy.restart_env()
gedlibpy.add_nx_graph(convertGraph(g1), "")
gedlibpy.add_nx_graph(convertGraph(g2), "")
@@ -43,12 +44,12 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP',
listID = gedlibpy.get_all_graph_ids()
gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant)
gedlibpy.init()
gedlibpy.set_method(method, "")
gedlibpy.set_method(method, algo_options)
gedlibpy.init_method()

g = listID[0]
h = listID[1]
if stabilizer == None:
if stabilizer is None:
gedlibpy.run_method(g, h)
pi_forward = gedlibpy.get_forward_map(g, h)
pi_backward = gedlibpy.get_backward_map(g, h)
@@ -107,13 +108,57 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP',
dis = upper
# make the map label correct (label remove map as np.inf)
nodes1 = [n for n in g1.nodes()]
nodes2 = [n for n in g2.nodes()]
nb1 = nx.number_of_nodes(g1)
nb2 = nx.number_of_nodes(g2)
pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
elif lib == 'gedlib-bash':
import time
import random
import sys
import os
sys.path.insert(0, "../")
from pygraph.utils.graphfiles import saveDataset
tmp_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/'
if not os.path.exists(tmp_dir):
os.makedirs(tmp_dir)
fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, 1e9))
xparams = {'method': 'gedlib', 'graph_dir': fn_collection}
saveDataset([g1, g2], ['dummy', 'dummy'], gformat='gxl', group='xml',
filename=fn_collection, xparams=xparams)
command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n'
command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n'
command += 'export LD_LIBRARY_PATH\n'
command += 'cd \'/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/bin\'\n'
command += './ged_for_python_bash monoterpenoides ' + fn_collection \
+ ' \'' + algo_options + '\' '
for ec in edit_cost_constant:
command += str(ec) + ' '
# output = os.system(command)
stream = os.popen(command)
output = stream.readlines()
# print(output)
dis = float(output[0].strip())
runtime = float(output[1].strip())
size_forward = int(output[2].strip())
pi_forward = [int(item.strip()) for item in output[3:3+size_forward]]
pi_backward = [int(item.strip()) for item in output[3+size_forward:]]

# print(dis)
# print(runtime)
# print(size_forward)
# print(pi_forward)
# print(pi_backward)
# make the map label correct (label remove map as np.inf)
nodes1 = [n for n in g1.nodes()]
nodes2 = [n for n in g2.nodes()]
nb1 = nx.number_of_nodes(g1)
nb2 = nx.number_of_nodes(g2)
pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
# print(pi_forward)
return dis, pi_forward, pi_backward

@@ -149,7 +194,7 @@ def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP',

g = listID[0]
h = listID[1]
if stabilizer == None:
if stabilizer is None:
gedlibpy.run_method(g, h)
pi_forward = gedlibpy.get_forward_map(g, h)
pi_backward = gedlibpy.get_backward_map(g, h)
@@ -183,7 +228,8 @@ def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP',

def ged_median(Gn, Gn_median, verbose=False, params_ged={'lib': 'gedlibpy',
'cost': 'CHEM_1', 'method': 'IPFP', 'edit_cost_constant': [],
'stabilizer': 'min', 'repeat': 50}, parallel=False):
'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1',
'stabilizer': None}, parallel=False):
if parallel:
len_itr = int(len(Gn))
pi_forward_list = [[] for i in range(len_itr)]
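For orientation, a minimal call of the updated GED() wrapper might look like the sketch below; it assumes the gedlibpy backend (now imported from gedlibpy_linlin) is built and importable, and uses the node label 'atom' that convertGraph() expects:

# Sketch: one pairwise GED call through the wrapper above (assumptions noted).
import networkx as nx
from ged import GED

g1 = nx.Graph()
g1.add_node(0, atom='C'); g1.add_node(1, atom='O')
g1.add_edge(0, 1, bond_type='1')
g2 = nx.Graph()
g2.add_node(0, atom='C')

dis, pi_forward, pi_backward = GED(
    g1, g2, lib='gedlibpy', cost='CONSTANT', method='IPFP',
    edit_cost_constant=[3, 3, 1, 3, 3, 1],
    algo_options='--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1',
    stabilizer=None)
print(dis)          # edit distance under the given constant costs
print(pi_forward)   # node map g1 -> g2; np.inf marks a removed node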


preimage/iam.py (+58, -1)

@@ -23,7 +23,8 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
connected=False, removeNodes=True, allBestInit=False, allBestNodes=False,
allBestEdges=False, allBestOutput=False,
params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP',
'edit_cost_constant': [], 'stabilizer': 'min', 'repeat': 50}):
'edit_cost_constant': [], 'stabilizer': None,
'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'}):
"""See my name, then you know what I do.
"""
# Gn_median = Gn_median[0:10]
@@ -435,6 +436,62 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median


def iam_bash(Gn_names, edit_cost_constant, dataset='monoterpenoides',
graph_dir='/media/ljia/DATA/research-repo/codes/Linlin/py-graph/datasets/monoterpenoides/'):
"""Compute the iam by c++ implementation (gedlib) through bash.
"""
import os
import time

def createCollectionFile(Gn_names, y, filename):
"""Create collection file.
"""
dirname_ds = os.path.dirname(filename)
if dirname_ds != '':
dirname_ds += '/'
if not os.path.exists(dirname_ds) :
os.makedirs(dirname_ds)
with open(filename + '.xml', 'w') as fgroup:
fgroup.write("<?xml version=\"1.0\"?>")
fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">")
fgroup.write("\n<GraphCollection>")
for idx, fname in enumerate(Gn_names):
fgroup.write("\n\t<graph file=\"" + fname + "\" class=\"" + str(y[idx]) + "\"/>")
fgroup.write("\n</GraphCollection>")
fgroup.close()

tmp_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/'
fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, 1e9))
createCollectionFile(Gn_names, ['dummy'] * len(Gn_names), fn_collection)
# graph_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl'
command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n'
command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n'
command += 'export LD_LIBRARY_PATH\n'
command += 'cd \'/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/bin\'\n'
command += './iam_for_python_bash ' + dataset + ' ' + fn_collection \
+ ' \'' + graph_dir + '\' '
if edit_cost_constant is None:
command += 'None'
else:
for ec in edit_cost_constant:
command += str(ec) + ' '
# output = os.system(command)
stream = os.popen(command)

output = stream.readlines()
# print(output)
sod_sm = float(output[0].strip())
sod_gm= float(output[1].strip())
fname_sm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/set_median.gxl'
fname_gm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/gen_median.gxl'
return sod_sm, sod_gm, fname_sm, fname_gm
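
A hypothetical usage sketch of iam_bash(); the GXL file names are placeholders, and the hard-coded gedlib paths inside the function must exist on the machine:

from iam import iam_bash

Gn_names = ['graph0.gxl', 'graph1.gxl', 'graph2.gxl']  # placeholders
edit_costs = [3, 3, 1, 3, 3, 1]  # c_vi, c_vr, c_vs, c_ei, c_er, c_es
sod_sm, sod_gm, fname_sm, fname_gm = iam_bash(Gn_names, edit_costs,
                                              dataset='monoterpenoides')
print(sod_sm, sod_gm)  # sums of distances of the set / generalized median
print(fname_gm)        # path of the generalized-median GXL written by the binary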



###############################################################################
# Old implementations.


preimage/test_fitDistance.py (+318, -1)

@@ -16,6 +16,319 @@ from utils import remove_edges
from fitDistance import fit_GED_to_kernel_distance
from utils import normalize_distance_matrix


def median_paper_clcpc_python_best():
"""c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
python invoking the c++ code by bash command (with updated library).
"""
# ds = {'name': 'monoterpenoides',
# 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
# _, y_all = loadDataset(ds['dataset'])
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'
itr_max = 6
algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
'algo_options': algo_options, 'stabilizer': None}
y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
repeats = 50
collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
graph_dir = collection_path + 'gxl/'
fn_edit_costs_output = 'results/median_paper/edit_costs_output.python_init40.k10.txt'

for y in y_all:
for repeat in range(repeats):
edit_costs_output_file = open(fn_edit_costs_output, 'a')
collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
nb_cost_mat_list = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
gkernel, itr_max, params_ged=params_ged,
parallel=True)
total_time = np.sum(time_list)
# print('\nedit_costs:', edit_costs)
# print('\nresidual_list:', residual_list)
# print('\nedit_cost_list:', edit_cost_list)
# print('\ndistance matrix in kernel space:', dis_k_mat)
# print('\nged matrix:', ged_mat)
# print('\ntotal time:', total_time)
# print('\nnb_cost_mat:', nb_cost_mat_list[-1])
np.savez('results/median_paper/fit_distance.clcpc.python_init40.monot.elabeled.uhpkernel.y'
+ y + '.repeat' + str(repeat) + '.k10..gm',
edit_costs=edit_costs,
residual_list=residual_list, edit_cost_list=edit_cost_list,
dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
for ec in edit_costs:
edit_costs_output_file.write(str(ec) + ' ')
edit_costs_output_file.write('\n')
edit_costs_output_file.close()
# # normalized distance matrices.
# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
# edit_costs = gmfile['edit_costs']
# residual_list = gmfile['residual_list']
# edit_cost_list = gmfile['edit_cost_list']
# dis_k_mat = gmfile['dis_k_mat']
# ged_mat = gmfile['ged_mat']
# total_time = gmfile['total_time']
# nb_cost_mat_list = gmfile['nb_cost_mat_list']
nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
print(nb_consistent, nb_inconsistent, ratio_consistent)
# norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
# plt.imshow(norm_dis_k_mat)
# plt.colorbar()
# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.png', format='png')
# # plt.show()
# plt.clf()
#
# norm_ged_mat = normalize_distance_matrix(ged_mat)
# plt.imshow(norm_ged_mat)
# plt.colorbar()
# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.png', format='png')
# # plt.show()
# plt.clf()
#
# norm_diff = norm_ged_mat - norm_dis_k_mat
# plt.imshow(norm_diff)
# plt.colorbar()
# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.png', format='png')
# # plt.show()
# plt.clf()
# # draw_count_bar(norm_diff)


def median_paper_clcpc_python_bash_cpp():
"""c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
python invoking the c++ code by bash command (with updated library).
"""
# ds = {'name': 'monoterpenoides',
# 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
# _, y_all = loadDataset(ds['dataset'])
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'
itr_max = 20
algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP',
'algo_options': algo_options}
y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
repeats = 50
collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
graph_dir = collection_path + 'gxl/'
fn_edit_costs_output = 'results/median_paper/edit_costs_output.txt'

for y in y_all:
for repeat in range(repeats):
edit_costs_output_file = open(fn_edit_costs_output, 'a')
collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
gkernel, itr_max, params_ged=params_ged,
parallel=False)
total_time = np.sum(time_list)
# print('\nedit_costs:', edit_costs)
# print('\nresidual_list:', residual_list)
# print('\nedit_cost_list:', edit_cost_list)
# print('\ndistance matrix in kernel space:', dis_k_mat)
# print('\nged matrix:', ged_mat)
# print('\ntotal time:', total_time)
# print('\nnb_cost_mat:', nb_cost_mat_list[-1])
np.savez('results/median_paper/fit_distance.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
+ y + '.repeat' + str(repeat) + '.gm',
edit_costs=edit_costs,
residual_list=residual_list, edit_cost_list=edit_cost_list,
dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
coef_dk=coef_dk)
for ec in edit_costs:
edit_costs_output_file.write(str(ec) + ' ')
edit_costs_output_file.write('\n')
edit_costs_output_file.close()
# # normalized distance matrices.
# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
# edit_costs = gmfile['edit_costs']
# residual_list = gmfile['residual_list']
# edit_cost_list = gmfile['edit_cost_list']
# dis_k_mat = gmfile['dis_k_mat']
# ged_mat = gmfile['ged_mat']
# total_time = gmfile['total_time']
# nb_cost_mat_list = gmfile['nb_cost_mat_list']
# coef_dk = gmfile['coef_dk']
nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
print(nb_consistent, nb_inconsistent, ratio_consistent)
# norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
# plt.imshow(norm_dis_k_mat)
# plt.colorbar()
# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.png', format='png')
# # plt.show()
# plt.clf()
#
# norm_ged_mat = normalize_distance_matrix(ged_mat)
# plt.imshow(norm_ged_mat)
# plt.colorbar()
# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.png', format='png')
# # plt.show()
# plt.clf()
#
# norm_diff = norm_ged_mat - norm_dis_k_mat
# plt.imshow(norm_diff)
# plt.colorbar()
# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.png', format='png')
# # plt.show()
# plt.clf()
# # draw_count_bar(norm_diff)





def test_cs_leq_ci_plus_cr_python_bash_cpp():
"""c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
python invoking the c++ code by bash command (with updated library).
"""
ds = {'name': 'monoterpenoides',
'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'])
# Gn = Gn[0:10]
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'
itr_max = 10
algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP',
'algo_options': algo_options}
edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
gkernel, itr_max, params_ged=params_ged,
parallel=False)
total_time = np.sum(time_list)
print('\nedit_costs:', edit_costs)
print('\nresidual_list:', residual_list)
print('\nedit_cost_list:', edit_cost_list)
print('\ndistance matrix in kernel space:', dis_k_mat)
print('\nged matrix:', ged_mat)
print('\ntotal time:', total_time)
print('\nnb_cost_mat:', nb_cost_mat_list[-1])
np.savez('results/fit_distance.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel.gm',
edit_costs=edit_costs,
residual_list=residual_list, edit_cost_list=edit_cost_list,
dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
coef_dk=coef_dk)
# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
# 'extra_params': {}} # node/edge symb
# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
## Gn = Gn[0:10]
## remove_edges(Gn)
# gkernel = 'untilhpathkernel'
# node_label = 'atom'
# edge_label = 'bond_type'
# itr_max = 10
# edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
# nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
# gkernel, itr_max)
# total_time = np.sum(time_list)
# print('\nedit_costs:', edit_costs)
# print('\nresidual_list:', residual_list)
# print('\nedit_cost_list:', edit_cost_list)
# print('\ndistance matrix in kernel space:', dis_k_mat)
# print('\nged matrix:', ged_mat)
# print('\ntotal time:', total_time)
# print('\nnb_cost_mat:', nb_cost_mat_list[-1])
# np.savez('results/fit_distance.cs_leq_ci_plus_cr.mutag.elabeled.uhpkernel.gm',
# edit_costs=edit_costs,
# residual_list=residual_list, edit_cost_list=edit_cost_list,
# dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
# total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk)
# # normalized distance matrices.
# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.monot.elabeled.uhpkernel.gm.npz')
# edit_costs = gmfile['edit_costs']
# residual_list = gmfile['residual_list']
# edit_cost_list = gmfile['edit_cost_list']
# dis_k_mat = gmfile['dis_k_mat']
# ged_mat = gmfile['ged_mat']
# total_time = gmfile['total_time']
# nb_cost_mat_list = gmfile['nb_cost_mat_list']
# coef_dk = gmfile['coef_dk']
nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
print(nb_consistent, nb_inconsistent, ratio_consistent)
# dis_k_sub = pairwise_substitution(dis_k_mat)
# ged_sub = pairwise_substitution(ged_mat)
# np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.gm',
# dis_k_sub=dis_k_sub, ged_sub=ged_sub)
norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
plt.imshow(norm_dis_k_mat)
plt.colorbar()
plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ '.eps', format='eps', dpi=300)
plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ '.png', format='png')
# plt.show()
plt.clf()
norm_ged_mat = normalize_distance_matrix(ged_mat)
plt.imshow(norm_ged_mat)
plt.colorbar()
plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ '.eps', format='eps', dpi=300)
plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ '.png', format='png')
# plt.show()
plt.clf()
norm_diff = norm_ged_mat - norm_dis_k_mat
plt.imshow(norm_diff)
plt.colorbar()
plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ '.eps', format='eps', dpi=300)
plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ '.png', format='png')
# plt.show()
plt.clf()
# draw_count_bar(norm_diff)


def test_anycosts():
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
@@ -295,8 +608,12 @@ def draw_count_bar(norm_diff):
if __name__ == '__main__':
# test_anycosts()
test_cs_leq_ci_plus_cr()
# test_cs_leq_ci_plus_cr()
# test_unfitted()
# test_cs_leq_ci_plus_cr_python_bash_cpp()
# median_paper_clcpc_python_bash_cpp()
median_paper_clcpc_python_best()

# x = np.array([[1,2,3],[4,5,6],[7,8,9]])
# xx = pairwise_substitution(x)
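
Since every run above is archived with np.savez, the results can be inspected later; a sketch, assuming class y = '3' and repeat 0 (np.savez appends the '.npz' suffix to the name built in the code):

import numpy as np

gmfile = np.load('results/median_paper/fit_distance.clcpc.python_init40.'
                 'monot.elabeled.uhpkernel.y3.repeat0.k10..gm.npz')
print(gmfile['edit_costs'])     # fitted [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
print(gmfile['residual_list'])  # residual after each fitting iteration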

preimage/test_iam.py (+126, -1)

@@ -22,6 +22,130 @@ from iam import iam_upgraded
from utils import remove_edges, compute_kernel, get_same_item_indices, dis_gstar
#from ged import ged_median


def test_iam_monoterpenoides_with_init40():
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'
# unfitted edit costs.
c_vi = 3
c_vr = 3
c_vs = 1
c_ei = 3
c_er = 3
c_es = 1
ite_max_iam = 50
epsilon_iam = 0.0001
removeNodes = False
connected_iam = False
# parameters for IAM function
# ged_cost = 'CONSTANT'
ged_cost = 'CONSTANT'
ged_method = 'IPFP'
edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
ged_stabilizer = None
# ged_repeat = 50
algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
'edit_cost_constant': edit_cost_constant,
'algo_options': algo_options,
'stabilizer': ged_stabilizer}

collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
graph_dir = collection_path + 'gxl/'
y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
repeats = 50
# classify graphs according to classes.
time_list = []
dis_ks_min_list = []
dis_ks_set_median_list = []
sod_gs_list = []
g_best = []
sod_set_median_list = []
sod_list_list = []
for y in y_all:
print('\n-------------------------------------------------------')
print('class of y:', y)
time_list.append([])
dis_ks_min_list.append([])
dis_ks_set_median_list.append([])
sod_gs_list.append([])
g_best.append([])
sod_set_median_list.append([])
for repeat in range(repeats):
# load median set.
collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
Gn_median, _ = loadDataset(collection_file, extra_params=graph_dir)
Gn_candidate = [g.copy() for g in Gn_median]
time0 = time.time()
G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \
= iam_upgraded(Gn_median,
Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
epsilon=epsilon_iam, node_label=node_label, edge_label=edge_label,
connected=connected_iam, removeNodes=removeNodes,
params_ged=params_ged)
time_total = time.time() - time0
print('\ntime: ', time_total)
time_list[-1].append(time_total)
g_best[-1].append(G_gen_median_list[0])
sod_set_median_list[-1].append(sod_set_median)
print('\nsmallest sod of the set median:', sod_set_median)
sod_gs_list[-1].append(sod_gen_median)
print('\nsmallest sod in graph space:', sod_gen_median)
sod_list_list.append(sod_list)
# # show the best graph and save it to file.
# print('one of the possible corresponding pre-images is')
# nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'),
# with_labels=True)
## plt.show()
# # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) +
## plt.savefig('results/iam/paper_compare/monoter_y' + str(y_class) +
## '_repeat' + str(repeat) + '_' + str(time.time()) +
## '.png', format="PNG")
# plt.clf()
# # print(G_gen_median_list[0].nodes(data=True))
# # print(G_gen_median_list[0].edges(data=True))
print('\nsods of the set median for this class:', sod_set_median_list[-1])
print('\nsods in graph space for this class:', sod_gs_list[-1])
# print('\ndistance in kernel space of set median for this class:',
# dis_ks_set_median_list[-1])
# print('\nsmallest distances in kernel space for this class:',
# dis_ks_min_list[-1])
print('\ntimes for this class:', time_list[-1])
sod_set_median_list[-1] = np.mean(sod_set_median_list[-1])
sod_gs_list[-1] = np.mean(sod_gs_list[-1])
# dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1])
# dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1])
time_list[-1] = np.mean(time_list[-1])
print()
print('\nmean sods of the set median for each class:', sod_set_median_list)
print('\nmean sods in graph space for each class:', sod_gs_list)
# print('\ndistances in kernel space of set median for each class:',
# dis_ks_set_median_list)
# print('\nmean smallest distances in kernel space for each class:',
# dis_ks_min_list)
print('\nmean times for each class:', time_list)
print('\nmean sods of the set median of all:', np.mean(sod_set_median_list))
print('\nmean sods in graph space of all:', np.mean(sod_gs_list))
# print('\nmean distances in kernel space of set median of all:',
# np.mean(dis_ks_set_median_list))
# print('\nmean smallest distances in kernel space of all:',
# np.mean(dis_ks_min_list))
print('\nmean times of all:', np.mean(time_list))




def test_iam_monoterpenoides():
ds = {'name': 'monoterpenoides',
'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
@@ -834,9 +958,10 @@ if __name__ == '__main__':
# tests on different numbers of median-sets.
# test_iam_median_nb()
# test_iam_letter_h()
test_iam_monoterpenoides()
# test_iam_monoterpenoides()
# test_iam_mutag()
# test_iam_fitdistance()
# print("test log")
test_iam_monoterpenoides_with_init40()

preimage/utils.py (+33, -5)

@@ -17,8 +17,10 @@ from pygraph.kernels.marginalizedKernel import marginalizedkernel
from pygraph.kernels.untilHPathKernel import untilhpathkernel
from pygraph.kernels.spKernel import spkernel
import functools
from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct
from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct, polynomialkernel
from pygraph.kernels.structuralspKernel import structuralspkernel
from pygraph.kernels.treeletKernel import treeletkernel
from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel


def remove_edges(Gn):
@@ -46,18 +48,29 @@ def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose):
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'untilhpathkernel':
Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label,
depth=10, k_func='MinMax', compute_method='trie',
depth=7, k_func='MinMax', compute_method='trie',
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'spkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
Kmatrix, _, _ = spkernel(Gn, node_label='atom', node_kernels=
Kmatrix, _, _ = spkernel(Gn, node_label=node_label, node_kernels=
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel},
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'structuralspkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
Kmatrix, _ = structuralspkernel(Gn, node_label='atom', node_kernels=
Kmatrix, _ = structuralspkernel(Gn, node_label=node_label, node_kernels=
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel},
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'treeletkernel':
# pkernel = functools.partial(polynomialkernel, d=2, c=1e5)
pkernel = functools.partial(gaussiankernel, gamma=1e-6)
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
Kmatrix, _ = treeletkernel(Gn, node_label=node_label, edge_label=edge_label,
sub_kernel=pkernel,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'weisfeilerlehmankernel':
Kmatrix, _ = weisfeilerlehmankernel(Gn, node_label=node_label, edge_label=edge_label,
height=4, base_kernel='subtree',
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
# normalization
Kmatrix_diag = Kmatrix.diagonal().copy()
@@ -79,7 +92,7 @@ def gram2distances(Kmatrix):

def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None, gkernel=None):
dis_mat = np.empty((len(Gn), len(Gn)))
if Kmatrix == None:
if Kmatrix is None:
Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True)
for i in range(len(Gn)):
for j in range(i, len(Gn)):
@@ -109,6 +122,21 @@ def get_same_item_indices(ls):
return idx_dict


def k_nearest_neighbors_to_median_in_kernel_space(Gn, Kmatrix=None, gkernel=None,
node_label=None, edge_label=None):
dis_k_all = [] # distance between the median g_star and each graph.
alpha = [1 / len(Gn)] * len(Gn)
if Kmatrix is None:
Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True)
idx_gi = list(range(len(Gn)))
term3 = 0
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
dis_k_all.append(dtemp)


def normalize_distance_matrix(D):
max_value = np.amax(D)
min_value = np.amin(D)
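
For reference, a minimal sketch of the two kernel-space computations used throughout preimage/: the cosine normalization applied after compute_kernel() and the kernel-induced distance behind kernel_distance_matrix(), shown on a toy Gram matrix:

import numpy as np

K = np.array([[4.0, 1.6, 0.6],
              [1.6, 2.0, 0.5],
              [0.6, 0.5, 1.0]])  # toy Gram matrix

# normalization: K'[i,j] = K[i,j] / sqrt(K[i,i] * K[j,j])
K_norm = K / np.sqrt(np.outer(K.diagonal(), K.diagonal()))

# kernel-induced distance: d(i,j) = sqrt(K'[i,i] + K'[j,j] - 2 * K'[i,j])
diag = K_norm.diagonal()
dis = np.sqrt(np.maximum(diag[:, None] + diag[None, :] - 2 * K_norm, 0))
print(dis)  # symmetric, zero diagonal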

pygraph/utils/graphfiles.py (+63, -26)

@@ -124,21 +124,21 @@ def saveGXL(graph, filename, method='benoit'):
# reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22
# pass
gxl_file = open(filename, 'w')
gxl_file.write("<?xml version=\"1.0\"?>\n")
gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n")
gxl_file.write("<gxl>\n")
gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n")
gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"true\" edgemode=\"undirected\">\n")
for v, attrs in graph.nodes(data=True):
gxl_file.write("<node id=\"_" + str(v) + "\">")
gxl_file.write("<attr name=\"" + "chem" + "\"><int>" + str(attrs['atom']) + "</int></attr>")
gxl_file.write("<attr name=\"" + "chem" + "\"><int>" + str(attrs['chem']) + "</int></attr>")
gxl_file.write("</node>\n")
for v1, v2, attrs in graph.edges(data=True):
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">")
# gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['bond_type']) + "</int></attr>")
gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>")
gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['valence']) + "</int></attr>")
# gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>")
gxl_file.write("</edge>\n")
gxl_file.write("</graph>\n")
gxl_file.write("</gxl>\n")
gxl_file.write("</gxl>")
gxl_file.close()
elif method == 'gedlib-letter':
# reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22
@@ -147,15 +147,15 @@ def saveGXL(graph, filename, method='benoit'):
gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n")
gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n")
gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">")
gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">\n")
for v, attrs in graph.nodes(data=True):
gxl_file.write("<node id=\"_" + str(v) + "\">")
gxl_file.write("<attr name=\"x\"><float>" + str(attrs['attributes'][0]) + "</float></attr>")
gxl_file.write("<attr name=\"y\"><float>" + str(attrs['attributes'][1]) + "</float></attr>")
gxl_file.write("</node>")
gxl_file.write("</node>\n")
for v1, v2, attrs in graph.edges(data=True):
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\"/>")
gxl_file.write("</graph>")
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\"/>\n")
gxl_file.write("</graph>\n")
gxl_file.write("</gxl>")
gxl_file.close()

@@ -466,12 +466,15 @@ def loadDataset(filename, filename_y=None, extra_params=None):
def loadFromXML(filename, extra_params):
import xml.etree.ElementTree as ET
dirname_dataset = dirname(filename)
if extra_params:
dirname_dataset = extra_params
else:
dirname_dataset = dirname(filename)
tree = ET.parse(filename)
root = tree.getroot()
data = []
y = []
for graph in root.iter('print'):
for graph in root.iter('graph'):
mol_filename = graph.attrib['file']
mol_class = graph.attrib['class']
data.append(loadGXL(dirname_dataset + '/' + mol_filename))
@@ -541,15 +544,22 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None
dirname_ds += '/'
if not os.path.exists(dirname_ds) :
os.makedirs(dirname_ds)
if 'graph_dir' in xparams:
graph_dir = xparams['graph_dir'] + '/'
if not os.path.exists(graph_dir):
os.makedirs(graph_dir)
else:
graph_dir = dirname_ds
if group == 'xml' and gformat == 'gxl':
with open(filename + '.xml', 'w') as fgroup:
fgroup.write("<?xml version=\"1.0\"?>")
fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"https://dbblumenthal.github.io/gedlib/GraphCollection_8dtd_source.html\">")
fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">")
fgroup.write("\n<GraphCollection>")
for idx, g in enumerate(Gn):
fname_tmp = "graph" + str(idx) + ".gxl"
saveGXL(g, dirname_ds + fname_tmp, method=xparams['method'])
saveGXL(g, graph_dir + fname_tmp, method=xparams['method'])
fgroup.write("\n\t<graph file=\"" + fname_tmp + "\" class=\"" + str(y[idx]) + "\"/>")
fgroup.write("\n</GraphCollection>")
fgroup.close()
@@ -558,18 +568,18 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None
if __name__ == '__main__':
# ### Load dataset from .ds file.
# # .ct files.
ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds',
'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'}
Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y'])
# ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb
# Gn, y = loadDataset(ds['dataset'])
# ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb
# Gn, y = loadDataset(ds['dataset'])
# ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled
# Gn, y = loadDataset(ds['dataset'])
print(Gn[1].nodes(data=True))
print(Gn[1].edges(data=True))
print(y[1])
# ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds',
# 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'}
# Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y'])
## ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb
## Gn, y = loadDataset(ds['dataset'])
## ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb
## Gn, y = loadDataset(ds['dataset'])
## ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled
## Gn, y = loadDataset(ds['dataset'])
# print(Gn[1].nodes(data=True))
# print(Gn[1].edges(data=True))
# print(y[1])
# # .gxl file.
# ds = {'name': 'monoterpenoides',
@@ -579,6 +589,33 @@ if __name__ == '__main__':
# print(Gn[1].edges(data=True))
# print(y[1])
### Convert graph from one format to another.
# .gxl file.
import networkx as nx
ds = {'name': 'monoterpenoides',
'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
Gn, y = loadDataset(ds['dataset'])
y = [int(i) for i in y]
print(Gn[1].nodes(data=True))
print(Gn[1].edges(data=True))
print(y[1])
# Convert a graph to the proper NetworkX format that can be recognized by library gedlib.
Gn_new = []
for G in Gn:
G_new = nx.Graph()
for nd, attrs in G.nodes(data=True):
G_new.add_node(str(nd), chem=attrs['atom'])
for nd1, nd2, attrs in G.edges(data=True):
G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
# G_new.add_edge(str(nd1), str(nd2))
Gn_new.append(G_new)
print(Gn_new[1].nodes(data=True))
print(Gn_new[1].edges(data=True))
print(Gn_new[1])
filename = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl/monoterpenoides'
xparams = {'method': 'gedlib'}
saveDataset(Gn, y, gformat='gxl', group='xml', filename=filename, xparams=xparams)
# ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb
# Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
