
update pre-image.

v0.1
jajupmochi committed 5 years ago (commit b7112cd60f)
11 changed files with 1306 additions and 226 deletions
  1. +1   -0   notebooks/run_spkernel.py
  2. +47  -16  preimage/fitDistance.py
  3. +106 -17  preimage/ged.py
  4. +36  -26  preimage/iam.py
  5. +2   -2   preimage/median.py
  6. +8   -6   preimage/preimage_iam.py
  7. +212 -24  preimage/test_fitDistance.py
  8. +741 -66  preimage/test_iam.py
  9. +30  -16  preimage/test_preimage_iam.py
 10. +5   -5   preimage/utils.py
 11. +118 -48  pygraph/utils/graphfiles.py

+1 -0  notebooks/run_spkernel.py

@@ -74,6 +74,7 @@ for ds in dslist:
extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
ds_name=ds['name'],
n_jobs=multiprocessing.cpu_count(),
# n_jobs=7,
read_gm_from_file=False,
verbose=True)
print()

+47 -16  preimage/fitDistance.py

@@ -18,31 +18,44 @@ from scipy import optimize
import cvxpy as cp

import sys
sys.path.insert(0, "../")
#sys.path.insert(0, "../")
from ged import GED, get_nb_edit_operations
from utils import kernel_distance_matrix

def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max,
fitkernel=None, gamma=1.0):
# c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them.
random.seed(1)
cost_rdm = random.sample(range(1, 10), 5)
edit_costs = cost_rdm + [0]
# random.seed(1)
cost_rdm = random.sample(range(1, 10), 6)
# edit_costs = cost_rdm + [0]
edit_costs = cost_rdm
# edit_costs = [i * 0.01 for i in cost_rdm] + [0]
# edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
# edit_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
# edit_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0]
idx_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
idx_cost_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
# compute distances in feature space.
dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, gkernel=gkernel)
coef_dk = 1
dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, gkernel=gkernel)
dis_k_vec = []
for i in range(len(dis_k_mat)):
for j in range(i, len(dis_k_mat)):
dis_k_vec.append(dis_k_mat[i, j])
dis_k_vec = np.array(dis_k_vec)
if fitkernel == None:
dis_k_vec_ajusted = dis_k_vec
elif fitkernel == 'gaussian':
coef_dk = 1 / np.max(dis_k_vec)
idx_dk_nonzeros = np.where(dis_k_vec != 0)[0]
# remove 0's and constraint d_k between 0 and 1.
dis_k_vec = dis_k_vec[idx_dk_nonzeros] * coef_dk
dis_k_vec_ajusted = np.sqrt(-np.log(dis_k_vec) / gamma)
residual_list = []
edit_cost_list = []
time_list = []
nb_cost_mat_list = []
for itr in range(itr_max):
print('\niteration', itr)
@@ -52,15 +65,23 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
edit_cost_list.append(edit_cost_constant)
ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_cost_constant,
idx_nonzeros, parallel=True)
idx_cost_nonzeros, parallel=True)
residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
if fitkernel == None:
residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
elif fitkernel == 'gaussian':
ged_all = np.array(ged_all)[idx_dk_nonzeros]
residual = np.sqrt(np.sum(np.square(
np.exp(-gamma * ged_all ** 2) / coef_dk - dis_k_vec)))
residual_list.append(residual)
# "fit" geds to distances in feature space by tuning edit costs using the
# Least Squares Method.
nb_cost_mat = np.array(n_edit_operations).T
edit_costs_new, residual = compute_better_costs(nb_cost_mat, dis_k_vec)
if fitkernel == 'gaussian':
nb_cost_mat = nb_cost_mat[idx_dk_nonzeros]
nb_cost_mat_list.append(nb_cost_mat)
edit_costs_new, residual = compute_better_costs(nb_cost_mat, dis_k_vec_ajusted)

print('pseudo residual:', residual)
for i in range(len(edit_costs_new)):
@@ -70,7 +91,7 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
else:
raise ValueError('The edit cost is negative.')
for idx, item in enumerate(idx_nonzeros):
for idx, item in enumerate(idx_cost_nonzeros):
edit_costs[item] = edit_costs_new[idx]
time_list.append(time.time() - time0)
@@ -78,14 +99,21 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
print('edit_costs:', edit_costs)
print('residual_list:', residual_list)
print()
edit_cost_list.append(edit_costs)
ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_costs,
idx_nonzeros, parallel=True)
residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
idx_cost_nonzeros, parallel=True)
if fitkernel == None:
residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
elif fitkernel == 'gaussian':
ged_all = np.array(ged_all)[idx_dk_nonzeros]
residual = np.sqrt(np.sum(np.square(
np.exp(-gamma * ged_all ** 2) / coef_dk - dis_k_vec)))
residual_list.append(residual)
nb_cost_mat_list.append(np.array(n_edit_operations).T)
return edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list
return edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, \
time_list, nb_cost_mat_list, coef_dk
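With fitkernel='gaussian', the kernel distances are first rescaled into (0, 1] by coef_dk = 1 / max(d_k) (zero entries dropped) and then mapped through d' = sqrt(-log(d) / gamma), so the least-squares step fits GEDs whose Gaussian image exp(-gamma * ged^2) tracks the rescaled kernel distances. A minimal usage sketch of the new signature, assuming the monoterpenoides dataset used by the tests below; the eight return values follow the new return statement above:

    from pygraph.utils.graphfiles import loadDataset
    from fitDistance import fit_GED_to_kernel_distance

    Gn, y_all = loadDataset('../datasets/monoterpenoides/dataset_10+.ds')
    edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
        nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(
            Gn, 'atom', 'bond_type', 'untilhpathkernel', itr_max=10,
            fitkernel='gaussian', gamma=1.0)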


def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False):
@@ -193,7 +221,10 @@ def compute_better_costs(nb_cost_mat, dis_k_vec):
# h = np.array([0 for i in range(nb_cost_mat.shape[1])])
x = cp.Variable(nb_cost_mat.shape[1])
cost = cp.sum_squares(nb_cost_mat * x - dis_k_vec)
constraints = [x >= [0 for i in range(nb_cost_mat.shape[1])]]
constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])],
# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
prob = cp.Problem(cp.Minimize(cost), constraints)
prob.solve()
edit_costs_new = x.value
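The tightened constraints keep every cost at or above 0.01 and add the triangle-like inequalities c_vi + c_vr >= c_vs and c_ei + c_er >= c_es over the cost vector [c_vi, c_vr, c_vs, c_ei, c_er, c_es]. A self-contained sketch of this quadratic program, with random toy data standing in for the real edit-operation counts and kernel distances:

    import cvxpy as cp
    import numpy as np

    rng = np.random.RandomState(0)
    nb_cost_mat = rng.rand(45, 6)  # toy: 45 graph pairs x 6 edit operations
    dis_k_vec = rng.rand(45)       # toy kernel distances

    x = cp.Variable(6)
    cost = cp.sum_squares(nb_cost_mat @ x - dis_k_vec)
    constraints = [x >= 0.01,
                   np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]) @ x >= 0.0,
                   np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]) @ x >= 0.0]
    cp.Problem(cp.Minimize(cost), constraints).solve()
    print(x.value)  # fitted [c_vi, c_vr, c_vs, c_ei, c_er, c_es]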


+106 -17  preimage/ged.py

@@ -9,11 +9,14 @@ import numpy as np
import networkx as nx
from tqdm import tqdm
import sys
import multiprocessing
from multiprocessing import Pool
from functools import partial

from gedlibpy import librariesImport, gedlibpy

def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP',
edit_cost_constant=[], saveGXL='benoit', stabilizer='min', repeat=50):
edit_cost_constant=[], stabilizer='min', repeat=50):
"""
Compute GED for 2 graphs.
"""
@@ -25,9 +28,11 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP',
G_new = nx.Graph()
for nd, attrs in G.nodes(data=True):
G_new.add_node(str(nd), chem=attrs['atom'])
# G_new.add_node(str(nd), x=str(attrs['attributes'][0]),
# y=str(attrs['attributes'][1]))
for nd1, nd2, attrs in G.edges(data=True):
# G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
G_new.add_edge(str(nd1), str(nd2))
G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
# G_new.add_edge(str(nd1), str(nd2))
return G_new
@@ -49,6 +54,32 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP',
pi_backward = gedlibpy.get_backward_map(g, h)
upper = gedlibpy.get_upper_bound(g, h)
lower = gedlibpy.get_lower_bound(g, h)
elif stabilizer == 'mean':
# @todo: to be finished...
upper_list = [np.inf] * repeat
for itr in range(repeat):
gedlibpy.run_method(g, h)
upper_list[itr] = gedlibpy.get_upper_bound(g, h)
pi_forward = gedlibpy.get_forward_map(g, h)
pi_backward = gedlibpy.get_backward_map(g, h)
lower = gedlibpy.get_lower_bound(g, h)
upper = np.mean(upper_list)
elif stabilizer == 'median':
if repeat % 2 == 0:
repeat += 1
upper_list = [np.inf] * repeat
pi_forward_list = [0] * repeat
pi_backward_list = [0] * repeat
for itr in range(repeat):
gedlibpy.run_method(g, h)
upper_list[itr] = gedlibpy.get_upper_bound(g, h)
pi_forward_list[itr] = gedlibpy.get_forward_map(g, h)
pi_backward_list[itr] = gedlibpy.get_backward_map(g, h)
lower = gedlibpy.get_lower_bound(g, h)
upper = np.median(upper_list)
idx_median = upper_list.index(upper)
pi_forward = pi_forward_list[idx_median]
pi_backward = pi_backward_list[idx_median]
elif stabilizer == 'min':
upper = np.inf
for itr in range(repeat):
@@ -61,6 +92,18 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP',
lower = gedlibpy.get_lower_bound(g, h)
if upper == 0:
break
elif stabilizer == 'max':
upper = 0
for itr in range(repeat):
gedlibpy.run_method(g, h)
upper_tmp = gedlibpy.get_upper_bound(g, h)
if upper_tmp > upper:
upper = upper_tmp
pi_forward = gedlibpy.get_forward_map(g, h)
pi_backward = gedlibpy.get_backward_map(g, h)
lower = gedlibpy.get_lower_bound(g, h)
elif stabilizer == 'gaussian':
pass
dis = upper
@@ -138,23 +181,69 @@ def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP',
return dis, pi_forward, pi_backward


def ged_median(Gn, Gn_median, measure='ged', verbose=False,
ged_cost='CHEM_1', ged_method='IPFP', saveGXL='benoit'):
dis_list = []
pi_forward_list = []
for idx, G in tqdm(enumerate(Gn), desc='computing median distances',
file=sys.stdout) if verbose else enumerate(Gn):
dis_sum = 0
pi_forward_list.append([])
for G_p in Gn_median:
dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p,
cost=ged_cost, method=ged_method, saveGXL=saveGXL)
pi_forward_list[idx].append(pi_tmp_forward)
dis_sum += dis_tmp
dis_list.append(dis_sum)
def ged_median(Gn, Gn_median, verbose=False, params_ged={'lib': 'gedlibpy',
'cost': 'CHEM_1', 'method': 'IPFP', 'edit_cost_constant': [],
'stabilizer': 'min', 'repeat': 50}, parallel=False):
if parallel:
len_itr = int(len(Gn))
pi_forward_list = [[] for i in range(len_itr)]
dis_list = [0 for i in range(len_itr)]
itr = range(0, len_itr)
n_jobs = multiprocessing.cpu_count()
if len_itr < 100 * n_jobs:
chunksize = int(len_itr / n_jobs) + 1
else:
chunksize = 100
def init_worker(gn_toshare, gn_median_toshare):
global G_gn, G_gn_median
G_gn = gn_toshare
G_gn_median = gn_median_toshare
do_partial = partial(_compute_ged_median, params_ged)
pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn, Gn_median))
if verbose:
iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
desc='computing GEDs', file=sys.stdout)
else:
iterator = pool.imap_unordered(do_partial, itr, chunksize)
for i, dis_sum, pi_forward in iterator:
pi_forward_list[i] = pi_forward
dis_list[i] = dis_sum
# print('\n-------------------------------------------')
# print(i, j, idx_itr, dis)
pool.close()
pool.join()
else:
dis_list = []
pi_forward_list = []
for idx, G in tqdm(enumerate(Gn), desc='computing median distances',
file=sys.stdout) if verbose else enumerate(Gn):
dis_sum = 0
pi_forward_list.append([])
for G_p in Gn_median:
dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p,
**params_ged)
pi_forward_list[idx].append(pi_tmp_forward)
dis_sum += dis_tmp
dis_list.append(dis_sum)
return dis_list, pi_forward_list


def _compute_ged_median(params_ged, itr):
# print(itr)
dis_sum = 0
pi_forward = []
for G_p in G_gn_median:
dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G_gn[itr], G_p,
**params_ged)
pi_forward.append(pi_tmp_forward)
dis_sum += dis_tmp
return itr, dis_sum, pi_forward


def get_nb_edit_operations(g1, g2, forward_map, backward_map):
"""Compute the number of each edit operations.
"""


+36 -26  preimage/iam.py

@@ -22,20 +22,22 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
epsilon=0.001, node_label='atom', edge_label='bond_type',
connected=False, removeNodes=True, allBestInit=False, allBestNodes=False,
allBestEdges=False, allBestOutput=False,
params_ged={'ged_cost': 'CHEM_1', 'ged_method': 'IPFP', 'saveGXL': 'benoit'}):
params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP',
'edit_cost_constant': [], 'stabilizer': 'min', 'repeat': 50}):
"""See my name, then you know what I do.
"""
# Gn_median = Gn_median[0:10]
# Gn_median = [nx.convert_node_labels_to_integers(g) for g in Gn_median]
if removeNodes:
node_ir = np.inf # corresponding to the node remove and insertion.
label_r = 'thanksdanny' # the label for node remove. # @todo: make this label unrepeatable.
node_ir = np.inf # corresponding to the node remove and insertion.
label_r = 'thanksdanny' # the label for node remove. # @todo: make this label unrepeatable.
ds_attrs = get_dataset_attributes(Gn_median + Gn_candidate,
attr_names=['edge_labeled', 'node_attr_dim', 'edge_attr_dim'],
edge_label=edge_label)
node_label_set = get_node_labels(Gn_median, node_label)
edge_label_set = get_edge_labels(Gn_median, edge_label)

def generate_graph(G, pi_p_forward, label_set):
def generate_graph(G, pi_p_forward):
G_new_list = [G.copy()] # all "best" graphs generated in this iteration.
# nx.draw_networkx(G)
# import matplotlib.pyplot as plt
@@ -52,7 +54,7 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
for ndi, (nd, _) in enumerate(G.nodes(data=True)):
h_i0_list = []
label_list = []
for label in label_set:
for label in node_label_set:
h_i0 = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][ndi]
@@ -62,7 +64,7 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
label_list.append(label)
# case when the node is to be removed.
if removeNodes:
h_i0_remove = 0 # @todo: maybe this can be added to the label_set above.
h_i0_remove = 0 # @todo: maybe this can be added to the node_label_set above.
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][ndi]
if pi_i == node_ir:
@@ -91,11 +93,10 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
G_new_list = [ggg.copy() for ggg in G_new_list_nd]
else:
# choose one of the best randomly.
h_ij0_max = h_i0_list[idx_max[0]]
idx_rdm = random.randint(0, len(idx_max) - 1)
best_label = label_list[idx_max[idx_rdm]]
# check whether a_ij is 0 or 1.
h_i0_max = h_i0_list[idx_max[idx_rdm]]
g_new = G_new_list[0]
if best_label == label_r:
g_new.remove_node(nd)
@@ -134,8 +135,7 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
# for nd1, nd2, _ in g_new.edges(data=True):
h_ij0_list = []
label_list = []
# @todo: compute edge label set before.
for label in get_edge_labels(Gn_median, edge_label):
for label in edge_label_set:
h_ij0 = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][nd1i]
@@ -176,9 +176,9 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
G_new_list_ed.append(g_tmp_copy)
g_tmp_list = [ggg.copy() for ggg in G_new_list_ed]
else: # choose one of the best randomly.
h_ij0_max = h_ij0_list[idx_max[0]]
idx_rdm = random.randint(0, len(idx_max) - 1)
best_label = label_list[idx_max[idx_rdm]]
h_ij0_max = h_ij0_list[idx_max[idx_rdm]]
# check whether a_ij is 0 or 1.
sij_norm = 0
@@ -192,6 +192,7 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
g_new.add_edge(nd1, nd2)
g_new.edges[nd1, nd2][edge_label] = best_label
else:
# elif h_ij0_max < len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
if g_new.has_edge(nd1, nd2):
g_new.remove_edge(nd1, nd2)
g_tmp_list = [g_new]
@@ -221,8 +222,8 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
if g_tmp.has_node(nd1) and g_tmp.has_node(nd2) \
and not g_tmp.has_edge(nd1, nd2):
g_tmp.add_edge(nd1, nd2)
# else: # @todo: which to use?
elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei):
else: # @todo: which to use?
# elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei):
if g_tmp.has_edge(nd1, nd2):
g_tmp.remove_edge(nd1, nd2)
# do not change anything when equal.
@@ -238,7 +239,7 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
# # find the best graph generated in this iteration and update pi_p.
# @todo: should we update all graphs generated or just the best ones?
dis_list, pi_forward_list = ged_median(G_new_list, Gn_median,
**params_ged)
params_ged=params_ged)
# @todo: should we remove the identical and connectivity check?
# Don't know which is faster.
if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
@@ -283,15 +284,16 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
# while itr < ite_max and (np.abs(old_sod - cur_sod) > epsilon or
# np.abs(old_sod - cur_sod) == 0):
while itr < ite_max and np.abs(old_sod - cur_sod) > epsilon:
# while itr < ite_max:
# for itr in range(0, 5): # the convergence condition?
print('itr_iam is', itr)
G_new_list = []
pi_forward_new_list = []
dis_new_list = []
for idx, g in enumerate(G_list):
label_set = get_node_labels(Gn_median + [g], node_label)
# label_set = get_node_labels(Gn_median + [g], node_label)
G_tmp_list, pi_forward_tmp_list, dis_tmp_list = generate_graph(
g, pi_forward_list[idx], label_set)
g, pi_forward_list[idx])
G_new_list += G_tmp_list
pi_forward_new_list += pi_forward_tmp_list
dis_new_list += dis_tmp_list
@@ -325,7 +327,7 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
print('\nsods:', sod_list, '\n')
return G_list, pi_forward_list, dis_min
return G_list, pi_forward_list, dis_min, sod_list
def remove_duplicates(Gn):
@@ -363,7 +365,8 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
# compute set-median.
dis_min = np.inf
dis_list, pi_forward_all = ged_median(Gn_candidate, Gn_median,
**params_ged)
params_ged=params_ged, parallel=True)
print('finish computing GEDs.')
# find all smallest distances.
if allBestInit: # try all best init graphs.
idx_min_list = range(len(dis_list))
@@ -371,19 +374,26 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
else:
idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
dis_min = [dis_list[idx_min_list[0]]] * len(idx_min_list)
idx_min_rdm = random.randint(0, len(idx_min_list) - 1)
idx_min_list = [idx_min_list[idx_min_rdm]]
sod_set_median = np.min(dis_min)
# phase 2: iteration.
G_list = []
dis_list = []
pi_forward_list = []
G_set_median_list = []
# sod_list = []
for idx_tmp, idx_min in enumerate(idx_min_list):
# print('idx_min is', idx_min)
G = Gn_candidate[idx_min].copy()
G_set_median_list.append(G.copy())
# list of edit operations.
pi_p_forward = pi_forward_all[idx_min]
# pi_p_backward = pi_all_backward[idx_min]
Gi_list, pi_i_forward_list, dis_i_min = iteration_proc(G, pi_p_forward, dis_min[idx_tmp])
Gi_list, pi_i_forward_list, dis_i_min, sod_list = iteration_proc(G,
pi_p_forward, dis_min[idx_tmp])
G_list += Gi_list
dis_list += [dis_i_min] * len(Gi_list)
pi_forward_list += pi_i_forward_list
@@ -409,9 +419,9 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
# print(g.edges(data=True))
# get the best median graphs
G_min_list, pi_forward_min_list, dis_min = best_median_graphs(
G_gen_median_list, pi_forward_min_list, sod_gen_median = best_median_graphs(
G_list, pi_forward_list, dis_list)
# for g in G_min_list:
# for g in G_gen_median_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
@@ -419,10 +429,10 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
if not allBestOutput:
# randomly choose one graph.
idx_rdm = random.randint(0, len(G_min_list) - 1)
G_min_list = [G_min_list[idx_rdm]]
idx_rdm = random.randint(0, len(G_gen_median_list) - 1)
G_gen_median_list = [G_gen_median_list[idx_rdm]]
return G_min_list, dis_min
return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median
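iam_upgraded now reports five values instead of two: the generalized-median candidates, their sum of distances (SOD), the per-iteration SOD trace, the set-median graphs, and the set-median SOD. A hedged unpacking sketch (Gn_median, Gn_candidate and params_ged as in the tests further down):

    from iam import iam_upgraded

    G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, \
        sod_set_median = iam_upgraded(Gn_median, Gn_candidate,
                                      c_ei=1, c_er=1, c_es=1, ite_max=50,
                                      epsilon=0.001, connected=False,
                                      removeNodes=False,
                                      params_ged=params_ged)
    print('set median SOD:', sod_set_median)
    print('generalized median SOD:', sod_gen_median)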





+2 -2  preimage/median.py

@@ -5,8 +5,8 @@ import numpy as np
import networkx as nx
import time
import librariesImport
import script
from gedlibpy import librariesImport, gedlibpy
#import script
sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/")
import pygraph
from pygraph.utils.graphfiles import loadDataset


+8 -6  preimage/preimage_iam.py

@@ -27,8 +27,9 @@ def preimage_iam(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max,
params_iam={'c_ei': 1, 'c_er': 1, 'c_es': 1,
'ite_max': 50, 'epsilon': 0.001,
'removeNodes': True, 'connected': False},
params_ged={'ged_cost': 'CHEM_1', 'ged_method': 'IPFP',
'saveGXL': 'benoit'}):
params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP',
'edit_cost_constant': [], 'stabilizer': 'min',
'repeat': 50}):
"""This function constructs graph pre-image by the iterative pre-image
framework in reference [1], algorithm 1, where the step of generating new
graphs randomly is replaced by the IAM algorithm in reference [2].
@@ -91,12 +92,12 @@ def preimage_iam(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max,
ghat_new_list = []
for g_tmp in Gk:
Gn_nearest_init = [g_tmp.copy()]
ghat_new_list_tmp, _ = iam_upgraded(Gn_nearest_median,
ghat_new_list_tmp, _, _ = iam_upgraded(Gn_nearest_median,
Gn_nearest_init, params_ged=params_ged, **params_iam)
ghat_new_list += ghat_new_list_tmp
else: # only the best graph in D_k is used to initialize IAM.
Gn_nearest_init = [g.copy() for g in Gk]
ghat_new_list, _ = iam_upgraded(Gn_nearest_median, Gn_nearest_init,
ghat_new_list, _, _ = iam_upgraded(Gn_nearest_median, Gn_nearest_init,
params_ged=params_ged, **params_iam)

# for g in g_tmp_list:
@@ -181,8 +182,9 @@ def preimage_iam_random_mix(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max
params_iam={'c_ei': 1, 'c_er': 1, 'c_es': 1,
'ite_max': 50, 'epsilon': 0.001,
'removeNodes': True, 'connected': False},
params_ged={'ged_cost': 'CHEM_1', 'ged_method': 'IPFP',
'saveGXL': 'benoit'}):
params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1',
'method': 'IPFP', 'edit_cost_constant': [],
'stabilizer': 'min', 'repeat': 50}):
"""This function constructs graph pre-image by the iterative pre-image
framework in reference [1], algorithm 1, where new graphs are generated
randomly and by the IAM algorithm in reference [2].
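Both pre-image entry points now expect the full gedlibpy parameter dict in place of the old {'ged_cost', 'ged_method', 'saveGXL'} triple. A sketch of building it with explicit constant edit costs (the six values are illustrative, ordered c_vi, c_vr, c_vs, c_ei, c_er, c_es):

    params_ged = {'lib': 'gedlibpy',
                  'cost': 'CONSTANT',
                  'method': 'IPFP',
                  'edit_cost_constant': [3, 3, 1, 3, 3, 1],
                  'stabilizer': 'min',
                  'repeat': 50}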


+212 -24  preimage/test_fitDistance.py

@@ -7,7 +7,10 @@ Created on Thu Oct 24 11:50:56 2019
"""
from matplotlib import pyplot as plt
import numpy as np
from tqdm import tqdm

import sys
sys.path.insert(0, "../")
from pygraph.utils.graphfiles import loadDataset
from utils import remove_edges
from fitDistance import fit_GED_to_kernel_distance
@@ -21,21 +24,22 @@ def test_anycosts():
remove_edges(Gn)
gkernel = 'marginalizedkernel'
itr_max = 10
edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list = \
fit_GED_to_kernel_distance(Gn, gkernel, itr_max)
edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, gkernel, itr_max)
total_time = np.sum(time_list)
print('\nedit_costs:', edit_costs)
print('\nresidual_list:', residual_list)
print('\nedit_cost_list:', edit_cost_list)
print('\ndistance matrix in kernel space:', dis_k_mat)
print('\nged matrix:', ged_mat)
print('total time:', total_time)
print('\ntotal time:', total_time)
print('\nnb_cost_mat:', nb_cost_mat_list[-1])
np.savez('results/fit_distance.any_costs.gm', edit_costs=edit_costs,
residual_list=residual_list, edit_cost_list=edit_cost_list,
dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
total_time=total_time)
total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
# normalized distance matrices.
# # normalized distance matrices.
# gmfile = np.load('results/fit_distance.any_costs.gm.npz')
# edit_costs = gmfile['edit_costs']
# residual_list = gmfile['residual_list']
@@ -43,72 +47,256 @@ def test_anycosts():
# dis_k_mat = gmfile['dis_k_mat']
# ged_mat = gmfile['ged_mat']
# total_time = gmfile['total_time']
## nb_cost_mat_list = gmfile['nb_cost_mat_list']
norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
plt.imshow(norm_dis_k_mat)
plt.colorbar()
plt.savefig('results/norm_dis_k_mat.any_costs' + '.eps', format='eps', dpi=300)
# plt.savefig('results/norm_dis_k_mat.any_costs' + '.jpg', format='jpg')
# plt.savefig('results/norm_dis_k_mat.any_costs' + '.png', format='png')
# plt.show()
plt.clf()
norm_ged_mat = normalize_distance_matrix(ged_mat)
plt.imshow(norm_ged_mat)
plt.colorbar()
plt.savefig('results/norm_ged_mat.any_costs' + '.eps', format='eps', dpi=300)
# plt.savefig('results/norm_ged_mat.any_costs' + '.jpg', format='jpg')
# plt.savefig('results/norm_ged_mat.any_costs' + '.png', format='png')
# plt.show()
plt.clf()
norm_diff = norm_ged_mat - norm_dis_k_mat
plt.imshow(norm_diff)
plt.colorbar()
plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.eps', format='eps', dpi=300)
# plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.png', format='png')
# plt.show()
plt.clf()
# draw_count_bar(norm_diff)

def test_cs_leq_ci_plus_cr():
"""c_vs <= c_vi + c_vr, c_es <= c_ei + c_er
"""
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
ds = {'name': 'monoterpenoides',
'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'])
# Gn = Gn[0:10]
remove_edges(Gn)
gkernel = 'marginalizedkernel'
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'
itr_max = 10
edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list = \
fit_GED_to_kernel_distance(Gn, gkernel, itr_max)
edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
gkernel, itr_max,
fitkernel='gaussian')
total_time = np.sum(time_list)
print('\nedit_costs:', edit_costs)
print('\nresidual_list:', residual_list)
print('\nedit_cost_list:', edit_cost_list)
print('\ndistance matrix in kernel space:', dis_k_mat)
print('\nged matrix:', ged_mat)
print('total time:', total_time)
np.savez('results/fit_distance.cs_leq_ci_plus_cr.gm', edit_costs=edit_costs,
print('\ntotal time:', total_time)
print('\nnb_cost_mat:', nb_cost_mat_list[-1])
np.savez('results/fit_distance.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel.gm',
edit_costs=edit_costs,
residual_list=residual_list, edit_cost_list=edit_cost_list,
dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
total_time=total_time)
total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
coef_dk=coef_dk)
# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
# 'extra_params': {}} # node/edge symb
# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
## Gn = Gn[0:10]
## remove_edges(Gn)
# gkernel = 'untilhpathkernel'
# node_label = 'atom'
# edge_label = 'bond_type'
# itr_max = 10
# edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
# nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
# gkernel, itr_max)
# total_time = np.sum(time_list)
# print('\nedit_costs:', edit_costs)
# print('\nresidual_list:', residual_list)
# print('\nedit_cost_list:', edit_cost_list)
# print('\ndistance matrix in kernel space:', dis_k_mat)
# print('\nged matrix:', ged_mat)
# print('\ntotal time:', total_time)
# print('\nnb_cost_mat:', nb_cost_mat_list[-1])
# np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.mutag.elabeled.uhpkernel.gm',
# edit_costs=edit_costs,
# residual_list=residual_list, edit_cost_list=edit_cost_list,
# dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
# total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk)
# # normalized distance matrices.
# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
# edit_costs = gmfile['edit_costs']
# residual_list = gmfile['residual_list']
# edit_cost_list = gmfile['edit_cost_list']
# dis_k_mat = gmfile['dis_k_mat']
# ged_mat = gmfile['ged_mat']
# total_time = gmfile['total_time']
# nb_cost_mat_list = gmfile['nb_cost_mat_list']
# coef_dk = gmfile['coef_dk']
nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
print(nb_consistent, nb_inconsistent, ratio_consistent)
# dis_k_sub = pairwise_substitution(dis_k_mat)
# ged_sub = pairwise_substitution(ged_mat)
# np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.cost_leq_1en2.gm',
# dis_k_sub=dis_k_sub, ged_sub=ged_sub)
norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
plt.imshow(norm_dis_k_mat)
plt.colorbar()
plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
+ '.eps', format='eps', dpi=300)
plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
+ '.png', format='png')
# plt.show()
plt.clf()
norm_ged_mat = normalize_distance_matrix(ged_mat)
plt.imshow(norm_ged_mat)
plt.colorbar()
plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
+ '.eps', format='eps', dpi=300)
plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
+ '.png', format='png')
# plt.show()
plt.clf()
norm_diff = norm_ged_mat - norm_dis_k_mat
plt.imshow(norm_diff)
plt.colorbar()
plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
+ '.eps', format='eps', dpi=300)
plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
+ '.png', format='png')
# plt.show()
plt.clf()
# draw_count_bar(norm_diff)
def test_unfitted():
"""unfitted.
"""
from fitDistance import compute_geds
from utils import kernel_distance_matrix
ds = {'name': 'monoterpenoides',
'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'])
# Gn = Gn[0:10]
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'

# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
# 'extra_params': {}} # node/edge symb
# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
## Gn = Gn[0:10]
## remove_edges(Gn)
# gkernel = 'marginalizedkernel'

dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, gkernel=gkernel)
ged_all, ged_mat, n_edit_operations = compute_geds(Gn, [3, 3, 1, 3, 3, 1],
[0, 1, 2, 3, 4, 5], parallel=True)
print('\ndistance matrix in kernel space:', dis_k_mat)
print('\nged matrix:', ged_mat)
# np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.gm', edit_costs=edit_costs,
# residual_list=residual_list, edit_cost_list=edit_cost_list,
# dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
# total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
# normalized distance matrices.
# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.gm.npz')
# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en3.gm.npz')
# edit_costs = gmfile['edit_costs']
# residual_list = gmfile['residual_list']
# edit_cost_list = gmfile['edit_cost_list']
# dis_k_mat = gmfile['dis_k_mat']
# ged_mat = gmfile['ged_mat']
# total_time = gmfile['total_time']
# nb_cost_mat_list = gmfile['nb_cost_mat_list']
nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
print(nb_consistent, nb_inconsistent, ratio_consistent)
norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
plt.imshow(norm_dis_k_mat)
plt.colorbar()
plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr' + '.eps', format='eps', dpi=300)
# plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr' + '.jpg', format='jpg')
plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.png', format='png')
# plt.show()
plt.clf()
norm_ged_mat = normalize_distance_matrix(ged_mat)
plt.imshow(norm_ged_mat)
plt.colorbar()
plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr' + '.eps', format='eps', dpi=300)
# plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr' + '.jpg', format='jpg')
plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.png', format='png')
# plt.show()
plt.clf()
norm_diff = norm_ged_mat - norm_dis_k_mat
plt.imshow(norm_diff)
plt.colorbar()
plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.png', format='png')
# plt.show()
plt.clf()
draw_count_bar(norm_diff)
def pairwise_substitution_consistence(mat1, mat2):
"""Count how often two symmetric distance matrices order their entry pairs consistently.
"""
nb_consistent = 0
nb_inconsistent = 0
# the matrices are considered symmetric; compare matching upper-triangle entries.
upper_tri1 = mat1[np.triu_indices_from(mat1)]
upper_tri2 = mat2[np.triu_indices_from(mat2)]
for i in tqdm(range(len(upper_tri1)), desc='computing consistence', file=sys.stdout):
for j in range(i, len(upper_tri1)):
if np.sign(upper_tri1[i] - upper_tri1[j]) == np.sign(upper_tri2[i] - upper_tri2[j]):
nb_consistent += 1
else:
nb_inconsistent += 1
return nb_consistent, nb_inconsistent, nb_consistent / (nb_consistent + nb_inconsistent)


def pairwise_substitution(mat):
# the matrix is considered symmetric.
upper_tri = mat[np.triu_indices_from(mat)]
sub_list = []
for i in tqdm(range(len(upper_tri)), desc='computing', file=sys.stdout):
for j in range(i, len(upper_tri)):
sub_list.append(upper_tri[i] - upper_tri[j])
return sub_list
def draw_count_bar(norm_diff):
import pandas
from collections import Counter, OrderedDict
norm_diff_cnt = norm_diff.flatten()
norm_diff_cnt = norm_diff_cnt * 10
norm_diff_cnt = np.floor(norm_diff_cnt)
norm_diff_cnt = Counter(norm_diff_cnt)
norm_diff_cnt = OrderedDict(sorted(norm_diff_cnt.items()))
df = pandas.DataFrame.from_dict(norm_diff_cnt, orient='index')
df.plot(kind='bar')
if __name__ == '__main__':
test_anycosts()
test_cs_leq_ci_plus_cr()
# test_anycosts()
# test_cs_leq_ci_plus_cr()
test_unfitted()
# x = np.array([[1,2,3],[4,5,6],[7,8,9]])
# xx = pairwise_substitution(x)
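pairwise_substitution_consistence compares, over all pairs of upper-triangle entries, whether the kernel-distance matrix and the GED matrix order them the same way, and returns the fraction of agreeing pairs. A toy check calling the function defined above (the two matrices below rank the three graph pairs identically, so the ratio should come out at 1.0):

    import numpy as np

    dis_k = np.array([[0.0, 0.2, 0.5],
                      [0.2, 0.0, 0.4],
                      [0.5, 0.4, 0.0]])
    ged = np.array([[0.0, 1.0, 3.0],
                    [1.0, 0.0, 2.0],
                    [3.0, 2.0, 0.0]])
    nb_c, nb_i, ratio = pairwise_substitution_consistence(dis_k, ged)
    print(nb_c, nb_i, ratio)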

+741 -66  preimage/test_iam.py

@@ -17,9 +17,363 @@ import random
import sys
sys.path.insert(0, "../")
from pygraph.utils.graphfiles import loadDataset
#from pygraph.utils.logger2file import *
from iam import iam_upgraded
from utils import remove_edges, compute_kernel, get_same_item_indices
from ged import ged_median
from utils import remove_edges, compute_kernel, get_same_item_indices, dis_gstar
#from ged import ged_median

def test_iam_monoterpenoides():
ds = {'name': 'monoterpenoides',
'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'])
# Gn = Gn[0:50]
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'
# parameters for GED function from the IAM paper.
# fitted edit costs (Gaussian).
c_vi = 0.03620133402089074
c_vr = 0.0417574590207099
c_vs = 0.009992282328587499
c_ei = 0.08293120042342755
c_er = 0.09512220476358019
c_es = 0.09222529696841467
# # fitted edit costs (linear combinations).
# c_vi = 0.1749684054238749
# c_vr = 0.0734054228711457
# c_vs = 0.05017781726016715
# c_ei = 0.1869431164806936
# c_er = 0.32055856948274
# c_es = 0.2569469379247611
# # unfitted edit costs.
# c_vi = 3
# c_vr = 3
# c_vs = 1
# c_ei = 3
# c_er = 3
# c_es = 1
ite_max_iam = 50
epsilon_iam = 0.001
removeNodes = False
connected_iam = False
# parameters for IAM function
# ged_cost = 'CONSTANT'
ged_cost = 'CONSTANT'
ged_method = 'IPFP'
edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
# edit_cost_constant = []
ged_stabilizer = 'min'
ged_repeat = 50
params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
'edit_cost_constant': edit_cost_constant,
'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
# classify graphs according to classes.
time_list = []
dis_ks_min_list = []
dis_ks_set_median_list = []
sod_gs_list = []
g_best = []
sod_set_median_list = []
sod_list_list = []
idx_dict = get_same_item_indices(y_all)
for y_class in idx_dict:
print('\n-------------------------------------------------------')
print('class of y:', y_class)
Gn_class = [Gn[i].copy() for i in idx_dict[y_class]]
time_list.append([])
dis_ks_min_list.append([])
dis_ks_set_median_list.append([])
sod_gs_list.append([])
g_best.append([])
sod_set_median_list.append([])
for repeat in range(50):
idx_rdm = random.sample(range(len(Gn_class)), 10)
print('graphs chosen:', idx_rdm)
Gn_median = [Gn_class[idx].copy() for idx in idx_rdm]
Gn_candidate = [g.copy() for g in Gn_median]
alpha_range = [1 / len(Gn_median)] * len(Gn_median)
time0 = time.time()
G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \
= iam_upgraded(Gn_median,
Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes,
params_ged=params_ged)
time_total = time.time() - time0
print('\ntime: ', time_total)
time_list[-1].append(time_total)
g_best[-1].append(G_gen_median_list[0])
sod_set_median_list[-1].append(sod_set_median)
print('\nsmallest sod of the set median:', sod_set_median)
sod_gs_list[-1].append(sod_gen_median)
print('\nsmallest sod in graph space:', sod_gen_median)
sod_list_list.append(sod_list)
# show the best graph and save it to file.
print('one of the possible corresponding pre-images is')
nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'),
with_labels=True)
# plt.show()
# plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) +
# plt.savefig('results/iam/paper_compare/monoter_y' + str(y_class) +
# '_repeat' + str(repeat) + '_' + str(time.time()) +
# '.png', format="PNG")
plt.clf()
# print(G_gen_median_list[0].nodes(data=True))
# print(G_gen_median_list[0].edges(data=True))
# compute distance between \psi and the set median graph.
knew_set_median = compute_kernel(G_set_median_list + Gn_median,
gkernel, node_label, edge_label, False)
dhat_new_set_median_list = []
for idx, g_tmp in enumerate(G_set_median_list):
# @todo: the term3 below could use the one at the beginning of the function.
dhat_new_set_median_list.append(dis_gstar(idx, range(len(G_set_median_list),
len(G_set_median_list) + len(Gn_median) + 1),
alpha_range, knew_set_median, withterm3=False))
print('\ndistance in kernel space of set median: ', dhat_new_set_median_list[0])
dis_ks_set_median_list[-1].append(dhat_new_set_median_list[0])
# compute distance between \psi and the new generated graphs.
knew = compute_kernel(G_gen_median_list + Gn_median, gkernel, node_label,
edge_label, False)
dhat_new_list = []
for idx, g_tmp in enumerate(G_gen_median_list):
# @todo: the term3 below could use the one at the beginning of the function.
dhat_new_list.append(dis_gstar(idx, range(len(G_gen_median_list),
len(G_gen_median_list) + len(Gn_median) + 1),
alpha_range, knew, withterm3=False))
print('\nsmallest distance in kernel space: ', dhat_new_list[0])
dis_ks_min_list[-1].append(dhat_new_list[0])

print('\nsods of the set median for this class:', sod_set_median_list[-1])
print('\nsods in graph space for this class:', sod_gs_list[-1])
print('\ndistance in kernel space of set median for this class:',
dis_ks_set_median_list[-1])
print('\nsmallest distances in kernel space for this class:',
dis_ks_min_list[-1])
print('\ntimes for this class:', time_list[-1])
sod_set_median_list[-1] = np.mean(sod_set_median_list[-1])
sod_gs_list[-1] = np.mean(sod_gs_list[-1])
dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1])
dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1])
time_list[-1] = np.mean(time_list[-1])
print()
print('\nmean sods of the set median for each class:', sod_set_median_list)
print('\nmean sods in graph space for each class:', sod_gs_list)
print('\ndistances in kernel space of set median for each class:',
dis_ks_set_median_list)
print('\nmean smallest distances in kernel space for each class:',
dis_ks_min_list)
print('\nmean times for each class:', time_list)
print('\nmean sods of the set median of all:', np.mean(sod_set_median_list))
print('\nmean sods in graph space of all:', np.mean(sod_gs_list))
print('\nmean distances in kernel space of set median of all:',
np.mean(dis_ks_set_median_list))
print('\nmean smallest distances in kernel space of all:',
np.mean(dis_ks_min_list))
print('\nmean times of all:', np.mean(time_list))
nb_better_sods = 0
nb_worse_sods = 0
nb_same_sods = 0
for sods in sod_list_list:
if sods[0] > sods[-1]:
nb_better_sods += 1
elif sods[0] < sods[-1]:
nb_worse_sods += 1
else:
nb_same_sods += 1
print('\n In', str(len(sod_list_list)), 'sod lists,', str(nb_better_sods),
'are getting better,', str(nb_worse_sods), 'are getting worse,',
str(nb_same_sods), 'are not changed; ', str(nb_better_sods / len(sod_list_list)),
'sods are improved.')
def test_iam_mutag():
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:50]
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'
# parameters for GED function from the IAM paper.
# fitted edit costs.
c_vi = 0.03523843108436513
c_vr = 0.03347339739350128
c_vs = 0.06871290673612238
c_ei = 0.08591999846720685
c_er = 0.07962086440894103
c_es = 0.08596855855478233
# unfitted edit costs.
# c_vi = 3
# c_vr = 3
# c_vs = 1
# c_ei = 3
# c_er = 3
# c_es = 1
ite_max_iam = 50
epsilon_iam = 0.001
removeNodes = False
connected_iam = False
# parameters for IAM function
# ged_cost = 'CONSTANT'
ged_cost = 'CONSTANT'
ged_method = 'IPFP'
edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
# edit_cost_constant = []
ged_stabilizer = 'min'
ged_repeat = 50
params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
'edit_cost_constant': edit_cost_constant,
'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
# classify graphs according to classes.
time_list = []
dis_ks_min_list = []
dis_ks_set_median_list = []
sod_gs_list = []
g_best = []
sod_set_median_list = []
sod_list_list = []
idx_dict = get_same_item_indices(y_all)
for y_class in idx_dict:
print('\n-------------------------------------------------------')
print('class of y:', y_class)
Gn_class = [Gn[i].copy() for i in idx_dict[y_class]]
time_list.append([])
dis_ks_min_list.append([])
dis_ks_set_median_list.append([])
sod_gs_list.append([])
g_best.append([])
sod_set_median_list.append([])
for repeat in range(50):
idx_rdm = random.sample(range(len(Gn_class)), 10)
print('graphs chosen:', idx_rdm)
Gn_median = [Gn_class[idx].copy() for idx in idx_rdm]
Gn_candidate = [g.copy() for g in Gn_median]
alpha_range = [1 / len(Gn_median)] * len(Gn_median)
time0 = time.time()
G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \
= iam_upgraded(Gn_median,
Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes,
params_ged=params_ged)
time_total = time.time() - time0
print('\ntime: ', time_total)
time_list[-1].append(time_total)
g_best[-1].append(G_gen_median_list[0])
sod_set_median_list[-1].append(sod_set_median)
print('\nsmallest sod of the set median:', sod_set_median)
sod_gs_list[-1].append(sod_gen_median)
print('\nsmallest sod in graph space:', sod_gen_median)
sod_list_list.append(sod_list)
# show the best graph and save it to file.
print('one of the possible corresponding pre-images is')
nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'),
with_labels=True)
# plt.show()
# plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) +
# plt.savefig('results/iam/paper_compare/mutag_y' + str(y_class) +
# '_repeat' + str(repeat) + '_' + str(time.time()) +
# '.png', format="PNG")
plt.clf()
# print(G_gen_median_list[0].nodes(data=True))
# print(G_gen_median_list[0].edges(data=True))
# compute distance between \psi and the set median graph.
knew_set_median = compute_kernel(G_set_median_list + Gn_median,
gkernel, node_label, edge_label, False)
dhat_new_set_median_list = []
for idx, g_tmp in enumerate(G_set_median_list):
# @todo: the term3 below could use the one at the beginning of the function.
dhat_new_set_median_list.append(dis_gstar(idx, range(len(G_set_median_list),
len(G_set_median_list) + len(Gn_median) + 1),
alpha_range, knew_set_median, withterm3=False))
print('\ndistance in kernel space of set median: ', dhat_new_set_median_list[0])
dis_ks_set_median_list[-1].append(dhat_new_set_median_list[0])
# compute distance between \psi and the new generated graphs.
knew = compute_kernel(G_gen_median_list + Gn_median, gkernel, node_label,
edge_label, False)
dhat_new_list = []
for idx, g_tmp in enumerate(G_gen_median_list):
# @todo: the term3 below could use the one at the beginning of the function.
dhat_new_list.append(dis_gstar(idx, range(len(G_gen_median_list),
len(G_gen_median_list) + len(Gn_median) + 1),
alpha_range, knew, withterm3=False))
print('\nsmallest distance in kernel space: ', dhat_new_list[0])
dis_ks_min_list[-1].append(dhat_new_list[0])

print('\nsods of the set median for this class:', sod_set_median_list[-1])
print('\nsods in graph space for this class:', sod_gs_list[-1])
print('\ndistance in kernel space of set median for this class:',
dis_ks_set_median_list[-1])
print('\nsmallest distances in kernel space for this class:',
dis_ks_min_list[-1])
print('\ntimes for this class:', time_list[-1])
sod_set_median_list[-1] = np.mean(sod_set_median_list[-1])
sod_gs_list[-1] = np.mean(sod_gs_list[-1])
dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1])
dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1])
time_list[-1] = np.mean(time_list[-1])
print()
print('\nmean sods of the set median for each class:', sod_set_median_list)
print('\nmean sods in graph space for each class:', sod_gs_list)
print('\ndistances in kernel space of set median for each class:',
dis_ks_set_median_list)
print('\nmean smallest distances in kernel space for each class:',
dis_ks_min_list)
print('\nmean times for each class:', time_list)
print('\nmean sods of the set median of all:', np.mean(sod_set_median_list))
print('\nmean sods in graph space of all:', np.mean(sod_gs_list))
print('\nmean distances in kernel space of set median of all:',
np.mean(dis_ks_set_median_list))
print('\nmean smallest distances in kernel space of all:',
np.mean(dis_ks_min_list))
print('\nmean times of all:', np.mean(time_list))
nb_better_sods = 0
nb_worse_sods = 0
nb_same_sods = 0
for sods in sod_list_list:
if sods[0] > sods[-1]:
nb_better_sods += 1
elif sods[0] < sods[-1]:
nb_worse_sods += 1
else:
nb_same_sods += 1
print('\n In', str(len(sod_list_list)), 'sod lists,', str(nb_better_sods),
'are getting better,', str(nb_worse_sods), 'are getting worse,',
str(nb_same_sods), 'are not changed; ', str(nb_better_sods / len(sod_list_list)),
'sods are improved.')
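Both tests score a candidate g by its kernel-space distance to psi = sum_i alpha_i phi(g_i), the weighted mean embedding of the median set. dis_gstar (defined in utils.py, not shown in this commit) is assumed to evaluate d(g, psi)^2 = k(g, g) - 2 sum_i alpha_i k(g, g_i) + sum_ij alpha_i alpha_j k(g_i, g_j); withterm3=False drops the constant third term, which leaves the ranking of candidates unchanged. A hypothetical re-implementation, for orientation only:

    import numpy as np

    def dis_gstar_sketch(idx, idx_gi, alpha, K, withterm3=True):
        # K: Gram matrix over the candidates followed by the median graphs.
        term1 = K[idx, idx]
        term2 = 2 * sum(a * K[idx, j] for a, j in zip(alpha, idx_gi))
        term3 = sum(a1 * a2 * K[j1, j2]
                    for a1, j1 in zip(alpha, idx_gi)
                    for a2, j2 in zip(alpha, idx_gi)) if withterm3 else 0
        return np.sqrt(max(term1 - term2 + term3, 0))  # clamp fp noise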

###############################################################################
# tests on different numbers of median-sets.
@@ -33,46 +387,352 @@ def test_iam_median_nb():
remove_edges(Gn)
gkernel = 'marginalizedkernel'
# lmbda = 0.03 # termination probability
# r_max = 10 # iteration limit for pre-image.
# alpha_range = np.linspace(0.5, 0.5, 1)
# k = 5 # k nearest neighbors
# epsilon = 1e-6
# InitIAMWithAllDk = True
lmbda = 0.03 # termination probability
# # parameters for GED function
# c_vi = 0.037
# c_vr = 0.038
# c_vs = 0.075
# c_ei = 0.001
# c_er = 0.001
# c_es = 0.0
# ite_max_iam = 50
# epsilon_iam = 0.001
# removeNodes = False
# connected_iam = False
# # parameters for IAM function
# ged_cost = 'CONSTANT'
# ged_method = 'IPFP'
# edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
# ged_stabilizer = 'min'
# ged_repeat = 50
# params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
# 'edit_cost_constant': edit_cost_constant,
# 'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
# parameters for GED function
ged_cost='CHEM_1'
ged_method='IPFP'
saveGXL='gedlib'
# parameters for IAM function
c_ei=1
c_er=1
c_es=1
c_vi = 4
c_vr = 4
c_vs = 2
c_ei = 1
c_er = 1
c_es = 1
ite_max_iam = 50
epsilon_iam = 0.001
removeNodes = False
connected_iam = False
# number of graphs; we want to compute the median of these graphs.
nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
# parameters for IAM function
ged_cost = 'CHEM_1'
ged_method = 'IPFP'
edit_cost_constant = []
ged_stabilizer = 'min'
ged_repeat = 50
params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
'edit_cost_constant': edit_cost_constant,
'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
# find out all the graphs classified to positive group 1.
idx_dict = get_same_item_indices(y_all)
Gn = [Gn[i] for i in idx_dict[1]]
# number of graphs; we want to compute the median of these graphs.
# nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
nb_median_range = [len(Gn)]
# # compute Gram matrix.
# time0 = time.time()
# km = compute_kernel(Gn, gkernel, True)
# time_km = time.time() - time0
# # write Gram matrix to file.
# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
time_list = []
dis_ks_min_list = []
sod_gs_list = []
# sod_gs_min_list = []
# nb_updated_list = []
# nb_updated_k_list = []
g_best = []
for nb_median in nb_median_range:
print('\n-------------------------------------------------------')
print('number of median graphs =', nb_median)
random.seed(1)
idx_rdm = random.sample(range(len(Gn)), nb_median)
print('graphs chosen:', idx_rdm)
Gn_median = [Gn[idx].copy() for idx in idx_rdm]
Gn_candidate = [g.copy() for g in Gn]
# for g in Gn_median:
# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
## plt.savefig("results/preimage_mix/mutag.png", format="PNG")
# plt.show()
# plt.clf()
###################################################################
# gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
# km_tmp = gmfile['gm']
# time_km = gmfile['gmtime']
# # modify mixed gram matrix.
# km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
# for i in range(len(Gn)):
# for j in range(i, len(Gn)):
# km[i, j] = km_tmp[i, j]
# km[j, i] = km[i, j]
# for i in range(len(Gn)):
# for j, idx in enumerate(idx_rdm):
# km[i, len(Gn) + j] = km[i, idx]
# km[len(Gn) + j, i] = km[i, idx]
# for i, idx1 in enumerate(idx_rdm):
# for j, idx2 in enumerate(idx_rdm):
# km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]
###################################################################
alpha_range = [1 / nb_median] * nb_median
time0 = time.time()
ghat_new_list, sod_min = iam_upgraded(Gn_median, Gn_candidate,
c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes,
params_ged=params_ged)
time_total = time.time() - time0
print('\ntime: ', time_total)
time_list.append(time_total)
# compute distance between \psi and the new generated graphs.
knew = compute_kernel(ghat_new_list + Gn_median, gkernel, False)
dhat_new_list = []
for idx, g_tmp in enumerate(ghat_new_list):
# @todo: the term3 below could use the one at the beginning of the function.
dhat_new_list.append(dis_gstar(idx, range(len(ghat_new_list),
len(ghat_new_list) + len(Gn_median) + 1),
alpha_range, knew, withterm3=False))
print('\nsmallest distance in kernel space: ', dhat_new_list[0])
dis_ks_min_list.append(dhat_new_list[0])
g_best.append(ghat_new_list[0])
# show the best graph and save it to file.
# print('the shortest distance is', dhat)
print('one of the possible corresponding pre-images is')
nx.draw(ghat_new_list[0], labels=nx.get_node_attributes(ghat_new_list[0], 'atom'),
with_labels=True)
plt.show()
# plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) +
plt.savefig('results/iam/mutag_median_unfit2.nb' + str(nb_median) +
'.png', format="PNG")
plt.clf()
# print(ghat_list[0].nodes(data=True))
# print(ghat_list[0].edges(data=True))
sod_gs_list.append(sod_min)
# sod_gs_min_list.append(np.min(sod_min))
print('\nsmallest sod in graph space: ', sod_min)
print('\nsods in graph space: ', sod_gs_list)
# print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list)
print('\nsmallest distance in kernel space for each set of median graphs: ',
dis_ks_min_list)
# print('\nnumber of updates of the best graph for each set of median graphs by IAM: ',
# nb_updated_list)
# print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ',
# nb_updated_k_list)
print('\ntimes:', time_list)
def test_iam_letter_h():
from median import draw_Letter_graph
ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
'extra_params': {}} # node nsymb
# ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt',
# 'extra_params': {}} # node nsymb
# Gn = Gn[0:50]
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
gkernel = 'structuralspkernel'
# parameters for GED function from the IAM paper.
c_vi = 3
c_vr = 3
c_vs = 1
c_ei = 3
c_er = 3
c_es = 1
ite_max_iam = 50
epsilon_iam = 0.001
removeNodes = False
connected_iam = False
# parameters for IAM function
# ged_cost = 'CONSTANT'
ged_cost = 'LETTER'
ged_method = 'IPFP'
# edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
edit_cost_constant = []
ged_stabilizer = 'min'
ged_repeat = 50
params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
'edit_cost_constant': edit_cost_constant,
'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
# classify graphs according to letters.
time_list = []
dis_ks_min_list = []
sod_gs_list = []
g_best = []
sod_set_median_list = []
idx_dict = get_same_item_indices(y_all)
for letter in idx_dict:
print('\n-------------------------------------------------------')
print('letter', letter)
Gn_let = [Gn[i].copy() for i in idx_dict[letter]]
time_list.append([])
dis_ks_min_list.append([])
sod_gs_list.append([])
g_best.append([])
sod_set_median_list.append([])
for repeat in range(50):
idx_rdm = random.sample(range(len(Gn_let)), 50)
print('graphs chosen:', idx_rdm)
Gn_median = [Gn_let[idx].copy() for idx in idx_rdm]
Gn_candidate = [g.copy() for g in Gn_median]
alpha_range = [1 / len(Gn_median)] * len(Gn_median)
time0 = time.time()
ghat_new_list, sod_min, sod_set_median = iam_upgraded(Gn_median,
Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes,
params_ged=params_ged)
time_total = time.time() - time0
print('\ntime: ', time_total)
time_list[-1].append(time_total)
g_best[-1].append(ghat_new_list[0])
sod_set_median_list[-1].append(sod_set_median)
print('\nsmallest sod of the set median:', sod_set_median)
sod_gs_list[-1].append(sod_min)
print('\nsmallest sod in graph space:', sod_min)
# show the best graph and save it to file.
print('one of the possible corresponding pre-images is')
draw_Letter_graph(ghat_new_list[0], savepath='results/iam/paper_compare/')
# compute distance between \psi and the new generated graphs.
knew = compute_kernel(ghat_new_list + Gn_median, gkernel, False)
dhat_new_list = []
for idx, g_tmp in enumerate(ghat_new_list):
# @todo: the term3 below could use the one at the beginning of the function.
dhat_new_list.append(dis_gstar(idx, range(len(ghat_new_list),
len(ghat_new_list) + len(Gn_median) + 1),
alpha_range, knew, withterm3=False))
print('\nsmallest distance in kernel space: ', dhat_new_list[0])
dis_ks_min_list[-1].append(dhat_new_list[0])
print('\nsods of the set median for this letter:', sod_set_median_list[-1])
print('\nsods in graph space for this letter:', sod_gs_list[-1])
print('\nsmallest distances in kernel space for this letter:',
dis_ks_min_list[-1])
print('\ntimes for this letter:', time_list[-1])
sod_set_median_list[-1] = np.mean(sod_set_median_list[-1])
sod_gs_list[-1] = np.mean(sod_gs_list[-1])
dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1])
time_list[-1] = np.mean(time_list[-1])
print('\nmean sods of the set median for each letter:', sod_set_median_list)
print('\nmean sods in graph space for each letter:', sod_gs_list)
print('\nmean smallest distances in kernel space for each letter:',
dis_ks_min_list)
print('\nmean times for each letter:', time_list)
print('\nmean sods of the set median of all:', np.mean(sod_set_median_list))
print('\nmean sods in graph space of all:', np.mean(sod_gs_list))
print('\nmean smallest distances in kernel space of all:',
np.mean(dis_ks_min_list))
print('\nmean times of all:', np.mean(time_list))




def test_iam_fitdistance():
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:50]
# remove_edges(Gn)
gkernel = 'marginalizedkernel'
node_label = 'atom'
edge_label = 'bond_type'
# lmbda = 0.03 # termination probability
# # parameters for GED function
# c_vi = 0.037
# c_vr = 0.038
# c_vs = 0.075
# c_ei = 0.001
# c_er = 0.001
# c_es = 0.0
# ite_max_iam = 50
# epsilon_iam = 0.001
# removeNodes = False
# connected_iam = False
# # parameters for IAM function
# ged_cost = 'CONSTANT'
# ged_method = 'IPFP'
# edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
# ged_stabilizer = 'min'
# ged_repeat = 50
# params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
# 'edit_cost_constant': edit_cost_constant,
# 'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
# parameters for GED function
c_vi = 4
c_vr = 4
c_vs = 2
c_ei = 1
c_er = 1
c_es = 1
ite_max_iam = 50
epsilon_iam = 0.001
removeNodes = False
connected_iam = False
# parameters for IAM function
ged_cost = 'CHEM_1'
ged_method = 'IPFP'
edit_cost_constant = []
ged_stabilizer = 'min'
ged_repeat = 50
params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
'edit_cost_constant': edit_cost_constant,
'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
# find out all the graphs classified to positive group 1.
idx_dict = get_same_item_indices(y_all)
Gn = [Gn[i] for i in idx_dict[1]]
# number of graphs; we want to compute the median of these graphs.
# nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
nb_median_range = [10]
# # compute Gram matrix.
# time0 = time.time()
# km = compute_kernel(Gn, gkernel, True)
# time_km = time.time() - time0
# # write Gram matrix to file.
# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
time_list = []
dis_ks_min_list = []
dis_ks_gen_median_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list = []
nb_updated_k_list = []
# sod_gs_min_list = []
# nb_updated_list = []
# nb_updated_k_list = []
g_best = []
for nb_median in nb_median_range:
print('\n-------------------------------------------------------')
@@ -90,72 +750,80 @@ def test_iam_median_nb():
# plt.clf()
###################################################################
gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
km_tmp = gmfile['gm']
time_km = gmfile['gmtime']
# modify mixed gram matrix.
km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
for i in range(len(Gn)):
for j in range(i, len(Gn)):
km[i, j] = km_tmp[i, j]
km[j, i] = km[i, j]
for i in range(len(Gn)):
for j, idx in enumerate(idx_rdm):
km[i, len(Gn) + j] = km[i, idx]
km[len(Gn) + j, i] = km[i, idx]
for i, idx1 in enumerate(idx_rdm):
for j, idx2 in enumerate(idx_rdm):
km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]
# gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
# km_tmp = gmfile['gm']
# time_km = gmfile['gmtime']
# # modify mixed gram matrix.
# km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
# for i in range(len(Gn)):
# for j in range(i, len(Gn)):
# km[i, j] = km_tmp[i, j]
# km[j, i] = km[i, j]
# for i in range(len(Gn)):
# for j, idx in enumerate(idx_rdm):
# km[i, len(Gn) + j] = km[i, idx]
# km[len(Gn) + j, i] = km[i, idx]
# for i, idx1 in enumerate(idx_rdm):
# for j, idx2 in enumerate(idx_rdm):
# km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]
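
The commented block above pads the precomputed Gram matrix over Gn with rows and columns for the nb_median sampled graphs by duplicating the entries at idx_rdm. An equivalent, more compact NumPy sketch (assuming km_tmp is the Gram matrix over Gn and idx_rdm is a list of the sampled indices):

import numpy as np

n = len(Gn)
km = np.zeros((n + nb_median, n + nb_median))
km[:n, :n] = km_tmp                            # original Gram matrix
km[:n, n:] = km_tmp[:, idx_rdm]                # columns for the sampled medians
km[n:, :n] = km_tmp[idx_rdm, :]                # rows for the sampled medians
km[n:, n:] = km_tmp[np.ix_(idx_rdm, idx_rdm)]  # median-vs-median block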
###################################################################
alpha_range = [1 / nb_median] * nb_median
time0 = time.time()
ghat_new_list, dis_min = iam_upgraded(Gn_median, Gn_candidate,
c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
epsilon=epsilon_iam, removeNodes=removeNodes,
connected=connected_iam,
params_ged={'ged_cost': ged_cost, 'ged_method': ged_method,
'saveGXL': saveGXL})
G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \
= iam_upgraded(Gn_median, Gn_candidate,
c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes,
params_ged=params_ged)
time_total = time.time() - time0
print('\ntime: ', time_total)
time_list.append(time_total)
print('\nsmallest distance in kernel space: ', dhat)
dis_ks_min_list.append(dhat)
g_best.append(ghat_list)
print('\nnumber of updates of the best graph: ', nb_updated)
nb_updated_list.append(nb_updated)
print('\nnumber of updates of k nearest graphs: ', nb_updated_k)
nb_updated_k_list.append(nb_updated_k)
# compute distance between \psi and the newly generated graphs.
knew = compute_kernel(G_gen_median_list + Gn_median, gkernel, node_label,
edge_label, False)
dhat_new_list = []
for idx, g_tmp in enumerate(G_gen_median_list):
# @todo: the term3 below could use the one at the beginning of the function.
dhat_new_list.append(dis_gstar(idx, range(len(G_gen_median_list),
len(G_gen_median_list) + len(Gn_median)),
alpha_range, knew, withterm3=False))
print('\nsmallest distance in kernel space: ', dhat_new_list[0])
dis_ks_min_list.append(dhat_new_list[0])
g_best.append(G_gen_median_list[0])
# show the best graph and save it to file.
print('the shortest distance is', dhat)
# print('the shortest distance is', dhat)
print('one of the possible corresponding pre-images is')
nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'),
nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'),
with_labels=True)
plt.show()
plt.savefig('results/preimage_iam/mutag_median_nb' + str(nb_median) +
'.png', format="PNG")
# plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) +
# plt.savefig('results/iam/mutag_median_unfit2.nb' + str(nb_median) +
# '.png', format="PNG")
plt.clf()
# print(ghat_list[0].nodes(data=True))
# print(ghat_list[0].edges(data=True))
# compute the corresponding sod in graph space.
sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, ged_cost=ged_cost,
ged_method=ged_method, saveGXL=saveGXL)
sod_gs_list.append(sod_tmp)
sod_gs_min_list.append(np.min(sod_tmp))
print('\nsmallest sod in graph space: ', np.min(sod_tmp))
sod_gs_list.append(sod_gen_median)
# sod_gs_min_list.append(np.min(sod_gen_median))
print('\nsmallest sod in graph space: ', sod_gen_median)
print('\nsmallest sod of set median in graph space: ', sod_set_median)
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list)
# print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list)
print('\nsmallest distance in kernel space for each set of median graphs: ',
dis_ks_min_list)
print('\nnumber of updates of the best graph for each set of median graphs by IAM: ',
nb_updated_list)
print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ',
nb_updated_k_list)
# print('\nnumber of updates of the best graph for each set of median graphs by IAM: ',
# nb_updated_list)
# print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ',
# nb_updated_k_list)
print('\ntimes:', time_list)
###############################################################################
@@ -164,4 +832,11 @@ def test_iam_median_nb():
if __name__ == '__main__':
###############################################################################
# tests on different numbers of median-sets.
test_iam_median_nb()
# test_iam_median_nb()
# test_iam_letter_h()
test_iam_monoterpenoides()
# test_iam_mutag()
# test_iam_fitdistance()
# print("test log")

+ 30
- 16
preimage/test_preimage_iam.py View File

@@ -192,26 +192,42 @@ def test_preimage_iam_median_nb():
gkernel = 'marginalizedkernel'
lmbda = 0.03 # termination probability
r_max = 10 # iteration limit for pre-image.
r_max = 3 # iteration limit for pre-image.
# alpha_range = np.linspace(0.5, 0.5, 1)
k = 5 # k nearest neighbors
epsilon = 1e-6
InitIAMWithAllDk = True
# parameters for GED function
ged_cost='CHEM_1'
ged_method='IPFP'
saveGXL='gedlib'
# parameters for IAM function
c_ei=1
c_er=1
c_es=1
# c_vi = 0.037
# c_vr = 0.038
# c_vs = 0.075
# c_ei = 0.001
# c_er = 0.001
# c_es = 0.0
c_vi = 4
c_vr = 4
c_vs = 2
c_ei = 1
c_er = 1
c_es = 1
ite_max_iam = 50
epsilon_iam = 0.001
removeNodes = True
connected_iam = False
# parameters for GED function
# ged_cost='CHEM_1'
ged_cost = 'CONSTANT'
ged_method = 'IPFP'
edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
ged_stabilizer = 'min'
ged_repeat = 50
params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
'edit_cost_constant': edit_cost_constant,
'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
# number of graphs; we want to compute the median of these graphs.
nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
# nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
nb_median_range = [2]
# find out all the graphs classified to positive group 1.
idx_dict = get_same_item_indices(y_all)
@@ -274,8 +290,7 @@ def test_preimage_iam_median_nb():
params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es,
'ite_max': ite_max_iam, 'epsilon': epsilon_iam,
'removeNodes': removeNodes, 'connected': connected_iam},
params_ged={'ged_cost': ged_cost, 'ged_method': ged_method,
'saveGXL': saveGXL})
params_ged=params_ged)
time_total = time.time() - time0 + time_km
print('\ntime: ', time_total)
@@ -293,16 +308,15 @@ def test_preimage_iam_median_nb():
print('one of the possible corresponding pre-images is')
nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'),
with_labels=True)
# plt.show()
plt.savefig('results/preimage_iam/mutag_median_nb' + str(nb_median) +
'.png', format="PNG")
plt.show()
# plt.savefig('results/preimage_iam/mutag_median_cs.001_nb' + str(nb_median) +
# '.png', format="PNG")
plt.clf()
# print(ghat_list[0].nodes(data=True))
# print(ghat_list[0].edges(data=True))
# compute the corresponding sod in graph space.
sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, ged_cost=ged_cost,
ged_method=ged_method, saveGXL=saveGXL)
sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, params_ged=params_ged)
sod_gs_list.append(sod_tmp)
sod_gs_min_list.append(np.min(sod_tmp))
print('\nsmallest sod in graph space: ', np.min(sod_tmp))


+ 5
- 5
preimage/utils.py View File

@@ -39,13 +39,13 @@ def dis_gstar(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
return np.sqrt(term1 - term2 + term3)


def compute_kernel(Gn, graph_kernel, verbose):
def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose):
if graph_kernel == 'marginalizedkernel':
Kmatrix, _ = marginalizedkernel(Gn, node_label='atom', edge_label=None,
Kmatrix, _ = marginalizedkernel(Gn, node_label=node_label, edge_label=edge_label,
p_quit=0.03, n_iteration=10, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'untilhpathkernel':
Kmatrix, _ = untilhpathkernel(Gn, node_label='atom', edge_label=None,
Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label,
depth=10, k_func='MinMax', compute_method='trie',
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'spkernel':
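
With the widened signature, callers name the label attributes explicitly instead of relying on the hard-coded 'atom'. A hypothetical call for a MUTAG-style dataset, using the attribute names set by loadDataset:

km = compute_kernel(Gn, 'marginalizedkernel', 'atom', 'bond_type', True)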
@@ -77,10 +77,10 @@ def gram2distances(Kmatrix):
return dmatrix


def kernel_distance_matrix(Gn, Kmatrix=None, gkernel=None):
def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None, gkernel=None):
dis_mat = np.empty((len(Gn), len(Gn)))
if Kmatrix == None:
Kmatrix = compute_kernel(Gn, gkernel, True)
Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True)
for i in range(len(Gn)):
for j in range(i, len(Gn)):
dis = Kmatrix[i, i] + Kmatrix[j, j] - 2 * Kmatrix[i, j]


+ 118
- 48
pygraph/utils/graphfiles.py View File

@@ -1,9 +1,9 @@
""" Utilities function to manage graph files
"""
from os.path import dirname, splitext

def loadCT(filename):
"""load data from .ct file.
"""load data from a Chemical Table (.ct) file.

Notes
------
@@ -13,8 +13,11 @@ def loadCT(filename):
0.0000 0.0000 0.0000 C <- each line describes a node (x,y,z + label)
0.0000 0.0000 0.0000 C
0.0000 0.0000 0.0000 O
1 3 1 1 <- each line describes an edge : to, from,?, label
1 3 1 1 <- each line describes an edge : to, from, bond type, bond stereo
2 3 1 1
Check https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=10&ved=2ahUKEwivhaSdjsTlAhVhx4UKHczHA8gQFjAJegQIARAC&url=https%3A%2F%2Fwww.daylight.com%2Fmeetings%2Fmug05%2FKappler%2Fctfile.pdf&usg=AOvVaw1cDNrrmMClkFPqodlF2inS
for a detailed format description.
"""
import networkx as nx
from os.path import basename
@@ -35,22 +38,15 @@ def loadCT(filename):
for i in range(0, nb_nodes):
tmp = content[i + 2].split(" ")
tmp = [x for x in tmp if x != '']
g.add_node(i, atom=tmp[3], label=tmp[3])
g.add_node(i, atom=tmp[3].strip(),
label=[item.strip() for item in tmp[3:]],
attributes=[item.strip() for item in tmp[0:3]])
for i in range(0, nb_edges):
tmp = content[i + g.number_of_nodes() + 2].split(" ")
tmp = [x for x in tmp if x != '']
g.add_edge(
int(tmp[0]) - 1,
int(tmp[1]) - 1,
bond_type=tmp[3].strip(),
label=tmp[3].strip())


# for i in range(0, nb_edges):
# tmp = content[i + g.number_of_nodes() + 2]
# tmp = [tmp[i:i+3] for i in range(0, len(tmp), 3)]
# g.add_edge(int(tmp[0]) - 1, int(tmp[1]) - 1,
# bond_type=tmp[3].strip(), label=tmp[3].strip())
g.add_edge(int(tmp[0]) - 1, int(tmp[1]) - 1,
bond_type=tmp[2].strip(),
label=[item.strip() for item in tmp[2:]])
return g
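
With the changes above, each node now keeps its symbol under both atom and label and its x/y/z coordinates under attributes, while each edge keeps the bond type under bond_type and the remaining fields under label. For the sample file in the docstring, roughly (hypothetical path):

g = loadCT('sample.ct')
print(g.nodes(data=True))
# e.g. (0, {'atom': 'C', 'label': ['C'], 'attributes': ['0.0000', '0.0000', '0.0000']})
print(g.edges(data=True))
# e.g. (0, 2, {'bond_type': '1', 'label': ['1', '1']})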


@@ -71,6 +67,7 @@ def loadGXL(filename):
labels[attr.attrib['name']] = attr[0].text
if 'chem' in labels:
labels['label'] = labels['chem']
labels['atom'] = labels['chem']
g.add_node(index, **labels)
index += 1

@@ -80,6 +77,7 @@ def loadGXL(filename):
labels[attr.attrib['name']] = attr[0].text
if 'valence' in labels:
labels['label'] = labels['valence']
labels['bond_type'] = labels['valence']
g.add_edge(dic[edge.attrib['from']], dic[edge.attrib['to']], **labels)
return g
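
The two added lines alias the GXL attribute names chem and valence to the atom and bond_type keys used throughout the pre-image code. A hypothetical .gxl fragment these branches would match:

<node id="_1">
    <attr name="chem"><int>6</int></attr>
</node>
<edge from="_1" to="_2">
    <attr name="valence"><int>1</int></attr>
</edge>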

@@ -392,7 +390,7 @@ def loadDataset(filename, filename_y=None, extra_params=None):
Notes
-----
This function supports following graph dataset formats:
'ds': load data from .ct file. See comments of function loadCT for a example.
'ds': load data from .ds file. See comments of function loadFromDS for an example.
'cxl': load data from Graph eXchange Language file (.cxl file). See
http://www.gupro.de/GXL/Introduction/background.html, 2019 for detail.
'sdf': load data from structured data file (.sdf file). See
@@ -406,45 +404,24 @@ def loadDataset(filename, filename_y=None, extra_params=None):
2019 for details. Note here filename is the name of either .txt file in
the dataset directory.
"""
from os.path import dirname, splitext

dirname_dataset = dirname(filename)
extension = splitext(filename)[1][1:]
data = []
y = []
if extension == "ds":
content = open(filename).read().splitlines()
if filename_y is None or filename_y == '':
for i in range(0, len(content)):
tmp = content[i].split(' ')
# remove the '#'s in file names
data.append(
loadCT(dirname_dataset + '/' + tmp[0].replace('#', '', 1)))
y.append(float(tmp[1]))
else: # y in a seperate file
for i in range(0, len(content)):
tmp = content[i]
# remove the '#'s in file names
data.append(
loadCT(dirname_dataset + '/' + tmp.replace('#', '', 1)))
content_y = open(filename_y).read().splitlines()
# assume entries in filename and filename_y have the same order.
for item in content_y:
tmp = item.split(' ')
# assume the 3rd entry in a line is y (for Alkane dataset)
y.append(float(tmp[2]))
data, y = loadFromDS(filename, filename_y)
elif extension == "cxl":
import xml.etree.ElementTree as ET

dirname_dataset = dirname(filename)
tree = ET.parse(filename)
root = tree.getroot()
data = []
y = []
for graph in root.iter('print'):
for graph in root.iter('graph'):
mol_filename = graph.attrib['file']
mol_class = graph.attrib['class']
data.append(loadGXL(dirname_dataset + '/' + mol_filename))
y.append(mol_class)
elif extension == 'xml':
data, y = loadFromXML(filename, extra_params)
elif extension == "sdf":
import numpy as np
from tqdm import tqdm
@@ -471,6 +448,7 @@ def loadDataset(filename, filename_y=None, extra_params=None):
elif extension == "mat":
data, y = loadMAT(filename, extra_params)
elif extension == 'txt':
dirname_dataset = dirname(filename)
data, y = loadTXT(dirname_dataset)
# print(len(y))
# print(y)
@@ -485,6 +463,75 @@ def loadDataset(filename, filename_y=None, extra_params=None):
return data, y


def loadFromXML(filename, extra_params):
import xml.etree.ElementTree as ET
dirname_dataset = dirname(filename)
tree = ET.parse(filename)
root = tree.getroot()
data = []
y = []
for graph in root.iter('print'):
mol_filename = graph.attrib['file']
mol_class = graph.attrib['class']
data.append(loadGXL(dirname_dataset + '/' + mol_filename))
y.append(mol_class)
return data, y
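
loadFromXML expects an index file whose print elements reference .gxl graphs relative to the index file's directory. A hypothetical example (the root element's name is not checked):

<?xml version="1.0"?>
<GraphCollection>
    <print file="graph_0.gxl" class="1"/>
    <print file="graph_1.gxl" class="0"/>
</GraphCollection>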

def loadFromDS(filename, filename_y):
"""Load data from .ds file.
Possible graph formats include:
'.ct': see function loadCT for details.
'.gxl': see function loadGXL for details.
Note that the graph format is detected automatically from the extension of
each graph file.
"""
dirname_dataset = dirname(filename)
data = []
y = []
content = open(filename).read().splitlines()
extension = splitext(content[0].split(' ')[0])[1][1:]
if filename_y is None or filename_y == '':
if extension == 'ct':
for i in range(0, len(content)):
tmp = content[i].split(' ')
# remove the '#'s in file names
data.append(
loadCT(dirname_dataset + '/' + tmp[0].replace('#', '', 1)))
y.append(float(tmp[1]))
elif extension == 'gxl':
for i in range(0, len(content)):
tmp = content[i].split(' ')
# remove the '#'s in file names
data.append(
loadGXL(dirname_dataset + '/' + tmp[0].replace('#', '', 1)))
y.append(float(tmp[1]))
else: # y in a separate file
if extension == 'ct':
for i in range(0, len(content)):
tmp = content[i]
# remove the '#'s in file names
data.append(
loadCT(dirname_dataset + '/' + tmp.replace('#', '', 1)))
elif extension == 'gxl':
for i in range(0, len(content)):
tmp = content[i]
# remove the '#'s in file names
data.append(
loadGXL(dirname_dataset + '/' + tmp.replace('#', '', 1)))
content_y = open(filename_y).read().splitlines()
# assume entries in filename and filename_y have the same order.
for item in content_y:
tmp = item.split(' ')
# assume the 3rd entry in a line is y (for Alkane dataset)
y.append(float(tmp[2]))
return data, y
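
Two hypothetical .ds layouts handled above: targets inline after each file name, or file names only with targets read from the third column of filename_y (the Alkane convention noted in the comment). For example:

dataset.ds with inline targets, one "<graph file> <y>" per line:
    molecule_1.ct 3.75
    molecule_2.ct 4.12

dataset.ds with a separate target file, one graph file per line (a single
leading '#' in a name is stripped by the loader):
    molecule_1.gxl
    molecule_2.gxl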

def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None):
"""Save list of graphs.
"""
@@ -509,7 +556,30 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None
if __name__ == '__main__':
ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat',
'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb
Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
saveDataset(Gn, y, group='xml', filename='temp/temp')
# ### Load dataset from .ds file.
# # .ct files.
ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds',
'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'}
Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y'])
# ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb
# Gn, y = loadDataset(ds['dataset'])
# ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb
# Gn, y = loadDataset(ds['dataset'])
# ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled
# Gn, y = loadDataset(ds['dataset'])
print(Gn[1].nodes(data=True))
print(Gn[1].edges(data=True))
print(y[1])
# # .gxl file.
# ds = {'name': 'monoterpenoides',
# 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
# Gn, y = loadDataset(ds['dataset'])
# print(Gn[1].nodes(data=True))
# print(Gn[1].edges(data=True))
# print(y[1])
# ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb
# Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# saveDataset(Gn, y, group='xml', filename='temp/temp')
