
Update pre-image.

v0.1
jajupmochi, 5 years ago
parent commit 57e13c9c5d
11 changed files with 6214 additions and 1002 deletions
1. notebooks/run_untilhpathkernel.py (+5 −3)
2. notebooks/utils/plot_all_graphs.ipynb (+4191 −491)
3. preimage/gk_iam.py (+340 −38)
4. preimage/iam.py (+493 −368)
5. preimage/median.py (+29 −29)
6. preimage/preimage.py (+152 −17)
7. preimage/run_gk_iam.py (+298 −17)
8. preimage/test.py (+11 −11)
9. preimage/test_random_mutag.py (+599 −0)
10. pygraph/kernels/untilHPathKernel.py (+88 −20)
11. pygraph/utils/graphfiles.py (+8 −8)

notebooks/run_untilhpathkernel.py (+5 −3)

@@ -54,9 +54,11 @@ dslist = [
#    {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
estimator = untilhpathkernel
-param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2],
-'k_func': ['MinMax'], # ['MinMax', 'tanimoto'],
-'compute_method': ['trie']} # ['MinMax']}
+param_grid_precomputed = {'depth': np.linspace(3, 10, 8), # [2],
+'k_func': [None]} # ['MinMax', 'tanimoto'],
+#param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2],
+#'k_func': ['MinMax'], # ['MinMax', 'tanimoto'],
+#'compute_method': ['trie']} # ['MinMax']}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]
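As an aside, each combination in param_grid_precomputed corresponds to one Gram matrix to precompute. A minimal sketch of that expansion, assuming sklearn's ParameterGrid purely for illustration (the repository's own model-selection helper may enumerate combinations differently):

import numpy as np
from sklearn.model_selection import ParameterGrid

# Illustrative only: each emitted dict is one kernel configuration for which
# model selection would precompute a Gram matrix.
param_grid_precomputed = {'depth': np.linspace(3, 10, 8), 'k_func': [None]}
for params in ParameterGrid(param_grid_precomputed):
    print(params)   # e.g. {'depth': 3.0, 'k_func': None}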




notebooks/utils/plot_all_graphs.ipynb (+4191 −491)
File diff suppressed because it is too large.


preimage/gk_iam.py (+340 −38)

@@ -17,8 +17,11 @@ import multiprocessing
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt
+import random

-from iam import iam, test_iam_with_more_graphs_as_init, test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations
-import matplotlib.pyplot as plt
+from iam import iam, test_iam_with_more_graphs_as_init, iam_moreGraphsAsInit_tryAllPossibleBestGraphs
sys.path.insert(0, "../")
from pygraph.kernels.marginalizedKernel import marginalizedkernel
from pygraph.kernels.untilHPathKernel import untilhpathkernel
@@ -67,7 +70,7 @@ def gk_iam(Gn, alpha):
# Gs_nearest = Gk + gihat_list
# g_tmp = iam(Gs_nearest)
# #
-# # compute distance between phi and the new generated graph.
+# # compute distance between \psi and the new generated graph.
# knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None,
# p_quit=lmbda, n_iteration=20, remove_totters=False,
# n_jobs=multiprocessing.cpu_count(), verbose=False)
@@ -142,7 +145,7 @@ def gk_iam_nearest(Gn, alpha, idx_gi, Kmatrix, k, r_max):
print(g_tmp.nodes(data=True))
print(g_tmp.edges(data=True))
-# compute distance between phi and the new generated graph.
+# compute distance between \psi and the new generated graph.
gi_list = [Gn[i] for i in idx_gi]
knew = compute_kernel([g_tmp] + gi_list, 'untilhpathkernel', False)
dnew = dis_gstar(0, range(1, len(gi_list) + 1), alpha, knew)
@@ -236,7 +239,7 @@ def gk_iam_nearest(Gn, alpha, idx_gi, Kmatrix, k, r_max):
# print(g.nodes(data=True))
# print(g.edges(data=True))
# #
-# # compute distance between phi and the new generated graphs.
+# # compute distance between \psi and the new generated graphs.
# gi_list = [Gn[i] for i in idx_gi]
# knew = compute_kernel(g_tmp_list + gi_list, 'marginalizedkernel', False)
# dnew_list = []
@@ -278,7 +281,12 @@ def gk_iam_nearest(Gn, alpha, idx_gi, Kmatrix, k, r_max):




def gk_iam_nearest_multi(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max,
-gkernel, c_ei=1, c_er=1, c_es=1, epsilon=0.001):
+gkernel, epsilon=0.001,
+params_iam={'c_ei': 1, 'c_er': 1, 'c_es': 1,
+'ite_max': 50, 'epsilon': 0.001,
+'removeNodes': True, 'connected': False},
+params_ged={'ged_cost': 'CHEM_1', 'ged_method': 'IPFP',
+'saveGXL': 'benoit'}):
"""This function constructs graph pre-image by the iterative pre-image
framework in reference [1], algorithm 1, where the step of generating new
graphs randomly is replaced by the IAM algorithm in reference [2].
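Before the diff continues, a self-contained schematic of the accept/retry loop this function implements (algorithm 1 of [1] with IAM as the generator). Everything here is an illustrative stand-in: plain numbers play the role of graphs, and abs plays the role of the kernel-space distance:

def preimage_loop_sketch(dist_fn, generate_fn, pool, dhat, k=3, r_max=10, epsilon=1e-3):
    r = 0
    while r < r_max:
        candidates = generate_fn(pool)              # stands in for one IAM run
        dnew = min(dist_fn(c) for c in candidates)
        if dnew < dhat and abs(dnew - dhat) >= epsilon:
            dhat = dnew                             # strictly better: accept,
            r = 0                                   # and reset the retry budget
        else:
            r += 1                                  # no real improvement
        pool = sorted(pool + candidates, key=dist_fn)[:k]   # new k nearest
    return dhat, pool

best, pool = preimage_loop_sketch(
    dist_fn=abs,                                    # "distance" of a number to 0
    generate_fn=lambda p: [min(p, key=abs) * 0.9],  # fake generator: shrink best
    pool=[3.0, 4.0, 5.0], dhat=3.0)
print(best)   # shrinks toward 0 until improvements fall below epsilon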
@@ -310,7 +318,7 @@ def gk_iam_nearest_multi(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max,
g0hat_list = [Gn_init[idx] for idx in sort_idx[0:nb_best]] # the nearest neighbors of phi in DN
if dis_gs[0] == 0: # the exact pre-image.
print('The exact pre-image is found from the input dataset.')
-return 0, g0hat_list
+return 0, g0hat_list, 0, 0
dhat = dis_gs[0] # the nearest distance
ghat_list = [g.copy() for g in g0hat_list]
# for g in ghat_list:
@@ -320,31 +328,33 @@ def gk_iam_nearest_multi(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max,
# print(g.nodes(data=True))
# print(g.edges(data=True))
Gk = [Gn_init[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors
-# for gi in Gk:
-## nx.draw_networkx(gi)
-## plt.show()
+for gi in Gk:
+nx.draw(gi, labels=nx.get_node_attributes(gi, 'atom'), with_labels=True)
+# nx.draw_networkx(gi)
+plt.show()
# draw_Letter_graph(g)
-# print(gi.nodes(data=True))
-# print(gi.edges(data=True))
-Gs_nearest = Gk.copy()
+print(gi.nodes(data=True))
+print(gi.edges(data=True))
+Gs_nearest = [g.copy() for g in Gk]
+Gn_nearest_median = [g.copy() for g in Gs_nearest]
# gihat_list = []
# i = 1
r = 0
-itr = 0
-# cur_sod = dhat
-# old_sod = cur_sod * 2
-sod_list = [dhat]
+itr_total = 0
+# cur_dis = dhat
+# old_dis = cur_dis * 2
+dis_list = [dhat]
found = False
nb_updated = 0
-while r < r_max:# and not found: # @todo: if not found?# and np.abs(old_sod - cur_sod) > epsilon:
-print('\nr =', r)
-print('itr for gk =', itr, '\n')
+while r < r_max:# and not found: # @todo: if not found?# and np.abs(old_dis - cur_dis) > epsilon:
+print('\nCurrent preimage iteration =', r)
+print('Total preimage iteration =', itr_total, '\n')
found = False
# Gs_nearest = Gk + gihat_list
# g_tmp = iam(Gs_nearest)
-g_tmp_list, _ = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
-Gn_median, Gs_nearest, c_ei=c_ei, c_er=c_er, c_es=c_es)
+g_tmp_list, _ = iam_moreGraphsAsInit_tryAllPossibleBestGraphs(
+Gn_nearest_median, Gs_nearest, params_ged=params_ged, **params_iam)
# for g in g_tmp_list:
# nx.draw_networkx(g)
# plt.show()
@@ -352,31 +362,73 @@ def gk_iam_nearest_multi(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max,
# print(g.nodes(data=True))
# print(g.edges(data=True))
-# compute distance between phi and the new generated graphs.
+# compute distance between \psi and the new generated graphs.
knew = compute_kernel(g_tmp_list + Gn_median, gkernel, False)
dnew_list = []
for idx, g_tmp in enumerate(g_tmp_list):
+# @todo: the term3 below could use the one at the beginning of the function.
dnew_list.append(dis_gstar(idx, range(len(g_tmp_list),
-len(g_tmp_list) + len(Gn_median) + 1), alpha, knew,
-withterm3=False))
+len(g_tmp_list) + len(Gn_median) + 1),
+alpha, knew, withterm3=False))
# dnew = knew[0, 0] - 2 * (alpha[0] * knew[0, 1] + alpha[1] *
# knew[0, 2]) + (alpha[0] * alpha[0] * k_list[0] + alpha[0] *
# alpha[1] * k_g2_list[0] + alpha[1] * alpha[0] *
# k_g1_list[1] + alpha[1] * alpha[1] * k_list[1])
+# # find the new k nearest graphs.
+# dnew_best = min(dnew_list)
+# dis_gs = dnew_list + dis_gs # add the new nearest distances.
+# Gs_nearest = [g.copy() for g in g_tmp_list] + Gs_nearest # add the corresponding graphs.
+# sort_idx = np.argsort(dis_gs)
+# if len([i for i in sort_idx[0:k] if i < len(dnew_list)]) > 0:
+# print('We got new k nearest neighbors! Hurray!')
+# dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]] # the new k nearest distances.
+## print(dis_gs[-1])
+# Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
+# nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist())
+# if dnew_best < dhat and np.abs(dnew_best - dhat) > epsilon:
+# print('I have smaller distance!')
+# print(str(dhat) + '->' + str(dis_gs[0]))
+# dhat = dis_gs[0]
+# idx_best_list = np.argwhere(dnew_list == dhat).flatten().tolist()
+# ghat_list = [g_tmp_list[idx].copy() for idx in idx_best_list]
+## for g in ghat_list:
+### nx.draw_networkx(g)
+### plt.show()
+## draw_Letter_graph(g)
+## print(g.nodes(data=True))
+## print(g.edges(data=True))
+# r = 0
+# found = True
+# nb_updated += 1
+# elif np.abs(dnew_best - dhat) < epsilon:
+# print('I have almost equal distance!')
+# print(str(dhat) + '->' + str(dnew_best))
+# else:
+# dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]]
+# Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
+# Gn_nearest_median = [g.copy() for g in Gs_nearest]
+# if not found:
+# r += 1
# find the new k nearest graphs.
dnew_best = min(dnew_list)
-dis_gs = dnew_list + dis_gs # add the new nearest distances.
-Gs_nearest = [g.copy() for g in g_tmp_list] + Gs_nearest # add the corresponding graphs.
-sort_idx = np.argsort(dis_gs)
+if np.abs(dnew_best - dhat) >= epsilon:
+dis_gs = dnew_list + dis_gs # add the new nearest distances.
+Gs_nearest = [g.copy() for g in g_tmp_list] + Gs_nearest # add the corresponding graphs.
+sort_idx = np.argsort(dis_gs)
+else: # if the new distance is equal to the old one.
+# @todo: works if only one graph is generated.
+Gs_nearest[0] = g_tmp_list[0].copy()
+sort_idx = np.argsort(dis_gs)
if len([i for i in sort_idx[0:k] if i < len(dnew_list)]) > 0:
-print('We got better k nearest neighbors! Hurray!')
+print('We got new k nearest neighbors! Hurray!')
dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]] # the new k nearest distances.
-print(dis_gs[-1])
+# print(dis_gs[-1])
Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist())
-if dnew_best < dhat and np.abs(dnew_best - dhat) > epsilon:
+if dnew_best < dhat and np.abs(dnew_best - dhat) >= epsilon:
print('I have smaller distance!')
print(str(dhat) + '->' + str(dis_gs[0]))
dhat = dis_gs[0]
@@ -394,19 +446,269 @@ def gk_iam_nearest_multi(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max,
elif np.abs(dnew_best - dhat) < epsilon:
print('I have almost equal distance!')
print(str(dhat) + '->' + str(dnew_best))
+else:
+dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]]
+Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
+Gn_nearest_median = [g.copy() for g in Gs_nearest]
if not found:
r += 1
-# old_sod = cur_sod
-# cur_sod = dnew_best
-sod_list.append(dhat)
-itr += 1
+# old_dis = cur_dis
+# cur_dis = dnew_best
+dis_list.append(dhat)
+itr_total += 1
print('\nthe graph is updated', nb_updated, 'times.')
-print('sods in kernel space:', sod_list, '\n')
-return dhat, ghat_list
+print('distances in kernel space:', dis_list, '\n')
+return dhat, ghat_list, dis_list[-1], nb_updated
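To make that bookkeeping concrete, here is a small self-contained helper (illustrative, not part of the commit) that merges n freshly generated candidates into the k-nearest pool the same way dis_gs and Gs_nearest are updated above; merge_k_nearest and the toy values are assumptions:

import numpy as np

def merge_k_nearest(dists_old, items_old, dists_new, items_new, k):
    # prepend the n new candidates, exactly like dis_gs / Gs_nearest above
    dists = list(dists_new) + list(dists_old)
    items = list(items_new) + list(items_old)
    sort_idx = np.argsort(dists)
    # an index < len(dists_new) in the top k means a generated graph survived
    got_new = any(i < len(dists_new) for i in sort_idx[:k])
    return ([dists[i] for i in sort_idx[:k]],
            [items[i] for i in sort_idx[:k]],
            got_new)

d, g, new_in = merge_k_nearest([0.4, 0.7], ['g1', 'g2'], [0.5], ['g_new'], k=2)
print(d, g, new_in)   # [0.4, 0.5] ['g1', 'g_new'] True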



def preimage_iam_random_mix(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max,
l_max, gkernel, epsilon=0.001,
params_iam={'c_ei': 1, 'c_er': 1, 'c_es': 1,
'ite_max': 50, 'epsilon': 0.001,
'removeNodes': True, 'connected': False},
params_ged={'ged_cost': 'CHEM_1', 'ged_method': 'IPFP',
'saveGXL': 'benoit'}):
"""This function constructs graph pre-image by the iterative pre-image
framework in reference [1], algorithm 1, where new graphs are generated
randomly and by the IAM algorithm in reference [2].
Notes
-----
Each time a set of n better graphs is acquired, their kernel-space distances
are compared with those of the current k nearest graphs, and the k smallest
of the resulting k + n distances are kept as the new k nearest.
"""
Gn_init = [nx.convert_node_labels_to_integers(g) for g in Gn_init]
# compute k nearest neighbors of phi in DN.
dis_list = [] # distance between g_star and each graph.
term3 = 0
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout):
dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
dis_list.append(dtemp)
# sort
sort_idx = np.argsort(dis_list)
dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] # the k shortest distances
nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist())
g0hat_list = [Gn_init[idx] for idx in sort_idx[0:nb_best]] # the nearest neighbors of phi in DN
if dis_gs[0] == 0: # the exact pre-image.
print('The exact pre-image is found from the input dataset.')
return 0, g0hat_list, 0, 0
dhat = dis_gs[0] # the nearest distance
ghat_list = [g.copy() for g in g0hat_list]
# for g in ghat_list:
# draw_Letter_graph(g)
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
Gk = [Gn_init[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors
for gi in Gk:
nx.draw(gi, labels=nx.get_node_attributes(gi, 'atom'), with_labels=True)
# nx.draw_networkx(gi)
plt.show()
# draw_Letter_graph(g)
print(gi.nodes(data=True))
print(gi.edges(data=True))
Gs_nearest = [g.copy() for g in Gk]
Gn_nearest_median = [g.copy() for g in Gs_nearest]
# gihat_list = []
# i = 1
r = 0
itr_total = 0
# cur_dis = dhat
# old_dis = cur_dis * 2
dis_list = [dhat]
found = False
nb_updated_iam = 0
nb_updated_random = 0
while r < r_max: # and not found: # @todo: if not found?# and np.abs(old_dis - cur_dis) > epsilon:
print('\n-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-')
print('Current preimage iteration =', r)
print('Total preimage iteration =', itr_total, '\n')
found = False
# Gs_nearest = Gk + gihat_list
# g_tmp = iam(Gs_nearest)
g_tmp_list, _ = iam_moreGraphsAsInit_tryAllPossibleBestGraphs(
Gn_nearest_median, Gs_nearest, params_ged=params_ged, **params_iam)
# for g in g_tmp_list:
# nx.draw_networkx(g)
# plt.show()
# draw_Letter_graph(g)
# print(g.nodes(data=True))
# print(g.edges(data=True))
# compute distance between \psi and the new generated graphs.
knew = compute_kernel(g_tmp_list + Gn_median, gkernel, False)
dnew_list = []
for idx, g_tmp in enumerate(g_tmp_list):
# @todo: the term3 below could use the one at the beginning of the function.
dnew_list.append(dis_gstar(idx, range(len(g_tmp_list),
len(g_tmp_list) + len(Gn_median) + 1),
alpha, knew, withterm3=False))
# find the new k nearest graphs.
# @todo: for now only consider the situation when only one graph is generated by IAM.
dnew_best = min(dnew_list)
gnew_best = g_tmp_list[0].copy()
# when new distance is equal to the old one, use random generation.
if np.abs(dnew_best - dhat) < epsilon or dhat < dnew_best:
# Gs_nearest[0] = g_tmp_list[0].copy()
# sort_idx = np.argsort(dis_gs)
print('Distance almost equal or worse, switching to random generation now.')
print(str(dhat) + '->' + str(dnew_best))
if dnew_best > dhat and np.abs(dnew_best - dhat) >= epsilon:
dnew_best = dhat
gnew_best = Gs_nearest[0].copy()
# number of edges to be changed.
# @todo: what if the log is negative? how to choose alpha (scalar)? seems fdgs is always 1.
# fdgs = dnew_best
fdgs = nb_updated_random + 1
if fdgs < 1:
fdgs = 1
fdgs = int(np.ceil(np.log(fdgs)))
if fdgs < 1:
fdgs += 1
# fdgs = nb_updated_random + 1 # @todo:
# @todo: should we use just half of the adjacency matrix for undirected graphs?
nb_vpairs = nx.number_of_nodes(gnew_best) * (nx.number_of_nodes(gnew_best) - 1)
l = 0
while l < l_max:
# add and delete edges.
gtemp = gnew_best.copy()
np.random.seed()
# which edges to change.
# @todo: what if fdgs is bigger than nb_vpairs?
idx_change = random.sample(range(nb_vpairs), fdgs if
fdgs < nb_vpairs else nb_vpairs)
# idx_change = np.random.randint(0, nx.number_of_nodes(gs) *
# (nx.number_of_nodes(gs) - 1), fdgs)
for item in idx_change:
node1 = int(item / (nx.number_of_nodes(gtemp) - 1))
node2 = (item - node1 * (nx.number_of_nodes(gtemp) - 1))
if node2 >= node1: # skip the self pair.
node2 += 1
# @todo: is the randomness correct?
if not gtemp.has_edge(node1, node2):
gtemp.add_edge(node1, node2)
# nx.draw_networkx(gs)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
else:
gtemp.remove_edge(node1, node2)
# nx.draw_networkx(gs)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
# compute distance between \psi and the new generated graph.
knew = compute_kernel([gtemp] + Gn_median, gkernel, verbose=False)
dnew = dis_gstar(0, [1, 2], alpha, knew, withterm3=False)
# @todo: the new distance is smaller or also equal?
if dnew < dnew_best or np.abs(dnew_best - dnew) < epsilon:
if np.abs(dnew_best - dnew) < epsilon:
print('I am equal!')
dnew_best = dnew
gnew_best = gtemp.copy()
else:
print('\nI am smaller!')
print('l =', str(l))
print(dnew_best, '->', dnew)
dis_gs = [dnew] + dis_gs # add the new nearest distances.
Gs_nearest = [gtemp.copy()] + Gs_nearest # add the corresponding graphs.
sort_idx = np.argsort(dis_gs)
dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]] # the new k nearest distances.
Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
Gn_nearest_median = [g.copy() for g in Gs_nearest]
dhat = dnew
nb_updated_random += 1
found = True # found better graph.
r = 0
print('the graph is updated by random generation',
nb_updated_random, 'times.')
nx.draw(gtemp, labels=nx.get_node_attributes(gtemp, 'atom'),
with_labels=True)
## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG")
plt.show()
break
# nx.draw_networkx(gtemp)
# plt.show()
# print(gtemp.nodes(data=True))
# print(gtemp.edges(data=True))
l += 1
if l == l_max:
r += 1
else: # if the new distance is not equal to the old one.
dis_gs = dnew_list + dis_gs # add the new nearest distances.
Gs_nearest = [nx.convert_node_labels_to_integers(g).copy() for g
in g_tmp_list] + Gs_nearest # add the corresponding graphs.
sort_idx = np.argsort(dis_gs)
if len([i for i in sort_idx[0:k] if i < len(dnew_list)]) > 0:
print('We got new k nearest neighbors! Hurray!')
dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]] # the new k nearest distances.
# print(dis_gs[-1])
Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist())
if dnew_best < dhat:
print('I have smaller distance!')
print(str(dhat) + '->' + str(dis_gs[0]))
dhat = dis_gs[0]
idx_best_list = np.argwhere(dnew_list == dhat).flatten().tolist()
ghat_list = [g_tmp_list[idx].copy() for idx in idx_best_list]
# for g in ghat_list:
## nx.draw_networkx(g)
## plt.show()
# draw_Letter_graph(g)
# print(g.nodes(data=True))
# print(g.edges(data=True))
r = 0
found = True
nb_updated_iam += 1
print('the graph is updated by IAM', nb_updated_iam, 'times.')
nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'),
with_labels=True)
## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG")
plt.show()
else:
dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]]
Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
Gn_nearest_median = [g.copy() for g in Gs_nearest]
if not found:
r += 1
# old_dis = cur_dis
# cur_dis = dnew_best
dis_list.append(dhat)
itr_total += 1
print('\nthe k shortest distances are', dis_gs)
print('the shortest distances for previous iterations are', dis_list)
print('\nthe graph is updated by IAM', nb_updated_iam, 'times, and by random generation',
nb_updated_random, 'times.')
print('distances in kernel space:', dis_list, '\n')
return dhat, ghat_list, dis_list[-1], nb_updated_iam, nb_updated_random
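The random-generation branch above encodes a candidate edge flip as a flat index over ordered vertex pairs. A self-contained sketch of that decode (mirroring the node1/node2 arithmetic in the loop; the toy graph is an assumption):

import networkx as nx

def decode_pair(item, n):
    # flat index in [0, n*(n-1)) -> ordered pair (node1, node2), node1 != node2
    node1 = item // (n - 1)
    node2 = item - node1 * (n - 1)
    if node2 >= node1:   # skip the self pair, as in the loop above
        node2 += 1
    return node1, node2

g = nx.path_graph(4)                      # toy graph, nodes 0..3
n = nx.number_of_nodes(g)
pairs = [decode_pair(item, n) for item in range(n * (n - 1))]
print(pairs)                              # every ordered pair without (i, i)
assert len(set(pairs)) == n * (n - 1)     # the decode is a bijection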



###############################################################################
# useful functions.


def dis_gstar(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
term1 = Kmatrix[idx_g, idx_g]
@@ -424,10 +726,10 @@ def dis_gstar(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
def compute_kernel(Gn, graph_kernel, verbose):
if graph_kernel == 'marginalizedkernel':
Kmatrix, _ = marginalizedkernel(Gn, node_label='atom', edge_label=None,
-p_quit=0.03, n_iteration=20, remove_totters=False,
+p_quit=0.03, n_iteration=10, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'untilhpathkernel':
-Kmatrix, _ = untilhpathkernel(Gn, node_label='atom', edge_label='bond_type',
+Kmatrix, _ = untilhpathkernel(Gn, node_label='atom', edge_label=None,
depth=10, k_func='MinMax', compute_method='trie',
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'spkernel':
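For reference, dis_gstar evaluates the kernel-space distance between a candidate graph and the weighted sum of the median graphs. A minimal numeric sketch, assuming the usual expansion d(g)^2 = k(g,g) - 2*sum_i alpha_i*k(g,g_i) + sum_{i,j} alpha_i*alpha_j*k(g_i,g_j); the Gram matrix and weights below are made up, and the term1/term2/term3 split mirrors the function:

import numpy as np

K = np.array([[1.0, 0.6, 0.4],
              [0.6, 1.0, 0.5],
              [0.4, 0.5, 1.0]])   # toy Gram matrix: row 0 is the candidate g
alpha = [0.5, 0.5]                # weights of the two median graphs
term1 = K[0, 0]
term2 = sum(a * K[0, 1 + i] for i, a in enumerate(alpha))
term3 = sum(a1 * a2 * K[1 + i1, 1 + i2]
            for i1, a1 in enumerate(alpha) for i2, a2 in enumerate(alpha))
print(np.sqrt(term1 - 2 * term2 + term3))   # kernel-space distance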


preimage/iam.py (+493 −368)

@@ -20,7 +20,424 @@ from pygraph.utils.graphdataset import get_dataset_attributes
from pygraph.utils.utils import graph_isIdentical, get_node_labels, get_edge_labels
#from pygraph.utils.utils import graph_deepcopy


def iam_moreGraphsAsInit_tryAllPossibleBestGraphs(Gn_median, Gn_candidate,
c_ei=3, c_er=3, c_es=1, ite_max=50, epsilon=0.001,
node_label='atom', edge_label='bond_type',
connected=False, removeNodes=True, AllBestInit=True,
params_ged={'ged_cost': 'CHEM_1', 'ged_method': 'IPFP', 'saveGXL': 'benoit'}):
"""See my name, then you know what I do.
"""
from tqdm import tqdm
# Gn_median = Gn_median[0:10]
# Gn_median = [nx.convert_node_labels_to_integers(g) for g in Gn_median]
if removeNodes:
node_ir = np.inf # corresponds to node removal and insertion.
label_r = 'thanksdanny' # the label for node removal. # @todo: make this label unrepeatable.
ds_attrs = get_dataset_attributes(Gn_median + Gn_candidate,
attr_names=['edge_labeled', 'node_attr_dim', 'edge_attr_dim'],
edge_label=edge_label)

def generate_graph(G, pi_p_forward, label_set):
G_new_list = [G.copy()] # all "best" graphs generated in this iteration.
# nx.draw_networkx(G)
# import matplotlib.pyplot as plt
# plt.show()
# print(pi_p_forward)
# update vertex labels.
# pre-compute h_i0 for each label.
# for label in get_node_labels(Gn, node_label):
# print(label)
# for nd in G.nodes(data=True):
# pass
if not ds_attrs['node_attr_dim']: # labels are symbolic
for ndi, (nd, _) in enumerate(G.nodes(data=True)):
h_i0_list = []
label_list = []
for label in label_set:
h_i0 = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][ndi]
if pi_i != node_ir and g.nodes[pi_i][node_label] == label:
h_i0 += 1
h_i0_list.append(h_i0)
label_list.append(label)
# case when the node is to be removed.
if removeNodes:
h_i0_remove = 0 # @todo: maybe this can be added to the label_set above.
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][ndi]
if pi_i == node_ir:
h_i0_remove += 1
h_i0_list.append(h_i0_remove)
label_list.append(label_r)
# get the best labels.
idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
nlabel_best = [label_list[idx] for idx in idx_max]
# generate "best" graphs with regard to "best" node labels.
G_new_list_nd = []
for g in G_new_list: # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now.
for nl in nlabel_best:
g_tmp = g.copy()
if nl == label_r:
g_tmp.remove_node(nd)
else:
g_tmp.nodes[nd][node_label] = nl
G_new_list_nd.append(g_tmp)
# nx.draw_networkx(g_tmp)
# import matplotlib.pyplot as plt
# plt.show()
# print(g_tmp.nodes(data=True))
# print(g_tmp.edges(data=True))
G_new_list = [ggg.copy() for ggg in G_new_list_nd]

else: # labels are non-symbolic
for ndi, (nd, _) in enumerate(G.nodes(data=True)):
Si_norm = 0
phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][ndi]
if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0?
Si_norm += 1
phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
phi_i_bar /= Si_norm
G_new_list[0].nodes[nd]['attributes'] = phi_i_bar
# for g in G_new_list:
# import matplotlib.pyplot as plt
# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# update edge labels and adjacency matrix.
if ds_attrs['edge_labeled']:
G_new_list_edge = []
for g_new in G_new_list:
nd_list = [n for n in g_new.nodes()]
g_tmp_list = [g_new.copy()]
for nd1i in range(nx.number_of_nodes(g_new)):
nd1 = nd_list[nd1i]# @todo: not just edges, but all pairs of nodes
for nd2i in range(nd1i + 1, nx.number_of_nodes(g_new)):
nd2 = nd_list[nd2i]
# for nd1, nd2, _ in g_new.edges(data=True):
h_ij0_list = []
label_list = []
# @todo: compute edge label set before.
for label in get_edge_labels(Gn_median, edge_label):
h_ij0 = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][nd1i]
pi_j = pi_p_forward[idx][nd2i]
h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
g.has_edge(pi_i, pi_j) and
g.edges[pi_i, pi_j][edge_label] == label)
h_ij0 += h_ij0_p
h_ij0_list.append(h_ij0)
label_list.append(label)
# # case when the edge is to be removed.
# h_ij0_remove = 0
# for idx, g in enumerate(Gn_median):
# pi_i = pi_p_forward[idx][nd1i]
# pi_j = pi_p_forward[idx][nd2i]
# if g.has_node(pi_i) and g.has_node(pi_j) and not
# g.has_edge(pi_i, pi_j):
# h_ij0_remove += 1
# h_ij0_list.append(h_ij0_remove)
# label_list.append(label_r)
# get the best labels.
# choose all best graphs.
idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
elabel_best = [label_list[idx] for idx in idx_max]
h_ij0_max = [h_ij0_list[idx] for idx in idx_max]
# generate "best" graphs with regard to "best" node labels.
G_new_list_ed = []
for g_tmp in g_tmp_list: # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now.
for idxl, el in enumerate(elabel_best):
g_tmp_copy = g_tmp.copy()
# check whether a_ij is 0 or 1.
sij_norm = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][nd1i]
pi_j = pi_p_forward[idx][nd2i]
if g.has_node(pi_i) and g.has_node(pi_j) and \
g.has_edge(pi_i, pi_j):
sij_norm += 1
if h_ij0_max[idxl] > len(Gn_median) * c_er / c_es + \
sij_norm * (1 - (c_er + c_ei) / c_es):
if not g_tmp_copy.has_edge(nd1, nd2):
g_tmp_copy.add_edge(nd1, nd2)
g_tmp_copy.edges[nd1, nd2][edge_label] = elabel_best[idxl]
else:
if g_tmp_copy.has_edge(nd1, nd2):
g_tmp_copy.remove_edge(nd1, nd2)
G_new_list_ed.append(g_tmp_copy)
g_tmp_list = [ggg.copy() for ggg in G_new_list_ed]
G_new_list_edge += g_tmp_list
G_new_list = [ggg.copy() for ggg in G_new_list_edge]
# # choose one of the best randomly.
# idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
# h_ij0_max = h_ij0_list[idx_max[0]]
# idx_rdm = random.randint(0, len(idx_max) - 1)
# best_label = label_list[idx_max[idx_rdm]]
#
# # check whether a_ij is 0 or 1.
# sij_norm = 0
# for idx, g in enumerate(Gn_median):
# pi_i = pi_p_forward[idx][nd1i]
# pi_j = pi_p_forward[idx][nd2i]
# if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
# sij_norm += 1
# if h_ij0_max > len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
# if not g_new.has_edge(nd1, nd2):
# g_new.add_edge(nd1, nd2)
# g_new.edges[nd1, nd2][edge_label] = best_label
# else:
# if g_new.has_edge(nd1, nd2):
# g_new.remove_edge(nd1, nd2)
else: # if edges are unlabeled
# @todo: is this even right? G or g_tmp? check if the new one is right
# @todo: works only for undirected graphs.
for g_tmp in G_new_list:
nd_list = [n for n in g_tmp.nodes()]
for nd1i in range(nx.number_of_nodes(g_tmp)):
nd1 = nd_list[nd1i]
for nd2i in range(nd1i + 1, nx.number_of_nodes(g_tmp)):
nd2 = nd_list[nd2i]
sij_norm = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][nd1i]
pi_j = pi_p_forward[idx][nd2i]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if sij_norm > len(Gn_median) * c_er / (c_er + c_ei):
# @todo: should we consider if nd1 and nd2 in g_tmp?
# or just add the edge anyway?
if g_tmp.has_node(nd1) and g_tmp.has_node(nd2) \
and not g_tmp.has_edge(nd1, nd2):
g_tmp.add_edge(nd1, nd2)
# else: # @todo: which to use?
elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei):
if g_tmp.has_edge(nd1, nd2):
g_tmp.remove_edge(nd1, nd2)
# do not change anything when equal.
# for i, g in enumerate(G_new_list):
# import matplotlib.pyplot as plt
# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG")
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# # find the best graph generated in this iteration and update pi_p.
# @todo: should we update all graphs generated or just the best ones?
dis_list, pi_forward_list = median_distance(G_new_list, Gn_median,
**params_ged)
# @todo: should we remove the identical and connectivity check?
# Don't know which is faster.
if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
G_new_list, idx_list = remove_duplicates(G_new_list)
pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
dis_list = [dis_list[idx] for idx in idx_list]
# if connected == True:
# G_new_list, idx_list = remove_disconnected(G_new_list)
# pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
# idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
# dis_min = dis_list[idx_min_tmp_list[0]]
# pi_forward_list = [pi_forward_list[idx] for idx in idx_min_list]
# G_new_list = [G_new_list[idx] for idx in idx_min_list]
# for g in G_new_list:
# import matplotlib.pyplot as plt
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
return G_new_list, pi_forward_list, dis_list
def best_median_graphs(Gn_candidate, pi_all_forward, dis_all):
idx_min_list = np.argwhere(dis_all == np.min(dis_all)).flatten().tolist()
dis_min = dis_all[idx_min_list[0]]
pi_forward_min_list = [pi_all_forward[idx] for idx in idx_min_list]
G_min_list = [Gn_candidate[idx] for idx in idx_min_list]
return G_min_list, pi_forward_min_list, dis_min
def iteration_proc(G, pi_p_forward, cur_sod):
G_list = [G]
pi_forward_list = [pi_p_forward]
old_sod = cur_sod * 2
sod_list = [cur_sod]
dis_list = [cur_sod]
# iterations.
itr = 0
# @todo: what if difference == 0?
# while itr < ite_max and (np.abs(old_sod - cur_sod) > epsilon or
# np.abs(old_sod - cur_sod) == 0):
while itr < ite_max and np.abs(old_sod - cur_sod) > epsilon:
# for itr in range(0, 5): # the convergence condition?
print('itr_iam is', itr)
G_new_list = []
pi_forward_new_list = []
dis_new_list = []
for idx, g in enumerate(G_list):
label_set = get_node_labels(Gn_median + [g], node_label)
G_tmp_list, pi_forward_tmp_list, dis_tmp_list = generate_graph(
g, pi_forward_list[idx], label_set)
G_new_list += G_tmp_list
pi_forward_new_list += pi_forward_tmp_list
dis_new_list += dis_tmp_list
# @todo: need to remove duplicates here?
G_list = [ggg.copy() for ggg in G_new_list]
pi_forward_list = [pitem.copy() for pitem in pi_forward_new_list]
dis_list = dis_new_list[:]
old_sod = cur_sod
cur_sod = np.min(dis_list)
sod_list.append(cur_sod)
itr += 1
# @todo: do we return all graphs or the best ones?
# get the best ones of the generated graphs.
G_list, pi_forward_list, dis_min = best_median_graphs(
G_list, pi_forward_list, dis_list)
if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
G_list, idx_list = remove_duplicates(G_list)
pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
# dis_list = [dis_list[idx] for idx in idx_list]
# import matplotlib.pyplot as plt
# for g in G_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
print('\nsods:', sod_list, '\n')
return G_list, pi_forward_list, dis_min
def remove_duplicates(Gn):
"""Remove duplicate graphs from list.
"""
Gn_new = []
idx_list = []
for idx, g in enumerate(Gn):
dupl = False
for g_new in Gn_new:
if graph_isIdentical(g_new, g):
dupl = True
break
if not dupl:
Gn_new.append(g)
idx_list.append(idx)
return Gn_new, idx_list
def remove_disconnected(Gn):
"""Remove disconnected graphs from list.
"""
Gn_new = []
idx_list = []
for idx, g in enumerate(Gn):
if nx.is_connected(g):
Gn_new.append(g)
idx_list.append(idx)
return Gn_new, idx_list

# phase 1: initialize.
# compute set-median.
dis_min = np.inf
dis_list, pi_forward_all = median_distance(Gn_candidate, Gn_median,
**params_ged)
# find all smallest distances.
if AllBestInit: # try all best init graphs.
idx_min_list = range(len(dis_list))
dis_min = dis_list
else:
idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
dis_min = [dis_list[idx_min_list[0]]] * len(idx_min_list)
# phase 2: iteration.
G_list = []
dis_list = []
pi_forward_list = []
for idx_tmp, idx_min in enumerate(idx_min_list):
# print('idx_min is', idx_min)
G = Gn_candidate[idx_min].copy()
# list of edit operations.
pi_p_forward = pi_forward_all[idx_min]
# pi_p_backward = pi_all_backward[idx_min]
Gi_list, pi_i_forward_list, dis_i_min = iteration_proc(G, pi_p_forward, dis_min[idx_tmp])
G_list += Gi_list
dis_list += [dis_i_min] * len(Gi_list)
pi_forward_list += pi_i_forward_list
if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
G_list, idx_list = remove_duplicates(G_list)
dis_list = [dis_list[idx] for idx in idx_list]
pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
if connected == True:
G_list_con, idx_list = remove_disconnected(G_list)
# if there are no connected graphs at all, keep the disconnected ones.
if len(G_list_con) > 0: # @todo: ??????????????????????????
G_list = G_list_con
dis_list = [dis_list[idx] for idx in idx_list]
pi_forward_list = [pi_forward_list[idx] for idx in idx_list]


# import matplotlib.pyplot as plt
# for g in G_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# get the best median graphs
# dis_list, pi_forward_list = median_distance(G_list, Gn_median,
# **params_ged)
G_min_list, pi_forward_min_list, dis_min = best_median_graphs(
G_list, pi_forward_list, dis_list)
# for g in G_min_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# randomly choose one graph.
idx_rdm = random.randint(0, len(G_min_list) - 1)
G_min_list = [G_min_list[idx_rdm]]
return G_min_list, dis_min
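The unlabeled-edge rule above (sij_norm vs. len(Gn_median) * c_er / (c_er + c_ei)) reduces to a majority vote when c_er == c_ei. A toy numeric check with assumed values:

c_ei, c_er = 3, 3                    # default insertion/removal costs above
N = 10                               # number of median graphs
threshold = N * c_er / (c_er + c_ei) # = N / 2 with the default costs
for sij_norm in (4, 5, 6):           # medians whose maps realize the edge
    if sij_norm > threshold:
        print(sij_norm, '-> add/keep the edge')
    elif sij_norm < threshold:
        print(sij_norm, '-> remove the edge')
    else:
        print(sij_norm, '-> leave the edge unchanged')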
















###############################################################################
def iam(Gn, c_ei=3, c_er=3, c_es=1, node_label='atom', edge_label='bond_type',
connected=True):
"""See my name, then you know what I do.
@@ -148,27 +565,42 @@ def iam(Gn, c_ei=3, c_er=3, c_es=1, node_label='atom', edge_label='bond_type',
return G


-def GED(g1, g2, lib='gedlib'):
+def GED(g1, g2, lib='gedlib', cost='CHEM_1', method='IPFP', saveGXL='benoit',
+stabilizer='min'):
"""
Compute GED.
"""
if lib == 'gedlib':
# transform dataset to the 'xml' file as the GedLib required.
-saveDataset([g1, g2], [None, None], group='xml', filename='ged_tmp/tmp')
-# script.appel()
+saveDataset([g1, g2], [None, None], group='xml', filename='ged_tmp/tmp',
+xparams={'method': saveGXL})
+# script.appel()
script.PyRestartEnv()
script.PyLoadGXLGraph('ged_tmp/', 'ged_tmp/tmp.xml')
listID = script.PyGetGraphIds()
-script.PySetEditCost("LETTER") #("CHEM_1")
+script.PySetEditCost(cost) #("CHEM_1")
script.PyInitEnv()
-script.PySetMethod("IPFP", "")
+script.PySetMethod(method, "")
script.PyInitMethod()
g = listID[0]
h = listID[1]
-script.PyRunMethod(g, h)
-pi_forward, pi_backward = script.PyGetAllMap(g, h)
-upper = script.PyGetUpperBound(g, h)
-lower = script.PyGetLowerBound(g, h)
+if stabilizer == None:
+script.PyRunMethod(g, h)
+pi_forward, pi_backward = script.PyGetAllMap(g, h)
+upper = script.PyGetUpperBound(g, h)
+lower = script.PyGetLowerBound(g, h)
+elif stabilizer == 'min':
+upper = np.inf
+for itr in range(50):
+script.PyRunMethod(g, h)
+upper_tmp = script.PyGetUpperBound(g, h)
+if upper_tmp < upper:
+upper = upper_tmp
+pi_forward, pi_backward = script.PyGetAllMap(g, h)
+lower = script.PyGetLowerBound(g, h)
+if upper == 0:
+break
dis = upper
# make the map label correct (label remove map as np.inf)
@@ -177,12 +609,13 @@ def GED(g1, g2, lib='gedlib'):
nb1 = nx.number_of_nodes(g1)
nb2 = nx.number_of_nodes(g2)
pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
return dis, pi_forward, pi_backward
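The new stabilizer='min' option exploits the randomness of IPFP by re-running it and keeping the tightest upper bound. A self-contained sketch of that pattern; run_once is a stand-in, since gedlib's script API is not imported here:

import random

def run_once():
    # stand-in for script.PyRunMethod + PyGetUpperBound (gedlib not imported)
    return random.uniform(2.0, 6.0)

def stabilized_upper_bound(repeats=50):
    upper = float('inf')
    for _ in range(repeats):
        upper_tmp = run_once()
        if upper_tmp < upper:
            upper = upper_tmp        # keep the tightest upper bound seen
        if upper == 0:
            break                    # cannot improve on an exact match
    return upper

print(stabilized_upper_bound())      # close to 2.0 after 50 random restarts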




-def median_distance(Gn, Gn_median, measure='ged', verbose=False):
+def median_distance(Gn, Gn_median, measure='ged', verbose=False,
+ged_cost='CHEM_1', ged_method='IPFP', saveGXL='benoit'):
dis_list = []
pi_forward_list = []
for idx, G in tqdm(enumerate(Gn), desc='computing median distances',
@@ -190,7 +623,8 @@ def median_distance(Gn, Gn_median, measure='ged', verbose=False):
dis_sum = 0
pi_forward_list.append([])
for G_p in Gn_median:
-dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p)
+dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p,
+cost=ged_cost, method=ged_method, saveGXL=saveGXL)
pi_forward_list[idx].append(pi_tmp_forward)
dis_sum += dis_tmp
dis_list.append(dis_sum)
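median_distance accumulates, for each candidate G, the sum of its GEDs to all median graphs (the SOD the set-median minimizes). A tiny sketch with a hypothetical ged_stub in place of the real GED call:

def ged_stub(a, b):
    return abs(a - b)     # hypothetical stand-in; the real code calls GED(...)

def sod(candidate, medians):
    # sum of distances (SOD) accumulated exactly like dis_sum above
    return sum(ged_stub(candidate, m) for m in medians)

medians = [1.0, 2.0, 4.0]
for cand in (1.0, 2.0, 3.0):
    print(cand, sod(cand, medians))   # the set-median minimizes this sum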
@@ -228,137 +662,13 @@ def test_iam_with_more_graphs_as_init(Gn, G_candidate, c_ei=3, c_er=3, c_es=1,
# list of edit operations.
pi_p_forward = pi_all_forward[idx_min]
pi_p_backward = pi_all_backward[idx_min]
-# phase 2: iteration.
-ds_attrs = get_dataset_attributes(Gn + [G], attr_names=['edge_labeled', 'node_attr_dim'],
-edge_label=edge_label)
-label_set = get_node_labels(Gn + [G], node_label)
-for itr in range(0, 10): # @todo: the convergence condition?
-G_new = G.copy()
-# update vertex labels.
-# pre-compute h_i0 for each label.
-# for label in get_node_labels(Gn, node_label):
-# print(label)
-# for nd in G.nodes(data=True):
-# pass
-if not ds_attrs['node_attr_dim']: # labels are symbolic
-for nd in G.nodes():
-h_i0_list = []
-label_list = []
-for label in label_set:
-h_i0 = 0
-for idx, g in enumerate(Gn):
-pi_i = pi_p_forward[idx][nd]
-if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
-h_i0 += 1
-h_i0_list.append(h_i0)
-label_list.append(label)
-# choose one of the best randomly.
-idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
-idx_rdm = random.randint(0, len(idx_max) - 1)
-G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]]
-else: # labels are non-symbolic
-for nd in G.nodes():
-Si_norm = 0
-phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
-for idx, g in enumerate(Gn):
-pi_i = pi_p_forward[idx][nd]
-if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0?
-Si_norm += 1
-phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
-phi_i_bar /= Si_norm
-G_new.nodes[nd]['attributes'] = phi_i_bar
-# update edge labels and adjacency matrix.
-if ds_attrs['edge_labeled']:
-for nd1, nd2, _ in G.edges(data=True):
-h_ij0_list = []
-label_list = []
-for label in get_edge_labels(Gn, edge_label):
-h_ij0 = 0
-for idx, g in enumerate(Gn):
-pi_i = pi_p_forward[idx][nd1]
-pi_j = pi_p_forward[idx][nd2]
-h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
-g.has_edge(pi_i, pi_j) and
-g.edges[pi_i, pi_j][edge_label] == label)
-h_ij0 += h_ij0_p
-h_ij0_list.append(h_ij0)
-label_list.append(label)
-# choose one of the best randomly.
-idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
-h_ij0_max = h_ij0_list[idx_max[0]]
-idx_rdm = random.randint(0, len(idx_max) - 1)
-best_label = label_list[idx_max[idx_rdm]]
-# check whether a_ij is 0 or 1.
-sij_norm = 0
-for idx, g in enumerate(Gn):
-pi_i = pi_p_forward[idx][nd1]
-pi_j = pi_p_forward[idx][nd2]
-if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
-sij_norm += 1
-if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
-if not G_new.has_edge(nd1, nd2):
-G_new.add_edge(nd1, nd2)
-G_new.edges[nd1, nd2][edge_label] = best_label
-else:
-if G_new.has_edge(nd1, nd2):
-G_new.remove_edge(nd1, nd2)
-else: # if edges are unlabeled
-# @todo: works only for undirected graphs.
-for nd1 in range(nx.number_of_nodes(G)):
-for nd2 in range(nd1 + 1, nx.number_of_nodes(G)):
-sij_norm = 0
-for idx, g in enumerate(Gn):
-pi_i = pi_p_forward[idx][nd1]
-pi_j = pi_p_forward[idx][nd2]
-if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
-sij_norm += 1
-if sij_norm > len(Gn) * c_er / (c_er + c_ei):
-if not G_new.has_edge(nd1, nd2):
-G_new.add_edge(nd1, nd2)
-elif sij_norm < len(Gn) * c_er / (c_er + c_ei):
-if G_new.has_edge(nd1, nd2):
-G_new.remove_edge(nd1, nd2)
-# do not change anything when equal.
-G = G_new.copy()
-# update pi_p
-pi_p_forward = []
-for G_p in Gn:
-dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p)
-pi_p_forward.append(pi_tmp_forward)
-return G

-def test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
-Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, node_label='atom',
-edge_label='bond_type', connected=False):
-"""See my name, then you know what I do.
-"""
-from tqdm import tqdm
-# Gn_median = Gn_median[0:10]
-# Gn_median = [nx.convert_node_labels_to_integers(g) for g in Gn_median]
-node_ir = np.inf # corresponding to the node remove and insertion.
-label_r = 'thanksdanny' # the label for node remove. # @todo: make this label unrepeatable.
-ds_attrs = get_dataset_attributes(Gn_median + Gn_candidate,
-attr_names=['edge_labeled', 'node_attr_dim', 'edge_attr_dim'],
+# phase 2: iteration.
+ds_attrs = get_dataset_attributes(Gn + [G], attr_names=['edge_labeled', 'node_attr_dim'],
edge_label=edge_label)
-ite_max = 50
-epsilon = 0.001

-def generate_graph(G, pi_p_forward, label_set):
-G_new_list = [G.copy()] # all "best" graphs generated in this iteration.
-# nx.draw_networkx(G)
-# import matplotlib.pyplot as plt
-# plt.show()
-# print(pi_p_forward)
+label_set = get_node_labels(Gn + [G], node_label)
+for itr in range(0, 10): # @todo: the convergence condition?
+G_new = G.copy()
# update vertex labels.
# pre-compute h_i0 for each label.
# for label in get_node_labels(Gn, node_label):
@@ -366,65 +676,41 @@ def test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
# for nd in G.nodes(data=True):
# pass
if not ds_attrs['node_attr_dim']: # labels are symbolic
-for ndi, (nd, _) in enumerate(G.nodes(data=True)):
+for nd in G.nodes():
h_i0_list = []
label_list = []
for label in label_set:
h_i0 = 0
-for idx, g in enumerate(Gn_median):
-pi_i = pi_p_forward[idx][ndi]
-if pi_i != node_ir and g.nodes[pi_i][node_label] == label:
+for idx, g in enumerate(Gn):
+pi_i = pi_p_forward[idx][nd]
+if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
h_i0 += 1
h_i0_list.append(h_i0)
label_list.append(label)
-# case when the node is to be removed.
-h_i0_remove = 0
-for idx, g in enumerate(Gn_median):
-pi_i = pi_p_forward[idx][ndi]
-if pi_i == node_ir:
-h_i0_remove += 1
-h_i0_list.append(h_i0_remove)
-label_list.append(label_r)
-# get the best labels.
+# choose one of the best randomly.
idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
-nlabel_best = [label_list[idx] for idx in idx_max]
-# generate "best" graphs with regard to "best" node labels.
-G_new_list_nd = []
-for g in G_new_list: # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now.
-for nl in nlabel_best:
-g_tmp = g.copy()
-if nl == label_r:
-g_tmp.remove_node(nd)
-else:
-g_tmp.nodes[nd][node_label] = nl
-G_new_list_nd.append(g_tmp)
-# nx.draw_networkx(g_tmp)
-# import matplotlib.pyplot as plt
-# plt.show()
-# print(g_tmp.nodes(data=True))
-# print(g_tmp.edges(data=True))
-G_new_list = G_new_list_nd[:]

+idx_rdm = random.randint(0, len(idx_max) - 1)
+G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]]
else: # labels are non-symbolic
-for ndi, (nd, _) in enumerate(G.nodes(data=True)):
+for nd in G.nodes():
Si_norm = 0
phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
-for idx, g in enumerate(Gn_median):
-pi_i = pi_p_forward[idx][ndi]
+for idx, g in enumerate(Gn):
+pi_i = pi_p_forward[idx][nd]
if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0?
Si_norm += 1
phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
phi_i_bar /= Si_norm
-G_new_list[0].nodes[nd]['attributes'] = phi_i_bar
+G_new.nodes[nd]['attributes'] = phi_i_bar
# update edge labels and adjacency matrix.
if ds_attrs['edge_labeled']:
for nd1, nd2, _ in G.edges(data=True):
h_ij0_list = []
label_list = []
-for label in get_edge_labels(Gn_median, edge_label):
+for label in get_edge_labels(Gn, edge_label):
h_ij0 = 0
-for idx, g in enumerate(Gn_median):
+for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd1]
pi_j = pi_p_forward[idx][nd2]
h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
@@ -441,12 +727,12 @@ def test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
# check whether a_ij is 0 or 1.
sij_norm = 0
-for idx, g in enumerate(Gn_median):
+for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd1]
pi_j = pi_p_forward[idx][nd2]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
-if h_ij0_max > len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
+if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
if not G_new.has_edge(nd1, nd2):
G_new.add_edge(nd1, nd2)
G_new.edges[nd1, nd2][edge_label] = best_label
@@ -455,197 +741,36 @@ def test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
G_new.remove_edge(nd1, nd2) G_new.remove_edge(nd1, nd2)
else: # if edges are unlabeled else: # if edges are unlabeled
# @todo: works only for undirected graphs. # @todo: works only for undirected graphs.
nd_list = [n for n in G.nodes()]
for g_tmp in G_new_list:
for nd1i in range(nx.number_of_nodes(G)):
nd1 = nd_list[nd1i]
for nd2i in range(nd1i + 1, nx.number_of_nodes(G)):
nd2 = nd_list[nd2i]
sij_norm = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][nd1i]
pi_j = pi_p_forward[idx][nd2i]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if sij_norm > len(Gn_median) * c_er / (c_er + c_ei):
# @todo: should we consider if nd1 and nd2 in g_tmp?
# or just add the edge anyway?
if g_tmp.has_node(nd1) and g_tmp.has_node(nd2) \
and not g_tmp.has_edge(nd1, nd2):
g_tmp.add_edge(nd1, nd2)
elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei):
if g_tmp.has_edge(nd1, nd2):
g_tmp.remove_edge(nd1, nd2)
# do not change anything when equal.
# # find the best graph generated in this iteration and update pi_p.
# @todo: should we update all graphs generated or just the best ones?
dis_list, pi_forward_list = median_distance(G_new_list, Gn_median)
# @todo: should we remove the identical and connectivity check?
# Don't know which is faster.
if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
G_new_list, idx_list = remove_duplicates(G_new_list)
pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
dis_list = [dis_list[idx] for idx in idx_list]
# if connected == True:
# G_new_list, idx_list = remove_disconnected(G_new_list)
# pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
# idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
# dis_min = dis_list[idx_min_tmp_list[0]]
# pi_forward_list = [pi_forward_list[idx] for idx in idx_min_list]
# G_new_list = [G_new_list[idx] for idx in idx_min_list]
# for g in G_new_list:
# import matplotlib.pyplot as plt
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
return G_new_list, pi_forward_list, dis_list
def best_median_graphs(Gn_candidate, pi_all_forward, dis_all):
idx_min_list = np.argwhere(dis_all == np.min(dis_all)).flatten().tolist()
dis_min = dis_all[idx_min_list[0]]
pi_forward_min_list = [pi_all_forward[idx] for idx in idx_min_list]
G_min_list = [Gn_candidate[idx] for idx in idx_min_list]
return G_min_list, pi_forward_min_list, dis_min
def iteration_proc(G, pi_p_forward, cur_sod):
G_list = [G]
pi_forward_list = [pi_p_forward]
old_sod = cur_sod * 2
sod_list = [cur_sod]
# iterations.
itr = 0
while itr < ite_max and np.abs(old_sod - cur_sod) > epsilon:
# for itr in range(0, 5): # the convergence condition?
print('itr is', itr)
G_new_list = []
pi_forward_new_list = []
dis_new_list = []
for idx, G in enumerate(G_list):
label_set = get_node_labels(Gn_median + [G], node_label)
G_tmp_list, pi_forward_tmp_list, dis_tmp_list = generate_graph(
G, pi_forward_list[idx], label_set)
G_new_list += G_tmp_list
pi_forward_new_list += pi_forward_tmp_list
dis_new_list += dis_tmp_list
G_list = G_new_list[:]
pi_forward_list = pi_forward_new_list[:]
dis_list = dis_new_list[:]
old_sod = cur_sod
cur_sod = np.min(dis_list)
sod_list.append(cur_sod)
itr += 1
# @todo: do we return all graphs or the best ones?
# get the best ones of the generated graphs.
G_list, pi_forward_list, dis_min = best_median_graphs(
G_list, pi_forward_list, dis_list)
for nd1 in range(nx.number_of_nodes(G)):
for nd2 in range(nd1 + 1, nx.number_of_nodes(G)):
sij_norm = 0
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd1]
pi_j = pi_p_forward[idx][nd2]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if sij_norm > len(Gn) * c_er / (c_er + c_ei):
if not G_new.has_edge(nd1, nd2):
G_new.add_edge(nd1, nd2)
elif sij_norm < len(Gn) * c_er / (c_er + c_ei):
if G_new.has_edge(nd1, nd2):
G_new.remove_edge(nd1, nd2)
# do not change anything when equal.
G = G_new.copy()
if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
G_list, idx_list = remove_duplicates(G_list)
pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
# dis_list = [dis_list[idx] for idx in idx_list]
# import matplotlib.pyplot as plt
# for g in G_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
print('\nsods:', sod_list, '\n')
return G_list, pi_forward_list, dis_min
def remove_duplicates(Gn):
"""Remove duplicate graphs from list.
"""
Gn_new = []
idx_list = []
for idx, g in enumerate(Gn):
dupl = False
for g_new in Gn_new:
if graph_isIdentical(g_new, g):
dupl = True
break
if not dupl:
Gn_new.append(g)
idx_list.append(idx)
return Gn_new, idx_list
# update pi_p
pi_p_forward = []
for G_p in Gn:
dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p)
pi_p_forward.append(pi_tmp_forward)
def remove_disconnected(Gn):
"""Remove disconnected graphs from list.
"""
Gn_new = []
idx_list = []
for idx, g in enumerate(Gn):
if nx.is_connected(g):
Gn_new.append(g)
idx_list.append(idx)
return Gn_new, idx_list
return G


###############################################################################


# phase 1: initilize.
# compute set-median.
dis_min = np.inf
dis_list, pi_forward_all = median_distance(Gn_candidate, Gn_median)
# find all smallest distances.
idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
dis_min = dis_list[idx_min_list[0]]
# phase 2: iteration.
G_list = []
dis_list = []
pi_forward_list = []
for idx_min in idx_min_list:
# print('idx_min is', idx_min)
G = Gn_candidate[idx_min].copy()
# list of edit operations.
pi_p_forward = pi_forward_all[idx_min]
# pi_p_backward = pi_all_backward[idx_min]
Gi_list, pi_i_forward_list, dis_i_min = iteration_proc(G, pi_p_forward, dis_min)
G_list += Gi_list
dis_list.append(dis_i_min)
pi_forward_list += pi_i_forward_list
if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
G_list, idx_list = remove_duplicates(G_list)
dis_list = [dis_list[idx] for idx in idx_list]
pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
if connected == True:
G_list_con, idx_list = remove_disconnected(G_list)
# if there is no connected graphs at all, then remain the disconnected ones.
if len(G_list_con) > 0: # @todo: ??????????????????????????
G_list = G_list_con
dis_list = [dis_list[idx] for idx in idx_list]
pi_forward_list = [pi_forward_list[idx] for idx in idx_list]


# import matplotlib.pyplot as plt
# for g in G_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# get the best median graphs
# dis_list, pi_forward_list = median_distance(G_list, Gn_median)
G_min_list, pi_forward_min_list, dis_min = best_median_graphs(
G_list, pi_forward_list, dis_list)
# for g in G_min_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
return G_min_list, dis_min
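# Phase 1 above is a set-median computation: each candidate's sum of distances
# (SOD) to the median set is evaluated, and every candidate attaining the
# minimum is kept as a starting point for phase 2. A rough sketch of that
# selection, assuming dis_list holds the SOD values returned by a
# median_distance-like routine:
#
# import numpy as np
#
# def select_set_medians(Gn_candidate, dis_list):
#     # keep every candidate whose SOD ties the minimum
#     idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
#     return ([Gn_candidate[i].copy() for i in idx_min_list],
#             dis_list[idx_min_list[0]])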




if __name__ == '__main__':


+ 29
- 29
preimage/median.py View File

@@ -5,10 +5,10 @@ import numpy as np
import networkx as nx
import time
#import librariesImport
#import script
#sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/")
#import pygraph
import librariesImport
import script
sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/")
import pygraph
from pygraph.utils.graphfiles import loadDataset
def replace_graph_in_env(script, graph, old_id, label='median'):
@@ -191,28 +191,28 @@ def compute_median_set(script,listID):
return median_set_index, sod
#if __name__ == "__main__":
# # Load the dataset
# script.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml')
# script.PySetEditCost("LETTER")
# script.PyInitEnv()
# script.PySetMethod("IPFP", "")
# script.PyInitMethod()
#
# dataset,my_y = pygraph.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl")
#
# listID = script.PyGetAllGraphIds()
# median, sod = compute_median(script,listID,dataset,verbose=True)
#
# print(sod)
# draw_Letter_graph(median)
if __name__ == '__main__':
# test draw_Letter_graph
ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
'extra_params': {}} # node nsymb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
print(y_all)
for g in Gn:
draw_Letter_graph(g)
if __name__ == "__main__":
# Load the dataset
script.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml')
script.PySetEditCost("LETTER")
script.PyInitEnv()
script.PySetMethod("IPFP", "")
script.PyInitMethod()
dataset,my_y = pygraph.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl")
listID = script.PyGetAllGraphIds()
median, sod = compute_median(script,listID,dataset,verbose=True)
print(sod)
draw_Letter_graph(median)
#if __name__ == '__main__':
# # test draw_Letter_graph
# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
# 'extra_params': {}} # node nsymb
# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# print(y_all)
# for g in Gn:
# draw_Letter_graph(g)

+ 152
- 17
preimage/preimage.py View File

@@ -25,14 +25,16 @@ import functools
from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct
from pygraph.kernels.structuralspKernel import structuralspkernel


from gk_iam import dis_gstar



def compute_kernel(Gn, graph_kernel, verbose):
if graph_kernel == 'marginalizedkernel':
Kmatrix, _ = marginalizedkernel(Gn, node_label='atom', edge_label=None,
p_quit=0.03, n_iteration=20, remove_totters=False,
p_quit=0.03, n_iteration=10, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'untilhpathkernel':
Kmatrix, _ = untilhpathkernel(Gn, node_label='atom', edge_label='bond_type',
Kmatrix, _ = untilhpathkernel(Gn, node_label='atom', edge_label=None,
depth=10, k_func='MinMax', compute_method='trie',
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'spkernel':
@@ -47,34 +49,167 @@ def compute_kernel(Gn, graph_kernel, verbose):
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
# normalization
# Kmatrix_diag = Kmatrix.diagonal().copy()
# for i in range(len(Kmatrix)):
# for j in range(i, len(Kmatrix)):
# Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
# Kmatrix[j][i] = Kmatrix[i][j]
Kmatrix_diag = Kmatrix.diagonal().copy()
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
Kmatrix[j][i] = Kmatrix[i][j]
return Kmatrix
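# The normalization activated above is the usual cosine normalization of a
# Gram matrix, K'(i, j) = K(i, j) / sqrt(K(i, i) * K(j, j)), which puts every
# self-kernel at 1. An equivalent vectorized sketch:
#
# import numpy as np
#
# def normalize_gram(Kmatrix):
#     d = np.sqrt(Kmatrix.diagonal().copy())
#     return Kmatrix / np.outer(d, d)  # K'[i, j] = K[i, j] / (d[i] * d[j])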




def random_preimage(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max, l, gkernel):
Gn_init = [nx.convert_node_labels_to_integers(g) for g in Gn_init]
# compute k nearest neighbors of phi in DN.
dis_list = [] # distance between g_star and each graph.
term3 = 0
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout):
dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
dis_list.append(dtemp)
# print(np.max(dis_list))
# print(np.min(dis_list))
# print(np.min([item for item in dis_list if item != 0]))
# print(np.mean(dis_list))
# sort
sort_idx = np.argsort(dis_list)
dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] # the k shortest distances
nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist())
g0hat_list = [Gn_init[idx] for idx in sort_idx[0:nb_best]] # the nearest neighbors of phi in DN
if dis_gs[0] == 0: # the exact pre-image.
print('The exact pre-image is found from the input dataset.')
return 0, g0hat_list[0], 0
dhat = dis_gs[0] # the nearest distance
# ghat_list = [g.copy() for g in g0hat_list]
# for g in ghat_list:
# draw_Letter_graph(g)
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
Gk = [Gn_init[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors
# for gi in Gk:
## nx.draw_networkx(gi)
## plt.show()
# draw_Letter_graph(g)
# print(gi.nodes(data=True))
# print(gi.edges(data=True))
Gs_nearest = [g.copy() for g in Gk]
gihat_list = []
dihat_list = []
# i = 1
r = 0
# sod_list = [dhat]
# found = False
nb_updated = 0
g_best = []
while r < r_max:
print('\nr =', r)
print('itr for gk =', nb_updated, '\n')
found = False
dis_bests = dis_gs + dihat_list
# @todo what if the log is negative? how to choose alpha (scalar)?
fdgs_list = np.array(dis_bests)
if np.min(fdgs_list) < 1:
fdgs_list /= np.min(dis_bests)
fdgs_list = [int(item) for item in np.ceil(np.log(fdgs_list))]
if np.min(fdgs_list) < 1:
fdgs_list = np.array(fdgs_list) + 1
for ig, gs in enumerate(Gs_nearest + gihat_list):
# nx.draw_networkx(gs)
# plt.show()
for trial in range(0, l):
# for trial in tqdm(range(0, l), desc='l loops', file=sys.stdout):
# add and delete edges.
gtemp = gs.copy()
np.random.seed()
# which edges to change.
# @todo: should we use just half of the adjacency matrix for undirected graphs?
nb_vpairs = nx.number_of_nodes(gs) * (nx.number_of_nodes(gs) - 1)
# @todo: what if fdgs is bigger than nb_vpairs?
idx_change = random.sample(range(nb_vpairs), fdgs_list[ig] if
fdgs_list[ig] < nb_vpairs else nb_vpairs)
# idx_change = np.random.randint(0, nx.number_of_nodes(gs) *
# (nx.number_of_nodes(gs) - 1), fdgs)
for item in idx_change:
node1 = int(item / (nx.number_of_nodes(gs) - 1))
node2 = (item - node1 * (nx.number_of_nodes(gs) - 1))
if node2 >= node1: # skip the self pair.
node2 += 1
# @todo: is the randomness correct?
if not gtemp.has_edge(node1, node2):
gtemp.add_edge(node1, node2)
# nx.draw_networkx(gs)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
else:
gtemp.remove_edge(node1, node2)
# nx.draw_networkx(gs)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
# compute distance between \psi and the new generated graph.
# knew = marginalizedkernel([gtemp, g1, g2], node_label='atom', edge_label=None,
# p_quit=lmbda, n_iteration=20, remove_totters=False,
# n_jobs=multiprocessing.cpu_count(), verbose=False)
knew = compute_kernel([gtemp] + Gn_median, gkernel, verbose=False)
dnew = dis_gstar(0, [1, 2], alpha, knew, withterm3=False)
if dnew <= dhat: # @todo: the new distance is smaller or also equal?
if dnew < dhat:
print('\nI am smaller!')
print('ig =', str(ig), ', l =', str(trial))
print(dhat, '->', dnew)
nb_updated += 1
elif dnew == dhat:
print('I am equal!')
# nx.draw_networkx(gtemp)
# plt.show()
# print(gtemp.nodes(data=True))
# print(gtemp.edges(data=True))
dhat = dnew
gnew = gtemp.copy()
found = True # found better graph.
if found:
r = 0
gihat_list = [gnew]
dihat_list = [dhat]
else:
r += 1
# dis_best.append(dhat)
g_best = (g0hat_list[0] if len(gihat_list) == 0 else gihat_list[0])
return dhat, g_best, nb_updated
# return 0, 0, 0
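# The index arithmetic in the trial loop enumerates the
# nb_vpairs = n * (n - 1) ordered node pairs without self-loops:
# node1 = item // (n - 1), and the remainder is shifted by one whenever it
# would collide with node1. A small self-contained check of that decoding:
#
# def decode_vertex_pair(item, n):
#     # map an index in [0, n * (n - 1)) to an ordered pair of distinct nodes
#     node1 = item // (n - 1)
#     node2 = item % (n - 1)
#     if node2 >= node1:  # skip the self pair
#         node2 += 1
#     return node1, node2
#
# # for n = 3, the six indices decode to all ordered pairs of distinct nodes:
# assert [decode_vertex_pair(i, 3) for i in range(6)] == \
#     [(0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]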


if __name__ == '__main__':
# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
# 'extra_params': {}} # node/edge symb
# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
# 'extra_params': {}} # node nsymb
# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
# 'extra_params': {}}
ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
'extra_params': {}} # node symb
# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
# 'extra_params': {}} # node/edge symb
ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
'extra_params': {}} # node nsymb
# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
# 'extra_params': {}}
# ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
# 'extra_params': {}} # node symb
DN, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
#DN = DN[0:10]
lmbda = 0.03 # termination probability
r_max = 10 # recursions
r_max = 3 # 10 # iteration limit.
l = 500
alpha_range = np.linspace(0.5, 0.5, 1)
#alpha_range = np.linspace(0.1, 0.9, 9)
k = 5 # k nearest neighbors
k = 10 # 5 # k nearest neighbors
# randomly select two molecules
#np.random.seed(1)


+ 298
- 17
preimage/run_gk_iam.py View File

@@ -245,6 +245,9 @@ def test_remove_bests(Gn, gkernel):
print(g.edges(data=True))
###############################################################################
# Tests on dataset Letter-H.
def test_gkiam_letter_h():
from gk_iam import gk_iam_nearest_multi, compute_kernel
from iam import median_distance
@@ -263,8 +266,10 @@ def test_gkiam_letter_h():
# classify graphs according to letters.
idx_dict = get_same_item_indices(y_all)
time_list = []
sod_list = []
sod_min_list = []
sod_ks_min_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list = []
for letter in idx_dict:
print('\n-------------------------------------------------------\n')
Gn_let = [Gn[i].copy() for i in idx_dict[letter]]
@@ -280,10 +285,10 @@ def test_gkiam_letter_h():
# for each alpha
for alpha in alpha_range:
print('alpha =', alpha)
dhat, ghat_list = gk_iam_nearest_multi(Gn_let, Gn_let, [alpha] * len(Gn_let),
range(len(Gn_let), len(Gn_mix)), km,
k, r_max, gkernel, c_ei=1.7,
c_er=1.7, c_es=1.7)
dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn_let,
Gn_let, [alpha] * len(Gn_let), range(len(Gn_let), len(Gn_mix)),
km, k, r_max, gkernel, c_ei=1.7, c_er=1.7, c_es=1.7,
ged_cost='LETTER', ged_method='IPFP', saveGXL='gedlib-letter')
dis_best.append(dhat)
g_best.append(ghat_list)
time_list.append(time.time() - time0)
@@ -300,13 +305,18 @@ def test_gkiam_letter_h():
print(g.edges(data=True))
# compute the corresponding sod in graph space. (alpha range not considered.)
sod_tmp, _ = median_distance(g_best[0], Gn_let)
sod_list.append(sod_tmp)
sod_min_list.append(np.min(sod_tmp))
sod_tmp, _ = median_distance(g_best[0], Gn_let, ged_cost='LETTER',
ged_method='IPFP', saveGXL='gedlib-letter')
sod_gs_list.append(sod_tmp)
sod_gs_min_list.append(np.min(sod_tmp))
sod_ks_min_list.append(sod_ks)
nb_updated_list.append(nb_updated)
print('\nsods in graph space: ', sod_list)
print('\nsmallest sod in graph space for each letter: ', sod_min_list)
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each letter: ', sod_gs_min_list)
print('\nsmallest sod in kernel space for each letter: ', sod_ks_min_list)
print('\nnumber of updates for each letter: ', nb_updated_list)
print('\ntimes:', time_list)
@@ -356,7 +366,8 @@ def test_iam_letter_h():
for alpha in alpha_range:
print('alpha =', alpha)
ghat_list, dhat = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
Gn_let, Gn_let, c_ei=1.7, c_er=1.7, c_es=1.7)
Gn_let, Gn_let, c_ei=1.7, c_er=1.7, c_es=1.7,
ged_cost='LETTER', ged_method='IPFP', saveGXL='gedlib-letter')
dis_best.append(dhat)
g_best.append(ghat_list)
time_list.append(time.time() - time0)
@@ -388,18 +399,283 @@ def test_iam_letter_h():
print('\nsods in kernel space: ', sod_list)
print('\nsmallest sod in kernel space for each letter: ', sod_min_list)
print('\ntimes:', time_list)
def test_random_preimage_letter_h():
from preimage import random_preimage, compute_kernel
ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
'extra_params': {}} # node nsymb
# ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt',
# 'extra_params': {}} # node nsymb
# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
# 'extra_params': {}} # node/edge symb
# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
# 'extra_params': {}}
# ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
# 'extra_params': {}} # node symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
gkernel = 'structuralspkernel'
# lmbda = 0.03 # termination probability
r_max = 3 # 10 # recursions
l = 500
# alpha_range = np.linspace(0.5, 0.5, 1)
#alpha_range = np.linspace(0.1, 0.9, 9)
k = 10 # 5 # k nearest neighbors
# classify graphs according to letters.
idx_dict = get_same_item_indices(y_all)
time_list = []
sod_list = []
sod_min_list = []
for letter in idx_dict:
print('\n-------------------------------------------------------\n')
Gn_let = [Gn[i].copy() for i in idx_dict[letter]]
Gn_mix = Gn_let + [g.copy() for g in Gn_let]
alpha_range = np.linspace(1 / len(Gn_let), 1 / len(Gn_let), 1)
# compute
time0 = time.time()
km = compute_kernel(Gn_mix, gkernel, True)
g_best = []
dis_best = []
# for each alpha
for alpha in alpha_range:
print('alpha =', alpha)
dhat, ghat, nb_updated = random_preimage(Gn_let, Gn_let,
[alpha] * len(Gn_let), range(len(Gn_let), len(Gn_mix)),
km, k, r_max, l, gkernel)
dis_best.append(dhat)
g_best.append([ghat])
time_list.append(time.time() - time0)
# show best graphs and save them to file.
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_best[idx])
print('the corresponding pre-images are')
for g in g_best[idx]:
draw_Letter_graph(g, savepath='results/gk_iam/')
# nx.draw_networkx(g)
# plt.show()
print(g.nodes(data=True))
print(g.edges(data=True))
# compute the corresponding sod in graph space. (alpha range not considered.)
sod_tmp, _ = median_distance(g_best[0], Gn_let)
sod_list.append(sod_tmp)
sod_min_list.append(np.min(sod_tmp))
print('\nsods in graph space: ', sod_list)
print('\nsmallest sod in graph space for each letter: ', sod_min_list)
print('\ntimes:', time_list)

def test_gkiam_mutag():
from gk_iam import gk_iam_nearest_multi, compute_kernel
from iam import median_distance
ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
'extra_params': {}} # node nsymb
# ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt',
# 'extra_params': {}} # node nsymb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
gkernel = 'structuralspkernel'
lmbda = 0.03 # termination probability
r_max = 3 # recursions
# alpha_range = np.linspace(0.5, 0.5, 1)
k = 20 # k nearest neighbors
# classify graphs according to letters.
idx_dict = get_same_item_indices(y_all)
time_list = []
sod_ks_min_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list = []
for letter in idx_dict:
print('\n-------------------------------------------------------\n')
Gn_let = [Gn[i].copy() for i in idx_dict[letter]]
Gn_mix = Gn_let + [g.copy() for g in Gn_let]
alpha_range = np.linspace(1 / len(Gn_let), 1 / len(Gn_let), 1)
# compute
time0 = time.time()
km = compute_kernel(Gn_mix, gkernel, True)
g_best = []
dis_best = []
# for each alpha
for alpha in alpha_range:
print('alpha =', alpha)
dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn_let, Gn_let, [alpha] * len(Gn_let),
range(len(Gn_let), len(Gn_mix)), km,
k, r_max, gkernel, c_ei=1.7,
c_er=1.7, c_es=1.7)
dis_best.append(dhat)
g_best.append(ghat_list)
time_list.append(time.time() - time0)
# show best graphs and save them to file.
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_best[idx])
print('the corresponding pre-images are')
for g in g_best[idx]:
draw_Letter_graph(g, savepath='results/gk_iam/')
# nx.draw_networkx(g)
# plt.show()
print(g.nodes(data=True))
print(g.edges(data=True))
# compute the corresponding sod in graph space. (alpha range not considered.)
sod_tmp, _ = median_distance(g_best[0], Gn_let)
sod_gs_list.append(sod_tmp)
sod_gs_min_list.append(np.min(sod_tmp))
sod_ks_min_list.append(sod_ks)
nb_updated_list.append(nb_updated)
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each letter: ', sod_gs_min_list)
print('\nsmallest sod in kernel space for each letter: ', sod_ks_min_list)
print('\nnumber of updates for each letter: ', nb_updated_list)
print('\ntimes:', time_list)
###############################################################################
# Re-test.
def retest_the_simple_two():
from gk_iam import gk_iam_nearest_multi, compute_kernel
from iam import median_distance
from test_random_mutag import remove_edges
# The two simple graphs.
# g1 = nx.Graph(name='haha')
# g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})])
# g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'})])
# g2 = nx.Graph(name='hahaha')
# g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'}),
# (3, {'atom': 'O'}), (4, {'atom': 'C'})])
# g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}),
# (2, 3, {'bond_type': '1'}), (3, 4, {'bond_type': '1'})])
g1 = nx.Graph(name='haha')
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'C'}),
(3, {'atom': 'S'}), (4, {'atom': 'S'})])
g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}),
(2, 3, {'bond_type': '1'}), (2, 4, {'bond_type': '1'})])
g2 = nx.Graph(name='hahaha')
g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'C'}),
(3, {'atom': 'O'}), (4, {'atom': 'O'})])
g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}),
(2, 3, {'bond_type': '1'}), (2, 4, {'bond_type': '1'})])
# # randomly select two molecules
# np.random.seed(1)
# idx_gi = [0, 6] # np.random.randint(0, len(Gn), 2)
# g1 = Gn[idx_gi[0]]
# g2 = Gn[idx_gi[1]]
# Gn_mix = [g.copy() for g in Gn]
# Gn_mix.append(g1.copy())
# Gn_mix.append(g2.copy())
Gn = [g1.copy(), g2.copy()]
remove_edges(Gn)
gkernel = 'marginalizedkernel'
lmbda = 0.03 # termination probability
r_max = 10 # recursions
# l = 500
alpha_range = np.linspace(0.5, 0.5, 1)
k = 2 # k nearest neighbors
epsilon = 1e-6
ged_cost='CHEM_1'
ged_method='IPFP'
saveGXL='gedlib'
c_ei=1
c_er=1
c_es=1
Gn_mix = Gn + [g1.copy(), g2.copy()]
# compute
time0 = time.time()
km = compute_kernel(Gn_mix, gkernel, True)
time_km = time.time() - time0

time_list = []
sod_ks_min_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list = []
g_best = []
# for each alpha
for alpha in alpha_range:
print('\n-------------------------------------------------------\n')
print('alpha =', alpha)
time0 = time.time()
dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn, [g1, g2],
[alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max,
gkernel, c_ei=c_ei, c_er=c_er, c_es=c_es, epsilon=epsilon,
ged_cost=ged_cost, ged_method=ged_method, saveGXL=saveGXL)
time_total = time.time() - time0 + time_km
print('time: ', time_total)
time_list.append(time_total)
sod_ks_min_list.append(dhat)
g_best.append(ghat_list)
nb_updated_list.append(nb_updated)
# show best graphs and save them to file.
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', sod_ks_min_list[idx])
print('one of the possible corresponding pre-images is')
nx.draw(g_best[idx][0], labels=nx.get_node_attributes(g_best[idx][0], 'atom'),
with_labels=True)
plt.savefig('results/gk_iam/mutag_alpha' + str(item) + '.png', format="PNG")
plt.show()
print(g_best[idx][0].nodes(data=True))
print(g_best[idx][0].edges(data=True))
# for g in g_best[idx]:
# draw_Letter_graph(g, savepath='results/gk_iam/')
## nx.draw_networkx(g)
## plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# compute the corresponding sod in graph space.
for idx, item in enumerate(alpha_range):
sod_tmp, _ = median_distance(g_best[0], [g1, g2], ged_cost=ged_cost,
ged_method=ged_method, saveGXL=saveGXL)
sod_gs_list.append(sod_tmp)
sod_gs_min_list.append(np.min(sod_tmp))
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list)
print('\nsmallest sod in kernel space for each alpha: ', sod_ks_min_list)
print('\nnumber of updates for each alpha: ', nb_updated_list)
print('\ntimes:', time_list)


if __name__ == '__main__':
# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
# 'extra_params': {}} # node/edge symb
ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
'extra_params': {}} # node nsymb
# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
# 'extra_params': {}} # node nsymb
# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
# 'extra_params': {}}
# ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
# 'extra_params': {}} # node symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:20]
# import networkx.algorithms.isomorphism as iso
@@ -419,5 +695,10 @@ if __name__ == '__main__':
# test_the_simple_two(Gn, 'untilhpathkernel')
# test_remove_bests(Gn, 'untilhpathkernel')
test_gkiam_letter_h()
# test_iam_letter_h()
# test_gkiam_letter_h()
# test_iam_letter_h()
# test_random_preimage_letter_h
###############################################################################
# retests.
retest_the_simple_two()

+ 11
- 11
preimage/test.py View File

@@ -18,17 +18,17 @@ def test() :
script.PyRestartEnv()
# print("Here is the Python function !")
#
# print("List of Edit Cost Options : ")
# for i in script.listOfEditCostOptions :
# print (i)
# print("")
#
# print("List of Method Options : ")
# for j in script.listOfMethodOptions :
# print (j)
# print("")
print("Here is the Python function !")
print("List of Edit Cost Options : ")
for i in script.listOfEditCostOptions :
print (i)
print("")
print("List of Method Options : ")
for j in script.listOfMethodOptions :
print (j)
print("")
script.PyLoadGXLGraph('include/gedlib-master/data/datasets/Mutagenicity/data/', 'collections/MUTA_10.xml')
listID = script.PyGetGraphIds()


+ 599
- 0
preimage/test_random_mutag.py View File

@@ -0,0 +1,599 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 5 15:59:00 2019

@author: ljia
"""

import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import time
from tqdm import tqdm

import os
import sys
sys.path.insert(0, "../")
from pygraph.utils.graphfiles import loadDataset

###############################################################################
# test on the combination of two randomly chosen graphs. (the same setting as
# in the random pre-image paper.)

def test_preimage_mix_2combination_all_pairs():
from gk_iam import preimage_iam_random_mix, compute_kernel
from iam import median_distance
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:50]
remove_edges(Gn)
gkernel = 'marginalizedkernel'
lmbda = 0.03 # termination probability
r_max = 10 # iteration limit for pre-image.
l_max = 500 # update limit for random generation
alpha_range = np.linspace(0.7, 1, 4)
k = 5 # k nearest neighbors
epsilon = 1e-6
# parameters for GED function
ged_cost='CHEM_1'
ged_method='IPFP'
saveGXL='gedlib'
# parameters for IAM function
c_ei=1
c_er=1
c_es=1
ite_max_iam = 50
epsilon_iam = 0.001
removeNodes = True
connected_iam = False
nb_update_mat_iam = np.full((len(Gn), len(Gn)), np.inf)
nb_update_mat_random = np.full((len(Gn), len(Gn)), np.inf)
# test on each pair of graphs.
# for idx1 in range(len(Gn) - 1, -1, -1):
# for idx2 in range(idx1, -1, -1):
for idx1 in range(187, 188):
for idx2 in range(167, 168):
g1 = Gn[idx1].copy()
g2 = Gn[idx2].copy()
# Gn[10] = []
# Gn[10] = []
nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True)
plt.savefig("results/preimage_mix/mutag187.png", format="PNG")
plt.show()
plt.clf()
nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True)
plt.savefig("results/preimage_mix/mutag167.png", format="PNG")
plt.show()
plt.clf()

###################################################################
# Gn_mix = [g.copy() for g in Gn]
# Gn_mix.append(g1.copy())
# Gn_mix.append(g2.copy())
#
# # compute
# time0 = time.time()
# km = compute_kernel(Gn_mix, gkernel, True)
# time_km = time.time() - time0
#
# # write Gram matrix to file and read it.
# np.savez('results/gram_matrix_uhpath_itr7_pq0.8.gm', gm=km, gmtime=time_km)
###################################################################
gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03.gm.npz')
km = gmfile['gm']
time_km = gmfile['gmtime']
# modify mixed gram matrix.
for i in range(len(Gn)):
km[i, len(Gn)] = km[i, idx1]
km[i, len(Gn) + 1] = km[i, idx2]
km[len(Gn), i] = km[i, idx1]
km[len(Gn) + 1, i] = km[i, idx2]
km[len(Gn), len(Gn)] = km[idx1, idx1]
km[len(Gn), len(Gn) + 1] = km[idx1, idx2]
km[len(Gn) + 1, len(Gn)] = km[idx2, idx1]
km[len(Gn) + 1, len(Gn) + 1] = km[idx2, idx2]
###################################################################
# # use only the two graphs in median set as candidates.
# Gn = [g1.copy(), g2.copy()]
# Gn_mix = Gn + [g1.copy(), g2.copy()]
# # compute
# time0 = time.time()
# km = compute_kernel(Gn_mix, gkernel, True)
# time_km = time.time() - time0
time_list = []
dis_ks_min_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list_iam = []
nb_updated_list_random = []
g_best = []
# for each alpha
for alpha in alpha_range:
print('\n-------------------------------------------------------\n')
print('alpha =', alpha)
time0 = time.time()
dhat, ghat_list, sod_ks, nb_updated_iam, nb_updated_random = \
preimage_iam_random_mix(Gn, [g1, g2],
[alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max,
l_max, gkernel, epsilon=epsilon,
params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es,
'ite_max': ite_max_iam, 'epsilon': epsilon_iam,
'removeNodes': removeNodes, 'connected': connected_iam},
params_ged={'ged_cost': ged_cost, 'ged_method': ged_method,
'saveGXL': saveGXL})
time_total = time.time() - time0 + time_km
print('time: ', time_total)
time_list.append(time_total)
dis_ks_min_list.append(dhat)
g_best.append(ghat_list)
nb_updated_list_iam.append(nb_updated_iam)
nb_updated_list_random.append(nb_updated_random)
# show best graphs and save them to file.
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx])
print('one of the possible corresponding pre-images is')
nx.draw(g_best[idx][0], labels=nx.get_node_attributes(g_best[idx][0], 'atom'),
with_labels=True)
plt.savefig('results/preimage_mix/mutag' + str(idx1) + '_' + str(idx2)
+ '_alpha' + str(item) + '.png', format="PNG")
# plt.show()
plt.clf()
# print(g_best[idx][0].nodes(data=True))
# print(g_best[idx][0].edges(data=True))
# for g in g_best[idx]:
# draw_Letter_graph(g, savepath='results/gk_iam/')
## nx.draw_networkx(g)
## plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# compute the corresponding sod in graph space.
for idx, item in enumerate(alpha_range):
sod_tmp, _ = median_distance(g_best[0], [g1, g2], ged_cost=ged_cost,
ged_method=ged_method, saveGXL=saveGXL)
sod_gs_list.append(sod_tmp)
sod_gs_min_list.append(np.min(sod_tmp))
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list)
print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list)
print('\nnumber of updates for each alpha by IAM: ', nb_updated_list_iam)
print('\nnumber of updates for each alpha by random generation: ',
nb_updated_list_random)
print('\ntimes:', time_list)
nb_update_mat_iam[idx1, idx2] = nb_updated_list_iam[0]
nb_update_mat_random[idx1, idx2] = nb_updated_list_random[0]
str_fw = 'graphs %d and %d: %d times by IAM, %d times by random generation.\n' \
% (idx1, idx2, nb_updated_list_iam[0], nb_updated_list_random[0])
with open('results/preimage_mix/nb_updates.txt', 'r+') as file:
content = file.read()
file.seek(0, 0)
file.write(str_fw + content)
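# The "modify mixed gram matrix" block above avoids recomputing the kernel for
# every pair: since g1 and g2 are copies of Gn[idx1] and Gn[idx2], the last two
# rows and columns of the precomputed (len(Gn) + 2)-sized Gram matrix can simply
# be overwritten with the matching entries. A sketch of the same idea as a
# helper (hypothetical name):
#
# def patch_gram(km, n, idx1, idx2):
#     # make the two appended rows/columns stand for Gn[idx1] and Gn[idx2]
#     for i in range(n):
#         km[i, n] = km[n, i] = km[i, idx1]
#         km[i, n + 1] = km[n + 1, i] = km[i, idx2]
#     km[n, n] = km[idx1, idx1]
#     km[n, n + 1] = km[n + 1, n] = km[idx1, idx2]
#     km[n + 1, n + 1] = km[idx2, idx2]
#     return km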

def test_gkiam_2combination_all_pairs():
from gk_iam import gk_iam_nearest_multi, compute_kernel
from iam import median_distance
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:50]
remove_edges(Gn)
gkernel = 'marginalizedkernel'
lmbda = 0.03 # termination probability
r_max = 10 # iteration limit for pre-image.
alpha_range = np.linspace(1, 1, 1)
k = 5 # k nearest neighbors
epsilon = 1e-6
# parameters for GED function
ged_cost='CHEM_1'
ged_method='IPFP'
saveGXL='gedlib'
# parameters for IAM function
c_ei=1
c_er=1
c_es=1
ite_max_iam = 50
epsilon_iam = 0.001
removeNodes = True
connected_iam = False
nb_update_mat = np.full((len(Gn), len(Gn)), np.inf)
# test on each pair of graphs.
# for idx1 in range(len(Gn) - 1, -1, -1):
# for idx2 in range(idx1, -1, -1):
for idx1 in range(187, 188):
for idx2 in range(167, 168):
g1 = Gn[idx1].copy()
g2 = Gn[idx2].copy()
# Gn[10] = []
# Gn[10] = []
nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True)
plt.savefig("results/gk_iam/all_pairs/mutag187.png", format="PNG")
plt.show()
plt.clf()
nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True)
plt.savefig("results/gk_iam/all_pairs/mutag167.png", format="PNG")
plt.show()
plt.clf()

###################################################################
# Gn_mix = [g.copy() for g in Gn]
# Gn_mix.append(g1.copy())
# Gn_mix.append(g2.copy())
#
# # compute
# time0 = time.time()
# km = compute_kernel(Gn_mix, gkernel, True)
# time_km = time.time() - time0
#
# # write Gram matrix to file and read it.
# np.savez('results/gram_matrix_uhpath_itr7_pq0.8.gm', gm=km, gmtime=time_km)
###################################################################
gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03.gm.npz')
km = gmfile['gm']
time_km = gmfile['gmtime']
# modify mixed gram matrix.
for i in range(len(Gn)):
km[i, len(Gn)] = km[i, idx1]
km[i, len(Gn) + 1] = km[i, idx2]
km[len(Gn), i] = km[i, idx1]
km[len(Gn) + 1, i] = km[i, idx2]
km[len(Gn), len(Gn)] = km[idx1, idx1]
km[len(Gn), len(Gn) + 1] = km[idx1, idx2]
km[len(Gn) + 1, len(Gn)] = km[idx2, idx1]
km[len(Gn) + 1, len(Gn) + 1] = km[idx2, idx2]
###################################################################
# # use only the two graphs in median set as candidates.
# Gn = [g1.copy(), g2.copy()]
# Gn_mix = Gn + [g1.copy(), g2.copy()]
# # compute
# time0 = time.time()
# km = compute_kernel(Gn_mix, gkernel, True)
# time_km = time.time() - time0
time_list = []
dis_ks_min_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list = []
g_best = []
# for each alpha
for alpha in alpha_range:
print('\n-------------------------------------------------------\n')
print('alpha =', alpha)
time0 = time.time()
dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn, [g1, g2],
[alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max,
gkernel, epsilon=epsilon,
params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es,
'ite_max': ite_max_iam, 'epsilon': epsilon_iam,
'removeNodes': removeNodes, 'connected': connected_iam},
params_ged={'ged_cost': ged_cost, 'ged_method': ged_method,
'saveGXL': saveGXL})
time_total = time.time() - time0 + time_km
print('time: ', time_total)
time_list.append(time_total)
dis_ks_min_list.append(dhat)
g_best.append(ghat_list)
nb_updated_list.append(nb_updated)
# show best graphs and save them to file.
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx])
print('one of the possible corresponding pre-images is')
nx.draw(g_best[idx][0], labels=nx.get_node_attributes(g_best[idx][0], 'atom'),
with_labels=True)
plt.savefig('results/gk_iam/mutag' + str(idx1) + '_' + str(idx2)
+ '_alpha' + str(item) + '.png', format="PNG")
# plt.show()
plt.clf()
# print(g_best[idx][0].nodes(data=True))
# print(g_best[idx][0].edges(data=True))
# for g in g_best[idx]:
# draw_Letter_graph(g, savepath='results/gk_iam/')
## nx.draw_networkx(g)
## plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# compute the corresponding sod in graph space.
for idx, item in enumerate(alpha_range):
sod_tmp, _ = median_distance(g_best[0], [g1, g2], ged_cost=ged_cost,
ged_method=ged_method, saveGXL=saveGXL)
sod_gs_list.append(sod_tmp)
sod_gs_min_list.append(np.min(sod_tmp))
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list)
print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list)
print('\nnumber of updates for each alpha: ', nb_updated_list)
print('\ntimes:', time_list)
nb_update_mat[idx1, idx2] = nb_updated_list[0]
str_fw = 'graphs %d and %d: %d.\n' % (idx1, idx2, nb_updated_list[0])
with open('results/gk_iam/all_pairs/nb_updates.txt', 'r+') as file:
content = file.read()
file.seek(0, 0)
file.write(str_fw + content)

def test_gkiam_2combination():
from gk_iam import gk_iam_nearest_multi, compute_kernel
from iam import median_distance
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:50]
remove_edges(Gn)
gkernel = 'marginalizedkernel'
lmbda = 0.03 # termination probability
r_max = 10 # iteration limit for pre-image.
alpha_range = np.linspace(0.5, 0.5, 1)
k = 20 # k nearest neighbors
epsilon = 1e-6
ged_cost='CHEM_1'
ged_method='IPFP'
saveGXL='gedlib'
c_ei=1
c_er=1
c_es=1
# randomly select two molecules
np.random.seed(1)
idx_gi = [10, 11] # np.random.randint(0, len(Gn), 2)
g1 = Gn[idx_gi[0]].copy()
g2 = Gn[idx_gi[1]].copy()
# Gn[10] = []
# Gn[10] = []
# nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True)
# plt.savefig("results/random_preimage/mutag10.png", format="PNG")
# plt.show()
# nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True)
# plt.savefig("results/random_preimage/mutag11.png", format="PNG")
# plt.show()
Gn_mix = [g.copy() for g in Gn]
Gn_mix.append(g1.copy())
Gn_mix.append(g2.copy())
# compute
# time0 = time.time()
# km = compute_kernel(Gn_mix, gkernel, True)
# time_km = time.time() - time0
# write Gram matrix to file and read it.
# np.savez('results/gram_matrix.gm', gm=km, gmtime=time_km)
gmfile = np.load('results/gram_matrix.gm.npz')
km = gmfile['gm']
time_km = gmfile['gmtime']
time_list = []
dis_ks_min_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list = []
g_best = []
# for each alpha
for alpha in alpha_range:
print('\n-------------------------------------------------------\n')
print('alpha =', alpha)
time0 = time.time()
dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn, [g1, g2],
[alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max,
gkernel, c_ei=c_ei, c_er=c_er, c_es=c_es, epsilon=epsilon,
ged_cost=ged_cost, ged_method=ged_method, saveGXL=saveGXL)
time_total = time.time() - time0 + time_km
print('time: ', time_total)
time_list.append(time_total)
dis_ks_min_list.append(dhat)
g_best.append(ghat_list)
nb_updated_list.append(nb_updated)
# show best graphs and save them to file.
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx])
print('one of the possible corresponding pre-images is')
nx.draw(g_best[idx][0], labels=nx.get_node_attributes(g_best[idx][0], 'atom'),
with_labels=True)
plt.savefig('results/gk_iam/mutag_alpha' + str(item) + '.png', format="PNG")
plt.show()
print(g_best[idx][0].nodes(data=True))
print(g_best[idx][0].edges(data=True))
# for g in g_best[idx]:
# draw_Letter_graph(g, savepath='results/gk_iam/')
## nx.draw_networkx(g)
## plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# compute the corresponding sod in graph space.
for idx, item in enumerate(alpha_range):
sod_tmp, _ = median_distance(g_best[0], [g1, g2], ged_cost=ged_cost,
ged_method=ged_method, saveGXL=saveGXL)
sod_gs_list.append(sod_tmp)
sod_gs_min_list.append(np.min(sod_tmp))
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list)
print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list)
print('\nnumber of updates for each alpha: ', nb_updated_list)
print('\ntimes:', time_list)
def test_random_preimage_2combination():
# from gk_iam import compute_kernel
from preimage import random_preimage
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:12]
remove_edges(Gn)
gkernel = 'marginalizedkernel'
# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, gkernel=gkernel)
# print(dis_max, dis_min, dis_mean)
lmbda = 0.03 # termination probability
r_max = 10 # iteration limit for pre-image.
l = 500
alpha_range = np.linspace(0, 1, 11)
k = 5 # k nearest neighbors
# randomly select two molecules
np.random.seed(1)
idx_gi = [187, 167] # np.random.randint(0, len(Gn), 2)
g1 = Gn[idx_gi[0]].copy()
g2 = Gn[idx_gi[1]].copy()
# nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True)
# plt.savefig("results/random_preimage/mutag10.png", format="PNG")
# plt.show()
# nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True)
# plt.savefig("results/random_preimage/mutag11.png", format="PNG")
# plt.show()
######################################################################
# Gn_mix = [g.copy() for g in Gn]
# Gn_mix.append(g1.copy())
# Gn_mix.append(g2.copy())
#
## g_tmp = iam([g1, g2])
## nx.draw_networkx(g_tmp)
## plt.show()
#
# # compute
# time0 = time.time()
# km = compute_kernel(Gn_mix, gkernel, True)
# time_km = time.time() - time0
###################################################################
idx1 = idx_gi[0]
idx2 = idx_gi[1]
gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03.gm.npz')
km = gmfile['gm']
time_km = gmfile['gmtime']
# modify mixed gram matrix.
for i in range(len(Gn)):
km[i, len(Gn)] = km[i, idx1]
km[i, len(Gn) + 1] = km[i, idx2]
km[len(Gn), i] = km[i, idx1]
km[len(Gn) + 1, i] = km[i, idx2]
km[len(Gn), len(Gn)] = km[idx1, idx1]
km[len(Gn), len(Gn) + 1] = km[idx1, idx2]
km[len(Gn) + 1, len(Gn)] = km[idx2, idx1]
km[len(Gn) + 1, len(Gn) + 1] = km[idx2, idx2]
###################################################################

time_list = []
nb_updated_list = []
g_best = []
dis_ks_min_list = []
# for each alpha
for alpha in alpha_range:
print('\n-------------------------------------------------------\n')
print('alpha =', alpha)
time0 = time.time()
dhat, ghat, nb_updated = random_preimage(Gn, [g1, g2], [alpha, 1 - alpha],
range(len(Gn), len(Gn) + 2), km,
k, r_max, l, gkernel)
time_total = time.time() - time0 + time_km
print('time: ', time_total)
time_list.append(time_total)
dis_ks_min_list.append(dhat)
g_best.append(ghat)
nb_updated_list.append(nb_updated)
# show best graphs and save them to file.
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx])
print('one of the possible corresponding pre-images is')
nx.draw(g_best[idx], labels=nx.get_node_attributes(g_best[idx], 'atom'),
with_labels=True)
plt.savefig('results/random_preimage/mutag_alpha' + str(item) + '.png', format="PNG")
plt.show()
plt.clf()
print(g_best[idx].nodes(data=True))
print(g_best[idx].edges(data=True))
# # compute the corresponding sod in graph space. (alpha range not considered.)
# sod_tmp, _ = median_distance(g_best[0], Gn_let)
# sod_gs_list.append(sod_tmp)
# sod_gs_min_list.append(np.min(sod_tmp))
# sod_ks_min_list.append(sod_ks)
# nb_updated_list.append(nb_updated)
# print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list)
print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list)
print('\nnumber of updates for each alpha: ', nb_updated_list)
print('\ntimes:', time_list)

###############################################################################
# help functions

def remove_edges(Gn):
for G in Gn:
for _, _, attrs in G.edges(data=True):
attrs.clear()
def kernel_distance_matrix(Gn, Kmatrix=None, gkernel=None):
from gk_iam import compute_kernel
dis_mat = np.empty((len(Gn), len(Gn)))
if Kmatrix is None:
Kmatrix = compute_kernel(Gn, gkernel, True)
for i in range(len(Gn)):
for j in range(i, len(Gn)):
dis = Kmatrix[i, i] + Kmatrix[j, j] - 2 * Kmatrix[i, j]
if dis < 0:
if dis > -1e-10:
dis = 0
else:
raise ValueError('The distance is negative.')
dis_mat[i, j] = np.sqrt(dis)
dis_mat[j, i] = dis_mat[i, j]
dis_max = np.max(np.max(dis_mat))
dis_min = np.min(np.min(dis_mat[dis_mat != 0]))
dis_mean = np.mean(np.mean(dis_mat))
return dis_mat, dis_max, dis_min, dis_mean
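# kernel_distance_matrix uses the standard kernel-induced metric: the squared
# feature-space distance between g_i and g_j is K(i, i) + K(j, j) - 2 * K(i, j),
# with tiny negative values from floating-point round-off clamped to zero.
# A two-graph sketch:
#
# import numpy as np
#
# def kernel_distance(Kmatrix, i, j):
#     d2 = Kmatrix[i, i] + Kmatrix[j, j] - 2 * Kmatrix[i, j]
#     return np.sqrt(max(d2, 0))  # clamp round-off noise to zero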
###############################################################################

if __name__ == '__main__':
###############################################################################
# test on the combination of two randomly chosen graphs. (the same setting as
# in the random pre-image paper.)
# test_random_preimage_2combination()
# test_gkiam_2combination()
# test_gkiam_2combination_all_pairs()
test_preimage_mix_2combination_all_pairs()

+ 88
- 20
pygraph/kernels/untilHPathKernel.py View File

@@ -51,6 +51,7 @@ def untilhpathkernel(*args,
applied for the graph kernel. The following choices are available:
'MinMax': use the MinMax kernel and counting feature map.
'tanimoto': use the Tanimoto kernel and binary feature map.
None: no sub-kernel is used, the kernel is computed directly.
compute_method : string
Computation method to store paths and compute the graph kernel. The
following choices are available:
@@ -72,14 +73,16 @@ def untilhpathkernel(*args,
Kmatrix = np.zeros((len(Gn), len(Gn)))
ds_attrs = get_dataset_attributes(
Gn,
attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
'edge_attr_dim', 'is_directed'],
node_label=node_label, edge_label=edge_label)
if not ds_attrs['node_labeled']:
for G in Gn:
nx.set_node_attributes(G, '0', 'atom')
if not ds_attrs['edge_labeled']:
for G in Gn:
nx.set_edge_attributes(G, '0', 'bond_type')
if k_func != None:
if not ds_attrs['node_labeled']:
for G in Gn:
nx.set_node_attributes(G, '0', 'atom')
if not ds_attrs['edge_labeled']:
for G in Gn:
nx.set_edge_attributes(G, '0', 'bond_type')


start_time = time.time()


@@ -93,12 +96,15 @@ def untilhpathkernel(*args,
else:
chunksize = 100
all_paths = [[] for _ in range(len(Gn))]
if compute_method == 'trie':
if compute_method == 'trie' and k_func != None:
getps_partial = partial(wrapper_find_all_path_as_trie, depth,
ds_attrs, node_label, edge_label)
else:
elif compute_method != 'trie' and k_func != None:
getps_partial = partial(wrapper_find_all_paths_until_length, depth,
ds_attrs, node_label, edge_label)
ds_attrs, node_label, edge_label, True)
else:
getps_partial = partial(wrapper_find_all_paths_until_length, depth,
ds_attrs, node_label, edge_label, False)
if verbose:
iterator = tqdm(pool.imap_unordered(getps_partial, itr, chunksize),
desc='getting paths', file=sys.stdout)
@@ -110,10 +116,12 @@ def untilhpathkernel(*args,
pool.join()
# for g in Gn:
# if compute_method == 'trie':
# if compute_method == 'trie' and k_func != None:
# find_all_path_as_trie(g, depth, ds_attrs, node_label, edge_label)
# else:
# elif compute_method != 'trie' and k_func != None:
# find_all_paths_until_length(g, depth, ds_attrs, node_label, edge_label)
# else:
# find_all_paths_until_length(g, depth, ds_attrs, node_label, edge_label, False)
## size = sys.getsizeof(all_paths)
## for item in all_paths:
@@ -130,20 +138,27 @@ def untilhpathkernel(*args,
## all_paths[i] = ps
## print(time.time() - ttt)
if compute_method == 'trie':
if compute_method == 'trie' and k_func != None:
def init_worker(trie_toshare):
global G_trie
G_trie = trie_toshare
do_partial = partial(wrapper_uhpath_do_trie, k_func)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(all_paths,), n_jobs=n_jobs, verbose=verbose)
else:
elif compute_method != 'trie' and k_func != None:
def init_worker(plist_toshare):
global G_plist
G_plist = plist_toshare
do_partial = partial(wrapper_uhpath_do_naive, k_func)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(all_paths,), n_jobs=n_jobs, verbose=verbose)
else:
def init_worker(plist_toshare):
global G_plist
G_plist = plist_toshare
do_partial = partial(wrapper_uhpath_do_kernelless, k_func)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(all_paths,), n_jobs=n_jobs, verbose=verbose)
# # ---- direct running, normally use single CPU core. ----
@@ -353,12 +368,62 @@ def wrapper_uhpath_do_naive(k_func, itr):
return i, j, _untilhpathkernel_do_naive(G_plist[i], G_plist[j], k_func)




def _untilhpathkernel_do_kernelless(paths1, paths2, k_func):
"""Calculate path graph kernels up to depth d between 2 graphs naively.

Parameters
----------
paths1, paths2 : list
Lists of paths in the two graphs, where for unlabeled graphs each
path is represented by a list of nodes, while for labeled graphs each
path is represented by a string consisting of labels of nodes and/or
edges on that path.
k_func : string
A kernel function applied using different notions of fingerprint
similarity.

Return
------
kernel : float
Path kernel up to h between 2 graphs.
"""
# paths may be lists of nodes for unlabeled graphs; convert them to
# hashable tuples before building sets.
paths1 = [tuple(p) if isinstance(p, list) else p for p in paths1]
paths2 = [tuple(p) if isinstance(p, list) else p for p in paths2]
all_paths = list(set(paths1 + paths2))

if k_func == 'tanimoto':
length_union = len(set(paths1 + paths2))
kernel = (len(set(paths1)) + len(set(paths2)) -
length_union) / length_union
# vector1 = [(1 if path in paths1 else 0) for path in all_paths]
# vector2 = [(1 if path in paths2 else 0) for path in all_paths]
# kernel_uv = np.dot(vector1, vector2)
# kernel = kernel_uv / (len(set(paths1)) + len(set(paths2)) - kernel_uv)

else: # MinMax kernel
path_count1 = Counter(paths1)
path_count2 = Counter(paths2)
vector1 = [(path_count1[key] if (key in path_count1.keys()) else 0)
for key in all_paths]
vector2 = [(path_count2[key] if (key in path_count2.keys()) else 0)
for key in all_paths]
kernel = np.sum(np.minimum(vector1, vector2)) / \
np.sum(np.maximum(vector1, vector2))

return kernel


def wrapper_uhpath_do_kernelless(k_func, itr):
i = itr[0]
j = itr[1]
return i, j, _untilhpathkernel_do_kernelless(G_plist[i], G_plist[j], k_func)
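# As a quick illustration of the two fingerprint similarities above (toy path
# strings, not from a real dataset): for paths1 = ['a', 'ab', 'ab'] and
# paths2 = ['ab', 'b'], the Tanimoto kernel compares the path sets,
# (2 + 2 - 3) / 3 = 1/3, while the MinMax kernel compares multiset counts,
# sum(min) / sum(max) = (0 + 1 + 0) / (1 + 2 + 1) = 1/4:
#
# paths1 = ['a', 'ab', 'ab']
# paths2 = ['ab', 'b']
# print(_untilhpathkernel_do_kernelless(paths1, paths2, 'tanimoto'))  # 0.333...
# print(_untilhpathkernel_do_kernelless(paths1, paths2, 'MinMax'))  # 0.25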


# @todo: (can be removed maybe) this method finds paths repetitively; it could be faster.
def find_all_paths_until_length(G,
length,
ds_attrs,
node_label='atom',
edge_label='bond_type'):
edge_label='bond_type',
tolabelseqs=True):
"""Find all paths no longer than a certain maximum length in a graph. A """Find all paths no longer than a certain maximum length in a graph. A
recursive depth first search is applied. recursive depth first search is applied.


@@ -398,7 +463,7 @@ def find_all_paths_until_length(G,
# path_l = path_l_new[:]


path_l = [[n] for n in G.nodes] # paths of length l
all_paths = path_l[:]
all_paths = [p.copy() for p in path_l]
for l in range(1, length + 1):
path_lplus1 = []
for path in path_l:
@@ -409,7 +474,7 @@ def find_all_paths_until_length(G,
path_lplus1.append(tmp)


all_paths += path_lplus1
path_l = path_lplus1[:]
path_l = [p.copy() for p in path_lplus1]


# for i in range(0, length + 1):
# new_paths = find_all_paths(G, i)
@@ -419,15 +484,18 @@ def find_all_paths_until_length(G,


# consider labels
# print(paths2labelseqs(all_paths, G, ds_attrs, node_label, edge_label))
return paths2labelseqs(all_paths, G, ds_attrs, node_label, edge_label)
return (paths2labelseqs(all_paths, G, ds_attrs, node_label, edge_label)
if tolabelseqs else all_paths)
def wrapper_find_all_paths_until_length(length, ds_attrs, node_label,
edge_label, itr_item):
edge_label, tolabelseqs, itr_item):
g = itr_item[0]
i = itr_item[1]
return i, find_all_paths_until_length(g, length, ds_attrs,
node_label=node_label, edge_label=edge_label)
node_label=node_label, edge_label=edge_label,
tolabelseqs=tolabelseqs)




def find_all_path_as_trie(G,


+ 8
- 8
pygraph/utils/graphfiles.py View File

@@ -84,7 +84,7 @@ def loadGXL(filename):
return g




def saveGXL(graph, filename, method='gedlib-letter'):
def saveGXL(graph, filename, method='benoit'):
if method == 'benoit':
import xml.etree.ElementTree as ET
root_node = ET.Element('gxl')
@@ -131,13 +131,13 @@ def saveGXL(graph, filename, method='gedlib-letter'):
gxl_file.write("<gxl>\n") gxl_file.write("<gxl>\n")
gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"true\" edgemode=\"undirected\">\n") gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"true\" edgemode=\"undirected\">\n")
for v, attrs in graph.nodes(data=True): for v, attrs in graph.nodes(data=True):
gxl_file.write("<node id=\"_" + str(v) + "\">\n")
gxl_file.write("<attr name=\"" + "chem" + "\"><int>" + str(attrs['atom']) + "</int></attr>\n")
gxl_file.write("<node id=\"_" + str(v) + "\">")
gxl_file.write("<attr name=\"" + "chem" + "\"><int>" + str(attrs['atom']) + "</int></attr>")
gxl_file.write("</node>\n") gxl_file.write("</node>\n")
for v1, v2, attrs in graph.edges(data=True): for v1, v2, attrs in graph.edges(data=True):
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">\n")
# gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['bond_type']) + "</int></attr>\n")
gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>\n")
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">")
# gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['bond_type']) + "</int></attr>")
gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>")
gxl_file.write("</edge>\n") gxl_file.write("</edge>\n")
gxl_file.write("</graph>\n") gxl_file.write("</graph>\n")
gxl_file.write("</gxl>\n") gxl_file.write("</gxl>\n")
@@ -485,7 +485,7 @@ def loadDataset(filename, filename_y=None, extra_params=None):
return data, y




def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile'):
def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None):
"""Save list of graphs. """Save list of graphs.
""" """
import os import os
@@ -502,7 +502,7 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile'):
fgroup.write("\n<GraphCollection>") fgroup.write("\n<GraphCollection>")
for idx, g in enumerate(Gn): for idx, g in enumerate(Gn):
fname_tmp = "graph" + str(idx) + ".gxl" fname_tmp = "graph" + str(idx) + ".gxl"
saveGXL(g, dirname_ds + fname_tmp)
saveGXL(g, dirname_ds + fname_tmp, method=xparams['method'])
fgroup.write("\n\t<graph file=\"" + fname_tmp + "\" class=\"" + str(y[idx]) + "\"/>") fgroup.write("\n\t<graph file=\"" + fname_tmp + "\" class=\"" + str(y[idx]) + "\"/>")
fgroup.write("\n</GraphCollection>") fgroup.write("\n</GraphCollection>")
fgroup.close() fgroup.close()

