
Update pre-image.

v0.1
jajupmochi committed 5 years ago · commit 57e13c9c5d

11 changed files with 6214 additions and 1002 deletions
1. notebooks/run_untilhpathkernel.py (+5, -3)
2. notebooks/utils/plot_all_graphs.ipynb (+4191, -491)
3. preimage/gk_iam.py (+340, -38)
4. preimage/iam.py (+493, -368)
5. preimage/median.py (+29, -29)
6. preimage/preimage.py (+152, -17)
7. preimage/run_gk_iam.py (+298, -17)
8. preimage/test.py (+11, -11)
9. preimage/test_random_mutag.py (+599, -0)
10. pygraph/kernels/untilHPathKernel.py (+88, -20)
11. pygraph/utils/graphfiles.py (+8, -8)

notebooks/run_untilhpathkernel.py (+5, -3)

@@ -54,9 +54,11 @@ dslist = [
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
estimator = untilhpathkernel
param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2],
'k_func': ['MinMax'], # ['MinMax', 'tanimoto'],
'compute_method': ['trie']} # ['MinMax']}
param_grid_precomputed = {'depth': np.linspace(3, 10, 8), # [2],
'k_func': [None]} # ['MinMax', 'tanimoto'],
#param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2],
# 'k_func': ['MinMax'], # ['MinMax', 'tanimoto'],
# 'compute_method': ['trie']} # ['MinMax']}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]
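
For reference, a precomputed-kernel grid like param_grid_precomputed above is expanded into individual kernel settings before the Gram matrices are computed. A minimal sketch of that expansion, assuming scikit-learn's ParameterGrid (the sketch is illustrative and not part of this commit):

    import numpy as np
    from sklearn.model_selection import ParameterGrid

    # hypothetical grid mirroring the one above
    param_grid_precomputed = {'depth': list(np.linspace(3, 10, 8)),
                              'k_func': [None]}
    for params in ParameterGrid(param_grid_precomputed):
        # each `params` is one setting, e.g. {'depth': 3.0, 'k_func': None}
        print(params)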



notebooks/utils/plot_all_graphs.ipynb (+4191, -491)
(File diff suppressed because it is too large.)


preimage/gk_iam.py (+340, -38)

@@ -17,8 +17,11 @@ import multiprocessing
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt
import random

from iam import iam, test_iam_with_more_graphs_as_init, test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations
import matplotlib.pyplot as plt

from iam import iam, test_iam_with_more_graphs_as_init, iam_moreGraphsAsInit_tryAllPossibleBestGraphs
sys.path.insert(0, "../")
from pygraph.kernels.marginalizedKernel import marginalizedkernel
from pygraph.kernels.untilHPathKernel import untilhpathkernel
@@ -67,7 +70,7 @@ def gk_iam(Gn, alpha):
# Gs_nearest = Gk + gihat_list
# g_tmp = iam(Gs_nearest)
#
# # compute distance between phi and the new generated graph.
# # compute distance between \psi and the new generated graph.
# knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None,
# p_quit=lmbda, n_iteration=20, remove_totters=False,
# n_jobs=multiprocessing.cpu_count(), verbose=False)
@@ -142,7 +145,7 @@ def gk_iam_nearest(Gn, alpha, idx_gi, Kmatrix, k, r_max):
print(g_tmp.nodes(data=True))
print(g_tmp.edges(data=True))
# compute distance between phi and the new generated graph.
# compute distance between \psi and the new generated graph.
gi_list = [Gn[i] for i in idx_gi]
knew = compute_kernel([g_tmp] + gi_list, 'untilhpathkernel', False)
dnew = dis_gstar(0, range(1, len(gi_list) + 1), alpha, knew)
@@ -236,7 +239,7 @@ def gk_iam_nearest(Gn, alpha, idx_gi, Kmatrix, k, r_max):
# print(g.nodes(data=True))
# print(g.edges(data=True))
#
# # compute distance between phi and the new generated graphs.
# # compute distance between \psi and the new generated graphs.
# gi_list = [Gn[i] for i in idx_gi]
# knew = compute_kernel(g_tmp_list + gi_list, 'marginalizedkernel', False)
# dnew_list = []
@@ -278,7 +281,12 @@ def gk_iam_nearest(Gn, alpha, idx_gi, Kmatrix, k, r_max):


def gk_iam_nearest_multi(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max,
gkernel, c_ei=1, c_er=1, c_es=1, epsilon=0.001):
gkernel, epsilon=0.001,
params_iam={'c_ei': 1, 'c_er': 1, 'c_es': 1,
'ite_max': 50, 'epsilon': 0.001,
'removeNodes': True, 'connected': False},
params_ged={'ged_cost': 'CHEM_1', 'ged_method': 'IPFP',
'saveGXL': 'benoit'}):
"""This function constructs graph pre-image by the iterative pre-image
framework in reference [1], algorithm 1, where the step of generating new
graphs randomly is replaced by the IAM algorithm in reference [2].
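
A sketch of how the updated signature might be called, with the IAM and GED options grouped into the two new dicts (a hypothetical invocation; Gn_init, Gn_median, alpha, idx_gi and Kmatrix are assumed to be prepared as elsewhere in this file):

    dhat, ghat_list, dis_last, nb_updated = gk_iam_nearest_multi(
        Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k=5, r_max=10,
        gkernel='untilhpathkernel', epsilon=0.001,
        params_iam={'c_ei': 1, 'c_er': 1, 'c_es': 1, 'ite_max': 50,
                    'epsilon': 0.001, 'removeNodes': True, 'connected': False},
        params_ged={'ged_cost': 'CHEM_1', 'ged_method': 'IPFP',
                    'saveGXL': 'benoit'})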
@@ -310,7 +318,7 @@ def gk_iam_nearest_multi(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max,
g0hat_list = [Gn_init[idx] for idx in sort_idx[0:nb_best]] # the nearest neighbors of phi in DN
if dis_gs[0] == 0: # the exact pre-image.
print('The exact pre-image is found from the input dataset.')
return 0, g0hat_list
return 0, g0hat_list, 0, 0
dhat = dis_gs[0] # the nearest distance
ghat_list = [g.copy() for g in g0hat_list]
# for g in ghat_list:
@@ -320,31 +328,33 @@ def gk_iam_nearest_multi(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max,
# print(g.nodes(data=True))
# print(g.edges(data=True))
Gk = [Gn_init[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors
# for gi in Gk:
## nx.draw_networkx(gi)
## plt.show()
for gi in Gk:
nx.draw(gi, labels=nx.get_node_attributes(gi, 'atom'), with_labels=True)
# nx.draw_networkx(gi)
plt.show()
# draw_Letter_graph(g)
# print(gi.nodes(data=True))
# print(gi.edges(data=True))
Gs_nearest = Gk.copy()
print(gi.nodes(data=True))
print(gi.edges(data=True))
Gs_nearest = [g.copy() for g in Gk]
Gn_nearest_median = [g.copy() for g in Gs_nearest]
# gihat_list = []
# i = 1
r = 0
itr = 0
# cur_sod = dhat
# old_sod = cur_sod * 2
sod_list = [dhat]
itr_total = 0
# cur_dis = dhat
# old_dis = cur_dis * 2
dis_list = [dhat]
found = False
nb_updated = 0
while r < r_max:# and not found: # @todo: if not found?# and np.abs(old_sod - cur_sod) > epsilon:
print('\nr =', r)
print('itr for gk =', itr, '\n')
while r < r_max:# and not found: # @todo: if not found?# and np.abs(old_dis - cur_dis) > epsilon:
print('\nCurrent preimage iteration =', r)
print('Total preimage iteration =', itr_total, '\n')
found = False
# Gs_nearest = Gk + gihat_list
# g_tmp = iam(Gs_nearest)
g_tmp_list, _ = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
Gn_median, Gs_nearest, c_ei=c_ei, c_er=c_er, c_es=c_es)
g_tmp_list, _ = iam_moreGraphsAsInit_tryAllPossibleBestGraphs(
Gn_nearest_median, Gs_nearest, params_ged=params_ged, **params_iam)
# for g in g_tmp_list:
# nx.draw_networkx(g)
# plt.show()
@@ -352,31 +362,73 @@ def gk_iam_nearest_multi(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max,
# print(g.nodes(data=True))
# print(g.edges(data=True))
# compute distance between phi and the new generated graphs.
# compute distance between \psi and the new generated graphs.
knew = compute_kernel(g_tmp_list + Gn_median, gkernel, False)
dnew_list = []
for idx, g_tmp in enumerate(g_tmp_list):
# @todo: the term3 below could use the one at the beginning of the function.
dnew_list.append(dis_gstar(idx, range(len(g_tmp_list),
len(g_tmp_list) + len(Gn_median) + 1), alpha, knew,
withterm3=False))
len(g_tmp_list) + len(Gn_median) + 1),
alpha, knew, withterm3=False))
# dnew = knew[0, 0] - 2 * (alpha[0] * knew[0, 1] + alpha[1] *
# knew[0, 2]) + (alpha[0] * alpha[0] * k_list[0] + alpha[0] *
# alpha[1] * k_g2_list[0] + alpha[1] * alpha[0] *
# k_g1_list[1] + alpha[1] * alpha[1] * k_list[1])
# # find the new k nearest graphs.
# dnew_best = min(dnew_list)
# dis_gs = dnew_list + dis_gs # add the new nearest distances.
# Gs_nearest = [g.copy() for g in g_tmp_list] + Gs_nearest # add the corresponding graphs.
# sort_idx = np.argsort(dis_gs)
# if len([i for i in sort_idx[0:k] if i < len(dnew_list)]) > 0:
# print('We got new k nearest neighbors! Hurray!')
# dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]] # the new k nearest distances.
## print(dis_gs[-1])
# Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
# nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist())
# if dnew_best < dhat and np.abs(dnew_best - dhat) > epsilon:
# print('I have smaller distance!')
# print(str(dhat) + '->' + str(dis_gs[0]))
# dhat = dis_gs[0]
# idx_best_list = np.argwhere(dnew_list == dhat).flatten().tolist()
# ghat_list = [g_tmp_list[idx].copy() for idx in idx_best_list]
## for g in ghat_list:
### nx.draw_networkx(g)
### plt.show()
## draw_Letter_graph(g)
## print(g.nodes(data=True))
## print(g.edges(data=True))
# r = 0
# found = True
# nb_updated += 1
# elif np.abs(dnew_best - dhat) < epsilon:
# print('I have almost equal distance!')
# print(str(dhat) + '->' + str(dnew_best))
# else:
# dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]]
# Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
# Gn_nearest_median = [g.copy() for g in Gs_nearest]
# if not found:
# r += 1
# find the new k nearest graphs.
dnew_best = min(dnew_list)
dis_gs = dnew_list + dis_gs # add the new nearest distances.
Gs_nearest = [g.copy() for g in g_tmp_list] + Gs_nearest # add the corresponding graphs.
sort_idx = np.argsort(dis_gs)
if np.abs(dnew_best - dhat) >= epsilon:
dis_gs = dnew_list + dis_gs # add the new nearest distances.
Gs_nearest = [g.copy() for g in g_tmp_list] + Gs_nearest # add the corresponding graphs.
sort_idx = np.argsort(dis_gs)
else: # if the new distance is equal to the old one.
# @todo: works if only one graph is generated.
Gs_nearest[0] = g_tmp_list[0].copy()
sort_idx = np.argsort(dis_gs)
if len([i for i in sort_idx[0:k] if i < len(dnew_list)]) > 0:
print('We got better k nearest neighbors! Hurray!')
print('We got new k nearest neighbors! Hurray!')
dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]] # the new k nearest distances.
print(dis_gs[-1])
# print(dis_gs[-1])
Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist())
if dnew_best < dhat and np.abs(dnew_best - dhat) > epsilon:
if dnew_best < dhat and np.abs(dnew_best - dhat) >= epsilon:
print('I have smaller distance!')
print(str(dhat) + '->' + str(dis_gs[0]))
dhat = dis_gs[0]
@@ -394,19 +446,269 @@ def gk_iam_nearest_multi(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max,
elif np.abs(dnew_best - dhat) < epsilon:
print('I have almost equal distance!')
print(str(dhat) + '->' + str(dnew_best))
else:
dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]]
Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
Gn_nearest_median = [g.copy() for g in Gs_nearest]
if not found:
r += 1
# old_sod = cur_sod
# cur_sod = dnew_best
sod_list.append(dhat)
itr += 1
# old_dis = cur_dis
# cur_dis = dnew_best
dis_list.append(dhat)
itr_total += 1
print('\nthe graph is updated', nb_updated, 'times.')
print('sods in kernel space:', sod_list, '\n')
print('distances in kernel space:', dis_list, '\n')
return dhat, ghat_list
return dhat, ghat_list, dis_list[-1], nb_updated



def preimage_iam_random_mix(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max,
l_max, gkernel, epsilon=0.001,
params_iam={'c_ei': 1, 'c_er': 1, 'c_es': 1,
'ite_max': 50, 'epsilon': 0.001,
'removeNodes': True, 'connected': False},
params_ged={'ged_cost': 'CHEM_1', 'ged_method': 'IPFP',
'saveGXL': 'benoit'}):
"""This function constructs graph pre-image by the iterative pre-image
framework in reference [1], algorithm 1, where new graphs are generated
randomly and by the IAM algorithm in reference [2].
notes
-----
Every time a set of n better graphs is acquired, their distances in kernel space are
compared with the k nearest ones, and the k nearest distances from the k+n
distances will be used as the new ones.
"""
Gn_init = [nx.convert_node_labels_to_integers(g) for g in Gn_init]
# compute k nearest neighbors of phi in DN.
dis_list = [] # distance between g_star and each graph.
term3 = 0
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout):
dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
dis_list.append(dtemp)
# sort
sort_idx = np.argsort(dis_list)
dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] # the k shortest distances
nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist())
g0hat_list = [Gn_init[idx] for idx in sort_idx[0:nb_best]] # the nearest neighbors of phi in DN
if dis_gs[0] == 0: # the exact pre-image.
print('The exact pre-image is found from the input dataset.')
return 0, g0hat_list, 0, 0
dhat = dis_gs[0] # the nearest distance
ghat_list = [g.copy() for g in g0hat_list]
# for g in ghat_list:
# draw_Letter_graph(g)
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
Gk = [Gn_init[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors
for gi in Gk:
nx.draw(gi, labels=nx.get_node_attributes(gi, 'atom'), with_labels=True)
# nx.draw_networkx(gi)
plt.show()
# draw_Letter_graph(g)
print(gi.nodes(data=True))
print(gi.edges(data=True))
Gs_nearest = [g.copy() for g in Gk]
Gn_nearest_median = [g.copy() for g in Gs_nearest]
# gihat_list = []
# i = 1
r = 0
itr_total = 0
# cur_dis = dhat
# old_dis = cur_dis * 2
dis_list = [dhat]
found = False
nb_updated_iam = 0
nb_updated_random = 0
while r < r_max: # and not found: # @todo: if not found?# and np.abs(old_dis - cur_dis) > epsilon:
print('\n-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-')
print('Current preimage iteration =', r)
print('Total preimage iteration =', itr_total, '\n')
found = False
# Gs_nearest = Gk + gihat_list
# g_tmp = iam(Gs_nearest)
g_tmp_list, _ = iam_moreGraphsAsInit_tryAllPossibleBestGraphs(
Gn_nearest_median, Gs_nearest, params_ged=params_ged, **params_iam)
# for g in g_tmp_list:
# nx.draw_networkx(g)
# plt.show()
# draw_Letter_graph(g)
# print(g.nodes(data=True))
# print(g.edges(data=True))
# compute distance between \psi and the new generated graphs.
knew = compute_kernel(g_tmp_list + Gn_median, gkernel, False)
dnew_list = []
for idx, g_tmp in enumerate(g_tmp_list):
# @todo: the term3 below could use the one at the beginning of the function.
dnew_list.append(dis_gstar(idx, range(len(g_tmp_list),
len(g_tmp_list) + len(Gn_median) + 1),
alpha, knew, withterm3=False))
# find the new k nearest graphs.
# @todo: for now only consider the situation when only one graph is generated by IAM.
dnew_best = min(dnew_list)
gnew_best = g_tmp_list[0].copy()
# when new distance is equal to the old one, use random generation.
if np.abs(dnew_best - dhat) < epsilon or dhat < dnew_best:
# Gs_nearest[0] = g_tmp_list[0].copy()
# sort_idx = np.argsort(dis_gs)
print('Distance almost equal or worse, switching to random generation now.')
print(str(dhat) + '->' + str(dnew_best))
if dnew_best > dhat and np.abs(dnew_best - dhat) >= epsilon:
dnew_best = dhat
gnew_best = Gs_nearest[0].copy()
# number of edges to be changed.
# @todo what if the log is negative? how to choose alpha (scalar)? seems fdgs is always 1.
# fdgs = dnew_best
fdgs = nb_updated_random + 1
if fdgs < 1:
fdgs = 1
fdgs = int(np.ceil(np.log(fdgs)))
if fdgs < 1:
fdgs += 1
# fdgs = nb_updated_random + 1 # @todo:
# @todo: should we use just half of the adjacency matrix for undirected graphs?
nb_vpairs = nx.number_of_nodes(gnew_best) * (nx.number_of_nodes(gnew_best) - 1)
l = 0
while l < l_max:
# add and delete edges.
gtemp = gnew_best.copy()
np.random.seed()
# which edges to change.
# @todo: what if fdgs is bigger than nb_vpairs?
idx_change = random.sample(range(nb_vpairs), fdgs if
fdgs < nb_vpairs else nb_vpairs)
# idx_change = np.random.randint(0, nx.number_of_nodes(gs) *
# (nx.number_of_nodes(gs) - 1), fdgs)
for item in idx_change:
node1 = int(item / (nx.number_of_nodes(gtemp) - 1))
node2 = (item - node1 * (nx.number_of_nodes(gtemp) - 1))
if node2 >= node1: # skip the self pair.
node2 += 1
# @todo: is the randomness correct?
if not gtemp.has_edge(node1, node2):
gtemp.add_edge(node1, node2)
# nx.draw_networkx(gs)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
else:
gtemp.remove_edge(node1, node2)
# nx.draw_networkx(gs)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
# compute distance between \psi and the new generated graph.
knew = compute_kernel([gtemp] + Gn_median, gkernel, verbose=False)
dnew = dis_gstar(0, [1, 2], alpha, knew, withterm3=False)
# @todo: the new distance is smaller or also equal?
if dnew < dnew_best or np.abs(dnew_best - dnew) < epsilon:
if np.abs(dnew_best - dnew) < epsilon:
print('I am equal!')
dnew_best = dnew
gnew_best = gtemp.copy()
else:
print('\nI am smaller!')
print('l =', str(l))
print(dnew_best, '->', dnew)
dis_gs = [dnew] + dis_gs # add the new nearest distances.
Gs_nearest = [gtemp.copy()] + Gs_nearest # add the corresponding graphs.
sort_idx = np.argsort(dis_gs)
dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]] # the new k nearest distances.
Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
Gn_nearest_median = [g.copy() for g in Gs_nearest]
dhat = dnew
nb_updated_random += 1
found = True # found better graph.
r = 0
print('the graph is updated by random generation',
nb_updated_random, 'times.')
nx.draw(gtemp, labels=nx.get_node_attributes(gtemp, 'atom'),
with_labels=True)
## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG")
plt.show()
break
# nx.draw_networkx(gtemp)
# plt.show()
# print(gtemp.nodes(data=True))
# print(gtemp.edges(data=True))
l += 1
if l == l_max:
r += 1
else: # if the new distance is not equal to the old one.
dis_gs = dnew_list + dis_gs # add the new nearest distances.
Gs_nearest = [nx.convert_node_labels_to_integers(g).copy() for g
in g_tmp_list] + Gs_nearest # add the corresponding graphs.
sort_idx = np.argsort(dis_gs)
if len([i for i in sort_idx[0:k] if i < len(dnew_list)]) > 0:
print('We got new k nearest neighbors! Hurray!')
dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]] # the new k nearest distances.
# print(dis_gs[-1])
Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist())
if dnew_best < dhat:
print('I have smaller distance!')
print(str(dhat) + '->' + str(dis_gs[0]))
dhat = dis_gs[0]
idx_best_list = np.argwhere(dnew_list == dhat).flatten().tolist()
ghat_list = [g_tmp_list[idx].copy() for idx in idx_best_list]
# for g in ghat_list:
## nx.draw_networkx(g)
## plt.show()
# draw_Letter_graph(g)
# print(g.nodes(data=True))
# print(g.edges(data=True))
r = 0
found = True
nb_updated_iam += 1
print('the graph is updated by IAM', nb_updated_iam, 'times.')
nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'),
with_labels=True)
## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG")
plt.show()
else:
dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]]
Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
Gn_nearest_median = [g.copy() for g in Gs_nearest]
if not found:
r += 1
# old_dis = cur_dis
# cur_dis = dnew_best
dis_list.append(dhat)
itr_total += 1
print('\nthe k shortest distances are', dis_gs)
print('the shortest distances for previous iterations are', dis_list)
print('\nthe graph is updated by IAM', nb_updated_iam, 'times, and by random generation',
nb_updated_random, 'times.')
print('distances in kernel space:', dis_list, '\n')
return dhat, ghat_list, dis_list[-1], nb_updated_iam, nb_updated_random
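
The k-nearest update described in the notes above reduces to merging the freshly generated distances with the current ones and keeping the k smallest. A minimal standalone sketch of that step (function and variable names hypothetical):

    import numpy as np

    def keep_k_nearest(dnew_list, gnew_list, dis_gs, Gs_nearest, k):
        """Merge new candidates into the pool and keep the k nearest."""
        dis_all = dnew_list + dis_gs
        G_all = [g.copy() for g in gnew_list] + Gs_nearest
        sort_idx = np.argsort(dis_all)
        dis_gs = [dis_all[i] for i in sort_idx[:k]]
        Gs_nearest = [G_all[i] for i in sort_idx[:k]]
        return dis_gs, Gs_nearest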


###############################################################################
# useful functions.

def dis_gstar(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
term1 = Kmatrix[idx_g, idx_g]
@@ -424,10 +726,10 @@ def dis_gstar(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
def compute_kernel(Gn, graph_kernel, verbose):
if graph_kernel == 'marginalizedkernel':
Kmatrix, _ = marginalizedkernel(Gn, node_label='atom', edge_label=None,
p_quit=0.03, n_iteration=20, remove_totters=False,
p_quit=0.03, n_iteration=10, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'untilhpathkernel':
Kmatrix, _ = untilhpathkernel(Gn, node_label='atom', edge_label='bond_type',
Kmatrix, _ = untilhpathkernel(Gn, node_label='atom', edge_label=None,
depth=10, k_func='MinMax', compute_method='trie',
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'spkernel':
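
The kernel-space distance computed by dis_gstar follows the usual pre-image objective d(g, g*)^2 = k(g, g) - 2 * sum_i alpha_i k(g, g_i) + sum_{i,j} alpha_i alpha_j k(g_i, g_j); term3 above is exactly the double sum. A self-contained sketch consistent with that decomposition (whether the original takes the square root is not visible in this hunk, but comparisons are unaffected either way):

    import numpy as np

    def dis_gstar_sketch(idx_g, idx_gi, alpha, K):
        """Distance between graph idx_g and the weighted point g* in kernel space."""
        term1 = K[idx_g, idx_g]
        term2 = sum(a * K[idx_g, j] for a, j in zip(alpha, idx_gi))
        term3 = sum(a1 * a2 * K[i1, i2]
                    for a1, i1 in zip(alpha, idx_gi)
                    for a2, i2 in zip(alpha, idx_gi))
        return np.sqrt(term1 - 2 * term2 + term3)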


preimage/iam.py (+493, -368)

@@ -20,7 +20,424 @@ from pygraph.utils.graphdataset import get_dataset_attributes
from pygraph.utils.utils import graph_isIdentical, get_node_labels, get_edge_labels
#from pygraph.utils.utils import graph_deepcopy

def iam_moreGraphsAsInit_tryAllPossibleBestGraphs(Gn_median, Gn_candidate,
c_ei=3, c_er=3, c_es=1, ite_max=50, epsilon=0.001,
node_label='atom', edge_label='bond_type',
connected=False, removeNodes=True, AllBestInit=True,
params_ged={'ged_cost': 'CHEM_1', 'ged_method': 'IPFP', 'saveGXL': 'benoit'}):
"""See my name, then you know what I do.
"""
from tqdm import tqdm
# Gn_median = Gn_median[0:10]
# Gn_median = [nx.convert_node_labels_to_integers(g) for g in Gn_median]
if removeNodes:
node_ir = np.inf # corresponding to the node remove and insertion.
label_r = 'thanksdanny' # the label for node removal. # @todo: make sure this label cannot collide with real node labels.
ds_attrs = get_dataset_attributes(Gn_median + Gn_candidate,
attr_names=['edge_labeled', 'node_attr_dim', 'edge_attr_dim'],
edge_label=edge_label)

def generate_graph(G, pi_p_forward, label_set):
G_new_list = [G.copy()] # all "best" graphs generated in this iteration.
# nx.draw_networkx(G)
# import matplotlib.pyplot as plt
# plt.show()
# print(pi_p_forward)
# update vertex labels.
# pre-compute h_i0 for each label.
# for label in get_node_labels(Gn, node_label):
# print(label)
# for nd in G.nodes(data=True):
# pass
if not ds_attrs['node_attr_dim']: # labels are symbolic
for ndi, (nd, _) in enumerate(G.nodes(data=True)):
h_i0_list = []
label_list = []
for label in label_set:
h_i0 = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][ndi]
if pi_i != node_ir and g.nodes[pi_i][node_label] == label:
h_i0 += 1
h_i0_list.append(h_i0)
label_list.append(label)
# case when the node is to be removed.
if removeNodes:
h_i0_remove = 0 # @todo: maybe this can be added to the label_set above.
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][ndi]
if pi_i == node_ir:
h_i0_remove += 1
h_i0_list.append(h_i0_remove)
label_list.append(label_r)
# get the best labels.
idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
nlabel_best = [label_list[idx] for idx in idx_max]
# generate "best" graphs with regard to "best" node labels.
G_new_list_nd = []
for g in G_new_list: # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now.
for nl in nlabel_best:
g_tmp = g.copy()
if nl == label_r:
g_tmp.remove_node(nd)
else:
g_tmp.nodes[nd][node_label] = nl
G_new_list_nd.append(g_tmp)
# nx.draw_networkx(g_tmp)
# import matplotlib.pyplot as plt
# plt.show()
# print(g_tmp.nodes(data=True))
# print(g_tmp.edges(data=True))
G_new_list = [ggg.copy() for ggg in G_new_list_nd]

else: # labels are non-symbolic
for ndi, (nd, _) in enumerate(G.nodes(data=True)):
Si_norm = 0
phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][ndi]
if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0?
Si_norm += 1
phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
phi_i_bar /= Si_norm
G_new_list[0].nodes[nd]['attributes'] = phi_i_bar
# for g in G_new_list:
# import matplotlib.pyplot as plt
# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# update edge labels and adjacency matrix.
if ds_attrs['edge_labeled']:
G_new_list_edge = []
for g_new in G_new_list:
nd_list = [n for n in g_new.nodes()]
g_tmp_list = [g_new.copy()]
for nd1i in range(nx.number_of_nodes(g_new)):
nd1 = nd_list[nd1i]# @todo: not just edges, but all pairs of nodes
for nd2i in range(nd1i + 1, nx.number_of_nodes(g_new)):
nd2 = nd_list[nd2i]
# for nd1, nd2, _ in g_new.edges(data=True):
h_ij0_list = []
label_list = []
# @todo: compute edge label set before.
for label in get_edge_labels(Gn_median, edge_label):
h_ij0 = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][nd1i]
pi_j = pi_p_forward[idx][nd2i]
h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
g.has_edge(pi_i, pi_j) and
g.edges[pi_i, pi_j][edge_label] == label)
h_ij0 += h_ij0_p
h_ij0_list.append(h_ij0)
label_list.append(label)
# # case when the edge is to be removed.
# h_ij0_remove = 0
# for idx, g in enumerate(Gn_median):
# pi_i = pi_p_forward[idx][nd1i]
# pi_j = pi_p_forward[idx][nd2i]
# if g.has_node(pi_i) and g.has_node(pi_j) and not
# g.has_edge(pi_i, pi_j):
# h_ij0_remove += 1
# h_ij0_list.append(h_ij0_remove)
# label_list.append(label_r)
# get the best labels.
# choose all best graphs.
idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
elabel_best = [label_list[idx] for idx in idx_max]
h_ij0_max = [h_ij0_list[idx] for idx in idx_max]
# generate "best" graphs with regard to "best" node labels.
G_new_list_ed = []
for g_tmp in g_tmp_list: # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now.
for idxl, el in enumerate(elabel_best):
g_tmp_copy = g_tmp.copy()
# check whether a_ij is 0 or 1.
sij_norm = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][nd1i]
pi_j = pi_p_forward[idx][nd2i]
if g.has_node(pi_i) and g.has_node(pi_j) and \
g.has_edge(pi_i, pi_j):
sij_norm += 1
if h_ij0_max[idxl] > len(Gn_median) * c_er / c_es + \
sij_norm * (1 - (c_er + c_ei) / c_es):
if not g_tmp_copy.has_edge(nd1, nd2):
g_tmp_copy.add_edge(nd1, nd2)
g_tmp_copy.edges[nd1, nd2][edge_label] = elabel_best[idxl]
else:
if g_tmp_copy.has_edge(nd1, nd2):
g_tmp_copy.remove_edge(nd1, nd2)
G_new_list_ed.append(g_tmp_copy)
g_tmp_list = [ggg.copy() for ggg in G_new_list_ed]
G_new_list_edge += g_tmp_list
G_new_list = [ggg.copy() for ggg in G_new_list_edge]
# # choose one of the best randomly.
# idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
# h_ij0_max = h_ij0_list[idx_max[0]]
# idx_rdm = random.randint(0, len(idx_max) - 1)
# best_label = label_list[idx_max[idx_rdm]]
#
# # check whether a_ij is 0 or 1.
# sij_norm = 0
# for idx, g in enumerate(Gn_median):
# pi_i = pi_p_forward[idx][nd1i]
# pi_j = pi_p_forward[idx][nd2i]
# if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
# sij_norm += 1
# if h_ij0_max > len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
# if not g_new.has_edge(nd1, nd2):
# g_new.add_edge(nd1, nd2)
# g_new.edges[nd1, nd2][edge_label] = best_label
# else:
# if g_new.has_edge(nd1, nd2):
# g_new.remove_edge(nd1, nd2)
else: # if edges are unlabeled
# @todo: is this even right? G or g_tmp? check if the new one is right
# @todo: works only for undirected graphs.
for g_tmp in G_new_list:
nd_list = [n for n in g_tmp.nodes()]
for nd1i in range(nx.number_of_nodes(g_tmp)):
nd1 = nd_list[nd1i]
for nd2i in range(nd1i + 1, nx.number_of_nodes(g_tmp)):
nd2 = nd_list[nd2i]
sij_norm = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][nd1i]
pi_j = pi_p_forward[idx][nd2i]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if sij_norm > len(Gn_median) * c_er / (c_er + c_ei):
# @todo: should we consider if nd1 and nd2 in g_tmp?
# or just add the edge anyway?
if g_tmp.has_node(nd1) and g_tmp.has_node(nd2) \
and not g_tmp.has_edge(nd1, nd2):
g_tmp.add_edge(nd1, nd2)
# else: # @todo: which to use?
elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei):
if g_tmp.has_edge(nd1, nd2):
g_tmp.remove_edge(nd1, nd2)
# do not change anything when equal.
# for i, g in enumerate(G_new_list):
# import matplotlib.pyplot as plt
# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG")
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# # find the best graph generated in this iteration and update pi_p.
# @todo: should we update all graphs generated or just the best ones?
dis_list, pi_forward_list = median_distance(G_new_list, Gn_median,
**params_ged)
# @todo: should we remove the identical and connectivity check?
# Don't know which is faster.
if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
G_new_list, idx_list = remove_duplicates(G_new_list)
pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
dis_list = [dis_list[idx] for idx in idx_list]
# if connected == True:
# G_new_list, idx_list = remove_disconnected(G_new_list)
# pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
# idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
# dis_min = dis_list[idx_min_tmp_list[0]]
# pi_forward_list = [pi_forward_list[idx] for idx in idx_min_list]
# G_new_list = [G_new_list[idx] for idx in idx_min_list]
# for g in G_new_list:
# import matplotlib.pyplot as plt
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
return G_new_list, pi_forward_list, dis_list
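
The edge step above applies a majority-style rule from the IAM formulation: a candidate edge (with its best label, in the labeled case) is kept only when its support h_ij0 among the median graphs outweighs the edit costs. A compact sketch of the decision for one node pair (helper name hypothetical; the threshold is taken verbatim from the condition above):

    def keep_edge(h_ij0_max, sij_norm, n_median, c_ei, c_er, c_es):
        """True if the candidate edge survives the cost-weighted majority test."""
        return h_ij0_max > n_median * c_er / c_es \
            + sij_norm * (1 - (c_er + c_ei) / c_es)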
def best_median_graphs(Gn_candidate, pi_all_forward, dis_all):
idx_min_list = np.argwhere(dis_all == np.min(dis_all)).flatten().tolist()
dis_min = dis_all[idx_min_list[0]]
pi_forward_min_list = [pi_all_forward[idx] for idx in idx_min_list]
G_min_list = [Gn_candidate[idx] for idx in idx_min_list]
return G_min_list, pi_forward_min_list, dis_min
def iteration_proc(G, pi_p_forward, cur_sod):
G_list = [G]
pi_forward_list = [pi_p_forward]
old_sod = cur_sod * 2
sod_list = [cur_sod]
dis_list = [cur_sod]
# iterations.
itr = 0
# @todo: what if difference == 0?
# while itr < ite_max and (np.abs(old_sod - cur_sod) > epsilon or
# np.abs(old_sod - cur_sod) == 0):
while itr < ite_max and np.abs(old_sod - cur_sod) > epsilon:
# for itr in range(0, 5): # the convergence condition?
print('itr_iam is', itr)
G_new_list = []
pi_forward_new_list = []
dis_new_list = []
for idx, g in enumerate(G_list):
label_set = get_node_labels(Gn_median + [g], node_label)
G_tmp_list, pi_forward_tmp_list, dis_tmp_list = generate_graph(
g, pi_forward_list[idx], label_set)
G_new_list += G_tmp_list
pi_forward_new_list += pi_forward_tmp_list
dis_new_list += dis_tmp_list
# @todo: need to remove duplicates here?
G_list = [ggg.copy() for ggg in G_new_list]
pi_forward_list = [pitem.copy() for pitem in pi_forward_new_list]
dis_list = dis_new_list[:]
old_sod = cur_sod
cur_sod = np.min(dis_list)
sod_list.append(cur_sod)
itr += 1
# @todo: do we return all graphs or the best ones?
# get the best ones of the generated graphs.
G_list, pi_forward_list, dis_min = best_median_graphs(
G_list, pi_forward_list, dis_list)
if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
G_list, idx_list = remove_duplicates(G_list)
pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
# dis_list = [dis_list[idx] for idx in idx_list]
# import matplotlib.pyplot as plt
# for g in G_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
print('\nsods:', sod_list, '\n')
return G_list, pi_forward_list, dis_min
def remove_duplicates(Gn):
"""Remove duplicate graphs from list.
"""
Gn_new = []
idx_list = []
for idx, g in enumerate(Gn):
dupl = False
for g_new in Gn_new:
if graph_isIdentical(g_new, g):
dupl = True
break
if not dupl:
Gn_new.append(g)
idx_list.append(idx)
return Gn_new, idx_list
def remove_disconnected(Gn):
"""Remove disconnected graphs from list.
"""
Gn_new = []
idx_list = []
for idx, g in enumerate(Gn):
if nx.is_connected(g):
Gn_new.append(g)
idx_list.append(idx)
return Gn_new, idx_list

# phase 1: initialize.
# compute set-median.
dis_min = np.inf
dis_list, pi_forward_all = median_distance(Gn_candidate, Gn_median,
**params_ged)
# find all smallest distances.
if AllBestInit: # try all best init graphs.
idx_min_list = range(len(dis_list))
dis_min = dis_list
else:
idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
dis_min = [dis_list[idx_min_list[0]]] * len(idx_min_list)
# phase 2: iteration.
G_list = []
dis_list = []
pi_forward_list = []
for idx_tmp, idx_min in enumerate(idx_min_list):
# print('idx_min is', idx_min)
G = Gn_candidate[idx_min].copy()
# list of edit operations.
pi_p_forward = pi_forward_all[idx_min]
# pi_p_backward = pi_all_backward[idx_min]
Gi_list, pi_i_forward_list, dis_i_min = iteration_proc(G, pi_p_forward, dis_min[idx_tmp])
G_list += Gi_list
dis_list += [dis_i_min] * len(Gi_list)
pi_forward_list += pi_i_forward_list
if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
G_list, idx_list = remove_duplicates(G_list)
dis_list = [dis_list[idx] for idx in idx_list]
pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
if connected == True:
G_list_con, idx_list = remove_disconnected(G_list)
# if there are no connected graphs at all, keep the disconnected ones.
if len(G_list_con) > 0: # @todo: ??????????????????????????
G_list = G_list_con
dis_list = [dis_list[idx] for idx in idx_list]
pi_forward_list = [pi_forward_list[idx] for idx in idx_list]

# import matplotlib.pyplot as plt
# for g in G_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# get the best median graphs
# dis_list, pi_forward_list = median_distance(G_list, Gn_median,
# **params_ged)
G_min_list, pi_forward_min_list, dis_min = best_median_graphs(
G_list, pi_forward_list, dis_list)
# for g in G_min_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# randomly choose one graph.
idx_rdm = random.randint(0, len(G_min_list) - 1)
G_min_list = [G_min_list[idx_rdm]]
return G_min_list, dis_min
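
The vertex step of generate_graph above is a majority vote: each node takes the label most often assigned to its mapped images across the median graphs, with the removal label label_r appended as one more candidate before voting. A minimal sketch of the tie-aware vote (helper name and inputs hypothetical):

    import numpy as np

    def best_node_labels(h_i0_counts, labels):
        """Return all labels tied for the maximum vote count h_i0."""
        h = np.array(h_i0_counts)
        idx_max = np.argwhere(h == h.max()).flatten().tolist()
        return [labels[i] for i in idx_max]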
















###############################################################################
def iam(Gn, c_ei=3, c_er=3, c_es=1, node_label='atom', edge_label='bond_type',
connected=True):
"""See my name, then you know what I do.
@@ -148,27 +565,42 @@ def iam(Gn, c_ei=3, c_er=3, c_es=1, node_label='atom', edge_label='bond_type',
return G


def GED(g1, g2, lib='gedlib'):
def GED(g1, g2, lib='gedlib', cost='CHEM_1', method='IPFP', saveGXL='benoit',
stabilizer='min'):
"""
Compute GED.
"""
if lib == 'gedlib':
# transform dataset to the 'xml' file as the GedLib required.
saveDataset([g1, g2], [None, None], group='xml', filename='ged_tmp/tmp')
# script.appel()
saveDataset([g1, g2], [None, None], group='xml', filename='ged_tmp/tmp',
xparams={'method': saveGXL})
# script.appel()
script.PyRestartEnv()
script.PyLoadGXLGraph('ged_tmp/', 'ged_tmp/tmp.xml')
listID = script.PyGetGraphIds()
script.PySetEditCost("LETTER") #("CHEM_1")
script.PySetEditCost(cost) #("CHEM_1")
script.PyInitEnv()
script.PySetMethod("IPFP", "")
script.PySetMethod(method, "")
script.PyInitMethod()
g = listID[0]
h = listID[1]
script.PyRunMethod(g, h)
pi_forward, pi_backward = script.PyGetAllMap(g, h)
upper = script.PyGetUpperBound(g, h)
lower = script.PyGetLowerBound(g, h)
if stabilizer == None:
script.PyRunMethod(g, h)
pi_forward, pi_backward = script.PyGetAllMap(g, h)
upper = script.PyGetUpperBound(g, h)
lower = script.PyGetLowerBound(g, h)
elif stabilizer == 'min':
upper = np.inf
for itr in range(50):
script.PyRunMethod(g, h)
upper_tmp = script.PyGetUpperBound(g, h)
if upper_tmp < upper:
upper = upper_tmp
pi_forward, pi_backward = script.PyGetAllMap(g, h)
lower = script.PyGetLowerBound(g, h)
if upper == 0:
break
dis = upper
# make the map label correct (label remove map as np.inf)
@@ -177,12 +609,13 @@ def GED(g1, g2, lib='gedlib'):
nb1 = nx.number_of_nodes(g1)
nb2 = nx.number_of_nodes(g2)
pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
return dis, pi_forward, pi_backward
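
A usage sketch for the extended GED signature, assuming the gedlib Python bindings (librariesImport / script) are importable as in this file; with stabilizer='min' the matching is rerun up to 50 times and the smallest upper bound is kept:

    # hypothetical call on two networkx graphs g1, g2
    dis, pi_forward, pi_backward = GED(g1, g2, lib='gedlib', cost='CHEM_1',
                                       method='IPFP', saveGXL='benoit',
                                       stabilizer='min')
    # dis is the (stabilized) upper bound; pi_forward maps nodes of g1 to
    # nodes of g2, with np.inf marking removed nodes.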


def median_distance(Gn, Gn_median, measure='ged', verbose=False):
def median_distance(Gn, Gn_median, measure='ged', verbose=False,
ged_cost='CHEM_1', ged_method='IPFP', saveGXL='benoit'):
dis_list = []
pi_forward_list = []
for idx, G in tqdm(enumerate(Gn), desc='computing median distances',
@@ -190,7 +623,8 @@ def median_distance(Gn, Gn_median, measure='ged', verbose=False):
dis_sum = 0
pi_forward_list.append([])
for G_p in Gn_median:
dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p)
dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p,
cost=ged_cost, method=ged_method, saveGXL=saveGXL)
pi_forward_list[idx].append(pi_tmp_forward)
dis_sum += dis_tmp
dis_list.append(dis_sum)
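
median_distance then sums these pairwise GEDs over the whole median set for every candidate. A sketch of the call with the new pass-through GED options (variable names hypothetical):

    dis_list, pi_forward_list = median_distance(Gn_candidate, Gn_median,
                                                ged_cost='CHEM_1',
                                                ged_method='IPFP',
                                                saveGXL='benoit')
    # dis_list[i] is the sum of GEDs from Gn_candidate[i] to all of Gn_median.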
@@ -228,137 +662,13 @@ def test_iam_with_more_graphs_as_init(Gn, G_candidate, c_ei=3, c_er=3, c_es=1,
# list of edit operations.
pi_p_forward = pi_all_forward[idx_min]
pi_p_backward = pi_all_backward[idx_min]
# phase 2: iteration.
ds_attrs = get_dataset_attributes(Gn + [G], attr_names=['edge_labeled', 'node_attr_dim'],
edge_label=edge_label)
label_set = get_node_labels(Gn + [G], node_label)
for itr in range(0, 10): # @todo: the convergence condition?
G_new = G.copy()
# update vertex labels.
# pre-compute h_i0 for each label.
# for label in get_node_labels(Gn, node_label):
# print(label)
# for nd in G.nodes(data=True):
# pass
if not ds_attrs['node_attr_dim']: # labels are symbolic
for nd in G.nodes():
h_i0_list = []
label_list = []
for label in label_set:
h_i0 = 0
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd]
if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
h_i0 += 1
h_i0_list.append(h_i0)
label_list.append(label)
# choose one of the best randomly.
idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
idx_rdm = random.randint(0, len(idx_max) - 1)
G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]]
else: # labels are non-symbolic
for nd in G.nodes():
Si_norm = 0
phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd]
if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0?
Si_norm += 1
phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
phi_i_bar /= Si_norm
G_new.nodes[nd]['attributes'] = phi_i_bar
# update edge labels and adjacency matrix.
if ds_attrs['edge_labeled']:
for nd1, nd2, _ in G.edges(data=True):
h_ij0_list = []
label_list = []
for label in get_edge_labels(Gn, edge_label):
h_ij0 = 0
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd1]
pi_j = pi_p_forward[idx][nd2]
h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
g.has_edge(pi_i, pi_j) and
g.edges[pi_i, pi_j][edge_label] == label)
h_ij0 += h_ij0_p
h_ij0_list.append(h_ij0)
label_list.append(label)
# choose one of the best randomly.
idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
h_ij0_max = h_ij0_list[idx_max[0]]
idx_rdm = random.randint(0, len(idx_max) - 1)
best_label = label_list[idx_max[idx_rdm]]
# check whether a_ij is 0 or 1.
sij_norm = 0
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd1]
pi_j = pi_p_forward[idx][nd2]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
if not G_new.has_edge(nd1, nd2):
G_new.add_edge(nd1, nd2)
G_new.edges[nd1, nd2][edge_label] = best_label
else:
if G_new.has_edge(nd1, nd2):
G_new.remove_edge(nd1, nd2)
else: # if edges are unlabeled
# @todo: works only for undirected graphs.
for nd1 in range(nx.number_of_nodes(G)):
for nd2 in range(nd1 + 1, nx.number_of_nodes(G)):
sij_norm = 0
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd1]
pi_j = pi_p_forward[idx][nd2]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if sij_norm > len(Gn) * c_er / (c_er + c_ei):
if not G_new.has_edge(nd1, nd2):
G_new.add_edge(nd1, nd2)
elif sij_norm < len(Gn) * c_er / (c_er + c_ei):
if G_new.has_edge(nd1, nd2):
G_new.remove_edge(nd1, nd2)
# do not change anything when equal.
G = G_new.copy()
# update pi_p
pi_p_forward = []
for G_p in Gn:
dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p)
pi_p_forward.append(pi_tmp_forward)
return G


def test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, node_label='atom',
edge_label='bond_type', connected=False):
"""See my name, then you know what I do.
"""
from tqdm import tqdm
# Gn_median = Gn_median[0:10]
# Gn_median = [nx.convert_node_labels_to_integers(g) for g in Gn_median]
node_ir = np.inf # corresponding to the node remove and insertion.
label_r = 'thanksdanny' # the label for node remove. # @todo: make this label unrepeatable.
ds_attrs = get_dataset_attributes(Gn_median + Gn_candidate,
attr_names=['edge_labeled', 'node_attr_dim', 'edge_attr_dim'],
# phase 2: iteration.
ds_attrs = get_dataset_attributes(Gn + [G], attr_names=['edge_labeled', 'node_attr_dim'],
edge_label=edge_label)
ite_max = 50
epsilon = 0.001

def generate_graph(G, pi_p_forward, label_set):
G_new_list = [G.copy()] # all "best" graphs generated in this iteration.
# nx.draw_networkx(G)
# import matplotlib.pyplot as plt
# plt.show()
# print(pi_p_forward)
label_set = get_node_labels(Gn + [G], node_label)
for itr in range(0, 10): # @todo: the convergence condition?
G_new = G.copy()
# update vertex labels.
# pre-compute h_i0 for each label.
# for label in get_node_labels(Gn, node_label):
@@ -366,65 +676,41 @@ def test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
# for nd in G.nodes(data=True):
# pass
if not ds_attrs['node_attr_dim']: # labels are symbolic
for ndi, (nd, _) in enumerate(G.nodes(data=True)):
for nd in G.nodes():
h_i0_list = []
label_list = []
for label in label_set:
h_i0 = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][ndi]
if pi_i != node_ir and g.nodes[pi_i][node_label] == label:
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd]
if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
h_i0 += 1
h_i0_list.append(h_i0)
label_list.append(label)
# case when the node is to be removed.
h_i0_remove = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][ndi]
if pi_i == node_ir:
h_i0_remove += 1
h_i0_list.append(h_i0_remove)
label_list.append(label_r)
# get the best labels.
# choose one of the best randomly.
idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
nlabel_best = [label_list[idx] for idx in idx_max]
# generate "best" graphs with regard to "best" node labels.
G_new_list_nd = []
for g in G_new_list: # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now.
for nl in nlabel_best:
g_tmp = g.copy()
if nl == label_r:
g_tmp.remove_node(nd)
else:
g_tmp.nodes[nd][node_label] = nl
G_new_list_nd.append(g_tmp)
# nx.draw_networkx(g_tmp)
# import matplotlib.pyplot as plt
# plt.show()
# print(g_tmp.nodes(data=True))
# print(g_tmp.edges(data=True))
G_new_list = G_new_list_nd[:]

idx_rdm = random.randint(0, len(idx_max) - 1)
G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]]
else: # labels are non-symbolic
for ndi, (nd, _) in enumerate(G.nodes(data=True)):
for nd in G.nodes():
Si_norm = 0
phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][ndi]
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd]
if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0?
Si_norm += 1
phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
phi_i_bar /= Si_norm
G_new_list[0].nodes[nd]['attributes'] = phi_i_bar
G_new.nodes[nd]['attributes'] = phi_i_bar
# update edge labels and adjacency matrix.
if ds_attrs['edge_labeled']:
for nd1, nd2, _ in G.edges(data=True):
h_ij0_list = []
label_list = []
for label in get_edge_labels(Gn_median, edge_label):
for label in get_edge_labels(Gn, edge_label):
h_ij0 = 0
for idx, g in enumerate(Gn_median):
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd1]
pi_j = pi_p_forward[idx][nd2]
h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
@@ -441,12 +727,12 @@ def test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
# check whether a_ij is 0 or 1.
sij_norm = 0
for idx, g in enumerate(Gn_median):
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd1]
pi_j = pi_p_forward[idx][nd2]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if h_ij0_max > len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
if not G_new.has_edge(nd1, nd2):
G_new.add_edge(nd1, nd2)
G_new.edges[nd1, nd2][edge_label] = best_label
@@ -455,197 +741,36 @@ def test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
G_new.remove_edge(nd1, nd2)
else: # if edges are unlabeled
# @todo: works only for undirected graphs.
nd_list = [n for n in G.nodes()]
for g_tmp in G_new_list:
for nd1i in range(nx.number_of_nodes(G)):
nd1 = nd_list[nd1i]
for nd2i in range(nd1i + 1, nx.number_of_nodes(G)):
nd2 = nd_list[nd2i]
sij_norm = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][nd1i]
pi_j = pi_p_forward[idx][nd2i]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if sij_norm > len(Gn_median) * c_er / (c_er + c_ei):
# @todo: should we consider if nd1 and nd2 in g_tmp?
# or just add the edge anyway?
if g_tmp.has_node(nd1) and g_tmp.has_node(nd2) \
and not g_tmp.has_edge(nd1, nd2):
g_tmp.add_edge(nd1, nd2)
elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei):
if g_tmp.has_edge(nd1, nd2):
g_tmp.remove_edge(nd1, nd2)
# do not change anything when equal.
# # find the best graph generated in this iteration and update pi_p.
# @todo: should we update all graphs generated or just the best ones?
dis_list, pi_forward_list = median_distance(G_new_list, Gn_median)
# @todo: should we remove the identical and connectivity check?
# Don't know which is faster.
if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
G_new_list, idx_list = remove_duplicates(G_new_list)
pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
dis_list = [dis_list[idx] for idx in idx_list]
# if connected == True:
# G_new_list, idx_list = remove_disconnected(G_new_list)
# pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
# idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
# dis_min = dis_list[idx_min_tmp_list[0]]
# pi_forward_list = [pi_forward_list[idx] for idx in idx_min_list]
# G_new_list = [G_new_list[idx] for idx in idx_min_list]
# for g in G_new_list:
# import matplotlib.pyplot as plt
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
return G_new_list, pi_forward_list, dis_list
def best_median_graphs(Gn_candidate, pi_all_forward, dis_all):
idx_min_list = np.argwhere(dis_all == np.min(dis_all)).flatten().tolist()
dis_min = dis_all[idx_min_list[0]]
pi_forward_min_list = [pi_all_forward[idx] for idx in idx_min_list]
G_min_list = [Gn_candidate[idx] for idx in idx_min_list]
return G_min_list, pi_forward_min_list, dis_min
def iteration_proc(G, pi_p_forward, cur_sod):
G_list = [G]
pi_forward_list = [pi_p_forward]
old_sod = cur_sod * 2
sod_list = [cur_sod]
# iterations.
itr = 0
while itr < ite_max and np.abs(old_sod - cur_sod) > epsilon:
# for itr in range(0, 5): # the convergence condition?
print('itr is', itr)
G_new_list = []
pi_forward_new_list = []
dis_new_list = []
for idx, G in enumerate(G_list):
label_set = get_node_labels(Gn_median + [G], node_label)
G_tmp_list, pi_forward_tmp_list, dis_tmp_list = generate_graph(
G, pi_forward_list[idx], label_set)
G_new_list += G_tmp_list
pi_forward_new_list += pi_forward_tmp_list
dis_new_list += dis_tmp_list
G_list = G_new_list[:]
pi_forward_list = pi_forward_new_list[:]
dis_list = dis_new_list[:]
old_sod = cur_sod
cur_sod = np.min(dis_list)
sod_list.append(cur_sod)
itr += 1
# @todo: do we return all graphs or the best ones?
# get the best ones of the generated graphs.
G_list, pi_forward_list, dis_min = best_median_graphs(
G_list, pi_forward_list, dis_list)
for nd1 in range(nx.number_of_nodes(G)):
for nd2 in range(nd1 + 1, nx.number_of_nodes(G)):
sij_norm = 0
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd1]
pi_j = pi_p_forward[idx][nd2]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if sij_norm > len(Gn) * c_er / (c_er + c_ei):
if not G_new.has_edge(nd1, nd2):
G_new.add_edge(nd1, nd2)
elif sij_norm < len(Gn) * c_er / (c_er + c_ei):
if G_new.has_edge(nd1, nd2):
G_new.remove_edge(nd1, nd2)
# do not change anything when equal.
G = G_new.copy()
if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
G_list, idx_list = remove_duplicates(G_list)
pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
# dis_list = [dis_list[idx] for idx in idx_list]
# import matplotlib.pyplot as plt
# for g in G_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
print('\nsods:', sod_list, '\n')
return G_list, pi_forward_list, dis_min
def remove_duplicates(Gn):
"""Remove duplicate graphs from list.
"""
Gn_new = []
idx_list = []
for idx, g in enumerate(Gn):
dupl = False
for g_new in Gn_new:
if graph_isIdentical(g_new, g):
dupl = True
break
if not dupl:
Gn_new.append(g)
idx_list.append(idx)
return Gn_new, idx_list
# update pi_p
pi_p_forward = []
for G_p in Gn:
dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p)
pi_p_forward.append(pi_tmp_forward)
def remove_disconnected(Gn):
"""Remove disconnected graphs from list.
"""
Gn_new = []
idx_list = []
for idx, g in enumerate(Gn):
if nx.is_connected(g):
Gn_new.append(g)
idx_list.append(idx)
return Gn_new, idx_list
return G


###############################################################################

# phase 1: initialize.
# compute set-median.
dis_min = np.inf
dis_list, pi_forward_all = median_distance(Gn_candidate, Gn_median)
# find all smallest distances.
idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
dis_min = dis_list[idx_min_list[0]]
# phase 2: iteration.
G_list = []
dis_list = []
pi_forward_list = []
for idx_min in idx_min_list:
# print('idx_min is', idx_min)
G = Gn_candidate[idx_min].copy()
# list of edit operations.
pi_p_forward = pi_forward_all[idx_min]
# pi_p_backward = pi_all_backward[idx_min]
Gi_list, pi_i_forward_list, dis_i_min = iteration_proc(G, pi_p_forward, dis_min)
G_list += Gi_list
dis_list.append(dis_i_min)
pi_forward_list += pi_i_forward_list
if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
G_list, idx_list = remove_duplicates(G_list)
dis_list = [dis_list[idx] for idx in idx_list]
pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
if connected == True:
G_list_con, idx_list = remove_disconnected(G_list)
# if there are no connected graphs at all, keep the disconnected ones.
if len(G_list_con) > 0: # @todo: ??????????????????????????
G_list = G_list_con
dis_list = [dis_list[idx] for idx in idx_list]
pi_forward_list = [pi_forward_list[idx] for idx in idx_list]

# import matplotlib.pyplot as plt
# for g in G_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# get the best median graphs
# dis_list, pi_forward_list = median_distance(G_list, Gn_median)
G_min_list, pi_forward_min_list, dis_min = best_median_graphs(
G_list, pi_forward_list, dis_list)
# for g in G_min_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
return G_min_list, dis_min


if __name__ == '__main__':


preimage/median.py (+29, -29)

@@ -5,10 +5,10 @@ import numpy as np
import networkx as nx
import time
#import librariesImport
#import script
#sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/")
#import pygraph
import librariesImport
import script
sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/")
import pygraph
from pygraph.utils.graphfiles import loadDataset
def replace_graph_in_env(script, graph, old_id, label='median'):
@@ -191,28 +191,28 @@ def compute_median_set(script,listID):
return median_set_index, sod
#if __name__ == "__main__":
# # Load the dataset
# script.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml')
# script.PySetEditCost("LETTER")
# script.PyInitEnv()
# script.PySetMethod("IPFP", "")
# script.PyInitMethod()
#
# dataset,my_y = pygraph.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl")
#
# listID = script.PyGetAllGraphIds()
# median, sod = compute_median(script,listID,dataset,verbose=True)
#
# print(sod)
# draw_Letter_graph(median)
if __name__ == '__main__':
# test draw_Letter_graph
ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
'extra_params': {}} # node nsymb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
print(y_all)
for g in Gn:
draw_Letter_graph(g)
if __name__ == "__main__":
# Load the dataset
script.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml')
script.PySetEditCost("LETTER")
script.PyInitEnv()
script.PySetMethod("IPFP", "")
script.PyInitMethod()
dataset,my_y = pygraph.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl")
listID = script.PyGetAllGraphIds()
median, sod = compute_median(script,listID,dataset,verbose=True)
print(sod)
draw_Letter_graph(median)
#if __name__ == '__main__':
# # test draw_Letter_graph
# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
# 'extra_params': {}} # node nsymb
# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# print(y_all)
# for g in Gn:
# draw_Letter_graph(g)

preimage/preimage.py (+152, -17)

@@ -25,14 +25,16 @@ import functools
from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct
from pygraph.kernels.structuralspKernel import structuralspkernel

from gk_iam import dis_gstar


def compute_kernel(Gn, graph_kernel, verbose):
if graph_kernel == 'marginalizedkernel':
Kmatrix, _ = marginalizedkernel(Gn, node_label='atom', edge_label=None,
p_quit=0.03, n_iteration=20, remove_totters=False,
p_quit=0.03, n_iteration=10, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'untilhpathkernel':
Kmatrix, _ = untilhpathkernel(Gn, node_label='atom', edge_label='bond_type',
Kmatrix, _ = untilhpathkernel(Gn, node_label='atom', edge_label=None,
depth=10, k_func='MinMax', compute_method='trie',
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'spkernel':
@@ -47,34 +49,167 @@ def compute_kernel(Gn, graph_kernel, verbose):
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
# normalization
# Kmatrix_diag = Kmatrix.diagonal().copy()
# for i in range(len(Kmatrix)):
# for j in range(i, len(Kmatrix)):
# Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
# Kmatrix[j][i] = Kmatrix[i][j]
Kmatrix_diag = Kmatrix.diagonal().copy()
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
Kmatrix[j][i] = Kmatrix[i][j]
return Kmatrix
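
The loop above computes the cosine-normalized Gram matrix K[i, j] / sqrt(K[i, i] * K[j, j]). An equivalent vectorized form, shown only as a sketch (not part of this commit):

    import numpy as np

    def normalize_gram(K):
        """Cosine normalization of a Gram matrix."""
        d = np.sqrt(np.diag(K))
        return K / np.outer(d, d)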


def random_preimage(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max, l, gkernel):
Gn_init = [nx.convert_node_labels_to_integers(g) for g in Gn_init]
# compute k nearest neighbors of phi in DN.
dis_list = [] # distance between g_star and each graph.
term3 = 0
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout):
dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
dis_list.append(dtemp)
# print(np.max(dis_list))
# print(np.min(dis_list))
# print(np.min([item for item in dis_list if item != 0]))
# print(np.mean(dis_list))
# sort
sort_idx = np.argsort(dis_list)
dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] # the k shortest distances
nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist())
g0hat_list = [Gn_init[idx] for idx in sort_idx[0:nb_best]] # the nearest neighbors of phi in DN
if dis_gs[0] == 0: # the exact pre-image.
print('The exact pre-image is found from the input dataset.')
return 0, g0hat_list[0], 0
dhat = dis_gs[0] # the nearest distance
# ghat_list = [g.copy() for g in g0hat_list]
# for g in ghat_list:
# draw_Letter_graph(g)
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
Gk = [Gn_init[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors
# for gi in Gk:
## nx.draw_networkx(gi)
## plt.show()
# draw_Letter_graph(g)
# print(gi.nodes(data=True))
# print(gi.edges(data=True))
Gs_nearest = [g.copy() for g in Gk]
gihat_list = []
dihat_list = []
# i = 1
r = 0
# sod_list = [dhat]
# found = False
nb_updated = 0
g_best = []
while r < r_max:
print('\nr =', r)
print('itr for gk =', nb_updated, '\n')
found = False
dis_bests = dis_gs + dihat_list
# @todo what if the log is negative? how to choose alpha (scalar)?
fdgs_list = np.array(dis_bests)
if np.min(fdgs_list) < 1:
fdgs_list /= np.min(dis_bests)
fdgs_list = [int(item) for item in np.ceil(np.log(fdgs_list))]
if np.min(fdgs_list) < 1:
fdgs_list = np.array(fdgs_list) + 1
for ig, gs in enumerate(Gs_nearest + gihat_list):
# nx.draw_networkx(gs)
# plt.show()
for trail in range(0, l):
# for trail in tqdm(range(0, l), desc='l loops', file=sys.stdout):
# add and delete edges.
gtemp = gs.copy()
np.random.seed()
# which edges to change.
# @todo: should we use just half of the adjacency matrix for undirected graphs?
nb_vpairs = nx.number_of_nodes(gs) * (nx.number_of_nodes(gs) - 1)
# @todo: what if fdgs is bigger than nb_vpairs?
idx_change = random.sample(range(nb_vpairs), fdgs_list[ig] if
fdgs_list[ig] < nb_vpairs else nb_vpairs)
# idx_change = np.random.randint(0, nx.number_of_nodes(gs) *
# (nx.number_of_nodes(gs) - 1), fdgs)
for item in idx_change:
node1 = int(item / (nx.number_of_nodes(gs) - 1))
node2 = (item - node1 * (nx.number_of_nodes(gs) - 1))
if node2 >= node1: # skip the self pair.
node2 += 1
# @todo: is the randomness correct?
if not gtemp.has_edge(node1, node2):
gtemp.add_edge(node1, node2)
# nx.draw_networkx(gs)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
else:
gtemp.remove_edge(node1, node2)
# nx.draw_networkx(gs)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
# compute distance between \psi and the new generated graph.
# knew = marginalizedkernel([gtemp, g1, g2], node_label='atom', edge_label=None,
# p_quit=lmbda, n_iteration=20, remove_totters=False,
# n_jobs=multiprocessing.cpu_count(), verbose=False)
knew = compute_kernel([gtemp] + Gn_median, gkernel, verbose=False)
dnew = dis_gstar(0, [1, 2], alpha, knew, withterm3=False)
if dnew <= dhat: # @todo: the new distance is smaller or also equal?
if dnew < dhat:
print('\nI am smaller!')
print('ig =', str(ig), ', trial =', str(trial))
print(dhat, '->', dnew)
nb_updated += 1
elif dnew == dhat:
print('I am equal!')
# nx.draw_networkx(gtemp)
# plt.show()
# print(gtemp.nodes(data=True))
# print(gtemp.edges(data=True))
dhat = dnew
gnew = gtemp.copy()
found = True # found better graph.
if found:
r = 0
gihat_list = [gnew]
dihat_list = [dhat]
else:
r += 1
# dis_best.append(dhat)
g_best = (g0hat_list[0] if len(gihat_list) == 0 else gihat_list[0])
return dhat, g_best, nb_updated
# return 0, 0, 0
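
For clarity, the vertex-pair decoding inside the edit loop above can be isolated as follows (a minimal sketch; decode_vertex_pair is a hypothetical helper, not part of the module):

def decode_vertex_pair(item, n):
    # map an index in range(n * (n - 1)) to an ordered node pair of an
    # n-node graph, skipping the self pairs (node1, node1).
    node1 = item // (n - 1)
    node2 = item - node1 * (n - 1)
    if node2 >= node1:
        node2 += 1
    return node1, node2

# e.g. for n = 4, indices 0..11 enumerate all 12 ordered pairs without
# self loops: (0, 1), (0, 2), (0, 3), (1, 0), (1, 2), (1, 3), ...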


if __name__ == '__main__':
# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
# 'extra_params': {}} # node/edge symb
# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
# 'extra_params': {}} # node nsymb
# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
# 'extra_params': {}}
ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
'extra_params': {}} # node symb
# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
# 'extra_params': {}} # node/edge symb
ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
'extra_params': {}} # node nsymb
# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
# 'extra_params': {}}
# ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
# 'extra_params': {}} # node symb
DN, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
#DN = DN[0:10]
lmbda = 0.03 # termination probability
r_max = 10 # recursions
r_max = 3 # 10 # iteration limit.
l = 500
alpha_range = np.linspace(0.5, 0.5, 1)
#alpha_range = np.linspace(0.1, 0.9, 9)
k = 5 # k nearest neighbors
k = 10 # 5 # k nearest neighbors
# randomly select two molecules
#np.random.seed(1)


+ 298
- 17
preimage/run_gk_iam.py View File

@@ -245,6 +245,9 @@ def test_remove_bests(Gn, gkernel):
print(g.edges(data=True))
###############################################################################
# Tests on dataset Letter-H.
def test_gkiam_letter_h():
from gk_iam import gk_iam_nearest_multi, compute_kernel
from iam import median_distance
@@ -263,8 +266,10 @@ def test_gkiam_letter_h():
# classify graphs according to letters.
idx_dict = get_same_item_indices(y_all)
time_list = []
sod_list = []
sod_min_list = []
sod_ks_min_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list = []
for letter in idx_dict:
print('\n-------------------------------------------------------\n')
Gn_let = [Gn[i].copy() for i in idx_dict[letter]]
@@ -280,10 +285,10 @@ def test_gkiam_letter_h():
# for each alpha
for alpha in alpha_range:
print('alpha =', alpha)
dhat, ghat_list = gk_iam_nearest_multi(Gn_let, Gn_let, [alpha] * len(Gn_let),
range(len(Gn_let), len(Gn_mix)), km,
k, r_max, gkernel, c_ei=1.7,
c_er=1.7, c_es=1.7)
dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn_let,
Gn_let, [alpha] * len(Gn_let), range(len(Gn_let), len(Gn_mix)),
km, k, r_max, gkernel, c_ei=1.7, c_er=1.7, c_es=1.7,
ged_cost='LETTER', ged_method='IPFP', saveGXL='gedlib-letter')
dis_best.append(dhat)
g_best.append(ghat_list)
time_list.append(time.time() - time0)
@@ -300,13 +305,18 @@ def test_gkiam_letter_h():
print(g.edges(data=True))
# compute the corresponding sod in graph space. (alpha range not considered.)
sod_tmp, _ = median_distance(g_best[0], Gn_let)
sod_list.append(sod_tmp)
sod_min_list.append(np.min(sod_tmp))
sod_tmp, _ = median_distance(g_best[0], Gn_let, ged_cost='LETTER',
ged_method='IPFP', saveGXL='gedlib-letter')
sod_gs_list.append(sod_tmp)
sod_gs_min_list.append(np.min(sod_tmp))
sod_ks_min_list.append(sod_ks)
nb_updated_list.append(nb_updated)
print('\nsods in graph space: ', sod_list)
print('\nsmallest sod in graph space for each letter: ', sod_min_list)
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each letter: ', sod_gs_min_list)
print('\nsmallest sod in kernel space for each letter: ', sod_ks_min_list)
print('\nnumber of updates for each letter: ', nb_updated_list)
print('\ntimes:', time_list)
@@ -356,7 +366,8 @@ def test_iam_letter_h():
for alpha in alpha_range:
print('alpha =', alpha)
ghat_list, dhat = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
Gn_let, Gn_let, c_ei=1.7, c_er=1.7, c_es=1.7)
Gn_let, Gn_let, c_ei=1.7, c_er=1.7, c_es=1.7,
ged_cost='LETTER', ged_method='IPFP', saveGXL='gedlib-letter')
dis_best.append(dhat)
g_best.append(ghat_list)
time_list.append(time.time() - time0)
@@ -388,18 +399,283 @@ def test_iam_letter_h():
print('\nsods in kernel space: ', sod_list)
print('\nsmallest sod in kernel space for each letter: ', sod_min_list)
print('\ntimes:', time_list)
def test_random_preimage_letter_h():
from preimage import random_preimage, compute_kernel
from iam import median_distance
ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
'extra_params': {}} # node nsymb
# ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt',
# 'extra_params': {}} # node nsymb
# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
# 'extra_params': {}} # node/edge symb
# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
# 'extra_params': {}}
# ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
# 'extra_params': {}} # node symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
gkernel = 'structuralspkernel'
# lmbda = 0.03 # termination probability
r_max = 3 # 10 # recursions
l = 500
# alpha_range = np.linspace(0.5, 0.5, 1)
#alpha_range = np.linspace(0.1, 0.9, 9)
k = 10 # 5 # k nearest neighbors
# classify graphs according to letters.
idx_dict = get_same_item_indices(y_all)
time_list = []
sod_list = []
sod_min_list = []
for letter in idx_dict:
print('\n-------------------------------------------------------\n')
Gn_let = [Gn[i].copy() for i in idx_dict[letter]]
Gn_mix = Gn_let + [g.copy() for g in Gn_let]
alpha_range = np.linspace(1 / len(Gn_let), 1 / len(Gn_let), 1)
# compute
time0 = time.time()
km = compute_kernel(Gn_mix, gkernel, True)
g_best = []
dis_best = []
# for each alpha
for alpha in alpha_range:
print('alpha =', alpha)
dhat, ghat, nb_updated = random_preimage(Gn_let, Gn_let, [alpha] * len(Gn_let),
range(len(Gn_let), len(Gn_mix)), km,
k, r_max, l, gkernel)
dis_best.append(dhat)
g_best.append([ghat])
time_list.append(time.time() - time0)
# show best graphs and save them to file.
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_best[idx])
print('the corresponding pre-images are')
for g in g_best[idx]:
draw_Letter_graph(g, savepath='results/gk_iam/')
# nx.draw_networkx(g)
# plt.show()
print(g.nodes(data=True))
print(g.edges(data=True))
# compute the corresponding sod in graph space. (alpha range not considered.)
sod_tmp, _ = median_distance(g_best[0], Gn_let)
sod_list.append(sod_tmp)
sod_min_list.append(np.min(sod_tmp))
print('\nsods in graph space: ', sod_list)
print('\nsmallest sod in graph space for each letter: ', sod_min_list)
print('\ntimes:', time_list)

def test_gkiam_mutag():
from gk_iam import gk_iam_nearest_multi, compute_kernel
from iam import median_distance
ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
'extra_params': {}} # node nsymb
# ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt',
# 'extra_params': {}} # node nsymb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
gkernel = 'structuralspkernel'
lmbda = 0.03 # termination probability
r_max = 3 # recursions
# alpha_range = np.linspace(0.5, 0.5, 1)
k = 20 # k nearest neighbors
# classify graphs according to letters.
idx_dict = get_same_item_indices(y_all)
time_list = []
sod_ks_min_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list = []
for letter in idx_dict:
print('\n-------------------------------------------------------\n')
Gn_let = [Gn[i].copy() for i in idx_dict[letter]]
Gn_mix = Gn_let + [g.copy() for g in Gn_let]
alpha_range = np.linspace(1 / len(Gn_let), 1 / len(Gn_let), 1)
# compute
time0 = time.time()
km = compute_kernel(Gn_mix, gkernel, True)
g_best = []
dis_best = []
# for each alpha
for alpha in alpha_range:
print('alpha =', alpha)
dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn_let, Gn_let, [alpha] * len(Gn_let),
range(len(Gn_let), len(Gn_mix)), km,
k, r_max, gkernel, c_ei=1.7,
c_er=1.7, c_es=1.7)
dis_best.append(dhat)
g_best.append(ghat_list)
time_list.append(time.time() - time0)
# show best graphs and save them to file.
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_best[idx])
print('the corresponding pre-images are')
for g in g_best[idx]:
draw_Letter_graph(g, savepath='results/gk_iam/')
# nx.draw_networkx(g)
# plt.show()
print(g.nodes(data=True))
print(g.edges(data=True))
# compute the corresponding sod in graph space. (alpha range not considered.)
sod_tmp, _ = median_distance(g_best[0], Gn_let)
sod_gs_list.append(sod_tmp)
sod_gs_min_list.append(np.min(sod_tmp))
sod_ks_min_list.append(sod_ks)
nb_updated_list.append(nb_updated)
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each letter: ', sod_gs_min_list)
print('\nsmallest sod in kernel space for each letter: ', sod_ks_min_list)
print('\nnumber of updates for each letter: ', nb_updated_list)
print('\ntimes:', time_list)
###############################################################################
# Re-test.
def retest_the_simple_two():
from gk_iam import gk_iam_nearest_multi, compute_kernel
from iam import median_distance
from test_random_mutag import remove_edges
# The two simple graphs.
# g1 = nx.Graph(name='haha')
# g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})])
# g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'})])
# g2 = nx.Graph(name='hahaha')
# g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'}),
# (3, {'atom': 'O'}), (4, {'atom': 'C'})])
# g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}),
# (2, 3, {'bond_type': '1'}), (3, 4, {'bond_type': '1'})])
g1 = nx.Graph(name='haha')
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'C'}),
(3, {'atom': 'S'}), (4, {'atom': 'S'})])
g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}),
(2, 3, {'bond_type': '1'}), (2, 4, {'bond_type': '1'})])
g2 = nx.Graph(name='hahaha')
g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'C'}),
(3, {'atom': 'O'}), (4, {'atom': 'O'})])
g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}),
(2, 3, {'bond_type': '1'}), (2, 4, {'bond_type': '1'})])
# # randomly select two molecules
# np.random.seed(1)
# idx_gi = [0, 6] # np.random.randint(0, len(Gn), 2)
# g1 = Gn[idx_gi[0]]
# g2 = Gn[idx_gi[1]]
# Gn_mix = [g.copy() for g in Gn]
# Gn_mix.append(g1.copy())
# Gn_mix.append(g2.copy())
Gn = [g1.copy(), g2.copy()]
remove_edges(Gn)
gkernel = 'marginalizedkernel'
lmbda = 0.03 # termination probability
r_max = 10 # recursions
# l = 500
alpha_range = np.linspace(0.5, 0.5, 1)
k = 2 # k nearest neighbors
epsilon = 1e-6
ged_cost='CHEM_1'
ged_method='IPFP'
saveGXL='gedlib'
c_ei=1
c_er=1
c_es=1
Gn_mix = Gn + [g1.copy(), g2.copy()]
# compute
time0 = time.time()
km = compute_kernel(Gn_mix, gkernel, True)
time_km = time.time() - time0

time_list = []
sod_ks_min_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list = []
g_best = []
# for each alpha
for alpha in alpha_range:
print('\n-------------------------------------------------------\n')
print('alpha =', alpha)
time0 = time.time()
dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn, [g1, g2],
[alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max,
gkernel, c_ei=c_ei, c_er=c_er, c_es=c_es, epsilon=epsilon,
ged_cost=ged_cost, ged_method=ged_method, saveGXL=saveGXL)
time_total = time.time() - time0 + time_km
print('time: ', time_total)
time_list.append(time_total)
sod_ks_min_list.append(dhat)
g_best.append(ghat_list)
nb_updated_list.append(nb_updated)
# show best graphs and save them to file.
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', sod_ks_min_list[idx])
print('one of the possible corresponding pre-images is')
nx.draw(g_best[idx][0], labels=nx.get_node_attributes(g_best[idx][0], 'atom'),
with_labels=True)
plt.savefig('results/gk_iam/mutag_alpha' + str(item) + '.png', format="PNG")
plt.show()
print(g_best[idx][0].nodes(data=True))
print(g_best[idx][0].edges(data=True))
# for g in g_best[idx]:
# draw_Letter_graph(g, savepath='results/gk_iam/')
## nx.draw_networkx(g)
## plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# compute the corresponding sod in graph space.
for idx, item in enumerate(alpha_range):
sod_tmp, _ = median_distance(g_best[0], [g1, g2], ged_cost=ged_cost,
ged_method=ged_method, saveGXL=saveGXL)
sod_gs_list.append(sod_tmp)
sod_gs_min_list.append(np.min(sod_tmp))
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list)
print('\nsmallest sod in kernel space for each alpha: ', sod_ks_min_list)
print('\nnumber of updates for each alpha: ', nb_updated_list)
print('\ntimes:', time_list)

if __name__ == '__main__':
# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
# 'extra_params': {}} # node/edge symb
ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
'extra_params': {}} # node nsymb
# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
# 'extra_params': {}} # node nsymb
# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
# 'extra_params': {}}
# ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
# 'extra_params': {}} # node symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:20]
# import networkx.algorithms.isomorphism as iso
@@ -419,5 +695,10 @@ if __name__ == '__main__':
# test_the_simple_two(Gn, 'untilhpathkernel')
# test_remove_bests(Gn, 'untilhpathkernel')
test_gkiam_letter_h()
# test_iam_letter_h()
# test_gkiam_letter_h()
# test_iam_letter_h()
# test_random_preimage_letter_h()
###############################################################################
# retests.
retest_the_simple_two()

+ 11
- 11
preimage/test.py View File

@@ -18,17 +18,17 @@ def test() :
script.PyRestartEnv()
# print("Here is the Python function !")
#
# print("List of Edit Cost Options : ")
# for i in script.listOfEditCostOptions :
# print (i)
# print("")
#
# print("List of Method Options : ")
# for j in script.listOfMethodOptions :
# print (j)
# print("")
print("Here is the Python function !")
print("List of Edit Cost Options : ")
for i in script.listOfEditCostOptions :
print (i)
print("")
print("List of Method Options : ")
for j in script.listOfMethodOptions :
print (j)
print("")
script.PyLoadGXLGraph('include/gedlib-master/data/datasets/Mutagenicity/data/', 'collections/MUTA_10.xml')
listID = script.PyGetGraphIds()


+ 599
- 0
preimage/test_random_mutag.py View File

@@ -0,0 +1,599 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 5 15:59:00 2019

@author: ljia
"""

import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import time
from tqdm import tqdm

import os
import sys
sys.path.insert(0, "../")
from pygraph.utils.graphfiles import loadDataset

###############################################################################
# test on the combination of the two randomly chosen graphs. (the same as in the
# random pre-image paper.)

def test_preimage_mix_2combination_all_pairs():
from gk_iam import preimage_iam_random_mix, compute_kernel
from iam import median_distance
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:50]
remove_edges(Gn)
gkernel = 'marginalizedkernel'
lmbda = 0.03 # termination probability
r_max = 10 # iteration limit for pre-image.
l_max = 500 # update limit for random generation
alpha_range = np.linspace(0.7, 1, 4)
k = 5 # k nearest neighbors
epsilon = 1e-6
# parameters for GED function
ged_cost='CHEM_1'
ged_method='IPFP'
saveGXL='gedlib'
# parameters for IAM function
c_ei=1
c_er=1
c_es=1
ite_max_iam = 50
epsilon_iam = 0.001
removeNodes = True
connected_iam = False
nb_update_mat_iam = np.full((len(Gn), len(Gn)), np.inf)
nb_update_mat_random = np.full((len(Gn), len(Gn)), np.inf)
# test on each pair of graphs.
# for idx1 in range(len(Gn) - 1, -1, -1):
# for idx2 in range(idx1, -1, -1):
for idx1 in range(187, 188):
for idx2 in range(167, 168):
g1 = Gn[idx1].copy()
g2 = Gn[idx2].copy()
# Gn[10] = []
# Gn[10] = []
nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True)
plt.savefig("results/preimage_mix/mutag187.png", format="PNG")
plt.show()
plt.clf()
nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True)
plt.savefig("results/preimage_mix/mutag167.png", format="PNG")
plt.show()
plt.clf()

###################################################################
# Gn_mix = [g.copy() for g in Gn]
# Gn_mix.append(g1.copy())
# Gn_mix.append(g2.copy())
#
# # compute
# time0 = time.time()
# km = compute_kernel(Gn_mix, gkernel, True)
# time_km = time.time() - time0
#
# # write Gram matrix to file and read it.
# np.savez('results/gram_matrix_uhpath_itr7_pq0.8.gm', gm=km, gmtime=time_km)
###################################################################
gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03.gm.npz')
km = gmfile['gm']
time_km = gmfile['gmtime']
# modify mixed gram matrix.
for i in range(len(Gn)):
km[i, len(Gn)] = km[i, idx1]
km[i, len(Gn) + 1] = km[i, idx2]
km[len(Gn), i] = km[i, idx1]
km[len(Gn) + 1, i] = km[i, idx2]
km[len(Gn), len(Gn)] = km[idx1, idx1]
km[len(Gn), len(Gn) + 1] = km[idx1, idx2]
km[len(Gn) + 1, len(Gn)] = km[idx2, idx1]
km[len(Gn) + 1, len(Gn) + 1] = km[idx2, idx2]
###################################################################
# # use only the two graphs in median set as candidates.
# Gn = [g1.copy(), g2.copy()]
# Gn_mix = Gn + [g1.copy(), g2.copy()]
# # compute
# time0 = time.time()
# km = compute_kernel(Gn_mix, gkernel, True)
# time_km = time.time() - time0
time_list = []
dis_ks_min_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list_iam = []
nb_updated_list_random = []
g_best = []
# for each alpha
for alpha in alpha_range:
print('\n-------------------------------------------------------\n')
print('alpha =', alpha)
time0 = time.time()
dhat, ghat_list, sod_ks, nb_updated_iam, nb_updated_random = \
preimage_iam_random_mix(Gn, [g1, g2],
[alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max,
l_max, gkernel, epsilon=epsilon,
params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es,
'ite_max': ite_max_iam, 'epsilon': epsilon_iam,
'removeNodes': removeNodes, 'connected': connected_iam},
params_ged={'ged_cost': ged_cost, 'ged_method': ged_method,
'saveGXL': saveGXL})
time_total = time.time() - time0 + time_km
print('time: ', time_total)
time_list.append(time_total)
dis_ks_min_list.append(dhat)
g_best.append(ghat_list)
nb_updated_list_iam.append(nb_updated_iam)
nb_updated_list_random.append(nb_updated_random)
# show best graphs and save them to file.
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx])
print('one of the possible corresponding pre-images is')
nx.draw(g_best[idx][0], labels=nx.get_node_attributes(g_best[idx][0], 'atom'),
with_labels=True)
plt.savefig('results/preimage_mix/mutag' + str(idx1) + '_' + str(idx2)
+ '_alpha' + str(item) + '.png', format="PNG")
# plt.show()
plt.clf()
# print(g_best[idx][0].nodes(data=True))
# print(g_best[idx][0].edges(data=True))
# for g in g_best[idx]:
# draw_Letter_graph(g, savepath='results/gk_iam/')
## nx.draw_networkx(g)
## plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# compute the corresponding sod in graph space.
for idx, item in enumerate(alpha_range):
sod_tmp, _ = median_distance(g_best[0], [g1, g2], ged_cost=ged_cost,
ged_method=ged_method, saveGXL=saveGXL)
sod_gs_list.append(sod_tmp)
sod_gs_min_list.append(np.min(sod_tmp))
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list)
print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list)
print('\nnumber of updates for each alpha by IAM: ', nb_updated_list_iam)
print('\nnumber of updates for each alpha by random generation: ',
nb_updated_list_random)
print('\ntimes:', time_list)
nb_update_mat_iam[idx1, idx2] = nb_updated_list_iam[0]
nb_update_mat_random[idx1, idx2] = nb_updated_list_random[0]
str_fw = 'graphs %d and %d: %d times by IAM, %d times by random generation.\n' \
% (idx1, idx2, nb_updated_list_iam[0], nb_updated_list_random[0])
with open('results/preimage_mix/nb_updates.txt', 'r+') as file:
content = file.read()
file.seek(0, 0)
file.write(str_fw + content)
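
The in-place Gram-matrix patching above can be summarized by this hypothetical helper (a sketch, not part of the module), which rewrites the last two rows/columns of a precomputed mixed Gram matrix so that they correspond to Gn[idx1] and Gn[idx2]:

def patch_mixed_gram(km, n, idx1, idx2):
    # km has shape (n + 2, n + 2); its first n rows/columns cover Gn.
    for i in range(n):
        km[i, n] = km[i, idx1]
        km[i, n + 1] = km[i, idx2]
        km[n, i] = km[i, idx1]
        km[n + 1, i] = km[i, idx2]
    km[n, n] = km[idx1, idx1]
    km[n, n + 1] = km[idx1, idx2]
    km[n + 1, n] = km[idx2, idx1]
    km[n + 1, n + 1] = km[idx2, idx2]
    return km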

def test_gkiam_2combination_all_pairs():
from gk_iam import gk_iam_nearest_multi, compute_kernel
from iam import median_distance
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:50]
remove_edges(Gn)
gkernel = 'marginalizedkernel'
lmbda = 0.03 # termination probability
r_max = 10 # iteration limit for pre-image.
alpha_range = np.linspace(1, 1, 1)
k = 5 # k nearest neighbors
epsilon = 1e-6
# parameters for GED function
ged_cost='CHEM_1'
ged_method='IPFP'
saveGXL='gedlib'
# parameters for IAM function
c_ei=1
c_er=1
c_es=1
ite_max_iam = 50
epsilon_iam = 0.001
removeNodes = True
connected_iam = False
nb_update_mat = np.full((len(Gn), len(Gn)), np.inf)
# test on each pair of graphs.
# for idx1 in range(len(Gn) - 1, -1, -1):
# for idx2 in range(idx1, -1, -1):
for idx1 in range(187, 188):
for idx2 in range(167, 168):
g1 = Gn[idx1].copy()
g2 = Gn[idx2].copy()
# Gn[10] = []
# Gn[10] = []
nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True)
plt.savefig("results/gk_iam/all_pairs/mutag187.png", format="PNG")
plt.show()
plt.clf()
nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True)
plt.savefig("results/gk_iam/all_pairs/mutag167.png", format="PNG")
plt.show()
plt.clf()

###################################################################
# Gn_mix = [g.copy() for g in Gn]
# Gn_mix.append(g1.copy())
# Gn_mix.append(g2.copy())
#
# # compute
# time0 = time.time()
# km = compute_kernel(Gn_mix, gkernel, True)
# time_km = time.time() - time0
#
# # write Gram matrix to file and read it.
# np.savez('results/gram_matrix_uhpath_itr7_pq0.8.gm', gm=km, gmtime=time_km)
###################################################################
gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03.gm.npz')
km = gmfile['gm']
time_km = gmfile['gmtime']
# modify mixed gram matrix.
for i in range(len(Gn)):
km[i, len(Gn)] = km[i, idx1]
km[i, len(Gn) + 1] = km[i, idx2]
km[len(Gn), i] = km[i, idx1]
km[len(Gn) + 1, i] = km[i, idx2]
km[len(Gn), len(Gn)] = km[idx1, idx1]
km[len(Gn), len(Gn) + 1] = km[idx1, idx2]
km[len(Gn) + 1, len(Gn)] = km[idx2, idx1]
km[len(Gn) + 1, len(Gn) + 1] = km[idx2, idx2]
###################################################################
# # use only the two graphs in median set as candidates.
# Gn = [g1.copy(), g2.copy()]
# Gn_mix = Gn + [g1.copy(), g2.copy()]
# # compute
# time0 = time.time()
# km = compute_kernel(Gn_mix, gkernel, True)
# time_km = time.time() - time0
time_list = []
dis_ks_min_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list = []
g_best = []
# for each alpha
for alpha in alpha_range:
print('\n-------------------------------------------------------\n')
print('alpha =', alpha)
time0 = time.time()
dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn, [g1, g2],
[alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max,
gkernel, epsilon=epsilon,
params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es,
'ite_max': ite_max_iam, 'epsilon': epsilon_iam,
'removeNodes': removeNodes, 'connected': connected_iam},
params_ged={'ged_cost': ged_cost, 'ged_method': ged_method,
'saveGXL': saveGXL})
time_total = time.time() - time0 + time_km
print('time: ', time_total)
time_list.append(time_total)
dis_ks_min_list.append(dhat)
g_best.append(ghat_list)
nb_updated_list.append(nb_updated)
# show best graphs and save them to file.
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx])
print('one of the possible corresponding pre-images is')
nx.draw(g_best[idx][0], labels=nx.get_node_attributes(g_best[idx][0], 'atom'),
with_labels=True)
plt.savefig('results/gk_iam/mutag' + str(idx1) + '_' + str(idx2)
+ '_alpha' + str(item) + '.png', format="PNG")
# plt.show()
plt.clf()
# print(g_best[idx][0].nodes(data=True))
# print(g_best[idx][0].edges(data=True))
# for g in g_best[idx]:
# draw_Letter_graph(g, savepath='results/gk_iam/')
## nx.draw_networkx(g)
## plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# compute the corresponding sod in graph space.
for idx, item in enumerate(alpha_range):
sod_tmp, _ = median_distance(g_best[0], [g1, g2], ged_cost=ged_cost,
ged_method=ged_method, saveGXL=saveGXL)
sod_gs_list.append(sod_tmp)
sod_gs_min_list.append(np.min(sod_tmp))
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list)
print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list)
print('\nnumber of updates for each alpha: ', nb_updated_list)
print('\ntimes:', time_list)
nb_update_mat[idx1, idx2] = nb_updated_list[0]
str_fw = 'graphs %d and %d: %d.\n' % (idx1, idx2, nb_updated_list[0])
with open('results/gk_iam/all_pairs/nb_updates.txt', 'r+') as file:
content = file.read()
file.seek(0, 0)
file.write(str_fw + content)

def test_gkiam_2combination():
from gk_iam import gk_iam_nearest_multi, compute_kernel
from iam import median_distance
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:50]
remove_edges(Gn)
gkernel = 'marginalizedkernel'
lmbda = 0.03 # termination probability
r_max = 10 # iteration limit for pre-image.
alpha_range = np.linspace(0.5, 0.5, 1)
k = 20 # k nearest neighbors
epsilon = 1e-6
ged_cost='CHEM_1'
ged_method='IPFP'
saveGXL='gedlib'
c_ei=1
c_er=1
c_es=1
# randomly select two molecules
np.random.seed(1)
idx_gi = [10, 11] # np.random.randint(0, len(Gn), 2)
g1 = Gn[idx_gi[0]].copy()
g2 = Gn[idx_gi[1]].copy()
# Gn[10] = []
# Gn[10] = []
# nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True)
# plt.savefig("results/random_preimage/mutag10.png", format="PNG")
# plt.show()
# nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True)
# plt.savefig("results/random_preimage/mutag11.png", format="PNG")
# plt.show()
Gn_mix = [g.copy() for g in Gn]
Gn_mix.append(g1.copy())
Gn_mix.append(g2.copy())
# compute
# time0 = time.time()
# km = compute_kernel(Gn_mix, gkernel, True)
# time_km = time.time() - time0
# write Gram matrix to file and read it.
# np.savez('results/gram_matrix.gm', gm=km, gmtime=time_km)
gmfile = np.load('results/gram_matrix.gm.npz')
km = gmfile['gm']
time_km = gmfile['gmtime']
time_list = []
dis_ks_min_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list = []
g_best = []
# for each alpha
for alpha in alpha_range:
print('\n-------------------------------------------------------\n')
print('alpha =', alpha)
time0 = time.time()
dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn, [g1, g2],
[alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max,
gkernel, c_ei=c_ei, c_er=c_er, c_es=c_es, epsilon=epsilon,
ged_cost=ged_cost, ged_method=ged_method, saveGXL=saveGXL)
time_total = time.time() - time0 + time_km
print('time: ', time_total)
time_list.append(time_total)
dis_ks_min_list.append(dhat)
g_best.append(ghat_list)
nb_updated_list.append(nb_updated)
# show best graphs and save them to file.
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx])
print('one of the possible corresponding pre-images is')
nx.draw(g_best[idx][0], labels=nx.get_node_attributes(g_best[idx][0], 'atom'),
with_labels=True)
plt.savefig('results/gk_iam/mutag_alpha' + str(item) + '.png', format="PNG")
plt.show()
print(g_best[idx][0].nodes(data=True))
print(g_best[idx][0].edges(data=True))
# for g in g_best[idx]:
# draw_Letter_graph(g, savepath='results/gk_iam/')
## nx.draw_networkx(g)
## plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# compute the corresponding sod in graph space.
for idx, item in enumerate(alpha_range):
sod_tmp, _ = median_distance(g_best[0], [g1, g2], ged_cost=ged_cost,
ged_method=ged_method, saveGXL=saveGXL)
sod_gs_list.append(sod_tmp)
sod_gs_min_list.append(np.min(sod_tmp))
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list)
print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list)
print('\nnumber of updates for each alpha: ', nb_updated_list)
print('\ntimes:', time_list)
def test_random_preimage_2combination():
# from gk_iam import compute_kernel
from preimage import random_preimage
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:12]
remove_edges(Gn)
gkernel = 'marginalizedkernel'
# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, gkernel=gkernel)
# print(dis_max, dis_min, dis_mean)
lmbda = 0.03 # termination probability
r_max = 10 # iteration limit for pre-image.
l = 500
alpha_range = np.linspace(0, 1, 11)
k = 5 # k nearest neighbors
# randomly select two molecules
np.random.seed(1)
idx_gi = [187, 167] # np.random.randint(0, len(Gn), 2)
g1 = Gn[idx_gi[0]].copy()
g2 = Gn[idx_gi[1]].copy()
# nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True)
# plt.savefig("results/random_preimage/mutag10.png", format="PNG")
# plt.show()
# nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True)
# plt.savefig("results/random_preimage/mutag11.png", format="PNG")
# plt.show()
######################################################################
# Gn_mix = [g.copy() for g in Gn]
# Gn_mix.append(g1.copy())
# Gn_mix.append(g2.copy())
#
## g_tmp = iam([g1, g2])
## nx.draw_networkx(g_tmp)
## plt.show()
#
# # compute
# time0 = time.time()
# km = compute_kernel(Gn_mix, gkernel, True)
# time_km = time.time() - time0
###################################################################
idx1 = idx_gi[0]
idx2 = idx_gi[1]
gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03.gm.npz')
km = gmfile['gm']
time_km = gmfile['gmtime']
# modify mixed gram matrix.
for i in range(len(Gn)):
km[i, len(Gn)] = km[i, idx1]
km[i, len(Gn) + 1] = km[i, idx2]
km[len(Gn), i] = km[i, idx1]
km[len(Gn) + 1, i] = km[i, idx2]
km[len(Gn), len(Gn)] = km[idx1, idx1]
km[len(Gn), len(Gn) + 1] = km[idx1, idx2]
km[len(Gn) + 1, len(Gn)] = km[idx2, idx1]
km[len(Gn) + 1, len(Gn) + 1] = km[idx2, idx2]
###################################################################

time_list = []
nb_updated_list = []
g_best = []
dis_ks_min_list = []
# for each alpha
for alpha in alpha_range:
print('\n-------------------------------------------------------\n')
print('alpha =', alpha)
time0 = time.time()
dhat, ghat, nb_updated = random_preimage(Gn, [g1, g2], [alpha, 1 - alpha],
range(len(Gn), len(Gn) + 2), km,
k, r_max, l, gkernel)
time_total = time.time() - time0 + time_km
print('time: ', time_total)
time_list.append(time_total)
dis_ks_min_list.append(dhat)
g_best.append(ghat)
nb_updated_list.append(nb_updated)
# show best graphs and save them to file.
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx])
print('one of the possible corresponding pre-images is')
nx.draw(g_best[idx], labels=nx.get_node_attributes(g_best[idx], 'atom'),
with_labels=True)
plt.savefig('results/random_preimage/mutag_alpha' + str(item) + '.png', format="PNG")
plt.show()
plt.clf()
print(g_best[idx].nodes(data=True))
print(g_best[idx].edges(data=True))
# # compute the corresponding sod in graph space. (alpha range not considered.)
# sod_tmp, _ = median_distance(g_best[0], Gn_let)
# sod_gs_list.append(sod_tmp)
# sod_gs_min_list.append(np.min(sod_tmp))
# sod_ks_min_list.append(sod_ks)
# nb_updated_list.append(nb_updated)
# print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list)
print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list)
print('\nnumber of updates for each alpha: ', nb_updated_list)
print('\ntimes:', time_list)

###############################################################################
# helper functions

def remove_edges(Gn):
for G in Gn:
for _, _, attrs in G.edges(data=True):
attrs.clear()
def kernel_distance_matrix(Gn, Kmatrix=None, gkernel=None):
from gk_iam import compute_kernel
dis_mat = np.empty((len(Gn), len(Gn)))
if Kmatrix is None:
Kmatrix = compute_kernel(Gn, gkernel, True)
for i in range(len(Gn)):
for j in range(i, len(Gn)):
dis = Kmatrix[i, i] + Kmatrix[j, j] - 2 * Kmatrix[i, j]
if dis < 0:
if dis > -1e-10:
dis = 0
else:
raise ValueError('The distance is negative.')
dis_mat[i, j] = np.sqrt(dis)
dis_mat[j, i] = dis_mat[i, j]
dis_max = np.max(np.max(dis_mat))
dis_min = np.min(np.min(dis_mat[dis_mat != 0]))
dis_mean = np.mean(np.mean(dis_mat))
return dis_mat, dis_max, dis_min, dis_mean
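
The value computed above is the kernel-induced metric
d(g_i, g_j) = sqrt(k(g_i, g_i) + k(g_j, g_j) - 2 * k(g_i, g_j)),
i.e. the Euclidean distance between the two graphs' images in the kernel
feature space. Typical usage, mirroring the commented-out call in
test_random_preimage_2combination above:

dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, gkernel='marginalizedkernel')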
###############################################################################

if __name__ == '__main__':
###############################################################################
# test on the combination of the two randomly chosen graphs. (the same as in the
# random pre-image paper.)
# test_random_preimage_2combination()
# test_gkiam_2combination()
# test_gkiam_2combination_all_pairs()
test_preimage_mix_2combination_all_pairs()

+ 88
- 20
pygraph/kernels/untilHPathKernel.py View File

@@ -51,6 +51,7 @@ def untilhpathkernel(*args,
applied for the graph kernel. The Following choices are available:
'MinMax': use the MiniMax kernel and counting feature map.
'tanimoto': use the Tanimoto kernel and binary feature map.
None: no sub-kernel is used, the kernel is computed directly.
compute_method : string
Computation method to store paths and compute the graph kernel. The
Following choices are available:
@@ -72,14 +73,16 @@ def untilhpathkernel(*args,
Kmatrix = np.zeros((len(Gn), len(Gn)))
ds_attrs = get_dataset_attributes(
Gn,
attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
'edge_attr_dim', 'is_directed'],
node_label=node_label, edge_label=edge_label)
if not ds_attrs['node_labeled']:
for G in Gn:
nx.set_node_attributes(G, '0', 'atom')
if not ds_attrs['edge_labeled']:
for G in Gn:
nx.set_edge_attributes(G, '0', 'bond_type')
if k_func != None:
if not ds_attrs['node_labeled']:
for G in Gn:
nx.set_node_attributes(G, '0', 'atom')
if not ds_attrs['edge_labeled']:
for G in Gn:
nx.set_edge_attributes(G, '0', 'bond_type')

start_time = time.time()

@@ -93,12 +96,15 @@ def untilhpathkernel(*args,
else:
chunksize = 100
all_paths = [[] for _ in range(len(Gn))]
if compute_method == 'trie':
if compute_method == 'trie' and k_func != None:
getps_partial = partial(wrapper_find_all_path_as_trie, depth,
ds_attrs, node_label, edge_label)
else:
elif compute_method != 'trie' and k_func != None:
getps_partial = partial(wrapper_find_all_paths_until_length, depth,
ds_attrs, node_label, edge_label)
ds_attrs, node_label, edge_label, True)
else:
getps_partial = partial(wrapper_find_all_paths_until_length, depth,
ds_attrs, node_label, edge_label, False)
if verbose:
iterator = tqdm(pool.imap_unordered(getps_partial, itr, chunksize),
desc='getting paths', file=sys.stdout)
@@ -110,10 +116,12 @@ def untilhpathkernel(*args,
pool.join()
# for g in Gn:
# if compute_method == 'trie':
# if compute_method == 'trie' and k_func != None:
# find_all_path_as_trie(g, depth, ds_attrs, node_label, edge_label)
# else:
# elif compute_method != 'trie' and k_func != None:
# find_all_paths_until_length(g, depth, ds_attrs, node_label, edge_label)
# else:
# find_all_paths_until_length(g, depth, ds_attrs, node_label, edge_label, False)
## size = sys.getsizeof(all_paths)
## for item in all_paths:
@@ -130,20 +138,27 @@ def untilhpathkernel(*args,
## all_paths[i] = ps
## print(time.time() - ttt)
if compute_method == 'trie':
if compute_method == 'trie' and k_func != None:
def init_worker(trie_toshare):
global G_trie
G_trie = trie_toshare
do_partial = partial(wrapper_uhpath_do_trie, k_func)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(all_paths,), n_jobs=n_jobs, verbose=verbose)
else:
elif compute_method != 'trie' and k_func != None:
def init_worker(plist_toshare):
global G_plist
G_plist = plist_toshare
do_partial = partial(wrapper_uhpath_do_naive, k_func)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(all_paths,), n_jobs=n_jobs, verbose=verbose)
else:
def init_worker(plist_toshare):
global G_plist
G_plist = plist_toshare
do_partial = partial(wrapper_uhpath_do_kernelless, k_func)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(all_paths,), n_jobs=n_jobs, verbose=verbose)
# # ---- direct running, normally use single CPU core. ----
@@ -353,12 +368,62 @@ def wrapper_uhpath_do_naive(k_func, itr):
return i, j, _untilhpathkernel_do_naive(G_plist[i], G_plist[j], k_func)


def _untilhpathkernel_do_kernelless(paths1, paths2, k_func):
"""Calculate path graph kernels up to depth d between 2 graphs naively.

Parameters
----------
paths1, paths2 : list
Lists of paths in the two graphs, where for unlabeled graphs each
path is represented by a list of nodes, while for labeled graphs each
path is represented by a string consisting of the labels of the nodes
and/or edges on that path.
k_func : function
A kernel function applied using different notions of fingerprint
similarity.

Return
------
kernel : float
Path kernel up to h between 2 graphs.
"""
all_paths = list(set(paths1 + paths2))

if k_func == 'tanimoto':
length_union = len(set(paths1 + paths2))
kernel = (len(set(paths1)) + len(set(paths2)) -
length_union) / length_union
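# equivalently the Jaccard index |intersection| / |union| of the two
# path sets, by inclusion-exclusion.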
# vector1 = [(1 if path in paths1 else 0) for path in all_paths]
# vector2 = [(1 if path in paths2 else 0) for path in all_paths]
# kernel_uv = np.dot(vector1, vector2)
# kernel = kernel_uv / (len(set(paths1)) + len(set(paths2)) - kernel_uv)

else: # MinMax kernel
path_count1 = Counter(paths1)
path_count2 = Counter(paths2)
vector1 = [(path_count1[key] if (key in path_count1.keys()) else 0)
for key in all_paths]
vector2 = [(path_count2[key] if (key in path_count2.keys()) else 0)
for key in all_paths]
kernel = np.sum(np.minimum(vector1, vector2)) / \
np.sum(np.maximum(vector1, vector2))

return kernel


def wrapper_uhpath_do_kernelless(k_func, itr):
i = itr[0]
j = itr[1]
return i, j, _untilhpathkernel_do_kernelless(G_plist[i], G_plist[j], k_func)
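
A small worked example of the MinMax branch above (a sketch; the path label strings are made up for illustration):

from collections import Counter
import numpy as np

paths1 = ['C', 'C1C', 'C1O']
paths2 = ['C', 'C1C', 'C1C']
all_paths = list(set(paths1 + paths2))
c1, c2 = Counter(paths1), Counter(paths2)
v1 = [c1[p] for p in all_paths]
v2 = [c2[p] for p in all_paths]
# min-sum = 2 and max-sum = 4, so the kernel value is 0.5.
print(np.sum(np.minimum(v1, v2)) / np.sum(np.maximum(v1, v2)))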


# @todo: (can be removed maybe) this method find paths repetively, it could be faster.
def find_all_paths_until_length(G,
length,
ds_attrs,
node_label='atom',
edge_label='bond_type'):
edge_label='bond_type',
tolabelseqs=True):
"""Find all paths no longer than a certain maximum length in a graph. A
recursive depth first search is applied.

@@ -398,7 +463,7 @@ def find_all_paths_until_length(G,
# path_l = path_l_new[:]

path_l = [[n] for n in G.nodes] # paths of length l
all_paths = path_l[:]
all_paths = [p.copy() for p in path_l]
for l in range(1, length + 1):
path_lplus1 = []
for path in path_l:
@@ -409,7 +474,7 @@ def find_all_paths_until_length(G,
path_lplus1.append(tmp)

all_paths += path_lplus1
path_l = path_lplus1[:]
path_l = [p.copy() for p in path_lplus1]

# for i in range(0, length + 1):
# new_paths = find_all_paths(G, i)
@@ -419,15 +484,18 @@ def find_all_paths_until_length(G,

# consider labels
# print(paths2labelseqs(all_paths, G, ds_attrs, node_label, edge_label))
return paths2labelseqs(all_paths, G, ds_attrs, node_label, edge_label)
return (paths2labelseqs(all_paths, G, ds_attrs, node_label, edge_label)
if tolabelseqs else all_paths)
def wrapper_find_all_paths_until_length(length, ds_attrs, node_label,
edge_label, itr_item):
edge_label, tolabelseqs, itr_item):
g = itr_item[0]
i = itr_item[1]
return i, find_all_paths_until_length(g, length, ds_attrs,
node_label=node_label, edge_label=edge_label)
node_label=node_label, edge_label=edge_label,
tolabelseqs=tolabelseqs)


def find_all_path_as_trie(G,


+ 8
- 8
pygraph/utils/graphfiles.py View File

@@ -84,7 +84,7 @@ def loadGXL(filename):
return g


def saveGXL(graph, filename, method='gedlib-letter'):
def saveGXL(graph, filename, method='benoit'):
if method == 'benoit':
import xml.etree.ElementTree as ET
root_node = ET.Element('gxl')
@@ -131,13 +131,13 @@ def saveGXL(graph, filename, method='gedlib-letter'):
gxl_file.write("<gxl>\n")
gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"true\" edgemode=\"undirected\">\n")
for v, attrs in graph.nodes(data=True):
gxl_file.write("<node id=\"_" + str(v) + "\">\n")
gxl_file.write("<attr name=\"" + "chem" + "\"><int>" + str(attrs['atom']) + "</int></attr>\n")
gxl_file.write("<node id=\"_" + str(v) + "\">")
gxl_file.write("<attr name=\"" + "chem" + "\"><int>" + str(attrs['atom']) + "</int></attr>")
gxl_file.write("</node>\n")
for v1, v2, attrs in graph.edges(data=True):
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">\n")
# gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['bond_type']) + "</int></attr>\n")
gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>\n")
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">")
# gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['bond_type']) + "</int></attr>")
gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>")
gxl_file.write("</edge>\n")
gxl_file.write("</graph>\n")
gxl_file.write("</gxl>\n")
@@ -485,7 +485,7 @@ def loadDataset(filename, filename_y=None, extra_params=None):
return data, y


def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile'):
def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None):
"""Save list of graphs.
"""
import os
@@ -502,7 +502,7 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile'):
fgroup.write("\n<GraphCollection>")
for idx, g in enumerate(Gn):
fname_tmp = "graph" + str(idx) + ".gxl"
saveGXL(g, dirname_ds + fname_tmp)
saveGXL(g, dirname_ds + fname_tmp, method=(xparams['method'] if xparams is not None else 'benoit'))
fgroup.write("\n\t<graph file=\"" + fname_tmp + "\" class=\"" + str(y[idx]) + "\"/>")
fgroup.write("\n</GraphCollection>")
fgroup.close()
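
A usage sketch for the new xparams argument (assuming 'gedlib' as the saveGXL method, as used elsewhere in this commit; the filename here is made up):

saveDataset(Gn, y_all, gformat='gxl', filename='results/gxl/gfile', xparams={'method': 'gedlib'})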

