@@ -20,5 +20,8 @@ pygraph/kernels/*_sym.py
*.dat
*.pyc
preimage/*
!preimage/*.py
__pycache__
##*#
@@ -0,0 +1,196 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 30 17:07:43 2019

A graph pre-image method combining the iterative pre-image method in reference
[1] and the iterative alternate minimizations (IAM) in reference [2].

@author: ljia
@references:
    [1] Gökhan H. Bakir, Alexander Zien, and Koji Tsuda. Learning to find graph
    pre-images. In Joint Pattern Recognition Symposium, pages 253-261. Springer, 2004.
    [2] Generalized median graph via iterative alternate minimization.
"""
import sys
import multiprocessing
import numpy as np
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt

from iam import iam
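# Note: gk_iam and gk_iam_nearest below read several module-level names that
# are defined in the __main__ block at the bottom of this script (k_list,
# k_g1_list, k_g2_list, idx1, idx2, g1, g2, k, r_max, lmbda and
# marginalizedkernel), so they are only meant to be called from this script.
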
def gk_iam(Gn, alpha):
    """This function constructs graph pre-image by the iterative pre-image
    framework in reference [1], algorithm 1, where the step of generating new
    graphs randomly is replaced by the IAM algorithm in reference [2].

    notes
    -----
    Every time a better graph is acquired, the older one is replaced by it.
    """
    # compute k nearest neighbors of phi in DN.
    dis_list = []  # distance between g_star and each graph.
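    # The squared distance in kernel space between phi(g) and the target point
    # phi* = alpha * phi(g1) + (1 - alpha) * phi(g2) expands, via the kernel
    # trick, to
    #     k(g, g) - 2 * [alpha * k(g, g1) + (1 - alpha) * k(g, g2)]
    #     + alpha^2 * k(g1, g1) + 2 * alpha * (1 - alpha) * k(g1, g2)
    #     + (1 - alpha)^2 * k(g2, g2);
    # dtemp below computes exactly this from the precomputed kernel lists.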
    for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
        dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
                k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
                (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
                k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
        dis_list.append(dtemp)
    # sort
    sort_idx = np.argsort(dis_list)
    dis_gs = [dis_list[idis] for idis in sort_idx[0:k]]
    g0hat = Gn[sort_idx[0]]  # the nearest neighbor of phi in DN
    if dis_gs[0] == 0:  # the exact pre-image.
        print('The exact pre-image is found from the input dataset.')
        return 0, g0hat
    dhat = dis_gs[0]  # the nearest distance
    Gk = [Gn[ig] for ig in sort_idx[0:k]]  # the k nearest neighbors
    gihat_list = []

    # i = 1
    r = 1
    while r < r_max:
        print('r =', r)
        # found = False
        Gs_nearest = Gk + gihat_list
        g_tmp = iam(Gs_nearest)

        # compute distance between phi and the new generated graph.
        knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None,
                                  p_quit=lmbda, n_iteration=20, remove_totters=False,
                                  n_jobs=multiprocessing.cpu_count(), verbose=False)
        dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) *
               knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
               (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
               k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
        if dnew <= dhat:  # the new distance is smaller
            print('I am smaller!')
            dhat = dnew
            g_new = g_tmp.copy()  # found better graph.
            gihat_list = [g_new]
            dis_gs.append(dhat)
            r = 0
        else:
            r += 1

    ghat = ([g0hat] if len(gihat_list) == 0 else gihat_list)
    return dhat, ghat


def gk_iam_nearest(Gn, alpha):
    """This function constructs graph pre-image by the iterative pre-image
    framework in reference [1], algorithm 1, where the step of generating new
    graphs randomly is replaced by the IAM algorithm in reference [2].

    notes
    -----
    Every time a better graph is acquired, its distance in kernel space is
    compared with the k nearest ones, and the k nearest distances from the k+1
    distances will be used as the new ones.
    """
    # compute k nearest neighbors of phi in DN.
    dis_list = []  # distance between g_star and each graph.
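    # dtemp below uses the same kernel-space distance expansion as in gk_iam.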
    for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
        dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
                k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
                (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
                k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
        dis_list.append(dtemp)

    # sort
    sort_idx = np.argsort(dis_list)
    dis_gs = [dis_list[idis] for idis in sort_idx[0:k]]  # the k shortest distances
    g0hat = Gn[sort_idx[0]]  # the nearest neighbor of phi in DN
    if dis_gs[0] == 0:  # the exact pre-image.
        print('The exact pre-image is found from the input dataset.')
        return 0, g0hat
    dhat = dis_gs[0]  # the nearest distance
    ghat = g0hat
    Gk = [Gn[ig] for ig in sort_idx[0:k]]  # the k nearest neighbors
    Gs_nearest = Gk
    # gihat_list = []

    # i = 1
    r = 1
    while r < r_max:
        print('r =', r)
        # found = False
        # Gs_nearest = Gk + gihat_list
        g_tmp = iam(Gs_nearest)

        # compute distance between phi and the new generated graph.
        knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None,
                                  p_quit=lmbda, n_iteration=20, remove_totters=False,
                                  n_jobs=multiprocessing.cpu_count(), verbose=False)
        dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) *
               knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
               (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
               k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
        if dnew <= dhat:  # the new distance is smaller
            print('I am smaller!')
            dhat = dnew
            g_new = g_tmp.copy()  # found better graph.
            ghat = g_tmp.copy()
            dis_gs.append(dhat)  # add the new nearest distance.
            Gs_nearest.append(g_new)  # add the corresponding graph.
            sort_idx = np.argsort(dis_gs)
            dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]]  # the new k nearest distances.
            Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
            r = 0
        else:
            r += 1

    return dhat, ghat


if __name__ == '__main__':
    sys.path.insert(0, "../")
    from pygraph.kernels.marginalizedKernel import marginalizedkernel
    from pygraph.utils.graphfiles import loadDataset

    ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
          'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
    # Gn = Gn[0:10]

    lmbda = 0.03  # termination probability
    r_max = 10  # maximum number of iterations without improvement
    l = 500
    alpha_range = np.linspace(0.1, 0.9, 9)
    k = 5  # k nearest neighbors

    # randomly select two molecules
    np.random.seed(1)
    idx1, idx2 = np.random.randint(0, len(Gn), 2)
    g1 = Gn[idx1]
    g2 = Gn[idx2]
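    # the pre-image target is the point phi* = alpha * phi(g1) + (1 - alpha)
    # * phi(g2) on the segment between the two molecules in kernel space.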
    # compute the kernel between each graph and itself, and between each graph
    # and g1 / g2; these are reused by gk_iam and gk_iam_nearest above.
    k_list = []  # kernel between each graph and itself.
    k_g1_list = []  # kernel between each graph and g1
    k_g2_list = []  # kernel between each graph and g2
    for ig, g in tqdm(enumerate(Gn), desc='computing self kernels', file=sys.stdout):
        ktemp = marginalizedkernel([g, g1, g2], node_label='atom', edge_label=None,
                                   p_quit=lmbda, n_iteration=20, remove_totters=False,
                                   n_jobs=multiprocessing.cpu_count(), verbose=False)
        k_list.append(ktemp[0][0, 0])
        k_g1_list.append(ktemp[0][0, 1])
        k_g2_list.append(ktemp[0][0, 2])
    g_best = []
    dis_best = []
    # for each alpha
    for alpha in alpha_range:
        print('alpha =', alpha)
        dhat, ghat = gk_iam_nearest(Gn, alpha)
        dis_best.append(dhat)
        g_best.append(ghat)

    for idx, item in enumerate(alpha_range):
        print('when alpha is', item, 'the shortest distance is', dis_best[idx])
        print('the corresponding pre-image is')
        nx.draw_networkx(g_best[idx])
        plt.show()
@@ -0,0 +1,195 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 26 11:49:12 2019

Iterative alternate minimizations (IAM) using GED.

@author: ljia
"""
import numpy as np
import random
import networkx as nx
import sys
#from Cython_GedLib_2 import librariesImport, script
import librariesImport, script
sys.path.insert(0, "../")
from pygraph.utils.graphfiles import saveDataset
from pygraph.utils.graphdataset import get_dataset_attributes
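# librariesImport pre-loads the shared libraries that the compiled GedLib
# module ("script") depends on; it must be imported before script so that the
# import succeeds without setting LD_LIBRARY_PATH (see the GedLib test script).
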

def iam(Gn, node_label='atom', edge_label='bond_type'):
    """Compute a median graph of the graphs in Gn by iterative alternate
    minimization (IAM) based on GED.
    """
    # Gn = Gn[0:10]
    Gn = [nx.convert_node_labels_to_integers(g) for g in Gn]

    # edit costs: removal (c_er), substitution (c_es) and insertion (c_ei).
    c_er = 1
    c_es = 1
    c_ei = 1

    # phase 1: initialize.
    # compute the set-median: the graph in Gn that minimizes the sum of GEDs
    # to all graphs in Gn.
    dis_min = np.inf
    pi_p = []
    pi_all = []
    for idx1, G_p in enumerate(Gn):
        dist_sum = 0
        pi_all.append([])
        for idx2, G_p_prime in enumerate(Gn):
            dist_tmp, pi_tmp = GED(G_p, G_p_prime)
            pi_all[idx1].append(pi_tmp)
            dist_sum += dist_tmp
        if dist_sum < dis_min:
            dis_min = dist_sum
            G = G_p.copy()
            idx_min = idx1
    # list of edit operations.
    pi_p = pi_all[idx_min]
    # phase 2: iteration.
    ds_attrs = get_dataset_attributes(Gn, attr_names=['edge_labeled', 'node_attr_dim'],
                                      edge_label=edge_label)
    for itr in range(0, 10):
        G_new = G.copy()

        # update vertex labels.
        # pre-compute h_i0 for each label.
        # for label in get_node_labels(Gn, node_label):
        #     print(label)
        # for nd in G.nodes(data=True):
        #     pass
        if not ds_attrs['node_attr_dim']:  # labels are symbolic
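            # majority vote: h_i0 counts in how many graphs of Gn the node
            # mapped to nd (via the node maps pi_p) carries a given label; nd
            # is assigned one of the labels with the highest count.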
            for nd, _ in G.nodes(data=True):
                h_i0_list = []
                label_list = []
                for label in get_node_labels(Gn, node_label):
                    h_i0 = 0
                    for idx, g in enumerate(Gn):
                        pi_i = pi_p[idx][nd]
                        if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
                            h_i0 += 1
                    h_i0_list.append(h_i0)
                    label_list.append(label)
                # choose one of the best randomly.
                idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
                idx_rdm = random.randint(0, len(idx_max) - 1)
                G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]]
        else:  # labels are non-symbolic
            for nd, _ in G.nodes(data=True):
                Si_norm = 0
                phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
                for idx, g in enumerate(Gn):
                    pi_i = pi_p[idx][nd]
                    if g.has_node(pi_i):  # @todo: what if no g has node? phi_i_bar = 0?
                        Si_norm += 1
                        phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
                phi_i_bar /= Si_norm
                G_new.nodes[nd]['attributes'] = phi_i_bar
        # update edge labels and adjacency matrix.
        if ds_attrs['edge_labeled']:
            for nd1, nd2, _ in G.edges(data=True):
                h_ij0_list = []
                label_list = []
                for label in get_edge_labels(Gn, edge_label):
                    h_ij0 = 0
                    for idx, g in enumerate(Gn):
                        pi_i = pi_p[idx][nd1]
                        pi_j = pi_p[idx][nd2]
                        h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
                                   g.has_edge(pi_i, pi_j) and
                                   g.edges[pi_i, pi_j][edge_label] == label)
                        h_ij0 += h_ij0_p
                    h_ij0_list.append(h_ij0)
                    label_list.append(label)
                # choose one of the best randomly.
                idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
                h_ij0_max = h_ij0_list[idx_max[0]]
                idx_rdm = random.randint(0, len(idx_max) - 1)
                best_label = label_list[idx_max[idx_rdm]]

                # check whether a_ij is 0 or 1.
                sij_norm = 0
                for idx, g in enumerate(Gn):
                    pi_i = pi_p[idx][nd1]
                    pi_j = pi_p[idx][nd2]
                    if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
                        sij_norm += 1
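                # decision rule from the IAM derivation: keep the edge with
                # best_label only if the strongest label agreement h_ij0_max
                # outweighs a threshold built from the edit costs c_er, c_es,
                # c_ei and the number sij_norm of graphs where the mapped edge
                # exists; otherwise the edge is removed.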
                if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
                    if not G_new.has_edge(nd1, nd2):
                        G_new.add_edge(nd1, nd2)
                    G_new.edges[nd1, nd2][edge_label] = best_label
                else:
                    if G_new.has_edge(nd1, nd2):
                        G_new.remove_edge(nd1, nd2)
        else:  # if edges are unlabeled
            for nd1, nd2, _ in G.edges(data=True):
                sij_norm = 0
                for idx, g in enumerate(Gn):
                    pi_i = pi_p[idx][nd1]
                    pi_j = pi_p[idx][nd2]
                    if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
                        sij_norm += 1
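                # majority rule for unlabeled edges: keep (nd1, nd2) iff the
                # mapped edge exists in enough graphs of Gn for keeping it to
                # be cheaper (in terms of c_er versus c_ei) than removing it.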
                if sij_norm > len(Gn) * c_er / (c_er + c_ei):
                    if not G_new.has_edge(nd1, nd2):
                        G_new.add_edge(nd1, nd2)
                else:
                    if G_new.has_edge(nd1, nd2):
                        G_new.remove_edge(nd1, nd2)

        G = G_new.copy()

    return G


def GED(g1, g2, lib='gedlib'):
    """
    Compute GED. It is a dummy function for now.
    """
    if lib == 'gedlib':
        saveDataset([g1, g2], [None, None], group='xml', filename='ged_tmp/tmp')
        script.appel()
        script.PyRestartEnv()
        script.PyLoadGXLGraph('ged_tmp/', 'collections/tmp.xml')
        listID = script.PyGetGraphIds()
        script.PySetEditCost("CHEM_1")
        script.PyInitEnv()
        script.PySetMethod("BIPARTITE", "")
        script.PyInitMethod()
        g = listID[0]
        h = listID[1]
        script.PyRunMethod(g, h)
        liste = script.PyGetAllMap(g, h)
        upper = script.PyGetUpperBound(g, h)
        lower = script.PyGetLowerBound(g, h)
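        # BIPARTITE only yields bounds on the exact GED; summing the upper and
        # lower bounds here is a placeholder distance (hence the "dummy" in
        # the docstring).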
        dis = upper + lower
        pi = liste[0]

    return dis, pi


def get_node_labels(Gn, node_label):
    nl = set()
    for G in Gn:
        nl = nl | set(nx.get_node_attributes(G, node_label).values())
    return nl


def get_edge_labels(Gn, edge_label):
    el = set()
    for G in Gn:
        el = el | set(nx.get_edge_attributes(G, edge_label).values())
    return el


if __name__ == '__main__':
    from pygraph.utils.graphfiles import loadDataset

    ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
          'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}  # node/edge symb
    # ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
    #       'extra_params': {}}  # node nsymb
    # ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
    #       'extra_params': {}}
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])

    iam(Gn)
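    # the returned median graph is discarded here; to inspect it, one could
    # keep it, e.g. G_median = iam(Gn[0:10]), and print G_median.nodes(data=True).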
@@ -0,0 +1,5 @@
from ctypes import *

# pre-load the shared libraries that the compiled GedLib module links
# against, so that importing it works without setting LD_LIBRARY_PATH.
lib1 = cdll.LoadLibrary('lib/fann/libdoublefann.so')
lib2 = cdll.LoadLibrary('lib/libsvm.3.22/libsvm.so')
lib3 = cdll.LoadLibrary('lib/nomad/libnomad.so')
lib4 = cdll.LoadLibrary('lib/nomad/libsgtelib.so')
@@ -0,0 +1,5 @@
from ctypes import *

# same as the librariesImport module above, but with paths relative to the
# directory containing Cython_GedLib_2.
lib1 = cdll.LoadLibrary('Cython_GedLib_2/lib/fann/libdoublefann.so')
lib2 = cdll.LoadLibrary('Cython_GedLib_2/lib/libsvm.3.22/libsvm.so')
lib3 = cdll.LoadLibrary('Cython_GedLib_2/lib/nomad/libnomad.so')
lib4 = cdll.LoadLibrary('Cython_GedLib_2/lib/nomad/libsgtelib.so')
@@ -126,6 +126,7 @@ for alpha in alpha_range:
            dhat = dnew
            gnew = gtemp.copy()
            found = True  # found better graph.
            r = 0
        if found:
            gihat_list = [gnew]
            dis_gs.append(dhat)
@@ -0,0 +1,26 @@
#from distutils.core import setup
from distutils.extension import Extension
#from Cython.Distutils import build_ext
from distutils.core import setup
from Cython.Build import cythonize

#setup(ext_modules=cythonize("script.pyx"))

extensions = [Extension("script",
                        sources=["script.pyx", "src/essai.cpp"],
                        include_dirs=["include", "include/lsape", "include/Eigen",
                                      "include/nomad", "include/sgtelib",
                                      "include/libsvm.3.22", "include/fann",
                                      "include/boost_1_69_0"],
                        library_dirs=["lib/fann", "lib/gedlib",
                                      "lib/libsvm.3.22", "lib/nomad"],
                        libraries=["doublefann", "sgtelib", "svm", "nomad"],
                        language="c++",
                        extra_compile_args=["-std=c++11"],
                        extra_link_args=["-std=c++11"])]

setup(ext_modules=cythonize(extensions))

#extensions = [Extension("script", sources=["script.pyx", "include/gedlib-master/src/env/ged_env.ipp"], include_dirs=["."], language="c++")]
#setup(name = "script", ext_modules = extensions, cmdclass = {'build_ext':build_ext},)

# Bash command: python setup.py build_ext --inplace
@@ -0,0 +1,57 @@
#export LD_LIBRARY_PATH=.:/export/home/lambertn/Documents/Cython_GedLib_2/lib/fann/:/export/home/lambertn/Documents/Cython_GedLib_2/lib/libsvm.3.22:/export/home/lambertn/Documents/Cython_GedLib_2/lib/nomad

# So that "import script" finds the libraries GedLib needs.
# Equivalent to setting the LD_LIBRARY_PATH environment variable in a bash shell.
# Allows working in IDLE and elsewhere without setting the environment
# variable every time; os.environ does not work in this case.
import librariesImport, script
#import script

#truc = script.computeEditDistanceOnGXlGraphs('include/gedlib-master/data/datasets/Mutagenicity/data/','collections/MUTA_10.xml',"CHEM_1", "BIPARTITE", "")
#print(truc)
#script.PyRestartEnv()
#script.appel()


def test():
    # script.appel()
    script.PyRestartEnv()

#    print("Here is the Python function !")
#
#    print("List of Edit Cost Options : ")
#    for i in script.listOfEditCostOptions:
#        print(i)
#    print("")
#
#    print("List of Method Options : ")
#    for j in script.listOfMethodOptions:
#        print(j)
#    print("")

    script.PyLoadGXLGraph('include/gedlib-master/data/datasets/Mutagenicity/data/', 'collections/MUTA_10.xml')
    listID = script.PyGetGraphIds()

    afficheId = ""
    for i in listID:
        afficheId += str(i) + " "
    print("Number of graphs = " + str(len(listID)) + ", list of Ids = " + afficheId)

    script.PySetEditCost("CHEM_1")
    script.PyInitEnv()

    script.PySetMethod("BIPARTITE", "")
    script.PyInitMethod()

    g = listID[0]
    h = listID[1]

    script.PyRunMethod(g, h)
    liste = script.PyGetAllMap(g, h)

    print("Forward map : ", liste[0], ", Backward map : ", liste[1])
    print("Upper Bound = " + str(script.PyGetUpperBound(g, h)) + ", Lower Bound = " +
          str(script.PyGetLowerBound(g, h)) + ", Runtime = " + str(script.PyGetRuntime(g, h)))


test()
@@ -52,10 +52,10 @@ def get_dataset_attributes(Gn,
         return False if edge_label is None else True

     def get_edge_label_num(Gn):
-        nl = set()
+        el = set()
         for G in Gn:
-            nl = nl | set(nx.get_edge_attributes(G, edge_label).values())
-        return len(nl)
+            el = el | set(nx.get_edge_attributes(G, edge_label).values())
+        return len(el)

     def is_directed(Gn):
         return nx.is_directed(Gn[0])
@@ -22,8 +22,8 @@ def loadCT(filename):
     with open(filename) as f:
         content = f.read().splitlines()
         g = nx.Graph(
-            name=str(content[0]),
-            filename=basename(filename))  # set name of the graph
+            name = str(content[0]),
+            filename = basename(filename))  # set name of the graph
         tmp = content[1].split(" ")
         if tmp[0] == '':
             nb_nodes = int(tmp[1])  # number of the nodes
@@ -84,43 +84,63 @@ def loadGXL(filename):
    return g


def saveGXL(graph, filename):
    import xml.etree.ElementTree as ET
    root_node = ET.Element('gxl')
    attr = dict()
    attr['id'] = graph.graph['name']
    attr['edgeids'] = 'true'
    attr['edgemode'] = 'undirected'
    graph_node = ET.SubElement(root_node, 'graph', attrib=attr)

    for v in graph:
        current_node = ET.SubElement(graph_node, 'node', attrib={'id': str(v)})
        for attr in graph.nodes[v].keys():
            cur_attr = ET.SubElement(
                current_node, 'attr', attrib={'name': attr})
            cur_value = ET.SubElement(cur_attr,
                                      graph.nodes[v][attr].__class__.__name__)
            cur_value.text = graph.nodes[v][attr]

    for v1 in graph:
        for v2 in graph[v1]:
            if (v1 < v2):  # Non oriented graphs
                cur_edge = ET.SubElement(
                    graph_node,
                    'edge',
                    attrib={
                        'from': str(v1),
                        'to': str(v2)
                    })
                for attr in graph[v1][v2].keys():
                    cur_attr = ET.SubElement(
                        cur_edge, 'attr', attrib={'name': attr})
                    cur_value = ET.SubElement(
                        cur_attr, graph[v1][v2][attr].__class__.__name__)
                    cur_value.text = str(graph[v1][v2][attr])

    tree = ET.ElementTree(root_node)
    tree.write(filename)


def saveGXL(graph, filename, method='benoit'):
    if method == 'benoit':
        import xml.etree.ElementTree as ET
        root_node = ET.Element('gxl')
        attr = dict()
        attr['id'] = str(graph.graph['name'])
        attr['edgeids'] = 'true'
        attr['edgemode'] = 'undirected'
        graph_node = ET.SubElement(root_node, 'graph', attrib=attr)

        for v in graph:
            current_node = ET.SubElement(graph_node, 'node', attrib={'id': str(v)})
            for attr in graph.nodes[v].keys():
                cur_attr = ET.SubElement(
                    current_node, 'attr', attrib={'name': attr})
                cur_value = ET.SubElement(cur_attr,
                                          graph.nodes[v][attr].__class__.__name__)
                cur_value.text = graph.nodes[v][attr]

        for v1 in graph:
            for v2 in graph[v1]:
                if (v1 < v2):  # Non oriented graphs
                    cur_edge = ET.SubElement(
                        graph_node,
                        'edge',
                        attrib={
                            'from': str(v1),
                            'to': str(v2)
                        })
                    for attr in graph[v1][v2].keys():
                        cur_attr = ET.SubElement(
                            cur_edge, 'attr', attrib={'name': attr})
                        cur_value = ET.SubElement(
                            cur_attr, graph[v1][v2][attr].__class__.__name__)
                        cur_value.text = str(graph[v1][v2][attr])

        tree = ET.ElementTree(root_node)
        tree.write(filename)
    elif method == 'gedlib':
        # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22
        pass
#        gxl_file = open(filename, 'w')
#        gxl_file.write("<?xml version=\"1.0\"?>\n")
#        gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n")
#        gxl_file.write("<gxl>\n")
#        gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">\n")
#        for v in graph:
#            gxl_file.write("<node id=\"_" + str(v) + "\">\n")
#            gxl_file.write("<attr name=\"chem\"><int>" + str(self.node_labels[node]) + "</int></attr>\n")
#            gxl_file.write("</node>\n")
#        for edge in self.edge_list:
#            gxl_file.write("<edge from=\"_" + str(edge[0]) + "\" to=\"_" + str(edge[1]) + "\">\n")
#            gxl_file.write("<attr name=\"valence\"><int>1</int></attr>\n")
#            gxl_file.write("</edge>\n")
#        gxl_file.write("</graph>\n")
#        gxl_file.write("</gxl>\n")
#        gxl_file.close()


def loadSDF(filename):
@@ -412,3 +432,33 @@ def loadDataset(filename, filename_y=None, extra_params=None):
#            print(g.edges(data=True))
    return data, y


def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile'):
    """Save list of graphs.
    """
    import os
    dirname_ds = os.path.dirname(filename)
    if dirname_ds != '':
        dirname_ds += '/'
        if not os.path.exists(dirname_ds):
            os.makedirs(dirname_ds)

    if group == 'xml' and gformat == 'gxl':
        with open(filename + '.xml', 'w') as fgroup:
            fgroup.write("<?xml version=\"1.0\"?>")
            fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"https://dbblumenthal.github.io/gedlib/GraphCollection_8dtd_source.html\">")
            fgroup.write("\n<GraphCollection>")
            for idx, g in enumerate(Gn):
                fname_tmp = "graph" + str(idx) + ".gxl"
                saveGXL(g, dirname_ds + fname_tmp)
                fgroup.write("\n\t<graph file=\"" + fname_tmp + "\" class=\"" + str(y[idx]) + "\"/>")
            fgroup.write("\n</GraphCollection>")


if __name__ == '__main__':
    ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat',
          'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}  # node/edge symb
    Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
    saveDataset(Gn, y, group='xml', filename='temp/temp')
@@ -420,55 +420,6 @@ def model_selection_for_precomputed_kernel(datafile,
#        np.save(results_name_pre + 'best_gram_matrix_time.dt',
#                best_gram_matrix_time)

        # print out as table.
        from collections import OrderedDict
        from tabulate import tabulate
        table_dict = {}
        if model_type == 'regression':
            for param_in in param_list:
                param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
        else:
            for param_in in param_list:
                param_in['C'] = '{:.2e}'.format(param_in['C'])
        table_dict['params'] = [{**param_out, **param_in}
                                for param_in in param_list
                                for param_out in param_list_pre_revised]
        table_dict['gram_matrix_time'] = [
            '{:.2f}'.format(gram_matrix_time[index_out])
            for param_in in param_list
            for index_out, _ in enumerate(param_list_pre_revised)
        ]
        table_dict['valid_perf'] = [
            '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
                                   std_val_scores[index_out][index_in])
            for index_in, _ in enumerate(param_list)
            for index_out, _ in enumerate(param_list_pre_revised)
        ]
        table_dict['test_perf'] = [
            '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
                                   std_perf_scores[index_out][index_in])
            for index_in, _ in enumerate(param_list)
            for index_out, _ in enumerate(param_list_pre_revised)
        ]
        table_dict['train_perf'] = [
            '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
                                   std_train_scores[index_out][index_in])
            for index_in, _ in enumerate(param_list)
            for index_out, _ in enumerate(param_list_pre_revised)
        ]
        keyorder = [
            'params', 'train_perf', 'valid_perf', 'test_perf',
            'gram_matrix_time'
        ]
        if verbose:
            print()
        tb_print = tabulate(
            OrderedDict(
                sorted(table_dict.items(),
                       key=lambda i: keyorder.index(i[0]))),
            headers='keys')
#        print(tb_print)
        str_fw += 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print

    # read gram matrices from file.
    else:
        # Grid of parameters with a discrete number of values for each.
@@ -632,58 +583,16 @@ def model_selection_for_precomputed_kernel(datafile,
#        str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
        str_fw += 'training time with hyper-param choices who did not participate in calculation of gram matrices: {:.2f}s\n\n'.format(tt_poster)

        # print out as table.
        from collections import OrderedDict
        from tabulate import tabulate
        table_dict = {}
        if model_type == 'regression':
            for param_in in param_list:
                param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
        else:
            for param_in in param_list:
                param_in['C'] = '{:.2e}'.format(param_in['C'])
        table_dict['params'] = [{**param_out, **param_in}
                                for param_in in param_list
                                for param_out in param_list_pre_revised]
#        table_dict['gram_matrix_time'] = [
#            '{:.2f}'.format(gram_matrix_time[index_out])
#            for param_in in param_list
#            for index_out, _ in enumerate(param_list_pre_revised)
#        ]
        table_dict['valid_perf'] = [
            '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
                                   std_val_scores[index_out][index_in])
            for index_in, _ in enumerate(param_list)
            for index_out, _ in enumerate(param_list_pre_revised)
        ]
        table_dict['test_perf'] = [
            '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
                                   std_perf_scores[index_out][index_in])
            for index_in, _ in enumerate(param_list)
            for index_out, _ in enumerate(param_list_pre_revised)
        ]
        table_dict['train_perf'] = [
            '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
                                   std_train_scores[index_out][index_in])
            for index_in, _ in enumerate(param_list)
            for index_out, _ in enumerate(param_list_pre_revised)
        ]
        keyorder = [
            'params', 'train_perf', 'valid_perf', 'test_perf'
        ]
        if verbose:
            print()
        tb_print = tabulate(
            OrderedDict(
                sorted(table_dict.items(),
                       key=lambda i: keyorder.index(i[0]))),
            headers='keys')
#        print(tb_print)
        str_fw += 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print

    # open file to save all results for this dataset.
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    # print out results as table.
    str_fw += printResultsInTable(param_list, param_list_pre_revised, average_val_scores,
                                  std_val_scores, average_perf_scores, std_perf_scores,
                                  average_train_scores, std_train_scores, gram_matrix_time,
                                  model_type, verbose)

    # open file to save all results for this dataset.
    if not os.path.exists(results_dir + '/' + ds_name + '.output.txt'):
        with open(results_dir + '/' + ds_name + '.output.txt', 'w') as f:
@@ -974,4 +883,55 @@ def read_gram_matrices_from_file(results_dir, ds_name):
    gram_matrices = gmfile['gms']  # a list to store gram matrices for all param_grid_precomputed
    param_list_pre_revised = gmfile['params']  # list to store param grids precomputed ignoring the useless ones
    y = gmfile['y'].tolist()

    return gram_matrices, param_list_pre_revised, y


def printResultsInTable(param_list, param_list_pre_revised, average_val_scores,
                        std_val_scores, average_perf_scores, std_perf_scores,
                        average_train_scores, std_train_scores, gram_matrix_time,
                        model_type, verbose):
    """Format the cross-validation performance for each combination of
    hyper-parameters as a table and return it as a string.
    """
    from collections import OrderedDict
    from tabulate import tabulate
    table_dict = {}
    if model_type == 'regression':
        for param_in in param_list:
            param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
    else:
        for param_in in param_list:
            param_in['C'] = '{:.2e}'.format(param_in['C'])
    table_dict['params'] = [{**param_out, **param_in}
                            for param_in in param_list
                            for param_out in param_list_pre_revised]
    table_dict['gram_matrix_time'] = [
        '{:.2f}'.format(gram_matrix_time[index_out])
        for param_in in param_list
        for index_out, _ in enumerate(param_list_pre_revised)
    ]
    table_dict['valid_perf'] = [
        '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
                               std_val_scores[index_out][index_in])
        for index_in, _ in enumerate(param_list)
        for index_out, _ in enumerate(param_list_pre_revised)
    ]
    table_dict['test_perf'] = [
        '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
                               std_perf_scores[index_out][index_in])
        for index_in, _ in enumerate(param_list)
        for index_out, _ in enumerate(param_list_pre_revised)
    ]
    table_dict['train_perf'] = [
        '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
                               std_train_scores[index_out][index_in])
        for index_in, _ in enumerate(param_list)
        for index_out, _ in enumerate(param_list_pre_revised)
    ]
    keyorder = [
        'params', 'train_perf', 'valid_perf', 'test_perf',
        'gram_matrix_time'
    ]
    if verbose:
        print()
    tb_print = tabulate(OrderedDict(sorted(table_dict.items(),
                                           key=lambda i: keyorder.index(i[0]))),
                        headers='keys')
#    print(tb_print)
    return 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print