@@ -20,5 +20,8 @@ pygraph/kernels/*_sym.py
 *.dat
 *.pyc
+preimage/*
+!preimage/*.py
 __pycache__
 ##*#
@@ -0,0 +1,196 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 30 17:07:43 2019

A graph pre-image method combining the iterative pre-image method of
reference [1] and the iterative alternate minimizations (IAM) of
reference [2].

@author: ljia
@references:
    [1] Gökhan H Bakir, Alexander Zien, and Koji Tsuda. Learning to find graph
    pre-images. In Joint Pattern Recognition Symposium, pages 253-261.
    Springer, 2004.
    [2] Generalized median graph via iterative alternate minimization.
"""
import sys
import multiprocessing
import numpy as np
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt

from iam import iam
def gk_iam(Gn, alpha):
    """This function constructs a graph pre-image by the iterative pre-image
    framework in reference [1], algorithm 1, where the step of generating new
    graphs randomly is replaced by the IAM algorithm in reference [2].

    Notes
    -----
    Every time a better graph is acquired, the older one is replaced by it.

    This function relies on the module-level variables set up in __main__
    (k_list, k_g1_list, k_g2_list, idx1, idx2, g1, g2, k, r_max, lmbda).
    """
    # compute the k nearest neighbors of phi in DN.
    dis_list = []  # distance between g_star and each graph.
    for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
        dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
                k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
                (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
                k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
        dis_list.append(dtemp)

    # sort
    sort_idx = np.argsort(dis_list)
    dis_gs = [dis_list[idis] for idis in sort_idx[0:k]]
    g0hat = Gn[sort_idx[0]]  # the nearest neighbor of phi in DN
    if dis_gs[0] == 0:  # an exact pre-image is found.
        print('The exact pre-image is found from the input dataset.')
        return 0, g0hat
    dhat = dis_gs[0]  # the nearest distance
    Gk = [Gn[ig] for ig in sort_idx[0:k]]  # the k nearest neighbors
    gihat_list = []

    r = 1
    while r < r_max:
        print('r =', r)
        Gs_nearest = Gk + gihat_list
        g_tmp = iam(Gs_nearest)

        # compute the distance between phi and the newly generated graph.
        knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None,
                                  p_quit=lmbda, n_iteration=20, remove_totters=False,
                                  n_jobs=multiprocessing.cpu_count(), verbose=False)
        dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) *
               knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
               (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
               k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
        if dnew <= dhat:  # the new distance is smaller.
            print('I am smaller!')
            dhat = dnew
            g_new = g_tmp.copy()  # found a better graph.
            gihat_list = [g_new]
            dis_gs.append(dhat)
            r = 0
        else:
            r += 1

    ghat = [g0hat] if len(gihat_list) == 0 else gihat_list
    return dhat, ghat
def gk_iam_nearest(Gn, alpha):
    """This function constructs a graph pre-image by the iterative pre-image
    framework in reference [1], algorithm 1, where the step of generating new
    graphs randomly is replaced by the IAM algorithm in reference [2].

    Notes
    -----
    Every time a better graph is acquired, its distance in kernel space is
    compared with the k nearest ones, and the k nearest distances from the
    k + 1 distances will be used as the new ones.
    """
    # compute the k nearest neighbors of phi in DN.
    dis_list = []  # distance between g_star and each graph.
    for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
        dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
                k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
                (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
                k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
        dis_list.append(dtemp)

    # sort
    sort_idx = np.argsort(dis_list)
    dis_gs = [dis_list[idis] for idis in sort_idx[0:k]]  # the k shortest distances
    g0hat = Gn[sort_idx[0]]  # the nearest neighbor of phi in DN
    if dis_gs[0] == 0:  # an exact pre-image is found.
        print('The exact pre-image is found from the input dataset.')
        return 0, g0hat
    dhat = dis_gs[0]  # the nearest distance
    ghat = g0hat
    Gk = [Gn[ig] for ig in sort_idx[0:k]]  # the k nearest neighbors
    Gs_nearest = Gk

    r = 1
    while r < r_max:
        print('r =', r)
        g_tmp = iam(Gs_nearest)

        # compute the distance between phi and the newly generated graph.
        knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None,
                                  p_quit=lmbda, n_iteration=20, remove_totters=False,
                                  n_jobs=multiprocessing.cpu_count(), verbose=False)
        dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) *
               knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
               (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
               k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
        if dnew <= dhat:  # the new distance is smaller.
            print('I am smaller!')
            dhat = dnew
            g_new = g_tmp.copy()  # found a better graph.
            ghat = g_tmp.copy()
            dis_gs.append(dhat)  # add the new nearest distance.
            Gs_nearest.append(g_new)  # add the corresponding graph.
            sort_idx = np.argsort(dis_gs)
            dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]]  # the new k nearest distances.
            Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
            r = 0
        else:
            r += 1

    return dhat, ghat
if __name__ == '__main__':
    sys.path.insert(0, "../")
    from pygraph.kernels.marginalizedKernel import marginalizedkernel
    from pygraph.utils.graphfiles import loadDataset

    ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
          'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
#    Gn = Gn[0:10]

    lmbda = 0.03  # termination probability
    r_max = 10  # maximum number of consecutive iterations without improvement
    l = 500
    alpha_range = np.linspace(0.1, 0.9, 9)
    k = 5  # k nearest neighbors

    # randomly select two molecules.
    np.random.seed(1)
    idx1, idx2 = np.random.randint(0, len(Gn), 2)
    g1 = Gn[idx1]
    g2 = Gn[idx2]

    # compute the kernel of each graph with itself, with g1 and with g2.
    k_list = []  # kernel between each graph and itself.
    k_g1_list = []  # kernel between each graph and g1.
    k_g2_list = []  # kernel between each graph and g2.
    for ig, g in tqdm(enumerate(Gn), desc='computing self kernels', file=sys.stdout):
        ktemp = marginalizedkernel([g, g1, g2], node_label='atom', edge_label=None,
                                   p_quit=lmbda, n_iteration=20, remove_totters=False,
                                   n_jobs=multiprocessing.cpu_count(), verbose=False)
        k_list.append(ktemp[0][0, 0])
        k_g1_list.append(ktemp[0][0, 1])
        k_g2_list.append(ktemp[0][0, 2])

    g_best = []
    dis_best = []
    # for each alpha.
    for alpha in alpha_range:
        print('alpha =', alpha)
        dhat, ghat = gk_iam_nearest(Gn, alpha)
        dis_best.append(dhat)
        g_best.append(ghat)

    for idx, item in enumerate(alpha_range):
        print('when alpha is', item, 'the shortest distance is', dis_best[idx])
        print('the corresponding pre-image is')
        nx.draw_networkx(g_best[idx])
        plt.show()
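
The dtemp/dnew expressions in both functions expand the same quantity: the squared kernel-space distance between phi(g) and the convex combination alpha * phi(g1) + (1 - alpha) * phi(g2), written out with the kernel trick. Note that the two cross terms alpha * (1 - alpha) * k(g1, g2) and (1 - alpha) * alpha * k(g2, g1) collapse into one. A minimal sketch that makes this explicit (the helper name and argument layout are illustrative, not part of this patch):

def dis_to_convex_combination(k_gg, k_gg1, k_gg2, k_g1g1, k_g1g2, k_g2g2, alpha):
    # squared distance ||phi(g) - (alpha * phi(g1) + (1 - alpha) * phi(g2))||^2,
    # expanded via the kernel trick <phi(x), phi(y)> = k(x, y).
    return (k_gg
            - 2 * (alpha * k_gg1 + (1 - alpha) * k_gg2)
            + alpha ** 2 * k_g1g1
            + 2 * alpha * (1 - alpha) * k_g1g2
            + (1 - alpha) ** 2 * k_g2g2)
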
@@ -0,0 +1,195 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 26 11:49:12 2019

Iterative alternate minimizations (IAM) using GED.

@author: ljia
"""
import numpy as np
import random
import networkx as nx
import sys
#from Cython_GedLib_2 import librariesImport, script
import librariesImport, script
sys.path.insert(0, "../")
from pygraph.utils.graphfiles import saveDataset
from pygraph.utils.graphdataset import get_dataset_attributes
def iam(Gn, node_label='atom', edge_label='bond_type'):
    """Compute an approximate generalized median graph of the graphs in Gn by
    iterative alternate minimization (IAM).
    """
#    Gn = Gn[0:10]
    Gn = [nx.convert_node_labels_to_integers(g) for g in Gn]

    # edit costs: edge removal, substitution and insertion.
    c_er = 1
    c_es = 1
    c_ei = 1

    # phase 1: initialize.
    # compute the set-median of Gn.
    dis_min = np.inf
    pi_p = []
    pi_all = []
    for idx1, G_p in enumerate(Gn):
        dist_sum = 0
        pi_all.append([])
        for idx2, G_p_prime in enumerate(Gn):
            dist_tmp, pi_tmp = GED(G_p, G_p_prime)
            pi_all[idx1].append(pi_tmp)
            dist_sum += dist_tmp
        if dist_sum < dis_min:
            dis_min = dist_sum
            G = G_p.copy()
            idx_min = idx1
    # list of edit operations.
    pi_p = pi_all[idx_min]

    # phase 2: iteration.
    ds_attrs = get_dataset_attributes(Gn, attr_names=['edge_labeled', 'node_attr_dim'],
                                      edge_label=edge_label)
    for itr in range(0, 10):
        G_new = G.copy()

        # update vertex labels.
        if not ds_attrs['node_attr_dim']:  # labels are symbolic.
            for nd, _ in G.nodes(data=True):
                h_i0_list = []
                label_list = []
                for label in get_node_labels(Gn, node_label):
                    h_i0 = 0
                    for idx, g in enumerate(Gn):
                        pi_i = pi_p[idx][nd]
                        if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
                            h_i0 += 1
                    h_i0_list.append(h_i0)
                    label_list.append(label)
                # choose one of the best labels randomly.
                idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
                idx_rdm = random.randint(0, len(idx_max) - 1)
                G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]]
        else:  # labels are non-symbolic.
            for nd, _ in G.nodes(data=True):
                Si_norm = 0
                phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
                for idx, g in enumerate(Gn):
                    pi_i = pi_p[idx][nd]
                    if g.has_node(pi_i):  #@todo: what if no g has node pi_i? phi_i_bar = 0?
                        Si_norm += 1
                        phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
                phi_i_bar /= Si_norm
                G_new.nodes[nd]['attributes'] = phi_i_bar

        # update edge labels and the adjacency matrix.
        if ds_attrs['edge_labeled']:
            for nd1, nd2, _ in G.edges(data=True):
                h_ij0_list = []
                label_list = []
                for label in get_edge_labels(Gn, edge_label):
                    h_ij0 = 0
                    for idx, g in enumerate(Gn):
                        pi_i = pi_p[idx][nd1]
                        pi_j = pi_p[idx][nd2]
                        h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
                                   g.has_edge(pi_i, pi_j) and
                                   g.edges[pi_i, pi_j][edge_label] == label)
                        h_ij0 += h_ij0_p
                    h_ij0_list.append(h_ij0)
                    label_list.append(label)
                # choose one of the best labels randomly.
                idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
                h_ij0_max = h_ij0_list[idx_max[0]]
                idx_rdm = random.randint(0, len(idx_max) - 1)
                best_label = label_list[idx_max[idx_rdm]]

                # check whether a_ij is 0 or 1.
                sij_norm = 0
                for idx, g in enumerate(Gn):
                    pi_i = pi_p[idx][nd1]
                    pi_j = pi_p[idx][nd2]
                    if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
                        sij_norm += 1
                if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
                    if not G_new.has_edge(nd1, nd2):
                        G_new.add_edge(nd1, nd2)
                    G_new.edges[nd1, nd2][edge_label] = best_label
                else:
                    if G_new.has_edge(nd1, nd2):
                        G_new.remove_edge(nd1, nd2)
        else:  # if edges are unlabeled.
            for nd1, nd2, _ in G.edges(data=True):
                sij_norm = 0
                for idx, g in enumerate(Gn):
                    pi_i = pi_p[idx][nd1]
                    pi_j = pi_p[idx][nd2]
                    if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
                        sij_norm += 1
                if sij_norm > len(Gn) * c_er / (c_er + c_ei):
                    if not G_new.has_edge(nd1, nd2):
                        G_new.add_edge(nd1, nd2)
                else:
                    if G_new.has_edge(nd1, nd2):
                        G_new.remove_edge(nd1, nd2)

        G = G_new.copy()

    return G
def GED(g1, g2, lib='gedlib'):
    """Compute GED. It is a dummy function for now.
    """
    if lib == 'gedlib':
        saveDataset([g1, g2], [None, None], group='xml', filename='ged_tmp/tmp')
        script.appel()
        script.PyRestartEnv()
        script.PyLoadGXLGraph('ged_tmp/', 'collections/tmp.xml')
        listID = script.PyGetGraphIds()
        script.PySetEditCost("CHEM_1")
        script.PyInitEnv()
        script.PySetMethod("BIPARTITE", "")
        script.PyInitMethod()
        g = listID[0]
        h = listID[1]
        script.PyRunMethod(g, h)
        liste = script.PyGetAllMap(g, h)
        upper = script.PyGetUpperBound(g, h)
        lower = script.PyGetLowerBound(g, h)
        dis = upper + lower
        pi = liste[0]

    return dis, pi
def get_node_labels(Gn, node_label):
    nl = set()
    for G in Gn:
        nl = nl | set(nx.get_node_attributes(G, node_label).values())
    return nl


def get_edge_labels(Gn, edge_label):
    el = set()
    for G in Gn:
        el = el | set(nx.get_edge_attributes(G, edge_label).values())
    return el
if __name__ == '__main__':
    from pygraph.utils.graphfiles import loadDataset

    ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
          'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}  # node/edge symb
#    ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
#          'extra_params': {}}  # node nsymb
#    ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
#          'extra_params': {}}
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
    iam(Gn)
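
With the unit costs c_er = c_es = c_ei = 1 used above, the labeled-edge decision in iam reduces to h_ij0_max > len(Gn) - sij_norm: keep the edge with its majority label when the most frequent label occurs more often than the edge is absent under the current node maps. A small numeric check (all values are illustrative, not from the dataset):

# Hypothetical numbers: 10 graphs; the most frequent edge label occurs 6
# times under the current node maps; the edge maps onto an existing edge
# in 7 of the graphs.
N, h_ij0_max, sij_norm = 10, 6, 7
c_er = c_es = c_ei = 1

keep_edge = h_ij0_max > N * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es)
# with unit costs the threshold is N - sij_norm = 3, and 6 > 3, so the edge
# is kept and given the majority label.
print(keep_edge)  # True
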
@@ -0,0 +1,5 @@
from ctypes import *
lib1 = cdll.LoadLibrary('lib/fann/libdoublefann.so')
lib2 = cdll.LoadLibrary('lib/libsvm.3.22/libsvm.so')
lib3 = cdll.LoadLibrary('lib/nomad/libnomad.so')
lib4 = cdll.LoadLibrary('lib/nomad/libsgtelib.so')
@@ -0,0 +1,5 @@
from ctypes import *
lib1 = cdll.LoadLibrary('Cython_GedLib_2/lib/fann/libdoublefann.so')
lib2 = cdll.LoadLibrary('Cython_GedLib_2/lib/libsvm.3.22/libsvm.so')
lib3 = cdll.LoadLibrary('Cython_GedLib_2/lib/nomad/libnomad.so')
lib4 = cdll.LoadLibrary('Cython_GedLib_2/lib/nomad/libsgtelib.so')
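
The two librariesImport variants above differ only in a hard-coded path prefix, so each one works from exactly one working directory. A sketch of a single, location-independent version, under the assumption that the lib/ directory sits next to the importing file (not part of this patch):

import os
from ctypes import cdll

# resolve the shared libraries relative to this file instead of the
# current working directory.
_LIB_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'lib')

lib1 = cdll.LoadLibrary(os.path.join(_LIB_DIR, 'fann/libdoublefann.so'))
lib2 = cdll.LoadLibrary(os.path.join(_LIB_DIR, 'libsvm.3.22/libsvm.so'))
lib3 = cdll.LoadLibrary(os.path.join(_LIB_DIR, 'nomad/libnomad.so'))
lib4 = cdll.LoadLibrary(os.path.join(_LIB_DIR, 'nomad/libsgtelib.so'))
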
@@ -126,6 +126,7 @@ for alpha in alpha_range:
             dhat = dnew
             gnew = gtemp.copy()
             found = True  # found better graph.
+            r = 0
         if found:
             gihat_list = [gnew]
             dis_gs.append(dhat)
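
The single added line above resets the patience counter r after every improvement, so the search only terminates after r_max consecutive non-improving iterations. The pattern in isolation (toy objective and illustrative names, not the project's code):

import random

r, r_max = 1, 10
best = float('inf')
while r < r_max:
    candidate = random.random()  # stand-in for generating a new graph
    if candidate <= best:
        best = candidate
        r = 0  # reset patience after an improvement
    else:
        r += 1
print('best objective:', best)
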
@@ -0,0 +1,26 @@
#from distutils.core import setup
from distutils.extension import Extension
#from Cython.Distutils import build_ext
from distutils.core import setup
from Cython.Build import cythonize

#setup(ext_modules=cythonize("script.pyx"))

extensions = [Extension("script",
                        sources=["script.pyx", "src/essai.cpp"],
                        include_dirs=["include", "include/lsape", "include/Eigen",
                                      "include/nomad", "include/sgtelib",
                                      "include/libsvm.3.22", "include/fann",
                                      "include/boost_1_69_0"],
                        library_dirs=["lib/fann", "lib/gedlib", "lib/libsvm.3.22", "lib/nomad"],
                        libraries=["doublefann", "sgtelib", "svm", "nomad"],
                        language="c++",
                        extra_compile_args=["-std=c++11"],
                        extra_link_args=["-std=c++11"])]

setup(ext_modules=cythonize(extensions))

#extensions = [Extension("script", sources=["script.pyx", "include/gedlib-master/src/env/ged_env.ipp"], include_dirs=["."], language="c++")]
#setup(name = "script", ext_modules = extensions, cmdclass = {'build_ext':build_ext},)

# Bash command: python setup.py build_ext --inplace
@@ -0,0 +1,57 @@ | |||||
#export LD_LIBRARY_PATH=.:/export/home/lambertn/Documents/Cython_GedLib_2/lib/fann/:/export/home/lambertn/Documents/Cython_GedLib_2/lib/libsvm.3.22:/export/home/lambertn/Documents/Cython_GedLib_2/lib/nomad | |||||
#Pour que "import script" trouve les librairies qu'a besoin GedLib | |||||
#Equivalent à définir la variable d'environnement LD_LIBRARY_PATH sur un bash | |||||
#Permet de fonctionner sur Idle et autre sans définir à chaque fois la variable d'environnement | |||||
#os.environ ne fonctionne pas dans ce cas | |||||
import librariesImport, script | |||||
#import script | |||||
#truc = script.computeEditDistanceOnGXlGraphs('include/gedlib-master/data/datasets/Mutagenicity/data/','collections/MUTA_10.xml',"CHEM_1", "BIPARTITE", "") | |||||
#print(truc) | |||||
#script.PyRestartEnv() | |||||
#script.appel() | |||||
def test() : | |||||
# script.appel() | |||||
script.PyRestartEnv() | |||||
# print("Here is the Python function !") | |||||
# | |||||
# print("List of Edit Cost Options : ") | |||||
# for i in script.listOfEditCostOptions : | |||||
# print (i) | |||||
# print("") | |||||
# | |||||
# print("List of Method Options : ") | |||||
# for j in script.listOfMethodOptions : | |||||
# print (j) | |||||
# print("") | |||||
script.PyLoadGXLGraph('include/gedlib-master/data/datasets/Mutagenicity/data/', 'collections/MUTA_10.xml') | |||||
listID = script.PyGetGraphIds() | |||||
afficheId = "" | |||||
for i in listID : | |||||
afficheId+=str(i) + " " | |||||
print("Number of graphs = " + str(len(listID)) + ", list of Ids = " + afficheId) | |||||
script.PySetEditCost("CHEM_1") | |||||
script.PyInitEnv() | |||||
script.PySetMethod("BIPARTITE", "") | |||||
script.PyInitMethod() | |||||
g = listID[0] | |||||
h = listID[1] | |||||
script.PyRunMethod(g,h) | |||||
liste = script.PyGetAllMap(g,h) | |||||
print("Forward map : " ,liste[0], ", Backward map : ", liste[1]) | |||||
print ("Upper Bound = " + str(script.PyGetUpperBound(g,h)) + ", Lower Bound = " + str(script.PyGetLowerBound(g,h)) + ", Runtime = " + str(script.PyGetRuntime(g,h))) | |||||
test() |
@@ -52,10 +52,10 @@ def get_dataset_attributes(Gn,
         return False if edge_label is None else True

     def get_edge_label_num(Gn):
-        nl = set()
+        el = set()
         for G in Gn:
-            nl = nl | set(nx.get_edge_attributes(G, edge_label).values())
-        return len(nl)
+            el = el | set(nx.get_edge_attributes(G, edge_label).values())
+        return len(el)

     def is_directed(Gn):
         return nx.is_directed(Gn[0])
@@ -22,8 +22,8 @@ def loadCT(filename):
     with open(filename) as f:
         content = f.read().splitlines()
         g = nx.Graph(
-            name=str(content[0]),
-            filename=basename(filename))  # set name of the graph
+            name = str(content[0]),
+            filename = basename(filename))  # set name of the graph
         tmp = content[1].split(" ")
         if tmp[0] == '':
             nb_nodes = int(tmp[1])  # number of the nodes
@@ -84,43 +84,63 @@ def loadGXL(filename):
     return g


-def saveGXL(graph, filename):
-    import xml.etree.ElementTree as ET
-    root_node = ET.Element('gxl')
-    attr = dict()
-    attr['id'] = graph.graph['name']
-    attr['edgeids'] = 'true'
-    attr['edgemode'] = 'undirected'
-    graph_node = ET.SubElement(root_node, 'graph', attrib=attr)
-
-    for v in graph:
-        current_node = ET.SubElement(graph_node, 'node', attrib={'id': str(v)})
-        for attr in graph.nodes[v].keys():
-            cur_attr = ET.SubElement(
-                current_node, 'attr', attrib={'name': attr})
-            cur_value = ET.SubElement(cur_attr,
-                                      graph.nodes[v][attr].__class__.__name__)
-            cur_value.text = graph.nodes[v][attr]
-
-    for v1 in graph:
-        for v2 in graph[v1]:
-            if (v1 < v2):  # Non oriented graphs
-                cur_edge = ET.SubElement(
-                    graph_node,
-                    'edge',
-                    attrib={
-                        'from': str(v1),
-                        'to': str(v2)
-                    })
-                for attr in graph[v1][v2].keys():
-                    cur_attr = ET.SubElement(
-                        cur_edge, 'attr', attrib={'name': attr})
-                    cur_value = ET.SubElement(
-                        cur_attr, graph[v1][v2][attr].__class__.__name__)
-                    cur_value.text = str(graph[v1][v2][attr])
-
-    tree = ET.ElementTree(root_node)
-    tree.write(filename)
+def saveGXL(graph, filename, method='benoit'):
+    if method == 'benoit':
+        import xml.etree.ElementTree as ET
+        root_node = ET.Element('gxl')
+        attr = dict()
+        attr['id'] = str(graph.graph['name'])
+        attr['edgeids'] = 'true'
+        attr['edgemode'] = 'undirected'
+        graph_node = ET.SubElement(root_node, 'graph', attrib=attr)
+
+        for v in graph:
+            current_node = ET.SubElement(graph_node, 'node', attrib={'id': str(v)})
+            for attr in graph.nodes[v].keys():
+                cur_attr = ET.SubElement(
+                    current_node, 'attr', attrib={'name': attr})
+                cur_value = ET.SubElement(cur_attr,
+                                          graph.nodes[v][attr].__class__.__name__)
+                cur_value.text = str(graph.nodes[v][attr])
+
+        for v1 in graph:
+            for v2 in graph[v1]:
+                if (v1 < v2):  # undirected graphs
+                    cur_edge = ET.SubElement(
+                        graph_node,
+                        'edge',
+                        attrib={
+                            'from': str(v1),
+                            'to': str(v2)
+                        })
+                    for attr in graph[v1][v2].keys():
+                        cur_attr = ET.SubElement(
+                            cur_edge, 'attr', attrib={'name': attr})
+                        cur_value = ET.SubElement(
+                            cur_attr, graph[v1][v2][attr].__class__.__name__)
+                        cur_value.text = str(graph[v1][v2][attr])
+
+        tree = ET.ElementTree(root_node)
+        tree.write(filename)
+    elif method == 'gedlib':
+        # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22
+        pass
+#        gxl_file = open(filename, 'w')
+#        gxl_file.write("<?xml version=\"1.0\"?>\n")
+#        gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n")
+#        gxl_file.write("<gxl>\n")
+#        gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">\n")
+#        for v in graph:
+#            gxl_file.write("<node id=\"_" + str(v) + "\">\n")
+#            gxl_file.write("<attr name=\"chem\"><int>" + str(self.node_labels[node]) + "</int></attr>\n")
+#            gxl_file.write("</node>\n")
+#        for edge in self.edge_list:
+#            gxl_file.write("<edge from=\"_" + str(edge[0]) + "\" to=\"_" + str(edge[1]) + "\">\n")
+#            gxl_file.write("<attr name=\"valence\"><int>1</int></attr>\n")
+#            gxl_file.write("</edge>\n")
+#        gxl_file.write("</graph>\n")
+#        gxl_file.write("</gxl>\n")
+#        gxl_file.close()


 def loadSDF(filename):
@@ -412,3 +432,33 @@ def loadDataset(filename, filename_y=None, extra_params=None):
 #            print(g.edges(data=True))
     return data, y

+
+def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile'):
+    """Save a list of graphs.
+    """
+    import os
+    dirname_ds = os.path.dirname(filename)
+    if dirname_ds != '':
+        dirname_ds += '/'
+        if not os.path.exists(dirname_ds):
+            os.makedirs(dirname_ds)
+
+    if group == 'xml' and gformat == 'gxl':
+        with open(filename + '.xml', 'w') as fgroup:
+            fgroup.write("<?xml version=\"1.0\"?>")
+            fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"https://dbblumenthal.github.io/gedlib/GraphCollection_8dtd_source.html\">")
+            fgroup.write("\n<GraphCollection>")
+            for idx, g in enumerate(Gn):
+                fname_tmp = "graph" + str(idx) + ".gxl"
+                saveGXL(g, dirname_ds + fname_tmp)
+                fgroup.write("\n\t<graph file=\"" + fname_tmp + "\" class=\"" + str(y[idx]) + "\"/>")
+            fgroup.write("\n</GraphCollection>")
+
+
+if __name__ == '__main__':
+    ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat',
+          'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}  # node/edge symb
+    Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
+    saveDataset(Gn, y, group='xml', filename='temp/temp')
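
For a two-graph dataset, the saveDataset added above writes the individual GXL files next to a collection file of roughly the following shape (the class values shown are illustrative; they come from y):

<?xml version="1.0"?>
<!DOCTYPE GraphCollection SYSTEM "https://dbblumenthal.github.io/gedlib/GraphCollection_8dtd_source.html">
<GraphCollection>
	<graph file="graph0.gxl" class="0"/>
	<graph file="graph1.gxl" class="1"/>
</GraphCollection>
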
@@ -420,55 +420,6 @@ def model_selection_for_precomputed_kernel(datafile,
 #        np.save(results_name_pre + 'best_gram_matrix_time.dt',
 #                best_gram_matrix_time)

-        # print out as table.
-        from collections import OrderedDict
-        from tabulate import tabulate
-        table_dict = {}
-        if model_type == 'regression':
-            for param_in in param_list:
-                param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
-        else:
-            for param_in in param_list:
-                param_in['C'] = '{:.2e}'.format(param_in['C'])
-        table_dict['params'] = [{**param_out, **param_in}
-                                for param_in in param_list for param_out in param_list_pre_revised]
-        table_dict['gram_matrix_time'] = [
-            '{:.2f}'.format(gram_matrix_time[index_out])
-            for param_in in param_list
-            for index_out, _ in enumerate(param_list_pre_revised)
-        ]
-        table_dict['valid_perf'] = [
-            '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
-                                   std_val_scores[index_out][index_in])
-            for index_in, _ in enumerate(param_list)
-            for index_out, _ in enumerate(param_list_pre_revised)
-        ]
-        table_dict['test_perf'] = [
-            '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
-                                   std_perf_scores[index_out][index_in])
-            for index_in, _ in enumerate(param_list)
-            for index_out, _ in enumerate(param_list_pre_revised)
-        ]
-        table_dict['train_perf'] = [
-            '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
-                                   std_train_scores[index_out][index_in])
-            for index_in, _ in enumerate(param_list)
-            for index_out, _ in enumerate(param_list_pre_revised)
-        ]
-        keyorder = [
-            'params', 'train_perf', 'valid_perf', 'test_perf',
-            'gram_matrix_time'
-        ]
-        if verbose:
-            print()
-        tb_print = tabulate(
-            OrderedDict(
-                sorted(table_dict.items(),
-                       key=lambda i: keyorder.index(i[0]))),
-            headers='keys')
-        # print(tb_print)
-        str_fw += 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print
-
     # read gram matrices from file.
     else:
         # Grid of parameters with a discrete number of values for each.
@@ -632,58 +583,16 @@ def model_selection_for_precomputed_kernel(datafile,
 #        str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
         str_fw += 'training time with hyper-param choices who did not participate in calculation of gram matrices: {:.2f}s\n\n'.format(tt_poster)

-        # print out as table.
-        from collections import OrderedDict
-        from tabulate import tabulate
-        table_dict = {}
-        if model_type == 'regression':
-            for param_in in param_list:
-                param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
-        else:
-            for param_in in param_list:
-                param_in['C'] = '{:.2e}'.format(param_in['C'])
-        table_dict['params'] = [{**param_out, **param_in}
-                                for param_in in param_list for param_out in param_list_pre_revised]
-#        table_dict['gram_matrix_time'] = [
-#            '{:.2f}'.format(gram_matrix_time[index_out])
-#            for param_in in param_list
-#            for index_out, _ in enumerate(param_list_pre_revised)
-#        ]
-        table_dict['valid_perf'] = [
-            '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
-                                   std_val_scores[index_out][index_in])
-            for index_in, _ in enumerate(param_list)
-            for index_out, _ in enumerate(param_list_pre_revised)
-        ]
-        table_dict['test_perf'] = [
-            '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
-                                   std_perf_scores[index_out][index_in])
-            for index_in, _ in enumerate(param_list)
-            for index_out, _ in enumerate(param_list_pre_revised)
-        ]
-        table_dict['train_perf'] = [
-            '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
-                                   std_train_scores[index_out][index_in])
-            for index_in, _ in enumerate(param_list)
-            for index_out, _ in enumerate(param_list_pre_revised)
-        ]
-        keyorder = [
-            'params', 'train_perf', 'valid_perf', 'test_perf'
-        ]
-        if verbose:
-            print()
-        tb_print = tabulate(
-            OrderedDict(
-                sorted(table_dict.items(),
-                       key=lambda i: keyorder.index(i[0]))),
-            headers='keys')
-        # print(tb_print)
-        str_fw += 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print
-
     # open file to save all results for this dataset.
     if not os.path.exists(results_dir):
         os.makedirs(results_dir)

+    # print out results as table.
+    str_fw += printResultsInTable(param_list, param_list_pre_revised, average_val_scores,
+                                  std_val_scores, average_perf_scores, std_perf_scores,
+                                  average_train_scores, std_train_scores, gram_matrix_time,
+                                  model_type, verbose)
+
     # open file to save all results for this dataset.
     if not os.path.exists(results_dir + '/' + ds_name + '.output.txt'):
         with open(results_dir + '/' + ds_name + '.output.txt', 'w') as f:
@@ -974,4 +883,55 @@ def read_gram_matrices_from_file(results_dir, ds_name):
     gram_matrices = gmfile['gms']  # a list to store gram matrices for all param_grid_precomputed
     param_list_pre_revised = gmfile['params']  # list to store param grids precomputed ignoring the useless ones
     y = gmfile['y'].tolist()
-    return gram_matrices, param_list_pre_revised, y
\ No newline at end of file
+    return gram_matrices, param_list_pre_revised, y
+
+
+def printResultsInTable(param_list, param_list_pre_revised, average_val_scores,
+                        std_val_scores, average_perf_scores, std_perf_scores,
+                        average_train_scores, std_train_scores, gram_matrix_time,
+                        model_type, verbose):
+    from collections import OrderedDict
+    from tabulate import tabulate
+    table_dict = {}
+    if model_type == 'regression':
+        for param_in in param_list:
+            param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
+    else:
+        for param_in in param_list:
+            param_in['C'] = '{:.2e}'.format(param_in['C'])
+    table_dict['params'] = [{**param_out, **param_in}
+                            for param_in in param_list for param_out in param_list_pre_revised]
+    table_dict['gram_matrix_time'] = [
+        '{:.2f}'.format(gram_matrix_time[index_out])
+        for param_in in param_list
+        for index_out, _ in enumerate(param_list_pre_revised)
+    ]
+    table_dict['valid_perf'] = [
+        '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
+                               std_val_scores[index_out][index_in])
+        for index_in, _ in enumerate(param_list)
+        for index_out, _ in enumerate(param_list_pre_revised)
+    ]
+    table_dict['test_perf'] = [
+        '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
+                               std_perf_scores[index_out][index_in])
+        for index_in, _ in enumerate(param_list)
+        for index_out, _ in enumerate(param_list_pre_revised)
+    ]
+    table_dict['train_perf'] = [
+        '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
+                               std_train_scores[index_out][index_in])
+        for index_in, _ in enumerate(param_list)
+        for index_out, _ in enumerate(param_list_pre_revised)
+    ]
+    keyorder = [
+        'params', 'train_perf', 'valid_perf', 'test_perf',
+        'gram_matrix_time'
+    ]
+    if verbose:
+        print()
+    tb_print = tabulate(OrderedDict(sorted(table_dict.items(),
+                                           key=lambda i: keyorder.index(i[0]))), headers='keys')
+#    print(tb_print)
+    return 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print
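
The OrderedDict(sorted(...)) idiom in printResultsInTable only controls the column order that tabulate displays, since a plain dict gives no ordering guarantee to the caller's intent here. A self-contained sketch with toy values (column contents are illustrative):

from collections import OrderedDict
from tabulate import tabulate

keyorder = ['params', 'train_perf', 'valid_perf', 'test_perf']
table_dict = {'valid_perf': ['0.85±0.03'], 'params': ['C=1.00e+00'],
              'test_perf': ['0.83±0.04'], 'train_perf': ['0.90±0.02']}

# sort the columns into the order given by keyorder, then print as a table.
ordered = OrderedDict(sorted(table_dict.items(), key=lambda i: keyorder.index(i[0])))
print(tabulate(ordered, headers='keys'))
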