diff --git a/.gitignore b/.gitignore
index ead11c4..4fa9bbe 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,5 +20,8 @@ pygraph/kernels/*_sym.py
 *.dat
 *.pyc
+preimage/*
+!preimage/*.py
+
 __pycache__
 ##*#
diff --git a/preimage/gk_iam.py b/preimage/gk_iam.py
new file mode 100644
index 0000000..206b640
--- /dev/null
+++ b/preimage/gk_iam.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Apr 30 17:07:43 2019
+
+A graph pre-image method combining the iterative pre-image method in reference [1]
+and the iterative alternate minimizations (IAM) in reference [2].
+@author: ljia
+@references:
+    [1] Gökhan H Bakir, Alexander Zien, and Koji Tsuda. Learning to find graph
+    pre-images. In Joint Pattern Recognition Symposium, pages 253-261. Springer, 2004.
+    [2] Generalized median graph via iterative alternate minimization.
+"""
+import numpy as np
+import multiprocessing
+from tqdm import tqdm
+import networkx as nx
+import matplotlib.pyplot as plt
+
+from iam import iam
+
+
+def gk_iam(Gn, alpha):
+    """This function constructs graph pre-image by the iterative pre-image
+    framework in reference [1], algorithm 1, where the step of generating new
+    graphs randomly is replaced by the IAM algorithm in reference [2].
+    
+    notes
+    -----
+    Every time a better graph is acquired, the older one is replaced by it.
+    """
+    # compute k nearest neighbors of phi in DN.
+    dis_list = [] # distance between g_star and each graph.
+    for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
+        dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
+                      k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
+                      (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
+                      k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
+        dis_list.append(dtemp)
+        
+    # sort
+    sort_idx = np.argsort(dis_list)
+    dis_gs = [dis_list[idis] for idis in sort_idx[0:k]]
+    g0hat = Gn[sort_idx[0]] # the nearest neighbor of phi in DN
+    if dis_gs[0] == 0: # the exact pre-image.
+        print('The exact pre-image is found from the input dataset.')
+        return 0, g0hat
+    dhat = dis_gs[0] # the nearest distance
+    Gk = [Gn[ig] for ig in sort_idx[0:k]] # the k nearest neighbors
+    gihat_list = []
+    
+#    i = 1
+    r = 1
+    while r < r_max:
+        print('r =', r)
+#        found = False
+        Gs_nearest = Gk + gihat_list
+        g_tmp = iam(Gs_nearest)
+        
+        # compute distance between phi and the new generated graph.
+        knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None,
+                       p_quit=lmbda, n_iteration=20, remove_totters=False,
+                       n_jobs=multiprocessing.cpu_count(), verbose=False)
+        dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) *
+                      knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
+                      (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
+                      k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
+        if dnew <= dhat: # the new distance is smaller
+            print('I am smaller!')
+            dhat = dnew
+            g_new = g_tmp.copy() # found better graph.
+            gihat_list = [g_new]
+            dis_gs.append(dhat)
+            r = 0
+        else:
+            r += 1
+            
+    ghat = ([g0hat] if len(gihat_list) == 0 else gihat_list)
+    
+    return dhat, ghat
+
+
+def gk_iam_nearest(Gn, alpha):
+    """This function constructs graph pre-image by the iterative pre-image
+    framework in reference [1], algorithm 1, where the step of generating new
+    graphs randomly is replaced by the IAM algorithm in reference [2].
+    
+    notes
+    -----
+    Every time a better graph is acquired, its distance in kernel space is
+    compared with the k nearest ones, and the k nearest distances from the k+1
+    distances will be used as the new ones.
+    """
+    # compute k nearest neighbors of phi in DN.
+    dis_list = [] # distance between g_star and each graph.
+    for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
+        dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
+                      k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
+                      (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
+                      k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
+        dis_list.append(dtemp)
+        
+    # sort
+    sort_idx = np.argsort(dis_list)
+    dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] # the k shortest distances
+    g0hat = Gn[sort_idx[0]] # the nearest neighbor of phi in DN
+    if dis_gs[0] == 0: # the exact pre-image.
+        print('The exact pre-image is found from the input dataset.')
+        return 0, g0hat
+    dhat = dis_gs[0] # the nearest distance
+    ghat = g0hat
+    Gk = [Gn[ig] for ig in sort_idx[0:k]] # the k nearest neighbors
+    Gs_nearest = Gk
+#    gihat_list = []
+    
+#    i = 1
+    r = 1
+    while r < r_max:
+        print('r =', r)
+#        found = False
+#        Gs_nearest = Gk + gihat_list
+        g_tmp = iam(Gs_nearest)
+        
+        # compute distance between phi and the new generated graph.
+        knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None,
+                       p_quit=lmbda, n_iteration=20, remove_totters=False,
+                       n_jobs=multiprocessing.cpu_count(), verbose=False)
+        dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) *
+                      knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
+                      (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
+                      k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
+        if dnew <= dhat: # the new distance is smaller
+            print('I am smaller!')
+            dhat = dnew
+            g_new = g_tmp.copy() # found better graph.
+            ghat = g_tmp.copy()
+            dis_gs.append(dhat) # add the new nearest distance.
+            Gs_nearest.append(g_new) # add the corresponding graph.
+            sort_idx = np.argsort(dis_gs)
+            dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]] # the new k nearest distances.
+            Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
+            r = 0
+        else:
+            r += 1
+            
+    return dhat, ghat
+
+
+if __name__ == '__main__':
+    import sys
+    sys.path.insert(0, "../")
+    from pygraph.kernels.marginalizedKernel import marginalizedkernel
+    from pygraph.utils.graphfiles import loadDataset
+    ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
+          'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}  # node/edge symb
+    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
+#    Gn = Gn[0:10]
+    
+    lmbda = 0.03 # termination probability
+    r_max = 10 # max number of consecutive iterations without improvement
+    l = 500
+    alpha_range = np.linspace(0.1, 0.9, 9)
+    k = 5 # k nearest neighbors
+    
+    # randomly select two molecules
+    np.random.seed(1)
+    idx1, idx2 = np.random.randint(0, len(Gn), 2)
+    g1 = Gn[idx1]
+    g2 = Gn[idx2]
+    
+    # compute
+    k_list = [] # kernel between each graph and itself.
+    k_g1_list = [] # kernel between each graph and g1
+    k_g2_list = [] # kernel between each graph and g2
+    for ig, g in tqdm(enumerate(Gn), desc='computing self kernels', file=sys.stdout):
+        ktemp = marginalizedkernel([g, g1, g2], node_label='atom', edge_label=None,
+                       p_quit=lmbda, n_iteration=20, remove_totters=False,
+                       n_jobs=multiprocessing.cpu_count(), verbose=False)
+        k_list.append(ktemp[0][0, 0])
+        k_g1_list.append(ktemp[0][0, 1])
+        k_g2_list.append(ktemp[0][0, 2])
+    
+    g_best = []
+    dis_best = []
+    # for each alpha
+    for alpha in alpha_range:
+        print('alpha =', alpha)
+        dhat, ghat = gk_iam_nearest(Gn, alpha)
+        dis_best.append(dhat)
+        g_best.append(ghat)
+        
+    for idx, item in enumerate(alpha_range):
+        print('when alpha is', item, 'the shortest distance is', dis_best[idx])
+        print('the corresponding pre-image is')
+        nx.draw_networkx(g_best[idx])
+        plt.show()
\ No newline at end of file
diff --git a/preimage/iam.py b/preimage/iam.py
new file mode 100644
index 0000000..cabb05f
--- /dev/null
+++ b/preimage/iam.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Apr 26 11:49:12 2019
+
+Iterative alternate minimizations using GED.
+@author: ljia
+"""
+import numpy as np
+import random
+import networkx as nx
+
+import sys
+#from Cython_GedLib_2 import librariesImport, script
+import librariesImport, script
+sys.path.insert(0, "../")
+from pygraph.utils.graphfiles import saveDataset
+from pygraph.utils.graphdataset import get_dataset_attributes
+
+
+def iam(Gn, node_label='atom', edge_label='bond_type'):
+    """Compute a generalized median graph of the graphs in Gn by iterative
+    alternate minimizations (IAM) using GED.
+    """
+#    Gn = Gn[0:10]
+    Gn = [nx.convert_node_labels_to_integers(g) for g in Gn]
+    
+    c_er = 1
+    c_es = 1
+    c_ei = 1
+    
+    # phase 1: initialize.
+    # compute set-median.
+    dis_min = np.inf
+    pi_p = []
+    pi_all = []
+    for idx1, G_p in enumerate(Gn):
+        dist_sum = 0
+        pi_all.append([])
+        for idx2, G_p_prime in enumerate(Gn):
+            dist_tmp, pi_tmp = GED(G_p, G_p_prime)
+            pi_all[idx1].append(pi_tmp)
+            dist_sum += dist_tmp
+        if dist_sum < dis_min:
+            dis_min = dist_sum
+            G = G_p.copy()
+            idx_min = idx1
+    # list of edit operations.
+    pi_p = pi_all[idx_min]
+    
+    # phase 2: iteration.
+    ds_attrs = get_dataset_attributes(Gn, attr_names=['edge_labeled', 'node_attr_dim'],
+                                      edge_label=edge_label)
+    for itr in range(0, 10):
+        G_new = G.copy()
+        # update vertex labels.
+        # pre-compute h_i0 for each label.
+#        for label in get_node_labels(Gn, node_label):
+#            print(label)
+#        for nd in G.nodes(data=True):
+#            pass
+        if not ds_attrs['node_attr_dim']: # labels are symbolic
+            for nd, _ in G.nodes(data=True):
+                h_i0_list = []
+                label_list = []
+                for label in get_node_labels(Gn, node_label):
+                    h_i0 = 0
+                    for idx, g in enumerate(Gn):
+                        pi_i = pi_p[idx][nd]
+                        if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
+                            h_i0 += 1
+                    h_i0_list.append(h_i0)
+                    label_list.append(label)
+                # choose one of the best randomly.
+                idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
+                idx_rdm = random.randint(0, len(idx_max) - 1)
+                G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]]
+        else: # labels are non-symbolic
+            for nd, _ in G.nodes(data=True):
+                Si_norm = 0
+                phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
+                for idx, g in enumerate(Gn):
+                    pi_i = pi_p[idx][nd]
+                    if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0?
+ Si_norm += 1 + phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']]) + phi_i_bar /= Si_norm + G_new.nodes[nd]['attributes'] = phi_i_bar + + # update edge labels and adjacency matrix. + if ds_attrs['edge_labeled']: + for nd1, nd2, _ in G.edges(data=True): + h_ij0_list = [] + label_list = [] + for label in get_edge_labels(Gn, edge_label): + h_ij0 = 0 + for idx, g in enumerate(Gn): + pi_i = pi_p[idx][nd1] + pi_j = pi_p[idx][nd2] + h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and + g.has_edge(pi_i, pi_j) and + g.edges[pi_i, pi_j][edge_label] == label) + h_ij0 += h_ij0_p + h_ij0_list.append(h_ij0) + label_list.append(label) + # choose one of the best randomly. + idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist() + h_ij0_max = h_ij0_list[idx_max[0]] + idx_rdm = random.randint(0, len(idx_max) - 1) + best_label = label_list[idx_max[idx_rdm]] + + # check whether a_ij is 0 or 1. + sij_norm = 0 + for idx, g in enumerate(Gn): + pi_i = pi_p[idx][nd1] + pi_j = pi_p[idx][nd2] + if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): + sij_norm += 1 + if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es): + if not G_new.has_edge(nd1, nd2): + G_new.add_edge(nd1, nd2) + G_new.edges[nd1, nd2][edge_label] = best_label + else: + if G_new.has_edge(nd1, nd2): + G_new.remove_edge(nd1, nd2) + else: # if edges are unlabeled + for nd1, nd2, _ in G.edges(data=True): + sij_norm = 0 + for idx, g in enumerate(Gn): + pi_i = pi_p[idx][nd1] + pi_j = pi_p[idx][nd2] + if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): + sij_norm += 1 + if sij_norm > len(Gn) * c_er / (c_er + c_ei): + if not G_new.has_edge(nd1, nd2): + G_new.add_edge(nd1, nd2) + else: + if G_new.has_edge(nd1, nd2): + G_new.remove_edge(nd1, nd2) + + G = G_new.copy() + + return G + + +def GED(g1, g2, lib='gedlib'): + """ + Compute GED. It is a dummy function for now. 
+ """ + if lib == 'gedlib': + saveDataset([g1, g2], [None, None], group='xml', filename='ged_tmp/tmp') + script.appel() + script.PyRestartEnv() + script.PyLoadGXLGraph('ged_tmp/', 'collections/tmp.xml') + listID = script.PyGetGraphIds() + script.PySetEditCost("CHEM_1") + script.PyInitEnv() + script.PySetMethod("BIPARTITE", "") + script.PyInitMethod() + g = listID[0] + h = listID[1] + script.PyRunMethod(g, h) + liste = script.PyGetAllMap(g, h) + upper = script.PyGetUpperBound(g, h) + lower = script.PyGetLowerBound(g, h) + dis = upper + lower + pi = liste[0] + + return dis, pi + + +def get_node_labels(Gn, node_label): + nl = set() + for G in Gn: + nl = nl | set(nx.get_node_attributes(G, node_label).values()) + return nl + + +def get_edge_labels(Gn, edge_label): + el = set() + for G in Gn: + el = el | set(nx.get_edge_attributes(G, edge_label).values()) + return el + + +if __name__ == '__main__': + from pygraph.utils.graphfiles import loadDataset + ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', + 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb +# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt', +# 'extra_params': {}} # node nsymb +# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds', +# 'extra_params': {}} + Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) + + iam(Gn) \ No newline at end of file diff --git a/preimage/librariesImport.py b/preimage/librariesImport.py new file mode 100644 index 0000000..f1573cc --- /dev/null +++ b/preimage/librariesImport.py @@ -0,0 +1,5 @@ +from ctypes import * +lib1 = cdll.LoadLibrary('lib/fann/libdoublefann.so') +lib2 = cdll.LoadLibrary('lib/libsvm.3.22/libsvm.so') +lib3 = cdll.LoadLibrary('lib/nomad/libnomad.so') +lib4 = cdll.LoadLibrary('lib/nomad/libsgtelib.so') diff --git a/preimage/librariesImport2.py b/preimage/librariesImport2.py new file mode 100644 index 0000000..94f2940 --- /dev/null +++ b/preimage/librariesImport2.py @@ -0,0 +1,5 @@ +from ctypes import * +lib1 = cdll.LoadLibrary('Cython_GedLib_2/lib/fann/libdoublefann.so') +lib2 = cdll.LoadLibrary('Cython_GedLib_2/lib/libsvm.3.22/libsvm.so') +lib3 = cdll.LoadLibrary('Cython_GedLib_2/lib/nomad/libnomad.so') +lib4 = cdll.LoadLibrary('Cython_GedLib_2/lib/nomad/libsgtelib.so') diff --git a/preimage/preimage.py b/preimage/preimage.py index c466087..d449062 100644 --- a/preimage/preimage.py +++ b/preimage/preimage.py @@ -126,6 +126,7 @@ for alpha in alpha_range: dhat = dnew gnew = gtemp.copy() found = True # found better graph. 
+            r = 0
     if found:
         gihat_list = [gnew]
         dis_gs.append(dhat)
diff --git a/preimage/setup.py b/preimage/setup.py
new file mode 100644
index 0000000..381a51f
--- /dev/null
+++ b/preimage/setup.py
@@ -0,0 +1,26 @@
+#from distutils.core import setup
+from distutils.extension import Extension
+#from Cython.Distutils import build_ext
+
+from distutils.core import setup
+from Cython.Build import cythonize
+
+#setup(ext_modules=cythonize("script.pyx"))
+
+extensions = [Extension("script",
+                        sources=["script.pyx", "src/essai.cpp"],
+                        include_dirs=["include","include/lsape", "include/Eigen", "include/nomad", "include/sgtelib", "include/libsvm.3.22", "include/fann", "include/boost_1_69_0"],
+                        library_dirs=["lib/fann","lib/gedlib", "lib/libsvm.3.22","lib/nomad"],
+                        libraries=["doublefann","sgtelib", "svm", "nomad"],
+                        language="c++",
+                        extra_compile_args=["-std=c++11"],
+                        extra_link_args=["-std=c++11"])]
+
+setup(ext_modules=cythonize(extensions))
+
+#extensions = [Extension("script", sources=["script.pyx", "include/gedlib-master/src/env/ged_env.ipp"], include_dirs=["."], language="c++")]
+
+#setup(name = "script", ext_modules = extensions, cmdclass = {'build_ext':build_ext},)
+
+
+# Bash command: python setup.py build_ext --inplace
diff --git a/preimage/test.py b/preimage/test.py
new file mode 100644
index 0000000..e6ca558
--- /dev/null
+++ b/preimage/test.py
@@ -0,0 +1,57 @@
+#export LD_LIBRARY_PATH=.:/export/home/lambertn/Documents/Cython_GedLib_2/lib/fann/:/export/home/lambertn/Documents/Cython_GedLib_2/lib/libsvm.3.22:/export/home/lambertn/Documents/Cython_GedLib_2/lib/nomad
+
+#So that "import script" finds the libraries that GedLib needs
+#Equivalent to setting the LD_LIBRARY_PATH environment variable in a bash shell
+#Allows running from IDLE and elsewhere without setting the environment variable every time
+#os.environ does not work in this case
+import librariesImport, script
+
+#import script
+
+#truc = script.computeEditDistanceOnGXlGraphs('include/gedlib-master/data/datasets/Mutagenicity/data/','collections/MUTA_10.xml',"CHEM_1", "BIPARTITE", "")
+#print(truc)
+#script.PyRestartEnv()
+#script.appel()
+
+def test() :
+#    script.appel()
+    
+    script.PyRestartEnv()
+    
+#    print("Here is the Python function !")
+#    
+#    print("List of Edit Cost Options : ")
+#    for i in script.listOfEditCostOptions :
+#        print (i)
+#    print("")
+#    
+#    print("List of Method Options : ")
+#    for j in script.listOfMethodOptions :
+#        print (j)
+#    print("")
+
+    script.PyLoadGXLGraph('include/gedlib-master/data/datasets/Mutagenicity/data/', 'collections/MUTA_10.xml')
+    listID = script.PyGetGraphIds()
+
+    afficheId = ""
+    for i in listID :
+        afficheId+=str(i) + " "
+    print("Number of graphs = " + str(len(listID)) + ", list of Ids = " + afficheId)
+
+    script.PySetEditCost("CHEM_1")
+
+    script.PyInitEnv()
+
+    script.PySetMethod("BIPARTITE", "")
+    script.PyInitMethod()
+
+    g = listID[0]
+    h = listID[1]
+
+    script.PyRunMethod(g,h)
+    liste = script.PyGetAllMap(g,h)
+    print("Forward map : " ,liste[0], ", Backward map : ", liste[1])
+    print ("Upper Bound = " + str(script.PyGetUpperBound(g,h)) + ", Lower Bound = " + str(script.PyGetLowerBound(g,h)) + ", Runtime = " + str(script.PyGetRuntime(g,h)))
+
+
+test()
diff --git a/pygraph/utils/graphdataset.py b/pygraph/utils/graphdataset.py
index deaa182..4ca2c15 100644
--- a/pygraph/utils/graphdataset.py
+++ b/pygraph/utils/graphdataset.py
@@ -52,10 +52,10 @@ def get_dataset_attributes(Gn,
         return False if edge_label is None else True
 
     def get_edge_label_num(Gn):
-        nl = set()
+        el =
set() for G in Gn: - nl = nl | set(nx.get_edge_attributes(G, edge_label).values()) - return len(nl) + el = el | set(nx.get_edge_attributes(G, edge_label).values()) + return len(el) def is_directed(Gn): return nx.is_directed(Gn[0]) diff --git a/pygraph/utils/graphfiles.py b/pygraph/utils/graphfiles.py index 4d0feb8..9f7ab8f 100644 --- a/pygraph/utils/graphfiles.py +++ b/pygraph/utils/graphfiles.py @@ -22,8 +22,8 @@ def loadCT(filename): with open(filename) as f: content = f.read().splitlines() g = nx.Graph( - name=str(content[0]), - filename=basename(filename)) # set name of the graph + name = str(content[0]), + filename = basename(filename)) # set name of the graph tmp = content[1].split(" ") if tmp[0] == '': nb_nodes = int(tmp[1]) # number of the nodes @@ -84,43 +84,63 @@ def loadGXL(filename): return g -def saveGXL(graph, filename): - import xml.etree.ElementTree as ET - root_node = ET.Element('gxl') - attr = dict() - attr['id'] = graph.graph['name'] - attr['edgeids'] = 'true' - attr['edgemode'] = 'undirected' - graph_node = ET.SubElement(root_node, 'graph', attrib=attr) - - for v in graph: - current_node = ET.SubElement(graph_node, 'node', attrib={'id': str(v)}) - for attr in graph.nodes[v].keys(): - cur_attr = ET.SubElement( - current_node, 'attr', attrib={'name': attr}) - cur_value = ET.SubElement(cur_attr, - graph.nodes[v][attr].__class__.__name__) - cur_value.text = graph.nodes[v][attr] - - for v1 in graph: - for v2 in graph[v1]: - if (v1 < v2): # Non oriented graphs - cur_edge = ET.SubElement( - graph_node, - 'edge', - attrib={ - 'from': str(v1), - 'to': str(v2) - }) - for attr in graph[v1][v2].keys(): - cur_attr = ET.SubElement( - cur_edge, 'attr', attrib={'name': attr}) - cur_value = ET.SubElement( - cur_attr, graph[v1][v2][attr].__class__.__name__) - cur_value.text = str(graph[v1][v2][attr]) - - tree = ET.ElementTree(root_node) - tree.write(filename) +def saveGXL(graph, filename, method='benoit'): + if method == 'benoit': + import xml.etree.ElementTree as ET + root_node = ET.Element('gxl') + attr = dict() + attr['id'] = str(graph.graph['name']) + attr['edgeids'] = 'true' + attr['edgemode'] = 'undirected' + graph_node = ET.SubElement(root_node, 'graph', attrib=attr) + + for v in graph: + current_node = ET.SubElement(graph_node, 'node', attrib={'id': str(v)}) + for attr in graph.nodes[v].keys(): + cur_attr = ET.SubElement( + current_node, 'attr', attrib={'name': attr}) + cur_value = ET.SubElement(cur_attr, + graph.nodes[v][attr].__class__.__name__) + cur_value.text = graph.nodes[v][attr] + + for v1 in graph: + for v2 in graph[v1]: + if (v1 < v2): # Non oriented graphs + cur_edge = ET.SubElement( + graph_node, + 'edge', + attrib={ + 'from': str(v1), + 'to': str(v2) + }) + for attr in graph[v1][v2].keys(): + cur_attr = ET.SubElement( + cur_edge, 'attr', attrib={'name': attr}) + cur_value = ET.SubElement( + cur_attr, graph[v1][v2][attr].__class__.__name__) + cur_value.text = str(graph[v1][v2][attr]) + + tree = ET.ElementTree(root_node) + tree.write(filename) + elif method == 'gedlib': + # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 + pass +# gxl_file = open(filename, 'w') +# gxl_file.write("\n") +# gxl_file.write("\n") +# gxl_file.write("\n") +# gxl_file.write("\n") +# for v in graph: +# gxl_file.write("\n") +# gxl_file.write("" + str(self.node_labels[node]) + "\n") +# gxl_file.write("\n") +# for edge in self.edge_list: +# gxl_file.write("\n") +# gxl_file.write("1\n") +# gxl_file.write("\n") +# gxl_file.write("\n") +# 
gxl_file.write("\n") +# gxl_file.close() def loadSDF(filename): @@ -412,3 +432,33 @@ def loadDataset(filename, filename_y=None, extra_params=None): # print(g.edges(data=True)) return data, y + + +def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile'): + """Save list of graphs. + """ + import os + dirname_ds = os.path.dirname(filename) + if dirname_ds != '': + dirname_ds += '/' + if not os.path.exists(dirname_ds) : + os.makedirs(dirname_ds) + + if group == 'xml' and gformat == 'gxl': + with open(filename + '.xml', 'w') as fgroup: + fgroup.write("") + fgroup.write("\n") + fgroup.write("\n") + for idx, g in enumerate(Gn): + fname_tmp = "graph" + str(idx) + ".gxl" + saveGXL(g, dirname_ds + fname_tmp) + fgroup.write("\n\t") + fgroup.write("\n") + fgroup.close() + + +if __name__ == '__main__': + ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat', + 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb + Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params']) + saveDataset(Gn, y, group='xml', filename='temp/temp') \ No newline at end of file diff --git a/pygraph/utils/model_selection_precomputed.py b/pygraph/utils/model_selection_precomputed.py index 174b674..a6f0303 100644 --- a/pygraph/utils/model_selection_precomputed.py +++ b/pygraph/utils/model_selection_precomputed.py @@ -420,55 +420,6 @@ def model_selection_for_precomputed_kernel(datafile, # np.save(results_name_pre + 'best_gram_matrix_time.dt', # best_gram_matrix_time) - # print out as table. - from collections import OrderedDict - from tabulate import tabulate - table_dict = {} - if model_type == 'regression': - for param_in in param_list: - param_in['alpha'] = '{:.2e}'.format(param_in['alpha']) - else: - for param_in in param_list: - param_in['C'] = '{:.2e}'.format(param_in['C']) - table_dict['params'] = [{**param_out, **param_in} - for param_in in param_list for param_out in param_list_pre_revised] - table_dict['gram_matrix_time'] = [ - '{:.2f}'.format(gram_matrix_time[index_out]) - for param_in in param_list - for index_out, _ in enumerate(param_list_pre_revised) - ] - table_dict['valid_perf'] = [ - '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], - std_val_scores[index_out][index_in]) - for index_in, _ in enumerate(param_list) - for index_out, _ in enumerate(param_list_pre_revised) - ] - table_dict['test_perf'] = [ - '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in], - std_perf_scores[index_out][index_in]) - for index_in, _ in enumerate(param_list) - for index_out, _ in enumerate(param_list_pre_revised) - ] - table_dict['train_perf'] = [ - '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in], - std_train_scores[index_out][index_in]) - for index_in, _ in enumerate(param_list) - for index_out, _ in enumerate(param_list_pre_revised) - ] - keyorder = [ - 'params', 'train_perf', 'valid_perf', 'test_perf', - 'gram_matrix_time' - ] - if verbose: - print() - tb_print = tabulate( - OrderedDict( - sorted(table_dict.items(), - key=lambda i: keyorder.index(i[0]))), - headers='keys') -# print(tb_print) - str_fw += 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print - # read gram matrices from file. else: # Grid of parameters with a discrete number of values for each. 
@@ -632,58 +583,16 @@ def model_selection_for_precomputed_kernel(datafile, # str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt) str_fw += 'training time with hyper-param choices who did not participate in calculation of gram matrices: {:.2f}s\n\n'.format(tt_poster) - # print out as table. - from collections import OrderedDict - from tabulate import tabulate - table_dict = {} - if model_type == 'regression': - for param_in in param_list: - param_in['alpha'] = '{:.2e}'.format(param_in['alpha']) - else: - for param_in in param_list: - param_in['C'] = '{:.2e}'.format(param_in['C']) - table_dict['params'] = [{**param_out, **param_in} - for param_in in param_list for param_out in param_list_pre_revised] -# table_dict['gram_matrix_time'] = [ -# '{:.2f}'.format(gram_matrix_time[index_out]) -# for param_in in param_list -# for index_out, _ in enumerate(param_list_pre_revised) -# ] - table_dict['valid_perf'] = [ - '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], - std_val_scores[index_out][index_in]) - for index_in, _ in enumerate(param_list) - for index_out, _ in enumerate(param_list_pre_revised) - ] - table_dict['test_perf'] = [ - '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in], - std_perf_scores[index_out][index_in]) - for index_in, _ in enumerate(param_list) - for index_out, _ in enumerate(param_list_pre_revised) - ] - table_dict['train_perf'] = [ - '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in], - std_train_scores[index_out][index_in]) - for index_in, _ in enumerate(param_list) - for index_out, _ in enumerate(param_list_pre_revised) - ] - keyorder = [ - 'params', 'train_perf', 'valid_perf', 'test_perf' - ] - if verbose: - print() - tb_print = tabulate( - OrderedDict( - sorted(table_dict.items(), - key=lambda i: keyorder.index(i[0]))), - headers='keys') -# print(tb_print) - str_fw += 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print - # open file to save all results for this dataset. if not os.path.exists(results_dir): os.makedirs(results_dir) + # print out results as table. + str_fw += printResultsInTable(param_list, param_list_pre_revised, average_val_scores, + std_val_scores, average_perf_scores, std_perf_scores, + average_train_scores, std_train_scores, gram_matrix_time, + model_type, verbose) + # open file to save all results for this dataset. 
if not os.path.exists(results_dir + '/' + ds_name + '.output.txt'): with open(results_dir + '/' + ds_name + '.output.txt', 'w') as f: @@ -974,4 +883,55 @@ def read_gram_matrices_from_file(results_dir, ds_name): gram_matrices = gmfile['gms'] # a list to store gram matrices for all param_grid_precomputed param_list_pre_revised = gmfile['params'] # list to store param grids precomputed ignoring the useless ones y = gmfile['y'].tolist() - return gram_matrices, param_list_pre_revised, y \ No newline at end of file + return gram_matrices, param_list_pre_revised, y + + +def printResultsInTable(param_list, param_list_pre_revised, average_val_scores, + std_val_scores, average_perf_scores, std_perf_scores, + average_train_scores, std_train_scores, gram_matrix_time, + model_type, verbose): + from collections import OrderedDict + from tabulate import tabulate + table_dict = {} + if model_type == 'regression': + for param_in in param_list: + param_in['alpha'] = '{:.2e}'.format(param_in['alpha']) + else: + for param_in in param_list: + param_in['C'] = '{:.2e}'.format(param_in['C']) + table_dict['params'] = [{**param_out, **param_in} + for param_in in param_list for param_out in param_list_pre_revised] + table_dict['gram_matrix_time'] = [ + '{:.2f}'.format(gram_matrix_time[index_out]) + for param_in in param_list + for index_out, _ in enumerate(param_list_pre_revised) + ] + table_dict['valid_perf'] = [ + '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], + std_val_scores[index_out][index_in]) + for index_in, _ in enumerate(param_list) + for index_out, _ in enumerate(param_list_pre_revised) + ] + table_dict['test_perf'] = [ + '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in], + std_perf_scores[index_out][index_in]) + for index_in, _ in enumerate(param_list) + for index_out, _ in enumerate(param_list_pre_revised) + ] + table_dict['train_perf'] = [ + '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in], + std_train_scores[index_out][index_in]) + for index_in, _ in enumerate(param_list) + for index_out, _ in enumerate(param_list_pre_revised) + ] + + keyorder = [ + 'params', 'train_perf', 'valid_perf', 'test_perf', + 'gram_matrix_time' + ] + if verbose: + print() + tb_print = tabulate(OrderedDict(sorted(table_dict.items(), + key=lambda i: keyorder.index(i[0]))), headers='keys') +# print(tb_print) + return 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print \ No newline at end of file
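The kernel-space distances inlined in gk_iam and gk_iam_nearest (preimage/gk_iam.py above) all expand the same quantity: the squared distance between a candidate graph g and the point phi = alpha * phi(g1) + (1 - alpha) * phi(g2), written using kernel values only. A minimal standalone sketch of that expansion follows; the helper name and its plain scalar arguments are illustrative and not part of the patch.

def kernel_dist_sq(k_gg, k_gg1, k_gg2, k_g1g1, k_g1g2, k_g2g2, alpha):
    """Squared distance in kernel (feature) space between phi(g) and
    alpha * phi(g1) + (1 - alpha) * phi(g2), expanded via kernel values."""
    return (k_gg
            - 2 * (alpha * k_gg1 + (1 - alpha) * k_gg2)
            + alpha * alpha * k_g1g1
            + 2 * alpha * (1 - alpha) * k_g1g2
            + (1 - alpha) * (1 - alpha) * k_g2g2)

In the patch, k_g1g1, k_g1g2 and k_g2g2 correspond to k_list[idx1], k_g2_list[idx1] (equal to k_g1_list[idx2], since the kernel is symmetric) and k_list[idx2]; the candidate-dependent values come either from k_list / k_g1_list / k_g2_list or from the freshly computed knew matrix.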