
Test of a new preimage algorithm.

v0.1
jajupmochi 6 years ago
parent commit a13dd94f6a
11 changed files with 638 additions and 140 deletions
  1. +3
    -0
      .gitignore
  2. +196
    -0
      preimage/gk_iam.py
  3. +195
    -0
      preimage/iam.py
  4. +5
    -0
      preimage/librariesImport.py
  5. +5
    -0
      preimage/librariesImport2.py
  6. +1
    -0
      preimage/preimage.py
  7. +26
    -0
      preimage/setup.py
  8. +57
    -0
      preimage/test.py
  9. +3
    -3
      pygraph/utils/graphdataset.py
  10. +89
    -39
      pygraph/utils/graphfiles.py
  11. +58
    -98
      pygraph/utils/model_selection_precomputed.py

+ 3
- 0
.gitignore View File

@@ -20,5 +20,8 @@ pygraph/kernels/*_sym.py
*.dat
*.pyc

preimage/*
!preimage/*.py

__pycache__
##*#

+ 196
- 0
preimage/gk_iam.py View File

@@ -0,0 +1,196 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 30 17:07:43 2019

A graph pre-image method combining iterative pre-image method in reference [1]
and the iterative alternate minimizations (IAM) in reference [2].
@author: ljia
@references:
[1] Gökhan H Bakir, Alexander Zien, and Koji Tsuda. Learning to find graph
pre-images. In Joint Pattern Recognition Symposium, pages 253-261. Springer, 2004.
[2] Generalized median graph via iterative alternate minimization.
"""
import sys
import numpy as np
import multiprocessing
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt

from iam import iam


def gk_iam(Gn, alpha):
"""This function constructs graph pre-image by the iterative pre-image
framework in reference [1], algorithm 1, where the step of generating new
graphs randomly is replaced by the IAM algorithm in reference [2].
Notes
-----
Every time a better graph is acquired, the older one is replaced by it.
"""
# compute k nearest neighbors of phi in DN.
dis_list = [] # distance between g_star and each graph.
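# The squared distance in kernel space between phi(g) and
# g_star = alpha * phi(g1) + (1 - alpha) * phi(g2) expands, via the kernel
# trick, to k(g, g) - 2 * (alpha * k(g, g1) + (1 - alpha) * k(g, g2))
# + alpha^2 * k(g1, g1) + 2 * alpha * (1 - alpha) * k(g1, g2)
# + (1 - alpha)^2 * k(g2, g2); the module-level kernel lists supply these values.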
for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
dis_list.append(dtemp)
# sort
sort_idx = np.argsort(dis_list)
dis_gs = [dis_list[idis] for idis in sort_idx[0:k]]
g0hat = Gn[sort_idx[0]] # the nearest neighbor of phi in DN
if dis_gs[0] == 0: # the exact pre-image.
print('The exact pre-image is found from the input dataset.')
return 0, g0hat
dhat = dis_gs[0] # the nearest distance
Gk = [Gn[ig] for ig in sort_idx[0:k]] # the k nearest neighbors
gihat_list = []
# i = 1
r = 1
while r < r_max:
print('r =', r)
# found = False
Gs_nearest = Gk + gihat_list
g_tmp = iam(Gs_nearest)
# compute distance between phi and the new generated graph.
knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None,
p_quit=lmbda, n_iteration=20, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=False)
dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) *
knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
if dnew <= dhat: # the new distance is smaller
print('I am smaller!')
dhat = dnew
g_new = g_tmp.copy() # found better graph.
gihat_list = [g_new]
dis_gs.append(dhat)
r = 0
else:
r += 1
ghat = ([g0hat] if len(gihat_list) == 0 else gihat_list)
return dhat, ghat


def gk_iam_nearest(Gn, alpha):
"""This function constructs graph pre-image by the iterative pre-image
framework in reference [1], algorithm 1, where the step of generating new
graphs randomly is replaced by the IAM algorithm in reference [2].
Notes
-----
Every time a better graph is acquired, its distance in kernel space is
compared with the k nearest ones, and the k nearest distances from the k+1
distances will be used as the new ones.
"""
# compute k nearest neighbors of phi in DN.
dis_list = [] # distance between g_star and each graph.
for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
dis_list.append(dtemp)
# sort
sort_idx = np.argsort(dis_list)
dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] # the k shortest distances
g0hat = Gn[sort_idx[0]] # the nearest neighbor of phi in DN
if dis_gs[0] == 0: # the exact pre-image.
print('The exact pre-image is found from the input dataset.')
return 0, g0hat
dhat = dis_gs[0] # the nearest distance
ghat = g0hat
Gk = [Gn[ig] for ig in sort_idx[0:k]] # the k nearest neighbors
Gs_nearest = Gk
# gihat_list = []
# i = 1
r = 1
while r < r_max:
print('r =', r)
# found = False
# Gs_nearest = Gk + gihat_list
g_tmp = iam(Gs_nearest)
# compute distance between phi and the new generated graph.
knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None,
p_quit=lmbda, n_iteration=20, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=False)
dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) *
knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
if dnew <= dhat: # the new distance is smaller
print('I am smaller!')
dhat = dnew
g_new = g_tmp.copy() # found better graph.
ghat = g_tmp.copy()
dis_gs.append(dhat) # add the new nearest distance.
Gs_nearest.append(g_new) # add the corresponding graph.
sort_idx = np.argsort(dis_gs)
dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]] # the new k nearest distances.
Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
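# dis_gs and Gs_nearest are truncated with the same sort permutation,
# so entry i always pairs a graph with its distance to g_star.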
r = 0
else:
r += 1
return dhat, ghat

if __name__ == '__main__':
import sys
sys.path.insert(0, "../")
from pygraph.kernels.marginalizedKernel import marginalizedkernel
from pygraph.utils.graphfiles import loadDataset
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:10]
lmbda = 0.03 # termination probability
r_max = 10 # maximum number of iterations without improvement
l = 500
alpha_range = np.linspace(0.1, 0.9, 9)
k = 5 # k nearest neighbors
# randomly select two molecules
np.random.seed(1)
idx1, idx2 = np.random.randint(0, len(Gn), 2)
g1 = Gn[idx1]
g2 = Gn[idx2]
# compute
k_list = [] # kernel between each graph and itself.
k_g1_list = [] # kernel between each graph and g1
k_g2_list = [] # kernel between each graph and g2
for ig, g in tqdm(enumerate(Gn), desc='computing self kernels', file=sys.stdout):
ktemp = marginalizedkernel([g, g1, g2], node_label='atom', edge_label=None,
p_quit=lmbda, n_iteration=20, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=False)
k_list.append(ktemp[0][0, 0])
k_g1_list.append(ktemp[0][0, 1])
k_g2_list.append(ktemp[0][0, 2])

g_best = []
dis_best = []
# for each alpha
for alpha in alpha_range:
print('alpha =', alpha)
dhat, ghat = gk_iam_nearest(Gn, alpha)
dis_best.append(dhat)
g_best.append(ghat)
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_best[idx])
print('the corresponding pre-image is')
nx.draw_networkx(g_best[idx])
plt.show()
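
Both functions above inline the same kernel-space distance expansion twice. A minimal sketch of a helper that would make the expansion explicit (the name dist_to_gstar and the explicit kernel-value arguments are illustrative, not part of this commit):

def dist_to_gstar(k_gg, k_gg1, k_gg2, alpha, k11, k12, k22):
    # squared kernel-space distance between phi(g) and
    # alpha * phi(g1) + (1 - alpha) * phi(g2), given the kernel values
    # k(g,g), k(g,g1), k(g,g2), k(g1,g1), k(g1,g2) and k(g2,g2).
    return (k_gg - 2 * (alpha * k_gg1 + (1 - alpha) * k_gg2)
            + alpha ** 2 * k11 + 2 * alpha * (1 - alpha) * k12
            + (1 - alpha) ** 2 * k22)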

+ 195
- 0
preimage/iam.py View File

@@ -0,0 +1,195 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 26 11:49:12 2019

Iterative alternate minimizations using GED.
@author: ljia
"""
import numpy as np
import random
import networkx as nx

import sys
#from Cython_GedLib_2 import librariesImport, script
import librariesImport, script
sys.path.insert(0, "../")
from pygraph.utils.graphfiles import saveDataset
from pygraph.utils.graphdataset import get_dataset_attributes


def iam(Gn, node_label='atom', edge_label='bond_type'):
"""See my name, then you know what I do.
"""
# Gn = Gn[0:10]
Gn = [nx.convert_node_labels_to_integers(g) for g in Gn]
c_er = 1
c_es = 1
c_ei = 1
# phase 1: initialize.
# compute set-median.
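# The set-median is the member of Gn minimizing the sum of GEDs to all
# graphs in Gn; it serves as the starting point of the refinement below.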
dis_min = np.inf
pi_p = []
pi_all = []
for idx1, G_p in enumerate(Gn):
dist_sum = 0
pi_all.append([])
for idx2, G_p_prime in enumerate(Gn):
dist_tmp, pi_tmp = GED(G_p, G_p_prime)
pi_all[idx1].append(pi_tmp)
dist_sum += dist_tmp
if dist_sum < dis_min:
dis_min = dist_sum
G = G_p.copy()
idx_min = idx1
# node maps from the set-median to every graph in Gn.
pi_p = pi_all[idx_min]
# phase 2: iteration.
ds_attrs = get_dataset_attributes(Gn, attr_names=['edge_labeled', 'node_attr_dim'],
edge_label=edge_label)
for itr in range(0, 10):
G_new = G.copy()
# update vertex labels.
# pre-compute h_i0 for each label.
# for label in get_node_labels(Gn, node_label):
# print(label)
# for nd in G.nodes(data=True):
# pass
if not ds_attrs['node_attr_dim']: # labels are symbolic
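# For each node, count over all input graphs how often its image under
# the node map pi_p carries each candidate label (h_i0), then keep the
# most frequent label, breaking ties uniformly at random.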
for nd, _ in G.nodes(data=True):
h_i0_list = []
label_list = []
for label in get_node_labels(Gn, node_label):
h_i0 = 0
for idx, g in enumerate(Gn):
pi_i = pi_p[idx][nd]
if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
h_i0 += 1
h_i0_list.append(h_i0)
label_list.append(label)
# choose one of the best randomly.
idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
idx_rdm = random.randint(0, len(idx_max) - 1)
G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]]
else: # labels are non-symbolic
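# Non-symbolic attributes are replaced by the mean attribute vector of
# the mapped nodes, averaged over the graphs that contain the image node.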
for nd, _ in G.nodes(data=True):
Si_norm = 0
phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
for idx, g in enumerate(Gn):
pi_i = pi_p[idx][nd]
if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0?
Si_norm += 1
phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
phi_i_bar /= Si_norm
G_new.nodes[nd]['attributes'] = phi_i_bar
# update edge labels and adjacency matrix.
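# Decision rule (from the IAM formulation): with edge-removal cost c_er,
# edge-insertion cost c_ei and edge-substitution cost c_es, an edge
# (nd1, nd2) is kept with its majority label only if its support
# (h_ij0_max, or sij_norm in the unlabeled case) exceeds the threshold
# below; otherwise removing the edge is cheaper on average.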
if ds_attrs['edge_labeled']:
for nd1, nd2, _ in G.edges(data=True):
h_ij0_list = []
label_list = []
for label in get_edge_labels(Gn, edge_label):
h_ij0 = 0
for idx, g in enumerate(Gn):
pi_i = pi_p[idx][nd1]
pi_j = pi_p[idx][nd2]
h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
g.has_edge(pi_i, pi_j) and
g.edges[pi_i, pi_j][edge_label] == label)
h_ij0 += h_ij0_p
h_ij0_list.append(h_ij0)
label_list.append(label)
# choose one of the best randomly.
idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
h_ij0_max = h_ij0_list[idx_max[0]]
idx_rdm = random.randint(0, len(idx_max) - 1)
best_label = label_list[idx_max[idx_rdm]]
# check whether a_ij is 0 or 1.
sij_norm = 0
for idx, g in enumerate(Gn):
pi_i = pi_p[idx][nd1]
pi_j = pi_p[idx][nd2]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
if not G_new.has_edge(nd1, nd2):
G_new.add_edge(nd1, nd2)
G_new.edges[nd1, nd2][edge_label] = best_label
else:
if G_new.has_edge(nd1, nd2):
G_new.remove_edge(nd1, nd2)
else: # if edges are unlabeled
for nd1, nd2, _ in G.edges(data=True):
sij_norm = 0
for idx, g in enumerate(Gn):
pi_i = pi_p[idx][nd1]
pi_j = pi_p[idx][nd2]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if sij_norm > len(Gn) * c_er / (c_er + c_ei):
if not G_new.has_edge(nd1, nd2):
G_new.add_edge(nd1, nd2)
else:
if G_new.has_edge(nd1, nd2):
G_new.remove_edge(nd1, nd2)
G = G_new.copy()
return G


def GED(g1, g2, lib='gedlib'):
"""
Compute GED. It is a dummy function for now.
"""
if lib == 'gedlib':
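# Workflow of the gedlib wrapper: dump both graphs to a GXL collection,
# load it into a fresh environment, run the BIPARTITE method with the
# CHEM_1 edit costs, then read back the node map and the distance bounds.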
saveDataset([g1, g2], [None, None], group='xml', filename='ged_tmp/tmp')
script.appel()
script.PyRestartEnv()
script.PyLoadGXLGraph('ged_tmp/', 'collections/tmp.xml')
listID = script.PyGetGraphIds()
script.PySetEditCost("CHEM_1")
script.PyInitEnv()
script.PySetMethod("BIPARTITE", "")
script.PyInitMethod()
g = listID[0]
h = listID[1]
script.PyRunMethod(g, h)
liste = script.PyGetAllMap(g, h)
upper = script.PyGetUpperBound(g, h)
lower = script.PyGetLowerBound(g, h)
dis = upper + lower
pi = liste[0]
return dis, pi


def get_node_labels(Gn, node_label):
nl = set()
for G in Gn:
nl = nl | set(nx.get_node_attributes(G, node_label).values())
return nl


def get_edge_labels(Gn, edge_label):
el = set()
for G in Gn:
el = el | set(nx.get_edge_attributes(G, edge_label).values())
return el


if __name__ == '__main__':
from pygraph.utils.graphfiles import loadDataset
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb
# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
# 'extra_params': {}} # node nsymb
# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
# 'extra_params': {}}
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])

iam(Gn)

+ 5
- 0
preimage/librariesImport.py View File

@@ -0,0 +1,5 @@
from ctypes import *
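# Preload the shared libraries that the compiled GedLib module depends on,
# so that "import script" works without setting LD_LIBRARY_PATH.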
lib1 = cdll.LoadLibrary('lib/fann/libdoublefann.so')
lib2 = cdll.LoadLibrary('lib/libsvm.3.22/libsvm.so')
lib3 = cdll.LoadLibrary('lib/nomad/libnomad.so')
lib4 = cdll.LoadLibrary('lib/nomad/libsgtelib.so')

+ 5
- 0
preimage/librariesImport2.py View File

@@ -0,0 +1,5 @@
from ctypes import *
lib1 = cdll.LoadLibrary('Cython_GedLib_2/lib/fann/libdoublefann.so')
lib2 = cdll.LoadLibrary('Cython_GedLib_2/lib/libsvm.3.22/libsvm.so')
lib3 = cdll.LoadLibrary('Cython_GedLib_2/lib/nomad/libnomad.so')
lib4 = cdll.LoadLibrary('Cython_GedLib_2/lib/nomad/libsgtelib.so')

+ 1
- 0
preimage/preimage.py View File

@@ -126,6 +126,7 @@ for alpha in alpha_range:
dhat = dnew
gnew = gtemp.copy()
found = True # found better graph.
r = 0
if found:
gihat_list = [gnew]
dis_gs.append(dhat)


+ 26
- 0
preimage/setup.py View File

@@ -0,0 +1,26 @@
#from distutils.core import setup
from distutils.extension import Extension
#from Cython.Distutils import build_ext

from distutils.core import setup
from Cython.Build import cythonize

#setup(ext_modules=cythonize("script.pyx"))

extensions = [Extension("script",
sources=["script.pyx", "src/essai.cpp"],
include_dirs=["include","include/lsape", "include/Eigen", "include/nomad", "include/sgtelib", "include/libsvm.3.22", "include/fann", "include/boost_1_69_0"],
library_dirs=["lib/fann","lib/gedlib", "lib/libsvm.3.22","lib/nomad"],
libraries=["doublefann","sgtelib", "svm", "nomad"],
language="c++",
extra_compile_args=["-std=c++11"],
extra_link_args=["-std=c++11"])]

setup(ext_modules=cythonize(extensions))

#extensions = [Extension("script", sources=["script.pyx", "include/gedlib-master/src/env/ged_env.ipp"], include_dirs=["."], language="c++")]
#setup(name = "script", ext_modules = extensions, cmdclass = {'build_ext':build_ext},)


# Bash command: python setup.py build_ext --inplace

+ 57
- 0
preimage/test.py View File

@@ -0,0 +1,57 @@
#export LD_LIBRARY_PATH=.:/export/home/lambertn/Documents/Cython_GedLib_2/lib/fann/:/export/home/lambertn/Documents/Cython_GedLib_2/lib/libsvm.3.22:/export/home/lambertn/Documents/Cython_GedLib_2/lib/nomad

#So that "import script" finds the libraries GedLib needs
#Equivalent to setting the LD_LIBRARY_PATH environment variable in a bash shell
#Allows running from IDLE and the like without setting the environment variable every time
#os.environ does not work in this case
import librariesImport, script

#import script

#truc = script.computeEditDistanceOnGXlGraphs('include/gedlib-master/data/datasets/Mutagenicity/data/','collections/MUTA_10.xml',"CHEM_1", "BIPARTITE", "")
#print(truc)
#script.PyRestartEnv()
#script.appel()

def test() :
# script.appel()
script.PyRestartEnv()
# print("Here is the Python function !")
#
# print("List of Edit Cost Options : ")
# for i in script.listOfEditCostOptions :
# print (i)
# print("")
#
# print("List of Method Options : ")
# for j in script.listOfMethodOptions :
# print (j)
# print("")
script.PyLoadGXLGraph('include/gedlib-master/data/datasets/Mutagenicity/data/', 'collections/MUTA_10.xml')
listID = script.PyGetGraphIds()
afficheId = ""
for i in listID :
afficheId+=str(i) + " "
print("Number of graphs = " + str(len(listID)) + ", list of Ids = " + afficheId)

script.PySetEditCost("CHEM_1")

script.PyInitEnv()

script.PySetMethod("BIPARTITE", "")
script.PyInitMethod()

g = listID[0]
h = listID[1]

script.PyRunMethod(g,h)
liste = script.PyGetAllMap(g,h)
print("Forward map : " ,liste[0], ", Backward map : ", liste[1])
print ("Upper Bound = " + str(script.PyGetUpperBound(g,h)) + ", Lower Bound = " + str(script.PyGetLowerBound(g,h)) + ", Runtime = " + str(script.PyGetRuntime(g,h)))


test()

+ 3
- 3
pygraph/utils/graphdataset.py View File

@@ -52,10 +52,10 @@ def get_dataset_attributes(Gn,
return False if edge_label is None else True

def get_edge_label_num(Gn):
nl = set()
el = set()
for G in Gn:
nl = nl | set(nx.get_edge_attributes(G, edge_label).values())
return len(nl)
el = el | set(nx.get_edge_attributes(G, edge_label).values())
return len(el)

def is_directed(Gn):
return nx.is_directed(Gn[0])


+ 89
- 39
pygraph/utils/graphfiles.py View File

@@ -22,8 +22,8 @@ def loadCT(filename):
with open(filename) as f:
content = f.read().splitlines()
g = nx.Graph(
name=str(content[0]),
filename=basename(filename)) # set name of the graph
name = str(content[0]),
filename = basename(filename)) # set name of the graph
tmp = content[1].split(" ")
if tmp[0] == '':
nb_nodes = int(tmp[1]) # number of the nodes
@@ -84,43 +84,63 @@ def loadGXL(filename):
return g


def saveGXL(graph, filename):
import xml.etree.ElementTree as ET
root_node = ET.Element('gxl')
attr = dict()
attr['id'] = graph.graph['name']
attr['edgeids'] = 'true'
attr['edgemode'] = 'undirected'
graph_node = ET.SubElement(root_node, 'graph', attrib=attr)

for v in graph:
current_node = ET.SubElement(graph_node, 'node', attrib={'id': str(v)})
for attr in graph.nodes[v].keys():
cur_attr = ET.SubElement(
current_node, 'attr', attrib={'name': attr})
cur_value = ET.SubElement(cur_attr,
graph.nodes[v][attr].__class__.__name__)
cur_value.text = graph.nodes[v][attr]

for v1 in graph:
for v2 in graph[v1]:
if (v1 < v2): # Non oriented graphs
cur_edge = ET.SubElement(
graph_node,
'edge',
attrib={
'from': str(v1),
'to': str(v2)
})
for attr in graph[v1][v2].keys():
cur_attr = ET.SubElement(
cur_edge, 'attr', attrib={'name': attr})
cur_value = ET.SubElement(
cur_attr, graph[v1][v2][attr].__class__.__name__)
cur_value.text = str(graph[v1][v2][attr])

tree = ET.ElementTree(root_node)
tree.write(filename)
def saveGXL(graph, filename, method='benoit'):
if method == 'benoit':
import xml.etree.ElementTree as ET
root_node = ET.Element('gxl')
attr = dict()
attr['id'] = str(graph.graph['name'])
attr['edgeids'] = 'true'
attr['edgemode'] = 'undirected'
graph_node = ET.SubElement(root_node, 'graph', attrib=attr)
for v in graph:
current_node = ET.SubElement(graph_node, 'node', attrib={'id': str(v)})
for attr in graph.nodes[v].keys():
cur_attr = ET.SubElement(
current_node, 'attr', attrib={'name': attr})
cur_value = ET.SubElement(cur_attr,
graph.nodes[v][attr].__class__.__name__)
cur_value.text = str(graph.nodes[v][attr])
for v1 in graph:
for v2 in graph[v1]:
if (v1 < v2): # Non oriented graphs
cur_edge = ET.SubElement(
graph_node,
'edge',
attrib={
'from': str(v1),
'to': str(v2)
})
for attr in graph[v1][v2].keys():
cur_attr = ET.SubElement(
cur_edge, 'attr', attrib={'name': attr})
cur_value = ET.SubElement(
cur_attr, graph[v1][v2][attr].__class__.__name__)
cur_value.text = str(graph[v1][v2][attr])
tree = ET.ElementTree(root_node)
tree.write(filename)
elif method == 'gedlib':
# reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22
pass
# gxl_file = open(filename, 'w')
# gxl_file.write("<?xml version=\"1.0\"?>\n")
# gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n")
# gxl_file.write("<gxl>\n")
# gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">\n")
# for v in graph:
# gxl_file.write("<node id=\"_" + str(v) + "\">\n")
# gxl_file.write("<attr name=\"chem\"><int>" + str(self.node_labels[node]) + "</int></attr>\n")
# gxl_file.write("</node>\n")
# for edge in self.edge_list:
# gxl_file.write("<edge from=\"_" + str(edge[0]) + "\" to=\"_" + str(edge[1]) + "\">\n")
# gxl_file.write("<attr name=\"valence\"><int>1</int></attr>\n")
# gxl_file.write("</edge>\n")
# gxl_file.write("</graph>\n")
# gxl_file.write("</gxl>\n")
# gxl_file.close()


def loadSDF(filename):
@@ -412,3 +432,33 @@ def loadDataset(filename, filename_y=None, extra_params=None):
# print(g.edges(data=True))

return data, y


def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile'):
"""Save list of graphs.
"""
import os
dirname_ds = os.path.dirname(filename)
if dirname_ds != '':
dirname_ds += '/'
if not os.path.exists(dirname_ds) :
os.makedirs(dirname_ds)
if group == 'xml' and gformat == 'gxl':
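# Write a gedlib-style collection file: one <graph file="..." class="..."/>
# entry per GXL graph saved next to it.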
with open(filename + '.xml', 'w') as fgroup:
fgroup.write("<?xml version=\"1.0\"?>")
fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"https://dbblumenthal.github.io/gedlib/GraphCollection_8dtd_source.html\">")
fgroup.write("\n<GraphCollection>")
for idx, g in enumerate(Gn):
fname_tmp = "graph" + str(idx) + ".gxl"
saveGXL(g, dirname_ds + fname_tmp)
fgroup.write("\n\t<graph file=\"" + fname_tmp + "\" class=\"" + str(y[idx]) + "\"/>")
fgroup.write("\n</GraphCollection>")


if __name__ == '__main__':
ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat',
'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb
Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
saveDataset(Gn, y, group='xml', filename='temp/temp')

+ 58
- 98
pygraph/utils/model_selection_precomputed.py View File

@@ -420,55 +420,6 @@ def model_selection_for_precomputed_kernel(datafile,
# np.save(results_name_pre + 'best_gram_matrix_time.dt',
# best_gram_matrix_time)
# print out as table.
from collections import OrderedDict
from tabulate import tabulate
table_dict = {}
if model_type == 'regression':
for param_in in param_list:
param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
else:
for param_in in param_list:
param_in['C'] = '{:.2e}'.format(param_in['C'])
table_dict['params'] = [{**param_out, **param_in}
for param_in in param_list for param_out in param_list_pre_revised]
table_dict['gram_matrix_time'] = [
'{:.2f}'.format(gram_matrix_time[index_out])
for param_in in param_list
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['valid_perf'] = [
'{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
std_val_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['test_perf'] = [
'{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
std_perf_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['train_perf'] = [
'{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
std_train_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
keyorder = [
'params', 'train_perf', 'valid_perf', 'test_perf',
'gram_matrix_time'
]
if verbose:
print()
tb_print = tabulate(
OrderedDict(
sorted(table_dict.items(),
key=lambda i: keyorder.index(i[0]))),
headers='keys')
# print(tb_print)
str_fw += 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print
# read gram matrices from file.
else:
# Grid of parameters with a discrete number of values for each.
@@ -632,58 +583,16 @@ def model_selection_for_precomputed_kernel(datafile,
# str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
str_fw += 'training time with hyper-param choices who did not participate in calculation of gram matrices: {:.2f}s\n\n'.format(tt_poster)

# print out as table.
from collections import OrderedDict
from tabulate import tabulate
table_dict = {}
if model_type == 'regression':
for param_in in param_list:
param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
else:
for param_in in param_list:
param_in['C'] = '{:.2e}'.format(param_in['C'])
table_dict['params'] = [{**param_out, **param_in}
for param_in in param_list for param_out in param_list_pre_revised]
# table_dict['gram_matrix_time'] = [
# '{:.2f}'.format(gram_matrix_time[index_out])
# for param_in in param_list
# for index_out, _ in enumerate(param_list_pre_revised)
# ]
table_dict['valid_perf'] = [
'{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
std_val_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['test_perf'] = [
'{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
std_perf_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['train_perf'] = [
'{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
std_train_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
keyorder = [
'params', 'train_perf', 'valid_perf', 'test_perf'
]
if verbose:
print()
tb_print = tabulate(
OrderedDict(
sorted(table_dict.items(),
key=lambda i: keyorder.index(i[0]))),
headers='keys')
# print(tb_print)
str_fw += 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print

# open file to save all results for this dataset.
if not os.path.exists(results_dir):
os.makedirs(results_dir)
# print out results as table.
str_fw += printResultsInTable(param_list, param_list_pre_revised, average_val_scores,
std_val_scores, average_perf_scores, std_perf_scores,
average_train_scores, std_train_scores, gram_matrix_time,
model_type, verbose)
# open file to save all results for this dataset.
if not os.path.exists(results_dir + '/' + ds_name + '.output.txt'):
with open(results_dir + '/' + ds_name + '.output.txt', 'w') as f:
@@ -974,4 +883,55 @@ def read_gram_matrices_from_file(results_dir, ds_name):
gram_matrices = gmfile['gms'] # a list to store gram matrices for all param_grid_precomputed
param_list_pre_revised = gmfile['params'] # list to store param grids precomputed ignoring the useless ones
y = gmfile['y'].tolist()
return gram_matrices, param_list_pre_revised, y
return gram_matrices, param_list_pre_revised, y


def printResultsInTable(param_list, param_list_pre_revised, average_val_scores,
std_val_scores, average_perf_scores, std_perf_scores,
average_train_scores, std_train_scores, gram_matrix_time,
model_type, verbose):
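"""Format cross-validation results as a text table, one row per
combination of pre-computed and post-computed hyper-parameters, and
return it as a string."""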
from collections import OrderedDict
from tabulate import tabulate
table_dict = {}
if model_type == 'regression':
for param_in in param_list:
param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
else:
for param_in in param_list:
param_in['C'] = '{:.2e}'.format(param_in['C'])
table_dict['params'] = [{**param_out, **param_in}
for param_in in param_list for param_out in param_list_pre_revised]
table_dict['gram_matrix_time'] = [
'{:.2f}'.format(gram_matrix_time[index_out])
for param_in in param_list
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['valid_perf'] = [
'{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
std_val_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['test_perf'] = [
'{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
std_perf_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['train_perf'] = [
'{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
std_train_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
keyorder = [
'params', 'train_perf', 'valid_perf', 'test_perf',
'gram_matrix_time'
]
if verbose:
print()
tb_print = tabulate(OrderedDict(sorted(table_dict.items(),
key=lambda i: keyorder.index(i[0]))), headers='keys')
# print(tb_print)
return 'table of performance vs. hyper-params:\n\n%s\n\n' % tb_print
