Merge branch 'ljia' of https://git.litislab.fr/bgauzere/py-graph into ljia

7 years ago · 1dbe6630c8
--- a/notebooks/py-graph_test.ipynb
+++ b/notebooks/py-graph_test.ipynb
@@ -0,0 +1,170 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "autoscroll": false,
    "ein.tags": "worksheet-0",
    "slideshow": {
     "slide_type": "-"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import paths\n",
    "\n",
    "import pygraph\n",
    "\n",
    "from pygraph.utils.graphfiles import loadDataset\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "autoscroll": false,
    "ein.tags": "worksheet-0",
    "slideshow": {
     "slide_type": "-"
    }
   },
   "outputs": [],
   "source": [
    "import networkx as nx\n",
    "import numpy as np\n",
    "import matplotlib.pyplot  as plt\n",
    "\n",
    "# We load a ds dataset\n",
    "# load it from https://brunl01.users.greyc.fr/CHEMISTRY/Acyclic.tar.gz\n",
    "dataset, y = loadDataset(\"/home/bgauzere/work/Datasets/Acyclic/dataset_bps.ds\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "autoscroll": false,
    "ein.tags": "worksheet-0",
    "slideshow": {
     "slide_type": "-"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 183/183 [07:41<00:00,  2.52s/it]\n",
      "100%|██████████| 183/183 [08:39<00:00,  2.84s/it]\n",
      "100%|██████████| 183/183 [05:19<00:00,  1.75s/it]\n",
      "100%|██████████| 183/183 [05:50<00:00,  1.91s/it]\n"
     ]
    }
   ],
   "source": [
    "#Compute graph edit distances\n",
    "\n",
    "from tqdm import tqdm\n",
    "from pygraph.c_ext.lsape_binders import  lsap_solverHG\n",
    "from pygraph.ged.costfunctions import ConstantCostFunction\n",
    "from pygraph.ged.GED import ged\n",
    "import time\n",
    "\n",
    "cf = ConstantCostFunction(1,3,1,3)\n",
    "N=len(dataset)\n",
    "\n",
    "methods=['Riesen + LSAP', 'Neigh + LSAP', 'Riesen + LSAPE', 'Neigh + LSAPE']\n",
    "ged_distances = [ np.zeros((N,N)), np.zeros((N,N)), np.zeros((N,N)), np.zeros((N,N))]\n",
    "\n",
    "times = list()\n",
    "start = time.clock()\n",
    "for i in tqdm(range(0,N)):\n",
    "    for j in range(0,N):\n",
    "        ged_distances[0][i,j] = ged(dataset[i],dataset[j],cf=cf, method='Riesen')[0]\n",
    "times.append(time.clock() - start)\n",
    "\n",
    "\n",
    "start = time.clock()\n",
    "for i in tqdm(range(0,N)):\n",
    "    for j in range(0,N):\n",
    "        ged_distances[1][i,j] = ged(dataset[i],dataset[j],cf=cf, method='Neighboorhood')[0]\n",
    "\n",
    "times.append(time.clock() - start)\n",
    "\n",
    "start = time.clock()\n",
    "for i in tqdm(range(0,N)):\n",
    "    for j in range(0,N):\n",
    "        ged_distances[2][i,j] = ged(dataset[i],dataset[j],cf=cf, method='Riesen',solver=lsap_solverHG)[0]\n",
    "times.append(time.clock() - start)\n",
    "\n",
    "start = time.clock()\n",
    "for i in tqdm(range(0,N)):\n",
    "    for j in range(0,N):\n",
    "        ged_distances[3][i,j] = ged(dataset[i],dataset[j],cf=cf, method='Neighboorhood',solver=lsap_solverHG)[0]\n",
    "times.append(time.clock() - start)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "autoscroll": false,
    "ein.tags": "worksheet-0",
    "slideshow": {
     "slide_type": "-"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " method \t mean \t mean \t time\n",
      " Riesen + LSAP \t 37.79903849025053 \t 35.31207262086058 \t 463.300405 \n",
      " Neigh + LSAP \t 36.2281047508137 \t 33.85869987159963 \t 521.7821730000001 \n",
      " Riesen + LSAPE \t 35.95508973095643 \t 34.10092866314312 \t 319.83455500000014 \n",
      " Neigh + LSAPE \t 34.5005822807489 \t 32.5735614679447 \t 350.48029599999995 \n"
     ]
    }
   ],
   "source": [
    "print(\" method \\t mean \\t mean \\t time\")\n",
    "data = list()\n",
    "for i in range(0,len(ged_distances)):\n",
    "    ged_ = np.minimum(ged_distances[i],ged_distances[i].transpose())\n",
    "    print(\" {} \\t {} \\t {} \\t {} \".format(methods[i], np.mean(ged_distances[i]),np.mean(ged_), times[i]))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  },
  "name": "py-graph_test.ipynb"
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
--- a/pygraph/init.py
+++ b/pygraph/init.py
@@ -0,0 +1,21 @@
 # -*-coding:utf-8 -*-
 """
 Pygraph

 This package contains 5 sub-packages:
        * c_ext : binders to C++ code
        * ged : allows to compute graph edit distance between networkX graphs
        * kernels : computation of graph kernels, ie graph similarity measure compatible with SVM
        * notebooks : examples of code using this library
        * utils : Diverse computation on graphs
 """

 # info
 __version__ = "0.1"
 __author__  = "Benoit Gaüzère"
 __date__    = "November 2017"
 
 # import sub modules
 from pygraph import c_ext
 from pygraph import ged
 from pygraph import utils
--- a/pygraph/c_ext/Makefile
+++ b/pygraph/c_ext/Makefile
@@ -0,0 +1,5 @@
 # You must specify your env variable LSAPE_DIR
 #LSAPE_DIR=/home/bgauzere/Téléchargements/lsape/include/

 # Build the shared library loaded by lsape_binders.py.
 # The LSAPE headers are located only through $(LSAPE_DIR); no machine-specific
 # include path is hard-coded any more.
 liblsap.so: lsap.cpp
 	g++ -fPIC -shared -O3 -I$(LSAPE_DIR) lsap.cpp -o liblsap.so
--- a/pygraph/c_ext/README.md
+++ b/pygraph/c_ext/README.md
@@ -0,0 +1,6 @@
 Python wrapper for lsape method

 Specify your LSAPE_DIR env variable with the location of the source
 code to compile

 source code : https://bougleux.users.greyc.fr/lsape/
--- a/pygraph/c_ext/init.py
+++ b/pygraph/c_ext/init.py
@@ -0,0 +1,17 @@
 # -*-coding:utf-8 -*-
 """Pygraph - c_ext module

 This package binds some C++ code to python 

 lsape_binders.py : binders to C++ code of LSAPE methods implemented in 
 https://bougleux.users.greyc.fr/lsape/

 """

 # info
 __version__ = "0.1"
 __author__ = "Benoit Gaüzère"
 __date__ = "November 2017"

 # import sub modules
 from pygraph.c_ext import lsape_binders
--- a/pygraph/c_ext/lsap.cpp
+++ b/pygraph/c_ext/lsap.cpp
@@ -0,0 +1,43 @@
 /*
 Python wrapper
 */

 #include "hungarian-lsape.hh"
 #include "hungarian-lsap.hh"

 #include <cstdio>

 /*
  * C-linkage entry point around hungarianLSAP for a square nm x nm cost
  * matrix C. The assignments produced by the solver are written to the
  * caller-provided arrays rho and varrho (nm entries each) as long values.
  * Always returns 0.
  */
 extern "C" int lsap(double * C, const int nm, long * rho, long * varrho){
   // Scratch buffers handed to the solver (u, v and the int-typed assignments).
   double * u = new double[nm];
   double * v = new double[nm];

   int * rho_int = new int[nm];
   int * varrho_int = new int[nm];

   hungarianLSAP(C,nm,nm,rho_int,u,v,varrho_int);

   // The solver works on int arrays while the caller passes long arrays:
   // widen element by element.
   for (int i = 0; i < nm; i++){
     rho[i] = (long)(rho_int[i]);
     varrho[i] = (long)(varrho_int[i]);
   }

   // Release the scratch buffers; they were previously leaked on every call.
   delete[] u;
   delete[] v;
   delete[] rho_int;
   delete[] varrho_int;
   return 0;
 }



 /*
  * C-linkage entry point around hungarianLSAPE for a cost matrix C described
  * by the sizes n and m. The assignments produced by the solver are written
  * to the caller-provided arrays rho (n entries) and varrho (m entries) as
  * long values. The declared return type is kept for interface compatibility;
  * the function always returns a null pointer.
  */
 extern "C" int * lsape(double * C, const int n, const int m, long * rho, long * varrho){
   // Scratch buffers handed to the solver.
   double * u = new double[n];
   double * v = new double[m];

   int * rho_int = new int[n];
   int * varrho_int = new int[m];

   hungarianLSAPE(C,n,m,rho_int,varrho_int,u,v);

   // Widen the int results into the caller's long arrays.
   for (int i = 0; i < n; i++)
     rho[i] = (long)(rho_int[i]);

   for (int i = 0; i < m; i++)
     varrho[i] = (long)(varrho_int[i]);

   // Release the scratch buffers; they were previously leaked on every call.
   delete[] u;
   delete[] v;
   delete[] rho_int;
   delete[] varrho_int;
   return 0;
 }
--- a/pygraph/c_ext/lsape_binders.py
+++ b/pygraph/c_ext/lsape_binders.py
@@ -0,0 +1,23 @@
 import numpy as np
 import ctypes as c
 from ctypes import cdll
 import os.path

 def lsap_solverHG(C):
    """Solve a square assignment problem with the ``lsap`` C binding.

    The shared library ``liblsap.so`` is loaded from the directory of this
    module. Infinite costs are replaced by 10000 before calling the library.

    Parameters
    ----------
    C : numpy array
        Square cost matrix. It is NOT modified: the solver works on a
        C-contiguous float64 copy.

    Return
    ------
    row_ind, col_ind : numpy arrays
        ``row_ind`` is ``0..nm-1`` and ``col_ind`` is the ``varrho`` array
        filled by the library, i.e. the same (rows, columns) pair layout as
        ``scipy.optimize.linear_sum_assignment``.
    """
    nm = C.shape[0]
    dll_name = 'liblsap.so'
    lib = cdll.LoadLibrary(os.path.abspath(
        os.path.join(os.path.dirname(__file__), dll_name)))
    lib.lsap.restype = c.c_int

    # Work on a private row-major float64 copy: the caller's matrix is left
    # untouched and the buffer handed to C always has the same layout,
    # whatever the dtype or memory order of C. (A transposed *view* shares
    # the original buffer, so transposing before taking the pointer has no
    # effect on what the library reads.)
    costs = np.array(C, dtype=np.float64, order='C')
    costs[costs == np.inf] = 10000

    # The C function writes `long` values: size the output buffers accordingly.
    rho = np.zeros(nm, dtype=c.c_long)
    varrho = np.zeros(nm, dtype=c.c_long)

    lib.lsap(c.c_void_p(costs.ctypes.data),
             c.c_int(nm),
             c.c_void_p(rho.ctypes.data),
             c.c_void_p(varrho.ctypes.data))

    return np.arange(nm), varrho.astype(int)
--- a/pygraph/ged/GED.py
+++ b/pygraph/ged/GED.py
@@ -0,0 +1,72 @@
 from pygraph.ged.costfunctions import ConstantCostFunction, RiesenCostFunction
 from pygraph.ged.costfunctions import NeighboorhoodCostFunction
 from pygraph.ged.bipartiteGED import computeBipartiteCostMatrix, getOptimalMapping
 from scipy.optimize import linear_sum_assignment

 def ged(G1, G2, method='Riesen', rho=None, varrho=None,
         cf=ConstantCostFunction(1, 3, 1, 3),
         solver=linear_sum_assignment):
    """Compute the graph edit distance between G1 and G2 induced by a mapping.

    The mapping is encoded by ``rho`` (G1 node -> G2 node) and ``varrho``
    (G2 node -> G1 node). Nodes are used directly as indices into ``rho`` and
    ``varrho``. When either mapping is missing, one is computed from a
    bipartite cost matrix.

    Parameters
    ----------
    G1, G2 : NetworkX graphs
        Graphs to compare.
    method : str
        'Riesen', 'Neighboorhood' or 'Basic'; selects the cost function used
        to build the bipartite cost matrix. Only used when a mapping has to
        be computed.
    rho, varrho : indexable or None
        Node mappings. An image ``>= m`` (resp. ``>= n``) means the node is
        deleted (resp. inserted).
    cf : cost function
        Provides cns/cnd/cni/ces/ced/cei edit costs.
    solver : callable
        Assignment solver, called as ``solver(cost_matrix)``.

    Return
    ------
    (distance, rho, varrho)

    Raises
    ------
    NameError
        If ``method`` is not one of the supported names.

    Notes
    -----
    TODO (from the original author): maybe use dictionaries for the mappings
    to be more versatile.
    """
    if rho is None or varrho is None:
        if method == 'Riesen':
            cf_bp = RiesenCostFunction(cf, lsap_solver=solver)
        elif method == 'Neighboorhood':
            cf_bp = NeighboorhoodCostFunction(cf, lsap_solver=solver)
        elif method == 'Basic':
            cf_bp = cf
        else:
            raise NameError('Non existent method ')

        rho, varrho = getOptimalMapping(
            computeBipartiteCostMatrix(G1, G2, cf_bp), lsap_solver=solver)

    n = G1.number_of_nodes()
    m = G2.number_of_nodes()
    distance = 0

    # Node costs: substitution or deletion for G1 nodes, insertion for
    # G2 nodes that are not the image of a G1 node.
    for i in G1.nodes():
        phi_i = rho[i]
        if phi_i >= m:
            distance += cf.cnd(i, G1)
        else:
            distance += cf.cns(i, phi_i, G1, G2)
    for j in G2.nodes():
        if varrho[j] >= n:
            distance += cf.cni(j, G2)

    # Edges of G1: substituted when both endpoints are mapped onto an
    # existing edge of G2, deleted otherwise.
    for e in G1.edges(data=True):
        phi_i = rho[e[0]]
        phi_j = rho[e[1]]
        if phi_i < m and phi_j < m and phi_j in G2[phi_i]:
            e2 = [phi_i, phi_j, G2[phi_i][phi_j]]
            # Substituting may cost more than deleting then re-inserting.
            distance += min(cf.ces(e, e2, G1, G2),
                            cf.ced(e, G1) + cf.cei(e2, G2))
        else:
            distance += cf.ced(e, G1)

    # Edges of G2 with no counterpart in G1 are inserted.
    for e in G2.edges(data=True):
        phi_i = varrho[e[0]]
        phi_j = varrho[e[1]]
        if phi_i < n and phi_j < n:
            if phi_j not in G1[phi_i]:
                distance += cf.cei(e, G2)
        else:
            # An endpoint of this G2 edge is an inserted node, so the edge is
            # inserted as well: charge the insertion cost (this branch used
            # to charge the deletion cost).
            distance += cf.cei(e, G2)
    return distance, rho, varrho
--- a/pygraph/ged/init.py
+++ b/pygraph/ged/init.py
@@ -0,0 +1,17 @@
 # -*-coding:utf-8 -*-
 """Pygraph - ged module

 Implement some methods to compute ged between graphs


 """

 # info
 __version__ = "0.1"
 __author__ = "Benoit Gaüzère"
 __date__ = "November 2017"

 from pygraph.ged import costfunctions
 from pygraph.ged import bipartiteGED
 from pygraph.ged import GED

--- a/pygraph/ged/bipartiteGED.py
+++ b/pygraph/ged/bipartiteGED.py
@@ -0,0 +1,33 @@
 import numpy as np
 from scipy.optimize import linear_sum_assignment
 from pygraph.ged.costfunctions import ConstantCostFunction


 def computeBipartiteCostMatrix(G1, G2, cf=ConstantCostFunction(1, 3, 1, 3)):
    """Build the (n+m) x (n+m) bipartite cost matrix of G1 and G2 under cf.

    With n and m the node counts of G1 and G2, the top-left n x m block holds
    substitution costs, the entries (v, m + v) hold deletion costs, the
    entries (n + v, v) hold insertion costs, the bottom-right block is zero
    and every remaining entry is infinite. Nodes are used directly as
    matrix indices.
    """
    n1 = G1.number_of_nodes()
    n2 = G2.number_of_nodes()
    size = n1 + n2

    cost_matrix = np.full([size, size], np.inf)
    cost_matrix[n1:, n2:] = 0

    # Substitution block.
    for src in G1.nodes():
        for dst in G2.nodes():
            cost_matrix[src, dst] = cf.cns(src, dst, G1, G2)

    # Deletion entries, one per node of G1.
    for src in G1.nodes():
        cost_matrix[src, n2 + src] = cf.cnd(src, G1)

    # Insertion entries, one per node of G2.
    for dst in G2.nodes():
        cost_matrix[n1 + dst, dst] = cf.cni(dst, G2)

    return cost_matrix


 def getOptimalMapping(C, lsap_solver=linear_sum_assignment):
    """Compute an optimal linear mapping for the cost matrix C.

    ``lsap_solver(C)`` must return ``(row_ind, col_ind)``. The result is the
    pair ``(col_ind, row_ind reordered by ascending col_ind)``, i.e. the
    row -> column mapping and its column -> row counterpart.

    TODO (from the original author): include Seb's C programs.
    """
    rows, cols = lsap_solver(C)
    by_column = np.argsort(cols)
    return cols, rows[by_column]
--- a/pygraph/ged/costfunctions.py
+++ b/pygraph/ged/costfunctions.py
@@ -0,0 +1,138 @@
 import numpy as np
 from scipy.optimize import linear_sum_assignment


 class ConstantCostFunction:
    """Symmetric constant cost function for edit operations.

    Insertion and deletion share the same cost, for nodes (``cni``) as well
    as for edges (``cei``). Substitution costs apply only when the 'label'
    attributes differ.
    """
    def __init__(self, cns, cni, ces, cei):
        """Store the constant costs.

        cns : node substitution cost
        cni : node insertion cost, also used for node deletion
        ces : edge substitution cost
        cei : edge insertion cost, also used for edge deletion
        """
        self.cns_ = cns
        self.cni_ = self.cnd_ = cni
        self.ces_ = ces
        self.cei_ = self.ced_ = cei

    def cns(self, node_u, node_v, g1, g2):
        """Return the substitution cost between node_u of g1 and node_v of g2.

        The cost is 0 when both nodes carry the same 'label', cns otherwise.
        """
        # `G.nodes[...]` is available from networkx 2.0 on, whereas the
        # `G.node` attribute was removed in networkx 2.4.
        return (g1.nodes[node_u]['label'] != g2.nodes[node_v]['label'])*self.cns_

    def cnd(self, u, G1):
        """Return the (constant) cost of deleting node u of G1."""
        return self.cnd_

    def cni(self, v, G2):
        """Return the (constant) cost of inserting node v of G2."""
        return self.cni_

    def ces(self, e1, e2, G1, G2):
        """Return the substitution cost between edges e1 and e2.

        Edges are (u, v, attributes) triples; the cost is 0 when both
        'label' attributes are equal, ces otherwise.

        TODO (from the original author): test with non-symbolic attributes
        by relying on the __eq__ operator.
        """
        return (e1[2]['label'] != e2[2]['label'])*self.ces_

    def ced(self, e1, G1):
        """Return the (constant) cost of deleting edge e1 of G1."""
        return self.ced_

    def cei(self, e2, G2):
        """Return the (constant) cost of inserting edge e2 of G2."""
        return self.cei_


 class RiesenCostFunction():
    """Node cost function used to fill a cost matrix between nodes for LSAP.

    Each node cost is the cost given by the wrapped cost function ``cf`` plus
    a cost computed from the edges incident to the node(s).
    """
    def __init__(self, cf, lsap_solver=linear_sum_assignment):
        self.cf_ = cf
        self.lsap_solver_ = lsap_solver

    def cns(self, u, v, G1, G2):
        """Substitution cost of u (node id in G1) by v (node id in G2).

        An assignment problem is solved between the edges incident to u and
        those incident to v; its cost is added to the wrapped node
        substitution cost.
        """
        nbrs_u = G1[u]
        nbrs_v = G2[v]
        n = len(nbrs_u)
        m = len(nbrs_v)

        sub_C = np.full([n + m, n + m], np.inf)
        sub_C[n:, m:] = 0

        # Edge substitution block.
        for i, nbr_u in enumerate(nbrs_u):
            edge_u = [u, nbr_u, G1[u][nbr_u]]
            for j, nbr_v in enumerate(nbrs_v):
                edge_v = [v, nbr_v, G2[v][nbr_v]]
                sub_C[i, j] = self.cf_.ces(edge_u, edge_v, G1, G2)

        # Edge deletion entries.
        for i, nbr_u in enumerate(nbrs_u):
            sub_C[i, m + i] = self.cf_.ced([u, nbr_u, G1[u][nbr_u]], G1)

        # Edge insertion entries.
        for j, nbr_v in enumerate(nbrs_v):
            sub_C[n + j, j] = self.cf_.cei([v, nbr_v, G2[v][nbr_v]], G2)

        rows, cols = self.lsap_solver_(sub_C)
        edges_cost = np.sum(sub_C[rows, cols])
        return self.cf_.cns(u, v, G1, G2) + edges_cost

    def cnd(self, u, G1):
        """Deletion cost of u: wrapped node cost plus deletion of its edges."""
        edges_cost = 0
        for nbr in G1[u]:
            edges_cost += self.cf_.ced([u, nbr, G1[u][nbr]], G1)
        return self.cf_.cnd(u, G1) + edges_cost

    def cni(self, v, G2):
        """Insertion cost of v: wrapped node cost plus insertion of its edges."""
        edges_cost = 0
        for nbr in G2[v]:
            edges_cost += self.cf_.cei([v, nbr, G2[v][nbr]], G2)
        return self.cf_.cni(v, G2) + edges_cost


 class NeighboorhoodCostFunction():
    """Node cost function used to fill a cost matrix between nodes for LSAP.

    Like the edge-based variant, but the local assignment problem also
    accounts for the costs of the neighbouring nodes themselves.
    """
    def __init__(self, cf, lsap_solver=linear_sum_assignment):
        self.cf_ = cf
        self.lsap_solver_ = lsap_solver

    def cns(self, u, v, G1, G2):
        """Substitution cost of u (node id in G1) by v (node id in G2).

        An assignment problem is solved between the neighbourhoods of u and
        v, where each entry adds the edge cost and the neighbour node cost;
        its cost is added to the wrapped node substitution cost.
        """
        nbrs_u = G1[u]
        nbrs_v = G2[v]
        n = len(nbrs_u)
        m = len(nbrs_v)

        sub_C = np.full([n + m, n + m], np.inf)
        sub_C[n:, m:] = 0

        # Substitution block: edge substitution + neighbour substitution.
        for i, nbr_u in enumerate(nbrs_u):
            edge_u = [u, nbr_u, G1[u][nbr_u]]
            for j, nbr_v in enumerate(nbrs_v):
                edge_v = [v, nbr_v, G2[v][nbr_v]]
                sub_C[i, j] = self.cf_.ces(edge_u, edge_v, G1, G2)
                sub_C[i, j] += self.cf_.cns(nbr_u, nbr_v, G1, G2)

        # Deletion entries: edge deletion + neighbour deletion.
        for i, nbr_u in enumerate(nbrs_u):
            sub_C[i, m + i] = self.cf_.ced([u, nbr_u, G1[u][nbr_u]], G1)
            sub_C[i, m + i] += self.cf_.cnd(nbr_u, G1)

        # Insertion entries: edge insertion + neighbour insertion.
        for j, nbr_v in enumerate(nbrs_v):
            sub_C[n + j, j] = self.cf_.cei([v, nbr_v, G2[v][nbr_v]], G2)
            sub_C[n + j, j] += self.cf_.cni(nbr_v, G2)

        rows, cols = self.lsap_solver_(sub_C)
        neighbourhood_cost = np.sum(sub_C[rows, cols])
        return self.cf_.cns(u, v, G1, G2) + neighbourhood_cost

    def cnd(self, u, G1):
        """Deletion cost of u: wrapped node cost plus deletion of its edges."""
        edges_cost = 0
        for nbr in G1[u]:
            edges_cost += self.cf_.ced([u, nbr, G1[u][nbr]], G1)
        return self.cf_.cnd(u, G1) + edges_cost

    def cni(self, v, G2):
        """Insertion cost of v: wrapped node cost plus insertion of its edges."""
        edges_cost = 0
        for nbr in G2[v]:
            edges_cost += self.cf_.cei([v, nbr, G2[v][nbr]], G2)
        return self.cf_.cni(v, G2) + edges_cost
--- a/pygraph/kernels/.gitignore
+++ b/pygraph/kernels/.gitignore
--- a/pygraph/kernels/spkernel.py
+++ b/pygraph/kernels/spkernel.py
@@ -0,0 +1,68 @@
 import sys
 import pathlib
 sys.path.insert(0, "../")


 import networkx as nx
 import numpy as np
 import time

 from utils.utils import getSPGraph


 def _count_matching_sp_edges(S1, S2):
    """Count pairs of edges of S1 and S2 with equal non-zero cost and equal endpoints."""
    matches = 0
    for e1 in S1.edges(data = True):
        for e2 in S2.edges(data = True):
            same_cost = e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost']
            same_ends = ((e1[0] == e2[0] and e1[1] == e2[1])
                         or (e1[0] == e2[1] and e1[1] == e2[0]))
            if same_cost and same_ends:
                matches += 1
    return matches


 def spkernel(*args):
    """Calculate shortest-path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.

    Return
    ------
    Kmatrix/Kernel : Numpy matrix/int
        Kernel matrix, each element of which is the sp kernel between 2 graphs. / SP Kernel between 2 graphs.

    Notes
    -----
    In both call forms the input graphs are first transformed into their
    shortest-path graphs with getSPGraph. The elapsed time of the kernel
    computation is printed.

    References
    ----------
    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. InData Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE.
    """
    if len(args) == 1: # for a list of graphs
        Gn = args[0]

        Kmatrix = np.zeros((len(Gn), len(Gn)))

        # shortest path graphs of Gn
        Sn = [getSPGraph(G) for G in Gn]

        start_time = time.time()
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                # the kernel is symmetric: compute the upper triangle only
                # and mirror it
                matches = _count_matching_sp_edges(Sn[i], Sn[j])
                Kmatrix[i][j] = matches
                Kmatrix[j][i] = matches

        print("--- shortest path kernel matrix of size %d built in %s seconds ---" % (len(Gn), (time.time() - start_time)))

        return Kmatrix

    else: # for only 2 graphs
        # Same preprocessing as in the list case: the 'cost' edge attribute
        # only exists on shortest-path graphs.
        S1 = getSPGraph(args[0])
        S2 = getSPGraph(args[1])

        # start_time used to be read here without ever being set, which
        # raised a NameError when printing the timing.
        start_time = time.time()
        kernel = _count_matching_sp_edges(S1, S2)

        print("--- shortest path kernel built in %s seconds ---" % (time.time() - start_time))

        return kernel
--- a/pygraph/utils/init.py
+++ b/pygraph/utils/init.py
@@ -0,0 +1,17 @@
 # -*-coding:utf-8 -*-
 """Pygraph - utils module

 Implement some methods to manage graphs
 graphfiles.py : load .gxl and .ct files
 utils.py : compute some properties on networkX graphs


 """

 # info
 __version__ = "0.1"
 __author__ = "Benoit Gaüzère"
 __date__ = "November 2017"

 from pygraph.utils import graphfiles
 from pygraph.utils import utils
--- a/pygraph/utils/graphfiles.py
+++ b/pygraph/utils/graphfiles.py
@@ -0,0 +1,87 @@
 import networkx as nx
   
 def loadCT(filename):
    """Load a graph from a .ct file.

    Parameters
    ----------
    filename : str
        Path of the .ct file.

    Return
    ------
    G : NetworkX graph
        Nodes are numbered from 0 and carry a 'label' attribute (4th field
        of the node line); edges carry an integer 'label' attribute (4th
        field of the edge line). The first line of the file is the graph name.

    Notes
    ------
    a typical example of data in .ct is like this:

     3 2  <- number of nodes and edges
        0.0000    0.0000    0.0000 C <- each line describes a node, the last field is its label, representing a chemical element (the first 3 numbers are presumably coordinates - to be confirmed)
        0.0000    0.0000    0.0000 C
        0.0000    0.0000    0.0000 O
      1  3  1  1 <- each line describes an edge, the first two numbers are its two nodes (1-based), the last number is its label (meaning of the 3rd number to be confirmed)
      2  3  1  1
    """
    # Read everything up front and close the file right away.
    with open(filename) as ct_file:
        content = ct_file.read().splitlines()

    G = nx.Graph(name=str(content[0])) # set name of the graph

    # split() without argument copes with any amount of leading or repeated
    # whitespace, which fixed-width .ct files commonly contain.
    counts = content[1].split()
    nb_nodes = int(counts[0]) # number of the nodes
    nb_edges = int(counts[1]) # number of the edges

    for i in range(0, nb_nodes):
        fields = content[i + 2].split()
        G.add_node(i, label=fields[3])

    for i in range(0, nb_edges):
        fields = content[i + nb_nodes + 2].split()
        # node numbers are 1-based in the file
        G.add_edge(int(fields[0]) - 1, int(fields[1]) - 1, label=int(fields[3]))
    return G


 def loadGXL(filename):
    """Load a graph from a .gxl file.

    Nodes are renumbered from 0 in document order; the original GXL id is
    kept in the 'id' node attribute. Node and edge labels are the text of
    the first child of their first 'attr' element.
    """
    import networkx as nx
    import xml.etree.ElementTree as ET

    root = ET.parse(filename).getroot()
    G = nx.Graph()
    index_of = {}  # GXL node id -> integer node index

    for position, node in enumerate(root.iter('node')):
        label = node.find('attr')[0].text
        gxl_id = node.attrib['id']
        index_of[gxl_id] = position
        G.add_node(position, id=node.attrib['id'], label=label)

    for edge in root.iter('edge'):
        label = edge.find('attr')[0].text
        source = index_of[edge.attrib['from']]
        target = index_of[edge.attrib['to']]
        G.add_edge(source, target, label=label)
    return G
 
 def loadDataset(filename):
    """Load the graphs and targets listed in a dataset file.

    Parameters
    ----------
    filename : str
        Path of a '.ds' file (one "<ct file> <value>" pair per line) or of a
        '.cxl' file (XML with 'print' elements carrying 'file' and 'class'
        attributes). Graph files are looked up relative to the directory of
        the dataset file.

    Return
    ------
    data : list of NetworkX graphs
    y : list
        Float values for '.ds' datasets, class strings for '.cxl' datasets.
        Both lists are empty for any other extension.
    """
    from os.path import dirname, join, splitext

    dirname_dataset = dirname(filename)
    extension = splitext(filename)[1][1:]
    data = []
    y = []
    if extension == "ds":
        with open(filename) as ds_file:
            content = ds_file.read().splitlines()
        for line in content:
            if not line.strip():
                # tolerate blank lines (e.g. at the end of the file)
                continue
            tmp = line.split(' ')
            # remove the first '#' of the file name; join() keeps the path
            # relative when the dataset path has no directory part
            data.append(loadCT(join(dirname_dataset, tmp[0].replace('#', '', 1))))
            y.append(float(tmp[1]))
    elif extension == "cxl":
        import xml.etree.ElementTree as ET

        root = ET.parse(filename).getroot()
        for graph in root.iter('print'):
            mol_filename = graph.attrib['file']
            mol_class = graph.attrib['class']
            data.append(loadGXL(join(dirname_dataset, mol_filename)))
            y.append(mol_class)

    return data, y
--- a/pygraph/utils/utils.py
+++ b/pygraph/utils/utils.py
@@ -0,0 +1,59 @@
 import networkx as nx
 import numpy as np


 def getSPLengths(G1):
    """Return the matrix of shortest-path lengths (in edges) between nodes of G1.

    Nodes are used directly as matrix indices; entries for pairs without a
    path stay at 0.
    """
    all_paths = nx.shortest_path(G1)
    n_nodes = G1.number_of_nodes()
    distances = np.zeros((n_nodes, n_nodes))
    for source in all_paths.keys():
        paths_from_source = all_paths[source]
        for target in paths_from_source.keys():
            # a path lists its nodes, so its length in edges is one less
            distances[source, target] = len(paths_from_source[target]) - 1
    return distances

 def getSPGraph(G):
    """Transform graph G to its corresponding shortest-paths graph.

    Parameters
    ----------
    G : NetworkX graph
        The graph to be transformed.

    Return
    ------
    S : NetworkX graph
        The shortest-paths graph corresponding to G.

    Notes
    ------
    For an input graph G, its corresponding shortest-paths graph S contains the same set of nodes as G, while there exists an edge between all nodes in S which are connected by a walk in G. Every edge in S between two nodes is labeled by the shortest distance between these two nodes.

    The transformation itself is delegated to floydTransformation.

    References
    ----------
    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. InData Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE.
    """
    S = floydTransformation(G)
    return S
            
 def floydTransformation(G):
    """Transform graph G to its corresponding shortest-paths graph using Floyd-transformation.

    Parameters
    ----------
    G : NetworkX graph
        The graph to be transformed.

    Return
    ------
    S : NetworkX graph
        The shortest-paths graph corresponding to G. It has the nodes of G
        (with their attributes) and one edge per index pair (i, j), whose
        'cost' attribute is the entry (i, j) of the Floyd-Warshall matrix.

    References
    ----------
    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. InData Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE.
    """
    # @todo weight label not considered
    sp_costs = nx.floyd_warshall_numpy(G)

    S = nx.Graph()
    S.add_nodes_from(G.nodes(data=True))

    for row in range(0, G.number_of_nodes()):
        for col in range(0, G.number_of_nodes()):
            S.add_edge(row, col, cost = sp_costs[row, col])
    return S
--- a/tests/README.md
+++ b/tests/README.md
@@ -0,0 +1,5 @@
 To use the library : 
 $> virtualenv --python=/usr/bin/python3.5 venv
 $> source venv/bin/activate
 $> pip install -r requirements.txt
 ... Go use pygraph
--- a/tests/opt.py
+++ b/tests/opt.py
@@ -0,0 +1,66 @@
 import ot
 import sys
 import pathlib
 sys.path.insert(0, "../")

 from pygraph.utils.graphfiles import loadDataset
 from pygraph.ged.costfunctions import ConstantCostFunction
 from pygraph.utils.utils import getSPLengths
 from tqdm import tqdm
 import numpy as np
 from scipy.optimize import linear_sum_assignment
 from pygraph.ged.GED import ged
 import scipy

 def pad(C, n):
    """Return an n x n zero matrix whose top-left corner holds a copy of C."""
    n_rows = C.shape[0]
    n_cols = C.shape[1]
    padded = np.zeros((n, n))
    padded[:n_rows, :n_cols] = C
    return padded

 # Script entry point: for every pair of equal-sized graphs of the dataset,
 # compare the edit distance obtained from a Gromov-Wasserstein based mapping
 # with the one obtained from the default mapping of ged(), then print both
 # means and save both lists.
 if (__name__ == "__main__"):
    # NOTE(review): machine-specific absolute path; adapt before running.
    ds_filename = "/home/bgauzere/work/Datasets/Acyclic/dataset_bps.ds"
    dataset, y = loadDataset(ds_filename)
    cf = ConstantCostFunction(1, 3, 1, 3)
    N = len(dataset)

    # (i, j) index pairs for which distances were actually computed
    pairs = list()
    
    ged_distances = list() #np.zeros((N, N))
    gw_distances = list() #np.zeros((N, N))
    for i in tqdm(range(0, N)):
        for j in tqdm(range(i, N)):
            G1 = dataset[i]
            G2 = dataset[j]
            n = G1.number_of_nodes()
            m = G2.number_of_nodes()
            # Only graphs with the same number of nodes are compared.
            if(n == m):
                C1 = getSPLengths(G1)
                C2 = getSPLengths(G2)

                # Scale each shortest-path length matrix by its maximum.
                # NOTE(review): the maximum is 0 for a single-node graph.
                C1 /= C1.max()
                C2 /= C2.max()

                dim = max(n, m)
                # Because of the n == m guard above, neither padding branch
                # can run at the moment.
                if(n < m):
                    C1 = pad(C1, dim)
                elif (m < n):
                    C2 = pad(C2, dim)

                p = ot.unif(dim)
                q = ot.unif(dim)

                # presumably returns the coupling matrix between the two
                # graphs - confirm against the POT version in use
                gw = ot.gromov_wasserstein(C1, C2, p, q,
                                           'square_loss', epsilon=5e-3)
                # Negating gw makes the solver pick the assignment with the
                # largest total gw value.
                row_ind, col_ind = linear_sum_assignment(-gw)
                rho = col_ind
                varrho = row_ind[np.argsort(col_ind)]
                pairs.append((i,j))
                # distance induced by the mapping derived from gw
                gw_distances.append(ged(G1, G2, cf=cf, rho=rho, varrho=varrho)[0])

                # distance induced by the mapping ged() computes by default
                ged_distances.append(ged(G1, G2, cf=cf)[0])

    print("Moyenne sur Riesen : {}".format(np.mean(ged_distances)))
    print("Moyenne sur GW : {} ".format(np.mean(gw_distances)))

    np.save("distances_riesen", ged_distances)
    np.save("distances_gw", gw_distances)
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -0,0 +1,16 @@
 cycler==0.10.0
 Cython==0.27.3
 decorator==4.1.2
 matplotlib==2.1.0
 networkx==2.0
 numpy==1.13.3
 pkg-resources==0.0.0
 POT==0.4.0
 pyparsing==2.2.0
 python-dateutil==2.6.1
 pytz==2017.3
 scikit-learn==0.19.1
 scipy==1.0.0
 six==1.11.0
 sklearn==0.0
 tqdm==4.19.4