Merge branch 'master' of https://git.litislab.fr/bgauzere/py-graph

add pygraph/kernels/spkernel.py modify pygraph/utils/util.py and pygraph/utils/graphfiles.py
7 years ago · d8a96ce408
--- a/notebooks/.ipynb_checkpoints/test_lib-checkpoint.ipynb
+++ b/notebooks/.ipynb_checkpoints/test_lib-checkpoint.ipynb
--- a/notebooks/py-graph_test.ipynb
+++ b/notebooks/py-graph_test.ipynb
@@ -0,0 +1,170 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "autoscroll": false,
    "ein.tags": "worksheet-0",
    "slideshow": {
     "slide_type": "-"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import paths\n",
    "\n",
    "import pygraph\n",
    "\n",
    "from pygraph.utils.graphfiles import loadDataset\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "autoscroll": false,
    "ein.tags": "worksheet-0",
    "slideshow": {
     "slide_type": "-"
    }
   },
   "outputs": [],
   "source": [
    "import networkx as nx\n",
    "import numpy as np\n",
    "import matplotlib.pyplot  as plt\n",
    "\n",
    "# We load a ds dataset\n",
    "# load it from https://brunl01.users.greyc.fr/CHEMISTRY/Acyclic.tar.gz\n",
    "dataset, y = loadDataset(\"/home/bgauzere/work/Datasets/Acyclic/dataset_bps.ds\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "autoscroll": false,
    "ein.tags": "worksheet-0",
    "slideshow": {
     "slide_type": "-"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 183/183 [07:41<00:00,  2.52s/it]\n",
      "100%|██████████| 183/183 [08:39<00:00,  2.84s/it]\n",
      "100%|██████████| 183/183 [05:19<00:00,  1.75s/it]\n",
      "100%|██████████| 183/183 [05:50<00:00,  1.91s/it]\n"
     ]
    }
   ],
   "source": [
    "#Compute graph edit distances\n",
    "\n",
    "from tqdm import tqdm\n",
    "from pygraph.c_ext.lsape_binders import  lsap_solverHG\n",
    "from pygraph.ged.costfunctions import ConstantCostFunction\n",
    "from pygraph.ged.GED import ged\n",
    "import time\n",
    "\n",
    "cf = ConstantCostFunction(1,3,1,3)\n",
    "N=len(dataset)\n",
    "\n",
    "methods=['Riesen + LSAP', 'Neigh + LSAP', 'Riesen + LSAPE', 'Neigh + LSAPE']\n",
    "ged_distances = [ np.zeros((N,N)), np.zeros((N,N)), np.zeros((N,N)), np.zeros((N,N))]\n",
    "\n",
    "times = list()\n",
    "start = time.clock()\n",
    "for i in tqdm(range(0,N)):\n",
    "    for j in range(0,N):\n",
    "        ged_distances[0][i,j] = ged(dataset[i],dataset[j],cf=cf, method='Riesen')[0]\n",
    "times.append(time.clock() - start)\n",
    "\n",
    "\n",
    "start = time.clock()\n",
    "for i in tqdm(range(0,N)):\n",
    "    for j in range(0,N):\n",
    "        ged_distances[1][i,j] = ged(dataset[i],dataset[j],cf=cf, method='Neighboorhood')[0]\n",
    "\n",
    "times.append(time.clock() - start)\n",
    "\n",
    "start = time.clock()\n",
    "for i in tqdm(range(0,N)):\n",
    "    for j in range(0,N):\n",
    "        ged_distances[2][i,j] = ged(dataset[i],dataset[j],cf=cf, method='Riesen',solver=lsap_solverHG)[0]\n",
    "times.append(time.clock() - start)\n",
    "\n",
    "start = time.clock()\n",
    "for i in tqdm(range(0,N)):\n",
    "    for j in range(0,N):\n",
    "        ged_distances[3][i,j] = ged(dataset[i],dataset[j],cf=cf, method='Neighboorhood',solver=lsap_solverHG)[0]\n",
    "times.append(time.clock() - start)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "autoscroll": false,
    "ein.tags": "worksheet-0",
    "slideshow": {
     "slide_type": "-"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " method \t mean \t mean \t time\n",
      " Riesen + LSAP \t 37.79903849025053 \t 35.31207262086058 \t 463.300405 \n",
      " Neigh + LSAP \t 36.2281047508137 \t 33.85869987159963 \t 521.7821730000001 \n",
      " Riesen + LSAPE \t 35.95508973095643 \t 34.10092866314312 \t 319.83455500000014 \n",
      " Neigh + LSAPE \t 34.5005822807489 \t 32.5735614679447 \t 350.48029599999995 \n"
     ]
    }
   ],
   "source": [
    "print(\" method \\t mean \\t mean \\t time\")\n",
    "data = list()\n",
    "for i in range(0,len(ged_distances)):\n",
    "    ged_ = np.minimum(ged_distances[i],ged_distances[i].transpose())\n",
    "    print(\" {} \\t {} \\t {} \\t {} \".format(methods[i], np.mean(ged_distances[i]),np.mean(ged_), times[i]))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  },
  "name": "py-graph_test.ipynb"
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
--- a/pygraph/init.py
+++ b/pygraph/init.py
@@ -0,0 +1,21 @@
 # -*-coding:utf-8 -*-
 """
 Pygraph

 This package contains the following sub packages:
        * c_ext : binders to C++ code
        * ged : allows to compute graph edit distance between networkX graphs
        * kernels : computation of graph kernels, ie graph similarity measure compatible with SVM
        * notebooks : examples of code using this library
        * utils : Diverse computation on graphs
 """

 # info
 __version__ = "0.1"
 __author__  = "Benoit Gaüzère"
 __date__    = "November 2017"
 
 # import sub modules
 from pygraph import c_ext
 from pygraph import ged
 from pygraph import utils
--- a/pygraph/c_ext/Makefile
+++ b/pygraph/c_ext/Makefile
@@ -0,0 +1,5 @@
 # You must specify your env variable LSAPE_DIR
 #LSAPE_DIR=/home/bgauzere/Téléchargements/lsape/include/

 # Build the shared library loaded by lsape_binders.py. The LSAPE headers are
 # taken from $(LSAPE_DIR) only, so no machine-specific include path is needed.
 liblsap.so:lsap.cpp
 	g++ -fPIC -shared lsap.cpp -o liblsap.so -O3 -I$(LSAPE_DIR)
--- a/pygraph/c_ext/README.md
+++ b/pygraph/c_ext/README.md
@@ -0,0 +1,6 @@
 Python wrapper for lsape method

 Set the LSAPE_DIR environment variable to the location of the LSAPE
 headers (its include/ directory) before compiling.

 source code : https://bougleux.users.greyc.fr/lsape/
--- a/pygraph/c_ext/init.py
+++ b/pygraph/c_ext/init.py
@@ -0,0 +1,17 @@
 # -*-coding:utf-8 -*-
 """Pygraph - c_ext module

 This package binds some C++ code to python 

 lsape_binders.py : binders to C++ code of LSAPE methods implemented in 
 https://bougleux.users.greyc.fr/lsape/

 """

 # info
 __version__ = "0.1"
 __author__ = "Benoit Gaüzère"
 __date__ = "November 2017"

 # import sub modules
 from pygraph.c_ext import lsape_binders
--- a/pygraph/c_ext/lsap.cpp
+++ b/pygraph/c_ext/lsap.cpp
@@ -0,0 +1,43 @@
 /*
 Python wrapper
 */

 #include "hungarian-lsape.hh"
 #include "hungarian-lsap.hh"

 #include <cstdio>

 // C entry point wrapping hungarianLSAP for a square nm x nm cost matrix.
 //   C      : cost matrix buffer, forwarded unchanged to hungarianLSAP
 //   nm     : number of rows and of columns
 //   rho    : output buffer of nm longs, filled from the solver's rho output
 //   varrho : output buffer of nm longs, filled from the solver's varrho output
 // The solver writes int arrays, so the results are widened element by element
 // into the caller's long buffers. Always returns 0.
 // NOTE(review): rho/varrho are presumably the row->column and column->row
 // assignments -- confirm against hungarian-lsap.hh.
 extern "C" int lsap(double * C, const int nm, long * rho, long * varrho){
  double * u = new double[nm];
  double * v = new double[nm];

  int * rho_int = new int[nm];
  int * varrho_int = new int[nm];

  hungarianLSAP(C,nm,nm,rho_int,u,v,varrho_int);
  //Find a better way to do
  for (int i =0;i<nm;i++){
    rho[i] = (long)(rho_int[i]);
    varrho[i] = (long)(varrho_int[i]);
  }

  // Release the scratch buffers; they used to leak on every call.
  delete[] varrho_int;
  delete[] rho_int;
  delete[] v;
  delete[] u;
  return 0;
 }



 // C entry point wrapping hungarianLSAPE for an n x m cost matrix.
 //   C      : cost matrix buffer, forwarded unchanged to hungarianLSAPE
 //   n, m   : dimensions passed to the solver
 //   rho    : output buffer of n longs, filled from the solver's rho output
 //   varrho : output buffer of m longs, filled from the solver's varrho output
 // The solver writes int arrays, so the results are widened element by element
 // into the caller's long buffers.
 // NOTE(review): the function is declared to return int* but always returns a
 // null pointer (0); the signature is kept as-is so existing callers still link.
 extern "C" int * lsape(double * C, const int n, const int m, long * rho, long * varrho){
  double * u = new double[n];
  double * v = new double[m];

  int * rho_int = new int[n];
  int * varrho_int = new int[m];

  hungarianLSAPE(C,n,m,rho_int,varrho_int,u,v);
  for (int i =0;i<n;i++)
    rho[i] = (long)(rho_int[i]);

  for (int i =0;i<m;i++)
    varrho[i] = (long)(varrho_int[i]);

  // Release the scratch buffers; they used to leak on every call.
  delete[] varrho_int;
  delete[] rho_int;
  delete[] v;
  delete[] u;
  return 0;
 }
--- a/pygraph/c_ext/lsape_binders.py
+++ b/pygraph/c_ext/lsape_binders.py
@@ -0,0 +1,23 @@
 import numpy as np
 import ctypes as c
 from ctypes import cdll
 import os.path

 def lsap_solverHG(C):
    """Solve a square assignment problem with the C++ solver in liblsap.so.

    The function is meant to be used in place of
    ``scipy.optimize.linear_sum_assignment`` (it is passed as ``solver`` /
    ``lsap_solver`` elsewhere in the package).

    Parameters
    ----------
    C : numpy array
        Square cost matrix; only ``C.shape[0]`` is used as its size. Entries
        equal to ``np.inf`` are replaced by 10000 before solving. The caller's
        array is left untouched.

    Return
    ------
    row_ind, col_ind : numpy arrays
        ``row_ind`` is ``0..nm-1`` and ``col_ind`` holds the ``varrho`` values
        written by the C function ``lsap``.
    """
    nm = C.shape[0]
    dll_name = 'liblsap.so'
    lib = cdll.LoadLibrary(os.path.abspath(
        os.path.join(os.path.dirname(__file__), dll_name)))
    lib.lsap.restype = c.c_int

    # Work on a private float64, C-contiguous copy: the C code receives a raw
    # double* and the inf replacement must not modify the caller's matrix.
    # The previous `C.transpose()` was only a view over the same buffer, so
    # the pointer handed to the library is the same row-major data as before.
    costs = np.array(C, dtype=np.float64, order='C')
    costs[costs == np.inf] = 10000

    # The C signature takes `long *`; c_long keeps the element size in sync
    # with the compiled library on every platform.
    rho = np.zeros(nm, dtype=c.c_long)
    varrho = np.zeros(nm, dtype=c.c_long)

    lib.lsap(c.c_void_p(costs.ctypes.data),
             c.c_int(nm),
             c.c_void_p(rho.ctypes.data),
             c.c_void_p(varrho.ctypes.data))

    return np.arange(nm), varrho.astype(int)
--- a/pygraph/ged/GED.py
+++ b/pygraph/ged/GED.py
@@ -1,10 +1,11 @@
 from ged.costfunctions import BasicCostFunction, RiesenCostFunction
 from ged.costfunctions import NeighboorhoodCostFunction
 from ged.bipartiteGED import computeBipartiteCostMatrix, getOptimalMapping

 from pygraph.ged.costfunctions import ConstantCostFunction, RiesenCostFunction
 from pygraph.ged.costfunctions import NeighboorhoodCostFunction
 from pygraph.ged.bipartiteGED import computeBipartiteCostMatrix, getOptimalMapping
 from scipy.optimize import linear_sum_assignment

 def ged(G1, G2, method='Riesen', rho=None, varrho=None,
        cf=BasicCostFunction(1, 3, 1, 3)):
        cf=ConstantCostFunction(1, 3, 1, 3),
        solver=linear_sum_assignment):
    """Compute Graph Edit Distance between G1 and G2 according to mapping
    encoded within rho and varrho. Graph's node must be indexed by a
    index which is used is rho and varrho 
@@ -14,31 +15,32 @@ def ged(G1, G2, method='Riesen', rho=None, varrho=None,
    """
    if ((rho is None) or (varrho is None)):
        if(method == 'Riesen'):
            cf_bp = RiesenCostFunction(cf)
            cf_bp = RiesenCostFunction(cf,lsap_solver=solver)
        elif(method == 'Neighboorhood'):
            cf_bp = NeighboorhoodCostFunction(cf)
            cf_bp = NeighboorhoodCostFunction(cf,lsap_solver=solver)
        elif(method == 'Basic'):
            cf_bp = cf
        else:
            raise NameError('Non existent method ')

        rho, varrho = getOptimalMapping(computeBipartiteCostMatrix(G1, G2, cf_bp))
        rho, varrho = getOptimalMapping(
            computeBipartiteCostMatrix(G1, G2, cf_bp), lsap_solver=solver)

    n = G1.number_of_nodes()
    m = G2.number_of_nodes()
    ged = 0
    for i in G1.nodes_iter():
    for i in G1.nodes():
        phi_i = rho[i]
        if(phi_i >= m):
            ged += cf.cnd(i, G1)
        else:
            ged += cf.cns(i, phi_i, G1, G2)
    for j in G2.nodes_iter():
    for j in G2.nodes():
        phi_j = varrho[j]
        if(phi_j >= n):
            ged += cf.cni(j, G2)

    for e in G1.edges_iter(data=True):
    for e in G1.edges(data=True):
        i = e[0]
        j = e[1]
        phi_i = rho[i]
@@ -49,13 +51,13 @@ def ged(G1, G2, method='Riesen', rho=None, varrho=None,
            if(mappedEdge):
                e2 = [phi_i, phi_j, G2[phi_i][phi_j]]
                min_cost = min(cf.ces(e, e2, G1, G2),
                               cf.ced(e, G1), cf.cei(e2, G2))
                               cf.ced(e, G1) + cf.cei(e2, G2))
                ged += min_cost
            else:
                ged += cf.ced(e, G1)
        else:
            ged += cf.ced(e, G1)
    for e in G2.edges_iter(data=True):
    for e in G2.edges(data=True):
        i = e[0]
        j = e[1]
        phi_i = varrho[i]
@@ -68,7 +70,3 @@ def ged(G1, G2, method='Riesen', rho=None, varrho=None,
        else:
            ged += cf.ced(e, G2)
    return ged, rho, varrho


 def computeDistanceMatrix(dataset):
        pass
--- a/pygraph/ged/init.py
+++ b/pygraph/ged/init.py
@@ -0,0 +1,17 @@
 # -*-coding:utf-8 -*-
 """Pygraph - ged module

 Implement some methods to compute ged between graphs


 """

 # info
 __version__ = "0.1"
 __author__ = "Benoit Gaüzère"
 __date__ = "November 2017"

 from pygraph.ged import costfunctions
 from pygraph.ged import bipartiteGED
 from pygraph.ged import GED

--- a/pygraph/ged/bipartiteGED.py
+++ b/pygraph/ged/bipartiteGED.py
@@ -1,9 +1,9 @@
 import numpy as np
 from scipy.optimize import linear_sum_assignment
 from ged.costfunctions import BasicCostFunction
 from pygraph.ged.costfunctions import ConstantCostFunction


 def computeBipartiteCostMatrix(G1, G2, cf=BasicCostFunction(1, 3, 1, 3)):
 def computeBipartiteCostMatrix(G1, G2, cf=ConstantCostFunction(1, 3, 1, 3)):
    """Compute a Cost Matrix according to cost function cf"""
    n = G1.number_of_nodes()
    m = G2.number_of_nodes()
@@ -11,23 +11,23 @@ def computeBipartiteCostMatrix(G1, G2, cf=BasicCostFunction(1, 3, 1, 3)):
    C = np.ones([nm, nm])*np.inf
    C[n:, m:] = 0

    for u in G1.nodes_iter():
        for v in G2.nodes_iter():
    for u in G1.nodes():
        for v in G2.nodes():
            cost = cf.cns(u, v, G1, G2)
            C[u, v] = cost

    for v in G1.nodes_iter():
    for v in G1.nodes():
        C[v, m + v] = cf.cnd(v, G1)

    for v in G2.nodes_iter():
    for v in G2.nodes():
        C[n + v, v] = cf.cni(v, G2)
    return C


 def getOptimalMapping(C):
 def getOptimalMapping(C, lsap_solver=linear_sum_assignment):
    """Compute an optimal linear mapping according to cost Matrix C
    inclure les progs C de Seb

    """
    row_ind, col_ind = linear_sum_assignment(C)
    row_ind, col_ind = lsap_solver(C)
    return col_ind, row_ind[np.argsort(col_ind)]
--- a/pygraph/ged/costfunctions.py
+++ b/pygraph/ged/costfunctions.py
@@ -2,15 +2,17 @@ import numpy as np
 from scipy.optimize import linear_sum_assignment


 class BasicCostFunction:
 class ConstantCostFunction:
    """ Define a symmetric constant cost fonction for edit operations """
    def __init__(self, cns, cni, ces, cei):
        self.cns_ = cns
        self.cni_ = self.cnd_ = cni
        self.ces_ = ces
        self.cei_ = self.ced_ = cei

    def cns(self, u, v, G1, G2):
        return (G1.node[u]['label'] != G2.node[v]['label'])*self.cns_
    def cns(self, node_u, node_v, g1, g2):
        """ return substitution edit operation cost between node_u of G1 and node_v of G2"""
        return (g1.node[node_u]['label'] != g2.node[node_v]['label'])*self.cns_

    def cnd(self, u, G1):
        return self.cnd_
@@ -30,9 +32,11 @@ class BasicCostFunction:
        return self.cei_


 class RiesenCostFunction(BasicCostFunction):
    def __init__(self, cf):
        BasicCostFunction.__init__(self, cf.cns_, cf.cni_, cf.ces_, cf.cei_)
 class RiesenCostFunction():
    """ Cost function associated to the computation of a cost matrix between nodes for LSAP"""
    def __init__(self, cf, lsap_solver=linear_sum_assignment):
        self.cf_ = cf
        self.lsap_solver_ = lsap_solver

    def cns(self, u, v, G1, G2):
        """ u et v sont des id de noeuds """
@@ -48,41 +52,43 @@ class RiesenCostFunction(BasicCostFunction):
            e1 = [u, nbr_u, G1[u][nbr_u]]
            for nbr_v in G2[v]:
                e2 = [v, nbr_v, G2[v][nbr_v]]
                sub_C[i, j] = self.ces(e1, e2, G1, G2)
                sub_C[i, j] = self.cf_.ces(e1, e2, G1, G2)
                j += 1
            i += 1

        i = 0
        for nbr_u in l_nbr_u:
            sub_C[i, m+i] = self.ced([u, nbr_u, G1[u][nbr_u]], G1)
            sub_C[i, m+i] = self.cf_.ced([u, nbr_u, G1[u][nbr_u]], G1)
            i += 1

        j = 0
        for nbr_v in l_nbr_v:
            sub_C[n+j, j] = self.cei([v, nbr_v, G2[v][nbr_v]], G2)
            sub_C[n+j, j] = self.cf_.cei([v, nbr_v, G2[v][nbr_v]], G2)
            j += 1
        row_ind, col_ind = linear_sum_assignment(sub_C)
        row_ind, col_ind = self.lsap_solver_(sub_C)
        cost = np.sum(sub_C[row_ind, col_ind])
        return BasicCostFunction.cns(self, u, v, G1, G2) + cost
        return self.cf_.cns(u, v, G1, G2) + cost

    def cnd(self, u, G1):
        cost = 0
        for nbr in G1[u]:
            cost += BasicCostFunction.ced(self,[u,nbr,G1[u][nbr]],G1)
            cost += self.cf_.ced([u,nbr,G1[u][nbr]],G1)
            
        return BasicCostFunction.cnd(self,u,G1) + cost
        return self.cf_.cnd(u,G1) + cost

    def cni(self, v, G2):
        cost = 0
        for nbr in G2[v]:
            cost += BasicCostFunction.cei(self, [v,nbr,G2[v][nbr]], G2)
            cost += self.cf_.cei([v,nbr,G2[v][nbr]], G2)
            
        return BasicCostFunction.cni(self, v, G2) + cost
        return self.cf_.cni(v, G2) + cost


 class NeighboorhoodCostFunction(BasicCostFunction):
    def __init__(self, cf):
        BasicCostFunction.__init__(self, cf.cns_, cf.cni_, cf.ces_, cf.cei_)
 class NeighboorhoodCostFunction():
    """ Cost function associated to the computation of a cost matrix between nodes for LSAP"""
    def __init__(self, cf, lsap_solver=linear_sum_assignment):
        self.cf_ =  cf
        self.lsap_solver_ = lsap_solver

    def cns(self, u, v, G1, G2):
        """ u et v sont des id de noeuds """
@@ -98,36 +104,35 @@ class NeighboorhoodCostFunction(BasicCostFunction):
            e1 = [u, nbr_u, G1[u][nbr_u]]
            for nbr_v in G2[v]:
                e2 = [v, nbr_v, G2[v][nbr_v]]
                sub_C[i, j] = self.ces(e1, e2, G1, G2)
                sub_C[i, j] += BasicCostFunction.cns(self,
                                                     nbr_u, nbr_v, G1, G2)
                sub_C[i, j] = self.cf_.ces(e1, e2, G1, G2)
                sub_C[i, j] += self.cf_.cns(nbr_u, nbr_v, G1, G2)
                j += 1
            i += 1

        i = 0
        for nbr_u in l_nbr_u:
            sub_C[i, m+i] = self.ced([u, nbr_u, G1[u][nbr_u]], G1)
            sub_C[i, m+i] += BasicCostFunction.cnd(self, nbr_u, G1)
            sub_C[i, m+i] = self.cf_.ced([u, nbr_u, G1[u][nbr_u]], G1)
            sub_C[i, m+i] += self.cf_.cnd(nbr_u, G1)
            i += 1

        j = 0
        for nbr_v in l_nbr_v:
            sub_C[n+j, j] = self.cei([v, nbr_v, G2[v][nbr_v]], G2)
            sub_C[n+j, j] += BasicCostFunction.cni(self, nbr_v, G2)
            sub_C[n+j, j] = self.cf_.cei([v, nbr_v, G2[v][nbr_v]], G2)
            sub_C[n+j, j] += self.cf_.cni(nbr_v, G2)
            j += 1

        row_ind, col_ind = linear_sum_assignment(sub_C)
        row_ind, col_ind = self.lsap_solver_(sub_C)
        cost = np.sum(sub_C[row_ind, col_ind])
        return BasicCostFunction.cns(self, u, v, G1, G2) + cost
        return self.cf_.cns(u, v, G1, G2) + cost

    def cnd(self, u, G1):
        cost = 0
        for nbr in G1[u]:
            cost += BasicCostFunction.ced(self, [u, nbr, G1[u][nbr]], G1)
        return BasicCostFunction.cnd(self, u, G1) + cost
            cost += self.cf_.ced([u, nbr, G1[u][nbr]], G1)
        return self.cf_.cnd(u, G1) + cost

    def cni(self, v, G2):
        cost = 0
        for nbr in G2[v]:
            cost += BasicCostFunction.cei(self, [v, nbr, G2[v][nbr]], G2)
        return BasicCostFunction.cni(self, v, G2) + cost
            cost += self.cf_.cei([v, nbr, G2[v][nbr]], G2)
        return self.cf_.cni(v, G2) + cost
--- a/pygraph/kernels/.gitignore
+++ b/pygraph/kernels/.gitignore
--- a/pygraph/kernels/spkernel.py
+++ b/pygraph/kernels/spkernel.py
@@ -0,0 +1,68 @@
 import sys
 import pathlib
 sys.path.insert(0, "../")


 import networkx as nx
 import numpy as np
 import time

 from utils.utils import getSPGraph


 def _countCommonSPEdges(S1, S2):
    """Count the edges of S1 that exist in S2 with the same non-zero cost.

    Both arguments are shortest-path graphs as built by getSPGraph
    (undirected, every edge carrying a 'cost' attribute). An edge matches
    when S2 has an edge between the same two node ids, in either
    orientation, with an equal 'cost'. Edges whose cost is 0 are ignored.
    """
    common = 0
    for u, v, attrs in S1.edges(data=True):
        cost = attrs['cost']
        # Direct lookup replaces the former scan over every edge of S2.
        if cost != 0 and S2.has_edge(u, v) and S2[u][v]['cost'] == cost:
            common += 1
    return common


 def spkernel(*args):
    """Calculate shortest-path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.

    Return
    ------
    Kmatrix/Kernel : Numpy matrix/int
        Kernel matrix, each element of which is the sp kernel between 2 graphs. / SP Kernel between 2 graphs.

    Notes
    -----
    Every input graph is first converted to its shortest-path graph with
    getSPGraph. The kernel value is the number of edges the two
    shortest-path graphs share (same end nodes, same non-zero cost). The
    elapsed time of the kernel computation is printed.

    References
    ----------
    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. InData Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE.
    """
    if len(args) == 1: # for a list of graphs
        Gn = args[0]

        Kmatrix = np.zeros((len(Gn), len(Gn)))

        Sn = [getSPGraph(G) for G in Gn] # get shortest path graphs of Gn

        start_time = time.time()
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                # the kernel is symmetric: compute once, store twice
                common = _countCommonSPEdges(Sn[i], Sn[j])
                Kmatrix[i][j] = common
                Kmatrix[j][i] = common

        print("--- shortest path kernel matrix of size %d built in %s seconds ---" % (len(Gn), (time.time() - start_time)))

        return Kmatrix

    else: # for only 2 graphs
        # Convert to shortest-path graphs like the list branch does; the raw
        # input graphs carry no 'cost' attribute on their edges.
        G1 = getSPGraph(args[0])
        G2 = getSPGraph(args[1])

        # start_time was never set in this branch, which made the final
        # print raise a NameError.
        start_time = time.time()
        kernel = _countCommonSPEdges(G1, G2)

        print("--- shortest path kernel built in %s seconds ---" % (time.time() - start_time))

        return kernel
--- a/pygraph/utils/init.py
+++ b/pygraph/utils/init.py
@@ -0,0 +1,17 @@
 # -*-coding:utf-8 -*-
 """Pygraph - utils module

 Implement some methods to manage graphs
 graphfiles.py : load .gxl and .ct files
 utils.py : compute some properties on networkX graphs


 """

 # info
 __version__ = "0.1"
 __author__ = "Benoit Gaüzère"
 __date__ = "November 2017"

 from pygraph.utils import graphfiles
 from pygraph.utils import utils
--- a/pygraph/utils/graphfiles.py
+++ b/pygraph/utils/graphfiles.py
@@ -1,13 +1,25 @@
 import networkx as nx


   
 def loadCT(filename):
    """load data from .ct file.
    
    Notes
    ------ 
    a typical example of data in .ct is like this:
    
     3 2  <- number of nodes and edges
        0.0000    0.0000    0.0000 C <- each line describes a node, the last parameter in which is the label of the node, representing a chemical element @Q what are the first 3 numbers?
        0.0000    0.0000    0.0000 C
        0.0000    0.0000    0.0000 O
      1  3  1  1 <- each line describes an edge, the first two numbers represent two nodes of the edge, the last number represents the label. @Q what are the 3th numbers?
      2  3  1  1
    """
    content = open(filename).read().splitlines()
    G = nx.Graph(name=str(content[0]))
    G = nx.Graph(name=str(content[0])) # set name of the graph
    tmp = content[1].split(" ")
    if tmp[0] == '':
        nb_nodes = int(tmp[1])
        nb_edges = int(tmp[2])
        nb_nodes = int(tmp[1]) # number of the nodes
        nb_edges = int(tmp[2]) # number of the edges
    else:
        nb_nodes = int(tmp[0])
        nb_edges = int(tmp[1])
@@ -18,7 +30,7 @@ def loadCT(filename):
        G.add_node(i, label=tmp[3])

    for i in range(0, nb_edges):
        tmp = content[i+G.number_of_nodes()+2].split(" ")
        tmp = content[i + G.number_of_nodes() + 2].split(" ")
        tmp = [x for x in tmp if x != '']
        G.add_edge(int(tmp[0]) - 1, int(tmp[1]) - 1, label=int(tmp[3]))
    return G
@@ -43,9 +55,10 @@ def loadGXL(filename):
        label = edge.find('attr')[0].text
        G.add_edge(dic[edge.attrib['from']], dic[edge.attrib['to']], label=label)
    return G


 
 def loadDataset(filename):
    """load file list of the dataset.
    """
    from os.path import dirname, splitext

    dirname_dataset = dirname(filename)
@@ -56,7 +69,7 @@ def loadDataset(filename):
        content = open(filename).read().splitlines()
        for i in range(0, len(content)):
            tmp = content[i].split(' ')
            data.append(loadCT(dirname_dataset + '/' + tmp[0]))
            data.append(loadCT(dirname_dataset + '/' + tmp[0].replace('#', '', 1))) # remove the '#'s in file names
            y.append(float(tmp[1]))
    elif(extension == "cxl"):
        import xml.etree.ElementTree as ET
--- a/pygraph/utils/utils.py
+++ b/pygraph/utils/utils.py
@@ -0,0 +1,59 @@
 import networkx as nx
 import numpy as np


 def getSPLengths(G1):
    """Return the matrix of shortest-path lengths between the nodes of G1.

    Parameters
    ----------
    G1 : NetworkX graph
        Graph whose node ids are used directly as row/column indices, so
        they are expected to be the integers 0..n-1.

    Return
    ------
    distances : Numpy array
        n x n array; entry [i, j] is the number of edges on the shortest
        path from i to j. Pairs for which no path is reported keep the
        initial value 0.
    """
    # dict() accepts both the dict-of-dicts result and an iterable of
    # (source, paths) pairs.
    sp = dict(nx.shortest_path(G1))
    distances = np.zeros((G1.number_of_nodes(), G1.number_of_nodes()))
    # The loops previously iterated over `np` (the numpy module) instead of
    # the shortest-path dictionary `sp`.
    for i, paths_from_i in sp.items():
        for j, path in paths_from_i.items():
            distances[i, j] = len(path) - 1
    return distances

 def getSPGraph(G):
    """Transform graph G to its corresponding shortest-paths graph.

    This is a thin wrapper that delegates to floydTransformation.

    Parameters
    ----------
    G : NetworkX graph
        The graph to be transformed.

    Return
    ------
    S : NetworkX graph
        The shortest-paths graph corresponding to G.

    Notes
    ------
    For an input graph G, its corresponding shortest-paths graph S contains the same set of nodes as G, while there exists an edge between all nodes in S which are connected by a walk in G. Every edge in S between two nodes is labeled by the shortest distance between these two nodes.

    NOTE(review): the current implementation (floydTransformation) adds an edge for every pair of indices, including i == j, and stores the distance in the edge attribute 'cost'.

    References
    ----------
    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. InData Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE.
    """
    return floydTransformation(G)
            
 def floydTransformation(G):
    """Transform graph G to its corresponding shortest-paths graph using Floyd-transformation.

    Parameters
    ----------
    G : NetworkX graph
        The graph to be transformed.

    Return
    ------
    S : NetworkX graph
        The shortest-paths graph corresponding to G. It has the nodes of G
        (with their data) and one edge per pair of nodes, including a
        self-loop on every node. Each edge stores the distance computed by
        nx.floyd_warshall_numpy in its 'cost' attribute.

    References
    ----------
    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. InData Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE.
    """
    # Fix the node order explicitly so that row/column k of the distance
    # matrix is known to belong to nodes[k]. Matrix positions used to be
    # taken as node ids, which is only right for nodes labelled 0..n-1 in
    # iteration order.
    nodes = list(G.nodes())
    spMatrix = nx.floyd_warshall_numpy(G, nodelist=nodes) # @todo weigth label not considered
    S = nx.Graph()
    S.add_nodes_from(G.nodes(data=True))
    for i, u in enumerate(nodes):
        for j, v in enumerate(nodes):
            # Same visiting order as before: for an undirected S the later
            # (v, u) assignment overwrites the earlier (u, v) one.
            S.add_edge(u, v, cost = spMatrix[i, j])
    return S
--- a/tests/README.md
+++ b/tests/README.md
@@ -0,0 +1,5 @@
 To use the library : 
 $> virtualenv --python=/usr/bin/python3.5 venv
 $> venv/bin/pip install -r requirements.txt
 $> source venv/bin/activate
 ... Go use pygraph
--- a/tests/opt.py
+++ b/tests/opt.py
@@ -0,0 +1,66 @@
 import ot
 import sys
 import pathlib
 sys.path.insert(0, "../")

 from pygraph.utils.graphfiles import loadDataset
 from pygraph.ged.costfunctions import ConstantCostFunction
 from pygraph.utils.utils import getSPLengths
 from tqdm import tqdm
 import numpy as np
 from scipy.optimize import linear_sum_assignment
 from pygraph.ged.GED import ged
 import scipy

 def pad(C, n):
    """Return an n x n zero matrix with C copied into its top-left corner."""
    padded = np.zeros((n, n))
    n_rows = C.shape[0]
    n_cols = C.shape[1]
    padded[0:n_rows, 0:n_cols] = C
    return padded

 # Script entry point: for every pair of same-size graphs in the dataset,
 # compare the edit cost of a mapping derived from a Gromov-Wasserstein
 # coupling with the edit cost returned by ged() using its default method.
 if (__name__ == "__main__"):
    # NOTE(review): absolute path on the author's machine -- adapt before running.
    ds_filename = "/home/bgauzere/work/Datasets/Acyclic/dataset_bps.ds"
    dataset, y = loadDataset(ds_filename)
    cf = ConstantCostFunction(1, 3, 1, 3)
    N = len(dataset)

    # (i, j) index pairs that were actually evaluated
    pairs = list()
    
    ged_distances = list() #np.zeros((N, N))
    gw_distances = list() #np.zeros((N, N))
    # Upper triangle only, diagonal included (j starts at i).
    for i in tqdm(range(0, N)):
        for j in tqdm(range(i, N)):
            G1 = dataset[i]
            G2 = dataset[j]
            n = G1.number_of_nodes()
            m = G2.number_of_nodes()
            # Only graphs with the same number of nodes are compared.
            if(n == m):
                C1 = getSPLengths(G1)
                C2 = getSPLengths(G2)

                # Scale each shortest-path length matrix by its largest entry.
                # NOTE(review): if a matrix is all zeros (e.g. a single-node
                # graph) this divides by zero -- confirm whether that can occur.
                C1 /= C1.max()
                C2 /= C2.max()

                # NOTE(review): inside the `n == m` branch neither padding
                # condition below can be true, so pad() is never called here.
                dim = max(n, m)
                if(n < m):
                    C1 = pad(C1, dim)
                elif (m < n):
                    C2 = pad(C2, dim)

                # Uniform weights over the dim nodes of each graph.
                p = ot.unif(dim)
                q = ot.unif(dim)

                gw = ot.gromov_wasserstein(C1, C2, p, q,
                                           'square_loss', epsilon=5e-3)
                # Turn the coupling into a one-to-one mapping by maximising
                # the selected entries of gw (hence the minus sign).
                row_ind, col_ind = linear_sum_assignment(-gw)
                rho = col_ind
                varrho = row_ind[np.argsort(col_ind)]
                pairs.append((i,j))
                # Edit cost of the mapping given explicitly through rho/varrho.
                gw_distances.append(ged(G1, G2, cf=cf, rho=rho, varrho=varrho)[0])

                # Edit cost with the mapping computed by ged() itself.
                ged_distances.append(ged(G1, G2, cf=cf)[0])

    # The printed labels are French: "Moyenne sur" means "Mean over".
    print("Moyenne sur Riesen : {}".format(np.mean(ged_distances)))
    print("Moyenne sur GW : {} ".format(np.mean(gw_distances)))

    # np.save appends the .npy extension to these file names.
    np.save("distances_riesen", ged_distances)
    np.save("distances_gw", gw_distances)
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -0,0 +1,16 @@
 cycler==0.10.0
 Cython==0.27.3
 decorator==4.1.2
 matplotlib==2.1.0
 networkx==2.0
 numpy==1.13.3
 pkg-resources==0.0.0
 POT==0.4.0
 pyparsing==2.2.0
 python-dateutil==2.6.1
 pytz==2017.3
 scikit-learn==0.19.1
 scipy==1.0.0
 six==1.11.0
 sklearn==0.0
 tqdm==4.19.4
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -1,10 +0,0 @@
 import networkx as nx
 import numpy as np


 def getSPLengths(G1):
    sp = nx.shortest_path(G1)
    distances = np.zeros((G1.number_of_nodes(), G1.number_of_nodes()))
    for i in np.keys():
        for j in np[i].keys():
            distances[i, j] = len(sp[i][j])-1