add pygraph/kernels/spkernel.py modify pygraph/utils/util.py and pygraph/utils/graphfiles.pyv0.1
@@ -0,0 +1,170 @@ | |||
{ | |||
"cells": [ | |||
{ | |||
"cell_type": "code", | |||
"execution_count": 1, | |||
"metadata": { | |||
"autoscroll": false, | |||
"ein.tags": "worksheet-0", | |||
"slideshow": { | |||
"slide_type": "-" | |||
} | |||
}, | |||
"outputs": [], | |||
"source": [ | |||
"import numpy as np\n", | |||
"import paths\n", | |||
"\n", | |||
"import pygraph\n", | |||
"\n", | |||
"from pygraph.utils.graphfiles import loadDataset\n" | |||
] | |||
}, | |||
{ | |||
"cell_type": "code", | |||
"execution_count": 2, | |||
"metadata": { | |||
"autoscroll": false, | |||
"ein.tags": "worksheet-0", | |||
"slideshow": { | |||
"slide_type": "-" | |||
} | |||
}, | |||
"outputs": [], | |||
"source": [ | |||
"import networkx as nx\n", | |||
"import numpy as np\n", | |||
"import matplotlib.pyplot as plt\n", | |||
"\n", | |||
"# We load a ds dataset\n", | |||
"# load it from https://brunl01.users.greyc.fr/CHEMISTRY/Acyclic.tar.gz\n", | |||
"dataset, y = loadDataset(\"/home/bgauzere/work/Datasets/Acyclic/dataset_bps.ds\")" | |||
] | |||
}, | |||
{ | |||
"cell_type": "code", | |||
"execution_count": 3, | |||
"metadata": { | |||
"autoscroll": false, | |||
"ein.tags": "worksheet-0", | |||
"slideshow": { | |||
"slide_type": "-" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"name": "stderr", | |||
"output_type": "stream", | |||
"text": [ | |||
"100%|██████████| 183/183 [07:41<00:00, 2.52s/it]\n", | |||
"100%|██████████| 183/183 [08:39<00:00, 2.84s/it]\n", | |||
"100%|██████████| 183/183 [05:19<00:00, 1.75s/it]\n", | |||
"100%|██████████| 183/183 [05:50<00:00, 1.91s/it]\n" | |||
] | |||
} | |||
], | |||
"source": [ | |||
"#Compute graph edit distances\n", | |||
"\n", | |||
"from tqdm import tqdm\n", | |||
"from pygraph.c_ext.lsape_binders import lsap_solverHG\n", | |||
"from pygraph.ged.costfunctions import ConstantCostFunction\n", | |||
"from pygraph.ged.GED import ged\n", | |||
"import time\n", | |||
"\n", | |||
"cf = ConstantCostFunction(1,3,1,3)\n", | |||
"N=len(dataset)\n", | |||
"\n", | |||
"methods=['Riesen + LSAP', 'Neigh + LSAP', 'Riesen + LSAPE', 'Neigh + LSAPE']\n", | |||
"ged_distances = [ np.zeros((N,N)), np.zeros((N,N)), np.zeros((N,N)), np.zeros((N,N))]\n", | |||
"\n", | |||
"times = list()\n", | |||
"start = time.clock()\n", | |||
"for i in tqdm(range(0,N)):\n", | |||
" for j in range(0,N):\n", | |||
" ged_distances[0][i,j] = ged(dataset[i],dataset[j],cf=cf, method='Riesen')[0]\n", | |||
"times.append(time.clock() - start)\n", | |||
"\n", | |||
"\n", | |||
"start = time.clock()\n", | |||
"for i in tqdm(range(0,N)):\n", | |||
" for j in range(0,N):\n", | |||
" ged_distances[1][i,j] = ged(dataset[i],dataset[j],cf=cf, method='Neighboorhood')[0]\n", | |||
"\n", | |||
"times.append(time.clock() - start)\n", | |||
"\n", | |||
"start = time.clock()\n", | |||
"for i in tqdm(range(0,N)):\n", | |||
" for j in range(0,N):\n", | |||
" ged_distances[2][i,j] = ged(dataset[i],dataset[j],cf=cf, method='Riesen',solver=lsap_solverHG)[0]\n", | |||
"times.append(time.clock() - start)\n", | |||
"\n", | |||
"start = time.clock()\n", | |||
"for i in tqdm(range(0,N)):\n", | |||
" for j in range(0,N):\n", | |||
" ged_distances[3][i,j] = ged(dataset[i],dataset[j],cf=cf, method='Neighboorhood',solver=lsap_solverHG)[0]\n", | |||
"times.append(time.clock() - start)" | |||
] | |||
}, | |||
{ | |||
"cell_type": "code", | |||
"execution_count": 5, | |||
"metadata": { | |||
"autoscroll": false, | |||
"ein.tags": "worksheet-0", | |||
"slideshow": { | |||
"slide_type": "-" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"name": "stdout", | |||
"output_type": "stream", | |||
"text": [ | |||
" method \t mean \t mean \t time\n", | |||
" Riesen + LSAP \t 37.79903849025053 \t 35.31207262086058 \t 463.300405 \n", | |||
" Neigh + LSAP \t 36.2281047508137 \t 33.85869987159963 \t 521.7821730000001 \n", | |||
" Riesen + LSAPE \t 35.95508973095643 \t 34.10092866314312 \t 319.83455500000014 \n", | |||
" Neigh + LSAPE \t 34.5005822807489 \t 32.5735614679447 \t 350.48029599999995 \n" | |||
] | |||
} | |||
], | |||
"source": [ | |||
"print(\" method \\t mean \\t mean \\t time\")\n", | |||
"data = list()\n", | |||
"for i in range(0,len(ged_distances)):\n", | |||
" ged_ = np.minimum(ged_distances[i],ged_distances[i].transpose())\n", | |||
" print(\" {} \\t {} \\t {} \\t {} \".format(methods[i], np.mean(ged_distances[i]),np.mean(ged_), times[i]))\n" | |||
] | |||
}, | |||
{ | |||
"cell_type": "code", | |||
"execution_count": null, | |||
"metadata": {}, | |||
"outputs": [], | |||
"source": [] | |||
} | |||
], | |||
"metadata": { | |||
"kernelspec": { | |||
"display_name": "Python 3", | |||
"language": "python", | |||
"name": "python3" | |||
}, | |||
"language_info": { | |||
"codemirror_mode": { | |||
"name": "ipython", | |||
"version": 3 | |||
}, | |||
"file_extension": ".py", | |||
"mimetype": "text/x-python", | |||
"name": "python", | |||
"nbconvert_exporter": "python", | |||
"pygments_lexer": "ipython3", | |||
"version": "3.6.2" | |||
}, | |||
"name": "py-graph_test.ipynb" | |||
}, | |||
"nbformat": 4, | |||
"nbformat_minor": 2 | |||
} |
@@ -0,0 +1,21 @@ | |||
# -*-coding:utf-8 -*- | |||
""" | |||
Pygraph | |||
This package contains 5 sub packages : | |||
* c_ext : binders to C++ code | |||
* ged : allows to compute graph edit distance between networkX graphs | |||
* kernels : computation of graph kernels, ie graph similarity measure compatible with SVM | |||
* notebooks : examples of code using this library | |||
* utils : Diverse computation on graphs | |||
""" | |||
# info | |||
__version__ = "0.1" | |||
__author__ = "Benoit Gaüzère" | |||
__date__ = "November 2017" | |||
# import sub modules | |||
from pygraph import c_ext | |||
from pygraph import ged | |||
from pygraph import utils |
@@ -0,0 +1,5 @@ | |||
# You must specify your env variable LSAPE_DIR | |||
#LSAPE_DIR=/home/bgauzere/Téléchargements/lsape/include/ | |||
# Build the shared library liblsap.so from lsap.cpp.
# NOTE: the first -I entry is a machine-specific absolute path; $(LSAPE_DIR)
# (last -I entry) is the configurable location of the LSAPE headers.
liblsap.so:lsap.cpp
	g++ -fPIC -I/home/bgauzere/Téléchargements/lsape/include/ -shared lsap.cpp -o liblsap.so -O3 -I$(LSAPE_DIR)
@@ -0,0 +1,6 @@ | |||
Python wrapper for lsape method | |||
Specify your LSAPE_DIR env variable with the location of the source | |||
code to compile | |||
source code : https://bougleux.users.greyc.fr/lsape/ |
@@ -0,0 +1,17 @@ | |||
# -*-coding:utf-8 -*- | |||
"""Pygraph - c_ext module | |||
This package binds some C++ code to python | |||
lsape_binders.py : binders to C++ code of LSAPE methods implemented in | |||
https://bougleux.users.greyc.fr/lsape/ | |||
""" | |||
# info | |||
__version__ = "0.1" | |||
__author__ = "Benoit Gaüzère" | |||
__date__ = "November 2017" | |||
# import sub modules | |||
from pygraph.c_ext import lsape_binders |
@@ -0,0 +1,43 @@ | |||
/* | |||
Python wrapper | |||
*/ | |||
#include "hungarian-lsape.hh" | |||
#include "hungarian-lsap.hh" | |||
#include <cstdio> | |||
// C-callable wrapper around hungarianLSAP for a square nm x nm problem.
//
//   C      : cost buffer, forwarded to hungarianLSAP unchanged
//   nm     : number of rows and of columns
//   rho    : output buffer of nm entries, receives the solver's `rho` result
//   varrho : output buffer of nm entries, receives the solver's `varrho` result
//
// Always returns 0.
extern "C" int lsap(double * C, const int nm, long * rho, long * varrho){
  // Scratch buffers required by the solver (dual variables and int-typed
  // assignments). They are released before returning so that repeated calls
  // do not leak memory.
  double * u = new double[nm];
  double * v = new double[nm];
  int * rho_int = new int[nm];
  int * varrho_int = new int[nm];

  hungarianLSAP(C,nm,nm,rho_int,u,v,varrho_int);

  // The solver writes int indices while the caller supplies long buffers,
  // so the results are widened element by element.
  for (int i = 0; i < nm; i++){
    rho[i] = (long)(rho_int[i]);
    varrho[i] = (long)(varrho_int[i]);
  }

  delete[] u;
  delete[] v;
  delete[] rho_int;
  delete[] varrho_int;
  return 0;
}
// C-callable wrapper around hungarianLSAPE.
//
//   C      : cost buffer, forwarded to hungarianLSAPE unchanged
//   n, m   : dimensions forwarded to hungarianLSAPE; they also size the
//            scratch and output buffers (n entries for rho, m for varrho)
//   rho    : output buffer of n entries, receives the solver's `rho` result
//   varrho : output buffer of m entries, receives the solver's `varrho` result
//
// Always returns 0 (a null pointer); the `int *` return type is kept so the
// exported symbol's signature does not change.
extern "C" int * lsape(double * C, const int n, const int m, long * rho, long * varrho){
  // Scratch buffers required by the solver. They are released before
  // returning so that repeated calls do not leak memory.
  double * u = new double[n];
  double * v = new double[m];
  int * rho_int = new int[n];
  int * varrho_int = new int[m];

  hungarianLSAPE(C,n,m,rho_int,varrho_int,u,v);

  // Widen the solver's int results into the caller's long buffers.
  for (int i = 0; i < n; i++)
    rho[i] = (long)(rho_int[i]);
  for (int i = 0; i < m; i++)
    varrho[i] = (long)(varrho_int[i]);

  delete[] u;
  delete[] v;
  delete[] rho_int;
  delete[] varrho_int;
  return 0;
}
@@ -0,0 +1,23 @@ | |||
import numpy as np | |||
import ctypes as c | |||
from ctypes import cdll | |||
import os.path | |||
def lsap_solverHG(C):
    """Solve an assignment problem through the ``lsap`` entry point of liblsap.so.

    The shared library is looked up in the directory of this module.

    Parameters
    ----------
    C : numpy array
        Cost matrix; ``C.shape[0]`` is used as the problem size. Entries equal
        to ``np.inf`` are overwritten IN PLACE with 10000 before the call.

    Return
    ------
    (rows, cols) : tuple of numpy arrays
        ``rows`` is ``0 .. nm-1``; ``cols`` holds the values the library wrote
        into its ``varrho`` output buffer.
    """
    size = C.shape[0]

    # Locate and load the compiled solver sitting next to this file.
    module_dir = os.path.dirname(__file__)
    library_path = os.path.abspath(os.path.join(module_dir, 'liblsap.so'))
    solver_lib = cdll.LoadLibrary(library_path)
    solver_lib.lsap.restype = c.c_int

    # Output buffers filled by the library.
    rho = np.zeros((size, 1), int)
    varrho = np.zeros((size, 1), int)

    # Infinite costs are replaced by a large finite value (modifies C itself).
    C[C == np.inf] = 10000

    cost_ptr = c.c_void_p(C.transpose().ctypes.data)
    rho_ptr = c.c_void_p(rho.ctypes.data)
    varrho_ptr = c.c_void_p(varrho.ctypes.data)
    solver_lib.lsap(cost_ptr, c.c_int(size), rho_ptr, varrho_ptr)

    rows = np.array(range(0, size))
    cols = np.array([c.c_int(entry).value for entry in varrho])
    return rows, cols
@@ -1,10 +1,11 @@ | |||
from ged.costfunctions import BasicCostFunction, RiesenCostFunction | |||
from ged.costfunctions import NeighboorhoodCostFunction | |||
from ged.bipartiteGED import computeBipartiteCostMatrix, getOptimalMapping | |||
from pygraph.ged.costfunctions import ConstantCostFunction, RiesenCostFunction | |||
from pygraph.ged.costfunctions import NeighboorhoodCostFunction | |||
from pygraph.ged.bipartiteGED import computeBipartiteCostMatrix, getOptimalMapping | |||
from scipy.optimize import linear_sum_assignment | |||
def ged(G1, G2, method='Riesen', rho=None, varrho=None, | |||
cf=BasicCostFunction(1, 3, 1, 3)): | |||
cf=ConstantCostFunction(1, 3, 1, 3), | |||
solver=linear_sum_assignment): | |||
"""Compute Graph Edit Distance between G1 and G2 according to mapping | |||
encoded within rho and varrho. Graph's node must be indexed by a | |||
index which is used is rho and varrho | |||
@@ -14,31 +15,32 @@ def ged(G1, G2, method='Riesen', rho=None, varrho=None, | |||
""" | |||
if ((rho is None) or (varrho is None)): | |||
if(method == 'Riesen'): | |||
cf_bp = RiesenCostFunction(cf) | |||
cf_bp = RiesenCostFunction(cf,lsap_solver=solver) | |||
elif(method == 'Neighboorhood'): | |||
cf_bp = NeighboorhoodCostFunction(cf) | |||
cf_bp = NeighboorhoodCostFunction(cf,lsap_solver=solver) | |||
elif(method == 'Basic'): | |||
cf_bp = cf | |||
else: | |||
raise NameError('Non existent method ') | |||
rho, varrho = getOptimalMapping(computeBipartiteCostMatrix(G1, G2, cf_bp)) | |||
rho, varrho = getOptimalMapping( | |||
computeBipartiteCostMatrix(G1, G2, cf_bp), lsap_solver=solver) | |||
n = G1.number_of_nodes() | |||
m = G2.number_of_nodes() | |||
ged = 0 | |||
for i in G1.nodes_iter(): | |||
for i in G1.nodes(): | |||
phi_i = rho[i] | |||
if(phi_i >= m): | |||
ged += cf.cnd(i, G1) | |||
else: | |||
ged += cf.cns(i, phi_i, G1, G2) | |||
for j in G2.nodes_iter(): | |||
for j in G2.nodes(): | |||
phi_j = varrho[j] | |||
if(phi_j >= n): | |||
ged += cf.cni(j, G2) | |||
for e in G1.edges_iter(data=True): | |||
for e in G1.edges(data=True): | |||
i = e[0] | |||
j = e[1] | |||
phi_i = rho[i] | |||
@@ -49,13 +51,13 @@ def ged(G1, G2, method='Riesen', rho=None, varrho=None, | |||
if(mappedEdge): | |||
e2 = [phi_i, phi_j, G2[phi_i][phi_j]] | |||
min_cost = min(cf.ces(e, e2, G1, G2), | |||
cf.ced(e, G1), cf.cei(e2, G2)) | |||
cf.ced(e, G1) + cf.cei(e2, G2)) | |||
ged += min_cost | |||
else: | |||
ged += cf.ced(e, G1) | |||
else: | |||
ged += cf.ced(e, G1) | |||
for e in G2.edges_iter(data=True): | |||
for e in G2.edges(data=True): | |||
i = e[0] | |||
j = e[1] | |||
phi_i = varrho[i] | |||
@@ -68,7 +70,3 @@ def ged(G1, G2, method='Riesen', rho=None, varrho=None, | |||
else: | |||
ged += cf.ced(e, G2) | |||
return ged, rho, varrho | |||
def computeDistanceMatrix(dataset): | |||
pass |
@@ -0,0 +1,17 @@ | |||
# -*-coding:utf-8 -*- | |||
"""Pygraph - ged module | |||
Implement some methods to compute ged between graphs | |||
""" | |||
# info | |||
__version__ = "0.1" | |||
__author__ = "Benoit Gaüzère" | |||
__date__ = "November 2017" | |||
from pygraph.ged import costfunctions | |||
from pygraph.ged import bipartiteGED | |||
from pygraph.ged import GED | |||
@@ -1,9 +1,9 @@ | |||
import numpy as np | |||
from scipy.optimize import linear_sum_assignment | |||
from ged.costfunctions import BasicCostFunction | |||
from pygraph.ged.costfunctions import ConstantCostFunction | |||
def computeBipartiteCostMatrix(G1, G2, cf=BasicCostFunction(1, 3, 1, 3)): | |||
def computeBipartiteCostMatrix(G1, G2, cf=ConstantCostFunction(1, 3, 1, 3)): | |||
"""Compute a Cost Matrix according to cost function cf""" | |||
n = G1.number_of_nodes() | |||
m = G2.number_of_nodes() | |||
@@ -11,23 +11,23 @@ def computeBipartiteCostMatrix(G1, G2, cf=BasicCostFunction(1, 3, 1, 3)): | |||
C = np.ones([nm, nm])*np.inf | |||
C[n:, m:] = 0 | |||
for u in G1.nodes_iter(): | |||
for v in G2.nodes_iter(): | |||
for u in G1.nodes(): | |||
for v in G2.nodes(): | |||
cost = cf.cns(u, v, G1, G2) | |||
C[u, v] = cost | |||
for v in G1.nodes_iter(): | |||
for v in G1.nodes(): | |||
C[v, m + v] = cf.cnd(v, G1) | |||
for v in G2.nodes_iter(): | |||
for v in G2.nodes(): | |||
C[n + v, v] = cf.cni(v, G2) | |||
return C | |||
def getOptimalMapping(C): | |||
def getOptimalMapping(C, lsap_solver=linear_sum_assignment): | |||
"""Compute an optimal linear mapping according to cost Matrix C | |||
inclure les progs C de Seb | |||
""" | |||
row_ind, col_ind = linear_sum_assignment(C) | |||
row_ind, col_ind = lsap_solver(C) | |||
return col_ind, row_ind[np.argsort(col_ind)] |
@@ -2,15 +2,17 @@ import numpy as np | |||
from scipy.optimize import linear_sum_assignment | |||
class BasicCostFunction: | |||
class ConstantCostFunction: | |||
""" Define a symmetric constant cost fonction for edit operations """ | |||
def __init__(self, cns, cni, ces, cei): | |||
self.cns_ = cns | |||
self.cni_ = self.cnd_ = cni | |||
self.ces_ = ces | |||
self.cei_ = self.ced_ = cei | |||
def cns(self, u, v, G1, G2): | |||
return (G1.node[u]['label'] != G2.node[v]['label'])*self.cns_ | |||
def cns(self, node_u, node_v, g1, g2): | |||
""" return substitution edit operation cost between node_u of G1 and node_v of G2""" | |||
return (g1.node[node_u]['label'] != g2.node[node_v]['label'])*self.cns_ | |||
def cnd(self, u, G1): | |||
return self.cnd_ | |||
@@ -30,9 +32,11 @@ class BasicCostFunction: | |||
return self.cei_ | |||
class RiesenCostFunction(BasicCostFunction): | |||
def __init__(self, cf): | |||
BasicCostFunction.__init__(self, cf.cns_, cf.cni_, cf.ces_, cf.cei_) | |||
class RiesenCostFunction(): | |||
""" Cost function associated to the computation of a cost matrix between nodes for LSAP""" | |||
def __init__(self, cf, lsap_solver=linear_sum_assignment): | |||
self.cf_ = cf | |||
self.lsap_solver_ = lsap_solver | |||
def cns(self, u, v, G1, G2): | |||
""" u et v sont des id de noeuds """ | |||
@@ -48,41 +52,43 @@ class RiesenCostFunction(BasicCostFunction): | |||
e1 = [u, nbr_u, G1[u][nbr_u]] | |||
for nbr_v in G2[v]: | |||
e2 = [v, nbr_v, G2[v][nbr_v]] | |||
sub_C[i, j] = self.ces(e1, e2, G1, G2) | |||
sub_C[i, j] = self.cf_.ces(e1, e2, G1, G2) | |||
j += 1 | |||
i += 1 | |||
i = 0 | |||
for nbr_u in l_nbr_u: | |||
sub_C[i, m+i] = self.ced([u, nbr_u, G1[u][nbr_u]], G1) | |||
sub_C[i, m+i] = self.cf_.ced([u, nbr_u, G1[u][nbr_u]], G1) | |||
i += 1 | |||
j = 0 | |||
for nbr_v in l_nbr_v: | |||
sub_C[n+j, j] = self.cei([v, nbr_v, G2[v][nbr_v]], G2) | |||
sub_C[n+j, j] = self.cf_.cei([v, nbr_v, G2[v][nbr_v]], G2) | |||
j += 1 | |||
row_ind, col_ind = linear_sum_assignment(sub_C) | |||
row_ind, col_ind = self.lsap_solver_(sub_C) | |||
cost = np.sum(sub_C[row_ind, col_ind]) | |||
return BasicCostFunction.cns(self, u, v, G1, G2) + cost | |||
return self.cf_.cns(u, v, G1, G2) + cost | |||
def cnd(self, u, G1): | |||
cost = 0 | |||
for nbr in G1[u]: | |||
cost += BasicCostFunction.ced(self,[u,nbr,G1[u][nbr]],G1) | |||
cost += self.cf_.ced([u,nbr,G1[u][nbr]],G1) | |||
return BasicCostFunction.cnd(self,u,G1) + cost | |||
return self.cf_.cnd(u,G1) + cost | |||
def cni(self, v, G2): | |||
cost = 0 | |||
for nbr in G2[v]: | |||
cost += BasicCostFunction.cei(self, [v,nbr,G2[v][nbr]], G2) | |||
cost += self.cf_.cei([v,nbr,G2[v][nbr]], G2) | |||
return BasicCostFunction.cni(self, v, G2) + cost | |||
return self.cf_.cni(v, G2) + cost | |||
class NeighboorhoodCostFunction(BasicCostFunction): | |||
def __init__(self, cf): | |||
BasicCostFunction.__init__(self, cf.cns_, cf.cni_, cf.ces_, cf.cei_) | |||
class NeighboorhoodCostFunction(): | |||
""" Cost function associated to the computation of a cost matrix between nodes for LSAP""" | |||
def __init__(self, cf, lsap_solver=linear_sum_assignment): | |||
self.cf_ = cf | |||
self.lsap_solver_ = lsap_solver | |||
def cns(self, u, v, G1, G2): | |||
""" u et v sont des id de noeuds """ | |||
@@ -98,36 +104,35 @@ class NeighboorhoodCostFunction(BasicCostFunction): | |||
e1 = [u, nbr_u, G1[u][nbr_u]] | |||
for nbr_v in G2[v]: | |||
e2 = [v, nbr_v, G2[v][nbr_v]] | |||
sub_C[i, j] = self.ces(e1, e2, G1, G2) | |||
sub_C[i, j] += BasicCostFunction.cns(self, | |||
nbr_u, nbr_v, G1, G2) | |||
sub_C[i, j] = self.cf_.ces(e1, e2, G1, G2) | |||
sub_C[i, j] += self.cf_.cns(nbr_u, nbr_v, G1, G2) | |||
j += 1 | |||
i += 1 | |||
i = 0 | |||
for nbr_u in l_nbr_u: | |||
sub_C[i, m+i] = self.ced([u, nbr_u, G1[u][nbr_u]], G1) | |||
sub_C[i, m+i] += BasicCostFunction.cnd(self, nbr_u, G1) | |||
sub_C[i, m+i] = self.cf_.ced([u, nbr_u, G1[u][nbr_u]], G1) | |||
sub_C[i, m+i] += self.cf_.cnd(nbr_u, G1) | |||
i += 1 | |||
j = 0 | |||
for nbr_v in l_nbr_v: | |||
sub_C[n+j, j] = self.cei([v, nbr_v, G2[v][nbr_v]], G2) | |||
sub_C[n+j, j] += BasicCostFunction.cni(self, nbr_v, G2) | |||
sub_C[n+j, j] = self.cf_.cei([v, nbr_v, G2[v][nbr_v]], G2) | |||
sub_C[n+j, j] += self.cf_.cni(nbr_v, G2) | |||
j += 1 | |||
row_ind, col_ind = linear_sum_assignment(sub_C) | |||
row_ind, col_ind = self.lsap_solver_(sub_C) | |||
cost = np.sum(sub_C[row_ind, col_ind]) | |||
return BasicCostFunction.cns(self, u, v, G1, G2) + cost | |||
return self.cf_.cns(u, v, G1, G2) + cost | |||
def cnd(self, u, G1): | |||
cost = 0 | |||
for nbr in G1[u]: | |||
cost += BasicCostFunction.ced(self, [u, nbr, G1[u][nbr]], G1) | |||
return BasicCostFunction.cnd(self, u, G1) + cost | |||
cost += self.cf_.ced([u, nbr, G1[u][nbr]], G1) | |||
return self.cf_.cnd(u, G1) + cost | |||
def cni(self, v, G2): | |||
cost = 0 | |||
for nbr in G2[v]: | |||
cost += BasicCostFunction.cei(self, [v, nbr, G2[v][nbr]], G2) | |||
return BasicCostFunction.cni(self, v, G2) + cost | |||
cost += self.cf_.cei([v, nbr, G2[v][nbr]], G2) | |||
return self.cf_.cni(v, G2) + cost |
@@ -0,0 +1,68 @@ | |||
import sys | |||
import pathlib | |||
sys.path.insert(0, "../") | |||
import networkx as nx | |||
import numpy as np | |||
import time | |||
from utils.utils import getSPGraph | |||
def _count_matching_sp_edges(S1, S2):
    """Count edge pairs (one from S1, one from S2) with equal non-zero 'cost' and the same endpoints in either order."""
    edges_2 = list(S2.edges(data=True))
    count = 0
    for u1, v1, attrs_1 in S1.edges(data=True):
        cost_1 = attrs_1['cost']
        if cost_1 == 0:
            # zero-cost edges never contribute to the kernel
            continue
        for u2, v2, attrs_2 in edges_2:
            same_endpoints = (u1 == u2 and v1 == v2) or (u1 == v2 and v1 == u2)
            if cost_1 == attrs_2['cost'] and same_endpoints:
                count += 1
    return count


def spkernel(*args):
    """Calculate shortest-path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated. Each graph
        is first transformed into its shortest-paths graph with getSPGraph.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated. They are used as
        given (no transformation is applied), so their edges must already
        carry a 'cost' attribute.

    Return
    ------
    Kmatrix/Kernel : Numpy matrix/int
        Kernel matrix, each element of which is the sp kernel between 2 graphs. / SP Kernel between 2 graphs.

    References
    ----------
    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In Data Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE.
    """
    if len(args) == 1: # for a list of graphs
        Gn = args[0]
        Kmatrix = np.zeros((len(Gn), len(Gn)))

        Sn = [getSPGraph(G) for G in Gn] # get shortest path graphs of Gn

        start_time = time.time()
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                # the kernel is symmetric: compute once, mirror the value
                Kmatrix[i][j] = Kmatrix[j][i] = _count_matching_sp_edges(Sn[i], Sn[j])

        print("--- shortest path kernel matrix of size %d built in %s seconds ---" % (len(Gn), (time.time() - start_time)))

        return Kmatrix

    else: # for only 2 graphs
        G1 = args[0]
        G2 = args[1]

        # start_time must be set here too; it is read by the print below
        start_time = time.time()
        kernel = _count_matching_sp_edges(G1, G2)

        print("--- shortest path kernel built in %s seconds ---" % (time.time() - start_time))

        return kernel
@@ -0,0 +1,17 @@ | |||
# -*-coding:utf-8 -*- | |||
"""Pygraph - utils module | |||
Implement some methods to manage graphs | |||
graphfiles.py : load .gxl and .ct files | |||
utils.py : compute some properties on networkX graphs | |||
""" | |||
# info | |||
__version__ = "0.1" | |||
__author__ = "Benoit Gaüzère" | |||
__date__ = "November 2017" | |||
from pygraph.utils import graphfiles | |||
from pygraph.utils import utils |
@@ -1,13 +1,25 @@ | |||
import networkx as nx | |||
def loadCT(filename): | |||
"""load data from .ct file. | |||
Notes | |||
------ | |||
a typical example of data in .ct is like this: | |||
3 2 <- number of nodes and edges | |||
0.0000 0.0000 0.0000 C <- each line describes a node, the last parameter in which is the label of the node, representing a chemical element @Q what are the first 3 numbers? | |||
0.0000 0.0000 0.0000 C | |||
0.0000 0.0000 0.0000 O | |||
1 3 1 1 <- each line describes an edge, the first two numbers represent two nodes of the edge, the last number represents the label. @Q what are the 3th numbers? | |||
2 3 1 1 | |||
""" | |||
content = open(filename).read().splitlines() | |||
G = nx.Graph(name=str(content[0])) | |||
G = nx.Graph(name=str(content[0])) # set name of the graph | |||
tmp = content[1].split(" ") | |||
if tmp[0] == '': | |||
nb_nodes = int(tmp[1]) | |||
nb_edges = int(tmp[2]) | |||
nb_nodes = int(tmp[1]) # number of the nodes | |||
nb_edges = int(tmp[2]) # number of the edges | |||
else: | |||
nb_nodes = int(tmp[0]) | |||
nb_edges = int(tmp[1]) | |||
@@ -18,7 +30,7 @@ def loadCT(filename): | |||
G.add_node(i, label=tmp[3]) | |||
for i in range(0, nb_edges): | |||
tmp = content[i+G.number_of_nodes()+2].split(" ") | |||
tmp = content[i + G.number_of_nodes() + 2].split(" ") | |||
tmp = [x for x in tmp if x != ''] | |||
G.add_edge(int(tmp[0]) - 1, int(tmp[1]) - 1, label=int(tmp[3])) | |||
return G | |||
@@ -43,9 +55,10 @@ def loadGXL(filename): | |||
label = edge.find('attr')[0].text | |||
G.add_edge(dic[edge.attrib['from']], dic[edge.attrib['to']], label=label) | |||
return G | |||
def loadDataset(filename): | |||
"""load file list of the dataset. | |||
""" | |||
from os.path import dirname, splitext | |||
dirname_dataset = dirname(filename) | |||
@@ -56,7 +69,7 @@ def loadDataset(filename): | |||
content = open(filename).read().splitlines() | |||
for i in range(0, len(content)): | |||
tmp = content[i].split(' ') | |||
data.append(loadCT(dirname_dataset + '/' + tmp[0])) | |||
data.append(loadCT(dirname_dataset + '/' + tmp[0].replace('#', '', 1))) # remove the '#'s in file names | |||
y.append(float(tmp[1])) | |||
elif(extension == "cxl"): | |||
import xml.etree.ElementTree as ET |
@@ -0,0 +1,59 @@ | |||
import networkx as nx | |||
import numpy as np | |||
def getSPLengths(G1):
    """Return the matrix of shortest-path lengths between the nodes of G1.

    Parameters
    ----------
    G1 : NetworkX graph
        Graph whose node identifiers are used directly as row/column
        indices, so they must be integers in ``0 .. number_of_nodes - 1``.

    Return
    ------
    distances : Numpy array
        ``distances[i, j]`` is the number of edges on a shortest path from
        node i to node j. Pairs for which no path is reported keep the
        initial value 0.
    """
    # dict() accepts both a mapping and an iterable of (source, paths) pairs.
    sp = dict(nx.shortest_path(G1))
    distances = np.zeros((G1.number_of_nodes(), G1.number_of_nodes()))
    for i, paths_from_i in sp.items():
        for j, path in paths_from_i.items():
            # a path is a list of nodes, hence one more element than edges
            distances[i, j] = len(path) - 1
    return distances
def getSPGraph(G):
    """Transform graph G to its corresponding shortest-paths graph.

    This is a thin wrapper that delegates to floydTransformation.

    Parameters
    ----------
    G : NetworkX graph
        The graph to be transformed.

    Return
    ------
    S : NetworkX graph
        The shortest-paths graph corresponding to G.

    Notes
    ------
    For an input graph G, its corresponding shortest-paths graph S contains the same set of nodes as G, while there exists an edge between all nodes in S which are connected by a walk in G. Every edge in S between two nodes is labeled by the shortest distance between these two nodes.

    References
    ----------
    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In Data Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE.
    """
    return floydTransformation(G)
def floydTransformation(G):
    """Build the shortest-paths graph of G from its Floyd-Warshall distance matrix.

    Parameters
    ----------
    G : NetworkX graph
        The graph to be transformed. Matrix indices are used as node
        identifiers when adding edges, so nodes are presumably expected to be
        the integers ``0 .. number_of_nodes - 1`` (to be confirmed by callers).

    Return
    ------
    S : NetworkX graph
        Graph holding the nodes of G (with their data) and one edge for every
        index pair (i, j), including i == j, whose 'cost' attribute is the
        corresponding entry of the distance matrix.

    References
    ----------
    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In Data Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE.
    """
    distance_matrix = nx.floyd_warshall_numpy(G) # @todo weight label not considered
    node_count = G.number_of_nodes()

    sp_graph = nx.Graph()
    sp_graph.add_nodes_from(G.nodes(data=True))

    for source in range(node_count):
        for target in range(node_count):
            sp_graph.add_edge(source, target,
                              cost=distance_matrix[source, target])
    return sp_graph
@@ -0,0 +1,5 @@ | |||
To use the library : | |||
$> virtualenv --python=/usr/bin/python3.5 venv | |||
$> source venv/bin/activate | |||
$> pip install -r requirements.txt | |||
... Go use pygraph |
@@ -0,0 +1,66 @@ | |||
import ot | |||
import sys | |||
import pathlib | |||
sys.path.insert(0, "../") | |||
from pygraph.utils.graphfiles import loadDataset | |||
from pygraph.ged.costfunctions import ConstantCostFunction | |||
from pygraph.utils.utils import getSPLengths | |||
from tqdm import tqdm | |||
import numpy as np | |||
from scipy.optimize import linear_sum_assignment | |||
from pygraph.ged.GED import ged | |||
import scipy | |||
def pad(C, n):
    """Embed matrix C in the top-left corner of an n x n matrix of zeros.

    Parameters
    ----------
    C : Numpy array
        2-D matrix to embed.
    n : int
        Side length of the square result.

    Return
    ------
    Numpy array of shape (n, n) whose leading block equals C and whose
    remaining entries are 0.
    """
    n_rows = C.shape[0]
    n_cols = C.shape[1]
    padded = np.zeros((n, n))
    padded[:n_rows, :n_cols] = C
    return padded
# Script entry point: for pairs of graphs of the dataset, compare the edit
# distance obtained from a Gromov-Wasserstein based node mapping with the one
# returned by ged() with its default settings, then print the means and save
# both lists of distances.
if (__name__ == "__main__"):
    ds_filename = "/home/bgauzere/work/Datasets/Acyclic/dataset_bps.ds"
    dataset, y = loadDataset(ds_filename)
    cf = ConstantCostFunction(1, 3, 1, 3)
    N = len(dataset)

    pairs = list()  # (i, j) index pairs for which distances were recorded
    ged_distances = list() #np.zeros((N, N))
    gw_distances = list() #np.zeros((N, N))
    for i in tqdm(range(0, N)):
        for j in tqdm(range(i, N)):
            G1 = dataset[i]
            G2 = dataset[j]
            n = G1.number_of_nodes()
            m = G2.number_of_nodes()
            # Only graphs with the same number of nodes are compared.
            if(n == m):
                # Shortest-path length matrices, scaled by their maximum.
                C1 = getSPLengths(G1)
                C2 = getSPLengths(G2)
                C1 /= C1.max()
                C2 /= C2.max()
                dim = max(n, m)
                # NOTE(review): if this padding really sits inside the
                # n == m branch, neither condition below can be true.
                if(n < m):
                    C1 = pad(C1, dim)
                elif (m < n):
                    C2 = pad(C2, dim)
                p = ot.unif(dim)
                q = ot.unif(dim)
                gw = ot.gromov_wasserstein(C1, C2, p, q,
                                           'square_loss', epsilon=5e-3)
                # Turn the coupling matrix into a one-to-one mapping by
                # solving an assignment problem on its negation.
                row_ind, col_ind = linear_sum_assignment(-gw)
                rho = col_ind
                varrho = row_ind[np.argsort(col_ind)]
                pairs.append((i,j))
                # Edit distance under the GW-derived mapping ...
                gw_distances.append(ged(G1, G2, cf=cf, rho=rho, varrho=varrho)[0])
                # ... versus the one computed by ged() itself.
                ged_distances.append(ged(G1, G2, cf=cf)[0])

    print("Moyenne sur Riesen : {}".format(np.mean(ged_distances)))
    print("Moyenne sur GW : {} ".format(np.mean(gw_distances)))

    np.save("distances_riesen", ged_distances)
    np.save("distances_gw", gw_distances)
@@ -0,0 +1,16 @@ | |||
cycler==0.10.0 | |||
Cython==0.27.3 | |||
decorator==4.1.2 | |||
matplotlib==2.1.0 | |||
networkx==2.0 | |||
numpy==1.13.3 | |||
pkg-resources==0.0.0 | |||
POT==0.4.0 | |||
pyparsing==2.2.0 | |||
python-dateutil==2.6.1 | |||
pytz==2017.3 | |||
scikit-learn==0.19.1 | |||
scipy==1.0.0 | |||
six==1.11.0 | |||
sklearn==0.0 | |||
tqdm==4.19.4 |
@@ -1,10 +0,0 @@ | |||
import networkx as nx | |||
import numpy as np | |||
def getSPLengths(G1): | |||
sp = nx.shortest_path(G1) | |||
distances = np.zeros((G1.number_of_nodes(), G1.number_of_nodes())) | |||
for i in np.keys(): | |||
for j in np[i].keys(): | |||
distances[i, j] = len(sp[i][j])-1 |