
Set tqdm usages in all graph kernels with self-defined update intervals so that fewer lines are printed when output is redirected to a file.

v0.2.x
jajupmochi 4 years ago
parent
commit
cb964cf9f7
12 changed files with 725 additions and 779 deletions
  1. gklearn/kernels/common_walk.py (+66 -66)
  2. gklearn/kernels/conjugate_gradient.py (+75 -93)
  3. gklearn/kernels/fixed_point.py (+76 -93)
  4. gklearn/kernels/marginalized.py (+68 -82)
  5. gklearn/kernels/path_up_to_h.py (+135 -145)
  6. gklearn/kernels/shortest_path.py (+13 -27)
  7. gklearn/kernels/spectral_decomposition.py (+52 -66)
  8. gklearn/kernels/sylvester_equation.py (+58 -75)
  9. gklearn/kernels/treelet.py (+118 -131)
  10. gklearn/tests/test_graph_kernels.py (+8 -1)
  11. gklearn/utils/__init__.py (+1 -0)
  12. gklearn/utils/iters.py (+55 -0)
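
All the call sites in the diffs below go through the new helper in gklearn/utils/iters.py (file 12 above). A minimal sketch of such a wrapper, inferred from those call sites; the signature matches the diffs, but the interval heuristic shown here is an assumption, not the committed code:

import sys
from tqdm import tqdm

def get_iters(itr, desc=None, file=sys.stdout, length=None, verbose=True):
    """Wrap an iterable in tqdm, or return it unchanged when not verbose."""
    if verbose:
        # A coarse miniters keeps tqdm from emitting one line per iteration
        # when stdout is redirected to a file (the point of this commit).
        miniters = max(1, length // 100) if length else 1  # heuristic: assumption
        return tqdm(itr, desc=desc, file=file, total=length, miniters=miniters)
    return itr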

gklearn/kernels/common_walk.py (+66 -66)

@@ -5,15 +5,15 @@ Created on Tue Aug 18 11:21:31 2020

@author: ljia

@references:

[1] Thomas Gärtner, Peter Flach, and Stefan Wrobel. On graph kernels:
Hardness results and efficient alternatives. Learning Theory and Kernel
Machines, pages 129–143, 2003.
"""

import sys
-from tqdm import tqdm
+from gklearn.utils import get_iters
import numpy as np
import networkx as nx
from gklearn.utils import SpecialLabel

@@ -23,7 +23,7 @@ from gklearn.kernels import GraphKernel


class CommonWalk(GraphKernel):
    def __init__(self, **kwargs):
        GraphKernel.__init__(self)
        self._node_labels = kwargs.get('node_labels', [])

@@ -39,17 +39,16 @@ class CommonWalk(GraphKernel):
        self._add_dummy_labels(self._graphs)
        if not self._ds_infos['directed']:  # convert
            self._graphs = [G.to_directed() for G in self._graphs]

        # compute Gram matrix.
        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
        from itertools import combinations_with_replacement
        itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
-       if self._verbose >= 2:
-           iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout)
-       else:
-           iterator = itr
+       len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
+       iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout,
+           length=len_itr, verbose=(self._verbose >= 2))

        # direct product graph method - exponential
        if self._compute_method == 'exp':
            for i, j in iterator:

@@ -62,50 +61,51 @@ class CommonWalk(GraphKernel):
                kernel = self._kernel_do_geo(self._graphs[i], self._graphs[j], self._weight)
                gram_matrix[i][j] = kernel
                gram_matrix[j][i] = kernel

        return gram_matrix

    def _compute_gm_imap_unordered(self):
        self._check_graphs(self._graphs)
        self._add_dummy_labels(self._graphs)
        if not self._ds_infos['directed']:  # convert
            self._graphs = [G.to_directed() for G in self._graphs]

        # compute Gram matrix.
        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

#       def init_worker(gn_toshare):
#           global G_gn
#           G_gn = gn_toshare

        # direct product graph method - exponential
        if self._compute_method == 'exp':
            do_fun = self._wrapper_kernel_do_exp
        # direct product graph method - geometric
        elif self._compute_method == 'geo':
            do_fun = self._wrapper_kernel_do_geo

        parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=_init_worker_gm,
            glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)

        return gram_matrix

    def _compute_kernel_list_series(self, g1, g_list):
        self._check_graphs(g_list + [g1])
        self._add_dummy_labels(g_list + [g1])
        if not self._ds_infos['directed']:  # convert
            g1 = g1.to_directed()
            g_list = [G.to_directed() for G in g_list]

        # compute kernel list.
        kernel_list = [None] * len(g_list)
        if self._verbose >= 2:
-           iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout)
+           iterator = get_iters(range(len(g_list)), desc='Computing kernels',
+               file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))
        else:
            iterator = range(len(g_list))

        # direct product graph method - exponential
        if self._compute_method == 'exp':
            for i in iterator:

@@ -116,17 +116,17 @@ class CommonWalk(GraphKernel):
            for i in iterator:
                kernel = self._kernel_do_geo(g1, g_list[i], self._weight)
                kernel_list[i] = kernel

        return kernel_list

    def _compute_kernel_list_imap_unordered(self, g1, g_list):
        self._check_graphs(g_list + [g1])
        self._add_dummy_labels(g_list + [g1])
        if not self._ds_infos['directed']:  # convert
            g1 = g1.to_directed()
            g_list = [G.to_directed() for G in g_list]

        # compute kernel list.
        kernel_list = [None] * len(g_list)

@@ -134,61 +134,61 @@ class CommonWalk(GraphKernel):
#           global G_g1, G_g_list
#           G_g1 = g1_toshare
#           G_g_list = g_list_toshare

        # direct product graph method - exponential
        if self._compute_method == 'exp':
            do_fun = self._wrapper_kernel_list_do_exp
        # direct product graph method - geometric
        elif self._compute_method == 'geo':
            do_fun = self._wrapper_kernel_list_do_geo

        def func_assign(result, var_to_assign):
            var_to_assign[result[0]] = result[1]
        itr = range(len(g_list))
        len_itr = len(g_list)
        parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
            init_worker=_init_worker_list, glbv=(g1, g_list), method='imap_unordered',
            n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)

        return kernel_list

    def _wrapper_kernel_list_do_exp(self, itr):
        return itr, self._kernel_do_exp(G_g1, G_g_list[itr], self._weight)

    def _wrapper_kernel_list_do_geo(self, itr):
        return itr, self._kernel_do_geo(G_g1, G_g_list[itr], self._weight)

    def _compute_single_kernel_series(self, g1, g2):
        self._check_graphs([g1] + [g2])
        self._add_dummy_labels([g1] + [g2])
        if not self._ds_infos['directed']:  # convert
            g1 = g1.to_directed()
            g2 = g2.to_directed()

        # direct product graph method - exponential
        if self._compute_method == 'exp':
            kernel = self._kernel_do_exp(g1, g2, self._weight)
        # direct product graph method - geometric
        elif self._compute_method == 'geo':
            kernel = self._kernel_do_geo(g1, g2, self._weight)

        return kernel

    def _kernel_do_exp(self, g1, g2, beta):
        """Compute common walk graph kernel between 2 graphs using exponential
        series.

        Parameters
        ----------
        g1, g2 : NetworkX graphs
            Graphs between which the kernels are computed.
        beta : integer
            Weight.

        Return
        ------
        kernel : float

@@ -200,9 +200,9 @@ class CommonWalk(GraphKernel):
        if nx.number_of_nodes(gp) < 2:
            return 0

        A = nx.adjacency_matrix(gp).todense()
        ew, ev = np.linalg.eig(A)

#       # remove imaginary part if possible.
#       # @todo: don't know if it is necessary.
#       for i in range(len(ew)):
#           if np.abs(ew[i].imag) < 1e-9:

@@ -220,27 +220,27 @@ class CommonWalk(GraphKernel):
        kernel = exp_D.sum()
        if (kernel.real == 0 and np.abs(kernel.imag) < 1e-9) or np.abs(kernel.imag / kernel.real) < 1e-9:
            kernel = kernel.real

        return kernel

    def _wrapper_kernel_do_exp(self, itr):
        i = itr[0]
        j = itr[1]
        return i, j, self._kernel_do_exp(G_gn[i], G_gn[j], self._weight)

    def _kernel_do_geo(self, g1, g2, gamma):
        """Compute common walk graph kernel between 2 graphs using geometric
        series.

        Parameters
        ----------
        g1, g2 : NetworkX graphs
            Graphs between which the kernels are computed.
        gamma : integer
            Weight.

        Return
        ------
        kernel : float

@@ -258,19 +258,19 @@ class CommonWalk(GraphKernel):
#       except np.linalg.LinAlgError:
#           return np.nan

    def _wrapper_kernel_do_geo(self, itr):
        i = itr[0]
        j = itr[1]
        return i, j, self._kernel_do_geo(G_gn[i], G_gn[j], self._weight)

    def _check_graphs(self, Gn):
        for g in Gn:
            if nx.number_of_nodes(g) == 1:
                raise Exception('Graphs must contain more than 1 nodes to construct adjacency matrices.')

    def _add_dummy_labels(self, Gn):
        if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY):
            for i in range(len(Gn)):

@@ -280,13 +280,13 @@ class CommonWalk(GraphKernel):
        for i in range(len(Gn)):
            nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY)
        self._edge_labels = [SpecialLabel.DUMMY]


def _init_worker_gm(gn_toshare):
    global G_gn
    G_gn = gn_toshare


def _init_worker_list(g1_toshare, g_list_toshare):
    global G_g1, G_g_list
    G_g1 = g1_toshare
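
For context, _kernel_do_exp above evaluates the exponential-series common walk kernel of [1]: the sum of the entries of exp(beta * Ax), where Ax is the adjacency matrix of the direct product graph. A self-contained NumPy sketch of the eigendecomposition step; it mirrors, but is not, the library code, and assumes a diagonalizable matrix:

import numpy as np

def exp_series_kernel(A, beta):
    """Sum the entries of exp(beta * A) via eigendecomposition."""
    ew, ev = np.linalg.eig(A)  # spectrum of A
    # exp(beta * A) = V diag(exp(beta * w)) V^{-1} for diagonalizable A.
    exp_A = ev @ np.diag(np.exp(beta * ew)) @ np.linalg.inv(ev)
    return exp_A.sum().real   # drop the numerically tiny imaginary residue

# Toy direct-product adjacency matrix; values are illustrative only.
print(exp_series_kernel(np.array([[0.0, 1.0], [1.0, 0.0]]), 1.0))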

gklearn/kernels/conjugate_gradient.py (+75 -93)

@@ -5,13 +5,13 @@ Created on Thu Aug 20 16:09:51 2020

@author: ljia

@references:

[1] S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, 11(Apr):1201–1242, 2010.
"""

import sys
-from tqdm import tqdm
+from gklearn.utils import get_iters
import numpy as np
import networkx as nx
from scipy.sparse import identity

@@ -22,8 +22,8 @@ from gklearn.utils.utils import compute_vertex_kernels


class ConjugateGradient(RandomWalkMeta):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._node_kernels = kwargs.get('node_kernels', None)

@@ -32,33 +32,28 @@ class ConjugateGradient(RandomWalkMeta):
        self._edge_labels = kwargs.get('edge_labels', [])
        self._node_attrs = kwargs.get('node_attrs', [])
        self._edge_attrs = kwargs.get('edge_attrs', [])

    def _compute_gm_series(self):
        self._check_edge_weight(self._graphs, self._verbose)
        self._check_graphs(self._graphs)

        lmda = self._weight

        # Compute Gram matrix.
        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

        # Reindex nodes using consecutive integers for the convenience of kernel computation.
-       if self._verbose >= 2:
-           iterator = tqdm(self._graphs, desc='Reindex vertices', file=sys.stdout)
-       else:
-           iterator = self._graphs
+       iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2))
        self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator]

        if self._p is None and self._q is None:  # p and q are uniform distributions as default.
            from itertools import combinations_with_replacement
            itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
-           if self._verbose >= 2:
-               iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout)
-           else:
-               iterator = itr
+           len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
+           iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2))

            for i, j in iterator:
                kernel = self._kernel_do(self._graphs[i], self._graphs[j], lmda)
                gram_matrix[i][j] = kernel

@@ -66,92 +61,79 @@ class ConjugateGradient(RandomWalkMeta):

        else:  # @todo
            pass

        return gram_matrix

    def _compute_gm_imap_unordered(self):
        self._check_edge_weight(self._graphs, self._verbose)
        self._check_graphs(self._graphs)

        # Compute Gram matrix.
        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

        # @todo: parallel this.
        # Reindex nodes using consecutive integers for the convenience of kernel computation.
-       if self._verbose >= 2:
-           iterator = tqdm(self._graphs, desc='Reindex vertices', file=sys.stdout)
-       else:
-           iterator = self._graphs
+       iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2))
        self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator]

        if self._p is None and self._q is None:  # p and q are uniform distributions as default.

            def init_worker(gn_toshare):
                global G_gn
                G_gn = gn_toshare

            do_fun = self._wrapper_kernel_do
            parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
                glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)

        else:  # @todo
            pass

        return gram_matrix

    def _compute_kernel_list_series(self, g1, g_list):
        self._check_edge_weight(g_list + [g1], self._verbose)
        self._check_graphs(g_list + [g1])

        lmda = self._weight

        # compute kernel list.
        kernel_list = [None] * len(g_list)

        # Reindex nodes using consecutive integers for the convenience of kernel computation.
        g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal')
-       if self._verbose >= 2:
-           iterator = tqdm(g_list, desc='Reindex vertices', file=sys.stdout)
-       else:
-           iterator = g_list
+       iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2))
        g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator]

        if self._p is None and self._q is None:  # p and q are uniform distributions as default.
-           if self._verbose >= 2:
-               iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout)
-           else:
-               iterator = range(len(g_list))
+           iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))

            for i in iterator:
                kernel = self._kernel_do(g1, g_list[i], lmda)
                kernel_list[i] = kernel

        else:  # @todo
            pass

        return kernel_list

    def _compute_kernel_list_imap_unordered(self, g1, g_list):
        self._check_edge_weight(g_list + [g1], self._verbose)
        self._check_graphs(g_list + [g1])

        # compute kernel list.
        kernel_list = [None] * len(g_list)

        # Reindex nodes using consecutive integers for the convenience of kernel computation.
        g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal')
        # @todo: parallel this.
-       if self._verbose >= 2:
-           iterator = tqdm(g_list, desc='Reindex vertices', file=sys.stdout)
-       else:
-           iterator = g_list
+       iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2))
        g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator]

        if self._p is None and self._q is None:  # p and q are uniform distributions as default.

            def init_worker(g1_toshare, g_list_toshare):

@@ -159,56 +141,56 @@ class ConjugateGradient(RandomWalkMeta):
                G_g1 = g1_toshare
                G_g_list = g_list_toshare

            do_fun = self._wrapper_kernel_list_do

            def func_assign(result, var_to_assign):
                var_to_assign[result[0]] = result[1]
            itr = range(len(g_list))
            len_itr = len(g_list)
            parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
                init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered',
                n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)

        else:  # @todo
            pass

        return kernel_list

    def _wrapper_kernel_list_do(self, itr):
        return itr, self._kernel_do(G_g1, G_g_list[itr], self._weight)

    def _compute_single_kernel_series(self, g1, g2):
        self._check_edge_weight([g1] + [g2], self._verbose)
        self._check_graphs([g1] + [g2])

        lmda = self._weight

        # Reindex nodes using consecutive integers for the convenience of kernel computation.
        g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal')
        g2 = nx.convert_node_labels_to_integers(g2, first_label=0, label_attribute='label_orignal')

        if self._p is None and self._q is None:  # p and q are uniform distributions as default.
            kernel = self._kernel_do(g1, g2, lmda)

        else:  # @todo
            pass

        return kernel

    def _kernel_do(self, g1, g2, lmda):
        # Frist, compute kernels between all pairs of nodes using the method borrowed
        # from FCSP. It is faster than directly computing all edge kernels
        # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the
        # graphs compared, which is the most case we went though. For very
        # sparse graphs, this would be slow.
        vk_dict = self._compute_vertex_kernels(g1, g2)

        # Compute the weight matrix of the direct product graph.
        w_times, w_dim = self._compute_weight_matrix(g1, g2, vk_dict)
        # use uniform distribution if there is no prior knowledge.
        p_times_uni = 1 / w_dim
        A = identity(w_times.shape[0]) - w_times * lmda

@@ -217,27 +199,27 @@ class ConjugateGradient(RandomWalkMeta):
        # use uniform distribution if there is no prior knowledge.
        q_times = np.full((1, w_dim), p_times_uni)
        return np.dot(q_times, x)

    def _wrapper_kernel_do(self, itr):
        i = itr[0]
        j = itr[1]
        return i, j, self._kernel_do(G_gn[i], G_gn[j], self._weight)

    def _func_fp(x, p_times, lmda, w_times):
        haha = w_times * x
        haha = lmda * haha
        haha = p_times + haha
        return p_times + lmda * np.dot(w_times, x)

    def _compute_vertex_kernels(self, g1, g2):
        """Compute vertex kernels between vertices of two graphs.
        """
        return compute_vertex_kernels(g1, g2, self._node_kernels, node_labels=self._node_labels, node_attrs=self._node_attrs)

    # @todo: move if out to make it faster.
    # @todo: node/edge kernels use direct function rather than dicts.
    def _compute_weight_matrix(self, g1, g2, vk_dict):

@@ -250,20 +232,20 @@ class ConjugateGradient(RandomWalkMeta):
            e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
            e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
            return ke(e1_labels, e2_labels, e1_attrs, e2_attrs)

        def compute_ek_10(e1, e2, ke):
            e1_labels = [e1[2][el] for el in self._edge_labels]
            e2_labels = [e2[2][el] for el in self._edge_labels]
            return ke(e1_labels, e2_labels)

        def compute_ek_01(e1, e2, ke):
            e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
            e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
            return ke(e1_attrs, e2_attrs)

        def compute_ek_00(e1, e2, ke):
            return 1

        # Select the proper edge kernel.
        if len(self._edge_labels) > 0:
            # edge symb and non-synb labeled

@@ -283,11 +265,11 @@ class ConjugateGradient(RandomWalkMeta):
        else:
            ke = None
            ek_temp = compute_ek_00  # @todo: check how much slower is this.

        # Compute the weight matrix.
        w_dim = nx.number_of_nodes(g1) * nx.number_of_nodes(g2)
        w_times = np.zeros((w_dim, w_dim))
        if vk_dict:  # node labeled
            if self._ds_infos['directed']:
                for e1 in g1.edges(data=True):
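
_kernel_do above builds the system matrix A = I - lmda * W and, in lines elided from this hunk, solves A x = p before returning q . x. A hedged sketch of that solve; given the class name, scipy.sparse.linalg.cg is presumably what runs on the elided lines, but this exact call is an assumption:

import numpy as np
from scipy.sparse.linalg import cg

def solve_walk_series(w_times, lmda):
    """Solve (I - lmda * W) x = p and return q . x for uniform p and q."""
    w_dim = w_times.shape[0]
    p_times = np.full(w_dim, 1 / w_dim)       # uniform starting distribution
    q_times = np.full((1, w_dim), 1 / w_dim)  # uniform stopping distribution
    A = np.eye(w_dim) - w_times * lmda        # dense stand-in for identity(...) - w_times * lmda
    x, info = cg(A, p_times, atol=1e-8)       # conjugate-gradient solve; info == 0 on success
    return np.dot(q_times, x)

# Toy weight matrix; values are illustrative only.
print(solve_walk_series(np.array([[0.0, 0.5], [0.5, 0.0]]), 0.1))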


gklearn/kernels/fixed_point.py (+76 -93)

@@ -5,13 +5,13 @@ Created on Thu Aug 20 16:09:51 2020

@author: ljia

@references:

[1] S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, 11(Apr):1201–1242, 2010.
"""

import sys
-from tqdm import tqdm
+from gklearn.utils import get_iters
import numpy as np
import networkx as nx
from scipy import optimize

@@ -22,8 +22,8 @@ from gklearn.utils.utils import compute_vertex_kernels


class FixedPoint(RandomWalkMeta):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._node_kernels = kwargs.get('node_kernels', None)

@@ -32,33 +32,28 @@ class FixedPoint(RandomWalkMeta):
        self._edge_labels = kwargs.get('edge_labels', [])
        self._node_attrs = kwargs.get('node_attrs', [])
        self._edge_attrs = kwargs.get('edge_attrs', [])

    def _compute_gm_series(self):
        self._check_edge_weight(self._graphs, self._verbose)
        self._check_graphs(self._graphs)

        lmda = self._weight

        # Compute Gram matrix.
        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

        # Reindex nodes using consecutive integers for the convenience of kernel computation.
-       if self._verbose >= 2:
-           iterator = tqdm(self._graphs, desc='Reindex vertices', file=sys.stdout)
-       else:
-           iterator = self._graphs
+       iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2))
        self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator]

        if self._p is None and self._q is None:  # p and q are uniform distributions as default.
            from itertools import combinations_with_replacement
            itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
-           if self._verbose >= 2:
-               iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout)
-           else:
-               iterator = itr
+           len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
+           iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2))

            for i, j in iterator:
                kernel = self._kernel_do(self._graphs[i], self._graphs[j], lmda)
                gram_matrix[i][j] = kernel

@@ -66,92 +61,80 @@ class FixedPoint(RandomWalkMeta):

        else:  # @todo
            pass

        return gram_matrix

    def _compute_gm_imap_unordered(self):
        self._check_edge_weight(self._graphs, self._verbose)
        self._check_graphs(self._graphs)

        # Compute Gram matrix.
        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

        # @todo: parallel this.
        # Reindex nodes using consecutive integers for the convenience of kernel computation.
-       if self._verbose >= 2:
-           iterator = tqdm(self._graphs, desc='Reindex vertices', file=sys.stdout)
-       else:
-           iterator = self._graphs
+       iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2))
        self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator]

        if self._p is None and self._q is None:  # p and q are uniform distributions as default.

            def init_worker(gn_toshare):
                global G_gn
                G_gn = gn_toshare

            do_fun = self._wrapper_kernel_do
            parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
                glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)

        else:  # @todo
            pass

        return gram_matrix

    def _compute_kernel_list_series(self, g1, g_list):
        self._check_edge_weight(g_list + [g1], self._verbose)
        self._check_graphs(g_list + [g1])

        lmda = self._weight

        # compute kernel list.
        kernel_list = [None] * len(g_list)

        # Reindex nodes using consecutive integers for the convenience of kernel computation.
        g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal')
-       if self._verbose >= 2:
-           iterator = tqdm(g_list, desc='Reindex vertices', file=sys.stdout)
-       else:
-           iterator = g_list
+       iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2))
        g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator]

        if self._p is None and self._q is None:  # p and q are uniform distributions as default.
-           if self._verbose >= 2:
-               iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout)
-           else:
-               iterator = range(len(g_list))
+           iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))

            for i in iterator:
                kernel = self._kernel_do(g1, g_list[i], lmda)
                kernel_list[i] = kernel

        else:  # @todo
            pass

        return kernel_list

    def _compute_kernel_list_imap_unordered(self, g1, g_list):
        self._check_edge_weight(g_list + [g1], self._verbose)
        self._check_graphs(g_list + [g1])

        # compute kernel list.
        kernel_list = [None] * len(g_list)

        # Reindex nodes using consecutive integers for the convenience of kernel computation.
        g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal')
        # @todo: parallel this.
-       if self._verbose >= 2:
-           iterator = tqdm(g_list, desc='Reindex vertices', file=sys.stdout)
-       else:
-           iterator = g_list
+       iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2))
        g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator]

        if self._p is None and self._q is None:  # p and q are uniform distributions as default.

            def init_worker(g1_toshare, g_list_toshare):

@@ -159,56 +142,56 @@ class FixedPoint(RandomWalkMeta):
                G_g1 = g1_toshare
                G_g_list = g_list_toshare

            do_fun = self._wrapper_kernel_list_do

            def func_assign(result, var_to_assign):
                var_to_assign[result[0]] = result[1]
            itr = range(len(g_list))
            len_itr = len(g_list)
            parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
                init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered',
                n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)

        else:  # @todo
            pass

        return kernel_list

    def _wrapper_kernel_list_do(self, itr):
        return itr, self._kernel_do(G_g1, G_g_list[itr], self._weight)

    def _compute_single_kernel_series(self, g1, g2):
        self._check_edge_weight([g1] + [g2], self._verbose)
        self._check_graphs([g1] + [g2])

        lmda = self._weight

        # Reindex nodes using consecutive integers for the convenience of kernel computation.
        g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal')
        g2 = nx.convert_node_labels_to_integers(g2, first_label=0, label_attribute='label_orignal')

        if self._p is None and self._q is None:  # p and q are uniform distributions as default.
            kernel = self._kernel_do(g1, g2, lmda)

        else:  # @todo
            pass

        return kernel

    def _kernel_do(self, g1, g2, lmda):
        # Frist, compute kernels between all pairs of nodes using the method borrowed
        # from FCSP. It is faster than directly computing all edge kernels
        # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the
        # graphs compared, which is the most case we went though. For very
        # sparse graphs, this would be slow.
        vk_dict = self._compute_vertex_kernels(g1, g2)

        # Compute the weight matrix of the direct product graph.
        w_times, w_dim = self._compute_weight_matrix(g1, g2, vk_dict)
        # use uniform distribution if there is no prior knowledge.
        p_times_uni = 1 / w_dim
        p_times = np.full((w_dim, 1), p_times_uni)

@@ -216,27 +199,27 @@ class FixedPoint(RandomWalkMeta):
        # use uniform distribution if there is no prior knowledge.
        q_times = np.full((1, w_dim), p_times_uni)
        return np.dot(q_times, x)

    def _wrapper_kernel_do(self, itr):
        i = itr[0]
        j = itr[1]
        return i, j, self._kernel_do(G_gn[i], G_gn[j], self._weight)

    def _func_fp(self, x, p_times, lmda, w_times):
        haha = w_times * x
        haha = lmda * haha
        haha = p_times + haha
        return p_times + lmda * np.dot(w_times, x)

    def _compute_vertex_kernels(self, g1, g2):
        """Compute vertex kernels between vertices of two graphs.
        """
        return compute_vertex_kernels(g1, g2, self._node_kernels, node_labels=self._node_labels, node_attrs=self._node_attrs)

    # @todo: move if out to make it faster.
    # @todo: node/edge kernels use direct function rather than dicts.
    def _compute_weight_matrix(self, g1, g2, vk_dict):

@@ -249,20 +232,20 @@ class FixedPoint(RandomWalkMeta):
            e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
            e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
            return ke(e1_labels, e2_labels, e1_attrs, e2_attrs)

        def compute_ek_10(e1, e2, ke):
            e1_labels = [e1[2][el] for el in self._edge_labels]
            e2_labels = [e2[2][el] for el in self._edge_labels]
            return ke(e1_labels, e2_labels)

        def compute_ek_01(e1, e2, ke):
            e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
            e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
            return ke(e1_attrs, e2_attrs)

        def compute_ek_00(e1, e2, ke):
            return 1

        # Select the proper edge kernel.
        if len(self._edge_labels) > 0:
            # edge symb and non-synb labeled

@@ -282,11 +265,11 @@ class FixedPoint(RandomWalkMeta):
        else:
            ke = None
            ek_temp = compute_ek_00  # @todo: check how much slower is this.

        # Compute the weight matrix.
        w_dim = nx.number_of_nodes(g1) * nx.number_of_nodes(g2)
        w_times = np.zeros((w_dim, w_dim))
        if vk_dict:  # node labeled
            if self._ds_infos['directed']:
                for e1 in g1.edges(data=True):
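
_func_fp above is one step of the map x -> p + lmda * W x, whose fixed point the kernel needs; the file imports scipy's optimize module, so scipy.optimize.fixed_point presumably drives the iteration in the elided lines. A small self-contained illustration; the toy values and the driver call are assumptions, not the library's exact usage:

import numpy as np
from scipy import optimize

def func_fp(x, p_times, lmda, w_times):
    # One iteration of the walk recursion: x <- p + lmda * W x.
    return p_times + lmda * np.dot(w_times, x)

# Toy 2-node direct product graph; all numbers are illustrative only.
w_times = np.array([[0.0, 0.5], [0.5, 0.0]])
p_times = np.full(2, 0.5)
x = optimize.fixed_point(func_fp, p_times, args=(p_times, 0.1, w_times))
kernel = np.dot(np.full((1, 2), 0.5), x)  # q . x with a uniform stopping distribution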


+ 68
- 82
gklearn/kernels/marginalized.py View File

@@ -7,19 +7,19 @@ Created on Wed Jun 3 22:22:57 2020


@references: @references:


[1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between
labeled graphs. In Proceedings of the 20th International Conference on
[1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between
labeled graphs. In Proceedings of the 20th International Conference on
Machine Learning, Washington, DC, United States, 2003. Machine Learning, Washington, DC, United States, 2003.


[2] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and
Jean-Philippe Vert. Extensions of marginalized graph kernels. In
Proceedings of the twenty-first international conference on Machine
[2] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and
Jean-Philippe Vert. Extensions of marginalized graph kernels. In
Proceedings of the twenty-first international conference on Machine
learning, page 70. ACM, 2004. learning, page 70. ACM, 2004.
""" """


import sys import sys
from multiprocessing import Pool from multiprocessing import Pool
from tqdm import tqdm
from gklearn.utils import get_iters
import numpy as np import numpy as np
import networkx as nx import networkx as nx
from gklearn.utils import SpecialLabel from gklearn.utils import SpecialLabel
@@ -30,7 +30,7 @@ from gklearn.kernels import GraphKernel




class Marginalized(GraphKernel): class Marginalized(GraphKernel):
def __init__(self, **kwargs): def __init__(self, **kwargs):
GraphKernel.__init__(self) GraphKernel.__init__(self)
self._node_labels = kwargs.get('node_labels', []) self._node_labels = kwargs.get('node_labels', [])
@@ -44,35 +44,31 @@ class Marginalized(GraphKernel):


def _compute_gm_series(self): def _compute_gm_series(self):
self._add_dummy_labels(self._graphs) self._add_dummy_labels(self._graphs)
if self._remove_totters: if self._remove_totters:
if self._verbose >= 2:
iterator = tqdm(self._graphs, desc='removing tottering', file=sys.stdout)
else:
iterator = self._graphs
iterator = get_iters(self._graphs, desc='removing tottering', file=sys.stdout, verbose=(self._verbose >= 2))
# @todo: this may not work. # @todo: this may not work.
self._graphs = [untotterTransformation(G, self._node_labels, self._edge_labels) for G in iterator] self._graphs = [untotterTransformation(G, self._node_labels, self._edge_labels) for G in iterator]

# compute Gram matrix. # compute Gram matrix.
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
from itertools import combinations_with_replacement from itertools import combinations_with_replacement
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
if self._verbose >= 2:
iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout)
else:
iterator = itr
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout,
length=len_itr, verbose=(self._verbose >= 2))
for i, j in iterator: for i, j in iterator:
kernel = self._kernel_do(self._graphs[i], self._graphs[j]) kernel = self._kernel_do(self._graphs[i], self._graphs[j])
gram_matrix[i][j] = kernel gram_matrix[i][j] = kernel
gram_matrix[j][i] = kernel # @todo: no directed graph considered? gram_matrix[j][i] = kernel # @todo: no directed graph considered?
return gram_matrix return gram_matrix
def _compute_gm_imap_unordered(self): def _compute_gm_imap_unordered(self):
self._add_dummy_labels(self._graphs) self._add_dummy_labels(self._graphs)
if self._remove_totters: if self._remove_totters:
pool = Pool(self._n_jobs) pool = Pool(self._n_jobs)
itr = range(0, len(self._graphs)) itr = range(0, len(self._graphs))
@@ -81,57 +77,49 @@ class Marginalized(GraphKernel):
else: else:
chunksize = 100 chunksize = 100
remove_fun = self._wrapper_untotter remove_fun = self._wrapper_untotter
if self._verbose >= 2:
iterator = tqdm(pool.imap_unordered(remove_fun, itr, chunksize),
desc='removing tottering', file=sys.stdout)
else:
iterator = pool.imap_unordered(remove_fun, itr, chunksize)
iterator = get_iters(pool.imap_unordered(remove_fun, itr, chunksize),
desc='removing tottering', file=sys.stdout,
length=len(self._graphs), verbose=(self._verbose >= 2))
for i, g in iterator: for i, g in iterator:
self._graphs[i] = g self._graphs[i] = g
pool.close() pool.close()
pool.join() pool.join()
# compute Gram matrix. # compute Gram matrix.
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
def init_worker(gn_toshare): def init_worker(gn_toshare):
global G_gn global G_gn
G_gn = gn_toshare G_gn = gn_toshare
do_fun = self._wrapper_kernel_do do_fun = self._wrapper_kernel_do
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)
return gram_matrix return gram_matrix
def _compute_kernel_list_series(self, g1, g_list): def _compute_kernel_list_series(self, g1, g_list):
self._add_dummy_labels(g_list + [g1]) self._add_dummy_labels(g_list + [g1])
if self._remove_totters: if self._remove_totters:
g1 = untotterTransformation(g1, self._node_labels, self._edge_labels) # @todo: this may not work. g1 = untotterTransformation(g1, self._node_labels, self._edge_labels) # @todo: this may not work.
if self._verbose >= 2:
iterator = tqdm(g_list, desc='removing tottering', file=sys.stdout)
else:
iterator = g_list
iterator = get_iters(g_list, desc='removing tottering', file=sys.stdout, verbose=(self._verbose >= 2))
# @todo: this may not work. # @todo: this may not work.
g_list = [untotterTransformation(G, self._node_labels, self._edge_labels) for G in iterator] g_list = [untotterTransformation(G, self._node_labels, self._edge_labels) for G in iterator]

# compute kernel list. # compute kernel list.
kernel_list = [None] * len(g_list) kernel_list = [None] * len(g_list)
if self._verbose >= 2:
iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout)
else:
iterator = range(len(g_list))
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))
for i in iterator: for i in iterator:
kernel = self._kernel_do(g1, g_list[i]) kernel = self._kernel_do(g1, g_list[i])
kernel_list[i] = kernel kernel_list[i] = kernel
return kernel_list return kernel_list
def _compute_kernel_list_imap_unordered(self, g1, g_list): def _compute_kernel_list_imap_unordered(self, g1, g_list):
self._add_dummy_labels(g_list + [g1]) self._add_dummy_labels(g_list + [g1])
if self._remove_totters: if self._remove_totters:
g1 = untotterTransformation(g1, self._node_labels, self._edge_labels) # @todo: this may not work. g1 = untotterTransformation(g1, self._node_labels, self._edge_labels) # @todo: this may not work.
pool = Pool(self._n_jobs) pool = Pool(self._n_jobs)
@@ -141,16 +129,14 @@ class Marginalized(GraphKernel):
else: else:
chunksize = 100 chunksize = 100
remove_fun = self._wrapper_untotter remove_fun = self._wrapper_untotter
if self._verbose >= 2:
iterator = tqdm(pool.imap_unordered(remove_fun, itr, chunksize),
desc='removing tottering', file=sys.stdout)
else:
iterator = pool.imap_unordered(remove_fun, itr, chunksize)
iterator = get_iters(pool.imap_unordered(remove_fun, itr, chunksize),
desc='removing tottering', file=sys.stdout,
length=len(g_list), verbose=(self._verbose >= 2))
for i, g in iterator: for i, g in iterator:
g_list[i] = g g_list[i] = g
pool.close() pool.close()
pool.join() pool.join()
# compute kernel list. # compute kernel list.
kernel_list = [None] * len(g_list) kernel_list = [None] * len(g_list)


@@ -159,38 +145,38 @@ class Marginalized(GraphKernel):
G_g1 = g1_toshare G_g1 = g1_toshare
G_g_list = g_list_toshare G_g_list = g_list_toshare
do_fun = self._wrapper_kernel_list_do do_fun = self._wrapper_kernel_list_do
def func_assign(result, var_to_assign):
def func_assign(result, var_to_assign):
var_to_assign[result[0]] = result[1] var_to_assign[result[0]] = result[1]
itr = range(len(g_list)) itr = range(len(g_list))
len_itr = len(g_list) len_itr = len(g_list)
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered',
init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered',
n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)
return kernel_list return kernel_list
def _wrapper_kernel_list_do(self, itr): def _wrapper_kernel_list_do(self, itr):
return itr, self._kernel_do(G_g1, G_g_list[itr]) return itr, self._kernel_do(G_g1, G_g_list[itr])
def _compute_single_kernel_series(self, g1, g2): def _compute_single_kernel_series(self, g1, g2):
self._add_dummy_labels([g1] + [g2]) self._add_dummy_labels([g1] + [g2])
if self._remove_totters: if self._remove_totters:
g1 = untotterTransformation(g1, self._node_labels, self._edge_labels) # @todo: this may not work. g1 = untotterTransformation(g1, self._node_labels, self._edge_labels) # @todo: this may not work.
g2 = untotterTransformation(g2, self._node_labels, self._edge_labels) g2 = untotterTransformation(g2, self._node_labels, self._edge_labels)
kernel = self._kernel_do(g1, g2) kernel = self._kernel_do(g1, g2)
return kernel
return kernel
def _kernel_do(self, g1, g2): def _kernel_do(self, g1, g2):
"""Compute marginalized graph kernel between 2 graphs. """Compute marginalized graph kernel between 2 graphs.
Parameters Parameters
---------- ----------
g1, g2 : NetworkX graphs g1, g2 : NetworkX graphs
2 graphs between which the kernel is computed. 2 graphs between which the kernel is computed.
Return Return
------ ------
kernel : float kernel : float
@@ -204,10 +190,10 @@ class Marginalized(GraphKernel):
# (uniform distribution over |G|) # (uniform distribution over |G|)
p_init_G1 = 1 / num_nodes_G1 p_init_G1 = 1 / num_nodes_G1
p_init_G2 = 1 / num_nodes_G2 p_init_G2 = 1 / num_nodes_G2
q = self._p_quit * self._p_quit q = self._p_quit * self._p_quit
r1 = q r1 = q
# # initial R_inf # # initial R_inf
# # matrix to save all the R_inf for all pairs of nodes # # matrix to save all the R_inf for all pairs of nodes
# R_inf = np.zeros([num_nodes_G1, num_nodes_G2]) # R_inf = np.zeros([num_nodes_G1, num_nodes_G2])
@@ -229,7 +215,7 @@ class Marginalized(GraphKernel):
# neighbor_n2 = g2[node2[0]] # neighbor_n2 = g2[node2[0]]
# if len(neighbor_n2) > 0: # if len(neighbor_n2) > 0:
# p_trans_n2 = (1 - p_quit) / len(neighbor_n2) # p_trans_n2 = (1 - p_quit) / len(neighbor_n2)
#
#
# for neighbor1 in neighbor_n1: # for neighbor1 in neighbor_n1:
# for neighbor2 in neighbor_n2: # for neighbor2 in neighbor_n2:
# t = p_trans_n1 * p_trans_n2 * \ # t = p_trans_n1 * p_trans_n2 * \
@@ -238,7 +224,7 @@ class Marginalized(GraphKernel):
# deltakernel( # deltakernel(
# neighbor_n1[neighbor1][edge_label], # neighbor_n1[neighbor1][edge_label],
# neighbor_n2[neighbor2][edge_label]) # neighbor_n2[neighbor2][edge_label])
#
#
# R_inf_new[node1[0]][node2[0]] += t * R_inf[neighbor1][ # R_inf_new[node1[0]][node2[0]] += t * R_inf[neighbor1][
# neighbor2] # ref [1] equation (8) # neighbor2] # ref [1] equation (8)
# R_inf[:] = R_inf_new # R_inf[:] = R_inf_new
@@ -249,8 +235,8 @@ class Marginalized(GraphKernel):
# s = p_init_G1 * p_init_G2 * deltakernel( # s = p_init_G1 * p_init_G2 * deltakernel(
# node1[1][node_label], node2[1][node_label]) # node1[1][node_label], node2[1][node_label])
# kernel += s * R_inf[node1[0]][node2[0]] # ref [1] equation (6) # kernel += s * R_inf[node1[0]][node2[0]] # ref [1] equation (6)
R_inf = {} # dict to save all the R_inf for all pairs of nodes R_inf = {} # dict to save all the R_inf for all pairs of nodes
# initial R_inf, the 1st iteration. # initial R_inf, the 1st iteration.
for node1 in g1.nodes(): for node1 in g1.nodes():
@@ -266,7 +252,7 @@ class Marginalized(GraphKernel):
R_inf[(node1, node2)] = self._p_quit R_inf[(node1, node2)] = self._p_quit
else: else:
R_inf[(node1, node2)] = 1 R_inf[(node1, node2)] = 1
# compute all transition probability first. # compute all transition probability first.
t_dict = {} t_dict = {}
if self._n_iteration > 1: if self._n_iteration > 1:
@@ -287,11 +273,11 @@ class Marginalized(GraphKernel):
p_trans_n1 * p_trans_n2 * \ p_trans_n1 * p_trans_n2 * \
deltakernel(tuple(g1.nodes[neighbor1][nl] for nl in self._node_labels), tuple(g2.nodes[neighbor2][nl] for nl in self._node_labels)) * \ deltakernel(tuple(g1.nodes[neighbor1][nl] for nl in self._node_labels), tuple(g2.nodes[neighbor2][nl] for nl in self._node_labels)) * \
deltakernel(tuple(neighbor_n1[neighbor1][el] for el in self._edge_labels), tuple(neighbor_n2[neighbor2][el] for el in self._edge_labels)) deltakernel(tuple(neighbor_n1[neighbor1][el] for el in self._edge_labels), tuple(neighbor_n2[neighbor2][el] for el in self._edge_labels))
# Compute R_inf with a simple iterative method # Compute R_inf with a simple iterative method
for i in range(2, self._n_iteration + 1): for i in range(2, self._n_iteration + 1):
R_inf_old = R_inf.copy() R_inf_old = R_inf.copy()
# Compute R_inf for each pair of nodes # Compute R_inf for each pair of nodes
for node1 in g1.nodes(): for node1 in g1.nodes():
neighbor_n1 = g1[node1] neighbor_n1 = g1[node1]
@@ -301,32 +287,32 @@ class Marginalized(GraphKernel):
if len(neighbor_n1) > 0: if len(neighbor_n1) > 0:
for node2 in g2.nodes(): for node2 in g2.nodes():
neighbor_n2 = g2[node2] neighbor_n2 = g2[node2]
if len(neighbor_n2) > 0:
if len(neighbor_n2) > 0:
R_inf[(node1, node2)] = r1 R_inf[(node1, node2)] = r1
for neighbor1 in neighbor_n1: for neighbor1 in neighbor_n1:
for neighbor2 in neighbor_n2: for neighbor2 in neighbor_n2:
R_inf[(node1, node2)] += \ R_inf[(node1, node2)] += \
(t_dict[(node1, node2, neighbor1, neighbor2)] * \ (t_dict[(node1, node2, neighbor1, neighbor2)] * \
R_inf_old[(neighbor1, neighbor2)]) # ref [1] equation (8) R_inf_old[(neighbor1, neighbor2)]) # ref [1] equation (8)
# add elements of R_inf up and compute kernel. # add elements of R_inf up and compute kernel.
for (n1, n2), value in R_inf.items(): for (n1, n2), value in R_inf.items():
s = p_init_G1 * p_init_G2 * deltakernel(tuple(g1.nodes[n1][nl] for nl in self._node_labels), tuple(g2.nodes[n2][nl] for nl in self._node_labels)) s = p_init_G1 * p_init_G2 * deltakernel(tuple(g1.nodes[n1][nl] for nl in self._node_labels), tuple(g2.nodes[n2][nl] for nl in self._node_labels))
kernel += s * value # ref [1] equation (6) kernel += s * value # ref [1] equation (6)
return kernel return kernel
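
A condensed, self-contained sketch of the fixed-point iteration above, assuming a single node label, a single edge label and uniform initial distributions; the R_inf initialization is simplified, so treat this as an illustration of ref [1] equations (6) and (8) rather than the library's exact method:

def marginalized_kernel_sketch(g1, g2, p_quit=0.5, n_iteration=20,
                               node_label='atom', edge_label='bond_type'):
    p_init_1, p_init_2 = 1 / g1.number_of_nodes(), 1 / g2.number_of_nodes()
    r1 = p_quit * p_quit
    # 1st iteration of R_inf (simplified initialization).
    R = {(u, v): (p_quit if (len(g1[u]) > 0 and len(g2[v]) > 0) else 1)
         for u in g1 for v in g2}
    for _ in range(2, n_iteration + 1):
        R_old = R.copy()
        for u in g1:
            for v in g2:
                if len(g1[u]) > 0 and len(g2[v]) > 0:
                    R[(u, v)] = r1
                    t0 = (1 - p_quit) / len(g1[u]) * (1 - p_quit) / len(g2[v])
                    for x in g1[u]:
                        for y in g2[v]:
                            # delta kernels on neighbor node labels and edge
                            # labels, ref [1] equation (8).
                            if (g1.nodes[x][node_label] == g2.nodes[y][node_label]
                                    and g1[u][x][edge_label] == g2[v][y][edge_label]):
                                R[(u, v)] += t0 * R_old[(x, y)]
    kernel = 0
    for (u, v), value in R.items():
        if g1.nodes[u][node_label] == g2.nodes[v][node_label]:
            kernel += p_init_1 * p_init_2 * value  # ref [1] equation (6)
    return kernel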
def _wrapper_kernel_do(self, itr): def _wrapper_kernel_do(self, itr):
i = itr[0] i = itr[0]
j = itr[1] j = itr[1]
return i, j, self._kernel_do(G_gn[i], G_gn[j]) return i, j, self._kernel_do(G_gn[i], G_gn[j])


def _wrapper_untotter(self, i): def _wrapper_untotter(self, i):
return i, untotterTransformation(self._graphs[i], self._node_labels, self._edge_labels) # @todo: this may not work. return i, untotterTransformation(self._graphs[i], self._node_labels, self._edge_labels) # @todo: this may not work.
def _add_dummy_labels(self, Gn): def _add_dummy_labels(self, Gn):
if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY): if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY):
for i in range(len(Gn)): for i in range(len(Gn)):


+ 135
- 145
gklearn/kernels/path_up_to_h.py View File

@@ -5,15 +5,15 @@ Created on Fri Apr 10 18:33:13 2020


@author: ljia @author: ljia


@references:
@references:


[1] Liva Ralaivola, Sanjay J Swamidass, Hiroto Saigo, and Pierre
Baldi. Graph kernels for chemical informatics. Neural networks,
[1] Liva Ralaivola, Sanjay J Swamidass, Hiroto Saigo, and Pierre
Baldi. Graph kernels for chemical informatics. Neural networks,
18(8):1093–1110, 2005. 18(8):1093–1110, 2005.
""" """
import sys import sys
from multiprocessing import Pool from multiprocessing import Pool
from tqdm import tqdm
from gklearn.utils import get_iters
import numpy as np import numpy as np
import networkx as nx import networkx as nx
from collections import Counter from collections import Counter
@@ -25,7 +25,7 @@ from gklearn.utils import Trie




class PathUpToH(GraphKernel): # @todo: add function for k_func is None class PathUpToH(GraphKernel): # @todo: add function for k_func is None
def __init__(self, **kwargs): def __init__(self, **kwargs):
GraphKernel.__init__(self) GraphKernel.__init__(self)
self._node_labels = kwargs.get('node_labels', []) self._node_labels = kwargs.get('node_labels', [])
@@ -38,16 +38,14 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None


def _compute_gm_series(self): def _compute_gm_series(self):
self._add_dummy_labels(self._graphs) self._add_dummy_labels(self._graphs)
from itertools import combinations_with_replacement from itertools import combinations_with_replacement
itr_kernel = combinations_with_replacement(range(0, len(self._graphs)), 2)
if self._verbose >= 2:
iterator_ps = tqdm(range(0, len(self._graphs)), desc='getting paths', file=sys.stdout)
iterator_kernel = tqdm(itr_kernel, desc='Computing kernels', file=sys.stdout)
else:
iterator_ps = range(0, len(self._graphs))
iterator_kernel = itr_kernel
itr_kernel = combinations_with_replacement(range(0, len(self._graphs)), 2)
iterator_ps = get_iters(range(0, len(self._graphs)), desc='getting paths', file=sys.stdout, length=len(self._graphs), verbose=(self._verbose >= 2))
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
iterator_kernel = get_iters(itr_kernel, desc='Computing kernels',
file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2))

gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))


if self._compute_method == 'trie': if self._compute_method == 'trie':
@@ -62,13 +60,13 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
kernel = self._kernel_do_naive(all_paths[i], all_paths[j]) kernel = self._kernel_do_naive(all_paths[i], all_paths[j])
gram_matrix[i][j] = kernel gram_matrix[i][j] = kernel
gram_matrix[j][i] = kernel gram_matrix[j][i] = kernel
return gram_matrix return gram_matrix
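
The len_itr passed to get_iters above is the number of unordered pairs with repetition, i.e. one entry per Gram-matrix element on or above the diagonal; a quick check:

from itertools import combinations_with_replacement

n = 4
pairs = list(combinations_with_replacement(range(n), 2))
assert len(pairs) == n * (n + 1) // 2  # 10 pairs for 4 graphs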
def _compute_gm_imap_unordered(self): def _compute_gm_imap_unordered(self):
self._add_dummy_labels(self._graphs) self._add_dummy_labels(self._graphs)
# get all paths of all graphs before computing kernels to save time, # get all paths of all graphs before computing kernels to save time,
# but this may cost a lot of memory for large datasets. # but this may cost a lot of memory for large datasets.
pool = Pool(self._n_jobs) pool = Pool(self._n_jobs)
@@ -80,23 +78,21 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
all_paths = [[] for _ in range(len(self._graphs))] all_paths = [[] for _ in range(len(self._graphs))]
if self._compute_method == 'trie' and self._k_func is not None: if self._compute_method == 'trie' and self._k_func is not None:
get_ps_fun = self._wrapper_find_all_path_as_trie get_ps_fun = self._wrapper_find_all_path_as_trie
elif self._compute_method != 'trie' and self._k_func is not None:
get_ps_fun = partial(self._wrapper_find_all_paths_until_length, True)
else:
get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False)
if self._verbose >= 2:
iterator = tqdm(pool.imap_unordered(get_ps_fun, itr, chunksize),
desc='getting paths', file=sys.stdout)
elif self._compute_method != 'trie' and self._k_func is not None:
get_ps_fun = partial(self._wrapper_find_all_paths_until_length, True)
else: else:
iterator = pool.imap_unordered(get_ps_fun, itr, chunksize)
get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False)
iterator = get_iters(pool.imap_unordered(get_ps_fun, itr, chunksize),
desc='getting paths', file=sys.stdout,
length=len(self._graphs), verbose=(self._verbose >= 2))
for i, ps in iterator: for i, ps in iterator:
all_paths[i] = ps all_paths[i] = ps
pool.close() pool.close()
pool.join() pool.join()
# compute Gram matrix. # compute Gram matrix.
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
if self._compute_method == 'trie' and self._k_func is not None: if self._compute_method == 'trie' and self._k_func is not None:
def init_worker(trie_toshare): def init_worker(trie_toshare):
global G_trie global G_trie
@@ -106,28 +102,24 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
def init_worker(plist_toshare): def init_worker(plist_toshare):
global G_plist global G_plist
G_plist = plist_toshare G_plist = plist_toshare
do_fun = self._wrapper_kernel_do_naive
do_fun = self._wrapper_kernel_do_naive
else: else:
def init_worker(plist_toshare): def init_worker(plist_toshare):
global G_plist global G_plist
G_plist = plist_toshare G_plist = plist_toshare
do_fun = self._wrapper_kernel_do_kernelless # @todo: what is this?
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
glbv=(all_paths,), n_jobs=self._n_jobs, verbose=self._verbose)
do_fun = self._wrapper_kernel_do_kernelless # @todo: what is this?
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
glbv=(all_paths,), n_jobs=self._n_jobs, verbose=self._verbose)
return gram_matrix return gram_matrix
def _compute_kernel_list_series(self, g1, g_list): def _compute_kernel_list_series(self, g1, g_list):
self._add_dummy_labels(g_list + [g1]) self._add_dummy_labels(g_list + [g1])
if self._verbose >= 2:
iterator_ps = tqdm(g_list, desc='getting paths', file=sys.stdout)
iterator_kernel = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout)
else:
iterator_ps = g_list
iterator_kernel = range(len(g_list))

iterator_ps = get_iters(g_list, desc='getting paths', file=sys.stdout, verbose=(self._verbose >= 2))
iterator_kernel = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))

kernel_list = [None] * len(g_list) kernel_list = [None] * len(g_list)


if self._compute_method == 'trie': if self._compute_method == 'trie':
@@ -142,13 +134,13 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
for i in iterator_kernel: for i in iterator_kernel:
kernel = self._kernel_do_naive(paths_g1, paths_g_list[i]) kernel = self._kernel_do_naive(paths_g1, paths_g_list[i])
kernel_list[i] = kernel kernel_list[i] = kernel
return kernel_list return kernel_list
def _compute_kernel_list_imap_unordered(self, g1, g_list): def _compute_kernel_list_imap_unordered(self, g1, g_list):
self._add_dummy_labels(g_list + [g1]) self._add_dummy_labels(g_list + [g1])
# get all paths of all graphs before computing kernels to save time, # get all paths of all graphs before computing kernels to save time,
# but this may cost a lot of memory for large datasets. # but this may cost a lot of memory for large datasets.
pool = Pool(self._n_jobs) pool = Pool(self._n_jobs)
@@ -162,48 +154,46 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
paths_g1 = self._find_all_path_as_trie(g1) paths_g1 = self._find_all_path_as_trie(g1)
get_ps_fun = self._wrapper_find_all_path_as_trie get_ps_fun = self._wrapper_find_all_path_as_trie
elif self._compute_method != 'trie' and self._k_func is not None: elif self._compute_method != 'trie' and self._k_func is not None:
paths_g1 = self._find_all_paths_until_length(g1)
get_ps_fun = partial(self._wrapper_find_all_paths_until_length, True)
paths_g1 = self._find_all_paths_until_length(g1)
get_ps_fun = partial(self._wrapper_find_all_paths_until_length, True)
else: else:
paths_g1 = self._find_all_paths_until_length(g1)
paths_g1 = self._find_all_paths_until_length(g1)
get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False) get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False)
if self._verbose >= 2:
iterator = tqdm(pool.imap_unordered(get_ps_fun, itr, chunksize),
desc='getting paths', file=sys.stdout)
else:
iterator = pool.imap_unordered(get_ps_fun, itr, chunksize)
iterator = get_iters(pool.imap_unordered(get_ps_fun, itr, chunksize),
desc='getting paths', file=sys.stdout,
length=len(g_list), verbose=(self._verbose >= 2))
for i, ps in iterator: for i, ps in iterator:
paths_g_list[i] = ps paths_g_list[i] = ps
pool.close() pool.close()
pool.join() pool.join()
# compute kernel list. # compute kernel list.
kernel_list = [None] * len(g_list) kernel_list = [None] * len(g_list)
def init_worker(p1_toshare, plist_toshare): def init_worker(p1_toshare, plist_toshare):
global G_p1, G_plist global G_p1, G_plist
G_p1 = p1_toshare G_p1 = p1_toshare
G_plist = plist_toshare G_plist = plist_toshare
do_fun = self._wrapper_kernel_list_do do_fun = self._wrapper_kernel_list_do
def func_assign(result, var_to_assign):
def func_assign(result, var_to_assign):
var_to_assign[result[0]] = result[1] var_to_assign[result[0]] = result[1]
itr = range(len(g_list)) itr = range(len(g_list))
len_itr = len(g_list) len_itr = len(g_list)
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
init_worker=init_worker, glbv=(paths_g1, paths_g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) init_worker=init_worker, glbv=(paths_g1, paths_g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)
return kernel_list return kernel_list
def _wrapper_kernel_list_do(self, itr): def _wrapper_kernel_list_do(self, itr):
if self._compute_method == 'trie' and self._k_func is not None: if self._compute_method == 'trie' and self._k_func is not None:
return itr, self._kernel_do_trie(G_p1, G_plist[itr]) return itr, self._kernel_do_trie(G_p1, G_plist[itr])
elif self._compute_method != 'trie' and self._k_func is not None: elif self._compute_method != 'trie' and self._k_func is not None:
return itr, self._kernel_do_naive(G_p1, G_plist[itr])
return itr, self._kernel_do_naive(G_p1, G_plist[itr])
else: else:
return itr, self._kernel_do_kernelless(G_p1, G_plist[itr]) return itr, self._kernel_do_kernelless(G_p1, G_plist[itr])
def _compute_single_kernel_series(self, g1, g2): def _compute_single_kernel_series(self, g1, g2):
self._add_dummy_labels([g1] + [g2]) self._add_dummy_labels([g1] + [g2])
if self._compute_method == 'trie': if self._compute_method == 'trie':
@@ -214,32 +204,32 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
paths_g1 = self._find_all_paths_until_length(g1) paths_g1 = self._find_all_paths_until_length(g1)
paths_g2 = self._find_all_paths_until_length(g2) paths_g2 = self._find_all_paths_until_length(g2)
kernel = self._kernel_do_naive(paths_g1, paths_g2) kernel = self._kernel_do_naive(paths_g1, paths_g2)
return kernel
return kernel



def _kernel_do_trie(self, trie1, trie2): def _kernel_do_trie(self, trie1, trie2):
"""Compute path graph kernels up to depth d between 2 graphs using trie. """Compute path graph kernels up to depth d between 2 graphs using trie.
Parameters Parameters
---------- ----------
trie1, trie2 : list trie1, trie2 : list
Tries that contains all paths in 2 graphs. Tries that contains all paths in 2 graphs.
k_func : function k_func : function
A kernel function applied using different notions of fingerprint
A kernel function applied using different notions of fingerprint
similarity. similarity.
Return Return
------ ------
kernel : float kernel : float
Path kernel up to h between 2 graphs. Path kernel up to h between 2 graphs.
""" """
if self._k_func == 'tanimoto':
# traverse all paths in graph1 and search them in graph2. Depth-first
if self._k_func == 'tanimoto':
# traverse all paths in graph1 and search them in graph2. Depth-first
# search is applied. # search is applied.
def traverseTrie1t(root, trie2, setlist, pcurrent=[]):
def traverseTrie1t(root, trie2, setlist, pcurrent=[]): # @todo: no need to use value (# of occurrence of paths) in this case.
for key, node in root['children'].items(): for key, node in root['children'].items():
pcurrent.append(key) pcurrent.append(key)
if node['isEndOfWord']:
if node['isEndOfWord']:
setlist[1] += 1 setlist[1] += 1
count2 = trie2.searchWord(pcurrent) count2 = trie2.searchWord(pcurrent)
if count2 != 0: if count2 != 0:
@@ -250,17 +240,17 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
del pcurrent[-1] del pcurrent[-1]
if pcurrent != []: if pcurrent != []:
del pcurrent[-1] del pcurrent[-1]
# traverse all paths in graph2 and find out those that are not in
# graph1. Depth-first search is applied.
# traverse all paths in graph2 and find out those that are not in
# graph1. Depth-first search is applied.
def traverseTrie2t(root, trie1, setlist, pcurrent=[]): def traverseTrie2t(root, trie1, setlist, pcurrent=[]):
for key, node in root['children'].items(): for key, node in root['children'].items():
pcurrent.append(key) pcurrent.append(key)
if node['isEndOfWord']: if node['isEndOfWord']:
# print(node['count']) # print(node['count'])
count1 = trie1.searchWord(pcurrent) count1 = trie1.searchWord(pcurrent)
if count1 == 0:
if count1 == 0:
setlist[1] += 1 setlist[1] += 1
if node['children'] != {}: if node['children'] != {}:
traverseTrie2t(node, trie1, setlist, pcurrent) traverseTrie2t(node, trie1, setlist, pcurrent)
@@ -268,7 +258,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
del pcurrent[-1] del pcurrent[-1]
if pcurrent != []: if pcurrent != []:
del pcurrent[-1] del pcurrent[-1]
setlist = [0, 0] # intersection and union of path sets of g1, g2. setlist = [0, 0] # intersection and union of path sets of g1, g2.
# print(trie1.root) # print(trie1.root)
# print(trie2.root) # print(trie2.root)
@@ -277,9 +267,9 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
traverseTrie2t(trie2.root, trie1, setlist) traverseTrie2t(trie2.root, trie1, setlist)
# print(setlist) # print(setlist)
kernel = setlist[0] / setlist[1] kernel = setlist[0] / setlist[1]
elif self._k_func == 'MinMax': # MinMax kernel
# traverse all paths in graph1 and search them in graph2. Depth-first
elif self._k_func == 'MinMax': # MinMax kernel
# traverse all paths in graph1 and search them in graph2. Depth-first
# search is applied. # search is applied.
def traverseTrie1m(root, trie2, sumlist, pcurrent=[]): def traverseTrie1m(root, trie2, sumlist, pcurrent=[]):
for key, node in root['children'].items(): for key, node in root['children'].items():
@@ -296,16 +286,16 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
del pcurrent[-1] del pcurrent[-1]
if pcurrent != []: if pcurrent != []:
del pcurrent[-1] del pcurrent[-1]
# traverse all paths in graph2 and find out those that are not in
# graph1. Depth-first search is applied.
# traverse all paths in graph2 and find out those that are not in
# graph1. Depth-first search is applied.
def traverseTrie2m(root, trie1, sumlist, pcurrent=[]): def traverseTrie2m(root, trie1, sumlist, pcurrent=[]):
for key, node in root['children'].items(): for key, node in root['children'].items():
pcurrent.append(key) pcurrent.append(key)
if node['isEndOfWord']:
if node['isEndOfWord']:
# print(node['count']) # print(node['count'])
count1 = trie1.searchWord(pcurrent) count1 = trie1.searchWord(pcurrent)
if count1 == 0:
if count1 == 0:
sumlist[1] += node['count'] sumlist[1] += node['count']
if node['children'] != {}: if node['children'] != {}:
traverseTrie2m(node, trie1, sumlist, pcurrent) traverseTrie2m(node, trie1, sumlist, pcurrent)
@@ -313,7 +303,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
del pcurrent[-1] del pcurrent[-1]
if pcurrent != []: if pcurrent != []:
del pcurrent[-1] del pcurrent[-1]
sumlist = [0, 0] # sum of mins and sum of maxs sumlist = [0, 0] # sum of mins and sum of maxs
# print(trie1.root) # print(trie1.root)
# print(trie2.root) # print(trie2.root)
@@ -324,37 +314,37 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
kernel = sumlist[0] / sumlist[1] kernel = sumlist[0] / sumlist[1]
else: else:
raise Exception('The given "k_func" cannot be recognized. Possible choices include: "tanimoto", "MinMax".') raise Exception('The given "k_func" cannot be recognized. Possible choices include: "tanimoto", "MinMax".')
return kernel return kernel
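
The two traversals above accumulate the sizes of the intersection and the union of the two path sets without materializing them. On explicit sets the Tanimoto kernel is simply the ratio below (label sequences made up for illustration):

s1 = {('C', 'b', 'O'), ('O', 'b', 'C'), ('C',), ('O',)}
s2 = {('C', 'b', 'O'), ('N',), ('C',)}
tanimoto = len(s1 & s2) / len(s1 | s2)  # 2 / 5 = 0.4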
def _wrapper_kernel_do_trie(self, itr): def _wrapper_kernel_do_trie(self, itr):
i = itr[0] i = itr[0]
j = itr[1] j = itr[1]
return i, j, self._kernel_do_trie(G_trie[i], G_trie[j]) return i, j, self._kernel_do_trie(G_trie[i], G_trie[j])
def _kernel_do_naive(self, paths1, paths2): def _kernel_do_naive(self, paths1, paths2):
"""Compute path graph kernels up to depth d between 2 graphs naively. """Compute path graph kernels up to depth d between 2 graphs naively.
Parameters Parameters
---------- ----------
paths_list : list of list paths_list : list of list
List of list of paths in all graphs, where for unlabeled graphs, each
path is represented by a list of nodes; while for labeled graphs, each
path is represented by a string consisting of labels of nodes and/or
List of list of paths in all graphs, where for unlabeled graphs, each
path is represented by a list of nodes; while for labeled graphs, each
path is represented by a string consisting of labels of nodes and/or
edges on that path. edges on that path.
k_func : function k_func : function
A kernel function applied using different notions of fingerprint
A kernel function applied using different notions of fingerprint
similarity. similarity.
Return Return
------ ------
kernel : float kernel : float
Path kernel up to h between 2 graphs. Path kernel up to h between 2 graphs.
""" """
all_paths = list(set(paths1 + paths2)) all_paths = list(set(paths1 + paths2))
if self._k_func == 'tanimoto': if self._k_func == 'tanimoto':
length_union = len(set(paths1 + paths2)) length_union = len(set(paths1 + paths2))
kernel = (len(set(paths1)) + len(set(paths2)) - kernel = (len(set(paths1)) + len(set(paths2)) -
@@ -363,7 +353,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
# vector2 = [(1 if path in paths2 else 0) for path in all_paths] # vector2 = [(1 if path in paths2 else 0) for path in all_paths]
# kernel_uv = np.dot(vector1, vector2) # kernel_uv = np.dot(vector1, vector2)
# kernel = kernel_uv / (len(set(paths1)) + len(set(paths2)) - kernel_uv) # kernel = kernel_uv / (len(set(paths1)) + len(set(paths2)) - kernel_uv)
elif self._k_func == 'MinMax': # MinMax kernel elif self._k_func == 'MinMax': # MinMax kernel
path_count1 = Counter(paths1) path_count1 = Counter(paths1)
path_count2 = Counter(paths2) path_count2 = Counter(paths2)
@@ -373,7 +363,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
for key in all_paths] for key in all_paths]
kernel = np.sum(np.minimum(vector1, vector2)) / \ kernel = np.sum(np.minimum(vector1, vector2)) / \
np.sum(np.maximum(vector1, vector2)) np.sum(np.maximum(vector1, vector2))
elif self._k_func is None: # no sub-kernel used; compare paths directly. elif self._k_func is None: # no sub-kernel used; compare paths directly.
path_count1 = Counter(paths1) path_count1 = Counter(paths1)
path_count2 = Counter(paths2) path_count2 = Counter(paths2)
@@ -382,27 +372,27 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
vector2 = [(path_count2[key] if (key in path_count2.keys()) else 0) vector2 = [(path_count2[key] if (key in path_count2.keys()) else 0)
for key in all_paths] for key in all_paths]
kernel = np.dot(vector1, vector2) kernel = np.dot(vector1, vector2)
else: else:
raise Exception('The given "k_func" cannot be recognized. Possible choices include: "tanimoto", "MinMax" and None.') raise Exception('The given "k_func" cannot be recognized. Possible choices include: "tanimoto", "MinMax" and None.')
return kernel return kernel
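
For the MinMax case the computation above reduces to sums of elementwise minima and maxima of the path counts; a small sketch on made-up path multisets:

from collections import Counter
import numpy as np

paths1 = [('C',), ('C',), ('C', 'b', 'O')]
paths2 = [('C',), ('C', 'b', 'O'), ('C', 'b', 'O'), ('N',)]
c1, c2 = Counter(paths1), Counter(paths2)
keys = list(set(c1) | set(c2))
v1 = np.array([c1.get(k, 0) for k in keys])
v2 = np.array([c2.get(k, 0) for k in keys])
minmax = np.minimum(v1, v2).sum() / np.maximum(v1, v2).sum()  # 2 / 5 = 0.4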
def _wrapper_kernel_do_naive(self, itr): def _wrapper_kernel_do_naive(self, itr):
i = itr[0] i = itr[0]
j = itr[1] j = itr[1]
return i, j, self._kernel_do_naive(G_plist[i], G_plist[j]) return i, j, self._kernel_do_naive(G_plist[i], G_plist[j])
def _find_all_path_as_trie(self, G): def _find_all_path_as_trie(self, G):
# all_path = find_all_paths_until_length(G, length, ds_attrs,
# all_path = find_all_paths_until_length(G, length, ds_attrs,
# node_label=node_label, # node_label=node_label,
# edge_label=edge_label) # edge_label=edge_label)
# ptrie = Trie() # ptrie = Trie()
# for path in all_path: # for path in all_path:
# ptrie.insertWord(path) # ptrie.insertWord(path)
# ptrie = Trie() # ptrie = Trie()
# path_l = [[n] for n in G.nodes] # paths of length l # path_l = [[n] for n in G.nodes] # paths of length l
# path_l_str = paths2labelseqs(path_l, G, ds_attrs, node_label, edge_label) # path_l_str = paths2labelseqs(path_l, G, ds_attrs, node_label, edge_label)
@@ -421,15 +411,15 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
# path_l_str = paths2labelseqs(path_l, G, ds_attrs, node_label, edge_label) # path_l_str = paths2labelseqs(path_l, G, ds_attrs, node_label, edge_label)
# for p in path_l_str: # for p in path_l_str:
# ptrie.insertWord(p) # ptrie.insertWord(p)
#
#
# print(time.time() - time1) # print(time.time() - time1)
# print(ptrie.root) # print(ptrie.root)
# print() # print()
# traverse all paths up to length h in a graph and construct a trie with
# them. Depth-first search is applied. Notice the reverse of each path is
# also stored in the trie.
# traverse all paths up to length h in a graph and construct a trie with
# them. Depth-first search is applied. Notice the reverse of each path is
# also stored in the trie.
def traverseGraph(root, ptrie, G, pcurrent=[]): def traverseGraph(root, ptrie, G, pcurrent=[]):
if len(pcurrent) < self._depth + 1: if len(pcurrent) < self._depth + 1:
for neighbor in G[root]: for neighbor in G[root]:
@@ -439,8 +429,8 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
ptrie.insertWord(plstr[0]) ptrie.insertWord(plstr[0])
traverseGraph(neighbor, ptrie, G, pcurrent) traverseGraph(neighbor, ptrie, G, pcurrent)
del pcurrent[-1] del pcurrent[-1]
ptrie = Trie() ptrie = Trie()
path_l = [[n] for n in G.nodes] # paths of length l path_l = [[n] for n in G.nodes] # paths of length l
path_l_str = self._paths2labelseqs(path_l, G) path_l_str = self._paths2labelseqs(path_l, G)
@@ -448,18 +438,18 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
ptrie.insertWord(p) ptrie.insertWord(p)
for n in G.nodes: for n in G.nodes:
traverseGraph(n, ptrie, G, pcurrent=[n]) traverseGraph(n, ptrie, G, pcurrent=[n])
# def traverseGraph(root, all_paths, length, G, ds_attrs, node_label, edge_label, # def traverseGraph(root, all_paths, length, G, ds_attrs, node_label, edge_label,
# pcurrent=[]): # pcurrent=[]):
# if len(pcurrent) < length + 1: # if len(pcurrent) < length + 1:
# for neighbor in G[root]: # for neighbor in G[root]:
# if neighbor not in pcurrent: # if neighbor not in pcurrent:
# pcurrent.append(neighbor) # pcurrent.append(neighbor)
# plstr = paths2labelseqs([pcurrent], G, ds_attrs,
# plstr = paths2labelseqs([pcurrent], G, ds_attrs,
# node_label, edge_label) # node_label, edge_label)
# all_paths.append(pcurrent[:]) # all_paths.append(pcurrent[:])
# traverseGraph(neighbor, all_paths, length, G, ds_attrs,
# traverseGraph(neighbor, all_paths, length, G, ds_attrs,
# node_label, edge_label, pcurrent) # node_label, edge_label, pcurrent)
# del pcurrent[-1] # del pcurrent[-1]
# #
@@ -470,24 +460,24 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
## for p in path_l_str: ## for p in path_l_str:
## ptrie.insertWord(p) ## ptrie.insertWord(p)
# for n in G.nodes: # for n in G.nodes:
# traverseGraph(n, all_paths, length, G, ds_attrs, node_label, edge_label,
# traverseGraph(n, all_paths, length, G, ds_attrs, node_label, edge_label,
# pcurrent=[n]) # pcurrent=[n])
# print(ptrie.root) # print(ptrie.root)
return ptrie return ptrie
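
The trie interface used above (a root dict with 'children', 'isEndOfWord' and 'count', plus insertWord and searchWord) is gklearn.utils.Trie. A minimal compatible stand-in, inferred from the traversal code in this file and not the library's actual implementation:

class MiniTrie:
    def __init__(self):
        self.root = {'children': {}, 'isEndOfWord': False, 'count': 0}

    def insertWord(self, word):
        node = self.root
        for token in word:
            node = node['children'].setdefault(
                token, {'children': {}, 'isEndOfWord': False, 'count': 0})
        node['isEndOfWord'] = True
        node['count'] += 1  # number of occurrences of this path

    def searchWord(self, word):
        node = self.root
        for token in word:
            if token not in node['children']:
                return 0
            node = node['children'][token]
        return node['count'] if node['isEndOfWord'] else 0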
def _wrapper_find_all_path_as_trie(self, itr_item): def _wrapper_find_all_path_as_trie(self, itr_item):
g = itr_item[0] g = itr_item[0]
i = itr_item[1] i = itr_item[1]
return i, self._find_all_path_as_trie(g) return i, self._find_all_path_as_trie(g)
# @todo: (can be removed maybe) this method finds paths repetitively; it could be faster. # @todo: (can be removed maybe) this method finds paths repetitively; it could be faster.
def _find_all_paths_until_length(self, G, tolabelseqs=True): def _find_all_paths_until_length(self, G, tolabelseqs=True):
"""Find all paths no longer than a certain maximum length in a graph. A
"""Find all paths no longer than a certain maximum length in a graph. A
recursive depth-first search is applied. recursive depth-first search is applied.
Parameters Parameters
---------- ----------
G : NetworkX graphs G : NetworkX graphs
@@ -500,13 +490,13 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
Node attribute used as label. The default node label is atom. Node attribute used as label. The default node label is atom.
edge_label : string edge_label : string
Edge attribute used as label. The default edge label is bond_type. Edge attribute used as label. The default edge label is bond_type.
Return Return
------ ------
path : list path : list
List of paths retrieved, where for unlabeled graphs, each path is
represented by a list of nodes; while for labeled graphs, each path is
represented by a list of strings consisting of labels of nodes and/or
List of paths retrieved, where for unlabeled graphs, each path is
represented by a list of nodes; while for labeled graphs, each path is
represented by a list of strings consisting of labels of nodes and/or
edges on that path. edges on that path.
""" """
# path_l = [tuple([n]) for n in G.nodes] # paths of length l # path_l = [tuple([n]) for n in G.nodes] # paths of length l
@@ -519,10 +509,10 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
# tmp = path + (neighbor, ) # tmp = path + (neighbor, )
# if tuple(tmp[::-1]) not in path_l_new: # if tuple(tmp[::-1]) not in path_l_new:
# path_l_new.append(tuple(tmp)) # path_l_new.append(tuple(tmp))
# all_paths += path_l_new # all_paths += path_l_new
# path_l = path_l_new[:] # path_l = path_l_new[:]
path_l = [[n] for n in G.nodes] # paths of length l path_l = [[n] for n in G.nodes] # paths of length l
all_paths = [p.copy() for p in path_l] all_paths = [p.copy() for p in path_l]
for l in range(1, self._depth + 1): for l in range(1, self._depth + 1):
@@ -533,28 +523,28 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
tmp = path + [neighbor] tmp = path + [neighbor]
# if tmp[::-1] not in path_lplus1: # if tmp[::-1] not in path_lplus1:
path_lplus1.append(tmp) path_lplus1.append(tmp)
all_paths += path_lplus1 all_paths += path_lplus1
path_l = [p.copy() for p in path_lplus1] path_l = [p.copy() for p in path_lplus1]
# for i in range(0, self._depth + 1): # for i in range(0, self._depth + 1):
# new_paths = find_all_paths(G, i) # new_paths = find_all_paths(G, i)
# if new_paths == []: # if new_paths == []:
# break # break
# all_paths.extend(new_paths) # all_paths.extend(new_paths)
# consider labels # consider labels
# print(paths2labelseqs(all_paths, G, ds_attrs, node_label, edge_label)) # print(paths2labelseqs(all_paths, G, ds_attrs, node_label, edge_label))
# print() # print()
return (self._paths2labelseqs(all_paths, G) if tolabelseqs else all_paths) return (self._paths2labelseqs(all_paths, G) if tolabelseqs else all_paths)
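
Each pass of the loop extends every length-l path by one not-yet-visited neighbor, so reverses of paths are kept (the commented-out check above would have removed them). On a triangle with depth 2 this yields 15 paths, as the sketch below verifies:

import networkx as nx

G = nx.cycle_graph(3)  # a triangle
depth = 2
path_l = [[n] for n in G.nodes]
all_paths = [p.copy() for p in path_l]
for _ in range(depth):
    path_lplus1 = []
    for path in path_l:
        for neighbor in G[path[-1]]:
            if neighbor not in path:  # simple paths only
                path_lplus1.append(path + [neighbor])
    all_paths += path_lplus1
    path_l = path_lplus1
print(len(all_paths))  # 15: 3 single nodes + 6 length-1 + 6 length-2 (reverses kept)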
def _wrapper_find_all_paths_until_length(self, tolabelseqs, itr_item): def _wrapper_find_all_paths_until_length(self, tolabelseqs, itr_item):
g = itr_item[0] g = itr_item[0]
i = itr_item[1] i = itr_item[1]
return i, self._find_all_paths_until_length(g, tolabelseqs=tolabelseqs) return i, self._find_all_paths_until_length(g, tolabelseqs=tolabelseqs)
def _paths2labelseqs(self, plist, G): def _paths2labelseqs(self, plist, G):
if len(self._node_labels) > 0: if len(self._node_labels) > 0:
if len(self._edge_labels) > 0: if len(self._edge_labels) > 0:
@@ -589,8 +579,8 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
else: else:
return [tuple(['0' for node in path]) for path in plist] return [tuple(['0' for node in path]) for path in plist]
# return [tuple([len(path)]) for path in all_paths] # return [tuple([len(path)]) for path in all_paths]
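
A label sequence interleaves node and edge labels along a path; for unlabeled graphs the method above falls back to a constant '0' per node. An illustrative helper for the single-label case (names are assumptions, not the library's code):

def path_to_labelseq(G, path, node_label='atom', edge_label='bond_type'):
    # e.g. [0, 1, 2] with atoms C, O, N and single bonds -> ('C', '1', 'O', '1', 'N')
    seq = [G.nodes[path[0]][node_label]]
    for u, v in zip(path[:-1], path[1:]):
        seq.append(G[u][v][edge_label])
        seq.append(G.nodes[v][node_label])
    return tuple(seq)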
def _add_dummy_labels(self, Gn): def _add_dummy_labels(self, Gn):
if self._k_func is not None: if self._k_func is not None:
if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY): if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY):


+ 13
- 27
gklearn/kernels/shortest_path.py View File

@@ -15,7 +15,7 @@ import sys
from itertools import product from itertools import product
# from functools import partial # from functools import partial
from multiprocessing import Pool from multiprocessing import Pool
from tqdm import tqdm
from gklearn.utils import get_iters
import numpy as np import numpy as np
import networkx as nx import networkx as nx
from gklearn.utils.parallel import parallel_gm, parallel_me from gklearn.utils.parallel import parallel_gm, parallel_me
@@ -38,10 +38,7 @@ class ShortestPath(GraphKernel):
def _compute_gm_series(self): def _compute_gm_series(self):
self._all_graphs_have_edges(self._graphs) self._all_graphs_have_edges(self._graphs)
# get shortest path graph of each graph. # get shortest path graph of each graph.
if self._verbose >= 2:
iterator = tqdm(self._graphs, desc='getting sp graphs', file=sys.stdout)
else:
iterator = self._graphs
iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2))
self._graphs = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator] self._graphs = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator]


# compute Gram matrix. # compute Gram matrix.
@@ -49,10 +46,9 @@ class ShortestPath(GraphKernel):


from itertools import combinations_with_replacement from itertools import combinations_with_replacement
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
if self._verbose >= 2:
iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout)
else:
iterator = itr
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
iterator = get_iters(itr, desc='Computing kernels',
length=len_itr, file=sys.stdout, verbose=(self._verbose >= 2))
for i, j in iterator: for i, j in iterator:
kernel = self._sp_do(self._graphs[i], self._graphs[j]) kernel = self._sp_do(self._graphs[i], self._graphs[j])
gram_matrix[i][j] = kernel gram_matrix[i][j] = kernel
@@ -71,11 +67,9 @@ class ShortestPath(GraphKernel):
chunksize = int(len(self._graphs) / self._n_jobs) + 1 chunksize = int(len(self._graphs) / self._n_jobs) + 1
else: else:
chunksize = 100 chunksize = 100
if self._verbose >= 2:
iterator = tqdm(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize),
desc='getting sp graphs', file=sys.stdout)
else:
iterator = pool.imap_unordered(get_sp_graphs_fun, itr, chunksize)
iterator = get_iters(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize),
desc='getting sp graphs', file=sys.stdout,
length=len(self._graphs), verbose=(self._verbose >= 2))
for i, g in iterator: for i, g in iterator:
self._graphs[i] = g self._graphs[i] = g
pool.close() pool.close()
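
The same producer/consumer shape recurs in every *_imap_unordered method of this commit: workers receive (item, index) pairs and return (index, result), get_iters merely decorates the result stream, and results are written back by index since arrival order is arbitrary. Schematically (names are illustrative; the worker must be picklable, e.g. a module-level function or bound method):

import sys
from multiprocessing import Pool
from gklearn.utils import get_iters

def collect_unordered(items, worker_fun, n_jobs, verbose):
    # chunksize heuristic mirrored from the methods above.
    if len(items) < 100 * n_jobs:
        chunksize = int(len(items) / n_jobs) + 1
    else:
        chunksize = 100
    results = [None] * len(items)
    itr = zip(items, range(len(items)))
    pool = Pool(n_jobs)
    iterator = get_iters(pool.imap_unordered(worker_fun, itr, chunksize),
                         desc='processing', file=sys.stdout,
                         length=len(items), verbose=(verbose >= 2))
    for i, res in iterator:  # index restores the original order
        results[i] = res
    pool.close()
    pool.join()
    return results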
@@ -98,18 +92,12 @@ class ShortestPath(GraphKernel):
self._all_graphs_have_edges([g1] + g_list) self._all_graphs_have_edges([g1] + g_list)
# get shortest path graphs of g1 and each graph in g_list. # get shortest path graphs of g1 and each graph in g_list.
g1 = getSPGraph(g1, edge_weight=self._edge_weight) g1 = getSPGraph(g1, edge_weight=self._edge_weight)
if self._verbose >= 2:
iterator = tqdm(g_list, desc='getting sp graphs', file=sys.stdout)
else:
iterator = g_list
iterator = get_iters(g_list, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2))
g_list = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator] g_list = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator]


# compute kernel list. # compute kernel list.
kernel_list = [None] * len(g_list) kernel_list = [None] * len(g_list)
if self._verbose >= 2:
iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout)
else:
iterator = range(len(g_list))
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))
for i in iterator: for i in iterator:
kernel = self._sp_do(g1, g_list[i]) kernel = self._sp_do(g1, g_list[i])
kernel_list[i] = kernel kernel_list[i] = kernel
@@ -128,11 +116,9 @@ class ShortestPath(GraphKernel):
chunksize = int(len(g_list) / self._n_jobs) + 1 chunksize = int(len(g_list) / self._n_jobs) + 1
else: else:
chunksize = 100 chunksize = 100
if self._verbose >= 2:
iterator = tqdm(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize),
desc='getting sp graphs', file=sys.stdout)
else:
iterator = pool.imap_unordered(get_sp_graphs_fun, itr, chunksize)
iterator = get_iters(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize),
desc='getting sp graphs', file=sys.stdout,
length=len(g_list), verbose=(self._verbose >= 2))
for i, g in iterator: for i, g in iterator:
g_list[i] = g g_list[i] = g
pool.close() pool.close()


+ 52
- 66
gklearn/kernels/spectral_decomposition.py View File

@@ -5,13 +5,13 @@ Created on Thu Aug 20 16:12:45 2020


@author: ljia @author: ljia


@references:
@references:


[1] S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, 11(Apr):1201–1242, 2010. [1] S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, 11(Apr):1201–1242, 2010.
""" """


import sys import sys
from tqdm import tqdm
from gklearn.utils import get_iters
import numpy as np import numpy as np
import networkx as nx import networkx as nx
from scipy.sparse import kron from scipy.sparse import kron
@@ -20,12 +20,12 @@ from gklearn.kernels import RandomWalkMeta




class SpectralDecomposition(RandomWalkMeta): class SpectralDecomposition(RandomWalkMeta):
def __init__(self, **kwargs): def __init__(self, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self._sub_kernel = kwargs.get('sub_kernel', None) self._sub_kernel = kwargs.get('sub_kernel', None)


def _compute_gm_series(self): def _compute_gm_series(self):
self._check_edge_weight(self._graphs, self._verbose) self._check_edge_weight(self._graphs, self._verbose)
@@ -33,18 +33,15 @@ class SpectralDecomposition(RandomWalkMeta):
if self._verbose >= 2: if self._verbose >= 2:
import warnings import warnings
warnings.warn('All labels are ignored. Only works for undirected graphs.') warnings.warn('All labels are ignored. Only works for undirected graphs.')
# compute Gram matrix. # compute Gram matrix.
gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
if self._q is None: if self._q is None:
# precompute the spectral decomposition of each graph. # precompute the spectral decomposition of each graph.
P_list = [] P_list = []
D_list = [] D_list = []
if self._verbose >= 2:
iterator = tqdm(self._graphs, desc='spectral decompose', file=sys.stdout)
else:
iterator = self._graphs
iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2))
for G in iterator: for G in iterator:
# don't normalize adjacency matrices if q is a uniform vector. Note # don't normalize adjacency matrices if q is a uniform vector. Note
# A actually is the transpose of the adjacency matrix. # A actually is the transpose of the adjacency matrix.
@@ -60,42 +57,37 @@ class SpectralDecomposition(RandomWalkMeta):


from itertools import combinations_with_replacement from itertools import combinations_with_replacement
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
if self._verbose >= 2:
iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout)
else:
iterator = itr
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2))

for i, j in iterator: for i, j in iterator:
kernel = self._kernel_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j], D_list[i], D_list[j], self._weight, self._sub_kernel) kernel = self._kernel_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j], D_list[i], D_list[j], self._weight, self._sub_kernel)
gram_matrix[i][j] = kernel gram_matrix[i][j] = kernel
gram_matrix[j][i] = kernel gram_matrix[j][i] = kernel
else: # @todo else: # @todo
pass pass
else: # @todo else: # @todo
pass pass
return gram_matrix return gram_matrix
def _compute_gm_imap_unordered(self): def _compute_gm_imap_unordered(self):
self._check_edge_weight(self._graphs, self._verbose) self._check_edge_weight(self._graphs, self._verbose)
self._check_graphs(self._graphs) self._check_graphs(self._graphs)
if self._verbose >= 2: if self._verbose >= 2:
import warnings import warnings
warnings.warn('All labels are ignored. Only works for undirected graphs.') warnings.warn('All labels are ignored. Only works for undirected graphs.')
# compute Gram matrix. # compute Gram matrix.
gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
if self._q is None: if self._q is None:
# precompute the spectral decomposition of each graph. # precompute the spectral decomposition of each graph.
P_list = [] P_list = []
D_list = [] D_list = []
if self._verbose >= 2:
iterator = tqdm(self._graphs, desc='spectral decompose', file=sys.stdout)
else:
iterator = self._graphs
iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2))
for G in iterator: for G in iterator:
# don't normalize adjacency matrices if q is a uniform vector. Note # don't normalize adjacency matrices if q is a uniform vector. Note
# A actually is the transpose of the adjacency matrix. # A actually is the transpose of the adjacency matrix.
@@ -106,45 +98,42 @@ class SpectralDecomposition(RandomWalkMeta):


if self._p is None: # p is uniform distribution as default. if self._p is None: # p is uniform distribution as default.
q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in self._graphs] # @todo: parallel? q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in self._graphs] # @todo: parallel?
def init_worker(q_T_list_toshare, P_list_toshare, D_list_toshare): def init_worker(q_T_list_toshare, P_list_toshare, D_list_toshare):
global G_q_T_list, G_P_list, G_D_list global G_q_T_list, G_P_list, G_D_list
G_q_T_list = q_T_list_toshare G_q_T_list = q_T_list_toshare
G_P_list = P_list_toshare G_P_list = P_list_toshare
G_D_list = D_list_toshare G_D_list = D_list_toshare
do_fun = self._wrapper_kernel_do
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
do_fun = self._wrapper_kernel_do
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
glbv=(q_T_list, P_list, D_list), n_jobs=self._n_jobs, verbose=self._verbose) glbv=(q_T_list, P_list, D_list), n_jobs=self._n_jobs, verbose=self._verbose)


else: # @todo else: # @todo
pass pass
else: # @todo else: # @todo
pass pass
return gram_matrix return gram_matrix
def _compute_kernel_list_series(self, g1, g_list): def _compute_kernel_list_series(self, g1, g_list):
self._check_edge_weight(g_list + [g1], self._verbose) self._check_edge_weight(g_list + [g1], self._verbose)
self._check_graphs(g_list + [g1]) self._check_graphs(g_list + [g1])
if self._verbose >= 2: if self._verbose >= 2:
import warnings import warnings
warnings.warn('All labels are ignored. Only works for undirected graphs.') warnings.warn('All labels are ignored. Only works for undirected graphs.')
# compute kernel list. # compute kernel list.
kernel_list = [None] * len(g_list) kernel_list = [None] * len(g_list)
if self._q is None: if self._q is None:
# precompute the spectral decomposition of each graph. # precompute the spectral decomposition of each graph.
A1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() A1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose()
D1, P1 = np.linalg.eig(A1) D1, P1 = np.linalg.eig(A1)
P_list = [] P_list = []
D_list = [] D_list = []
if self._verbose >= 2:
iterator = tqdm(g_list, desc='spectral decompose', file=sys.stdout)
else:
iterator = g_list
iterator = get_iters(g_list, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2))
for G in iterator: for G in iterator:
# don't normalize adjacency matrices if q is a uniform vector. Note # don't normalize adjacency matrices if q is a uniform vector. Note
# A actually is the transpose of the adjacency matrix. # A actually is the transpose of the adjacency matrix.
@@ -156,33 +145,30 @@ class SpectralDecomposition(RandomWalkMeta):
if self._p is None: # p is uniform distribution as default. if self._p is None: # p is uniform distribution as default.
q_T1 = 1 / nx.number_of_nodes(g1) q_T1 = 1 / nx.number_of_nodes(g1)
q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in g_list] q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in g_list]
if self._verbose >= 2:
iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout)
else:
iterator = range(len(g_list))
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))

for i in iterator: for i in iterator:
kernel = self._kernel_do(q_T1, q_T_list[i], P1, P_list[i], D1, D_list[i], self._weight, self._sub_kernel) kernel = self._kernel_do(q_T1, q_T_list[i], P1, P_list[i], D1, D_list[i], self._weight, self._sub_kernel)
kernel_list[i] = kernel kernel_list[i] = kernel
else: # @todo else: # @todo
pass pass
else: # @todo else: # @todo
pass pass
return kernel_list return kernel_list
def _compute_kernel_list_imap_unordered(self, g1, g_list): def _compute_kernel_list_imap_unordered(self, g1, g_list):
self._check_edge_weight(g_list + [g1], self._verbose) self._check_edge_weight(g_list + [g1], self._verbose)
self._check_graphs(g_list + [g1]) self._check_graphs(g_list + [g1])
if self._verbose >= 2: if self._verbose >= 2:
import warnings import warnings
warnings.warn('All labels are ignored. Only works for undirected graphs.') warnings.warn('All labels are ignored. Only works for undirected graphs.')
# compute kernel list. # compute kernel list.
kernel_list = [None] * len(g_list) kernel_list = [None] * len(g_list)
if self._q is None: if self._q is None:
# precompute the spectral decomposition of each graph. # precompute the spectral decomposition of each graph.
A1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() A1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose()
@@ -204,7 +190,7 @@ class SpectralDecomposition(RandomWalkMeta):
if self._p is None: # p is uniform distribution as default. if self._p is None: # p is uniform distribution as default.
q_T1 = 1 / nx.number_of_nodes(g1) q_T1 = 1 / nx.number_of_nodes(g1)
q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in g_list] # @todo: parallel? q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in g_list] # @todo: parallel?
def init_worker(q_T1_toshare, P1_toshare, D1_toshare, q_T_list_toshare, P_list_toshare, D_list_toshare): def init_worker(q_T1_toshare, P1_toshare, D1_toshare, q_T_list_toshare, P_list_toshare, D_list_toshare):
global G_q_T1, G_P1, G_D1, G_q_T_list, G_P_list, G_D_list global G_q_T1, G_P1, G_D1, G_q_T_list, G_P_list, G_D_list
G_q_T1 = q_T1_toshare G_q_T1 = q_T1_toshare
@@ -214,34 +200,34 @@ class SpectralDecomposition(RandomWalkMeta):
G_P_list = P_list_toshare G_P_list = P_list_toshare
G_D_list = D_list_toshare G_D_list = D_list_toshare


do_fun = self._wrapper_kernel_list_do
def func_assign(result, var_to_assign):
do_fun = self._wrapper_kernel_list_do
def func_assign(result, var_to_assign):
var_to_assign[result[0]] = result[1] var_to_assign[result[0]] = result[1]
itr = range(len(g_list)) itr = range(len(g_list))
len_itr = len(g_list) len_itr = len(g_list)
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
init_worker=init_worker, glbv=(q_T1, P1, D1, q_T_list, P_list, D_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) init_worker=init_worker, glbv=(q_T1, P1, D1, q_T_list, P_list, D_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)
else: # @todo else: # @todo
pass pass
else: # @todo else: # @todo
pass pass
return kernel_list return kernel_list




def _wrapper_kernel_list_do(self, itr): def _wrapper_kernel_list_do(self, itr):
return itr, self._kernel_do(G_q_T1, G_q_T_list[itr], G_P1, G_P_list[itr], G_D1, G_D_list[itr], self._weight, self._sub_kernel) return itr, self._kernel_do(G_q_T1, G_q_T_list[itr], G_P1, G_P_list[itr], G_D1, G_D_list[itr], self._weight, self._sub_kernel)
def _compute_single_kernel_series(self, g1, g2): def _compute_single_kernel_series(self, g1, g2):
self._check_edge_weight([g1] + [g2], self._verbose) self._check_edge_weight([g1] + [g2], self._verbose)
self._check_graphs([g1] + [g2]) self._check_graphs([g1] + [g2])
if self._verbose >= 2: if self._verbose >= 2:
import warnings import warnings
warnings.warn('All labels are ignored. Only works for undirected graphs.') warnings.warn('All labels are ignored. Only works for undirected graphs.')
if self._q is None: if self._q is None:
# precompute the spectral decomposition of each graph. # precompute the spectral decomposition of each graph.
A1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() A1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose()
@@ -257,10 +243,10 @@ class SpectralDecomposition(RandomWalkMeta):
pass pass
else: # @todo else: # @todo
pass pass
return kernel
return kernel
def _kernel_do(self, q_T1, q_T2, P1, P2, D1, D2, weight, sub_kernel): def _kernel_do(self, q_T1, q_T2, P1, P2, D1, D2, weight, sub_kernel):
# use uniform distribution if there is no prior knowledge. # use uniform distribution if there is no prior knowledge.
kl = kron(np.dot(q_T1, P1), np.dot(q_T2, P2)).todense() kl = kron(np.dot(q_T1, P1), np.dot(q_T2, P2)).todense()
@@ -276,7 +262,7 @@ class SpectralDecomposition(RandomWalkMeta):
kmiddle = np.linalg.inv(kmiddle) kmiddle = np.linalg.inv(kmiddle)
return np.dot(np.dot(kl, kmiddle), kl.T)[0, 0] return np.dot(np.dot(kl, kmiddle), kl.T)[0, 0]
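
With uniform start/stop distributions this is the spectral form of the random-walk kernel of ref [1]: each adjacency matrix is eigendecomposed once, after which every pairwise kernel needs only small dense products. A simplified numeric sketch for the geometric sub-kernel on undirected graphs (the diagonal construction of kmiddle is an assumption here, since that part of _kernel_do sits outside this hunk):

import numpy as np
from scipy.sparse import kron

def spectral_geo_sketch(A1, A2, weight):
    # A1, A2: dense (transposed) adjacency matrices; symmetric for
    # undirected graphs, so the eigendecomposition stays real.
    D1, P1 = np.linalg.eig(A1)
    D2, P2 = np.linalg.eig(A2)
    q_T1 = np.full((1, A1.shape[0]), 1 / A1.shape[0])  # uniform stopping prob.
    q_T2 = np.full((1, A2.shape[0]), 1 / A2.shape[0])
    kl = kron(q_T1 @ P1, q_T2 @ P2).todense()
    # geometric series over pairwise eigenvalue products:
    # kmiddle = (I - weight * (D1 kron D2))^{-1}, a diagonal matrix.
    d = np.array([x * y for x in D1 for y in D2])
    kmiddle = np.diag(1 / (1 - weight * d))
    return (kl @ kmiddle @ kl.T)[0, 0]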


def _wrapper_kernel_do(self, itr): def _wrapper_kernel_do(self, itr):
i = itr[0] i = itr[0]
j = itr[1] j = itr[1]

+ 58
- 75
gklearn/kernels/sylvester_equation.py View File

@@ -5,13 +5,13 @@ Created on Wed Aug 19 17:24:46 2020


@author: ljia @author: ljia


@references:
@references:


[1] S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, 11(Apr):1201–1242, 2010. [1] S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, 11(Apr):1201–1242, 2010.
""" """


import sys import sys
from tqdm import tqdm
from gklearn.utils import get_iters
import numpy as np import numpy as np
import networkx as nx import networkx as nx
from control import dlyap from control import dlyap
@@ -20,11 +20,11 @@ from gklearn.kernels import RandomWalkMeta




class SylvesterEquation(RandomWalkMeta): class SylvesterEquation(RandomWalkMeta):
def __init__(self, **kwargs): def __init__(self, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)


def _compute_gm_series(self): def _compute_gm_series(self):
self._check_edge_weight(self._graphs, self._verbose) self._check_edge_weight(self._graphs, self._verbose)
@@ -32,24 +32,21 @@ class SylvesterEquation(RandomWalkMeta):
if self._verbose >= 2: if self._verbose >= 2:
import warnings import warnings
warnings.warn('All labels are ignored.') warnings.warn('All labels are ignored.')
lmda = self._weight lmda = self._weight
# compute Gram matrix. # compute Gram matrix.
gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
if self._q is None: if self._q is None:
# don't normalize adjacency matrices if q is a uniform vector. Note # don't normalize adjacency matrices if q is a uniform vector. Note
# A_wave_list actually contains the transposes of the adjacency matrices. # A_wave_list actually contains the transposes of the adjacency matrices.
if self._verbose >= 2:
iterator = tqdm(self._graphs, desc='compute adjacency matrices', file=sys.stdout)
else:
iterator = self._graphs
iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2))
A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator]
# # normalized adjacency matrices # # normalized adjacency matrices
# A_wave_list = [] # A_wave_list = []
# for G in tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout): # for G in tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout):
# A_tilde = nx.adjacency_matrix(G, eweight).todense().transpose()
# A_tilde = nx.adjacency_matrix(G, eweight).todense().transpose()
# norm = A_tilde.sum(axis=0) # norm = A_tilde.sum(axis=0)
# norm[norm == 0] = 1 # norm[norm == 0] = 1
# A_wave_list.append(A_tilde / norm) # A_wave_list.append(A_tilde / norm)
@@ -57,119 +54,105 @@ class SylvesterEquation(RandomWalkMeta):
if self._p is None: # p is uniform distribution as default. if self._p is None: # p is uniform distribution as default.
from itertools import combinations_with_replacement from itertools import combinations_with_replacement
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
if self._verbose >= 2:
iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout)
else:
iterator = itr
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2))

for i, j in iterator: for i, j in iterator:
kernel = self._kernel_do(A_wave_list[i], A_wave_list[j], lmda) kernel = self._kernel_do(A_wave_list[i], A_wave_list[j], lmda)
gram_matrix[i][j] = kernel gram_matrix[i][j] = kernel
gram_matrix[j][i] = kernel gram_matrix[j][i] = kernel
else: # @todo else: # @todo
pass pass
else: # @todo else: # @todo
pass pass
return gram_matrix return gram_matrix
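
The _kernel_do called in the loop above (defined outside this hunk) uses control.dlyap to solve the Sylvester form of the geometric random-walk kernel rather than inverting the full n1*n2 x n1*n2 system. As a grounding reference, the value it computes is, with uniform p and q, the direct dense computation below; the Sylvester route obtains the same number far more cheaply:

import numpy as np

def geo_rw_kernel_dense(A_wave1, A_wave2, lmda):
    """Reference value: k = q^T (I - lmda * W)^{-1} p on the product graph,
    with W = kron(A_wave2, A_wave1) via the vec trick vec(A1 X A2^T) = W vec(X)."""
    n = A_wave1.shape[0] * A_wave2.shape[0]
    W = np.kron(A_wave2, A_wave1)
    p = np.full((n, 1), 1 / n)   # uniform starting distribution
    q = np.full((1, n), 1 / n)   # uniform stopping distribution
    x = np.linalg.solve(np.eye(n) - lmda * W, p)  # the O((n1*n2)^3) step
    return (q @ x)[0, 0]         # the Sylvester solver avoids forming W at all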
def _compute_gm_imap_unordered(self): def _compute_gm_imap_unordered(self):
self._check_edge_weight(self._graphs, self._verbose) self._check_edge_weight(self._graphs, self._verbose)
self._check_graphs(self._graphs) self._check_graphs(self._graphs)
if self._verbose >= 2: if self._verbose >= 2:
import warnings import warnings
warnings.warn('All labels are ignored.') warnings.warn('All labels are ignored.')
# compute Gram matrix. # compute Gram matrix.
gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
if self._q is None: if self._q is None:
# don't normalize adjacency matrices if q is a uniform vector. Note # don't normalize adjacency matrices if q is a uniform vector. Note
# A_wave_list actually contains the transposes of the adjacency matrices. # A_wave_list actually contains the transposes of the adjacency matrices.
if self._verbose >= 2:
iterator = tqdm(self._graphs, desc='compute adjacency matrices', file=sys.stdout)
else:
iterator = self._graphs
iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2))
A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel? A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel?


if self._p is None: # p is uniform distribution as default. if self._p is None: # p is uniform distribution as default.
def init_worker(A_wave_list_toshare): def init_worker(A_wave_list_toshare):
global G_A_wave_list global G_A_wave_list
G_A_wave_list = A_wave_list_toshare G_A_wave_list = A_wave_list_toshare
do_fun = self._wrapper_kernel_do do_fun = self._wrapper_kernel_do
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
glbv=(A_wave_list,), n_jobs=self._n_jobs, verbose=self._verbose) glbv=(A_wave_list,), n_jobs=self._n_jobs, verbose=self._verbose)


else: # @todo else: # @todo
pass pass
else: # @todo else: # @todo
pass pass
return gram_matrix return gram_matrix
def _compute_kernel_list_series(self, g1, g_list): def _compute_kernel_list_series(self, g1, g_list):
self._check_edge_weight(g_list + [g1], self._verbose) self._check_edge_weight(g_list + [g1], self._verbose)
self._check_graphs(g_list + [g1]) self._check_graphs(g_list + [g1])
if self._verbose >= 2: if self._verbose >= 2:
import warnings import warnings
warnings.warn('All labels are ignored.') warnings.warn('All labels are ignored.')
lmda = self._weight lmda = self._weight
# compute kernel list. # compute kernel list.
kernel_list = [None] * len(g_list) kernel_list = [None] * len(g_list)
if self._q is None: if self._q is None:
# don't normalize adjacency matrices if q is a uniform vector. Note # don't normalize adjacency matrices if q is a uniform vector. Note
# A_wave_list actually contains the transposes of the adjacency matrices. # A_wave_list actually contains the transposes of the adjacency matrices.
A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose()
if self._verbose >= 2:
iterator = tqdm(g_list, desc='compute adjacency matrices', file=sys.stdout)
else:
iterator = g_list
iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2))
A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator]


if self._p is None: # p is uniform distribution as default. if self._p is None: # p is uniform distribution as default.
if self._verbose >= 2:
iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout)
else:
iterator = range(len(g_list))
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))

for i in iterator: for i in iterator:
kernel = self._kernel_do(A_wave_1, A_wave_list[i], lmda) kernel = self._kernel_do(A_wave_1, A_wave_list[i], lmda)
kernel_list[i] = kernel kernel_list[i] = kernel
else: # @todo else: # @todo
pass pass
else: # @todo else: # @todo
pass pass
return kernel_list return kernel_list
def _compute_kernel_list_imap_unordered(self, g1, g_list): def _compute_kernel_list_imap_unordered(self, g1, g_list):
self._check_edge_weight(g_list + [g1], self._verbose) self._check_edge_weight(g_list + [g1], self._verbose)
self._check_graphs(g_list + [g1]) self._check_graphs(g_list + [g1])
if self._verbose >= 2: if self._verbose >= 2:
import warnings import warnings
warnings.warn('All labels are ignored.') warnings.warn('All labels are ignored.')
# compute kernel list. # compute kernel list.
kernel_list = [None] * len(g_list) kernel_list = [None] * len(g_list)
if self._q is None: if self._q is None:
# don't normalize adjacency matrices if q is a uniform vector. Note # don't normalize adjacency matrices if q is a uniform vector. Note
# A_wave_list actually contains the transposes of the adjacency matrices. # A_wave_list actually contains the transposes of the adjacency matrices.
A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose()
if self._verbose >= 2:
iterator = tqdm(g_list, desc='compute adjacency matrices', file=sys.stdout)
else:
iterator = g_list
iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2))
A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel? A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel?


if self._p is None: # p is uniform distribution as default. if self._p is None: # p is uniform distribution as default.
@@ -178,37 +161,37 @@ class SylvesterEquation(RandomWalkMeta):
G_A_wave_1 = A_wave_1_toshare G_A_wave_1 = A_wave_1_toshare
G_A_wave_list = A_wave_list_toshare G_A_wave_list = A_wave_list_toshare


do_fun = self._wrapper_kernel_list_do
def func_assign(result, var_to_assign):
do_fun = self._wrapper_kernel_list_do
def func_assign(result, var_to_assign):
var_to_assign[result[0]] = result[1] var_to_assign[result[0]] = result[1]
itr = range(len(g_list)) itr = range(len(g_list))
len_itr = len(g_list) len_itr = len(g_list)
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
init_worker=init_worker, glbv=(A_wave_1, A_wave_list), method='imap_unordered',
init_worker=init_worker, glbv=(A_wave_1, A_wave_list), method='imap_unordered',
n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)
else: # @todo else: # @todo
pass pass
else: # @todo else: # @todo
pass pass
return kernel_list return kernel_list




def _wrapper_kernel_list_do(self, itr): def _wrapper_kernel_list_do(self, itr):
return itr, self._kernel_do(G_A_wave_1, G_A_wave_list[itr], self._weight) return itr, self._kernel_do(G_A_wave_1, G_A_wave_list[itr], self._weight)
def _compute_single_kernel_series(self, g1, g2): def _compute_single_kernel_series(self, g1, g2):
self._check_edge_weight([g1] + [g2], self._verbose) self._check_edge_weight([g1] + [g2], self._verbose)
self._check_graphs([g1] + [g2]) self._check_graphs([g1] + [g2])
if self._verbose >= 2: if self._verbose >= 2:
import warnings import warnings
warnings.warn('All labels are ignored.') warnings.warn('All labels are ignored.')
lmda = self._weight lmda = self._weight
if self._q is None: if self._q is None:
# don't normalize adjacency matrices if q is a uniform vector. Note # don't normalize adjacency matrices if q is a uniform vector. Note
# A_wave_list actually contains the transposes of the adjacency matrices. # A_wave_list actually contains the transposes of the adjacency matrices.
@@ -220,12 +203,12 @@ class SylvesterEquation(RandomWalkMeta):
pass pass
else: # @todo else: # @todo
pass pass
return kernel
return kernel
def _kernel_do(self, A_wave1, A_wave2, lmda): def _kernel_do(self, A_wave1, A_wave2, lmda):
S = lmda * A_wave2 S = lmda * A_wave2
T_t = A_wave1 T_t = A_wave1
# use uniform distribution if there is no prior knowledge. # use uniform distribution if there is no prior knowledge.
@@ -237,8 +220,8 @@ class SylvesterEquation(RandomWalkMeta):
# use uniform distribution if there is no prior knowledge. # use uniform distribution if there is no prior knowledge.
q_times = np.full((1, nb_pd), p_times_uni) q_times = np.full((1, nb_pd), p_times_uni)
return np.dot(q_times, X) return np.dot(q_times, X)
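The hunk elides most of `_kernel_do`; for orientation, the quantity computed is the geometric random-walk kernel k(G1, G2) = qᵀ(I − λW)⁻¹p, where W is the adjacency matrix of the direct-product graph and p, q are uniform here. A brute-force sketch via the explicit Kronecker product follows — an illustration only, not the Sylvester solver this class uses (the point of the Sylvester formulation is precisely to avoid materialising the n1·n2 × n1·n2 matrix W):

    import numpy as np
    import networkx as nx

    def rw_kernel_bruteforce(g1, g2, lmda=0.01):
        # direct-product adjacency: W = A1 (x) A2
        W = np.kron(nx.to_numpy_array(g1), nx.to_numpy_array(g2))
        n = W.shape[0]
        p = np.full(n, 1.0 / n)  # uniform starting distribution
        q = np.full(n, 1.0 / n)  # uniform stopping distribution
        # geometric series sum_k lmda^k W^k collapses to (I - lmda*W)^-1
        x = np.linalg.solve(np.eye(n) - lmda * W, p)
        return float(q @ x)

    print(rw_kernel_bruteforce(nx.path_graph(3), nx.cycle_graph(4)))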
def _wrapper_kernel_do(self, itr): def _wrapper_kernel_do(self, itr):
i = itr[0] i = itr[0]
j = itr[1] j = itr[1]

+ 118
- 131
gklearn/kernels/treelet.py View File

@@ -5,15 +5,15 @@ Created on Mon Apr 13 18:02:46 2020


@author: ljia @author: ljia


@references:
@references:


[1] Gaüzère B, Brun L, Villemin D. Two new graphs kernels in
[1] Gaüzère B, Brun L, Villemin D. Two new graphs kernels in
chemoinformatics. Pattern Recognition Letters. 2012 Nov 1;33(15):2038-47. chemoinformatics. Pattern Recognition Letters. 2012 Nov 1;33(15):2038-47.
""" """


import sys import sys
from multiprocessing import Pool from multiprocessing import Pool
from tqdm import tqdm
from gklearn.utils import get_iters
import numpy as np import numpy as np
import networkx as nx import networkx as nx
from collections import Counter from collections import Counter
@@ -25,7 +25,7 @@ from gklearn.kernels import GraphKernel




class Treelet(GraphKernel): class Treelet(GraphKernel):
def __init__(self, **kwargs): def __init__(self, **kwargs):
GraphKernel.__init__(self) GraphKernel.__init__(self)
self._node_labels = kwargs.get('node_labels', []) self._node_labels = kwargs.get('node_labels', [])
@@ -38,38 +38,35 @@ class Treelet(GraphKernel):


def _compute_gm_series(self): def _compute_gm_series(self):
self._add_dummy_labels(self._graphs) self._add_dummy_labels(self._graphs)
# get all canonical keys of all graphs before computing kernels to save
# get all canonical keys of all graphs before computing kernels to save
# time, but this may cost a lot of memory for large dataset. # time, but this may cost a lot of memory for large dataset.
canonkeys = [] canonkeys = []
if self._verbose >= 2:
iterator = tqdm(self._graphs, desc='getting canonkeys', file=sys.stdout)
else:
iterator = self._graphs
iterator = get_iters(self._graphs, desc='getting canonkeys', file=sys.stdout,
verbose=(self._verbose >= 2))
for g in iterator: for g in iterator:
canonkeys.append(self._get_canonkeys(g)) canonkeys.append(self._get_canonkeys(g))
# compute Gram matrix. # compute Gram matrix.
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
from itertools import combinations_with_replacement from itertools import combinations_with_replacement
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
if self._verbose >= 2:
iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout)
else:
iterator = itr
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout,
length=len_itr, verbose=(self._verbose >= 2))
for i, j in iterator: for i, j in iterator:
kernel = self._kernel_do(canonkeys[i], canonkeys[j]) kernel = self._kernel_do(canonkeys[i], canonkeys[j])
gram_matrix[i][j] = kernel gram_matrix[i][j] = kernel
gram_matrix[j][i] = kernel # @todo: no directed graph considered? gram_matrix[j][i] = kernel # @todo: no directed graph considered?
return gram_matrix return gram_matrix
def _compute_gm_imap_unordered(self): def _compute_gm_imap_unordered(self):
self._add_dummy_labels(self._graphs) self._add_dummy_labels(self._graphs)
# get all canonical keys of all graphs before computing kernels to save
# get all canonical keys of all graphs before computing kernels to save
# time, but this may cost a lot of memory for large dataset. # time, but this may cost a lot of memory for large dataset.
pool = Pool(self._n_jobs) pool = Pool(self._n_jobs)
itr = zip(self._graphs, range(0, len(self._graphs))) itr = zip(self._graphs, range(0, len(self._graphs)))
@@ -79,60 +76,52 @@ class Treelet(GraphKernel):
chunksize = 100 chunksize = 100
canonkeys = [[] for _ in range(len(self._graphs))] canonkeys = [[] for _ in range(len(self._graphs))]
get_fun = self._wrapper_get_canonkeys get_fun = self._wrapper_get_canonkeys
if self._verbose >= 2:
iterator = tqdm(pool.imap_unordered(get_fun, itr, chunksize),
desc='getting canonkeys', file=sys.stdout)
else:
iterator = pool.imap_unordered(get_fun, itr, chunksize)
iterator = get_iters(pool.imap_unordered(get_fun, itr, chunksize),
desc='getting canonkeys', file=sys.stdout,
length=len(self._graphs), verbose=(self._verbose >= 2))
for i, ck in iterator: for i, ck in iterator:
canonkeys[i] = ck canonkeys[i] = ck
pool.close() pool.close()
pool.join() pool.join()
# compute Gram matrix. # compute Gram matrix.
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
def init_worker(canonkeys_toshare): def init_worker(canonkeys_toshare):
global G_canonkeys global G_canonkeys
G_canonkeys = canonkeys_toshare G_canonkeys = canonkeys_toshare
do_fun = self._wrapper_kernel_do do_fun = self._wrapper_kernel_do
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
glbv=(canonkeys,), n_jobs=self._n_jobs, verbose=self._verbose) glbv=(canonkeys,), n_jobs=self._n_jobs, verbose=self._verbose)
return gram_matrix return gram_matrix
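The `init_worker` trick above is how the precomputed canonkeys reach every subprocess once, through a module-level global set at pool start-up, instead of being pickled with each task. A generic sketch of the pattern with illustrative names only (not gklearn's `parallel_gm` API):

    from multiprocessing import Pool

    def init_worker(shared):
        global G_shared
        G_shared = shared  # copied into each worker once at pool start-up

    def pair_kernel(ij):  # stand-in for a _wrapper_kernel_do-style task
        i, j = ij
        return i, j, G_shared[i] * G_shared[j]

    if __name__ == '__main__':
        data = [1.0, 2.0, 3.0, 4.0]
        pairs = [(i, j) for i in range(4) for j in range(i, 4)]
        with Pool(2, initializer=init_worker, initargs=(data,)) as pool:
            for i, j, k in pool.imap_unordered(pair_kernel, pairs):
                print(i, j, k)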
def _compute_kernel_list_series(self, g1, g_list): def _compute_kernel_list_series(self, g1, g_list):
self._add_dummy_labels(g_list + [g1]) self._add_dummy_labels(g_list + [g1])
# get all canonical keys of all graphs before computing kernels to save
# get all canonical keys of all graphs before computing kernels to save
# time, but this may cost a lot of memory for large dataset. # time, but this may cost a lot of memory for large dataset.
canonkeys_1 = self._get_canonkeys(g1) canonkeys_1 = self._get_canonkeys(g1)
canonkeys_list = [] canonkeys_list = []
if self._verbose >= 2:
iterator = tqdm(g_list, desc='getting canonkeys', file=sys.stdout)
else:
iterator = g_list
iterator = get_iters(g_list, desc='getting canonkeys', file=sys.stdout, verbose=(self._verbose >= 2))
for g in iterator: for g in iterator:
canonkeys_list.append(self._get_canonkeys(g)) canonkeys_list.append(self._get_canonkeys(g))

# compute kernel list. # compute kernel list.
kernel_list = [None] * len(g_list) kernel_list = [None] * len(g_list)
if self._verbose >= 2:
iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout)
else:
iterator = range(len(g_list))
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))
for i in iterator: for i in iterator:
kernel = self._kernel_do(canonkeys_1, canonkeys_list[i]) kernel = self._kernel_do(canonkeys_1, canonkeys_list[i])
kernel_list[i] = kernel kernel_list[i] = kernel
return kernel_list return kernel_list
def _compute_kernel_list_imap_unordered(self, g1, g_list): def _compute_kernel_list_imap_unordered(self, g1, g_list):
self._add_dummy_labels(g_list + [g1]) self._add_dummy_labels(g_list + [g1])
# get all canonical keys of all graphs before computing kernels to save
# get all canonical keys of all graphs before computing kernels to save
# time, but this may cost a lot of memory for large dataset. # time, but this may cost a lot of memory for large dataset.
canonkeys_1 = self._get_canonkeys(g1) canonkeys_1 = self._get_canonkeys(g1)
canonkeys_list = [[] for _ in range(len(g_list))] canonkeys_list = [[] for _ in range(len(g_list))]
@@ -143,16 +132,14 @@ class Treelet(GraphKernel):
else: else:
chunksize = 100 chunksize = 100
get_fun = self._wrapper_get_canonkeys get_fun = self._wrapper_get_canonkeys
if self._verbose >= 2:
iterator = tqdm(pool.imap_unordered(get_fun, itr, chunksize),
desc='getting canonkeys', file=sys.stdout)
else:
iterator = pool.imap_unordered(get_fun, itr, chunksize)
iterator = get_iters(pool.imap_unordered(get_fun, itr, chunksize),
desc='getting canonkeys', file=sys.stdout,
length=len(g_list), verbose=(self._verbose >= 2))
for i, ck in iterator: for i, ck in iterator:
canonkeys_list[i] = ck canonkeys_list[i] = ck
pool.close() pool.close()
pool.join() pool.join()
# compute kernel list. # compute kernel list.
kernel_list = [None] * len(g_list) kernel_list = [None] * len(g_list)


@@ -161,37 +148,37 @@ class Treelet(GraphKernel):
G_ck_1 = ck_1_toshare G_ck_1 = ck_1_toshare
G_ck_list = ck_list_toshare G_ck_list = ck_list_toshare
do_fun = self._wrapper_kernel_list_do do_fun = self._wrapper_kernel_list_do
def func_assign(result, var_to_assign):
def func_assign(result, var_to_assign):
var_to_assign[result[0]] = result[1] var_to_assign[result[0]] = result[1]
itr = range(len(g_list)) itr = range(len(g_list))
len_itr = len(g_list) len_itr = len(g_list)
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
init_worker=init_worker, glbv=(canonkeys_1, canonkeys_list), method='imap_unordered',
init_worker=init_worker, glbv=(canonkeys_1, canonkeys_list), method='imap_unordered',
n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)
return kernel_list return kernel_list
def _wrapper_kernel_list_do(self, itr): def _wrapper_kernel_list_do(self, itr):
return itr, self._kernel_do(G_ck_1, G_ck_list[itr]) return itr, self._kernel_do(G_ck_1, G_ck_list[itr])
def _compute_single_kernel_series(self, g1, g2): def _compute_single_kernel_series(self, g1, g2):
self._add_dummy_labels([g1] + [g2]) self._add_dummy_labels([g1] + [g2])
canonkeys_1 = self._get_canonkeys(g1) canonkeys_1 = self._get_canonkeys(g1)
canonkeys_2 = self._get_canonkeys(g2) canonkeys_2 = self._get_canonkeys(g2)
kernel = self._kernel_do(canonkeys_1, canonkeys_2) kernel = self._kernel_do(canonkeys_1, canonkeys_2)
return kernel
return kernel
def _kernel_do(self, canonkey1, canonkey2): def _kernel_do(self, canonkey1, canonkey2):
"""Compute treelet graph kernel between 2 graphs. """Compute treelet graph kernel between 2 graphs.
Parameters Parameters
---------- ----------
canonkey1, canonkey2 : list canonkey1, canonkey2 : list
List of canonical keys in 2 graphs, where each key is represented by a string. List of canonical keys in 2 graphs, where each key is represented by a string.
Return Return
------ ------
kernel : float kernel : float
@@ -199,38 +186,38 @@ class Treelet(GraphKernel):
""" """
keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs
vector1 = np.array([(canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys]) vector1 = np.array([(canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys])
vector2 = np.array([(canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys])
kernel = self._sub_kernel(vector1, vector2)
vector2 = np.array([(canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys])
kernel = self._sub_kernel(vector1, vector2)
return kernel return kernel
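In `_kernel_do` the two graphs are compared only on the canonical keys they share: the treelet counts are stacked into two vectors and handed to the configurable `self._sub_kernel`. A minimal sketch assuming a Gaussian sub-kernel — a common choice, but the actual sub-kernel, the `gamma` value, and the names below are illustrative:

    import numpy as np
    from collections import Counter

    def treelet_kernel_sketch(canonkey1, canonkey2, gamma=1.0):
        keys = set(canonkey1) & set(canonkey2)  # treelets present in both graphs
        v1 = np.array([canonkey1[k] for k in keys], dtype=float)
        v2 = np.array([canonkey2[k] for k in keys], dtype=float)
        return float(np.exp(-gamma * np.sum((v1 - v2) ** 2)))  # Gaussian sub-kernel

    ck1 = Counter({'0': 5, '1': 4, '6': 1})  # toy canonical-key counts
    ck2 = Counter({'0': 4, '1': 3, '8': 2})
    print(treelet_kernel_sketch(ck1, ck2))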
def _wrapper_kernel_do(self, itr): def _wrapper_kernel_do(self, itr):
i = itr[0] i = itr[0]
j = itr[1] j = itr[1]
return i, j, self._kernel_do(G_canonkeys[i], G_canonkeys[j]) return i, j, self._kernel_do(G_canonkeys[i], G_canonkeys[j])
def _get_canonkeys(self, G): def _get_canonkeys(self, G):
"""Generate canonical keys of all treelets in a graph. """Generate canonical keys of all treelets in a graph.
Parameters Parameters
---------- ----------
G : NetworkX graphs G : NetworkX graphs
The graph in which keys are generated. The graph in which keys are generated.
Return Return
------ ------
canonkey/canonkey_l : dict canonkey/canonkey_l : dict
For unlabeled graphs, canonkey is a dictionary which records amount of
every tree pattern. For labeled graphs, canonkey_l is one which keeps
For unlabeled graphs, canonkey is a dictionary which records amount of
every tree pattern. For labeled graphs, canonkey_l is one which keeps
track of amount of every treelet. track of amount of every treelet.
""" """
patterns = {} # a dictionary which consists of lists of patterns for all graphlet. patterns = {} # a dictionary which consists of lists of patterns for all graphlet.
canonkey = {} # canonical key, a dictionary which records amount of every tree pattern. canonkey = {} # canonical key, a dictionary which records amount of every tree pattern.
### structural analysis ### ### structural analysis ###
### In this section, a list of patterns is generated for each graphlet,
### where every pattern is represented by nodes ordered by Morgan's
### In this section, a list of patterns is generated for each graphlet,
### where every pattern is represented by nodes ordered by Morgan's
### extended labeling. ### extended labeling.
# linear patterns # linear patterns
patterns['0'] = list(G.nodes()) patterns['0'] = list(G.nodes())
@@ -238,16 +225,16 @@ class Treelet(GraphKernel):
for i in range(1, 6): # for i in range(1, 6): for i in range(1, 6): # for i in range(1, 6):
patterns[str(i)] = find_all_paths(G, i, self._ds_infos['directed']) patterns[str(i)] = find_all_paths(G, i, self._ds_infos['directed'])
canonkey[str(i)] = len(patterns[str(i)]) canonkey[str(i)] = len(patterns[str(i)])
# n-star patterns # n-star patterns
patterns['3star'] = [[node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 3] patterns['3star'] = [[node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 3]
patterns['4star'] = [[node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 4] patterns['4star'] = [[node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 4]
patterns['5star'] = [[node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 5]
patterns['5star'] = [[node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 5]
# n-star patterns # n-star patterns
canonkey['6'] = len(patterns['3star']) canonkey['6'] = len(patterns['3star'])
canonkey['8'] = len(patterns['4star']) canonkey['8'] = len(patterns['4star'])
canonkey['d'] = len(patterns['5star']) canonkey['d'] = len(patterns['5star'])
# pattern 7 # pattern 7
patterns['7'] = [] # the 1st line of Table 1 in Ref [1] patterns['7'] = [] # the 1st line of Table 1 in Ref [1]
for pattern in patterns['3star']: for pattern in patterns['3star']:
@@ -261,7 +248,7 @@ class Treelet(GraphKernel):
new_pattern = pattern_t + [neighborx] new_pattern = pattern_t + [neighborx]
patterns['7'].append(new_pattern) patterns['7'].append(new_pattern)
canonkey['7'] = len(patterns['7']) canonkey['7'] = len(patterns['7'])
# pattern 11 # pattern 11
patterns['11'] = [] # the 4th line of Table 1 in Ref [1] patterns['11'] = [] # the 4th line of Table 1 in Ref [1]
for pattern in patterns['4star']: for pattern in patterns['4star']:
@@ -274,7 +261,7 @@ class Treelet(GraphKernel):
new_pattern = pattern_t + [neighborx] new_pattern = pattern_t + [neighborx]
patterns['11'].append(new_pattern) patterns['11'].append(new_pattern)
canonkey['b'] = len(patterns['11']) canonkey['b'] = len(patterns['11'])
# pattern 12 # pattern 12
patterns['12'] = [] # the 5th line of Table 1 in Ref [1] patterns['12'] = [] # the 5th line of Table 1 in Ref [1]
rootlist = [] # a list of root nodes, whose extended labels are 3 rootlist = [] # a list of root nodes, whose extended labels are 3
@@ -294,7 +281,7 @@ class Treelet(GraphKernel):
# new_patterns = [ pattern + [neighborx1] + [neighborx2] for neighborx1 in G[pattern[i]] if neighborx1 != pattern[0] for neighborx2 in G[pattern[i]] if (neighborx1 > neighborx2 and neighborx2 != pattern[0]) ] # new_patterns = [ pattern + [neighborx1] + [neighborx2] for neighborx1 in G[pattern[i]] if neighborx1 != pattern[0] for neighborx2 in G[pattern[i]] if (neighborx1 > neighborx2 and neighborx2 != pattern[0]) ]
patterns['12'].append(new_pattern) patterns['12'].append(new_pattern)
canonkey['c'] = int(len(patterns['12']) / 2) canonkey['c'] = int(len(patterns['12']) / 2)
# pattern 9 # pattern 9
patterns['9'] = [] # the 2nd line of Table 1 in Ref [1] patterns['9'] = [] # the 2nd line of Table 1 in Ref [1]
for pattern in patterns['3star']: for pattern in patterns['3star']:
@@ -311,10 +298,10 @@ class Treelet(GraphKernel):
new_pattern = pattern_t + [neighborx1] + [neighborx2] new_pattern = pattern_t + [neighborx1] + [neighborx2]
patterns['9'].append(new_pattern) patterns['9'].append(new_pattern)
canonkey['9'] = len(patterns['9']) canonkey['9'] = len(patterns['9'])
# pattern 10 # pattern 10
patterns['10'] = [] # the 3rd line of Table 1 in Ref [1] patterns['10'] = [] # the 3rd line of Table 1 in Ref [1]
for pattern in patterns['3star']:
for pattern in patterns['3star']:
for i in range(1, len(pattern)): for i in range(1, len(pattern)):
if G.degree(pattern[i]) >= 2: if G.degree(pattern[i]) >= 2:
for neighborx in G[pattern[i]]: for neighborx in G[pattern[i]]:
@@ -324,20 +311,20 @@ class Treelet(GraphKernel):
new_patterns = [ pattern_t + [neighborx] + [neighborxx] for neighborxx in G[neighborx] if neighborxx != pattern[i] ] new_patterns = [ pattern_t + [neighborx] + [neighborxx] for neighborxx in G[neighborx] if neighborxx != pattern[i] ]
patterns['10'].extend(new_patterns) patterns['10'].extend(new_patterns)
canonkey['a'] = len(patterns['10']) canonkey['a'] = len(patterns['10'])
### labeling information ### ### labeling information ###
### In this section, a list of canonical keys is generated for every
### pattern obtained in the structural analysis section above, which is a
### In this section, a list of canonical keys is generated for every
### pattern obtained in the structural analysis section above, which is a
### string corresponding to a unique treelet. A dictionary is built to keep ### string corresponding to a unique treelet. A dictionary is built to keep
### track of the amount of every treelet. ### track of the amount of every treelet.
if len(self._node_labels) > 0 or len(self._edge_labels) > 0: if len(self._node_labels) > 0 or len(self._edge_labels) > 0:
canonkey_l = {} # canonical key, a dictionary which keeps track of amount of every treelet. canonkey_l = {} # canonical key, a dictionary which keeps track of amount of every treelet.
# linear patterns # linear patterns
canonkey_t = Counter(get_mlti_dim_node_attrs(G, self._node_labels)) canonkey_t = Counter(get_mlti_dim_node_attrs(G, self._node_labels))
for key in canonkey_t: for key in canonkey_t:
canonkey_l[('0', key)] = canonkey_t[key] canonkey_l[('0', key)] = canonkey_t[key]
for i in range(1, 6): # for i in range(1, 6): for i in range(1, 6): # for i in range(1, 6):
treelet = [] treelet = []
for pattern in patterns[str(i)]: for pattern in patterns[str(i)]:
@@ -349,7 +336,7 @@ class Treelet(GraphKernel):
canonkey_t = canonlist if canonlist < canonlist[::-1] else canonlist[::-1] canonkey_t = canonlist if canonlist < canonlist[::-1] else canonlist[::-1]
treelet.append(tuple([str(i)] + canonkey_t)) treelet.append(tuple([str(i)] + canonkey_t))
canonkey_l.update(Counter(treelet)) canonkey_l.update(Counter(treelet))
# n-star patterns # n-star patterns
for i in range(3, 6): for i in range(3, 6):
treelet = [] treelet = []
@@ -361,12 +348,12 @@ class Treelet(GraphKernel):
canonlist.append(tuple((nlabels, elabels))) canonlist.append(tuple((nlabels, elabels)))
canonlist.sort() canonlist.sort()
canonlist = list(chain.from_iterable(canonlist)) canonlist = list(chain.from_iterable(canonlist))
canonkey_t = tuple(['d' if i == 5 else str(i * 2)] +
[tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)]
canonkey_t = tuple(['d' if i == 5 else str(i * 2)] +
[tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)]
+ canonlist) + canonlist)
treelet.append(canonkey_t) treelet.append(canonkey_t)
canonkey_l.update(Counter(treelet)) canonkey_l.update(Counter(treelet))
# pattern 7 # pattern 7
treelet = [] treelet = []
for pattern in patterns['7']: for pattern in patterns['7']:
@@ -377,15 +364,15 @@ class Treelet(GraphKernel):
canonlist.append(tuple((nlabels, elabels))) canonlist.append(tuple((nlabels, elabels)))
canonlist.sort() canonlist.sort()
canonlist = list(chain.from_iterable(canonlist)) canonlist = list(chain.from_iterable(canonlist))
canonkey_t = tuple(['7']
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist
+ [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)]
canonkey_t = tuple(['7']
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist
+ [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)]
+ [tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)] + [tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)]
+ [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)]
+ [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)]
+ [tuple(G[pattern[4]][pattern[3]][el] for el in self._edge_labels)]) + [tuple(G[pattern[4]][pattern[3]][el] for el in self._edge_labels)])
treelet.append(canonkey_t) treelet.append(canonkey_t)
canonkey_l.update(Counter(treelet)) canonkey_l.update(Counter(treelet))
# pattern 11 # pattern 11
treelet = [] treelet = []
for pattern in patterns['11']: for pattern in patterns['11']:
@@ -396,15 +383,15 @@ class Treelet(GraphKernel):
canonlist.append(tuple((nlabels, elabels))) canonlist.append(tuple((nlabels, elabels)))
canonlist.sort() canonlist.sort()
canonlist = list(chain.from_iterable(canonlist)) canonlist = list(chain.from_iterable(canonlist))
canonkey_t = tuple(['b']
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist
+ [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)]
canonkey_t = tuple(['b']
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist
+ [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)]
+ [tuple(G[pattern[4]][pattern[0]][el] for el in self._edge_labels)] + [tuple(G[pattern[4]][pattern[0]][el] for el in self._edge_labels)]
+ [tuple(G.nodes[pattern[5]][nl] for nl in self._node_labels)]
+ [tuple(G.nodes[pattern[5]][nl] for nl in self._node_labels)]
+ [tuple(G[pattern[5]][pattern[4]][el] for el in self._edge_labels)]) + [tuple(G[pattern[5]][pattern[4]][el] for el in self._edge_labels)])
treelet.append(canonkey_t) treelet.append(canonkey_t)
canonkey_l.update(Counter(treelet)) canonkey_l.update(Counter(treelet))
# pattern 10 # pattern 10
treelet = [] treelet = []
for pattern in patterns['10']: for pattern in patterns['10']:
@@ -418,15 +405,15 @@ class Treelet(GraphKernel):
canonlist.sort() canonlist.sort()
canonkey0 = list(chain.from_iterable(canonlist)) canonkey0 = list(chain.from_iterable(canonlist))
canonkey_t = tuple(['a'] canonkey_t = tuple(['a']
+ [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)]
+ [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)]
+ [tuple(G[pattern[4]][pattern[3]][el] for el in self._edge_labels)]
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)]
+ [tuple(G[pattern[0]][pattern[3]][el] for el in self._edge_labels)]
+ [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)]
+ [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)]
+ [tuple(G[pattern[4]][pattern[3]][el] for el in self._edge_labels)]
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)]
+ [tuple(G[pattern[0]][pattern[3]][el] for el in self._edge_labels)]
+ canonkey4 + canonkey0) + canonkey4 + canonkey0)
treelet.append(canonkey_t) treelet.append(canonkey_t)
canonkey_l.update(Counter(treelet)) canonkey_l.update(Counter(treelet))
# pattern 12 # pattern 12
treelet = [] treelet = []
for pattern in patterns['12']: for pattern in patterns['12']:
@@ -444,22 +431,22 @@ class Treelet(GraphKernel):
canonlist3.append(tuple((nlabels, elabels))) canonlist3.append(tuple((nlabels, elabels)))
canonlist3.sort() canonlist3.sort()
canonlist3 = list(chain.from_iterable(canonlist3)) canonlist3 = list(chain.from_iterable(canonlist3))
# 2 possible key can be generated from 2 nodes with extended label 3,
# 2 possible key can be generated from 2 nodes with extended label 3,
# select the one with lower lexicographic order. # select the one with lower lexicographic order.
canonkey_t1 = tuple(['c']
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist0
+ [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)]
+ [tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)]
canonkey_t1 = tuple(['c']
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist0
+ [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)]
+ [tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)]
+ canonlist3) + canonlist3)
canonkey_t2 = tuple(['c']
+ [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] + canonlist3
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)]
+ [tuple(G[pattern[0]][pattern[3]][el] for el in self._edge_labels)]
canonkey_t2 = tuple(['c']
+ [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] + canonlist3
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)]
+ [tuple(G[pattern[0]][pattern[3]][el] for el in self._edge_labels)]
+ canonlist0) + canonlist0)
treelet.append(canonkey_t1 if canonkey_t1 < canonkey_t2 else canonkey_t2) treelet.append(canonkey_t1 if canonkey_t1 < canonkey_t2 else canonkey_t2)
canonkey_l.update(Counter(treelet)) canonkey_l.update(Counter(treelet))
# pattern 9 # pattern 9
treelet = [] treelet = []
for pattern in patterns['9']: for pattern in patterns['9']:
@@ -469,7 +456,7 @@ class Treelet(GraphKernel):
tuple(G[pattern[5]][pattern[3]][el] for el in self._edge_labels)] tuple(G[pattern[5]][pattern[3]][el] for el in self._edge_labels)]
prekey2 = [tuple(G.nodes[pattern[2]][nl] for nl in self._node_labels), prekey2 = [tuple(G.nodes[pattern[2]][nl] for nl in self._node_labels),
tuple(G[pattern[2]][pattern[0]][el] for el in self._edge_labels)] tuple(G[pattern[2]][pattern[0]][el] for el in self._edge_labels)]
prekey3 = [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels),
prekey3 = [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels),
tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)] tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)]
if prekey2 + canonkey2 < prekey3 + canonkey3: if prekey2 + canonkey2 < prekey3 + canonkey3:
canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self._node_labels)] \ canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self._node_labels)] \
@@ -480,21 +467,21 @@ class Treelet(GraphKernel):
+ [tuple(G[pattern[1]][pattern[0]][el] for el in self._edge_labels)] \ + [tuple(G[pattern[1]][pattern[0]][el] for el in self._edge_labels)] \
+ prekey3 + prekey2 + canonkey3 + canonkey2 + prekey3 + prekey2 + canonkey3 + canonkey2
treelet.append(tuple(['9'] treelet.append(tuple(['9']
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)]
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)]
+ canonkey_t)) + canonkey_t))
canonkey_l.update(Counter(treelet)) canonkey_l.update(Counter(treelet))
return canonkey_l return canonkey_l
return canonkey return canonkey
def _wrapper_get_canonkeys(self, itr_item): def _wrapper_get_canonkeys(self, itr_item):
g = itr_item[0] g = itr_item[0]
i = itr_item[1] i = itr_item[1]
return i, self._get_canonkeys(g) return i, self._get_canonkeys(g)
def _add_dummy_labels(self, Gn): def _add_dummy_labels(self, Gn):
if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY): if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY):
for i in range(len(Gn)): for i in range(len(Gn)):


+ 8
- 1
gklearn/tests/test_graph_kernels.py View File

@@ -555,5 +555,12 @@ if __name__ == "__main__":
# test_RandomWalk('Acyclic', 'conjugate', None, 'imap_unordered') # test_RandomWalk('Acyclic', 'conjugate', None, 'imap_unordered')
# test_RandomWalk('Acyclic', 'fp', None, None) # test_RandomWalk('Acyclic', 'fp', None, None)
# test_RandomWalk('Acyclic', 'spectral', 'exp', 'imap_unordered') # test_RandomWalk('Acyclic', 'spectral', 'exp', 'imap_unordered')
# test_CommonWalk('AIDS', 0.01, 'geo')
# test_CommonWalk('Acyclic', 0.01, 'geo')
# test_Marginalized('Acyclic', False)
# test_ShortestPath('Acyclic') # test_ShortestPath('Acyclic')
# test_PathUpToH('Acyclic', 'MinMax')
# test_Treelet('Acyclic')
# test_SylvesterEquation('Acyclic')
# test_ConjugateGradient('Acyclic')
# test_FixedPoint('Acyclic')
# test_SpectralDecomposition('Acyclic', 'exp')

+ 1
- 0
gklearn/utils/__init__.py View File

@@ -25,3 +25,4 @@ from gklearn.utils.utils import normalize_gram_matrix, compute_distance_matrix
from gklearn.utils.trie import Trie from gklearn.utils.trie import Trie
from gklearn.utils.knn import knn_cv, knn_classification from gklearn.utils.knn import knn_cv, knn_classification
from gklearn.utils.model_selection_precomputed import model_selection_for_precomputed_kernel from gklearn.utils.model_selection_precomputed import model_selection_for_precomputed_kernel
from gklearn.utils.iters import get_iters

+ 55
- 0
gklearn/utils/iters.py View File

@@ -0,0 +1,55 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 24 10:35:26 2020

@author: ljia
"""

from tqdm import tqdm
import math


def get_iters(iterable, desc=None, file=None, length=None, verbose=True, **kwargs):
    if verbose:
        if 'miniters' not in kwargs:
            if length is None:
                try:
                    # update the display at most ~100 times over the whole run
                    kwargs['miniters'] = math.ceil(len(iterable) / 100)
                except TypeError:
                    # iterable has no len() (e.g. a generator): fall back to
                    # a fixed update interval of 100 iterations
                    kwargs['miniters'] = 100
            else:
                kwargs['miniters'] = math.ceil(length / 100)
        if 'maxinterval' not in kwargs:
            kwargs['maxinterval'] = 600
        return tqdm(iterable, desc=desc, file=file, **kwargs)
    else:
        return iterable
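A usage sketch of the helper above: it replaces the old `iterator = tqdm(...) if verbose >= 2 else ...` guards throughout the kernels, capping output at roughly 100 progress updates per loop (`miniters = ceil(length / 100)`) and tolerating up to 600 seconds between refreshes (`maxinterval`), which is what keeps log files short when stdout is redirected.

    import sys

    # ~10000 iterations, but at most ~100 progress lines written to stdout
    for _ in get_iters(range(10000), desc='Computing kernels', file=sys.stdout,
                       verbose=True):
        pass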



# class mytqdm(tqdm):


# def __init__(iterable=None, desc=None, total=None, leave=True,
# file=None, ncols=None, mininterval=0.1, maxinterval=10.0,
# miniters=None, ascii=None, disable=False, unit='it',
# unit_scale=False, dynamic_ncols=False, smoothing=0.3,
# bar_format=None, initial=0, position=None, postfix=None,
# unit_divisor=1000, write_bytes=None, lock_args=None,
# nrows=None,
# gui=False, **kwargs):
# if iterable is not None:
# miniters=math.ceil(len(iterable) / 100)
# maxinterval=600
# super().__init__(iterable=iterable, desc=desc, total=total, leave=leave,
# file=file, ncols=ncols, mininterval=mininterval, maxinterval=maxinterval,
# miniters=miniters, ascii=ascii, disable=disable, unit=unit,
# unit_scale=unit_scale, dynamic_ncols=dynamic_ncols, smoothing=smoothing,
# bar_format=bar_format, initial=initial, position=position, postfix=postfix,
# unit_divisor=unit_divisor, write_bytes=write_bytes, lock_args=lock_args,
# nrows=nrows,
# gui=gui, **kwargs)

# tqdm = mytqdm
