Add parallelization to WL subtree kernel.

Branch: v0.2.x
Author: jajupmochi, 4 years ago
Commit: e84bc76866

2 changed files with 292 additions and 171 deletions:
  1. gklearn/kernels/weisfeiler_lehman.py  (+291, -171)
  2. gklearn/tests/test_graph_kernels.py   (+1, -0)
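
With this change, `parallel='imap_unordered'` computes the WL subtree Gram matrix fully in parallel (one graph pair per task, via `parallel_gm`), while the other base kernels fall back to the series code path. A minimal sketch of driving the kernel through the public API, following the conventions of gklearn's test suite (the dataset choice and compute arguments are illustrative, not part of this commit):

    import multiprocessing
    from gklearn.utils import Dataset
    from gklearn.kernels import WLSubtree

    dataset = Dataset()
    dataset.load_predefined_dataset('MUTAG')  # illustrative dataset choice

    kernel = WLSubtree(node_labels=dataset.node_labels,
                       edge_labels=dataset.edge_labels,
                       ds_infos=dataset.get_dataset_infos(keys=['directed']),
                       height=2)

    # 'imap_unordered' now parallelizes the whole subtree computation
    # instead of only the per-iteration Gram update.
    gram_matrix, run_time = kernel.compute(dataset.graphs,
                                           parallel='imap_unordered',
                                           n_jobs=multiprocessing.cpu_count(),
                                           verbose=2)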

gklearn/kernels/weisfeiler_lehman.py  (+291, -171)

@@ -7,22 +7,22 @@ Created on Tue Apr 14 15:16:34 2020
 @references:

 [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM.
 Weisfeiler-lehman graph kernels. Journal of Machine Learning Research.
 2011;12(Sep):2539-61.
 """

 import numpy as np
 import networkx as nx
 from collections import Counter
-from functools import partial
+# from functools import partial
 from gklearn.utils import SpecialLabel
-from gklearn.utils.parallel import parallel_gm
+from gklearn.utils.parallel import parallel_gm, parallel_me
 from gklearn.kernels import GraphKernel


-class WeisfeilerLehman(GraphKernel): # @todo: total parallelization and sp, edge user kernel.
+class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.

     def __init__(self, **kwargs):
         GraphKernel.__init__(self)
         self._node_labels = kwargs.get('node_labels', [])
@@ -33,115 +33,235 @@ class WeisfeilerLehman(GraphKernel): # @todo: total parallelization and sp, edge
     def _compute_gm_series(self):
-        if self._verbose >= 2:
-            import warnings
-            warnings.warn('A part of the computation is parallelized.')
+        # if self._verbose >= 2:
+        #     import warnings
+        #     warnings.warn('A part of the computation is parallelized.')
         self._add_dummy_node_labels(self._graphs)

         # for WL subtree kernel
         if self._base_kernel == 'subtree':
             gram_matrix = self._subtree_kernel_do(self._graphs)
         # for WL shortest path kernel
         elif self._base_kernel == 'sp':
             gram_matrix = self._sp_kernel_do(self._graphs)
         # for WL edge kernel
         elif self._base_kernel == 'edge':
             gram_matrix = self._edge_kernel_do(self._graphs)
         # for user defined base kernel
         else:
             gram_matrix = self._user_kernel_do(self._graphs)

         return gram_matrix


     def _compute_gm_imap_unordered(self):
-        if self._verbose >= 2:
-            import warnings
-            warnings.warn('Only a part of the computation is parallelized due to the structure of this kernel.')
-        return self._compute_gm_series()
+        self._add_dummy_node_labels(self._graphs)
+
+        if self._base_kernel == 'subtree':
+            gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
+
+            def init_worker(gn_toshare):
+                global G_gn
+                G_gn = gn_toshare
+
+            do_fun = self._wrapper_pairwise
+            parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
+                        glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)
+            return gram_matrix
+        else:
+            if self._verbose >= 2:
+                import warnings
+                warnings.warn('This base kernel is not parallelized. The serial computation is used instead.')
+            return self._compute_gm_series()


     def _compute_kernel_list_series(self, g1, g_list): # @todo: this should be better.
-        if self._verbose >= 2:
-            import warnings
-            warnings.warn('A part of the computation is parallelized.')
+        # if self._verbose >= 2:
+        #     import warnings
+        #     warnings.warn('A part of the computation is parallelized.')
         self._add_dummy_node_labels(g_list + [g1])

         # for WL subtree kernel
         if self._base_kernel == 'subtree':
             gram_matrix = self._subtree_kernel_do(g_list + [g1])
         # for WL shortest path kernel
         elif self._base_kernel == 'sp':
             gram_matrix = self._sp_kernel_do(g_list + [g1])
         # for WL edge kernel
         elif self._base_kernel == 'edge':
             gram_matrix = self._edge_kernel_do(g_list + [g1])
         # for user defined base kernel
         else:
             gram_matrix = self._user_kernel_do(g_list + [g1])

         return list(gram_matrix[-1][0:-1])


     def _compute_kernel_list_imap_unordered(self, g1, g_list):
-        if self._verbose >= 2:
-            import warnings
-            warnings.warn('Only a part of the computation is parallelized due to the structure of this kernel.')
-        return self._compute_kernel_list_series(g1, g_list)
+        self._add_dummy_node_labels(g_list + [g1])
+
+        if self._base_kernel == 'subtree':
+            kernel_list = [None] * len(g_list)
+
+            def init_worker(g1_toshare, g_list_toshare):
+                global G_g1, G_g_list
+                G_g1 = g1_toshare
+                G_g_list = g_list_toshare
+
+            do_fun = self._wrapper_kernel_list_do
+
+            def func_assign(result, var_to_assign):
+                var_to_assign[result[0]] = result[1]
+
+            itr = range(len(g_list))
+            len_itr = len(g_list)
+            parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
+                        init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered',
+                        n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)
+            return kernel_list
+        else:
+            if self._verbose >= 2:
+                import warnings
+                warnings.warn('This base kernel is not parallelized. The serial computation is used instead.')
+            return self._compute_kernel_list_series(g1, g_list)


     def _wrapper_kernel_list_do(self, itr):
-        pass
+        return itr, self.pairwise_kernel(G_g1, G_g_list[itr])


     def _compute_single_kernel_series(self, g1, g2): # @todo: this should be better.
         self._add_dummy_node_labels([g1] + [g2])

         # for WL subtree kernel
         if self._base_kernel == 'subtree':
             gram_matrix = self._subtree_kernel_do([g1] + [g2])
         # for WL shortest path kernel
         elif self._base_kernel == 'sp':
             gram_matrix = self._sp_kernel_do([g1] + [g2])
         # for WL edge kernel
         elif self._base_kernel == 'edge':
             gram_matrix = self._edge_kernel_do([g1] + [g2])
         # for user defined base kernel
         else:
             gram_matrix = self._user_kernel_do([g1] + [g2])

         return gram_matrix[0][1]


+    def pairwise_kernel(self, g1, g2):
+        Gn = [g1, g2]
+        kernel = 0
+
+        # initial for height = 0
+        all_num_of_each_label = []  # number of occurrences of each label in each graph in this iteration
+
+        # for each graph
+        for G in Gn:
+            # set all labels into a tuple.
+            for nd, attrs in G.nodes(data=True):  # @todo: there may be a better way.
+                G.nodes[nd]['label_tuple'] = tuple(attrs[name] for name in self._node_labels)
+            # get the set of original labels
+            labels_ori = list(nx.get_node_attributes(G, 'label_tuple').values())
+            # number of occurrences of each label in G
+            all_num_of_each_label.append(dict(Counter(labels_ori)))
+
+        # Compute subtree kernel with the 0th iteration and add it to the final kernel.
+        kernel = self._compute_kernel_itr(kernel, all_num_of_each_label)
+
+        # iterate each height
+        for h in range(1, self._height + 1):
+            all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
+            num_of_labels_occured = 0  # number of the set of letters that occur before as node labels at least once in all graphs
+            # all_labels_ori = set()  # all unique original labels in all graphs in this iteration
+            all_num_of_each_label = []  # number of occurrences of each label in G
+
+            # @todo: parallelize this part.
+            for idx, G in enumerate(Gn):
+
+                all_multisets = []
+                for node, attrs in G.nodes(data=True):
+                    # Multiset-label determination.
+                    multiset = [G.nodes[neighbors]['label_tuple'] for neighbors in G[node]]
+                    # sorting each multiset
+                    multiset.sort()
+                    multiset = [attrs['label_tuple']] + multiset  # add the prefix
+                    all_multisets.append(tuple(multiset))
+
+                # label compression
+                set_unique = list(set(all_multisets))  # set of unique multiset labels
+                # a dictionary mapping original labels to new ones.
+                set_compressed = {}
+                # if a label occurred before, assign its former compressed label;
+                # else assign the number of labels occurred + 1 as the compressed label.
+                for value in set_unique:
+                    if value in all_set_compressed.keys():
+                        set_compressed.update({value: all_set_compressed[value]})
+                    else:
+                        set_compressed.update({value: str(num_of_labels_occured + 1)})
+                        num_of_labels_occured += 1
+
+                all_set_compressed.update(set_compressed)
+
+                # relabel nodes
+                for idx, node in enumerate(G.nodes()):
+                    G.nodes[node]['label_tuple'] = set_compressed[all_multisets[idx]]
+
+                # get the set of compressed labels
+                labels_comp = list(nx.get_node_attributes(G, 'label_tuple').values())
+                # all_labels_ori.update(labels_comp)
+                all_num_of_each_label.append(dict(Counter(labels_comp)))
+
+            # Compute subtree kernel with h iterations and add it to the final kernel.
+            kernel = self._compute_kernel_itr(kernel, all_num_of_each_label)
+
+        return kernel
+
+
+    def _wrapper_pairwise(self, itr):
+        i = itr[0]
+        j = itr[1]
+        return i, j, self.pairwise_kernel(G_gn[i], G_gn[j])
+
+
+    def _compute_kernel_itr(self, kernel, all_num_of_each_label):
+        labels = set(list(all_num_of_each_label[0].keys()) +
+                     list(all_num_of_each_label[1].keys()))
+        vector1 = np.array([(all_num_of_each_label[0][label]
+                             if (label in all_num_of_each_label[0].keys()) else 0)
+                            for label in labels])
+        vector2 = np.array([(all_num_of_each_label[1][label]
+                             if (label in all_num_of_each_label[1].keys()) else 0)
+                            for label in labels])
+        kernel += np.dot(vector1, vector2)
+        return kernel
+
+
     def _subtree_kernel_do(self, Gn):
         """Compute Weisfeiler-Lehman kernels between graphs.

         Parameters
         ----------
         Gn : List of NetworkX graph
             List of graphs between which the kernels are computed.

         Return
         ------
         gram_matrix : Numpy matrix
             Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
         """
         gram_matrix = np.zeros((len(Gn), len(Gn)))

         # initial for height = 0
         all_num_of_each_label = []  # number of occurrences of each label in each graph in this iteration

         # for each graph
         for G in Gn:
             # set all labels into a tuple.
@@ -151,112 +271,112 @@ class WeisfeilerLehman(GraphKernel): # @todo: total parallelization and sp, edge
             labels_ori = list(nx.get_node_attributes(G, 'label_tuple').values())
             # number of occurrences of each label in G
             all_num_of_each_label.append(dict(Counter(labels_ori)))

         # Compute subtree kernel with the 0th iteration and add it to the final kernel.
-        self._compute_gram_itr(gram_matrix, all_num_of_each_label, Gn)
+        self._compute_gram_itr(gram_matrix, all_num_of_each_label)

         # iterate each height
         for h in range(1, self._height + 1):
             all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
             num_of_labels_occured = 0  # number of the set of letters that occur before as node labels at least once in all graphs
             # all_labels_ori = set()  # all unique original labels in all graphs in this iteration
             all_num_of_each_label = []  # number of occurrences of each label in G

             # @todo: parallelize this part.
             for idx, G in enumerate(Gn):

                 all_multisets = []
                 for node, attrs in G.nodes(data=True):
                     # Multiset-label determination.
                     multiset = [G.nodes[neighbors]['label_tuple'] for neighbors in G[node]]
                     # sorting each multiset
                     multiset.sort()
                     multiset = [attrs['label_tuple']] + multiset  # add the prefix
                     all_multisets.append(tuple(multiset))

                 # label compression
                 set_unique = list(set(all_multisets))  # set of unique multiset labels
                 # a dictionary mapping original labels to new ones.
                 set_compressed = {}
                 # if a label occurred before, assign its former compressed label;
                 # else assign the number of labels occurred + 1 as the compressed label.
                 for value in set_unique:
                     if value in all_set_compressed.keys():
                         set_compressed.update({value: all_set_compressed[value]})
                     else:
                         set_compressed.update({value: str(num_of_labels_occured + 1)})
                         num_of_labels_occured += 1

                 all_set_compressed.update(set_compressed)

                 # relabel nodes
                 for idx, node in enumerate(G.nodes()):
                     G.nodes[node]['label_tuple'] = set_compressed[all_multisets[idx]]

                 # get the set of compressed labels
                 labels_comp = list(nx.get_node_attributes(G, 'label_tuple').values())
                 # all_labels_ori.update(labels_comp)
                 all_num_of_each_label.append(dict(Counter(labels_comp)))

             # Compute subtree kernel with h iterations and add it to the final kernel.
-            self._compute_gram_itr(gram_matrix, all_num_of_each_label, Gn)
+            self._compute_gram_itr(gram_matrix, all_num_of_each_label)

         return gram_matrix


-    def _compute_gram_itr(self, gram_matrix, all_num_of_each_label, Gn):
+    def _compute_gram_itr(self, gram_matrix, all_num_of_each_label):
         """Compute Gram matrix using the base kernel.
         """
-        if self._parallel == 'imap_unordered':
-            # compute kernels.
-            def init_worker(alllabels_toshare):
-                global G_alllabels
-                G_alllabels = alllabels_toshare
-            do_partial = partial(self._wrapper_compute_subtree_kernel, gram_matrix)
-            parallel_gm(do_partial, gram_matrix, Gn, init_worker=init_worker,
-                        glbv=(all_num_of_each_label,), n_jobs=self._n_jobs, verbose=self._verbose)
-        elif self._parallel is None:
-            for i in range(len(gram_matrix)):
-                for j in range(i, len(gram_matrix)):
-                    gram_matrix[i][j] = self._compute_subtree_kernel(all_num_of_each_label[i],
-                                                                     all_num_of_each_label[j], gram_matrix[i][j])
-                    gram_matrix[j][i] = gram_matrix[i][j]
+        # if self._parallel == 'imap_unordered':
+        #     # compute kernels.
+        #     def init_worker(alllabels_toshare):
+        #         global G_alllabels
+        #         G_alllabels = alllabels_toshare
+        #     do_partial = partial(self._wrapper_compute_subtree_kernel, gram_matrix)
+        #     parallel_gm(do_partial, gram_matrix, Gn, init_worker=init_worker,
+        #                 glbv=(all_num_of_each_label,), n_jobs=self._n_jobs, verbose=self._verbose)
+        # elif self._parallel is None:
+        for i in range(len(gram_matrix)):
+            for j in range(i, len(gram_matrix)):
+                gram_matrix[i][j] = self._compute_subtree_kernel(all_num_of_each_label[i],
+                                                                 all_num_of_each_label[j], gram_matrix[i][j])
+                gram_matrix[j][i] = gram_matrix[i][j]


     def _compute_subtree_kernel(self, num_of_each_label1, num_of_each_label2, kernel):
         """Compute the subtree kernel.
         """
         labels = set(list(num_of_each_label1.keys()) + list(num_of_each_label2.keys()))
         vector1 = np.array([(num_of_each_label1[label]
                              if (label in num_of_each_label1.keys()) else 0)
                             for label in labels])
         vector2 = np.array([(num_of_each_label2[label]
                              if (label in num_of_each_label2.keys()) else 0)
                             for label in labels])
         kernel += np.dot(vector1, vector2)
         return kernel


-    def _wrapper_compute_subtree_kernel(self, gram_matrix, itr):
-        i = itr[0]
-        j = itr[1]
-        return i, j, self._compute_subtree_kernel(G_alllabels[i], G_alllabels[j], gram_matrix[i][j])
+    # def _wrapper_compute_subtree_kernel(self, gram_matrix, itr):
+    #     i = itr[0]
+    #     j = itr[1]
+    #     return i, j, self._compute_subtree_kernel(G_alllabels[i], G_alllabels[j], gram_matrix[i][j])


     def _wl_spkernel_do(Gn, node_label, edge_label, height):
         """Compute Weisfeiler-Lehman shortest path kernels between graphs.

         Parameters
         ----------
         Gn : List of NetworkX graph
             List of graphs between which the kernels are computed.
         node_label : string
             node attribute used as label.
         edge_label : string
             edge attribute used as label.
         height : int
             subtree height.

         Return
         ------
         gram_matrix : Numpy matrix
@@ -264,22 +384,22 @@ class WeisfeilerLehman(GraphKernel): # @todo: total parallelization and sp, edge
         """
         pass
         from gklearn.utils.utils import getSPGraph

         # init.
         height = int(height)
         gram_matrix = np.zeros((len(Gn), len(Gn)))  # init kernel
         Gn = [getSPGraph(G, edge_weight=edge_label) for G in Gn]  # get shortest path graphs of Gn

         # initial for height = 0
         for i in range(0, len(Gn)):
             for j in range(i, len(Gn)):
                 for e1 in Gn[i].edges(data=True):
                     for e2 in Gn[j].edges(data=True):
                         if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
                             gram_matrix[i][j] += 1
                 gram_matrix[j][i] = gram_matrix[i][j]

         # iterate each height
         for h in range(1, height + 1):
             all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
@@ -291,193 +411,193 @@ class WeisfeilerLehman(GraphKernel): # @todo: total parallelization and sp, edge
                     multiset = [G.node[neighbors][node_label] for neighbors in G[node[0]]]
                     # sorting each multiset
                     multiset.sort()
                     multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
                     set_multisets.append(multiset)

                 # label compression
                 set_unique = list(set(set_multisets))  # set of unique multiset labels
                 # a dictionary mapping original labels to new ones.
                 set_compressed = {}
                 # if a label occurred before, assign its former compressed label, else assign the number of labels occurred + 1 as the compressed label
                 for value in set_unique:
                     if value in all_set_compressed.keys():
                         set_compressed.update({value: all_set_compressed[value]})
                     else:
                         set_compressed.update({value: str(num_of_labels_occured + 1)})
                         num_of_labels_occured += 1

                 all_set_compressed.update(set_compressed)

                 # relabel nodes
                 for node in G.nodes(data=True):
                     node[1][node_label] = set_compressed[set_multisets[node[0]]]

             # Compute subtree kernel with h iterations and add it to the final kernel
             for i in range(0, len(Gn)):
                 for j in range(i, len(Gn)):
                     for e1 in Gn[i].edges(data=True):
                         for e2 in Gn[j].edges(data=True):
                             if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
                                 gram_matrix[i][j] += 1
                     gram_matrix[j][i] = gram_matrix[i][j]

         return gram_matrix


     def _wl_edgekernel_do(Gn, node_label, edge_label, height):
         """Compute Weisfeiler-Lehman edge kernels between graphs.

         Parameters
         ----------
         Gn : List of NetworkX graph
             List of graphs between which the kernels are computed.
         node_label : string
             node attribute used as label.
         edge_label : string
             edge attribute used as label.
         height : int
             subtree height.

         Return
         ------
         gram_matrix : Numpy matrix
             Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
         """
         pass

         # init.
         height = int(height)
         gram_matrix = np.zeros((len(Gn), len(Gn)))  # init kernel

         # initial for height = 0
         for i in range(0, len(Gn)):
             for j in range(i, len(Gn)):
                 for e1 in Gn[i].edges(data=True):
                     for e2 in Gn[j].edges(data=True):
                         if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
                             gram_matrix[i][j] += 1
                 gram_matrix[j][i] = gram_matrix[i][j]

         # iterate each height
         for h in range(1, height + 1):
             all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
             num_of_labels_occured = 0  # number of the set of letters that occur before as node labels at least once in all graphs
             for G in Gn:  # for each graph
                 set_multisets = []
                 for node in G.nodes(data=True):
                     # Multiset-label determination.
                     multiset = [G.node[neighbors][node_label] for neighbors in G[node[0]]]
                     # sorting each multiset
                     multiset.sort()
                     multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
                     set_multisets.append(multiset)

                 # label compression
                 set_unique = list(set(set_multisets))  # set of unique multiset labels
                 # a dictionary mapping original labels to new ones.
                 set_compressed = {}
                 # if a label occurred before, assign its former compressed label, else assign the number of labels occurred + 1 as the compressed label
                 for value in set_unique:
                     if value in all_set_compressed.keys():
                         set_compressed.update({value: all_set_compressed[value]})
                     else:
                         set_compressed.update({value: str(num_of_labels_occured + 1)})
                         num_of_labels_occured += 1

                 all_set_compressed.update(set_compressed)

                 # relabel nodes
                 for node in G.nodes(data=True):
                     node[1][node_label] = set_compressed[set_multisets[node[0]]]

             # Compute subtree kernel with h iterations and add it to the final kernel
             for i in range(0, len(Gn)):
                 for j in range(i, len(Gn)):
                     for e1 in Gn[i].edges(data=True):
                         for e2 in Gn[j].edges(data=True):
                             if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
                                 gram_matrix[i][j] += 1
                     gram_matrix[j][i] = gram_matrix[i][j]

         return gram_matrix


     def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel):
         """Compute Weisfeiler-Lehman kernels based on user-defined kernel between graphs.

         Parameters
         ----------
         Gn : List of NetworkX graph
             List of graphs between which the kernels are computed.
         node_label : string
             node attribute used as label.
         edge_label : string
             edge attribute used as label.
         height : int
             subtree height.
         base_kernel : string
             Name of the base kernel function used in each iteration of WL kernel. This function returns a Numpy matrix, each element of which is the user-defined Weisfeiler-Lehman kernel between 2 graphs.

         Return
         ------
         gram_matrix : Numpy matrix
             Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
         """
         pass

         # init.
         height = int(height)
         gram_matrix = np.zeros((len(Gn), len(Gn)))  # init kernel

         # initial for height = 0
         gram_matrix = base_kernel(Gn, node_label, edge_label)

         # iterate each height
         for h in range(1, height + 1):
             all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
             num_of_labels_occured = 0  # number of the set of letters that occur before as node labels at least once in all graphs
             for G in Gn:  # for each graph
                 set_multisets = []
                 for node in G.nodes(data=True):
                     # Multiset-label determination.
                     multiset = [G.node[neighbors][node_label] for neighbors in G[node[0]]]
                     # sorting each multiset
                     multiset.sort()
                     multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
                     set_multisets.append(multiset)

                 # label compression
                 set_unique = list(set(set_multisets))  # set of unique multiset labels
                 # a dictionary mapping original labels to new ones.
                 set_compressed = {}
                 # if a label occurred before, assign its former compressed label, else assign the number of labels occurred + 1 as the compressed label
                 for value in set_unique:
                     if value in all_set_compressed.keys():
                         set_compressed.update({value: all_set_compressed[value]})
                     else:
                         set_compressed.update({value: str(num_of_labels_occured + 1)})
                         num_of_labels_occured += 1

                 all_set_compressed.update(set_compressed)

                 # relabel nodes
                 for node in G.nodes(data=True):
                     node[1][node_label] = set_compressed[set_multisets[node[0]]]

             # Compute kernel with h iterations and add it to the final kernel
             gram_matrix += base_kernel(Gn, node_label, edge_label)

         return gram_matrix


     def _add_dummy_node_labels(self, Gn):
         if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY):
             for i in range(len(Gn)):
                 nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY)
             self._node_labels = [SpecialLabel.DUMMY]


 class WLSubtree(WeisfeilerLehman):

     def __init__(self, **kwargs):
         kwargs['base_kernel'] = 'subtree'
         super().__init__(**kwargs)
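
The core of this change is the new `pairwise_kernel` method: each (i, j) pair of graphs is relabeled and compared independently, so WL label compression only needs to be consistent within the pair, and `parallel_gm` can farm the pairs out to worker processes. The sketch below is a self-contained toy reimplementation of the same relabel-compress-count scheme, written for this note (it is not the class method; integer compressed labels stand in for the string labels used above):

    import networkx as nx
    from collections import Counter

    def wl_subtree_pairwise(g1, g2, height=2, label='atom'):
        """Toy WL subtree kernel between two node-labeled graphs."""
        Gn = [g1.copy(), g2.copy()]  # relabeling mutates, so work on copies

        def count_dot(Gn):
            # Dot product of the two label-count vectors (one iteration's term).
            counts = [Counter(dict(G.nodes(data=label)).values()) for G in Gn]
            return sum(c * counts[1][lbl] for lbl, c in counts[0].items())

        kernel = count_dot(Gn)  # height 0: original labels

        for _ in range(height):
            compressed = {}  # shared across the pair, like all_set_compressed above
            for G in Gn:
                new_labels = {}
                for v in G.nodes():
                    # Multiset label: own label plus sorted neighbor labels.
                    multiset = (G.nodes[v][label],
                                tuple(sorted(G.nodes[u][label] for u in G[v])))
                    new_labels[v] = compressed.setdefault(multiset, len(compressed))
                nx.set_node_attributes(G, new_labels, label)
            kernel += count_dot(Gn)

        return kernel

    # Hypothetical toy graphs: a C-O-C path and a C-C-O triangle.
    g1 = nx.Graph([(0, 1), (1, 2)])
    nx.set_node_attributes(g1, {0: 'C', 1: 'O', 2: 'C'}, 'atom')
    g2 = nx.Graph([(0, 1), (1, 2), (2, 0)])
    nx.set_node_attributes(g2, {0: 'C', 1: 'C', 2: 'O'}, 'atom')
    print(wl_subtree_pairwise(g1, g2))  # 5 (height 0) + 1 + 0 = 6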

gklearn/tests/test_graph_kernels.py  (+1, -0)

@@ -544,3 +544,4 @@ if __name__ == "__main__":
     # test_RandomWalk('Acyclic', 'fp', None, None)
     # test_RandomWalk('Acyclic', 'spectral', 'exp', 'imap_unordered')
     # test_CommonWalk('Alkane', 0.01, 'geo')
+    # test_ShortestPath('Acyclic')
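
A quick sanity check for the new code path is to verify that the parallel and series implementations produce the same Gram matrix; a sketch in the style of the test suite (the dataset choice, `cut_graphs` slice, and sizes are illustrative assumptions):

    import multiprocessing
    import numpy as np
    from gklearn.utils import Dataset
    from gklearn.kernels import WLSubtree

    dataset = Dataset()
    dataset.load_predefined_dataset('MUTAG')  # illustrative dataset
    dataset.cut_graphs(range(0, 20))  # a small slice keeps the check fast

    def wl_gram(parallel):
        kernel = WLSubtree(node_labels=dataset.node_labels,
                           edge_labels=dataset.edge_labels,
                           ds_infos=dataset.get_dataset_infos(keys=['directed']),
                           height=2)
        gram, _ = kernel.compute(dataset.graphs, parallel=parallel,
                                 n_jobs=multiprocessing.cpu_count(), verbose=0)
        return gram

    # The series (parallel=None) and imap_unordered paths should agree.
    assert np.allclose(wl_gram(None), wl_gram('imap_unordered'))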
