From a69f3c710e9c28bcd2e596dd7f731ba7483f2760 Mon Sep 17 00:00:00 2001
From: jajupmochi
Date: Tue, 14 Apr 2020 16:52:35 +0200
Subject: [PATCH] Add class WeisfeilerLehman.

---
 gklearn/kernels/weisfeiler_lehman.py | 465 +++++++++++++++++++++++++++++++++++
 1 file changed, 465 insertions(+)
 create mode 100644 gklearn/kernels/weisfeiler_lehman.py

diff --git a/gklearn/kernels/weisfeiler_lehman.py b/gklearn/kernels/weisfeiler_lehman.py
new file mode 100644
index 0000000..4ecb13f
--- /dev/null
+++ b/gklearn/kernels/weisfeiler_lehman.py
@@ -0,0 +1,465 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Apr 14 15:16:34 2020
+
+@author: ljia
+
+@references:
+
+    [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM.
+    Weisfeiler-Lehman graph kernels. Journal of Machine Learning Research.
+    2011;12(Sep):2539-61.
+"""
+
+import warnings
+import numpy as np
+import networkx as nx
+from collections import Counter
+from functools import partial
+from gklearn.utils.parallel import parallel_gm
+from gklearn.kernels import GraphKernel
+
+
+class WeisfeilerLehman(GraphKernel):  # @todo: parallelize fully; finish the sp, edge and user-defined base kernels.
+
+    def __init__(self, **kwargs):
+        GraphKernel.__init__(self)
+        self.__node_labels = kwargs.get('node_labels', [])
+        self.__edge_labels = kwargs.get('edge_labels', [])
+        self.__height = int(kwargs.get('height', 0))
+        self.__base_kernel = kwargs.get('base_kernel', 'subtree')
+        self.__ds_infos = kwargs.get('ds_infos', {})
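+
+    # An illustrative construction (the attribute names are examples, not
+    # defaults; 'node_labels' and 'edge_labels' must name attributes that
+    # actually exist on the input graphs, e.g. 'atom' for molecule datasets):
+    #     wl = WeisfeilerLehman(node_labels=['atom'], edge_labels=[],
+    #                           height=2, base_kernel='subtree',
+    #                           ds_infos={'directed': False})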
+
+
+    def _compute_gm_series(self):
+        self.__add_dummy_node_labels(self._graphs)
+
+        # for WL subtree kernel
+        if self.__base_kernel == 'subtree':
+            gram_matrix = self.__subtree_kernel_do(self._graphs)
+
+        # for WL shortest path kernel
+        elif self.__base_kernel == 'sp':
+            gram_matrix = self.__sp_kernel_do(self._graphs)
+
+        # for WL edge kernel
+        elif self.__base_kernel == 'edge':
+            gram_matrix = self.__edge_kernel_do(self._graphs)
+
+        # for user defined base kernel
+        else:
+            gram_matrix = self.__user_kernel_do(self._graphs)
+
+        return gram_matrix
+
+
+    def _compute_gm_imap_unordered(self):
+        if self._verbose >= 2:
+            warnings.warn('Only a part of the computation is parallelized due to the structure of this kernel.')
+        return self._compute_gm_series()
+
+
+    def _compute_kernel_list_series(self, g1, g_list):  # @todo: this should be better.
+        self.__add_dummy_node_labels(g_list + [g1])
+
+        # for WL subtree kernel
+        if self.__base_kernel == 'subtree':
+            gram_matrix = self.__subtree_kernel_do(g_list + [g1])
+
+        # for WL shortest path kernel
+        elif self.__base_kernel == 'sp':
+            gram_matrix = self.__sp_kernel_do(g_list + [g1])
+
+        # for WL edge kernel
+        elif self.__base_kernel == 'edge':
+            gram_matrix = self.__edge_kernel_do(g_list + [g1])
+
+        # for user defined base kernel
+        else:
+            gram_matrix = self.__user_kernel_do(g_list + [g1])
+
+        # g1 was appended last, so its kernels with the graphs in g_list form the last row.
+        return list(gram_matrix[-1][0:-1])
+
+
+    def _compute_kernel_list_imap_unordered(self, g1, g_list):
+        if self._verbose >= 2:
+            warnings.warn('Only a part of the computation is parallelized due to the structure of this kernel.')
+        return self._compute_kernel_list_series(g1, g_list)
+
+
+    def _wrapper_kernel_list_do(self, itr):
+        pass
+
+
+    def _compute_single_kernel_series(self, g1, g2):  # @todo: this should be better.
+        self.__add_dummy_node_labels([g1] + [g2])
+
+        # for WL subtree kernel
+        if self.__base_kernel == 'subtree':
+            gram_matrix = self.__subtree_kernel_do([g1] + [g2])
+
+        # for WL shortest path kernel
+        elif self.__base_kernel == 'sp':
+            gram_matrix = self.__sp_kernel_do([g1] + [g2])
+
+        # for WL edge kernel
+        elif self.__base_kernel == 'edge':
+            gram_matrix = self.__edge_kernel_do([g1] + [g2])
+
+        # for user defined base kernel
+        else:
+            gram_matrix = self.__user_kernel_do([g1] + [g2])
+
+        return gram_matrix[0][1]
+
+
+    def __subtree_kernel_do(self, Gn):
+        """Compute Weisfeiler-Lehman subtree kernels between graphs.
+
+        Parameters
+        ----------
+        Gn : list of NetworkX graphs
+            List of graphs between which the kernels are computed.
+
+        Return
+        ------
+        gram_matrix : Numpy matrix
+            Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between two graphs.
+        """
+        gram_matrix = np.zeros((len(Gn), len(Gn)))
+
+        # initialization for height = 0
+        all_num_of_each_label = []  # number of occurrences of each label in each graph in this iteration
+
+        # for each graph
+        for G in Gn:
+            # gather all node-label values into one tuple per node.
+            for nd, attrs in G.nodes(data=True):  # @todo: there may be a better way.
+                G.nodes[nd]['label_tuple'] = tuple(attrs[name] for name in self.__node_labels)
+            # get the set of original labels
+            labels_ori = list(nx.get_node_attributes(G, 'label_tuple').values())
+            # number of occurrences of each label in G
+            all_num_of_each_label.append(dict(Counter(labels_ori)))
+
+        # compute the subtree kernel for the 0th iteration and add it to the final kernel.
+        self.__compute_gram_matrix(gram_matrix, all_num_of_each_label, Gn)
+
+        # iterate over heights
+        for h in range(1, self.__height + 1):
+            all_set_compressed = {}  # maps original labels to compressed ones across all graphs in this iteration
+            num_of_labels_occured = 0  # number of distinct labels that have occurred as node labels in all graphs so far
+            all_num_of_each_label = []  # number of occurrences of each label in G
+
+            # @todo: parallelize this part.
+            for G in Gn:
+
+                all_multisets = []
+                for node, attrs in G.nodes(data=True):
+                    # multiset-label determination.
+                    multiset = [G.nodes[neighbor]['label_tuple'] for neighbor in G[node]]
+                    # sort each multiset
+                    multiset.sort()
+                    multiset = [attrs['label_tuple']] + multiset  # add the node's own label as the prefix
+                    all_multisets.append(tuple(multiset))
+
+                # label compression
+                set_unique = list(set(all_multisets))  # set of unique multiset labels
+                # a dictionary mapping original labels to new ones.
+                set_compressed = {}
+                # if a label occurred before, assign its former compressed label;
+                # else assign (number of labels occurred so far + 1) as the compressed label.
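+                # Illustrative example: if G has nodes labeled a, a, b with
+                # edges a-a and a-b, the multisets are (a, a), (a, a, b) and
+                # (b, a); if none of them occurred before, they are compressed
+                # to '1', '2' and '3' in some order, and num_of_labels_occured
+                # becomes 3.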
+                for value in set_unique:
+                    if value in all_set_compressed.keys():
+                        set_compressed.update({value: all_set_compressed[value]})
+                    else:
+                        set_compressed.update({value: str(num_of_labels_occured + 1)})
+                        num_of_labels_occured += 1
+
+                all_set_compressed.update(set_compressed)
+
+                # relabel nodes
+                for i_node, node in enumerate(G.nodes()):
+                    G.nodes[node]['label_tuple'] = set_compressed[all_multisets[i_node]]
+
+                # get the set of compressed labels
+                labels_comp = list(nx.get_node_attributes(G, 'label_tuple').values())
+                all_num_of_each_label.append(dict(Counter(labels_comp)))
+
+            # compute the subtree kernel for the h-th iteration and add it to the final kernel
+            self.__compute_gram_matrix(gram_matrix, all_num_of_each_label, Gn)
+
+        return gram_matrix
+
+
+    def __compute_gram_matrix(self, gram_matrix, all_num_of_each_label, Gn):
+        """Compute the Gram matrix using the base kernel.
+        """
+        if self._parallel == 'imap_unordered':
+            # compute kernels in parallel.
+            def init_worker(alllabels_toshare):
+                global G_alllabels
+                G_alllabels = alllabels_toshare
+            do_partial = partial(self._wrapper_compute_subtree_kernel, gram_matrix)
+            parallel_gm(do_partial, gram_matrix, Gn, init_worker=init_worker,
+                        glbv=(all_num_of_each_label,), n_jobs=self._n_jobs, verbose=self._verbose)
+        elif self._parallel is None:
+            for i in range(len(gram_matrix)):
+                for j in range(i, len(gram_matrix)):
+                    gram_matrix[i][j] = self.__compute_subtree_kernel(all_num_of_each_label[i],
+                                                                      all_num_of_each_label[j], gram_matrix[i][j])
+                    gram_matrix[j][i] = gram_matrix[i][j]
+
+
+    def __compute_subtree_kernel(self, num_of_each_label1, num_of_each_label2, kernel):
+        """Compute the subtree kernel.
+        """
+        labels = set(list(num_of_each_label1.keys()) + list(num_of_each_label2.keys()))
+        vector1 = np.array([(num_of_each_label1[label]
+                             if (label in num_of_each_label1.keys()) else 0)
+                            for label in labels])
+        vector2 = np.array([(num_of_each_label2[label]
+                             if (label in num_of_each_label2.keys()) else 0)
+                            for label in labels])
+        kernel += np.dot(vector1, vector2)
+        return kernel
+
+
+    def _wrapper_compute_subtree_kernel(self, gram_matrix, itr):
+        i = itr[0]
+        j = itr[1]
+        return i, j, self.__compute_subtree_kernel(G_alllabels[i], G_alllabels[j], gram_matrix[i][j])
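+
+    # Worked example for __compute_subtree_kernel (illustrative): with label
+    # counts {'1': 2, '2': 1} for graph i and {'1': 1, '3': 1} for graph j,
+    # the count vectors over the labels {'1', '2', '3'} are [2, 1, 0] and
+    # [1, 0, 1], so this iteration adds their dot product 2 to gram_matrix[i][j].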
+
+
+    def __sp_kernel_do(self, Gn):  # @todo: legacy implementation, not yet adapted to the new class structure.
+        """Compute Weisfeiler-Lehman shortest path kernels between graphs.
+
+        Parameters
+        ----------
+        Gn : list of NetworkX graphs
+            List of graphs between which the kernels are computed.
+
+        Node and edge labels and the subtree height are taken from the instance attributes.
+
+        Return
+        ------
+        gram_matrix : Numpy matrix
+            Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between two graphs.
+        """
+        from gklearn.utils.utils import getSPGraph
+
+        # init.
+        # this legacy code assumes a single node label and a single edge label.
+        node_label = self.__node_labels[0] if len(self.__node_labels) > 0 else None
+        edge_label = self.__edge_labels[0] if len(self.__edge_labels) > 0 else None
+        height = int(self.__height)
+        gram_matrix = np.zeros((len(Gn), len(Gn)))  # init kernel
+
+        Gn = [getSPGraph(G, edge_weight=edge_label) for G in Gn]  # get shortest path graphs of Gn
+
+        # initialization for height = 0
+        for i in range(0, len(Gn)):
+            for j in range(i, len(Gn)):
+                for e1 in Gn[i].edges(data=True):
+                    for e2 in Gn[j].edges(data=True):
+                        # @todo: this matches edges by endpoint IDs, which assumes a node
+                        # correspondence across graphs; matching endpoint labels may be intended.
+                        if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
+                            gram_matrix[i][j] += 1
+                gram_matrix[j][i] = gram_matrix[i][j]
+
+        # iterate over heights
+        for h in range(1, height + 1):
+            all_set_compressed = {}  # maps original labels to compressed ones across all graphs in this iteration
+            num_of_labels_occured = 0  # number of distinct labels that have occurred as node labels in all graphs so far
+            for G in Gn:  # for each graph
+                set_multisets = []
+                for node in G.nodes(data=True):
+                    # multiset-label determination.
+                    multiset = [G.nodes[neighbor][node_label] for neighbor in G[node[0]]]
+                    # sort each multiset
+                    multiset.sort()
+                    multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
+                    set_multisets.append(multiset)
+
+                # label compression
+                set_unique = list(set(set_multisets))  # set of unique multiset labels
+                # a dictionary mapping original labels to new ones.
+                set_compressed = {}
+                # if a label occurred before, assign its former compressed label; else assign (number of labels occurred so far + 1) as the compressed label
+                for value in set_unique:
+                    if value in all_set_compressed.keys():
+                        set_compressed.update({value: all_set_compressed[value]})
+                    else:
+                        set_compressed.update({value: str(num_of_labels_occured + 1)})
+                        num_of_labels_occured += 1
+
+                all_set_compressed.update(set_compressed)
+
+                # relabel nodes
+                for i_node, node in enumerate(G.nodes(data=True)):
+                    node[1][node_label] = set_compressed[set_multisets[i_node]]
+
+            # compute the kernel for the h-th iteration and add it to the final kernel
+            for i in range(0, len(Gn)):
+                for j in range(i, len(Gn)):
+                    for e1 in Gn[i].edges(data=True):
+                        for e2 in Gn[j].edges(data=True):
+                            if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
+                                gram_matrix[i][j] += 1
+                    gram_matrix[j][i] = gram_matrix[i][j]
+
+        return gram_matrix
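+
+    # For example (illustrative), if Gn[i] above has the shortest-path edge
+    # (0, 2, {'cost': 1.5}) and Gn[j] has (2, 0, {'cost': 1.5}), the pair
+    # matches under the reversed-endpoint test and contributes 1 to
+    # gram_matrix[i][j] in that iteration.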
+
+
+    def __edge_kernel_do(self, Gn):  # @todo: legacy implementation, not yet adapted to the new class structure.
+        """Compute Weisfeiler-Lehman edge kernels between graphs.
+
+        Parameters
+        ----------
+        Gn : list of NetworkX graphs
+            List of graphs between which the kernels are computed.
+
+        Node and edge labels and the subtree height are taken from the instance attributes.
+
+        Return
+        ------
+        gram_matrix : Numpy matrix
+            Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between two graphs.
+        """
+        # init.
+        # this legacy code assumes a single node label and a single edge label.
+        node_label = self.__node_labels[0] if len(self.__node_labels) > 0 else None
+        edge_label = self.__edge_labels[0] if len(self.__edge_labels) > 0 else None
+        height = int(self.__height)
+        gram_matrix = np.zeros((len(Gn), len(Gn)))  # init kernel
+
+        # initialization for height = 0
+        for i in range(0, len(Gn)):
+            for j in range(i, len(Gn)):
+                for e1 in Gn[i].edges(data=True):
+                    for e2 in Gn[j].edges(data=True):
+                        if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
+                            gram_matrix[i][j] += 1
+                gram_matrix[j][i] = gram_matrix[i][j]
+
+        # iterate over heights
+        for h in range(1, height + 1):
+            all_set_compressed = {}  # maps original labels to compressed ones across all graphs in this iteration
+            num_of_labels_occured = 0  # number of distinct labels that have occurred as node labels in all graphs so far
+            for G in Gn:  # for each graph
+                set_multisets = []
+                for node in G.nodes(data=True):
+                    # multiset-label determination.
+                    multiset = [G.nodes[neighbor][node_label] for neighbor in G[node[0]]]
+                    # sort each multiset
+                    multiset.sort()
+                    multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
+                    set_multisets.append(multiset)
+
+                # label compression
+                set_unique = list(set(set_multisets))  # set of unique multiset labels
+                # a dictionary mapping original labels to new ones.
+                set_compressed = {}
+                # if a label occurred before, assign its former compressed label; else assign (number of labels occurred so far + 1) as the compressed label
+                for value in set_unique:
+                    if value in all_set_compressed.keys():
+                        set_compressed.update({value: all_set_compressed[value]})
+                    else:
+                        set_compressed.update({value: str(num_of_labels_occured + 1)})
+                        num_of_labels_occured += 1
+
+                all_set_compressed.update(set_compressed)
+
+                # relabel nodes
+                for i_node, node in enumerate(G.nodes(data=True)):
+                    node[1][node_label] = set_compressed[set_multisets[i_node]]
+
+            # compute the kernel for the h-th iteration and add it to the final kernel
+            for i in range(0, len(Gn)):
+                for j in range(i, len(Gn)):
+                    for e1 in Gn[i].edges(data=True):
+                        for e2 in Gn[j].edges(data=True):
+                            if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
+                                gram_matrix[i][j] += 1
+                    gram_matrix[j][i] = gram_matrix[i][j]
+
+        return gram_matrix
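+
+    # A user-defined base kernel is any callable with the signature
+    # base_kernel(Gn, node_label, edge_label) -> Numpy array of shape
+    # (len(Gn), len(Gn)), as consumed by __user_kernel_do below. A minimal
+    # (hypothetical) example counting pairs of equally labeled nodes:
+    #     def vertex_histogram(Gn, node_label, edge_label):
+    #         counts = [Counter(nx.get_node_attributes(G, node_label).values())
+    #                   for G in Gn]
+    #         gm = np.zeros((len(Gn), len(Gn)))
+    #         for i in range(len(Gn)):
+    #             for j in range(i, len(Gn)):
+    #                 gm[i][j] = gm[j][i] = sum(c * counts[j][l]
+    #                                           for l, c in counts[i].items())
+    #         return gm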
+
+
+    def __user_kernel_do(self, Gn):  # @todo: legacy implementation, not yet adapted to the new class structure.
+        """Compute Weisfeiler-Lehman kernels between graphs with a user-defined base kernel.
+
+        Parameters
+        ----------
+        Gn : list of NetworkX graphs
+            List of graphs between which the kernels are computed.
+
+        Node and edge labels, the subtree height and the base kernel are taken
+        from the instance attributes. The base kernel is a function returning a
+        Numpy matrix, each element of which is the user-defined Weisfeiler-Lehman
+        kernel between two graphs.
+
+        Return
+        ------
+        gram_matrix : Numpy matrix
+            Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between two graphs.
+        """
+        # init.
+        # this legacy code assumes a single node label, a single edge label,
+        # and that self.__base_kernel holds the user-supplied callable.
+        node_label = self.__node_labels[0] if len(self.__node_labels) > 0 else None
+        edge_label = self.__edge_labels[0] if len(self.__edge_labels) > 0 else None
+        height = int(self.__height)
+        base_kernel = self.__base_kernel
+
+        # initialization for height = 0
+        gram_matrix = base_kernel(Gn, node_label, edge_label)
+
+        # iterate over heights
+        for h in range(1, height + 1):
+            all_set_compressed = {}  # maps original labels to compressed ones across all graphs in this iteration
+            num_of_labels_occured = 0  # number of distinct labels that have occurred as node labels in all graphs so far
+            for G in Gn:  # for each graph
+                set_multisets = []
+                for node in G.nodes(data=True):
+                    # multiset-label determination.
+                    multiset = [G.nodes[neighbor][node_label] for neighbor in G[node[0]]]
+                    # sort each multiset
+                    multiset.sort()
+                    multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
+                    set_multisets.append(multiset)
+
+                # label compression
+                set_unique = list(set(set_multisets))  # set of unique multiset labels
+                # a dictionary mapping original labels to new ones.
+                set_compressed = {}
+                # if a label occurred before, assign its former compressed label; else assign (number of labels occurred so far + 1) as the compressed label
+                for value in set_unique:
+                    if value in all_set_compressed.keys():
+                        set_compressed.update({value: all_set_compressed[value]})
+                    else:
+                        set_compressed.update({value: str(num_of_labels_occured + 1)})
+                        num_of_labels_occured += 1
+
+                all_set_compressed.update(set_compressed)
+
+                # relabel nodes
+                for i_node, node in enumerate(G.nodes(data=True)):
+                    node[1][node_label] = set_compressed[set_multisets[i_node]]
+
+            # compute the kernel for the h-th iteration and add it to the final kernel
+            gram_matrix += base_kernel(Gn, node_label, edge_label)
+
+        return gram_matrix
+
+
+    def __add_dummy_node_labels(self, Gn):
+        if len(self.__node_labels) == 0:
+            for G in Gn:
+                nx.set_node_attributes(G, '0', 'dummy')
+            self.__node_labels.append('dummy')
\ No newline at end of file
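
A minimal usage sketch (illustrative; not part of the patch). It assumes the
GraphKernel base class exposes a compute() method that accepts a list of graphs
together with the parallel/n_jobs/verbose options, dispatches to the _compute_*
hooks above, and returns the Gram matrix with the run time; the method name and
return values are assumptions here, not confirmed by this patch.

    import networkx as nx
    from gklearn.kernels import WeisfeilerLehman

    # two toy graphs with a node attribute 'atom' used as the label
    g1 = nx.Graph()
    g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})])
    g1.add_edges_from([(0, 1), (1, 2)])
    g2 = nx.Graph()
    g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'})])
    g2.add_edge(0, 1)

    wl = WeisfeilerLehman(node_labels=['atom'], edge_labels=[], height=2,
                          base_kernel='subtree', ds_infos={'directed': False})
    gram_matrix, run_time = wl.compute([g1, g2], parallel=None, n_jobs=1, verbose=2)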