From a69f3c710e9c28bcd2e596dd7f731ba7483f2760 Mon Sep 17 00:00:00 2001
From: jajupmochi
Date: Tue, 14 Apr 2020 16:52:35 +0200
Subject: [PATCH] Add class WeisfeilerLehman.

---
 gklearn/kernels/weisfeiler_lehman.py | 465 +++++++++++++++++++++++++++++++++++
 1 file changed, 465 insertions(+)
 create mode 100644 gklearn/kernels/weisfeiler_lehman.py

diff --git a/gklearn/kernels/weisfeiler_lehman.py b/gklearn/kernels/weisfeiler_lehman.py
new file mode 100644
index 0000000..4ecb13f
--- /dev/null
+++ b/gklearn/kernels/weisfeiler_lehman.py
@@ -0,0 +1,465 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Apr 14 15:16:34 2020
+
+@author: ljia
+
+@references:
+
+    [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM.
+    Weisfeiler-Lehman graph kernels. Journal of Machine Learning Research.
+    2011;12(Sep):2539-61.
+"""
+
+import warnings
+import numpy as np
+import networkx as nx
+from collections import Counter
+from functools import partial
+from gklearn.utils.parallel import parallel_gm
+from gklearn.kernels import GraphKernel
+
+
+class WeisfeilerLehman(GraphKernel):  # @todo: parallelize fully; finish the sp, edge and user-defined base kernels.
+
+    def __init__(self, **kwargs):
+        GraphKernel.__init__(self)
+        self.__node_labels = kwargs.get('node_labels', [])
+        self.__edge_labels = kwargs.get('edge_labels', [])
+        self.__height = int(kwargs.get('height', 0))
+        self.__base_kernel = kwargs.get('base_kernel', 'subtree')
+        self.__ds_infos = kwargs.get('ds_infos', {})
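+
+    # An illustrative construction (the attribute names are examples, not
+    # defaults; 'node_labels' and 'edge_labels' must name attributes that
+    # actually exist on the input graphs, e.g. 'atom' for molecule datasets):
+    #     wl = WeisfeilerLehman(node_labels=['atom'], edge_labels=[],
+    #                           height=2, base_kernel='subtree',
+    #                           ds_infos={'directed': False})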
+
+
+    def _compute_gm_series(self):
+        self.__add_dummy_node_labels(self._graphs)
+
+        # for WL subtree kernel
+        if self.__base_kernel == 'subtree':
+            gram_matrix = self.__subtree_kernel_do(self._graphs)
+
+        # for WL shortest path kernel
+        elif self.__base_kernel == 'sp':
+            gram_matrix = self.__sp_kernel_do(self._graphs)
+
+        # for WL edge kernel
+        elif self.__base_kernel == 'edge':
+            gram_matrix = self.__edge_kernel_do(self._graphs)
+
+        # for user defined base kernel
+        else:
+            gram_matrix = self.__user_kernel_do(self._graphs)
+
+        return gram_matrix
+
+
+    def _compute_gm_imap_unordered(self):
+        if self._verbose >= 2:
+            warnings.warn('Only a part of the computation is parallelized due to the structure of this kernel.')
+        return self._compute_gm_series()
+
+
+    def _compute_kernel_list_series(self, g1, g_list):  # @todo: this should be better.
+        self.__add_dummy_node_labels(g_list + [g1])
+
+        # for WL subtree kernel
+        if self.__base_kernel == 'subtree':
+            gram_matrix = self.__subtree_kernel_do(g_list + [g1])
+
+        # for WL shortest path kernel
+        elif self.__base_kernel == 'sp':
+            gram_matrix = self.__sp_kernel_do(g_list + [g1])
+
+        # for WL edge kernel
+        elif self.__base_kernel == 'edge':
+            gram_matrix = self.__edge_kernel_do(g_list + [g1])
+
+        # for user defined base kernel
+        else:
+            gram_matrix = self.__user_kernel_do(g_list + [g1])
+
+        # g1 was appended last, so its kernels with the graphs in g_list form the last row.
+        return list(gram_matrix[-1][0:-1])
+
+
+    def _compute_kernel_list_imap_unordered(self, g1, g_list):
+        if self._verbose >= 2:
+            warnings.warn('Only a part of the computation is parallelized due to the structure of this kernel.')
+        return self._compute_kernel_list_series(g1, g_list)
+
+
+    def _wrapper_kernel_list_do(self, itr):
+        pass
+
+
+    def _compute_single_kernel_series(self, g1, g2):  # @todo: this should be better.
+        self.__add_dummy_node_labels([g1] + [g2])
+
+        # for WL subtree kernel
+        if self.__base_kernel == 'subtree':
+            gram_matrix = self.__subtree_kernel_do([g1] + [g2])
+
+        # for WL shortest path kernel
+        elif self.__base_kernel == 'sp':
+            gram_matrix = self.__sp_kernel_do([g1] + [g2])
+
+        # for WL edge kernel
+        elif self.__base_kernel == 'edge':
+            gram_matrix = self.__edge_kernel_do([g1] + [g2])
+
+        # for user defined base kernel
+        else:
+            gram_matrix = self.__user_kernel_do([g1] + [g2])
+
+        return gram_matrix[0][1]
+
+
+    def __subtree_kernel_do(self, Gn):
+        """Compute Weisfeiler-Lehman subtree kernels between graphs.
+
+        Parameters
+        ----------
+        Gn : list of NetworkX graphs
+            List of graphs between which the kernels are computed.
+
+        Return
+        ------
+        gram_matrix : Numpy matrix
+            Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between two graphs.
+        """
+        gram_matrix = np.zeros((len(Gn), len(Gn)))
+
+        # initialization for height = 0
+        all_num_of_each_label = []  # number of occurrences of each label in each graph in this iteration
+
+        # for each graph
+        for G in Gn:
+            # gather all node-label values into one tuple per node.
+            for nd, attrs in G.nodes(data=True):  # @todo: there may be a better way.
+                G.nodes[nd]['label_tuple'] = tuple(attrs[name] for name in self.__node_labels)
+            # get the set of original labels
+            labels_ori = list(nx.get_node_attributes(G, 'label_tuple').values())
+            # number of occurrences of each label in G
+            all_num_of_each_label.append(dict(Counter(labels_ori)))
+
+        # compute the subtree kernel for the 0th iteration and add it to the final kernel.
+        self.__compute_gram_matrix(gram_matrix, all_num_of_each_label, Gn)
+
+        # iterate over heights
+        for h in range(1, self.__height + 1):
+            all_set_compressed = {}  # maps original labels to compressed ones across all graphs in this iteration
+            num_of_labels_occured = 0  # number of distinct labels that have occurred as node labels in all graphs so far
+            all_num_of_each_label = []  # number of occurrences of each label in G
+
+            # @todo: parallelize this part.
+            for G in Gn:
+
+                all_multisets = []
+                for node, attrs in G.nodes(data=True):
+                    # multiset-label determination.
+                    multiset = [G.nodes[neighbor]['label_tuple'] for neighbor in G[node]]
+                    # sort each multiset
+                    multiset.sort()
+                    multiset = [attrs['label_tuple']] + multiset  # add the node's own label as the prefix
+                    all_multisets.append(tuple(multiset))
+
+                # label compression
+                set_unique = list(set(all_multisets))  # set of unique multiset labels
+                # a dictionary mapping original labels to new ones.
+                set_compressed = {}
+                # if a label occurred before, assign its former compressed label;
+                # else assign (number of labels occurred so far + 1) as the compressed label.
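+                # Illustrative example: if G has nodes labeled a, a, b with
+                # edges a-a and a-b, the multisets are (a, a), (a, a, b) and
+                # (b, a); if none of them occurred before, they are compressed
+                # to '1', '2' and '3' in some order, and num_of_labels_occured
+                # becomes 3.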
+                for value in set_unique:
+                    if value in all_set_compressed.keys():
+                        set_compressed.update({value: all_set_compressed[value]})
+                    else:
+                        set_compressed.update({value: str(num_of_labels_occured + 1)})
+                        num_of_labels_occured += 1
+
+                all_set_compressed.update(set_compressed)
+
+                # relabel nodes
+                for i_node, node in enumerate(G.nodes()):
+                    G.nodes[node]['label_tuple'] = set_compressed[all_multisets[i_node]]
+
+                # get the set of compressed labels
+                labels_comp = list(nx.get_node_attributes(G, 'label_tuple').values())
+                all_num_of_each_label.append(dict(Counter(labels_comp)))
+
+            # compute the subtree kernel for the h-th iteration and add it to the final kernel
+            self.__compute_gram_matrix(gram_matrix, all_num_of_each_label, Gn)
+
+        return gram_matrix
+
+
+    def __compute_gram_matrix(self, gram_matrix, all_num_of_each_label, Gn):
+        """Compute the Gram matrix using the base kernel.
+        """
+        if self._parallel == 'imap_unordered':
+            # compute kernels in parallel.
+            def init_worker(alllabels_toshare):
+                global G_alllabels
+                G_alllabels = alllabels_toshare
+            do_partial = partial(self._wrapper_compute_subtree_kernel, gram_matrix)
+            parallel_gm(do_partial, gram_matrix, Gn, init_worker=init_worker,
+                        glbv=(all_num_of_each_label,), n_jobs=self._n_jobs, verbose=self._verbose)
+        elif self._parallel is None:
+            for i in range(len(gram_matrix)):
+                for j in range(i, len(gram_matrix)):
+                    gram_matrix[i][j] = self.__compute_subtree_kernel(all_num_of_each_label[i],
+                                                                      all_num_of_each_label[j], gram_matrix[i][j])
+                    gram_matrix[j][i] = gram_matrix[i][j]
+
+
+    def __compute_subtree_kernel(self, num_of_each_label1, num_of_each_label2, kernel):
+        """Compute the subtree kernel.
+        """
+        labels = set(list(num_of_each_label1.keys()) + list(num_of_each_label2.keys()))
+        vector1 = np.array([(num_of_each_label1[label]
+                             if (label in num_of_each_label1.keys()) else 0)
+                            for label in labels])
+        vector2 = np.array([(num_of_each_label2[label]
+                             if (label in num_of_each_label2.keys()) else 0)
+                            for label in labels])
+        kernel += np.dot(vector1, vector2)
+        return kernel
+
+
+    def _wrapper_compute_subtree_kernel(self, gram_matrix, itr):
+        i = itr[0]
+        j = itr[1]
+        return i, j, self.__compute_subtree_kernel(G_alllabels[i], G_alllabels[j], gram_matrix[i][j])
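+
+    # Worked example for __compute_subtree_kernel (illustrative): with label
+    # counts {'1': 2, '2': 1} for graph i and {'1': 1, '3': 1} for graph j,
+    # the count vectors over the labels {'1', '2', '3'} are [2, 1, 0] and
+    # [1, 0, 1], so this iteration adds their dot product 2 to gram_matrix[i][j].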
+
+
+    def __sp_kernel_do(self, Gn):  # @todo: legacy implementation, not yet adapted to the new class structure.
+        """Compute Weisfeiler-Lehman shortest path kernels between graphs.
+
+        Parameters
+        ----------
+        Gn : list of NetworkX graphs
+            List of graphs between which the kernels are computed.
+
+        Node and edge labels and the subtree height are taken from the instance attributes.
+
+        Return
+        ------
+        gram_matrix : Numpy matrix
+            Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between two graphs.
+        """
+        from gklearn.utils.utils import getSPGraph
+
+        # init.
+        # this legacy code assumes a single node label and a single edge label.
+        node_label = self.__node_labels[0] if len(self.__node_labels) > 0 else None
+        edge_label = self.__edge_labels[0] if len(self.__edge_labels) > 0 else None
+        height = int(self.__height)
+        gram_matrix = np.zeros((len(Gn), len(Gn)))  # init kernel
+
+        Gn = [getSPGraph(G, edge_weight=edge_label) for G in Gn]  # get shortest path graphs of Gn
+
+        # initialization for height = 0
+        for i in range(0, len(Gn)):
+            for j in range(i, len(Gn)):
+                for e1 in Gn[i].edges(data=True):
+                    for e2 in Gn[j].edges(data=True):
+                        # @todo: this matches edges by endpoint IDs, which assumes a node
+                        # correspondence across graphs; matching endpoint labels may be intended.
+                        if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
+                            gram_matrix[i][j] += 1
+                gram_matrix[j][i] = gram_matrix[i][j]
+
+        # iterate over heights
+        for h in range(1, height + 1):
+            all_set_compressed = {}  # maps original labels to compressed ones across all graphs in this iteration
+            num_of_labels_occured = 0  # number of distinct labels that have occurred as node labels in all graphs so far
+            for G in Gn:  # for each graph
+                set_multisets = []
+                for node in G.nodes(data=True):
+                    # multiset-label determination.
+                    multiset = [G.nodes[neighbor][node_label] for neighbor in G[node[0]]]
+                    # sort each multiset
+                    multiset.sort()
+                    multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
+                    set_multisets.append(multiset)
+
+                # label compression
+                set_unique = list(set(set_multisets))  # set of unique multiset labels
+                # a dictionary mapping original labels to new ones.
+                set_compressed = {}
+                # if a label occurred before, assign its former compressed label; else assign (number of labels occurred so far + 1) as the compressed label
+                for value in set_unique:
+                    if value in all_set_compressed.keys():
+                        set_compressed.update({value: all_set_compressed[value]})
+                    else:
+                        set_compressed.update({value: str(num_of_labels_occured + 1)})
+                        num_of_labels_occured += 1
+
+                all_set_compressed.update(set_compressed)
+
+                # relabel nodes
+                for i_node, node in enumerate(G.nodes(data=True)):
+                    node[1][node_label] = set_compressed[set_multisets[i_node]]
+
+            # compute the kernel for the h-th iteration and add it to the final kernel
+            for i in range(0, len(Gn)):
+                for j in range(i, len(Gn)):
+                    for e1 in Gn[i].edges(data=True):
+                        for e2 in Gn[j].edges(data=True):
+                            if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
+                                gram_matrix[i][j] += 1
+                    gram_matrix[j][i] = gram_matrix[i][j]
+
+        return gram_matrix
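+
+    # For example (illustrative), if Gn[i] above has the shortest-path edge
+    # (0, 2, {'cost': 1.5}) and Gn[j] has (2, 0, {'cost': 1.5}), the pair
+    # matches under the reversed-endpoint test and contributes 1 to
+    # gram_matrix[i][j] in that iteration.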
+
+
+    def __edge_kernel_do(self, Gn):  # @todo: legacy implementation, not yet adapted to the new class structure.
+        """Compute Weisfeiler-Lehman edge kernels between graphs.
+
+        Parameters
+        ----------
+        Gn : list of NetworkX graphs
+            List of graphs between which the kernels are computed.
+
+        Node and edge labels and the subtree height are taken from the instance attributes.
+
+        Return
+        ------
+        gram_matrix : Numpy matrix
+            Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between two graphs.
+        """
+        # init.
+        # this legacy code assumes a single node label and a single edge label.
+        node_label = self.__node_labels[0] if len(self.__node_labels) > 0 else None
+        edge_label = self.__edge_labels[0] if len(self.__edge_labels) > 0 else None
+        height = int(self.__height)
+        gram_matrix = np.zeros((len(Gn), len(Gn)))  # init kernel
+
+        # initialization for height = 0
+        for i in range(0, len(Gn)):
+            for j in range(i, len(Gn)):
+                for e1 in Gn[i].edges(data=True):
+                    for e2 in Gn[j].edges(data=True):
+                        if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
+                            gram_matrix[i][j] += 1
+                gram_matrix[j][i] = gram_matrix[i][j]
+
+        # iterate over heights
+        for h in range(1, height + 1):
+            all_set_compressed = {}  # maps original labels to compressed ones across all graphs in this iteration
+            num_of_labels_occured = 0  # number of distinct labels that have occurred as node labels in all graphs so far
+            for G in Gn:  # for each graph
+                set_multisets = []
+                for node in G.nodes(data=True):
+                    # multiset-label determination.
+                    multiset = [G.nodes[neighbor][node_label] for neighbor in G[node[0]]]
+                    # sort each multiset
+                    multiset.sort()
+                    multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
+                    set_multisets.append(multiset)
+
+                # label compression
+                set_unique = list(set(set_multisets))  # set of unique multiset labels
+                # a dictionary mapping original labels to new ones.
+                set_compressed = {}
+                # if a label occurred before, assign its former compressed label; else assign (number of labels occurred so far + 1) as the compressed label
+                for value in set_unique:
+                    if value in all_set_compressed.keys():
+                        set_compressed.update({value: all_set_compressed[value]})
+                    else:
+                        set_compressed.update({value: str(num_of_labels_occured + 1)})
+                        num_of_labels_occured += 1
+
+                all_set_compressed.update(set_compressed)
+
+                # relabel nodes
+                for i_node, node in enumerate(G.nodes(data=True)):
+                    node[1][node_label] = set_compressed[set_multisets[i_node]]
+
+            # compute the kernel for the h-th iteration and add it to the final kernel
+            for i in range(0, len(Gn)):
+                for j in range(i, len(Gn)):
+                    for e1 in Gn[i].edges(data=True):
+                        for e2 in Gn[j].edges(data=True):
+                            if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
+                                gram_matrix[i][j] += 1
+                    gram_matrix[j][i] = gram_matrix[i][j]
+
+        return gram_matrix
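+
+    # A user-defined base kernel is any callable with the signature
+    # base_kernel(Gn, node_label, edge_label) -> Numpy array of shape
+    # (len(Gn), len(Gn)), as consumed by __user_kernel_do below. A minimal
+    # (hypothetical) example counting pairs of equally labeled nodes:
+    #     def vertex_histogram(Gn, node_label, edge_label):
+    #         counts = [Counter(nx.get_node_attributes(G, node_label).values())
+    #                   for G in Gn]
+    #         gm = np.zeros((len(Gn), len(Gn)))
+    #         for i in range(len(Gn)):
+    #             for j in range(i, len(Gn)):
+    #                 gm[i][j] = gm[j][i] = sum(c * counts[j][l]
+    #                                           for l, c in counts[i].items())
+    #         return gm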
+
+
+    def __user_kernel_do(self, Gn):  # @todo: legacy implementation, not yet adapted to the new class structure.
+        """Compute Weisfeiler-Lehman kernels between graphs with a user-defined base kernel.
+
+        Parameters
+        ----------
+        Gn : list of NetworkX graphs
+            List of graphs between which the kernels are computed.
+
+        Node and edge labels, the subtree height and the base kernel are taken
+        from the instance attributes. The base kernel is a function returning a
+        Numpy matrix, each element of which is the user-defined Weisfeiler-Lehman
+        kernel between two graphs.
+
+        Return
+        ------
+        gram_matrix : Numpy matrix
+            Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between two graphs.
+        """
+        # init.
+        # this legacy code assumes a single node label, a single edge label,
+        # and that self.__base_kernel holds the user-supplied callable.
+        node_label = self.__node_labels[0] if len(self.__node_labels) > 0 else None
+        edge_label = self.__edge_labels[0] if len(self.__edge_labels) > 0 else None
+        height = int(self.__height)
+        base_kernel = self.__base_kernel
+
+        # initialization for height = 0
+        gram_matrix = base_kernel(Gn, node_label, edge_label)
+
+        # iterate over heights
+        for h in range(1, height + 1):
+            all_set_compressed = {}  # maps original labels to compressed ones across all graphs in this iteration
+            num_of_labels_occured = 0  # number of distinct labels that have occurred as node labels in all graphs so far
+            for G in Gn:  # for each graph
+                set_multisets = []
+                for node in G.nodes(data=True):
+                    # multiset-label determination.
+                    multiset = [G.nodes[neighbor][node_label] for neighbor in G[node[0]]]
+                    # sort each multiset
+                    multiset.sort()
+                    multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
+                    set_multisets.append(multiset)
+
+                # label compression
+                set_unique = list(set(set_multisets))  # set of unique multiset labels
+                # a dictionary mapping original labels to new ones.
+                set_compressed = {}
+                # if a label occurred before, assign its former compressed label; else assign (number of labels occurred so far + 1) as the compressed label
+                for value in set_unique:
+                    if value in all_set_compressed.keys():
+                        set_compressed.update({value: all_set_compressed[value]})
+                    else:
+                        set_compressed.update({value: str(num_of_labels_occured + 1)})
+                        num_of_labels_occured += 1
+
+                all_set_compressed.update(set_compressed)
+
+                # relabel nodes
+                for i_node, node in enumerate(G.nodes(data=True)):
+                    node[1][node_label] = set_compressed[set_multisets[i_node]]
+
+            # compute the kernel for the h-th iteration and add it to the final kernel
+            gram_matrix += base_kernel(Gn, node_label, edge_label)
+
+        return gram_matrix
+
+
+    def __add_dummy_node_labels(self, Gn):
+        if len(self.__node_labels) == 0:
+            for G in Gn:
+                nx.set_node_attributes(G, '0', 'dummy')
+            self.__node_labels.append('dummy')
\ No newline at end of file
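
A minimal usage sketch (illustrative; not part of the patch). It assumes the
GraphKernel base class exposes a compute() method that accepts a list of graphs
together with the parallel/n_jobs/verbose options, dispatches to the _compute_*
hooks above, and returns the Gram matrix with the run time; the method name and
return values are assumptions here, not confirmed by this patch.

    import networkx as nx
    from gklearn.kernels import WeisfeilerLehman

    # two toy graphs with a node attribute 'atom' used as the label
    g1 = nx.Graph()
    g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})])
    g1.add_edges_from([(0, 1), (1, 2)])
    g2 = nx.Graph()
    g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'})])
    g2.add_edge(0, 1)

    wl = WeisfeilerLehman(node_labels=['atom'], edge_labels=[], height=2,
                          base_kernel='subtree', ds_infos={'directed': False})
    gram_matrix, run_time = wl.compute([g1, g2], parallel=None, n_jobs=1, verbose=2)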