From 83ad1949b03e50c444e87ac38ee9fb7da1c0bbbe Mon Sep 17 00:00:00 2001
From: linlin <jajupmochi@gmail.com>
Date: Tue, 6 Oct 2020 17:26:11 +0200
Subject: [PATCH] New translations weisfeilerLehmanKernel.py (Chinese
 Simplified)

---
 lang/zh/gklearn/kernels/weisfeilerLehmanKernel.py | 570 ++++++++++++++++++++++
 1 file changed, 570 insertions(+)
 create mode 100644 lang/zh/gklearn/kernels/weisfeilerLehmanKernel.py

diff --git a/lang/zh/gklearn/kernels/weisfeilerLehmanKernel.py b/lang/zh/gklearn/kernels/weisfeilerLehmanKernel.py
new file mode 100644
index 0000000..222f5c5
--- /dev/null
+++ b/lang/zh/gklearn/kernels/weisfeilerLehmanKernel.py
@@ -0,0 +1,570 @@
+"""
+@author: linlin
+
+@references:
+
+	[1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM. 
+	Weisfeiler-lehman graph kernels. Journal of Machine Learning Research. 
+	2011;12(Sep):2539-61.
+"""
+
+import sys
+from collections import Counter
+from functools import partial
+import time
+#from multiprocessing import Pool
+from tqdm import tqdm
+
+import networkx as nx
+import numpy as np
+
+#from gklearn.kernels.pathKernel import pathkernel
+from gklearn.utils.graphdataset import get_dataset_attributes
+from gklearn.utils.parallel import parallel_gm
+
+# @todo: support edge kernel, sp kernel, user-defined kernel.
+def weisfeilerlehmankernel(*args, 
+						   node_label='atom',
+						   edge_label='bond_type',
+						   height=0,
+						   base_kernel='subtree',
+						   parallel=None,
+						   n_jobs=None, 
+						   chunksize=None,
+						   verbose=True):
+	"""Calculate Weisfeiler-Lehman kernels between graphs.
+	
+	Parameters
+	----------
+	Gn : List of NetworkX graph
+		List of graphs between which the kernels are calculated.
+	
+	G1, G2 : NetworkX graphs
+		Two graphs between which the kernel is calculated.		
+
+	node_label : string
+		Node attribute used as label. The default node label is atom.		
+
+	edge_label : string
+		Edge attribute used as label. The default edge label is bond_type.		
+
+	height : int
+		Subtree height.
+
+	base_kernel : string
+		Base kernel used in each iteration of WL kernel. Only default 'subtree' 
+		kernel can be applied for now.
+
+	parallel : None
+		Which paralleliztion method is applied to compute the kernel. No 
+		parallelization can be applied for now.
+
+	n_jobs : int
+		Number of jobs for parallelization. The default is to use all 
+		computational cores. This argument is only valid when one of the 
+		parallelization method is applied and can be ignored for now.
+
+	Return
+	------
+	Kmatrix : Numpy matrix
+		Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
+
+	Notes
+	-----
+	This function now supports WL subtree kernel only.
+	"""
+#		The default base 
+#		kernel is subtree kernel. For user-defined kernel, base_kernel is the 
+#		name of the base kernel function used in each iteration of WL kernel. 
+#		This function returns a Numpy matrix, each element of which is the 
+#		user-defined Weisfeiler-Lehman kernel between 2 praphs.
+	# pre-process
+	base_kernel = base_kernel.lower()
+	Gn = args[0] if len(args) == 1 else [args[0], args[1]] # arrange all graphs in a list
+	Gn = [g.copy() for g in Gn]
+	ds_attrs = get_dataset_attributes(Gn, attr_names=['node_labeled'], 
+									  node_label=node_label)
+	if not ds_attrs['node_labeled']:
+		for G in Gn:
+			nx.set_node_attributes(G, '0', 'atom')
+
+	start_time = time.time()
+
+	# for WL subtree kernel
+	if base_kernel == 'subtree':		   
+		Kmatrix = _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, chunksize, verbose)
+
+	# for WL shortest path kernel
+	elif base_kernel == 'sp':
+		Kmatrix = _wl_spkernel_do(Gn, node_label, edge_label, height)
+
+	# for WL edge kernel
+	elif base_kernel == 'edge':
+		Kmatrix = _wl_edgekernel_do(Gn, node_label, edge_label, height)
+
+	# for user defined base kernel
+	else:
+		Kmatrix = _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel)
+
+	run_time = time.time() - start_time
+	if verbose:
+		print("\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---" 
+			  % (base_kernel, len(args[0]), run_time))
+
+	return Kmatrix, run_time
+
+
+def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, chunksize, verbose):
+	"""Calculate Weisfeiler-Lehman kernels between graphs.
+
+	Parameters
+	----------
+	Gn : List of NetworkX graph
+		List of graphs between which the kernels are calculated.	   
+	node_label : string
+		node attribute used as label.
+	edge_label : string
+		edge attribute used as label.	  
+	height : int
+		wl height.
+
+	Return
+	------
+	Kmatrix : Numpy matrix
+		Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
+	"""
+	height = int(height)
+	Kmatrix = np.zeros((len(Gn), len(Gn)))
+
+	# initial for height = 0
+	all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration
+
+	# for each graph
+	for G in Gn:
+		# get the set of original labels
+		labels_ori = list(nx.get_node_attributes(G, node_label).values())
+		# number of occurence of each label in G
+		all_num_of_each_label.append(dict(Counter(labels_ori)))
+
+	# calculate subtree kernel with the 0th iteration and add it to the final kernel
+	compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, chunksize, False)
+
+	# iterate each height
+	for h in range(1, height + 1):
+		all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
+		num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
+#		all_labels_ori = set() # all unique orignal labels in all graphs in this iteration
+		all_num_of_each_label = [] # number of occurence of each label in G
+
+#		# for each graph
+#		# ---- use pool.imap_unordered to parallel and track progress. ----
+#		pool = Pool(n_jobs)
+#		itr = zip(Gn, range(0, len(Gn)))
+#		if len(Gn) < 100 * n_jobs:
+#			chunksize = int(len(Gn) / n_jobs) + 1
+#		else:
+#			chunksize = 100
+#		all_multisets_list = [[] for _ in range(len(Gn))]
+##		set_unique_list = [[] for _ in range(len(Gn))]
+#		get_partial = partial(wrapper_wl_iteration, node_label)
+##		if verbose:
+##			iterator = tqdm(pool.imap_unordered(get_partial, itr, chunksize),
+##							desc='wl iteration', file=sys.stdout)
+##		else:
+#		iterator = pool.imap_unordered(get_partial, itr, chunksize)
+#		for i, all_multisets in iterator:
+#			all_multisets_list[i] = all_multisets
+##			set_unique_list[i] = set_unique
+##			all_set_unique = all_set_unique | set(set_unique)
+#		pool.close()
+#		pool.join()
+		
+#		all_set_unique = set()
+#		for uset in all_multisets_list:
+#			all_set_unique = all_set_unique | set(uset)
+#			
+#		all_set_unique = list(all_set_unique)
+##		# a dictionary mapping original labels to new ones. 
+##		set_compressed = {}
+##		for idx, uset in enumerate(all_set_unique):
+##			set_compressed.update({uset: idx})
+#			
+#		for ig, G in enumerate(Gn):
+#
+##			# a dictionary mapping original labels to new ones. 
+##			set_compressed = {}
+##			# if a label occured before, assign its former compressed label, 
+##			# else assign the number of labels occured + 1 as the compressed label. 
+##			for value in set_unique_list[i]:
+##				if uset in all_set_unique:
+##					set_compressed.update({uset: all_set_compressed[value]})
+##				else:
+##					set_compressed.update({value: str(num_of_labels_occured + 1)})
+##					num_of_labels_occured += 1
+#					
+##			all_set_compressed.update(set_compressed)
+#			
+#			# relabel nodes
+#			for idx, node in enumerate(G.nodes()):
+#				G.nodes[node][node_label] = all_set_unique.index(all_multisets_list[ig][idx])
+#				
+#			# get the set of compressed labels
+#			labels_comp = list(nx.get_node_attributes(G, node_label).values())
+##			all_labels_ori.update(labels_comp)
+#			all_num_of_each_label[ig] = dict(Counter(labels_comp))
+			
+			
+
+		
+#		all_set_unique = list(all_set_unique)
+		
+		
+		# @todo: parallel this part.
+		for idx, G in enumerate(Gn):
+
+			all_multisets = []
+			for node, attrs in G.nodes(data=True):
+				# Multiset-label determination.
+				multiset = [G.nodes[neighbors][node_label] for neighbors in G[node]]
+				# sorting each multiset
+				multiset.sort()
+				multiset = [attrs[node_label]] + multiset # add the prefix 
+				all_multisets.append(tuple(multiset))
+
+			# label compression
+			set_unique = list(set(all_multisets)) # set of unique multiset labels
+			# a dictionary mapping original labels to new ones. 
+			set_compressed = {}
+			# if a label occured before, assign its former compressed label, 
+			# else assign the number of labels occured + 1 as the compressed label. 
+			for value in set_unique:
+				if value in all_set_compressed.keys():
+					set_compressed.update({value: all_set_compressed[value]})
+				else:
+					set_compressed.update({value: str(num_of_labels_occured + 1)})
+					num_of_labels_occured += 1
+
+			all_set_compressed.update(set_compressed)
+
+			# relabel nodes
+			for idx, node in enumerate(G.nodes()):
+				G.nodes[node][node_label] = set_compressed[all_multisets[idx]]
+
+			# get the set of compressed labels
+			labels_comp = list(nx.get_node_attributes(G, node_label).values())
+#			all_labels_ori.update(labels_comp)
+			all_num_of_each_label.append(dict(Counter(labels_comp)))
+
+		# calculate subtree kernel with h iterations and add it to the final kernel
+		compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, chunksize, False)
+
+	return Kmatrix
+
+
+def wl_iteration(G, node_label):
+	all_multisets = []
+	for node, attrs in G.nodes(data=True):
+		# Multiset-label determination.
+		multiset = [G.nodes[neighbors][node_label] for neighbors in G[node]]
+		# sorting each multiset
+		multiset.sort()
+		multiset = [attrs[node_label]] + multiset # add the prefix 
+		all_multisets.append(tuple(multiset))
+#	# label compression
+#	set_unique = list(set(all_multisets)) # set of unique multiset labels
+	return all_multisets
+	
+#	# a dictionary mapping original labels to new ones. 
+#	set_compressed = {}
+#	# if a label occured before, assign its former compressed label, 
+#	# else assign the number of labels occured + 1 as the compressed label. 
+#	for value in set_unique:
+#		if value in all_set_compressed.keys():
+#			set_compressed.update({value: all_set_compressed[value]})
+#		else:
+#			set_compressed.update({value: str(num_of_labels_occured + 1)})
+#			num_of_labels_occured += 1
+#
+#	all_set_compressed.update(set_compressed)
+#
+#	# relabel nodes
+#	for idx, node in enumerate(G.nodes()):
+#		G.nodes[node][node_label] = set_compressed[all_multisets[idx]]
+#
+#	# get the set of compressed labels
+#	labels_comp = list(nx.get_node_attributes(G, node_label).values())
+#	all_labels_ori.update(labels_comp)
+#	all_num_of_each_label.append(dict(Counter(labels_comp)))
+#	return
+
+
+def wrapper_wl_iteration(node_label, itr_item):
+	g = itr_item[0]
+	i = itr_item[1]
+	all_multisets = wl_iteration(g, node_label)
+	return i, all_multisets
+
+
+def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, chunksize, verbose):
+	"""Compute kernel matrix using the base kernel.
+	"""
+	if parallel == 'imap_unordered':
+		# compute kernels.
+		def init_worker(alllabels_toshare):
+			global G_alllabels
+			G_alllabels = alllabels_toshare
+		do_partial = partial(wrapper_compute_subtree_kernel, Kmatrix)
+		parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, 
+					glbv=(all_num_of_each_label,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose)
+	elif parallel == None:
+		for i in range(len(Kmatrix)):
+			for j in range(i, len(Kmatrix)):
+				Kmatrix[i][j] = compute_subtree_kernel(all_num_of_each_label[i],
+					   all_num_of_each_label[j], Kmatrix[i][j])
+				Kmatrix[j][i] = Kmatrix[i][j]
+
+
+def compute_subtree_kernel(num_of_each_label1, num_of_each_label2, kernel):
+	"""Compute the subtree kernel.
+	"""
+	labels = set(list(num_of_each_label1.keys()) + list(num_of_each_label2.keys()))
+	vector1 = np.array([(num_of_each_label1[label] 
+						if (label in num_of_each_label1.keys()) else 0) 
+						for label in labels])
+	vector2 = np.array([(num_of_each_label2[label] 
+						if (label in num_of_each_label2.keys()) else 0) 
+						for label in labels])
+	kernel += np.dot(vector1, vector2)
+	return kernel
+
+
+def wrapper_compute_subtree_kernel(Kmatrix, itr):
+	i = itr[0]
+	j = itr[1]
+	return i, j, compute_subtree_kernel(G_alllabels[i], G_alllabels[j], Kmatrix[i][j])
+				
+
+def _wl_spkernel_do(Gn, node_label, edge_label, height):
+	"""Calculate Weisfeiler-Lehman shortest path kernels between graphs.
+	
+	Parameters
+	----------
+	Gn : List of NetworkX graph
+		List of graphs between which the kernels are calculated.	   
+	node_label : string
+		node attribute used as label.	  
+	edge_label : string
+		edge attribute used as label.	   
+	height : int
+		subtree height.
+		
+	Return
+	------
+	Kmatrix : Numpy matrix
+		Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
+	"""
+	pass
+	from gklearn.utils.utils import getSPGraph
+	  
+	# init.
+	height = int(height)
+	Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel
+
+	Gn = [ getSPGraph(G, edge_weight = edge_label) for G in Gn ] # get shortest path graphs of Gn
+	
+	# initial for height = 0
+	for i in range(0, len(Gn)):
+		for j in range(i, len(Gn)):
+			for e1 in Gn[i].edges(data = True):
+				for e2 in Gn[j].edges(data = True):		  
+					if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
+						Kmatrix[i][j] += 1
+			Kmatrix[j][i] = Kmatrix[i][j]
+			
+	# iterate each height
+	for h in range(1, height + 1):
+		all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
+		num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
+		for G in Gn: # for each graph
+			set_multisets = []
+			for node in G.nodes(data = True):
+				# Multiset-label determination.
+				multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ]
+				# sorting each multiset
+				multiset.sort()
+				multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix 
+				set_multisets.append(multiset)		  
+
+			# label compression
+			set_unique = list(set(set_multisets)) # set of unique multiset labels
+			# a dictionary mapping original labels to new ones. 
+			set_compressed = {}
+			# if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label 
+			for value in set_unique:
+				if value in all_set_compressed.keys():
+					set_compressed.update({ value : all_set_compressed[value] })
+				else:
+					set_compressed.update({ value : str(num_of_labels_occured + 1) })
+					num_of_labels_occured += 1
+
+			all_set_compressed.update(set_compressed)
+			
+			# relabel nodes
+			for node in G.nodes(data = True):
+				node[1][node_label] = set_compressed[set_multisets[node[0]]]
+				
+		# calculate subtree kernel with h iterations and add it to the final kernel
+		for i in range(0, len(Gn)):
+			for j in range(i, len(Gn)):
+				for e1 in Gn[i].edges(data = True):
+					for e2 in Gn[j].edges(data = True):		  
+						if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
+							Kmatrix[i][j] += 1
+				Kmatrix[j][i] = Kmatrix[i][j]
+		
+	return Kmatrix
+
+
+
+def _wl_edgekernel_do(Gn, node_label, edge_label, height):
+	"""Calculate Weisfeiler-Lehman edge kernels between graphs.
+	
+	Parameters
+	----------
+	Gn : List of NetworkX graph
+		List of graphs between which the kernels are calculated.	   
+	node_label : string
+		node attribute used as label.	  
+	edge_label : string
+		edge attribute used as label.	   
+	height : int
+		subtree height.
+		
+	Return
+	------
+	Kmatrix : Numpy matrix
+		Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
+	"""	  
+	pass
+	# init.
+	height = int(height)
+	Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel
+  
+	# initial for height = 0
+	for i in range(0, len(Gn)):
+		for j in range(i, len(Gn)):
+			for e1 in Gn[i].edges(data = True):
+				for e2 in Gn[j].edges(data = True):		  
+					if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
+						Kmatrix[i][j] += 1
+			Kmatrix[j][i] = Kmatrix[i][j]
+			
+	# iterate each height
+	for h in range(1, height + 1):
+		all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
+		num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
+		for G in Gn: # for each graph
+			set_multisets = []			
+			for node in G.nodes(data = True):
+				# Multiset-label determination.
+				multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ]
+				# sorting each multiset
+				multiset.sort()
+				multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix 
+				set_multisets.append(multiset)		  
+
+			# label compression
+			set_unique = list(set(set_multisets)) # set of unique multiset labels
+			# a dictionary mapping original labels to new ones. 
+			set_compressed = {}
+			# if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label 
+			for value in set_unique:
+				if value in all_set_compressed.keys():
+					set_compressed.update({ value : all_set_compressed[value] })
+				else:
+					set_compressed.update({ value : str(num_of_labels_occured + 1) })
+					num_of_labels_occured += 1
+
+			all_set_compressed.update(set_compressed)
+			
+			# relabel nodes
+			for node in G.nodes(data = True):
+				node[1][node_label] = set_compressed[set_multisets[node[0]]]
+				
+		# calculate subtree kernel with h iterations and add it to the final kernel
+		for i in range(0, len(Gn)):
+			for j in range(i, len(Gn)):
+				for e1 in Gn[i].edges(data = True):
+					for e2 in Gn[j].edges(data = True):		  
+						if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
+							Kmatrix[i][j] += 1
+				Kmatrix[j][i] = Kmatrix[i][j]
+		
+	return Kmatrix
+
+
+def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel):
+	"""Calculate Weisfeiler-Lehman kernels based on user-defined kernel between graphs.
+	
+	Parameters
+	----------
+	Gn : List of NetworkX graph
+		List of graphs between which the kernels are calculated.	   
+	node_label : string
+		node attribute used as label.	  
+	edge_label : string
+		edge attribute used as label.	   
+	height : int
+		subtree height.
+	base_kernel : string
+		Name of the base kernel function used in each iteration of WL kernel. This function returns a Numpy matrix, each element of which is the user-defined Weisfeiler-Lehman kernel between 2 praphs.
+		
+	Return
+	------
+	Kmatrix : Numpy matrix
+		Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
+	"""	  
+	pass
+	# init.
+	height = int(height)
+	Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel
+  
+	# initial for height = 0
+	Kmatrix = base_kernel(Gn, node_label, edge_label)
+			
+	# iterate each height
+	for h in range(1, height + 1):
+		all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
+		num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
+		for G in Gn: # for each graph
+			set_multisets = []		   
+			for node in G.nodes(data = True):
+				# Multiset-label determination.
+				multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ]
+				# sorting each multiset
+				multiset.sort()
+				multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix 
+				set_multisets.append(multiset)		  
+
+			# label compression
+			set_unique = list(set(set_multisets)) # set of unique multiset labels
+			# a dictionary mapping original labels to new ones. 
+			set_compressed = {}
+			# if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label 
+			for value in set_unique:
+				if value in all_set_compressed.keys():
+					set_compressed.update({ value : all_set_compressed[value] })
+				else:
+					set_compressed.update({ value : str(num_of_labels_occured + 1) })
+					num_of_labels_occured += 1
+
+			all_set_compressed.update(set_compressed)
+			
+			# relabel nodes
+			for node in G.nodes(data = True):
+				node[1][node_label] = set_compressed[set_multisets[node[0]]]
+				
+		# calculate kernel with h iterations and add it to the final kernel
+		Kmatrix += base_kernel(Gn, node_label, edge_label)
+		
+	return Kmatrix