  1. """
  2. @author: linlin
  3. @references:
  4. [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM.
  5. Weisfeiler-lehman graph kernels. Journal of Machine Learning Research.
  6. 2011;12(Sep):2539-61.
  7. """
  8. import sys
  9. from collections import Counter
  10. from functools import partial
  11. import time
  12. #from multiprocessing import Pool
  13. from tqdm import tqdm
  14. import networkx as nx
  15. import numpy as np
  16. #from gklearn.kernels.pathKernel import pathkernel
  17. from gklearn.utils.graphdataset import get_dataset_attributes
  18. from gklearn.utils.parallel import parallel_gm
  19. # @todo: support edge kernel, sp kernel, user-defined kernel.
def weisfeilerlehmankernel(*args,
                           node_label='atom',
                           edge_label='bond_type',
                           height=0,
                           base_kernel='subtree',
                           parallel=None,
                           n_jobs=None,
                           verbose=True):
    """Calculate Weisfeiler-Lehman kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is 'atom'.
    edge_label : string
        Edge attribute used as label. The default edge label is 'bond_type'.
    height : int
        Subtree height.
    base_kernel : string
        Base kernel used in each iteration of the WL kernel. Only the default
        'subtree' kernel can be applied for now.
    parallel : string or None
        Parallelization method applied to the computation: 'imap_unordered'
        (only supported by the 'subtree' base kernel) or None for serial
        computation.
    n_jobs : int
        Number of jobs for parallelization. The default is to use all
        computational cores. This argument is only valid when a
        parallelization method is applied.
    verbose : bool
        Whether to print progress and timing information.

    Returns
    -------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between two graphs.
    run_time : float
        Time spent building the kernel matrix, in seconds.

    Notes
    -----
    This function now supports the WL subtree kernel only.
    """
    # The default base kernel is the subtree kernel. For a user-defined
    # kernel, base_kernel is the base kernel function applied in each
    # iteration of the WL kernel; it must return a Numpy matrix, each element
    # of which is the user-defined base kernel between two graphs.

    # pre-process
    base_kernel = base_kernel.lower()
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]  # arrange all graphs in a list
    Gn = [g.copy() for g in Gn]
    ds_attrs = get_dataset_attributes(Gn, attr_names=['node_labeled'],
                                      node_label=node_label)
    if not ds_attrs['node_labeled']:
        for G in Gn:
            nx.set_node_attributes(G, '0', node_label)

    start_time = time.time()

    # for WL subtree kernel
    if base_kernel == 'subtree':
        Kmatrix = _wl_kernel_do(Gn, node_label, edge_label, height, parallel,
                                n_jobs, verbose)
    # for WL shortest path kernel
    elif base_kernel == 'sp':
        Kmatrix = _wl_spkernel_do(Gn, node_label, edge_label, height)
    # for WL edge kernel
    elif base_kernel == 'edge':
        Kmatrix = _wl_edgekernel_do(Gn, node_label, edge_label, height)
    # for user-defined base kernel
    else:
        Kmatrix = _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---"
              % (base_kernel, len(Gn), run_time))

    return Kmatrix, run_time
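
# A minimal usage sketch (Gn, G1 and G2 are placeholders for node-labeled
# NetworkX graphs supplied by the caller). The kernel matrix and the run time
# are returned together:
#
#     Kmatrix, run_time = weisfeilerlehmankernel(Gn, node_label='atom',
#                                                height=2, base_kernel='subtree')
#
# Passing exactly two graphs instead of a list yields a 2x2 matrix:
#
#     Kmatrix, run_time = weisfeilerlehmankernel(G1, G2, node_label='atom', height=2)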
def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose):
    """Calculate Weisfeiler-Lehman subtree kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    height : int
        WL height.

    Returns
    -------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between two graphs.
    """
    height = int(height)
    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # initial for height = 0
    all_num_of_each_label = []  # number of occurrences of each label in each graph in this iteration
    # for each graph
    for G in Gn:
        # get the set of original labels
        labels_ori = list(nx.get_node_attributes(G, node_label).values())
        # number of occurrences of each label in G
        all_num_of_each_label.append(dict(Counter(labels_ori)))
    # calculate subtree kernel with the 0th iteration and add it to the final kernel
    compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, False)

    # iterate each height
    for h in range(1, height + 1):
        all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
        num_of_labels_occurred = 0  # number of distinct labels that have occurred as node labels at least once in all graphs
        all_num_of_each_label = []  # number of occurrences of each label in G
        # @todo: parallel this part.
        for G in Gn:
            all_multisets = []
            for node, attrs in G.nodes(data=True):
                # Multiset-label determination.
                multiset = [G.nodes[neighbor][node_label] for neighbor in G[node]]
                # sorting each multiset
                multiset.sort()
                multiset = [attrs[node_label]] + multiset  # add the prefix
                all_multisets.append(tuple(multiset))

            # label compression
            set_unique = list(set(all_multisets))  # set of unique multiset labels
            # a dictionary mapping original labels to new ones.
            set_compressed = {}
            # if a label occurred before, assign its former compressed label;
            # else assign the number of labels occurred + 1 as the compressed label.
            for value in set_unique:
                if value in all_set_compressed:
                    set_compressed[value] = all_set_compressed[value]
                else:
                    set_compressed[value] = str(num_of_labels_occurred + 1)
                    num_of_labels_occurred += 1

            all_set_compressed.update(set_compressed)

            # relabel nodes
            for idx, node in enumerate(G.nodes()):
                G.nodes[node][node_label] = set_compressed[all_multisets[idx]]

            # get the set of compressed labels
            labels_comp = list(nx.get_node_attributes(G, node_label).values())
            all_num_of_each_label.append(dict(Counter(labels_comp)))

        # calculate subtree kernel with h iterations and add it to the final kernel
        compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, False)

    return Kmatrix
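
# How compression stays consistent across graphs (a hypothetical trace):
# within one iteration, all_set_compressed is shared by every graph. If graph 1
# produces the multisets ('C', 'O') and ('O', 'C', 'C'), they may be compressed
# to '1' and '2'. When graph 2 later produces ('C', 'O') again, it reuses '1';
# a multiset not seen before, say ('O', 'O'), gets the next fresh label '3'.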
def wl_iteration(G, node_label):
    """Compute the sorted multiset label of every node of G for one WL iteration."""
    all_multisets = []
    for node, attrs in G.nodes(data=True):
        # Multiset-label determination.
        multiset = [G.nodes[neighbor][node_label] for neighbor in G[node]]
        # sorting each multiset
        multiset.sort()
        multiset = [attrs[node_label]] + multiset  # add the prefix
        all_multisets.append(tuple(multiset))
    return all_multisets
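
# A worked toy example (hypothetical labels): for a path graph a-b-c with node
# labels 'C', 'O', 'C' stored under node_label, wl_iteration returns one sorted
# multiset tuple per node, prefixed by the node's own label:
#
#     [('C', 'O'), ('O', 'C', 'C'), ('C', 'O')]
#
# Identical tuples then receive the same compressed label in _wl_kernel_do.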
def wrapper_wl_iteration(node_label, itr_item):
    g, i = itr_item
    all_multisets = wl_iteration(g, node_label)
    return i, all_multisets
def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, verbose):
    """Compute the kernel matrix using the base kernel."""
    if parallel == 'imap_unordered':
        # compute kernels.
        def init_worker(alllabels_toshare):
            global G_alllabels
            G_alllabels = alllabels_toshare
        do_partial = partial(wrapper_compute_subtree_kernel, Kmatrix)
        parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                    glbv=(all_num_of_each_label,), n_jobs=n_jobs, verbose=verbose)
    elif parallel is None:
        for i in range(len(Kmatrix)):
            for j in range(i, len(Kmatrix)):
                Kmatrix[i][j] = compute_subtree_kernel(all_num_of_each_label[i],
                                                       all_num_of_each_label[j],
                                                       Kmatrix[i][j])
                Kmatrix[j][i] = Kmatrix[i][j]
def compute_subtree_kernel(num_of_each_label1, num_of_each_label2, kernel):
    """Compute the subtree kernel between two graphs from their per-label node counts."""
    labels = set(list(num_of_each_label1.keys()) + list(num_of_each_label2.keys()))
    vector1 = np.array([num_of_each_label1.get(label, 0) for label in labels])
    vector2 = np.array([num_of_each_label2.get(label, 0) for label in labels])
    kernel += np.dot(vector1, vector2)
    return kernel
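
# A worked toy example (hypothetical counts): with label counts
# {'1': 2, '2': 1} and {'1': 1, '2': 3}, the feature vectors over the label
# set {'1', '2'} are [2, 1] and [1, 3], so the kernel increment is
# 2 * 1 + 1 * 3 = 5.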
def wrapper_compute_subtree_kernel(Kmatrix, itr):
    i, j = itr
    return i, j, compute_subtree_kernel(G_alllabels[i], G_alllabels[j], Kmatrix[i][j])
def _wl_spkernel_do(Gn, node_label, edge_label, height):
    """Calculate Weisfeiler-Lehman shortest path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    height : int
        Subtree height.

    Returns
    -------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between two graphs.

    Notes
    -----
    This base kernel is not supported by weisfeilerlehmankernel yet; see the
    @todo note at the top of this module.
    """
    from gklearn.utils.utils import getSPGraph

    # init.
    height = int(height)
    Kmatrix = np.zeros((len(Gn), len(Gn)))  # init kernel
    Gn = [getSPGraph(G, edge_weight=edge_label) for G in Gn]  # get shortest path graphs of Gn

    # initial for height = 0
    for i in range(0, len(Gn)):
        for j in range(i, len(Gn)):
            for e1 in Gn[i].edges(data=True):
                for e2 in Gn[j].edges(data=True):
                    if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and \
                       ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
                        Kmatrix[i][j] += 1
            Kmatrix[j][i] = Kmatrix[i][j]

    # iterate each height
    for h in range(1, height + 1):
        all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
        num_of_labels_occurred = 0  # number of distinct labels that have occurred as node labels at least once in all graphs
        for G in Gn:  # for each graph
            set_multisets = []
            for node in G.nodes(data=True):
                # Multiset-label determination.
                multiset = [G.nodes[neighbor][node_label] for neighbor in G[node[0]]]
                # sorting each multiset
                multiset.sort()
                multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
                set_multisets.append(multiset)

            # label compression
            set_unique = list(set(set_multisets))  # set of unique multiset labels
            # a dictionary mapping original labels to new ones.
            set_compressed = {}
            # if a label occurred before, assign its former compressed label;
            # else assign the number of labels occurred + 1 as the compressed label.
            for value in set_unique:
                if value in all_set_compressed:
                    set_compressed[value] = all_set_compressed[value]
                else:
                    set_compressed[value] = str(num_of_labels_occurred + 1)
                    num_of_labels_occurred += 1

            all_set_compressed.update(set_compressed)

            # relabel nodes
            for idx, node in enumerate(G.nodes(data=True)):
                node[1][node_label] = set_compressed[set_multisets[idx]]

        # calculate subtree kernel with h iterations and add it to the final kernel
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                for e1 in Gn[i].edges(data=True):
                    for e2 in Gn[j].edges(data=True):
                        if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and \
                           ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
                            Kmatrix[i][j] += 1
                Kmatrix[j][i] = Kmatrix[i][j]

    return Kmatrix
def _wl_edgekernel_do(Gn, node_label, edge_label, height):
    """Calculate Weisfeiler-Lehman edge kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    height : int
        Subtree height.

    Returns
    -------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between two graphs.

    Notes
    -----
    This base kernel is not supported by weisfeilerlehmankernel yet; see the
    @todo note at the top of this module.
    """
    # init.
    height = int(height)
    Kmatrix = np.zeros((len(Gn), len(Gn)))  # init kernel

    # initial for height = 0
    for i in range(0, len(Gn)):
        for j in range(i, len(Gn)):
            for e1 in Gn[i].edges(data=True):
                for e2 in Gn[j].edges(data=True):
                    if e1[2][edge_label] == e2[2][edge_label] and \
                       ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
                        Kmatrix[i][j] += 1
            Kmatrix[j][i] = Kmatrix[i][j]

    # iterate each height
    for h in range(1, height + 1):
        all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
        num_of_labels_occurred = 0  # number of distinct labels that have occurred as node labels at least once in all graphs
        for G in Gn:  # for each graph
            set_multisets = []
            for node in G.nodes(data=True):
                # Multiset-label determination.
                multiset = [G.nodes[neighbor][node_label] for neighbor in G[node[0]]]
                # sorting each multiset
                multiset.sort()
                multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
                set_multisets.append(multiset)

            # label compression
            set_unique = list(set(set_multisets))  # set of unique multiset labels
            # a dictionary mapping original labels to new ones.
            set_compressed = {}
            # if a label occurred before, assign its former compressed label;
            # else assign the number of labels occurred + 1 as the compressed label.
            for value in set_unique:
                if value in all_set_compressed:
                    set_compressed[value] = all_set_compressed[value]
                else:
                    set_compressed[value] = str(num_of_labels_occurred + 1)
                    num_of_labels_occurred += 1

            all_set_compressed.update(set_compressed)

            # relabel nodes
            for idx, node in enumerate(G.nodes(data=True)):
                node[1][node_label] = set_compressed[set_multisets[idx]]

        # calculate subtree kernel with h iterations and add it to the final kernel
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                for e1 in Gn[i].edges(data=True):
                    for e2 in Gn[j].edges(data=True):
                        if e1[2][edge_label] == e2[2][edge_label] and \
                           ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
                            Kmatrix[i][j] += 1
                Kmatrix[j][i] = Kmatrix[i][j]

    return Kmatrix
def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel):
    """Calculate Weisfeiler-Lehman kernels based on a user-defined base kernel between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    height : int
        Subtree height.
    base_kernel : function
        Base kernel applied in each iteration of the WL kernel. It is called
        as base_kernel(Gn, node_label, edge_label) and must return a Numpy
        matrix, each element of which is the user-defined base kernel between
        two graphs.

    Returns
    -------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between two graphs.

    Notes
    -----
    This base kernel is not supported by weisfeilerlehmankernel yet; see the
    @todo note at the top of this module.
    """
    # init.
    height = int(height)

    # initial for height = 0
    Kmatrix = base_kernel(Gn, node_label, edge_label)

    # iterate each height
    for h in range(1, height + 1):
        all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
        num_of_labels_occurred = 0  # number of distinct labels that have occurred as node labels at least once in all graphs
        for G in Gn:  # for each graph
            set_multisets = []
            for node in G.nodes(data=True):
                # Multiset-label determination.
                multiset = [G.nodes[neighbor][node_label] for neighbor in G[node[0]]]
                # sorting each multiset
                multiset.sort()
                multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
                set_multisets.append(multiset)

            # label compression
            set_unique = list(set(set_multisets))  # set of unique multiset labels
            # a dictionary mapping original labels to new ones.
            set_compressed = {}
            # if a label occurred before, assign its former compressed label;
            # else assign the number of labels occurred + 1 as the compressed label.
            for value in set_unique:
                if value in all_set_compressed:
                    set_compressed[value] = all_set_compressed[value]
                else:
                    set_compressed[value] = str(num_of_labels_occurred + 1)
                    num_of_labels_occurred += 1

            all_set_compressed.update(set_compressed)

            # relabel nodes
            for idx, node in enumerate(G.nodes(data=True)):
                node[1][node_label] = set_compressed[set_multisets[idx]]

        # calculate kernel with h iterations and add it to the final kernel
        Kmatrix += base_kernel(Gn, node_label, edge_label)

    return Kmatrix
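

# A minimal, self-contained demo (an illustrative sketch, not part of the
# library's test suite): build two small node-labeled graphs and compute their
# WL subtree kernel matrix with two refinement iterations.
if __name__ == '__main__':
    G1 = nx.Graph()
    G1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})])
    G1.add_edges_from([(0, 1), (1, 2)])

    G2 = nx.Graph()
    G2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'O'})])
    G2.add_edges_from([(0, 1), (1, 2)])

    Kmatrix, run_time = weisfeilerlehmankernel(
        [G1, G2], node_label='atom', height=2, base_kernel='subtree')
    print(Kmatrix)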
