- """
- @author: linlin
- @references:
- [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM.
- Weisfeiler-Lehman graph kernels. Journal of Machine Learning Research.
- 2011;12(Sep):2539-61.
- """
-
- import sys
- from collections import Counter
- sys.path.insert(0, "../")
- from functools import partial
- import time
- from tqdm import tqdm
-
- import networkx as nx
- import numpy as np
-
- from pygraph.utils.graphdataset import get_dataset_attributes
- from pygraph.utils.parallel import parallel_gm
-
- # @todo: support edge kernel, sp kernel, user-defined kernel.
- def weisfeilerlehmankernel(*args,
- node_label='atom',
- edge_label='bond_type',
- height=0,
- base_kernel='subtree',
- parallel=None,
- n_jobs=None,
- verbose=True):
- """Calculate Weisfeiler-Lehman kernels between graphs.
-
- Parameters
- ----------
- Gn : List of NetworkX graph
- List of graphs between which the kernels are calculated.
- /
- G1, G2 : NetworkX graphs
- 2 graphs between which the kernel is calculated.
- node_label : string
- node attribute used as label. The default node label is atom.
- edge_label : string
- edge attribute used as label. The default edge label is bond_type.
- height : int
- Subtree height, i.e., the number of Weisfeiler-Lehman relabeling iterations.
- base_kernel : string or function
- Base kernel used in each iteration of the WL kernel. The default is the subtree kernel. For a user-defined kernel, pass the base kernel function itself; it is called in each iteration as base_kernel(Gn, node_label, edge_label) and must return a Numpy matrix, each element of which is the base kernel between two graphs.
-
- Return
- ------
- Kmatrix : Numpy matrix
- Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between two graphs.
-
- Notes
- -----
- Currently only the WL subtree base kernel is fully supported; the shortest path, edge and user-defined variants are experimental (see the @todo above).
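-
- Examples
- --------
- A minimal usage sketch (the two toy graphs and their 'atom' labels below are purely illustrative):
-
- >>> import networkx as nx
- >>> G1 = nx.path_graph(3)
- >>> nx.set_node_attributes(G1, {0: 'C', 1: 'O', 2: 'C'}, 'atom')
- >>> G2 = nx.cycle_graph(3)
- >>> nx.set_node_attributes(G2, {0: 'C', 1: 'C', 2: 'O'}, 'atom')
- >>> Kmatrix, run_time = weisfeilerlehmankernel([G1, G2], node_label='atom',
- ... height=2, base_kernel='subtree', verbose=False)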
- """
- # pre-process
- if isinstance(base_kernel, str): # a user-defined base kernel may be passed as a callable
- base_kernel = base_kernel.lower()
- Gn = args[0] if len(args) == 1 else [args[0], args[1]] # arrange all graphs in a list
- Gn = [g.copy() for g in Gn]
- ds_attrs = get_dataset_attributes(Gn, attr_names=['node_labeled'],
- node_label=node_label)
- if not ds_attrs['node_labeled']:
- for G in Gn:
- nx.set_node_attributes(G, '0', node_label) # give unlabeled nodes a dummy label
-
- start_time = time.time()
-
- # for WL subtree kernel
- if base_kernel == 'subtree':
- Kmatrix = _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose)
-
- # for WL shortest path kernel
- elif base_kernel == 'sp':
- Kmatrix = _wl_spkernel_do(Gn, node_label, edge_label, height)
-
- # for WL edge kernel
- elif base_kernel == 'edge':
- Kmatrix = _wl_edgekernel_do(Gn, node_label, edge_label, height)
-
- # for user defined base kernel
- else:
- Kmatrix = _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel)
-
- run_time = time.time() - start_time
- if verbose:
- print("\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---"
- % (base_kernel, len(Gn), run_time))
-
- return Kmatrix, run_time
-
-
- def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose):
- """Calculate Weisfeiler-Lehman kernels between graphs.
-
- Parameters
- ----------
- Gn : List of NetworkX graph
- List of graphs between which the kernels are calculated.
- node_label : string
- node attribute used as label.
- edge_label : string
- edge attribute used as label.
- height : int
- WL height, i.e., the number of relabeling iterations.
-
- Return
- ------
- Kmatrix : Numpy matrix
- Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between two graphs.
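-
- Notes
- -----
- At each iteration h = 0, 1, ..., height, the subtree kernel adds to K(G1, G2) the linear kernel between the two graphs' label-count vectors, i.e. the sum over all compressed labels l of c1_h(l) * c2_h(l), where ci_h(l) is the number of nodes of Gi carrying label l after h relabeling steps (cf. [1]).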
- """
- height = int(height)
- Kmatrix = np.zeros((len(Gn), len(Gn)))
-
- # initial for height = 0
- all_num_of_each_label = [] # number of occurrences of each label in each graph in this iteration
-
- # for each graph
- for G in Gn:
- # get the set of original labels
- labels_ori = list(nx.get_node_attributes(G, node_label).values())
- # number of occurrences of each label in G
- all_num_of_each_label.append(dict(Counter(labels_ori)))
-
- # calculate subtree kernel with the 0th iteration and add it to the final kernel
- compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, False)
-
- # iterate each height
- for h in range(1, height + 1):
- all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
- num_of_labels_occured = 0 # number of compressed labels assigned so far in this iteration, across all graphs
- all_num_of_each_label = [] # number of occurrences of each label in each graph
-
- # @todo: parallel this part.
- for G in Gn:
-
- all_multisets = []
- for node, attrs in G.nodes(data=True):
- # Multiset-label determination.
- multiset = [G.nodes[neighbors][node_label] for neighbors in G[node]]
- # sorting each multiset
- multiset.sort()
- multiset = [attrs[node_label]] + multiset # add the prefix
- all_multisets.append(tuple(multiset))
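- # e.g. a node labelled 'C' with neighbours labelled 'O' and 'H' gets the multiset label ('C', 'H', 'O'): own label first, neighbour labels sorted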
-
- # label compression
- set_unique = list(set(all_multisets)) # set of unique multiset labels
- # a dictionary mapping original labels to new ones.
- set_compressed = {}
- # if a label occurred before, assign its former compressed label,
- # else assign (number of labels assigned so far + 1) as the compressed label.
- for value in set_unique:
- if value in all_set_compressed.keys():
- set_compressed.update({value: all_set_compressed[value]})
- else:
- set_compressed.update({value: str(num_of_labels_occured + 1)})
- num_of_labels_occured += 1
-
- all_set_compressed.update(set_compressed)
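- # merging into the global dictionary ensures that identical neighbourhoods receive the same compressed label in every graph of this iteration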
-
- # relabel nodes
- for idx, node in enumerate(G.nodes()):
- G.nodes[node][node_label] = set_compressed[all_multisets[idx]]
-
- # get the set of compressed labels
- labels_comp = list(nx.get_node_attributes(G, node_label).values())
- # all_labels_ori.update(labels_comp)
- all_num_of_each_label.append(dict(Counter(labels_comp)))
-
- # calculate subtree kernel with h iterations and add it to the final kernel
- compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, False)
-
- return Kmatrix
-
-
- def wl_iteration(G, node_label):
- """Return the multiset label of every node of G for one WL iteration, in G.nodes() order."""
- all_multisets = []
- for node, attrs in G.nodes(data=True):
- # Multiset-label determination.
- multiset = [G.nodes[neighbors][node_label] for neighbors in G[node]]
- # sorting each multiset
- multiset.sort()
- multiset = [attrs[node_label]] + multiset # add the prefix
- all_multisets.append(tuple(multiset))
- return all_multisets
-
-
- def wrapper_wl_iteration(node_label, itr_item):
- """Apply wl_iteration to a (graph, index) pair, e.g. from a multiprocessing pool."""
- g = itr_item[0]
- i = itr_item[1]
- all_multisets = wl_iteration(g, node_label)
- return i, all_multisets
-
-
- def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, verbose):
- """Compute kernel matrix using the base kernel.
- """
- if parallel == 'imap_unordered':
- # compute kernels.
- def init_worker(alllabels_toshare):
- global G_alllabels
- G_alllabels = alllabels_toshare
- do_partial = partial(wrapper_compute_subtree_kernel, Kmatrix)
- parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
- glbv=(all_num_of_each_label,), n_jobs=n_jobs, verbose=verbose)
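- # parallel_gm is expected to fill Kmatrix pairwise from the (i, j, value) triples returned by the wrapper; each worker reads the shared label counts through the module-level G_alllabels set in init_worker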
- elif parallel is None:
- for i in range(len(Kmatrix)):
- for j in range(i, len(Kmatrix)):
- Kmatrix[i][j] = compute_subtree_kernel(all_num_of_each_label[i],
- all_num_of_each_label[j], Kmatrix[i][j])
- Kmatrix[j][i] = Kmatrix[i][j]
-
-
- def compute_subtree_kernel(num_of_each_label1, num_of_each_label2, kernel):
- """Compute the subtree kernel.
- """
- labels = set(list(num_of_each_label1.keys()) + list(num_of_each_label2.keys()))
- vector1 = np.array([(num_of_each_label1[label]
- if (label in num_of_each_label1.keys()) else 0)
- for label in labels])
- vector2 = np.array([(num_of_each_label2[label]
- if (label in num_of_each_label2.keys()) else 0)
- for label in labels])
- kernel += np.dot(vector1, vector2)
- return kernel
-
-
- def wrapper_compute_subtree_kernel(Kmatrix, itr):
- i = itr[0]
- j = itr[1]
- return i, j, compute_subtree_kernel(G_alllabels[i], G_alllabels[j], Kmatrix[i][j])
-
-
- def _wl_spkernel_do(Gn, node_label, edge_label, height):
- """Calculate Weisfeiler-Lehman shortest path kernels between graphs.
-
- Parameters
- ----------
- Gn : List of NetworkX graph
- List of graphs between which the kernels are calculated.
- node_label : string
- node attribute used as label.
- edge_label : string
- edge attribute used as label.
- height : int
- subtree height.
-
- Return
- ------
- Kmatrix : Numpy matrix
- Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between two graphs.
- """
- from pygraph.utils.utils import getSPGraph
-
- # init.
- height = int(height)
- Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel
-
- Gn = [ getSPGraph(G, edge_weight = edge_label) for G in Gn ] # get shortest path graphs of Gn
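- # each shortest-path graph is expected to join node pairs by an edge whose 'cost' attribute is their shortest-path distance (see getSPGraph)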
-
- # initial for height = 0
- for i in range(0, len(Gn)):
- for j in range(i, len(Gn)):
- for e1 in Gn[i].edges(data = True):
- for e2 in Gn[j].edges(data = True):
- if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
- Kmatrix[i][j] += 1
- Kmatrix[j][i] = Kmatrix[i][j]
-
- # iterate each height
- for h in range(1, height + 1):
- all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
- num_of_labels_occured = 0 # number of compressed labels assigned so far in this iteration, across all graphs
- for G in Gn: # for each graph
- set_multisets = []
- for node in G.nodes(data = True):
- # Multiset-label determination.
- multiset = [ G.nodes[neighbors][node_label] for neighbors in G[node[0]] ]
- # sorting each multiset
- multiset.sort()
- multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
- set_multisets.append(multiset)
-
- # label compression
- set_unique = list(set(set_multisets)) # set of unique multiset labels
- # a dictionary mapping original labels to new ones.
- set_compressed = {}
- # if a label occurred before, assign its former compressed label, else assign (number of labels assigned so far + 1) as the compressed label
- for value in set_unique:
- if value in all_set_compressed.keys():
- set_compressed.update({ value : all_set_compressed[value] })
- else:
- set_compressed.update({ value : str(num_of_labels_occured + 1) })
- num_of_labels_occured += 1
-
- all_set_compressed.update(set_compressed)
-
- # relabel nodes
- for idx, node in enumerate(G.nodes(data = True)):
- node[1][node_label] = set_compressed[set_multisets[idx]]
-
- # calculate subtree kernel with h iterations and add it to the final kernel
- for i in range(0, len(Gn)):
- for j in range(i, len(Gn)):
- for e1 in Gn[i].edges(data = True):
- for e2 in Gn[j].edges(data = True):
- if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
- Kmatrix[i][j] += 1
- Kmatrix[j][i] = Kmatrix[i][j]
-
- return Kmatrix
-
-
-
- def _wl_edgekernel_do(Gn, node_label, edge_label, height):
- """Calculate Weisfeiler-Lehman edge kernels between graphs.
-
- Parameters
- ----------
- Gn : List of NetworkX graph
- List of graphs between which the kernels are calculated.
- node_label : string
- node attribute used as label.
- edge_label : string
- edge attribute used as label.
- height : int
- subtree height.
-
- Return
- ------
- Kmatrix : Numpy matrix
- Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between two graphs.
- """
- # init.
- height = int(height)
- Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel
-
- # initial for height = 0
- for i in range(0, len(Gn)):
- for j in range(i, len(Gn)):
- for e1 in Gn[i].edges(data = True):
- for e2 in Gn[j].edges(data = True):
- if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
- Kmatrix[i][j] += 1
- Kmatrix[j][i] = Kmatrix[i][j]
-
- # iterate each height
- for h in range(1, height + 1):
- all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
- num_of_labels_occured = 0 # number of compressed labels assigned so far in this iteration, across all graphs
- for G in Gn: # for each graph
- set_multisets = []
- for node in G.nodes(data = True):
- # Multiset-label determination.
- multiset = [ G.nodes[neighbors][node_label] for neighbors in G[node[0]] ]
- # sorting each multiset
- multiset.sort()
- multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
- set_multisets.append(multiset)
-
- # label compression
- set_unique = list(set(set_multisets)) # set of unique multiset labels
- # a dictionary mapping original labels to new ones.
- set_compressed = {}
- # if a label occurred before, assign its former compressed label, else assign (number of labels assigned so far + 1) as the compressed label
- for value in set_unique:
- if value in all_set_compressed.keys():
- set_compressed.update({ value : all_set_compressed[value] })
- else:
- set_compressed.update({ value : str(num_of_labels_occured + 1) })
- num_of_labels_occured += 1
-
- all_set_compressed.update(set_compressed)
-
- # relabel nodes
- for idx, node in enumerate(G.nodes(data = True)):
- node[1][node_label] = set_compressed[set_multisets[idx]]
-
- # calculate subtree kernel with h iterations and add it to the final kernel
- for i in range(0, len(Gn)):
- for j in range(i, len(Gn)):
- for e1 in Gn[i].edges(data = True):
- for e2 in Gn[j].edges(data = True):
- if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
- Kmatrix[i][j] += 1
- Kmatrix[j][i] = Kmatrix[i][j]
-
- return Kmatrix
-
-
- def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel):
- """Calculate Weisfeiler-Lehman kernels based on user-defined kernel between graphs.
-
- Parameters
- ----------
- Gn : List of NetworkX graph
- List of graphs between which the kernels are calculated.
- node_label : string
- node attribute used as label.
- edge_label : string
- edge attribute used as label.
- height : int
- subtree height.
- base_kernel : function
- The base kernel function applied in each iteration of the WL kernel. It is called as base_kernel(Gn, node_label, edge_label) and must return a Numpy matrix, each element of which is the user-defined base kernel between two graphs.
-
- Return
- ------
- Kmatrix : Numpy matrix
- Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between two graphs.
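-
- Notes
- -----
- A user-defined base kernel is any callable of the form my_base_kernel(Gn, node_label, edge_label) returning an (n, n) Numpy array; it is invoked once per WL iteration on the relabeled graphs (the name my_base_kernel is illustrative).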
- """
- # init.
- height = int(height)
-
- # initial for height = 0
- Kmatrix = base_kernel(Gn, node_label, edge_label)
-
- # iterate each height
- for h in range(1, height + 1):
- all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
- num_of_labels_occured = 0 # number of compressed labels assigned so far in this iteration, across all graphs
- for G in Gn: # for each graph
- set_multisets = []
- for node in G.nodes(data = True):
- # Multiset-label determination.
- multiset = [ G.nodes[neighbors][node_label] for neighbors in G[node[0]] ]
- # sorting each multiset
- multiset.sort()
- multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
- set_multisets.append(multiset)
-
- # label compression
- set_unique = list(set(set_multisets)) # set of unique multiset labels
- # a dictionary mapping original labels to new ones.
- set_compressed = {}
- # if a label occurred before, assign its former compressed label, else assign (number of labels assigned so far + 1) as the compressed label
- for value in set_unique:
- if value in all_set_compressed.keys():
- set_compressed.update({ value : all_set_compressed[value] })
- else:
- set_compressed.update({ value : str(num_of_labels_occured + 1) })
- num_of_labels_occured += 1
-
- all_set_compressed.update(set_compressed)
-
- # relabel nodes
- for idx, node in enumerate(G.nodes(data = True)):
- node[1][node_label] = set_compressed[set_multisets[idx]]
-
- # calculate kernel with h iterations and add it to the final kernel
- Kmatrix += base_kernel(Gn, node_label, edge_label)
-
- return Kmatrix