@@ -0,0 +1,570 @@
"""
@author: linlin

@references:

    [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM.
    Weisfeiler-Lehman graph kernels. Journal of Machine Learning Research.
    2011;12(Sep):2539-61.
"""

import sys
from collections import Counter
from functools import partial
import time
#from multiprocessing import Pool
from tqdm import tqdm

import networkx as nx
import numpy as np

#from gklearn.kernels.pathKernel import pathkernel
from gklearn.utils.graphdataset import get_dataset_attributes
from gklearn.utils.parallel import parallel_gm


# @todo: support edge kernel, sp kernel, user-defined kernel.
def weisfeilerlehmankernel(*args,
                           node_label='atom',
                           edge_label='bond_type',
                           height=0,
                           base_kernel='subtree',
                           parallel=None,
                           n_jobs=None,
                           chunksize=None,
                           verbose=True):
    """Calculate Weisfeiler-Lehman kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.

    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.

    node_label : string
        Node attribute used as label. The default node label is 'atom'.

    edge_label : string
        Edge attribute used as label. The default edge label is 'bond_type'.

    height : int
        Subtree height.

    base_kernel : string
        Base kernel used in each iteration of the WL kernel. Only the default
        'subtree' kernel can be applied for now.

    parallel : None
        Parallelization method applied to the computation. No parallelization
        can be applied for now.

    n_jobs : int
        Number of jobs for parallelization. The default is to use all
        computational cores. This argument is only valid when one of the
        parallelization methods is applied and can be ignored for now.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between 2 graphs.

    Notes
    -----
    This function now supports the WL subtree kernel only.
    """
    # The default base kernel is the subtree kernel. For a user-defined kernel,
    # base_kernel is the base kernel function applied in each iteration of the
    # WL kernel; in that case each element of the returned matrix is the
    # user-defined Weisfeiler-Lehman kernel between 2 graphs.

    # pre-process
    base_kernel = base_kernel.lower()
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]  # arrange all graphs in a list
    Gn = [g.copy() for g in Gn]
    ds_attrs = get_dataset_attributes(Gn, attr_names=['node_labeled'],
                                      node_label=node_label)
    if not ds_attrs['node_labeled']:
        for G in Gn:
            nx.set_node_attributes(G, '0', node_label)

    start_time = time.time()

    # for WL subtree kernel
    if base_kernel == 'subtree':
        Kmatrix = _wl_kernel_do(Gn, node_label, edge_label, height, parallel,
                                n_jobs, chunksize, verbose)

    # for WL shortest path kernel
    elif base_kernel == 'sp':
        Kmatrix = _wl_spkernel_do(Gn, node_label, edge_label, height)

    # for WL edge kernel
    elif base_kernel == 'edge':
        Kmatrix = _wl_edgekernel_do(Gn, node_label, edge_label, height)

    # for user defined base kernel
    else:
        Kmatrix = _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---"
              % (base_kernel, len(Gn), run_time))

    return Kmatrix, run_time
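
# Usage sketch (illustrative, with hypothetical graphs G1 and G2): the function
# accepts either a list of graphs or exactly two graphs, and returns the kernel
# matrix together with the runtime, e.g.
#
#   Kmatrix, run_time = weisfeilerlehmankernel(G1, G2, node_label='atom', height=2)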


def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, chunksize, verbose):
    """Calculate Weisfeiler-Lehman subtree kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    height : int
        WL height (number of iterations).

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between 2 graphs.
    """
    height = int(height)
    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # initial for height = 0
    all_num_of_each_label = []  # number of occurrences of each label in each graph in this iteration

    # for each graph
    for G in Gn:
        # get the set of original labels
        labels_ori = list(nx.get_node_attributes(G, node_label).values())
        # number of occurrences of each label in G
        all_num_of_each_label.append(dict(Counter(labels_ori)))

    # calculate subtree kernel with the 0th iteration and add it to the final kernel
    compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, chunksize, False)

    # iterate each height
    for h in range(1, height + 1):
        all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
        num_of_labels_occured = 0  # number of distinct labels that have occurred as node labels at least once in all graphs
        all_num_of_each_label = []  # number of occurrences of each label in each graph
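        # For intuition (illustrative labels, not from the original source):
        # a node labeled 'C' whose neighbors are labeled 'O' and 'H' gets the
        # multiset label ('C', 'H', 'O') below, which is then compressed to a
        # short new label shared consistently across all graphs.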
        # @todo: parallelize this part.
        for G in Gn:

            all_multisets = []
            for node, attrs in G.nodes(data=True):
                # Multiset-label determination.
                multiset = [G.nodes[neighbor][node_label] for neighbor in G[node]]
                # sorting each multiset
                multiset.sort()
                multiset = [attrs[node_label]] + multiset  # add the prefix
                all_multisets.append(tuple(multiset))

            # label compression
            set_unique = list(set(all_multisets))  # set of unique multiset labels
            # a dictionary mapping original labels to new ones.
            set_compressed = {}
            # if a label occurred before, assign its former compressed label;
            # else assign the number of labels occurred + 1 as the compressed label.
            for value in set_unique:
                if value in all_set_compressed:
                    set_compressed.update({value: all_set_compressed[value]})
                else:
                    set_compressed.update({value: str(num_of_labels_occured + 1)})
                    num_of_labels_occured += 1

            all_set_compressed.update(set_compressed)

            # relabel nodes
            for idx, node in enumerate(G.nodes()):
                G.nodes[node][node_label] = set_compressed[all_multisets[idx]]

            # get the set of compressed labels
            labels_comp = list(nx.get_node_attributes(G, node_label).values())
            all_num_of_each_label.append(dict(Counter(labels_comp)))

        # calculate subtree kernel with h iterations and add it to the final kernel
        compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, chunksize, False)

    return Kmatrix


def wl_iteration(G, node_label):
    """Return the sorted multiset label of each node of G for one WL iteration."""
    all_multisets = []
    for node, attrs in G.nodes(data=True):
        # Multiset-label determination.
        multiset = [G.nodes[neighbor][node_label] for neighbor in G[node]]
        # sorting each multiset
        multiset.sort()
        multiset = [attrs[node_label]] + multiset  # add the prefix
        all_multisets.append(tuple(multiset))
    return all_multisets
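
# For intuition (illustrative labels): on a path graph whose three nodes carry
# the node_label values 'A', 'B', 'A', wl_iteration returns
# [('A', 'B'), ('B', 'A', 'A'), ('A', 'B')].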


def wrapper_wl_iteration(node_label, itr_item):
    g, i = itr_item
    all_multisets = wl_iteration(g, node_label)
    return i, all_multisets


def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, chunksize, verbose):
    """Compute the kernel matrix using the base kernel."""
    if parallel == 'imap_unordered':
        # compute kernels.
        def init_worker(alllabels_toshare):
            global G_alllabels
            G_alllabels = alllabels_toshare
        do_partial = partial(wrapper_compute_subtree_kernel, Kmatrix)
        parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                    glbv=(all_num_of_each_label,), n_jobs=n_jobs,
                    chunksize=chunksize, verbose=verbose)
    elif parallel is None:
        for i in range(len(Kmatrix)):
            for j in range(i, len(Kmatrix)):
                Kmatrix[i][j] = compute_subtree_kernel(all_num_of_each_label[i],
                                                       all_num_of_each_label[j],
                                                       Kmatrix[i][j])
                Kmatrix[j][i] = Kmatrix[i][j]
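
# Design note: init_worker shares the per-graph label histograms with worker
# processes via the module-level global G_alllabels, so the full list is not
# pickled again for every (i, j) pair dispatched by parallel_gm.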


def compute_subtree_kernel(num_of_each_label1, num_of_each_label2, kernel):
    """Compute the subtree kernel between two label-count histograms."""
    labels = set(list(num_of_each_label1.keys()) + list(num_of_each_label2.keys()))
    vector1 = np.array([num_of_each_label1.get(label, 0) for label in labels])
    vector2 = np.array([num_of_each_label2.get(label, 0) for label in labels])
    kernel += np.dot(vector1, vector2)
    return kernel
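
# For intuition (illustrative values): histograms {'1': 2, '2': 1} and
# {'1': 1, '3': 4} share only label '1', so the kernel increases by 2 * 1 = 2.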


def wrapper_compute_subtree_kernel(Kmatrix, itr):
    i, j = itr
    return i, j, compute_subtree_kernel(G_alllabels[i], G_alllabels[j], Kmatrix[i][j])


def _wl_spkernel_do(Gn, node_label, edge_label, height):
    """Calculate Weisfeiler-Lehman shortest path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    height : int
        Subtree height.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between 2 graphs.
    """
    from gklearn.utils.utils import getSPGraph

    # init.
    height = int(height)
    Kmatrix = np.zeros((len(Gn), len(Gn)))  # init kernel

    Gn = [getSPGraph(G, edge_weight=edge_label) for G in Gn]  # get shortest path graphs of Gn

    # initial for height = 0
    for i in range(0, len(Gn)):
        for j in range(i, len(Gn)):
            for e1 in Gn[i].edges(data=True):
                for e2 in Gn[j].edges(data=True):
                    if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
                        Kmatrix[i][j] += 1
            Kmatrix[j][i] = Kmatrix[i][j]

    # iterate each height
    for h in range(1, height + 1):
        all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
        num_of_labels_occured = 0  # number of distinct labels that have occurred as node labels at least once in all graphs
        for G in Gn:  # for each graph
            set_multisets = []
            for node in G.nodes(data=True):
                # Multiset-label determination.
                multiset = [G.nodes[neighbor][node_label] for neighbor in G[node[0]]]
                # sorting each multiset
                multiset.sort()
                multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
                set_multisets.append(multiset)

            # label compression
            set_unique = list(set(set_multisets))  # set of unique multiset labels
            # a dictionary mapping original labels to new ones.
            set_compressed = {}
            # if a label occurred before, assign its former compressed label;
            # else assign the number of labels occurred + 1 as the compressed label.
            for value in set_unique:
                if value in all_set_compressed:
                    set_compressed.update({value: all_set_compressed[value]})
                else:
                    set_compressed.update({value: str(num_of_labels_occured + 1)})
                    num_of_labels_occured += 1

            all_set_compressed.update(set_compressed)

            # relabel nodes
            for idx, node in enumerate(G.nodes(data=True)):
                node[1][node_label] = set_compressed[set_multisets[idx]]

        # calculate subtree kernel with h iterations and add it to the final kernel
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                for e1 in Gn[i].edges(data=True):
                    for e2 in Gn[j].edges(data=True):
                        if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
                            Kmatrix[i][j] += 1
                Kmatrix[j][i] = Kmatrix[i][j]

    return Kmatrix


def _wl_edgekernel_do(Gn, node_label, edge_label, height):
    """Calculate Weisfeiler-Lehman edge kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    height : int
        Subtree height.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between 2 graphs.
    """
    # init.
    height = int(height)
    Kmatrix = np.zeros((len(Gn), len(Gn)))  # init kernel

    # initial for height = 0
    for i in range(0, len(Gn)):
        for j in range(i, len(Gn)):
            for e1 in Gn[i].edges(data=True):
                for e2 in Gn[j].edges(data=True):
                    if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
                        Kmatrix[i][j] += 1
            Kmatrix[j][i] = Kmatrix[i][j]

    # iterate each height
    for h in range(1, height + 1):
        all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
        num_of_labels_occured = 0  # number of distinct labels that have occurred as node labels at least once in all graphs
        for G in Gn:  # for each graph
            set_multisets = []
            for node in G.nodes(data=True):
                # Multiset-label determination.
                multiset = [G.nodes[neighbor][node_label] for neighbor in G[node[0]]]
                # sorting each multiset
                multiset.sort()
                multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
                set_multisets.append(multiset)

            # label compression
            set_unique = list(set(set_multisets))  # set of unique multiset labels
            # a dictionary mapping original labels to new ones.
            set_compressed = {}
            # if a label occurred before, assign its former compressed label;
            # else assign the number of labels occurred + 1 as the compressed label.
            for value in set_unique:
                if value in all_set_compressed:
                    set_compressed.update({value: all_set_compressed[value]})
                else:
                    set_compressed.update({value: str(num_of_labels_occured + 1)})
                    num_of_labels_occured += 1

            all_set_compressed.update(set_compressed)

            # relabel nodes
            for idx, node in enumerate(G.nodes(data=True)):
                node[1][node_label] = set_compressed[set_multisets[idx]]

        # calculate subtree kernel with h iterations and add it to the final kernel
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                for e1 in Gn[i].edges(data=True):
                    for e2 in Gn[j].edges(data=True):
                        if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
                            Kmatrix[i][j] += 1
                Kmatrix[j][i] = Kmatrix[i][j]

    return Kmatrix


def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel):
    """Calculate Weisfeiler-Lehman kernels based on a user-defined base kernel between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    height : int
        Subtree height.
    base_kernel : function
        Base kernel function applied in each iteration of the WL kernel. It
        takes the graph list and the node and edge label names, and returns a
        Numpy matrix, each element of which is the user-defined base kernel
        between 2 graphs.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between 2 graphs.
    """
    # init.
    height = int(height)
    Kmatrix = np.zeros((len(Gn), len(Gn)))  # init kernel

    # initial for height = 0
    Kmatrix = base_kernel(Gn, node_label, edge_label)

    # iterate each height
    for h in range(1, height + 1):
        all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
        num_of_labels_occured = 0  # number of distinct labels that have occurred as node labels at least once in all graphs
        for G in Gn:  # for each graph
            set_multisets = []
            for node in G.nodes(data=True):
                # Multiset-label determination.
                multiset = [G.nodes[neighbor][node_label] for neighbor in G[node[0]]]
                # sorting each multiset
                multiset.sort()
                multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
                set_multisets.append(multiset)

            # label compression
            set_unique = list(set(set_multisets))  # set of unique multiset labels
            # a dictionary mapping original labels to new ones.
            set_compressed = {}
            # if a label occurred before, assign its former compressed label;
            # else assign the number of labels occurred + 1 as the compressed label.
            for value in set_unique:
                if value in all_set_compressed:
                    set_compressed.update({value: all_set_compressed[value]})
                else:
                    set_compressed.update({value: str(num_of_labels_occured + 1)})
                    num_of_labels_occured += 1

            all_set_compressed.update(set_compressed)

            # relabel nodes
            for idx, node in enumerate(G.nodes(data=True)):
                node[1][node_label] = set_compressed[set_multisets[idx]]

        # calculate kernel with h iterations and add it to the final kernel
        Kmatrix += base_kernel(Gn, node_label, edge_label)

    return Kmatrix
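

if __name__ == '__main__':
    # Minimal self-contained demo (a sketch, not part of the original module):
    # compare two tiny labeled graphs with the WL subtree kernel. The graphs
    # and their 'atom' labels are illustrative assumptions.
    G1 = nx.path_graph(3)
    nx.set_node_attributes(G1, {0: 'C', 1: 'O', 2: 'C'}, 'atom')
    G2 = nx.cycle_graph(3)
    nx.set_node_attributes(G2, {0: 'C', 1: 'C', 2: 'O'}, 'atom')
    Kmatrix, run_time = weisfeilerlehmankernel([G1, G2], node_label='atom',
                                               height=2, base_kernel='subtree')
    print(Kmatrix)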