@@ -12,8 +12,8 @@ import matplotlib.pyplot as plt
 from numpy.linalg import eig
 # read gram matrices from file.
-results_dir = 'results/marginalizedkernel'
-ds_name = 'Letter-med'
+results_dir = 'results/marginalizedkernel/myria'
+ds_name = 'ENZYMES'
 gmfile = np.load(results_dir + '/' + ds_name + '.gm.npz')
 #print('gm time: ', gmfile['gmtime'])
 # a list to store gram matrices for all param_grid_precomputed
@@ -12,17 +12,17 @@ import multiprocessing
 from pygraph.kernels.marginalizedKernel import marginalizedkernel
 dslist = [
-    # {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
-    #  'task': 'regression'},  # node symb
-    # {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
-    #  'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', },
-    # # contains single node graph, node symb
-    # {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', },  # node/edge symb
-    # {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', },  # unlabeled
-    # {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
-    #  'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}},  # node/edge symb
-    # {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
-    # # node nsymb
+    {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
+     'task': 'regression'},  # node symb
+    {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
+     'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', },
+    # contains single node graph, node symb
+    {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', },  # node/edge symb
+    {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', },  # unlabeled
+    {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
+     'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}},  # node/edge symb
+    {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
+    # node nsymb
     {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
     # node symb/nsymb
     # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
@@ -81,5 +81,5 @@ for ds in dslist:
         extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
         ds_name=ds['name'],
         n_jobs=multiprocessing.cpu_count(),
-        read_gm_from_file=True)
+        read_gm_from_file=False)
     print()
@@ -65,7 +65,7 @@ param_grid_precomputed = {'node_kernels':
     [{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}],
     'edge_kernels':
     [{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}],
-    'compute_method': ['trie']}
+    'compute_method': ['naive']}
 param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
               {'alpha': np.logspace(-10, 10, num=41, base=10)}]
@@ -0,0 +1,141 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 6 16:03:11 2019
pre-image
@author: ljia
"""
import sys
import numpy as np
import multiprocessing
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt

sys.path.insert(0, "../")
from pygraph.kernels.marginalizedKernel import marginalizedkernel
from pygraph.utils.graphfiles import loadDataset

ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
      'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}  # node/edge symb
DN, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
DN = DN[0:10]

lmbda = 0.03  # termination probability
r_max = 10  # recursions
l = 500  # number of random edit trials per iteration
alpha_range = np.linspace(0.1, 0.9, 9)
k = 5  # k nearest neighbors

# randomly select two molecules
np.random.seed(1)
idx1, idx2 = np.random.randint(0, len(DN), 2)
g1 = DN[idx1]
g2 = DN[idx2]

# compute the kernel values needed for the distances below.
k_list = []  # kernel between each graph and itself.
k_g1_list = []  # kernel between each graph and g1
k_g2_list = []  # kernel between each graph and g2
for ig, g in tqdm(enumerate(DN), desc='computing self kernels', file=sys.stdout):
    ktemp = marginalizedkernel([g, g1, g2], node_label='atom', edge_label=None,
                               p_quit=lmbda, n_iteration=20, remove_totters=False,
                               n_jobs=multiprocessing.cpu_count(), verbose=False)
    k_list.append(ktemp[0][0, 0])
    k_g1_list.append(ktemp[0][0, 1])
    k_g2_list.append(ktemp[0][0, 2])
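# Note: the quantities computed below are squared distances in the feature
# space of the marginalized kernel. For the point phi = alpha * phi(g1) + (1 - alpha) * phi(g2),
#     ||phi(g) - phi||^2 = k(g, g) - 2 * [alpha * k(g, g1) + (1 - alpha) * k(g, g2)]
#                          + alpha^2 * k(g1, g1) + 2 * alpha * (1 - alpha) * k(g1, g2)
#                          + (1 - alpha)^2 * k(g2, g2),
# which is what dtemp and dnew assemble from k_list, k_g1_list and k_g2_list
# (note that k_g2_list[idx1] == k_g1_list[idx2] == k(g1, g2)).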
g_best = []
dis_best = []
# for each alpha
for alpha in alpha_range:
    print('alpha =', alpha)
    # compute k nearest neighbors of phi in DN.
    dis_list = []  # distance between g_star and each graph.
    for ig, g in tqdm(enumerate(DN), desc='computing distances', file=sys.stdout):
        dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
                k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
                (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
                k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
        dis_list.append(dtemp)

    # sort
    sort_idx = np.argsort(dis_list)
    dis_gs = [dis_list[idis] for idis in sort_idx[0:k]]
    g0hat = DN[sort_idx[0]]  # the nearest neighbor of phi in DN
    if dis_gs[0] == 0:  # the exact pre-image.
        print('The exact pre-image is found from the input dataset.')
        g_pimg = g0hat
        break
    dhat = dis_gs[0]  # the nearest distance
    Dk = [DN[ig] for ig in sort_idx[0:k]]  # the k nearest neighbors
    gihat_list = []

    i = 1
    r = 1
    while r < r_max:
        print('r =', r)
        found = False
        for ig, gs in enumerate(Dk + gihat_list):
            # nx.draw_networkx(gs)
            # plt.show()
            fdgs = int(np.abs(np.ceil(np.log(alpha * dis_gs[ig]))))  # @todo ???
            for trial in tqdm(range(0, l), desc='l loop', file=sys.stdout):
                # add and delete edges.
                gtemp = gs.copy()
                np.random.seed()
                # which edges to change.
                idx_change = np.random.randint(0, nx.number_of_nodes(gs) *
                                               (nx.number_of_nodes(gs) - 1), fdgs)
                for item in idx_change:
                    node1 = int(item / (nx.number_of_nodes(gs) - 1))
                    node2 = (item - node1 * (nx.number_of_nodes(gs) - 1))
                    if node2 >= node1:
                        node2 += 1
                    # @todo: is the randomness correct?
                    if not gtemp.has_edge(node1, node2):
                        gtemp.add_edges_from([(node1, node2, {'bond_type': 0})])
                        # nx.draw_networkx(gs)
                        # plt.show()
                        # nx.draw_networkx(gtemp)
                        # plt.show()
                    else:
                        gtemp.remove_edge(node1, node2)
                        # nx.draw_networkx(gs)
                        # plt.show()
                        # nx.draw_networkx(gtemp)
                        # plt.show()
                # nx.draw_networkx(gtemp)
                # plt.show()

                # compute distance between phi and the new generated graph.
                knew = marginalizedkernel([gtemp, g1, g2], node_label='atom', edge_label=None,
                                          p_quit=lmbda, n_iteration=20, remove_totters=False,
                                          n_jobs=multiprocessing.cpu_count(), verbose=False)
                dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) *
                        knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
                        (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
                        k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
                if dnew <= dhat:  # the new distance is smaller
                    print('I am smaller!')
                    dhat = dnew
                    gnew = gtemp.copy()
                    found = True  # found better graph.
        if found:
            gihat_list = [gnew]
            dis_gs.append(dhat)
        else:
            r += 1
    dis_best.append(dhat)
    g_best += ([g0hat] if len(gihat_list) == 0 else gihat_list)
for idx, item in enumerate(alpha_range):
    print('when alpha is', item, 'the shortest distance is', dis_best[idx])
    print('the corresponding pre-image is')
    nx.draw_networkx(g_best[idx])
    plt.show()
@@ -0,0 +1,842 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 23 16:53:57 2018

@author: ljia
@references: S. V. N. Vishwanathan, Nicol N. Schraudolph, Risi Kondor, and
    Karsten M. Borgwardt. Graph kernels. Journal of Machine Learning Research,
    11(Apr):1201–1242, 2010.
"""
import sys
sys.path.insert(0, "../")
import time
from functools import partial
from tqdm import tqdm

import networkx as nx
import numpy as np
from scipy.sparse import identity, kron
from scipy.sparse.linalg import cg
from scipy.optimize import fixed_point

from pygraph.utils.graphdataset import get_dataset_attributes
from pygraph.utils.parallel import parallel_gm
def randomwalkkernel(*args, | |||||
# params for all method. | |||||
compute_method=None, | |||||
weight=1, | |||||
p=None, | |||||
q=None, | |||||
edge_weight=None, | |||||
# params for conjugate and fp method. | |||||
node_kernels=None, | |||||
edge_kernels=None, | |||||
node_label='atom', | |||||
edge_label='bond_type', | |||||
# params for spectral method. | |||||
sub_kernel=None, | |||||
n_jobs=None): | |||||
"""Calculate random walk graph kernels. | |||||
Parameters | |||||
---------- | |||||
Gn : List of NetworkX graph | |||||
List of graphs between which the kernels are calculated. | |||||
/ | |||||
G1, G2 : NetworkX graphs | |||||
2 graphs between which the kernel is calculated. | |||||
node_label : string | |||||
node attribute used as label. The default node label is atom. | |||||
edge_label : string | |||||
edge attribute used as label. The default edge label is bond_type. | |||||
compute_method : string
Method used to compute the random walk kernel. Available methods are 'sylvester', 'conjugate', 'fp', 'spectral' and 'kron'.
Return
------
Kmatrix : Numpy matrix
Kernel matrix, each element of which is the random walk kernel between 2 graphs.
"""
compute_method = compute_method.lower() | |||||
Gn = args[0] if len(args) == 1 else [args[0], args[1]] | |||||
eweight = None | |||||
if edge_weight == None: | |||||
print('\n None edge weight specified. Set all weight to 1.\n') | |||||
else: | |||||
try: | |||||
some_weight = list( | |||||
nx.get_edge_attributes(Gn[0], edge_weight).values())[0] | |||||
if isinstance(some_weight, float) or isinstance(some_weight, int): | |||||
eweight = edge_weight | |||||
else: | |||||
print( | |||||
'\n Edge weight with name %s is not float or integer. Set all weight to 1.\n' | |||||
% edge_weight) | |||||
except: | |||||
print( | |||||
'\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n' | |||||
% edge_weight) | |||||
ds_attrs = get_dataset_attributes( | |||||
Gn, | |||||
attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled', | |||||
'edge_attr_dim', 'is_directed'], | |||||
node_label=node_label, | |||||
edge_label=edge_label) | |||||
ds_attrs['node_attr_dim'] = 0 | |||||
ds_attrs['edge_attr_dim'] = 0 | |||||
# remove graphs with no edges, as no walk can be found in their structures, | |||||
# so the weight matrix between such a graph and itself might be zero. | |||||
len_gn = len(Gn) | |||||
Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0] | |||||
idx = [G[0] for G in Gn] | |||||
Gn = [G[1] for G in Gn] | |||||
if len(Gn) != len_gn: | |||||
print('\n %d graphs are removed as they don\'t contain edges.\n' % | |||||
(len_gn - len(Gn))) | |||||
start_time = time.time() | |||||
# # get vertex and edge concatenated labels for each graph | |||||
# label_list, d = getLabels(Gn, node_label, edge_label, ds_attrs['is_directed']) | |||||
# gmf = filterGramMatrix(A_wave_list[0], label_list[0], ('C', '0', 'O'), ds_attrs['is_directed']) | |||||
if compute_method == 'sylvester': | |||||
import warnings | |||||
warnings.warn('All labels are ignored.') | |||||
Kmatrix = _sylvester_equation(Gn, weight, p, q, eweight, n_jobs) | |||||
elif compute_method == 'conjugate': | |||||
Kmatrix = _conjugate_gradient(Gn, weight, p, q, ds_attrs, | |||||
node_kernels, edge_kernels, | |||||
node_label, edge_label, eweight, n_jobs) | |||||
elif compute_method == 'fp': | |||||
Kmatrix = _fixed_point(Gn, weight, p, q, ds_attrs, node_kernels, | |||||
edge_kernels, node_label, edge_label, | |||||
eweight, n_jobs) | |||||
elif compute_method == 'spectral': | |||||
import warnings | |||||
warnings.warn('All labels are ignored. Only works for undirected graphs.') | |||||
Kmatrix = _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs) | |||||
elif compute_method == 'kron': | |||||
for i in range(0, len(Gn)): | |||||
for j in range(i, len(Gn)): | |||||
Kmatrix[i][j] = _randomwalkkernel_kron(Gn[i], Gn[j], | |||||
node_label, edge_label) | |||||
Kmatrix[j][i] = Kmatrix[i][j] | |||||
else: | |||||
raise Exception( | |||||
'compute method name incorrect. Available methods: "sylvester", "conjugate", "fp", "spectral" and "kron".' | |||||
) | |||||
run_time = time.time() - start_time | |||||
print( | |||||
"\n --- kernel matrix of random walk kernel of size %d built in %s seconds ---" | |||||
% (len(Gn), run_time)) | |||||
return Kmatrix, run_time, idx | |||||
############################################################################### | |||||
def _sylvester_equation(Gn, lmda, p, q, eweight, n_jobs): | |||||
"""Calculate walk graph kernels up to n between 2 graphs using Sylvester method. | |||||
Parameters | |||||
---------- | |||||
G1, G2 : NetworkX graph | |||||
Graphs between which the kernel is calculated. | |||||
node_label : string | |||||
node attribute used as label. | |||||
edge_label : string | |||||
edge attribute used as label. | |||||
Return | |||||
------ | |||||
kernel : float | |||||
Kernel between 2 graphs. | |||||
""" | |||||
Kmatrix = np.zeros((len(Gn), len(Gn))) | |||||
if q == None: | |||||
# don't normalize adjacency matrices if q is a uniform vector. Note that
# A_wave_list actually contains the transposes of the adjacency matrices.
A_wave_list = [ | |||||
nx.adjacency_matrix(G, eweight).todense().transpose() for G in tqdm( | |||||
Gn, desc='compute adjacency matrices', file=sys.stdout) | |||||
] | |||||
# # normalized adjacency matrices | |||||
# A_wave_list = [] | |||||
# for G in tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout): | |||||
# A_tilde = nx.adjacency_matrix(G, eweight).todense().transpose() | |||||
# norm = A_tilde.sum(axis=0) | |||||
# norm[norm == 0] = 1 | |||||
# A_wave_list.append(A_tilde / norm) | |||||
if p == None: # p is uniform distribution as default. | |||||
def init_worker(Awl_toshare): | |||||
global G_Awl | |||||
G_Awl = Awl_toshare | |||||
do_partial = partial(wrapper_se_do, lmda) | |||||
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | |||||
glbv=(A_wave_list,), n_jobs=n_jobs) | |||||
# pbar = tqdm( | |||||
# total=(1 + len(Gn)) * len(Gn) / 2, | |||||
# desc='calculating kernels', | |||||
# file=sys.stdout) | |||||
# for i in range(0, len(Gn)): | |||||
# for j in range(i, len(Gn)): | |||||
# S = lmda * A_wave_list[j] | |||||
# T_t = A_wave_list[i] | |||||
# # use uniform distribution if there is no prior knowledge. | |||||
# nb_pd = len(A_wave_list[i]) * len(A_wave_list[j]) | |||||
# p_times_uni = 1 / nb_pd | |||||
# M0 = np.full((len(A_wave_list[j]), len(A_wave_list[i])), p_times_uni) | |||||
# X = dlyap(S, T_t, M0) | |||||
# X = np.reshape(X, (-1, 1), order='F') | |||||
# # use uniform distribution if there is no prior knowledge. | |||||
# q_times = np.full((1, nb_pd), p_times_uni) | |||||
# Kmatrix[i][j] = np.dot(q_times, X) | |||||
# Kmatrix[j][i] = Kmatrix[i][j] | |||||
# pbar.update(1) | |||||
return Kmatrix | |||||
def wrapper_se_do(lmda, itr): | |||||
i = itr[0] | |||||
j = itr[1] | |||||
return i, j, _se_do(G_Awl[i], G_Awl[j], lmda) | |||||
def _se_do(A_wave1, A_wave2, lmda): | |||||
from control import dlyap | |||||
S = lmda * A_wave2 | |||||
T_t = A_wave1 | |||||
# use uniform distribution if there is no prior knowledge. | |||||
nb_pd = len(A_wave1) * len(A_wave2) | |||||
p_times_uni = 1 / nb_pd | |||||
M0 = np.full((len(A_wave2), len(A_wave1)), p_times_uni) | |||||
X = dlyap(S, T_t, M0) | |||||
X = np.reshape(X, (-1, 1), order='F') | |||||
# use uniform distribution if there is no prior knowledge. | |||||
q_times = np.full((1, nb_pd), p_times_uni) | |||||
return np.dot(q_times, X) | |||||
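# Note: control.dlyap(S, T_t, M0) solves the discrete Sylvester equation
# X = S * X * T_t^T + M0 with S = lmda * A_wave2 and T_t = A_wave1. Since
# vec(S X T_t^T) = (T_t kron S) vec(X), this gives
# vec(X) = (I - lmda * (A_wave1 kron A_wave2))^{-1} vec(M0), so the value
# returned above is the geometric random walk kernel q^T (I - lmda * W_x)^{-1} p
# computed without ever forming the Kronecker product W_x explicitly.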
############################################################################### | |||||
def _conjugate_gradient(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels, | |||||
node_label, edge_label, eweight, n_jobs): | |||||
"""Calculate walk graph kernels up to n between 2 graphs using conjugate method. | |||||
Parameters | |||||
---------- | |||||
G1, G2 : NetworkX graph | |||||
Graphs between which the kernel is calculated. | |||||
node_label : string | |||||
node attribute used as label. | |||||
edge_label : string | |||||
edge attribute used as label. | |||||
Return | |||||
------ | |||||
kernel : float | |||||
Kernel between 2 graphs. | |||||
""" | |||||
Kmatrix = np.zeros((len(Gn), len(Gn))) | |||||
# if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \ | |||||
# not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] < 1: | |||||
# # this is faster from unlabeled graphs. @todo: why? | |||||
# if q == None: | |||||
# # don't normalize adjacency matrices if q is a uniform vector. Note | |||||
# # A_wave_list accually contains the transposes of the adjacency matrices. | |||||
# A_wave_list = [ | |||||
# nx.adjacency_matrix(G, eweight).todense().transpose() for G in | |||||
# tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout) | |||||
# ] | |||||
# if p == None: # p is uniform distribution as default. | |||||
# def init_worker(Awl_toshare): | |||||
# global G_Awl | |||||
# G_Awl = Awl_toshare | |||||
# do_partial = partial(wrapper_cg_unlabled_do, lmda) | |||||
# parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | |||||
# glbv=(A_wave_list,), n_jobs=n_jobs) | |||||
# else: | |||||
# reindex nodes using consecutive integers for convenience of kernel calculation. | |||||
Gn = [nx.convert_node_labels_to_integers( | |||||
g, first_label=0, label_attribute='label_orignal') for g in tqdm( | |||||
Gn, desc='reindex vertices', file=sys.stdout)] | |||||
if p == None and q == None: # p and q are uniform distributions as default. | |||||
def init_worker(gn_toshare): | |||||
global G_gn | |||||
G_gn = gn_toshare | |||||
do_partial = partial(wrapper_cg_labled_do, ds_attrs, node_kernels, | |||||
node_label, edge_kernels, edge_label, lmda) | |||||
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | |||||
glbv=(Gn,), n_jobs=n_jobs) | |||||
# pbar = tqdm( | |||||
# total=(1 + len(Gn)) * len(Gn) / 2, | |||||
# desc='calculating kernels', | |||||
# file=sys.stdout) | |||||
# for i in range(0, len(Gn)): | |||||
# for j in range(i, len(Gn)): | |||||
# result = _cg_labled_do(Gn[i], Gn[j], ds_attrs, node_kernels, | |||||
# node_label, edge_kernels, edge_label, lmda) | |||||
# Kmatrix[i][j] = result | |||||
# Kmatrix[j][i] = Kmatrix[i][j] | |||||
# pbar.update(1) | |||||
return Kmatrix | |||||
def wrapper_cg_unlabled_do(lmda, itr): | |||||
i = itr[0] | |||||
j = itr[1] | |||||
return i, j, _cg_unlabled_do(G_Awl[i], G_Awl[j], lmda) | |||||
def _cg_unlabled_do(A_wave1, A_wave2, lmda): | |||||
nb_pd = len(A_wave1) * len(A_wave2) | |||||
p_times_uni = 1 / nb_pd | |||||
w_times = kron(A_wave1, A_wave2).todense() | |||||
A = identity(w_times.shape[0]) - w_times * lmda | |||||
b = np.full((nb_pd, 1), p_times_uni) | |||||
x, _ = cg(A, b) | |||||
# use uniform distribution if there is no prior knowledge. | |||||
q_times = np.full((1, nb_pd), p_times_uni) | |||||
return np.dot(q_times, x) | |||||
def wrapper_cg_labled_do(ds_attrs, node_kernels, node_label, edge_kernels, | |||||
edge_label, lmda, itr): | |||||
i = itr[0] | |||||
j = itr[1] | |||||
return i, j, _cg_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels, | |||||
node_label, edge_kernels, edge_label, lmda) | |||||
def _cg_labled_do(g1, g2, ds_attrs, node_kernels, node_label, | |||||
edge_kernels, edge_label, lmda): | |||||
# First, compute kernels between all pairs of nodes, a method borrowed
# from FCSP. It is faster than directly computing all edge kernels
# when $d_1d_2>2$, where $d_1$ and $d_2$ are the vertex degrees of the
# graphs compared, which is the most common case in practice. For very
# sparse graphs, this would be slower.
vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label) | |||||
# Compute weight matrix of the direct product graph. | |||||
w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs, | |||||
edge_kernels, edge_label) | |||||
# use uniform distribution if there is no prior knowledge. | |||||
p_times_uni = 1 / w_dim | |||||
A = identity(w_times.shape[0]) - w_times * lmda | |||||
b = np.full((w_dim, 1), p_times_uni) | |||||
x, _ = cg(A, b) | |||||
# use uniform distribution if there is no prior knowledge. | |||||
q_times = np.full((1, w_dim), p_times_uni) | |||||
return np.dot(q_times, x) | |||||
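# Note: _cg_labled_do evaluates the same quantity q^T (I - lmda * W_x)^{-1} p
# as the other methods, but builds the weight matrix W_x of the labeled direct
# product graph explicitly (via computeVK / computeW) and then solves the
# linear system (I - lmda * W_x) x = p with scipy.sparse.linalg.cg instead of
# inverting the matrix.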
############################################################################### | |||||
def _fixed_point(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels, | |||||
node_label, edge_label, eweight, n_jobs): | |||||
"""Calculate walk graph kernels up to n between 2 graphs using Fixed-Point method. | |||||
Parameters | |||||
---------- | |||||
G1, G2 : NetworkX graph | |||||
Graphs between which the kernel is calculated. | |||||
node_label : string | |||||
node attribute used as label. | |||||
edge_label : string | |||||
edge attribute used as label. | |||||
Return | |||||
------ | |||||
kernel : float | |||||
Kernel between 2 graphs. | |||||
""" | |||||
Kmatrix = np.zeros((len(Gn), len(Gn))) | |||||
# if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \ | |||||
# not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] > 1: | |||||
# # this is faster from unlabeled graphs. @todo: why? | |||||
# if q == None: | |||||
# # don't normalize adjacency matrices if q is a uniform vector. Note | |||||
# # A_wave_list accually contains the transposes of the adjacency matrices. | |||||
# A_wave_list = [ | |||||
# nx.adjacency_matrix(G, eweight).todense().transpose() for G in | |||||
# tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout) | |||||
# ] | |||||
# if p == None: # p is uniform distribution as default. | |||||
# pbar = tqdm( | |||||
# total=(1 + len(Gn)) * len(Gn) / 2, | |||||
# desc='calculating kernels', | |||||
# file=sys.stdout) | |||||
# for i in range(0, len(Gn)): | |||||
# for j in range(i, len(Gn)): | |||||
# # use uniform distribution if there is no prior knowledge. | |||||
# nb_pd = len(A_wave_list[i]) * len(A_wave_list[j]) | |||||
# p_times_uni = 1 / nb_pd | |||||
# w_times = kron(A_wave_list[i], A_wave_list[j]).todense() | |||||
# p_times = np.full((nb_pd, 1), p_times_uni) | |||||
# x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times)) | |||||
# # use uniform distribution if there is no prior knowledge. | |||||
# q_times = np.full((1, nb_pd), p_times_uni) | |||||
# Kmatrix[i][j] = np.dot(q_times, x) | |||||
# Kmatrix[j][i] = Kmatrix[i][j] | |||||
# pbar.update(1) | |||||
# else: | |||||
# reindex nodes using consecutive integers for convenience of kernel calculation. | |||||
Gn = [nx.convert_node_labels_to_integers( | |||||
g, first_label=0, label_attribute='label_orignal') for g in tqdm( | |||||
Gn, desc='reindex vertices', file=sys.stdout)] | |||||
if p == None and q == None: # p and q are uniform distributions as default. | |||||
def init_worker(gn_toshare): | |||||
global G_gn | |||||
G_gn = gn_toshare | |||||
do_partial = partial(wrapper_fp_labled_do, ds_attrs, node_kernels, | |||||
node_label, edge_kernels, edge_label, lmda) | |||||
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | |||||
glbv=(Gn,), n_jobs=n_jobs) | |||||
return Kmatrix | |||||
def wrapper_fp_labled_do(ds_attrs, node_kernels, node_label, edge_kernels, | |||||
edge_label, lmda, itr): | |||||
i = itr[0] | |||||
j = itr[1] | |||||
return i, j, _fp_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels, | |||||
node_label, edge_kernels, edge_label, lmda) | |||||
def _fp_labled_do(g1, g2, ds_attrs, node_kernels, node_label, | |||||
edge_kernels, edge_label, lmda): | |||||
# First, compute kernels between all pairs of nodes, a method borrowed
# from FCSP. It is faster than directly computing all edge kernels
# when $d_1d_2>2$, where $d_1$ and $d_2$ are the vertex degrees of the
# graphs compared, which is the most common case in practice. For very
# sparse graphs, this would be slower.
vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label) | |||||
# Compute weight matrix of the direct product graph. | |||||
w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs, | |||||
edge_kernels, edge_label) | |||||
# use uniform distribution if there is no prior knowledge. | |||||
p_times_uni = 1 / w_dim | |||||
p_times = np.full((w_dim, 1), p_times_uni) | |||||
x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times), | |||||
xtol=1e-06, maxiter=1000) | |||||
# use uniform distribution if there is no prior knowledge. | |||||
q_times = np.full((1, w_dim), p_times_uni) | |||||
return np.dot(q_times, x) | |||||
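# Note: scipy.optimize.fixed_point iterates the map defined by func_fp below
# (with convergence acceleration), so the returned x solves
# x = p + lmda * W_x * x, i.e. it is the same (I - lmda * W_x)^{-1} p vector
# that the conjugate gradient variant computes.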
def func_fp(x, p_times, lmda, w_times):
    # fixed-point map x -> p + lmda * W_x * x; its fixed point solves
    # (I - lmda * W_x) x = p.
    return p_times + lmda * np.dot(w_times, x)
############################################################################### | |||||
def _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs): | |||||
"""Calculate walk graph kernels up to n between 2 unlabeled graphs using | |||||
spectral decomposition method. Labels will be ignored. | |||||
Parameters | |||||
---------- | |||||
G1, G2 : NetworkX graph | |||||
Graphs between which the kernel is calculated. | |||||
node_label : string | |||||
node attribute used as label. | |||||
edge_label : string | |||||
edge attribute used as label. | |||||
Return | |||||
------ | |||||
kernel : float | |||||
Kernel between 2 graphs. | |||||
""" | |||||
Kmatrix = np.zeros((len(Gn), len(Gn))) | |||||
if q == None: | |||||
# precompute the spectral decomposition of each graph. | |||||
P_list = [] | |||||
D_list = [] | |||||
for G in tqdm(Gn, desc='spectral decompose', file=sys.stdout): | |||||
# don't normalize adjacency matrices if q is a uniform vector. Note that
# A actually is the transpose of the adjacency matrix.
A = nx.adjacency_matrix(G, eweight).todense().transpose() | |||||
ew, ev = np.linalg.eig(A) | |||||
D_list.append(ew) | |||||
P_list.append(ev) | |||||
# P_inv_list = [p.T for p in P_list] # @todo: also works for directed graphs? | |||||
if p == None: # p is uniform distribution as default. | |||||
q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in Gn] | |||||
# q_T_list = [q.T for q in q_list] | |||||
def init_worker(q_T_toshare, P_toshare, D_toshare): | |||||
global G_q_T, G_P, G_D | |||||
G_q_T = q_T_toshare | |||||
G_P = P_toshare | |||||
G_D = D_toshare | |||||
do_partial = partial(wrapper_sd_do, weight, sub_kernel) | |||||
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | |||||
glbv=(q_T_list, P_list, D_list), n_jobs=n_jobs) | |||||
# pbar = tqdm( | |||||
# total=(1 + len(Gn)) * len(Gn) / 2, | |||||
# desc='calculating kernels', | |||||
# file=sys.stdout) | |||||
# for i in range(0, len(Gn)): | |||||
# for j in range(i, len(Gn)): | |||||
# result = _sd_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j], | |||||
# D_list[i], D_list[j], weight, sub_kernel) | |||||
# Kmatrix[i][j] = result | |||||
# Kmatrix[j][i] = Kmatrix[i][j] | |||||
# pbar.update(1) | |||||
return Kmatrix | |||||
def wrapper_sd_do(weight, sub_kernel, itr): | |||||
i = itr[0] | |||||
j = itr[1] | |||||
return i, j, _sd_do(G_q_T[i], G_q_T[j], G_P[i], G_P[j], G_D[i], G_D[j], | |||||
weight, sub_kernel) | |||||
def _sd_do(q_T1, q_T2, P1, P2, D1, D2, weight, sub_kernel): | |||||
# use uniform distribution if there is no prior knowledge. | |||||
kl = kron(np.dot(q_T1, P1), np.dot(q_T2, P2)).todense() | |||||
# @todo: this is not needed when p = q (kr = kl.T) for undirected graphs
# kr = kron(np.dot(P_inv_list[i], q_list[i]), np.dot(P_inv_list[j], q_list[j])).todense() | |||||
if sub_kernel == 'exp': | |||||
D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2]) | |||||
kmiddle = np.diag(np.exp(weight * D_diag)) | |||||
elif sub_kernel == 'geo': | |||||
D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2]) | |||||
kmiddle = np.diag(weight * D_diag) | |||||
kmiddle = np.identity(len(kmiddle)) - weight * kmiddle | |||||
kmiddle = np.linalg.inv(kmiddle) | |||||
return np.dot(np.dot(kl, kmiddle), kl.T)[0, 0] | |||||
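# Note: for undirected, unlabeled graphs the weight matrix of the direct
# product graph factorizes as W_x = (P1 kron P2) diag(D1 kron D2) (P1 kron P2)^T,
# so q^T f(W_x) p reduces to [q1^T P1 kron q2^T P2] f(diag(D1 kron D2)) [...]^T.
# Here f is the exponential series when sub_kernel == 'exp' and the geometric
# (resolvent) series when sub_kernel == 'geo'; kl is reused for both sides
# because p and q are assumed to be equal (uniform) distributions.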
############################################################################### | |||||
def _randomwalkkernel_kron(G1, G2, node_label, edge_label): | |||||
"""Calculate walk graph kernels up to n between 2 graphs using nearest Kronecker product approximation method. | |||||
Parameters | |||||
---------- | |||||
G1, G2 : NetworkX graph | |||||
Graphs between which the kernel is calculated. | |||||
node_label : string | |||||
node attribute used as label. | |||||
edge_label : string | |||||
edge attribute used as label. | |||||
Return | |||||
------ | |||||
kernel : float | |||||
Kernel between 2 graphs. | |||||
""" | |||||
pass | |||||
############################################################################### | |||||
def getLabels(Gn, node_label, edge_label, directed): | |||||
"""Get symbolic labels of a graph dataset, where vertex labels are dealt | |||||
with by concatenating them to the edge labels of adjacent edges. | |||||
""" | |||||
label_list = [] | |||||
label_set = set() | |||||
for g in Gn: | |||||
label_g = {} | |||||
for e in g.edges(data=True): | |||||
nl1 = g.node[e[0]][node_label] | |||||
nl2 = g.node[e[1]][node_label] | |||||
if not directed and nl1 > nl2: | |||||
nl1, nl2 = nl2, nl1 | |||||
label = (nl1, e[2][edge_label], nl2) | |||||
label_g[(e[0], e[1])] = label | |||||
label_list.append(label_g) | |||||
label_set = set([l for lg in label_list for l in lg.values()]) | |||||
return label_list, len(label_set) | |||||
def filterGramMatrix(gmt, label_dict, label, directed): | |||||
"""Compute (the transpose of) the Gram matrix filtered by a label. | |||||
""" | |||||
gmf = np.zeros(gmt.shape) | |||||
for (n1, n2), l in label_dict.items(): | |||||
if l == label: | |||||
gmf[n2, n1] = gmt[n2, n1] | |||||
if not directed: | |||||
gmf[n1, n2] = gmt[n1, n2] | |||||
return gmf | |||||
def computeVK(g1, g2, ds_attrs, node_kernels, node_label): | |||||
'''Compute vertex kernels between vertices of two graphs. | |||||
''' | |||||
vk_dict = {}  # vertex kernels dict
if ds_attrs['node_labeled']: | |||||
# node symb and non-symb labeled
if ds_attrs['node_attr_dim'] > 0: | |||||
kn = node_kernels['mix'] | |||||
for n1 in g1.nodes(data=True): | |||||
for n2 in g2.nodes(data=True): | |||||
vk_dict[(n1[0], n2[0])] = kn( | |||||
n1[1][node_label], n2[1][node_label], | |||||
n1[1]['attributes'], n2[1]['attributes']) | |||||
# node symb labeled | |||||
else: | |||||
kn = node_kernels['symb'] | |||||
for n1 in g1.nodes(data=True): | |||||
for n2 in g2.nodes(data=True): | |||||
vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label], | |||||
n2[1][node_label]) | |||||
else: | |||||
# node non-symb labeled
if ds_attrs['node_attr_dim'] > 0: | |||||
kn = node_kernels['nsymb'] | |||||
for n1 in g1.nodes(data=True): | |||||
for n2 in g2.nodes(data=True): | |||||
vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'], | |||||
n2[1]['attributes']) | |||||
# node unlabeled | |||||
else: | |||||
pass | |||||
return vk_dict | |||||
def computeW(g1, g2, vk_dict, ds_attrs, edge_kernels, edge_label): | |||||
'''Compute weight matrix of the direct product graph. | |||||
''' | |||||
w_dim = nx.number_of_nodes(g1) * nx.number_of_nodes(g2) | |||||
w_times = np.zeros((w_dim, w_dim)) | |||||
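# Note: a vertex pair (u, v) of the direct product graph, with u from g1 and
# v from g2, is flattened to the single index u * |V(g2)| + v; the w_idx
# tuples below address the corresponding entries of w_times.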
if vk_dict: # node labeled | |||||
if ds_attrs['is_directed']: | |||||
if ds_attrs['edge_labeled']: | |||||
# edge symb and non-symb labeled
if ds_attrs['edge_attr_dim'] > 0: | |||||
ke = edge_kernels['mix'] | |||||
for e1 in g1.edges(data=True): | |||||
for e2 in g2.edges(data=True): | |||||
ek_temp = ke(e1[2][edge_label], e2[2][edge_label], | |||||
e1[2]['attributes'], e2[2]['attributes']) | |||||
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], | |||||
e1[1] * nx.number_of_nodes(g2) + e2[1]) | |||||
w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ | |||||
* ek_temp * vk_dict[(e1[1], e2[1])] | |||||
# edge symb labeled | |||||
else: | |||||
ke = edge_kernels['symb'] | |||||
for e1 in g1.edges(data=True): | |||||
for e2 in g2.edges(data=True): | |||||
ek_temp = ke(e1[2][edge_label], e2[2][edge_label]) | |||||
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], | |||||
e1[1] * nx.number_of_nodes(g2) + e2[1]) | |||||
w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ | |||||
* ek_temp * vk_dict[(e1[1], e2[1])] | |||||
else: | |||||
# edge non-symb labeled
if ds_attrs['edge_attr_dim'] > 0: | |||||
ke = edge_kernels['nsymb'] | |||||
for e1 in g1.edges(data=True): | |||||
for e2 in g2.edges(data=True): | |||||
ek_temp = ke(e1[2]['attributes'], e2[2]['attributes']) | |||||
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], | |||||
e1[1] * nx.number_of_nodes(g2) + e2[1]) | |||||
w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ | |||||
* ek_temp * vk_dict[(e1[1], e2[1])] | |||||
# edge unlabeled | |||||
else: | |||||
for e1 in g1.edges(data=True): | |||||
for e2 in g2.edges(data=True): | |||||
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], | |||||
e1[1] * nx.number_of_nodes(g2) + e2[1]) | |||||
w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ | |||||
* vk_dict[(e1[1], e2[1])] | |||||
else: # undirected | |||||
if ds_attrs['edge_labeled']: | |||||
# edge symb and non-symb labeled
if ds_attrs['edge_attr_dim'] > 0: | |||||
ke = edge_kernels['mix'] | |||||
for e1 in g1.edges(data=True): | |||||
for e2 in g2.edges(data=True): | |||||
ek_temp = ke(e1[2][edge_label], e2[2][edge_label], | |||||
e1[2]['attributes'], e2[2]['attributes']) | |||||
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], | |||||
e1[1] * nx.number_of_nodes(g2) + e2[1]) | |||||
w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ | |||||
* ek_temp * vk_dict[(e1[1], e2[1])] \ | |||||
+ vk_dict[(e1[0], e2[1])] \ | |||||
* ek_temp * vk_dict[(e1[1], e2[0])] | |||||
w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] | |||||
w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], | |||||
e1[1] * nx.number_of_nodes(g2) + e2[0]) | |||||
w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] | |||||
w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] | |||||
# edge symb labeled | |||||
else: | |||||
ke = edge_kernels['symb'] | |||||
for e1 in g1.edges(data=True): | |||||
for e2 in g2.edges(data=True): | |||||
ek_temp = ke(e1[2][edge_label], e2[2][edge_label]) | |||||
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], | |||||
e1[1] * nx.number_of_nodes(g2) + e2[1]) | |||||
w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ | |||||
* ek_temp * vk_dict[(e1[1], e2[1])] \ | |||||
+ vk_dict[(e1[0], e2[1])] \ | |||||
* ek_temp * vk_dict[(e1[1], e2[0])] | |||||
w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] | |||||
w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], | |||||
e1[1] * nx.number_of_nodes(g2) + e2[0]) | |||||
w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] | |||||
w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] | |||||
else: | |||||
# edge non-symb labeled
if ds_attrs['edge_attr_dim'] > 0: | |||||
ke = edge_kernels['nsymb'] | |||||
for e1 in g1.edges(data=True): | |||||
for e2 in g2.edges(data=True): | |||||
ek_temp = ke(e1[2]['attributes'], e2[2]['attributes']) | |||||
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], | |||||
e1[1] * nx.number_of_nodes(g2) + e2[1]) | |||||
w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ | |||||
* ek_temp * vk_dict[(e1[1], e2[1])] \ | |||||
+ vk_dict[(e1[0], e2[1])] \ | |||||
* ek_temp * vk_dict[(e1[1], e2[0])] | |||||
w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] | |||||
w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], | |||||
e1[1] * nx.number_of_nodes(g2) + e2[0]) | |||||
w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] | |||||
w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] | |||||
# edge unlabeled | |||||
else: | |||||
for e1 in g1.edges(data=True): | |||||
for e2 in g2.edges(data=True): | |||||
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], | |||||
e1[1] * nx.number_of_nodes(g2) + e2[1]) | |||||
w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ | |||||
* vk_dict[(e1[1], e2[1])] \ | |||||
+ vk_dict[(e1[0], e2[1])] \ | |||||
* vk_dict[(e1[1], e2[0])] | |||||
w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] | |||||
w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], | |||||
e1[1] * nx.number_of_nodes(g2) + e2[0]) | |||||
w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] | |||||
w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] | |||||
else: # node unlabeled | |||||
if ds_attrs['is_directed']: | |||||
if ds_attrs['edge_labeled']: | |||||
# edge symb and non-symb labeled
if ds_attrs['edge_attr_dim'] > 0: | |||||
ke = edge_kernels['mix'] | |||||
for e1 in g1.edges(data=True): | |||||
for e2 in g2.edges(data=True): | |||||
ek_temp = ke(e1[2][edge_label], e2[2][edge_label], | |||||
e1[2]['attributes'], e2[2]['attributes']) | |||||
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], | |||||
e1[1] * nx.number_of_nodes(g2) + e2[1]) | |||||
w_times[w_idx] = ek_temp | |||||
# edge symb labeled | |||||
else: | |||||
ke = edge_kernels['symb'] | |||||
for e1 in g1.edges(data=True): | |||||
for e2 in g2.edges(data=True): | |||||
ek_temp = ke(e1[2][edge_label], e2[2][edge_label]) | |||||
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], | |||||
e1[1] * nx.number_of_nodes(g2) + e2[1]) | |||||
w_times[w_idx] = ek_temp | |||||
else: | |||||
# edge non-symb labeled
if ds_attrs['edge_attr_dim'] > 0: | |||||
ke = edge_kernels['nsymb'] | |||||
for e1 in g1.edges(data=True): | |||||
for e2 in g2.edges(data=True): | |||||
ek_temp = ke(e1[2]['attributes'], e2[2]['attributes']) | |||||
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], | |||||
e1[1] * nx.number_of_nodes(g2) + e2[1]) | |||||
w_times[w_idx] = ek_temp | |||||
# edge unlabeled | |||||
else: | |||||
for e1 in g1.edges(data=True): | |||||
for e2 in g2.edges(data=True): | |||||
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], | |||||
e1[1] * nx.number_of_nodes(g2) + e2[1]) | |||||
w_times[w_idx] = 1 | |||||
else: # undirected | |||||
if ds_attrs['edge_labeled']: | |||||
# edge symb and non-symb labeled
if ds_attrs['edge_attr_dim'] > 0: | |||||
ke = edge_kernels['mix'] | |||||
for e1 in g1.edges(data=True): | |||||
for e2 in g2.edges(data=True): | |||||
ek_temp = ke(e1[2][edge_label], e2[2][edge_label], | |||||
e1[2]['attributes'], e2[2]['attributes']) | |||||
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], | |||||
e1[1] * nx.number_of_nodes(g2) + e2[1]) | |||||
w_times[w_idx] = ek_temp | |||||
w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] | |||||
w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], | |||||
e1[1] * nx.number_of_nodes(g2) + e2[0]) | |||||
w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] | |||||
w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] | |||||
# edge symb labeled | |||||
else: | |||||
ke = edge_kernels['symb'] | |||||
for e1 in g1.edges(data=True): | |||||
for e2 in g2.edges(data=True): | |||||
ek_temp = ke(e1[2][edge_label], e2[2][edge_label]) | |||||
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], | |||||
e1[1] * nx.number_of_nodes(g2) + e2[1]) | |||||
w_times[w_idx] = ek_temp | |||||
w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] | |||||
w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], | |||||
e1[1] * nx.number_of_nodes(g2) + e2[0]) | |||||
w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] | |||||
w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] | |||||
else: | |||||
# edge non-symb labeled
if ds_attrs['edge_attr_dim'] > 0: | |||||
ke = edge_kernels['nsymb'] | |||||
for e1 in g1.edges(data=True): | |||||
for e2 in g2.edges(data=True): | |||||
ek_temp = ke(e1[2]['attributes'], e2[2]['attributes']) | |||||
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], | |||||
e1[1] * nx.number_of_nodes(g2) + e2[1]) | |||||
w_times[w_idx] = ek_temp | |||||
w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] | |||||
w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], | |||||
e1[1] * nx.number_of_nodes(g2) + e2[0]) | |||||
w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] | |||||
w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] | |||||
# edge unlabeled | |||||
else: | |||||
for e1 in g1.edges(data=True): | |||||
for e2 in g2.edges(data=True): | |||||
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], | |||||
e1[1] * nx.number_of_nodes(g2) + e2[1]) | |||||
w_times[w_idx] = 1 | |||||
w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] | |||||
w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], | |||||
e1[1] * nx.number_of_nodes(g2) + e2[0]) | |||||
w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] | |||||
w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] | |||||
return w_times, w_dim |
@@ -0,0 +1,200 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 21 18:02:00 2018

@author: ljia
"""
import sys | |||||
import time | |||||
from itertools import product | |||||
from functools import partial | |||||
from multiprocessing import Pool | |||||
from tqdm import tqdm | |||||
import networkx as nx | |||||
import numpy as np | |||||
from pygraph.utils.utils import getSPGraph | |||||
from pygraph.utils.graphdataset import get_dataset_attributes | |||||
from pygraph.utils.parallel import parallel_gm | |||||
sys.path.insert(0, "../") | |||||
def spkernel(*args, | |||||
node_label='atom', | |||||
edge_weight=None, | |||||
node_kernels=None, | |||||
n_jobs=None): | |||||
"""Calculate shortest-path kernels between graphs. | |||||
Parameters | |||||
---------- | |||||
Gn : List of NetworkX graph | |||||
List of graphs between which the kernels are calculated. | |||||
/ | |||||
G1, G2 : NetworkX graphs | |||||
2 graphs between which the kernel is calculated. | |||||
node_label : string | |||||
node attribute used as label. The default node label is atom. | |||||
edge_weight : string | |||||
Edge attribute name corresponding to the edge weight. | |||||
node_kernels: dict | |||||
A dictionary of kernel functions for nodes, including 3 items: 'symb' | |||||
for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' | |||||
for both labels. The first 2 functions take two node labels as | |||||
parameters, and the 'mix' function takes 4 parameters, a symbolic and a | |||||
non-symbolic label for each of the two nodes. Each label is in the form of a
2-D array (n_samples, n_features). Each function returns a
number as the kernel value. Ignored when nodes are unlabeled.
Return | |||||
------ | |||||
Kmatrix : Numpy matrix | |||||
Kernel matrix, each element of which is the sp kernel between 2 graphs.
"""
# pre-process | |||||
Gn = args[0] if len(args) == 1 else [args[0], args[1]] | |||||
weight = None | |||||
if edge_weight is None: | |||||
print('\n None edge weight specified. Set all weight to 1.\n') | |||||
else: | |||||
try: | |||||
some_weight = list( | |||||
nx.get_edge_attributes(Gn[0], edge_weight).values())[0] | |||||
if isinstance(some_weight, (float, int)): | |||||
weight = edge_weight | |||||
else: | |||||
print( | |||||
'\n Edge weight with name %s is not float or integer. Set all weight to 1.\n' | |||||
% edge_weight) | |||||
except: | |||||
print( | |||||
'\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n' | |||||
% edge_weight) | |||||
ds_attrs = get_dataset_attributes( | |||||
Gn, | |||||
attr_names=['node_labeled', 'node_attr_dim', 'is_directed'], | |||||
node_label=node_label) | |||||
ds_attrs['node_attr_dim'] = 0 | |||||
# remove graphs with no edges, as no sp can be found in their structures, | |||||
# so the kernel between such a graph and itself will be zero. | |||||
len_gn = len(Gn) | |||||
Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0] | |||||
idx = [G[0] for G in Gn] | |||||
Gn = [G[1] for G in Gn] | |||||
if len(Gn) != len_gn: | |||||
print('\n %d graphs are removed as they don\'t contain edges.\n' % | |||||
(len_gn - len(Gn))) | |||||
start_time = time.time() | |||||
pool = Pool(n_jobs) | |||||
# get shortest path graphs of Gn | |||||
getsp_partial = partial(wrapper_getSPGraph, weight) | |||||
itr = zip(Gn, range(0, len(Gn))) | |||||
if len(Gn) < 100 * n_jobs: | |||||
# # use default chunksize as pool.map when iterable is less than 100 | |||||
# chunksize, extra = divmod(len(Gn), n_jobs * 4) | |||||
# if extra: | |||||
# chunksize += 1 | |||||
chunksize = int(len(Gn) / n_jobs) + 1 | |||||
else: | |||||
chunksize = 100 | |||||
for i, g in tqdm( | |||||
pool.imap_unordered(getsp_partial, itr, chunksize), | |||||
desc='getting sp graphs', file=sys.stdout): | |||||
Gn[i] = g | |||||
pool.close() | |||||
pool.join() | |||||
Kmatrix = np.zeros((len(Gn), len(Gn))) | |||||
# ---- use pool.imap_unordered to parallel and track progress. ---- | |||||
def init_worker(gn_toshare): | |||||
global G_gn | |||||
G_gn = gn_toshare | |||||
do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels) | |||||
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | |||||
glbv=(Gn,), n_jobs=n_jobs) | |||||
run_time = time.time() - start_time | |||||
print( | |||||
"\n --- shortest path kernel matrix of size %d built in %s seconds ---" | |||||
% (len(Gn), run_time)) | |||||
return Kmatrix, run_time, idx | |||||
def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels): | |||||
kernel = 0 | |||||
# compute kernels between all pairs of nodes first, a method borrowed from FCSP.
vk_dict = {}  # vertex kernels dict
if ds_attrs['node_labeled']: | |||||
# node symb and non-symb labeled
if ds_attrs['node_attr_dim'] > 0: | |||||
kn = node_kernels['mix'] | |||||
for n1, n2 in product( | |||||
g1.nodes(data=True), g2.nodes(data=True)): | |||||
vk_dict[(n1[0], n2[0])] = kn( | |||||
n1[1][node_label], n2[1][node_label], | |||||
n1[1]['attributes'], n2[1]['attributes']) | |||||
# node symb labeled | |||||
else: | |||||
kn = node_kernels['symb'] | |||||
for n1 in g1.nodes(data=True): | |||||
for n2 in g2.nodes(data=True): | |||||
vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label], | |||||
n2[1][node_label]) | |||||
else: | |||||
# node non-symb labeled
if ds_attrs['node_attr_dim'] > 0: | |||||
kn = node_kernels['nsymb'] | |||||
for n1 in g1.nodes(data=True): | |||||
for n2 in g2.nodes(data=True): | |||||
vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'], | |||||
n2[1]['attributes']) | |||||
# node unlabeled | |||||
else: | |||||
for e1, e2 in product( | |||||
g1.edges(data=True), g2.edges(data=True)): | |||||
if e1[2]['cost'] == e2[2]['cost']: | |||||
kernel += 1 | |||||
return kernel | |||||
# compute graph kernels | |||||
if ds_attrs['is_directed']: | |||||
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): | |||||
if e1[2]['cost'] == e2[2]['cost']: | |||||
nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1], | |||||
e2[1])] | |||||
kn1 = nk11 * nk22 | |||||
kernel += kn1 | |||||
else: | |||||
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): | |||||
if e1[2]['cost'] == e2[2]['cost']: | |||||
# each edge walk is counted twice, starting from both its extreme nodes. | |||||
nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[( | |||||
e1[0], e2[1])], vk_dict[(e1[1], | |||||
e2[0])], vk_dict[(e1[1], | |||||
e2[1])] | |||||
kn1 = nk11 * nk22 | |||||
kn2 = nk12 * nk21 | |||||
kernel += kn1 + kn2 | |||||
return kernel | |||||
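# Note: spkernel_do implements the shortest-path kernel of Borgwardt & Kriegel:
# every pair of shortest-path edges with equal length ('cost'), one from each
# SP-transformed graph, contributes the product of the node kernels of its
# endpoints; for undirected graphs both endpoint pairings are counted, hence
# the nk11 * nk22 + nk12 * nk21 term.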
def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr): | |||||
i = itr[0] | |||||
j = itr[1] | |||||
return i, j, spkernel_do(G_gn[i], G_gn[j], ds_attrs, node_label, node_kernels) | |||||
def wrapper_getSPGraph(weight, itr_item): | |||||
g = itr_item[0] | |||||
i = itr_item[1] | |||||
return i, getSPGraph(g, edge_weight=weight) |
@@ -0,0 +1,464 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 23 16:42:48 2018

@author: ljia
"""
import sys | |||||
import time | |||||
from itertools import combinations, product | |||||
from functools import partial | |||||
from multiprocessing import Pool | |||||
from tqdm import tqdm | |||||
import networkx as nx | |||||
import numpy as np | |||||
from pygraph.utils.graphdataset import get_dataset_attributes | |||||
from pygraph.utils.parallel import parallel_gm | |||||
sys.path.insert(0, "../") | |||||
def structuralspkernel(*args, | |||||
node_label='atom', | |||||
edge_weight=None, | |||||
edge_label='bond_type', | |||||
node_kernels=None, | |||||
edge_kernels=None, | |||||
n_jobs=None): | |||||
"""Calculate mean average structural shortest path kernels between graphs. | |||||
Parameters | |||||
---------- | |||||
Gn : List of NetworkX graph | |||||
List of graphs between which the kernels are calculated. | |||||
/ | |||||
G1, G2 : NetworkX graphs | |||||
2 graphs between which the kernel is calculated. | |||||
node_label : string | |||||
node attribute used as label. The default node label is atom. | |||||
edge_weight : string | |||||
Edge attribute name corresponding to the edge weight. | |||||
edge_label : string | |||||
edge attribute used as label. The default edge label is bond_type. | |||||
node_kernels: dict | |||||
A dictionary of kernel functions for nodes, including 3 items: 'symb' | |||||
for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' | |||||
for both labels. The first 2 functions take two node labels as | |||||
parameters, and the 'mix' function takes 4 parameters, a symbolic and a | |||||
non-symbolic label for each of the two nodes. Each label is in the form of a
2-D array (n_samples, n_features). Each function returns a number
as the kernel value. Ignored when nodes are unlabeled. | |||||
edge_kernels: dict | |||||
A dictionary of kernel functions for edges, including 3 items: 'symb' | |||||
for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix' | |||||
for both labels. The first 2 functions take two edge labels as | |||||
parameters, and the 'mix' function takes 4 parameters, a symbolic and a | |||||
non-symbolic label for each of the two edges. Each label is in the form of a
2-D array (n_samples, n_features). Each function returns a number
as the kernel value. Ignored when edges are unlabeled. | |||||
Return | |||||
------ | |||||
Kmatrix : Numpy matrix | |||||
Kernel matrix, each element of which is the mean average structural | |||||
shortest path kernel between 2 graphs.
"""
# pre-process | |||||
Gn = args[0] if len(args) == 1 else [args[0], args[1]] | |||||
weight = None | |||||
if edge_weight is None: | |||||
print('\n None edge weight specified. Set all weight to 1.\n') | |||||
else: | |||||
try: | |||||
some_weight = list( | |||||
nx.get_edge_attributes(Gn[0], edge_weight).values())[0] | |||||
if isinstance(some_weight, (float, int)): | |||||
weight = edge_weight | |||||
else: | |||||
print( | |||||
'\n Edge weight with name %s is not float or integer. Set all weight to 1.\n' | |||||
% edge_weight) | |||||
except: | |||||
print( | |||||
'\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n' | |||||
% edge_weight) | |||||
ds_attrs = get_dataset_attributes( | |||||
Gn, | |||||
attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled', | |||||
'edge_attr_dim', 'is_directed'], | |||||
node_label=node_label, edge_label=edge_label) | |||||
ds_attrs['node_attr_dim'] = 0 | |||||
ds_attrs['edge_attr_dim'] = 0 | |||||
start_time = time.time() | |||||
# get shortest paths of each graph in Gn | |||||
splist = [None] * len(Gn) | |||||
pool = Pool(n_jobs) | |||||
# get shortest path graphs of Gn | |||||
getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed']) | |||||
itr = zip(Gn, range(0, len(Gn))) | |||||
if len(Gn) < 100 * n_jobs: | |||||
chunksize = int(len(Gn) / n_jobs) + 1 | |||||
else: | |||||
chunksize = 100 | |||||
# chunksize = 300 # int(len(list(itr)) / n_jobs) | |||||
for i, sp in tqdm( | |||||
pool.imap_unordered(getsp_partial, itr, chunksize), | |||||
desc='getting shortest paths', | |||||
file=sys.stdout): | |||||
splist[i] = sp | |||||
# time.sleep(10) | |||||
pool.close() | |||||
pool.join() | |||||
# # get shortest paths of each graph in Gn | |||||
# splist = [[] for _ in range(len(Gn))] | |||||
# # get shortest path graphs of Gn | |||||
# getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed']) | |||||
# itr = zip(Gn, range(0, len(Gn))) | |||||
# if len(Gn) < 1000 * n_jobs: | |||||
# chunksize = int(len(Gn) / n_jobs) + 1 | |||||
# else: | |||||
# chunksize = 1000 | |||||
# # chunksize = 300 # int(len(list(itr)) / n_jobs) | |||||
# from contextlib import closing | |||||
# with closing(Pool(n_jobs)) as pool: | |||||
## for i, sp in tqdm( | |||||
# res = pool.imap_unordered(getsp_partial, itr, 10) | |||||
## desc='getting shortest paths', | |||||
## file=sys.stdout): | |||||
## splist[i] = sp | |||||
## time.sleep(10) | |||||
# pool.close() | |||||
# pool.join() | |||||
# ss = 0 | |||||
# ss += sys.getsizeof(splist) | |||||
# for spss in splist: | |||||
# ss += sys.getsizeof(spss) | |||||
# for spp in spss: | |||||
# ss += sys.getsizeof(spp) | |||||
# time.sleep(20) | |||||
# # ---- direct running, normally use single CPU core. ---- | |||||
# splist = [] | |||||
# for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout): | |||||
# splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed'])) | |||||
# # ---- only for the Fast Computation of Shortest Path Kernel (FCSP) | |||||
# sp_ml = [0] * len(Gn) # shortest path matrices | |||||
# for i in result_sp: | |||||
# sp_ml[i[0]] = i[1] | |||||
# edge_x_g = [[] for i in range(len(sp_ml))] | |||||
# edge_y_g = [[] for i in range(len(sp_ml))] | |||||
# edge_w_g = [[] for i in range(len(sp_ml))] | |||||
# for idx, item in enumerate(sp_ml): | |||||
# for i1 in range(len(item)): | |||||
# for i2 in range(i1 + 1, len(item)): | |||||
# if item[i1, i2] != np.inf: | |||||
# edge_x_g[idx].append(i1) | |||||
# edge_y_g[idx].append(i2) | |||||
# edge_w_g[idx].append(item[i1, i2]) | |||||
# print(len(edge_x_g[0])) | |||||
# print(len(edge_y_g[0])) | |||||
# print(len(edge_w_g[0])) | |||||
Kmatrix = np.zeros((len(Gn), len(Gn))) | |||||
# ---- use pool.imap_unordered to parallel and track progress. ---- | |||||
def init_worker(spl_toshare, gs_toshare): | |||||
global G_spl, G_gs | |||||
G_spl = spl_toshare | |||||
G_gs = gs_toshare | |||||
do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, | |||||
node_kernels, edge_kernels) | |||||
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | |||||
glbv=(splist, Gn), n_jobs=n_jobs) | |||||

# # ---- use pool.imap_unordered to parallel and track progress. ----
# pool = Pool(n_jobs)
# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
# node_kernels, edge_kernels)
# itr = zip(combinations_with_replacement(Gn, 2),
# combinations_with_replacement(splist, 2),
# combinations_with_replacement(range(0, len(Gn)), 2))
# len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
# if len_itr < 1000 * n_jobs:
# chunksize = int(len_itr / n_jobs) + 1
# else:
# chunksize = 1000
# for i, j, kernel in tqdm(
# pool.imap_unordered(do_partial, itr, chunksize),
# desc='calculating kernels',
# file=sys.stdout):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel
# pool.close()
# pool.join()
# # ---- use pool.map to parallel. ----
# pool = Pool(n_jobs)
# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
# node_kernels, edge_kernels)
# itr = zip(combinations_with_replacement(Gn, 2),
# combinations_with_replacement(splist, 2),
# combinations_with_replacement(range(0, len(Gn)), 2))
# for i, j, kernel in tqdm(
# pool.map(do_partial, itr), desc='calculating kernels',
# file=sys.stdout):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel
# pool.close()
# pool.join()
# # ---- use pool.imap_unordered to parallel and track progress. ----
# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
# node_kernels, edge_kernels)
# itr = zip(combinations_with_replacement(Gn, 2),
# combinations_with_replacement(splist, 2),
# combinations_with_replacement(range(0, len(Gn)), 2))
# len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
# if len_itr < 1000 * n_jobs:
# chunksize = int(len_itr / n_jobs) + 1
# else:
# chunksize = 1000
# from contextlib import closing
# with closing(Pool(n_jobs)) as pool:
# for i, j, kernel in tqdm(
# pool.imap_unordered(do_partial, itr, 1000),
# desc='calculating kernels',
# file=sys.stdout):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel
# pool.close()
# pool.join()
# # ---- direct running, normally use single CPU core. ----
# from itertools import combinations_with_replacement
# itr = combinations_with_replacement(range(0, len(Gn)), 2)
# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
# kernel = structuralspkernel_do(Gn[i], Gn[j], splist[i], splist[j],
# ds_attrs, node_label, edge_label, node_kernels, edge_kernels)
## if(kernel > 1):
## print("error here ")
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel

    run_time = time.time() - start_time
    print("\n --- structural shortest path kernel matrix of size %d built in %s seconds ---"
          % (len(Gn), run_time))

    return Kmatrix, run_time


def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label,
                          node_kernels, edge_kernels):

    kernel = 0

    # First, compute kernels between all pairs of nodes, using the method
    # borrowed from FCSP.
    vk_dict = {}  # node kernel dict, keyed by pairs of nodes
    if ds_attrs['node_labeled']:
        # nodes are symbolically and non-symbolically labeled
        if ds_attrs['node_attr_dim'] > 0:
            kn = node_kernels['mix']
            for n1, n2 in product(
                    g1.nodes(data=True), g2.nodes(data=True)):
                vk_dict[(n1[0], n2[0])] = kn(
                    n1[1][node_label], n2[1][node_label],
                    n1[1]['attributes'], n2[1]['attributes'])
        # nodes are only symbolically labeled
        else:
            kn = node_kernels['symb']
            for n1 in g1.nodes(data=True):
                for n2 in g2.nodes(data=True):
                    vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
                                                 n2[1][node_label])
    else:
        # nodes are only non-symbolically labeled
        if ds_attrs['node_attr_dim'] > 0:
            kn = node_kernels['nsymb']
            for n1 in g1.nodes(data=True):
                for n2 in g2.nodes(data=True):
                    vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
                                                 n2[1]['attributes'])
        # nodes are unlabeled
        else:
            pass
    # Then, compute kernels between all pairs of edges; the idea is an
    # extension of FCSP. It suits sparse graphs, which is the most common
    # case we encountered. For dense graphs, this would be slow.
    ek_dict = {}  # dict of edge kernels
    if ds_attrs['edge_labeled']:
        # edges are symbolically and non-symbolically labeled
        if ds_attrs['edge_attr_dim'] > 0:
            ke = edge_kernels['mix']
            for e1, e2 in product(
                    g1.edges(data=True), g2.edges(data=True)):
                ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
                             e1[2]['attributes'], e2[2]['attributes'])
                ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
                ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
                ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
                ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
        # edges are only symbolically labeled
        else:
            ke = edge_kernels['symb']
            for e1 in g1.edges(data=True):
                for e2 in g2.edges(data=True):
                    ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
                    ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
                    ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
                    ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
                    ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
    else:
        # edges are only non-symbolically labeled
        if ds_attrs['edge_attr_dim'] > 0:
            ke = edge_kernels['nsymb']
            for e1 in g1.edges(data=True):
                for e2 in g2.edges(data=True):
                    ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
                    ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
                    ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
                    ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
                    ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
        # edges are unlabeled
        else:
            pass
    # compute graph kernels
    if vk_dict:
        if ek_dict:
            for p1, p2 in product(spl1, spl2):
                if len(p1) == len(p2):
                    kpath = vk_dict[(p1[0], p2[0])]
                    if kpath:
                        for idx in range(1, len(p1)):
                            kpath *= vk_dict[(p1[idx], p2[idx])] * \
                                ek_dict[((p1[idx-1], p1[idx]),
                                         (p2[idx-1], p2[idx]))]
                            if not kpath:
                                break
                        kernel += kpath  # add up kernels of all paths
        else:
            for p1, p2 in product(spl1, spl2):
                if len(p1) == len(p2):
                    kpath = vk_dict[(p1[0], p2[0])]
                    if kpath:
                        for idx in range(1, len(p1)):
                            kpath *= vk_dict[(p1[idx], p2[idx])]
                            if not kpath:
                                break
                        kernel += kpath  # add up kernels of all paths
    else:
        if ek_dict:
            for p1, p2 in product(spl1, spl2):
                if len(p1) == len(p2):
                    if len(p1) == 0:
                        kernel += 1
                    else:
                        kpath = 1
                        for idx in range(0, len(p1) - 1):
                            kpath *= ek_dict[((p1[idx], p1[idx+1]),
                                              (p2[idx], p2[idx+1]))]
                            if not kpath:
                                break
                        kernel += kpath  # add up kernels of all paths
        else:
            for p1, p2 in product(spl1, spl2):
                if len(p1) == len(p2):
                    kernel += 1

    kernel = kernel / (len(spl1) * len(spl2))  # mean over all pairs of paths

# # ---- exact implementation of the Fast Computation of Shortest Path
# # Kernel (FCSP), reference [2]; sadly it is slower than the current
# # implementation.
# # compute vertex kernel matrix
# try:
# vk_mat = np.zeros((nx.number_of_nodes(g1),
# nx.number_of_nodes(g2)))
# g1nl = enumerate(g1.nodes(data=True))
# g2nl = enumerate(g2.nodes(data=True))
# for i1, n1 in g1nl:
# for i2, n2 in g2nl:
# vk_mat[i1][i2] = kn(
# n1[1][node_label], n2[1][node_label],
# [n1[1]['attributes']], [n2[1]['attributes']])
# range1 = range(0, len(edge_w_g[i]))
# range2 = range(0, len(edge_w_g[j]))
# for i1 in range1:
# x1 = edge_x_g[i][i1]
# y1 = edge_y_g[i][i1]
# w1 = edge_w_g[i][i1]
# for i2 in range2:
# x2 = edge_x_g[j][i2]
# y2 = edge_y_g[j][i2]
# w2 = edge_w_g[j][i2]
# ke = (w1 == w2)
# if ke > 0:
# kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
# kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
# Kmatrix += kn1 + kn2

    return kernel
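

# A minimal, hand-checkable usage sketch of structuralspkernel_do (added for
# illustration; the graphs, labels and the delta kernel below are hypothetical
# stand-ins, not part of the library):
def _example_structuralspkernel_do():
    import networkx as nx

    def delta(x, y):  # simple symbolic node kernel
        return 1.0 if x == y else 0.0

    g1 = nx.Graph()
    g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'})])
    g1.add_edge(0, 1)
    g2 = nx.Graph()
    g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
    g2.add_edge(0, 1)

    spl1 = get_shortest_paths(g1, None, False)  # [0, 1], [1, 0], [0], [1]
    spl2 = get_shortest_paths(g2, None, False)

    ds_attrs = {'node_labeled': True, 'node_attr_dim': 0,
                'edge_labeled': False, 'edge_attr_dim': 0}
    k = structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, 'atom', None,
                              {'symb': delta}, None)
    # the two length-1 path pairs whose nodes are both 'C' contribute 1 each,
    # every length-2 pair multiplies to 0, so k == 2 / (4 * 4) == 0.125
    return k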


def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels,
                   edge_kernels, itr):
    i = itr[0]
    j = itr[1]
    return i, j, structuralspkernel_do(G_gs[i], G_gs[j], G_spl[i], G_spl[j],
                                       ds_attrs, node_label, edge_label,
                                       node_kernels, edge_kernels)


def get_shortest_paths(G, weight, directed):
    """Get all shortest paths of a graph.

    Parameters
    ----------
    G : NetworkX graph
        The graph whose shortest paths are calculated.
    weight : string/None
        Edge attribute used as the weight when calculating shortest paths.
    directed : boolean
        Whether the graph is directed.

    Returns
    -------
    sp : list of lists
        List of shortest paths of the graph, where each path is represented
        by a list of nodes.
    """
    sp = []
    for n1, n2 in combinations(G.nodes(), 2):
        try:
            spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
        except nx.NetworkXNoPath:  # nodes not connected
            # sp.append([])
            pass
        else:
            sp += spltemp
            # in an undirected graph each path is counted twice, once from
            # each of its end nodes.
            if not directed:
                sp += [sptemp[::-1] for sptemp in spltemp]
    # add single nodes as length-0 paths.
    sp += [[n] for n in G.nodes()]
    return sp
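

# A quick illustrative check of get_shortest_paths (added for clarity; the
# graph below is a hypothetical example, not part of the library):
def _example_get_shortest_paths():
    import networkx as nx
    g = nx.path_graph(3)  # 0 - 1 - 2, undirected, unweighted
    paths = get_shortest_paths(g, None, False)
    # 3 node pairs, each with one shortest path, stored in both directions
    # (6 paths), plus the 3 single-node length-0 paths: 9 entries in total.
    assert len(paths) == 9
    return paths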


def wrapper_getSP(weight, directed, itr_item):
    g = itr_item[0]
    i = itr_item[1]
    return i, get_shortest_paths(g, weight, directed)
@@ -33,8 +33,9 @@ def marginalizedkernel(*args,
                        edge_label='bond_type',
                        p_quit=0.5,
                        n_iteration=20,
-                       remove_totters=True,
-                       n_jobs=None):
+                       remove_totters=False,
+                       n_jobs=None,
+                       verbose=True):
     """Calculate marginalized graph kernels between graphs.
     Parameters
@@ -63,16 +64,18 @@ def marginalizedkernel(*args,
     """
     # pre-process
     n_iteration = int(n_iteration)
-    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
+    Gn = args[0][:] if len(args) == 1 else [args[0].copy(), args[1].copy()]
     ds_attrs = get_dataset_attributes(
         Gn,
         attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
         node_label=node_label, edge_label=edge_label)
-    if not ds_attrs['node_labeled']:
+    if not ds_attrs['node_labeled'] or node_label == None:
+        node_label = 'atom'
         for G in Gn:
             nx.set_node_attributes(G, '0', 'atom')
-    if not ds_attrs['edge_labeled']:
+    if not ds_attrs['edge_labeled'] or edge_label == None:
+        edge_label = 'bond_type'
         for G in Gn:
             nx.set_edge_attributes(G, '0', 'bond_type')
@@ -110,26 +113,26 @@ def marginalizedkernel(*args,
     do_partial = partial(wrapper_marg_do, node_label, edge_label,
                          p_quit, n_iteration)
     parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
-                glbv=(Gn,), n_jobs=n_jobs)
+                glbv=(Gn,), n_jobs=n_jobs, verbose=verbose)
 # # ---- direct running, normally use single CPU core. ----
-# pbar = tqdm(
-# total=(1 + len(Gn)) * len(Gn) / 2,
-# desc='calculating kernels',
-# file=sys.stdout)
+## pbar = tqdm(
+## total=(1 + len(Gn)) * len(Gn) / 2,
+## desc='calculating kernels',
+## file=sys.stdout)
 # for i in range(0, len(Gn)):
 # for j in range(i, len(Gn)):
-# print(i, j)
+## print(i, j)
 # Kmatrix[i][j] = _marginalizedkernel_do(Gn[i], Gn[j], node_label,
 # edge_label, p_quit, n_iteration)
 # Kmatrix[j][i] = Kmatrix[i][j]
-# pbar.update(1)
+## pbar.update(1)
     run_time = time.time() - start_time
-    print(
-        "\n --- marginalized kernel matrix of size %d built in %s seconds ---"
-        % (len(Gn), run_time))
+    if verbose:
+        print("\n --- marginalized kernel matrix of size %d built in %s seconds ---"
+              % (len(Gn), run_time))
     return Kmatrix, run_time
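With the hunks above, the progress bar and the final timing line can be silenced by passing verbose=False; the flag is simply forwarded to parallel_gm. A minimal sketch of the intended call, assuming the function is importable as shown and using made-up toy graphs and parameter values (not taken from the repository):

import networkx as nx
from pygraph.kernels.marginalizedKernel import marginalizedkernel

Gn = [nx.path_graph(4), nx.cycle_graph(5)]  # toy, unlabeled graphs
Kmatrix, run_time = marginalizedkernel(Gn,
                                       node_label=None, edge_label=None,
                                       p_quit=0.5, n_iteration=10,
                                       remove_totters=False,
                                       n_jobs=2, verbose=False)

Passing node_label=None and edge_label=None also exercises the new default-label branch added in the second hunk above.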
@@ -23,7 +23,8 @@ def spkernel(*args,
              node_label='atom',
              edge_weight=None,
              node_kernels=None,
-             n_jobs=None):
+             n_jobs=None,
+             verbose=True):
     """Calculate shortest-path kernels between graphs.
     Parameters
@@ -148,7 +149,7 @@ def spkernel(*args,
         G_gn = gn_toshare
     do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
     parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
-                glbv=(Gn,), n_jobs=n_jobs)
+                glbv=(Gn,), n_jobs=n_jobs, verbose=verbose)
 # # ---- use pool.map to parallel. ----
@@ -31,7 +31,7 @@ def structuralspkernel(*args,
                        edge_label='bond_type',
                        node_kernels=None,
                        edge_kernels=None,
-                       compute_method='trie',
+                       compute_method='naive',
                        n_jobs=None):
     """Calculate mean average structural shortest path kernels between graphs.
@@ -242,7 +242,7 @@ def model_selection_for_precomputed_kernel(datafile,
 # gms_array = Array('d', np.reshape(gram_matrices.copy(), -1, order='C'))
 # pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gms_array, gms_shape))
         pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gram_matrices,))
-        trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, y, model_type)
+        trial_do_partial = partial(parallel_trial_do, param_list_pre_revised, param_list, y, model_type)
         train_pref = []
         val_pref = []
         test_pref = []
@@ -473,7 +473,7 @@ def model_selection_for_precomputed_kernel(datafile,
             G_gms = gms_toshare
         pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gram_matrices,))
-        trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, y, model_type)
+        trial_do_partial = partial(parallel_trial_do, param_list_pre_revised, param_list, y, model_type)
         train_pref = []
         val_pref = []
         test_pref = []
@@ -656,7 +656,7 @@ def model_selection_for_precomputed_kernel(datafile,
         f.write(str_fw + '\n\n\n' + content)
-def trial_do(param_list_pre_revised, param_list, y, model_type, trial): # Test set level
+def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, trial): # Test set level
 # # get gram matrices from global variables.
 # gram_matrices = np.reshape(G_gms.copy(), G_gms_shape, order='C')
@@ -679,7 +679,7 @@ def trial_do(param_list_pre_revised, param_list, y, model_type, trial): # Test s
 # get gram matrices from global variables.
 # gm_now = G_gms[index_out * G_gms_shape[1] * G_gms_shape[2]:(index_out + 1) * G_gms_shape[1] * G_gms_shape[2]]
 # gm_now = np.reshape(gm_now.copy(), (G_gms_shape[1], G_gms_shape[2]), order='C')
-    gm_now = G_gms[index_out].copy()
+    gm_now = gram_matrices[index_out].copy()
 # split gram matrix and y to app and test sets.
     indices = range(len(y))
@@ -822,6 +822,12 @@ def trial_do(param_list_pre_revised, param_list, y, model_type, trial): # Test s
     return train_pref, val_pref, test_pref
+def parallel_trial_do(param_list_pre_revised, param_list, y, model_type, trial):
+    train_pref, val_pref, test_pref = trial_do(param_list_pre_revised,
+                                               param_list, G_gms, y,
+                                               model_type, trial)
+    return train_pref, val_pref, test_pref
 def compute_gram_matrices(dataset, y, estimator, param_list_precomputed,
                           results_dir, ds_name,
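The new parallel_trial_do wrapper exists so the gram matrices are not pickled into every task: the pool is created with initializer=init_worker, initargs=(gram_matrices,), each worker stores them in the global G_gms, and the wrapper forwards that global to trial_do. The same initializer idiom in isolation, as a self-contained sketch with made-up names:

from multiprocessing import Pool

def init_worker(shared):
    global G_shared
    G_shared = shared              # set once per worker process

def task(i):
    return i, len(G_shared[i])    # read the shared object instead of receiving it

if __name__ == '__main__':
    data = [list(range(n)) for n in (3, 5, 7)]
    with Pool(2, initializer=init_worker, initargs=(data,)) as pool:
        print(sorted(pool.map(task, range(len(data)))))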
@@ -11,7 +11,8 @@ from tqdm import tqdm
 import sys
 def parallel_me(func, func_assign, var_to_assign, itr, len_itr=None, init_worker=None,
-                glbv=None, method=None, n_jobs=None, chunksize=None, itr_desc=''):
+                glbv=None, method=None, n_jobs=None, chunksize=None, itr_desc='',
+                verbose=True):
     '''
     '''
     if method == 'imap_unordered':
@@ -29,8 +30,9 @@ def parallel_me(func, func_assign, var_to_assign, itr, len_itr=None, init_worker
                 chunksize = int(len_itr / n_jobs) + 1
             else:
                 chunksize = 100
-            for result in tqdm(pool.imap_unordered(func, itr, chunksize),
-                               desc=itr_desc, file=sys.stdout):
+            for result in (tqdm(pool.imap_unordered(func, itr, chunksize),
+                                desc=itr_desc, file=sys.stdout) if verbose else
+                           pool.imap_unordered(func, itr, chunksize)):
                 func_assign(result, var_to_assign)
     else:
         with Pool(processes=n_jobs) as pool:
@@ -41,14 +43,16 @@ def parallel_me(func, func_assign, var_to_assign, itr, len_itr=None, init_worker
                 chunksize = int(len_itr / n_jobs) + 1
             else:
                 chunksize = 100
-            for result in tqdm(pool.imap_unordered(func, itr, chunksize),
-                               desc=itr_desc, file=sys.stdout):
+            for result in (tqdm(pool.imap_unordered(func, itr, chunksize),
+                                desc=itr_desc, file=sys.stdout) if verbose else
+                           pool.imap_unordered(func, itr, chunksize)):
                 func_assign(result, var_to_assign)
 def parallel_gm(func, Kmatrix, Gn, init_worker=None, glbv=None,
-                method='imap_unordered', n_jobs=None, chunksize=None):
+                method='imap_unordered', n_jobs=None, chunksize=None,
+                verbose=True):
     from itertools import combinations_with_replacement
     def func_assign(result, var_to_assign):
         var_to_assign[result[0]][result[1]] = result[2]
@@ -57,4 +61,4 @@ def parallel_gm(func, Kmatrix, Gn, init_worker=None, glbv=None,
     len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
     parallel_me(func, func_assign, Kmatrix, itr, len_itr=len_itr,
                 init_worker=init_worker, glbv=glbv, method=method, n_jobs=n_jobs,
-                chunksize=chunksize, itr_desc='calculating kernels')
+                chunksize=chunksize, itr_desc='calculating kernels', verbose=verbose)
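The verbose flag only decides whether the imap_unordered iterator is wrapped in tqdm; the results are consumed identically either way. The same pattern in isolation, as a self-contained sketch with made-up names:

import sys
from multiprocessing import Pool
from tqdm import tqdm

def square(x):
    return x * x

def run(jobs, n_jobs=2, chunksize=10, verbose=True):
    results = []
    with Pool(processes=n_jobs) as pool:
        itr = pool.imap_unordered(square, jobs, chunksize)
        # wrap the iterator in tqdm only when a progress bar is wanted
        for r in (tqdm(itr, desc='squaring', file=sys.stdout) if verbose else itr):
            results.append(r)
    return results

if __name__ == '__main__':
    print(sum(run(range(100), verbose=False)))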