
rearrange pygraph/kernels files.

v0.1
jajupmochi, 6 years ago
commit 6b4738df30
18 changed files with 1705 additions and 44 deletions
  1. notebooks/check_gm.py (+2 -2)
  2. notebooks/preimage/results.gm.npz (BIN)
  3. notebooks/run_marginalizedkernel.py (+12 -12)
  4. notebooks/run_structuralspkernel.py (+1 -1)
  5. preimage/preimage.py (+141 -0)
  6. pygraph/kernels/building/cyclicPatternKernel.py (+0 -0)
  7. pygraph/kernels/building/pathKernel.py (+0 -0)
  8. pygraph/kernels/building/treePatternKernel.py (+0 -0)
  9. pygraph/kernels/building/treeletKernel.py (+0 -0)
  10. pygraph/kernels/building/weisfeilerLehmanKernel.py (+0 -0)
  11. pygraph/kernels/else/rwalk_sym.py (+842 -0)
  12. pygraph/kernels/else/sp_sym.py (+200 -0)
  13. pygraph/kernels/else/ssp_sym.py (+464 -0)
  14. pygraph/kernels/marginalizedKernel.py (+18 -15)
  15. pygraph/kernels/spKernel.py (+3 -2)
  16. pygraph/kernels/structuralspKernel.py (+1 -1)
  17. pygraph/utils/model_selection_precomputed.py (+10 -4)
  18. pygraph/utils/parallel.py (+11 -7)

notebooks/check_gm.py (+2 -2)

@@ -12,8 +12,8 @@ import matplotlib.pyplot as plt
from numpy.linalg import eig

# read gram matrices from file.
results_dir = 'results/marginalizedkernel'
ds_name = 'Letter-med'
results_dir = 'results/marginalizedkernel/myria'
ds_name = 'ENZYMES'
gmfile = np.load(results_dir + '/' + ds_name + '.gm.npz')
#print('gm time: ', gmfile['gmtime'])
# a list to store gram matrices for all param_grid_precomputed


notebooks/preimage/results.gm.npz (BIN)


notebooks/run_marginalizedkernel.py (+12 -12)

@@ -12,17 +12,17 @@ import multiprocessing
from pygraph.kernels.marginalizedKernel import marginalizedkernel

dslist = [
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
# 'task': 'regression'}, # node symb
# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', },
# # contains single node graph, node symb
# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb
# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled
# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
# # node nsymb
{'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
'task': 'regression'}, # node symb
{'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', },
# contains single node graph, node symb
{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb
{'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled
{'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
{'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
# node nsymb
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# node symb/nsymb
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
@@ -81,5 +81,5 @@ for ds in dslist:
extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
ds_name=ds['name'],
n_jobs=multiprocessing.cpu_count(),
read_gm_from_file=True)
read_gm_from_file=False)
print()

notebooks/run_structuralspkernel.py (+1 -1)

@@ -65,7 +65,7 @@ param_grid_precomputed = {'node_kernels':
[{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}],
'edge_kernels':
[{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}],
'compute_method': ['trie']}
'compute_method': ['naive']}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]



preimage/preimage.py (+141 -0)

@@ -0,0 +1,141 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 6 16:03:11 2019

pre-image
@author: ljia
"""

import sys
import numpy as np
import multiprocessing
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt


sys.path.insert(0, "../")
from pygraph.kernels.marginalizedKernel import marginalizedkernel
from pygraph.utils.graphfiles import loadDataset


ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb

DN, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
DN = DN[0:10]

lmbda = 0.03 # termination probability
r_max = 10 # recursions
l = 500
alpha_range = np.linspace(0.1, 0.9, 9)
k = 5 # k nearest neighbors

# randomly select two molecules
np.random.seed(1)
idx1, idx2 = np.random.randint(0, len(DN), 2)
g1 = DN[idx1]
g2 = DN[idx2]

# compute
k_list = [] # kernel between each graph and itself.
k_g1_list = [] # kernel between each graph and g1
k_g2_list = [] # kernel between each graph and g2
for ig, g in tqdm(enumerate(DN), desc='computing self kernels', file=sys.stdout):
ktemp = marginalizedkernel([g, g1, g2], node_label='atom', edge_label=None,
p_quit=lmbda, n_iteration=20, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=False)
k_list.append(ktemp[0][0, 0])
k_g1_list.append(ktemp[0][0, 1])
k_g2_list.append(ktemp[0][0, 2])

g_best = []
dis_best = []
# for each alpha
for alpha in alpha_range:
print('alpha =', alpha)
# compute k nearest neighbors of phi in DN.
dis_list = [] # distance between g_star and each graph.
for ig, g in tqdm(enumerate(DN), desc='computing distances', file=sys.stdout):
dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
dis_list.append(dtemp)
# sort
sort_idx = np.argsort(dis_list)
dis_gs = [dis_list[idis] for idis in sort_idx[0:k]]
g0hat = DN[sort_idx[0]] # the nearest neighbor of phi in DN
if dis_gs[0] == 0: # the exact pre-image.
print('The exact pre-image is found from the input dataset.')
g_pimg = g0hat
break
dhat = dis_gs[0] # the nearest distance
Dk = [DN[ig] for ig in sort_idx[0:k]] # the k nearest neighbors
gihat_list = []
i = 1
r = 1
while r < r_max:
print('r =', r)
found = False
for ig, gs in enumerate(Dk + gihat_list):
# nx.draw_networkx(gs)
# plt.show()
fdgs = int(np.abs(np.ceil(np.log(alpha * dis_gs[ig])))) # @todo ???
for trial in tqdm(range(0, l), desc='l loop', file=sys.stdout):
# add and delete edges.
gtemp = gs.copy()
np.random.seed()
# which edges to change.
idx_change = np.random.randint(0, nx.number_of_nodes(gs) *
(nx.number_of_nodes(gs) - 1), fdgs)
for item in idx_change:
node1 = int(item / (nx.number_of_nodes(gs) - 1))
node2 = (item - node1 * (nx.number_of_nodes(gs) - 1))
if node2 >= node1:
node2 += 1
# @todo: is the randomness correct?
if not gtemp.has_edge(node1, node2):
gtemp.add_edges_from([(node1, node2, {'bond_type': 0})])
# nx.draw_networkx(gs)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
else:
gtemp.remove_edge(node1, node2)
# nx.draw_networkx(gs)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
# compute distance between phi and the new generated graph.
knew = marginalizedkernel([gtemp, g1, g2], node_label='atom', edge_label=None,
p_quit=lmbda, n_iteration=20, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=False)
dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) *
knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
if dnew <= dhat: # the new distance is smaller
print('I am smaller!')
dhat = dnew
gnew = gtemp.copy()
found = True # found better graph.
if found:
gihat_list = [gnew]
dis_gs.append(dhat)
else:
r += 1
dis_best.append(dhat)
g_best += ([g0hat] if len(gihat_list) == 0 else gihat_list)

for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_best[idx])
print('the corresponding pre-image is')
nx.draw_networkx(g_best[idx])
plt.show()
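
The loop above minimizes the squared RKHS distance between phi = alpha*phi(g1) + (1-alpha)*phi(g2) and phi(g); expanding <phi, phi> by bilinearity turns every term into a plain kernel evaluation, which is what dtemp and dnew compute. A minimal restatement of that expansion (hypothetical helper, not part of the commit):

def dist_sq(a, k_gg, k_gg1, k_gg2, k_g1g1, k_g1g2, k_g2g2):
    # d^2(phi, g) = <phi(g), phi(g)> - 2<phi(g), phi> + <phi, phi>,
    # with every inner product replaced by a kernel evaluation.
    cross = a * k_gg1 + (1 - a) * k_gg2              # <phi(g), phi>
    self_term = (a * a * k_g1g1                      # <phi, phi>
                 + 2 * a * (1 - a) * k_g1g2
                 + (1 - a) * (1 - a) * k_g2g2)
    return k_gg - 2 * cross + self_term

Note that in the script k_g2_list[idx1] and k_g1_list[idx2] are both k(g1, g2), so the two middle terms of the self part collapse into the single 2*alpha*(1-alpha)*k(g1, g2) term used here.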

pygraph/kernels/cyclicPatternKernel.py → pygraph/kernels/building/cyclicPatternKernel.py

pygraph/kernels/pathKernel.py → pygraph/kernels/building/pathKernel.py

pygraph/kernels/treePatternKernel.py → pygraph/kernels/building/treePatternKernel.py

pygraph/kernels/treeletKernel.py → pygraph/kernels/building/treeletKernel.py

pygraph/kernels/weisfeilerLehmanKernel.py → pygraph/kernels/building/weisfeilerLehmanKernel.py


pygraph/kernels/else/rwalk_sym.py (+842 -0)

@@ -0,0 +1,842 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 23 16:53:57 2018

@author: ljia
@references: S. V. N. Vishwanathan, Nicol N. Schraudolph, Risi Kondor, and
Karsten M. Borgwardt. Graph kernels. Journal of Machine Learning Research,
11(Apr):1201–1242, 2010.
"""

import sys
sys.path.insert(0, "../")
import time
from functools import partial
from tqdm import tqdm

import networkx as nx
import numpy as np
from scipy.sparse import identity, kron
from scipy.sparse.linalg import cg
from scipy.optimize import fixed_point

from pygraph.utils.graphdataset import get_dataset_attributes
from pygraph.utils.parallel import parallel_gm

def randomwalkkernel(*args,
# params for all method.
compute_method=None,
weight=1,
p=None,
q=None,
edge_weight=None,
# params for conjugate and fp method.
node_kernels=None,
edge_kernels=None,
node_label='atom',
edge_label='bond_type',
# params for spectral method.
sub_kernel=None,
n_jobs=None):
"""Calculate random walk graph kernels.
Parameters
----------
Gn : List of NetworkX graph
List of graphs between which the kernels are calculated.
/
G1, G2 : NetworkX graphs
2 graphs between which the kernel is calculated.
node_label : string
node attribute used as label. The default node label is atom.
edge_label : string
edge attribute used as label. The default edge label is bond_type.
weight : float
Weight (decay) coefficient of the walks.
compute_method : string
Method used to compute the random walk kernel. Available methods are 'sylvester', 'conjugate', 'fp', 'spectral' and 'kron'.

Return
------
Kmatrix : Numpy matrix
Kernel matrix, each element of which is the random walk kernel between 2 graphs.
"""
compute_method = compute_method.lower()
Gn = args[0] if len(args) == 1 else [args[0], args[1]]

eweight = None
if edge_weight == None:
print('\n None edge weight specified. Set all weight to 1.\n')
else:
try:
some_weight = list(
nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
if isinstance(some_weight, float) or isinstance(some_weight, int):
eweight = edge_weight
else:
print(
'\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
% edge_weight)
except:
print(
'\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
% edge_weight)

ds_attrs = get_dataset_attributes(
Gn,
attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
'edge_attr_dim', 'is_directed'],
node_label=node_label,
edge_label=edge_label)
ds_attrs['node_attr_dim'] = 0
ds_attrs['edge_attr_dim'] = 0
# remove graphs with no edges, as no walk can be found in their structures,
# so the weight matrix between such a graph and itself might be zero.
len_gn = len(Gn)
Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
idx = [G[0] for G in Gn]
Gn = [G[1] for G in Gn]
if len(Gn) != len_gn:
print('\n %d graphs are removed as they don\'t contain edges.\n' %
(len_gn - len(Gn)))

start_time = time.time()
# # get vertex and edge concatenated labels for each graph
# label_list, d = getLabels(Gn, node_label, edge_label, ds_attrs['is_directed'])
# gmf = filterGramMatrix(A_wave_list[0], label_list[0], ('C', '0', 'O'), ds_attrs['is_directed'])

if compute_method == 'sylvester':
import warnings
warnings.warn('All labels are ignored.')
Kmatrix = _sylvester_equation(Gn, weight, p, q, eweight, n_jobs)

elif compute_method == 'conjugate':
Kmatrix = _conjugate_gradient(Gn, weight, p, q, ds_attrs,
node_kernels, edge_kernels,
node_label, edge_label, eweight, n_jobs)
elif compute_method == 'fp':
Kmatrix = _fixed_point(Gn, weight, p, q, ds_attrs, node_kernels,
edge_kernels, node_label, edge_label,
eweight, n_jobs)

elif compute_method == 'spectral':
import warnings
warnings.warn('All labels are ignored. Only works for undirected graphs.')
Kmatrix = _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs)

elif compute_method == 'kron':
Kmatrix = np.zeros((len(Gn), len(Gn)))  # initialize the kernel matrix for this branch
for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
Kmatrix[i][j] = _randomwalkkernel_kron(Gn[i], Gn[j],
node_label, edge_label)
Kmatrix[j][i] = Kmatrix[i][j]
else:
raise Exception(
'compute method name incorrect. Available methods: "sylvester", "conjugate", "fp", "spectral" and "kron".'
)

run_time = time.time() - start_time
print(
"\n --- kernel matrix of random walk kernel of size %d built in %s seconds ---"
% (len(Gn), run_time))

return Kmatrix, run_time, idx


###############################################################################
def _sylvester_equation(Gn, lmda, p, q, eweight, n_jobs):
"""Calculate walk graph kernels up to n between 2 graphs using Sylvester method.

Parameters
----------
G1, G2 : NetworkX graph
Graphs between which the kernel is calculated.
node_label : string
node attribute used as label.
edge_label : string
edge attribute used as label.

Return
------
kernel : float
Kernel between 2 graphs.
"""
Kmatrix = np.zeros((len(Gn), len(Gn)))

if q == None:
# don't normalize adjacency matrices if q is a uniform vector. Note
# A_wave_list actually contains the transposes of the adjacency matrices.
A_wave_list = [
nx.adjacency_matrix(G, eweight).todense().transpose() for G in tqdm(
Gn, desc='compute adjacency matrices', file=sys.stdout)
]
# # normalized adjacency matrices
# A_wave_list = []
# for G in tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout):
# A_tilde = nx.adjacency_matrix(G, eweight).todense().transpose()
# norm = A_tilde.sum(axis=0)
# norm[norm == 0] = 1
# A_wave_list.append(A_tilde / norm)
if p == None: # p is uniform distribution as default.
def init_worker(Awl_toshare):
global G_Awl
G_Awl = Awl_toshare
do_partial = partial(wrapper_se_do, lmda)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(A_wave_list,), n_jobs=n_jobs)
# pbar = tqdm(
# total=(1 + len(Gn)) * len(Gn) / 2,
# desc='calculating kernels',
# file=sys.stdout)
# for i in range(0, len(Gn)):
# for j in range(i, len(Gn)):
# S = lmda * A_wave_list[j]
# T_t = A_wave_list[i]
# # use uniform distribution if there is no prior knowledge.
# nb_pd = len(A_wave_list[i]) * len(A_wave_list[j])
# p_times_uni = 1 / nb_pd
# M0 = np.full((len(A_wave_list[j]), len(A_wave_list[i])), p_times_uni)
# X = dlyap(S, T_t, M0)
# X = np.reshape(X, (-1, 1), order='F')
# # use uniform distribution if there is no prior knowledge.
# q_times = np.full((1, nb_pd), p_times_uni)
# Kmatrix[i][j] = np.dot(q_times, X)
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)

return Kmatrix


def wrapper_se_do(lmda, itr):
i = itr[0]
j = itr[1]
return i, j, _se_do(G_Awl[i], G_Awl[j], lmda)


def _se_do(A_wave1, A_wave2, lmda):
from control import dlyap
S = lmda * A_wave2
T_t = A_wave1
# use uniform distribution if there is no prior knowledge.
nb_pd = len(A_wave1) * len(A_wave2)
p_times_uni = 1 / nb_pd
M0 = np.full((len(A_wave2), len(A_wave1)), p_times_uni)
X = dlyap(S, T_t, M0)
X = np.reshape(X, (-1, 1), order='F')
# use uniform distribution if there is no prior knowledge.
q_times = np.full((1, nb_pd), p_times_uni)
return np.dot(q_times, X)
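
_se_do hands the computation to dlyap from the control package. The reason a discrete Sylvester/Lyapunov solver yields a walk kernel is the vec-trick, which turns the huge Kronecker-product linear system on the direct product graph into a small matrix equation. A self-contained numpy check of the underlying identity (not part of the commit):

import numpy as np

# vec-trick: vec(B @ X @ A) == kron(A.T, B) @ vec(X) for column-major vec.
rng = np.random.default_rng(0)
A, B, X = rng.random((3, 3)), rng.random((4, 4)), rng.random((4, 3))
lhs = (B @ X @ A).reshape(-1, 1, order='F')
rhs = np.kron(A.T, B) @ X.reshape(-1, 1, order='F')
assert np.allclose(lhs, rhs)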


###############################################################################
def _conjugate_gradient(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels,
node_label, edge_label, eweight, n_jobs):
"""Calculate walk graph kernels up to n between 2 graphs using conjugate method.

Parameters
----------
G1, G2 : NetworkX graph
Graphs between which the kernel is calculated.
node_label : string
node attribute used as label.
edge_label : string
edge attribute used as label.

Return
------
kernel : float
Kernel between 2 graphs.
"""
Kmatrix = np.zeros((len(Gn), len(Gn)))
# if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \
# not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] < 1:
# # this is faster from unlabeled graphs. @todo: why?
# if q == None:
# # don't normalize adjacency matrices if q is a uniform vector. Note
# # A_wave_list accually contains the transposes of the adjacency matrices.
# A_wave_list = [
# nx.adjacency_matrix(G, eweight).todense().transpose() for G in
# tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout)
# ]
# if p == None: # p is uniform distribution as default.
# def init_worker(Awl_toshare):
# global G_Awl
# G_Awl = Awl_toshare
# do_partial = partial(wrapper_cg_unlabled_do, lmda)
# parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
# glbv=(A_wave_list,), n_jobs=n_jobs)
# else:
# reindex nodes using consecutive integers for convenience of kernel calculation.
Gn = [nx.convert_node_labels_to_integers(
g, first_label=0, label_attribute='label_original') for g in tqdm(
Gn, desc='reindex vertices', file=sys.stdout)]
if p == None and q == None: # p and q are uniform distributions as default.
def init_worker(gn_toshare):
global G_gn
G_gn = gn_toshare
do_partial = partial(wrapper_cg_labled_do, ds_attrs, node_kernels,
node_label, edge_kernels, edge_label, lmda)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(Gn,), n_jobs=n_jobs)
# pbar = tqdm(
# total=(1 + len(Gn)) * len(Gn) / 2,
# desc='calculating kernels',
# file=sys.stdout)
# for i in range(0, len(Gn)):
# for j in range(i, len(Gn)):
# result = _cg_labled_do(Gn[i], Gn[j], ds_attrs, node_kernels,
# node_label, edge_kernels, edge_label, lmda)
# Kmatrix[i][j] = result
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)
return Kmatrix


def wrapper_cg_unlabled_do(lmda, itr):
i = itr[0]
j = itr[1]
return i, j, _cg_unlabled_do(G_Awl[i], G_Awl[j], lmda)


def _cg_unlabled_do(A_wave1, A_wave2, lmda):
nb_pd = len(A_wave1) * len(A_wave2)
p_times_uni = 1 / nb_pd
w_times = kron(A_wave1, A_wave2).todense()
A = identity(w_times.shape[0]) - w_times * lmda
b = np.full((nb_pd, 1), p_times_uni)
x, _ = cg(A, b)
# use uniform distribution if there is no prior knowledge.
q_times = np.full((1, nb_pd), p_times_uni)
return np.dot(q_times, x)


def wrapper_cg_labled_do(ds_attrs, node_kernels, node_label, edge_kernels,
edge_label, lmda, itr):
i = itr[0]
j = itr[1]
return i, j, _cg_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels,
node_label, edge_kernels, edge_label, lmda)


def _cg_labled_do(g1, g2, ds_attrs, node_kernels, node_label,
edge_kernels, edge_label, lmda):
# First, compute kernels between all pairs of nodes, a method borrowed
# from FCSP. It is faster than directly computing all edge kernels
# when $d_1 d_2 > 2$, where $d_1$ and $d_2$ are the vertex degrees of the
# graphs compared, which covers most of the cases we encountered. For very
# sparse graphs, this would be slow.
vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label)
# Compute weight matrix of the direct product graph.
w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs,
edge_kernels, edge_label)
# use uniform distribution if there is no prior knowledge.
p_times_uni = 1 / w_dim
A = identity(w_times.shape[0]) - w_times * lmda
b = np.full((w_dim, 1), p_times_uni)
x, _ = cg(A, b)
# use uniform distribution if there is no prior knowledge.
q_times = np.full((1, w_dim), p_times_uni)
return np.dot(q_times, x)
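
_cg_labled_do solves (I - lmda*W) x = p with conjugate gradient; the solution equals the geometric walk series sum_k (lmda*W)^k p whenever the series converges, which is where the walk counting enters. A toy verification (a symmetric W and a small lmda are assumptions that keep cg applicable and the series convergent):

import numpy as np
from scipy.sparse.linalg import cg

rng = np.random.default_rng(1)
M = rng.random((6, 6))
W = (M + M.T) / 2                        # symmetric, like w_times here
lmda = 0.5 / np.linalg.norm(W, 2)        # ensures convergence
p = np.full(6, 1 / 6)                    # uniform start distribution
x, info = cg(np.eye(6) - lmda * W, p)
series = sum(np.linalg.matrix_power(lmda * W, k) @ p for k in range(300))
assert info == 0 and np.allclose(x, series, atol=1e-4)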


###############################################################################
def _fixed_point(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels,
node_label, edge_label, eweight, n_jobs):
"""Calculate walk graph kernels up to n between 2 graphs using Fixed-Point method.

Parameters
----------
G1, G2 : NetworkX graph
Graphs between which the kernel is calculated.
node_label : string
node attribute used as label.
edge_label : string
edge attribute used as label.

Return
------
kernel : float
Kernel between 2 graphs.
"""

Kmatrix = np.zeros((len(Gn), len(Gn)))
# if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \
# not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] > 1:
# # this is faster from unlabeled graphs. @todo: why?
# if q == None:
# # don't normalize adjacency matrices if q is a uniform vector. Note
# # A_wave_list accually contains the transposes of the adjacency matrices.
# A_wave_list = [
# nx.adjacency_matrix(G, eweight).todense().transpose() for G in
# tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout)
# ]
# if p == None: # p is uniform distribution as default.
# pbar = tqdm(
# total=(1 + len(Gn)) * len(Gn) / 2,
# desc='calculating kernels',
# file=sys.stdout)
# for i in range(0, len(Gn)):
# for j in range(i, len(Gn)):
# # use uniform distribution if there is no prior knowledge.
# nb_pd = len(A_wave_list[i]) * len(A_wave_list[j])
# p_times_uni = 1 / nb_pd
# w_times = kron(A_wave_list[i], A_wave_list[j]).todense()
# p_times = np.full((nb_pd, 1), p_times_uni)
# x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times))
# # use uniform distribution if there is no prior knowledge.
# q_times = np.full((1, nb_pd), p_times_uni)
# Kmatrix[i][j] = np.dot(q_times, x)
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)
# else:
# reindex nodes using consecutive integers for convenience of kernel calculation.
Gn = [nx.convert_node_labels_to_integers(
g, first_label=0, label_attribute='label_original') for g in tqdm(
Gn, desc='reindex vertices', file=sys.stdout)]
if p == None and q == None: # p and q are uniform distributions as default.
def init_worker(gn_toshare):
global G_gn
G_gn = gn_toshare
do_partial = partial(wrapper_fp_labled_do, ds_attrs, node_kernels,
node_label, edge_kernels, edge_label, lmda)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(Gn,), n_jobs=n_jobs)
return Kmatrix


def wrapper_fp_labled_do(ds_attrs, node_kernels, node_label, edge_kernels,
edge_label, lmda, itr):
i = itr[0]
j = itr[1]
return i, j, _fp_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels,
node_label, edge_kernels, edge_label, lmda)


def _fp_labled_do(g1, g2, ds_attrs, node_kernels, node_label,
edge_kernels, edge_label, lmda):
# First, compute kernels between all pairs of nodes, a method borrowed
# from FCSP. It is faster than directly computing all edge kernels
# when $d_1 d_2 > 2$, where $d_1$ and $d_2$ are the vertex degrees of the
# graphs compared, which covers most of the cases we encountered. For very
# sparse graphs, this would be slow.
vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label)
# Compute weight matrix of the direct product graph.
w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs,
edge_kernels, edge_label)
# use uniform distribution if there is no prior knowledge.
p_times_uni = 1 / w_dim
p_times = np.full((w_dim, 1), p_times_uni)
x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times),
xtol=1e-06, maxiter=1000)
# use uniform distribution if there is no prior knowledge.
q_times = np.full((1, w_dim), p_times_uni)
return np.dot(q_times, x)


def func_fp(x, p_times, lmda, w_times):
return p_times + lmda * np.dot(w_times, x)


###############################################################################
def _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs):
"""Calculate walk graph kernels up to n between 2 unlabeled graphs using
spectral decomposition method. Labels will be ignored.

Parameters
----------
G1, G2 : NetworkX graph
Graphs between which the kernel is calculated.
node_label : string
node attribute used as label.
edge_label : string
edge attribute used as label.

Return
------
kernel : float
Kernel between 2 graphs.
"""
Kmatrix = np.zeros((len(Gn), len(Gn)))

if q == None:
# precompute the spectral decomposition of each graph.
P_list = []
D_list = []
for G in tqdm(Gn, desc='spectral decompose', file=sys.stdout):
# don't normalize adjacency matrices if q is a uniform vector. Note
# A actually is the transpose of the adjacency matrix.
A = nx.adjacency_matrix(G, eweight).todense().transpose()
ew, ev = np.linalg.eig(A)
D_list.append(ew)
P_list.append(ev)
# P_inv_list = [p.T for p in P_list] # @todo: also works for directed graphs?

if p == None: # p is uniform distribution as default.
q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in Gn]
# q_T_list = [q.T for q in q_list]
def init_worker(q_T_toshare, P_toshare, D_toshare):
global G_q_T, G_P, G_D
G_q_T = q_T_toshare
G_P = P_toshare
G_D = D_toshare
do_partial = partial(wrapper_sd_do, weight, sub_kernel)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(q_T_list, P_list, D_list), n_jobs=n_jobs)
# pbar = tqdm(
# total=(1 + len(Gn)) * len(Gn) / 2,
# desc='calculating kernels',
# file=sys.stdout)
# for i in range(0, len(Gn)):
# for j in range(i, len(Gn)):
# result = _sd_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j],
# D_list[i], D_list[j], weight, sub_kernel)
# Kmatrix[i][j] = result
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)
return Kmatrix


def wrapper_sd_do(weight, sub_kernel, itr):
i = itr[0]
j = itr[1]
return i, j, _sd_do(G_q_T[i], G_q_T[j], G_P[i], G_P[j], G_D[i], G_D[j],
weight, sub_kernel)


def _sd_do(q_T1, q_T2, P1, P2, D1, D2, weight, sub_kernel):
# use uniform distribution if there is no prior knowledge.
kl = kron(np.dot(q_T1, P1), np.dot(q_T2, P2)).todense()
# @todo: this may not be needed when p = q (kr = kl.T) for undirected graphs
# kr = kron(np.dot(P_inv_list[i], q_list[i]), np.dot(P_inv_list[j], q_list[j])).todense()
if sub_kernel == 'exp':
D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2])
kmiddle = np.diag(np.exp(weight * D_diag))
elif sub_kernel == 'geo':
D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2])
kmiddle = np.diag(weight * D_diag)
kmiddle = np.identity(len(kmiddle)) - weight * kmiddle
kmiddle = np.linalg.inv(kmiddle)
return np.dot(np.dot(kl, kmiddle), kl.T)[0, 0]
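
The 'geo' branch of _sd_do exploits the fact that, after eigendecomposition, applying the geometric series (I - w*A)^{-1} only requires inverting a diagonal. A scalar-level numpy check of that identity (self-contained; p = q uniform, as in the code above):

import numpy as np

rng = np.random.default_rng(2)
M = rng.random((5, 5))
A = (M + M.T) / 2                        # symmetric adjacency-like matrix
D, P = np.linalg.eigh(A)                 # A == P @ diag(D) @ P.T
w = 0.5 / np.abs(D).max()                # keep 1 - w*D away from zero
q = np.full(5, 1 / 5)
direct = q @ np.linalg.inv(np.eye(5) - w * A) @ q
via_eig = (q @ P) @ np.diag(1 / (1 - w * D)) @ (P.T @ q)
assert np.allclose(direct, via_eig)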


###############################################################################
def _randomwalkkernel_kron(G1, G2, node_label, edge_label):
"""Calculate walk graph kernels up to n between 2 graphs using nearest Kronecker product approximation method.

Parameters
----------
G1, G2 : NetworkX graph
Graphs between which the kernel is calculated.
node_label : string
node attribute used as label.
edge_label : string
edge attribute used as label.

Return
------
kernel : float
Kernel between 2 graphs.
"""
pass  # not implemented yet


###############################################################################
def getLabels(Gn, node_label, edge_label, directed):
"""Get symbolic labels of a graph dataset, where vertex labels are dealt
with by concatenating them to the edge labels of adjacent edges.
"""
label_list = []
label_set = set()
for g in Gn:
label_g = {}
for e in g.edges(data=True):
nl1 = g.node[e[0]][node_label]
nl2 = g.node[e[1]][node_label]
if not directed and nl1 > nl2:
nl1, nl2 = nl2, nl1
label = (nl1, e[2][edge_label], nl2)
label_g[(e[0], e[1])] = label
label_list.append(label_g)
label_set = set([l for lg in label_list for l in lg.values()])
return label_list, len(label_set)


def filterGramMatrix(gmt, label_dict, label, directed):
"""Compute (the transpose of) the Gram matrix filtered by a label.
"""
gmf = np.zeros(gmt.shape)
for (n1, n2), l in label_dict.items():
if l == label:
gmf[n2, n1] = gmt[n2, n1]
if not directed:
gmf[n1, n2] = gmt[n1, n2]
return gmf


def computeVK(g1, g2, ds_attrs, node_kernels, node_label):
'''Compute vertex kernels between vertices of two graphs.
'''
vk_dict = {} # vertex kernel dict
if ds_attrs['node_labeled']:
# node symb and non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['mix']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(
n1[1][node_label], n2[1][node_label],
n1[1]['attributes'], n2[1]['attributes'])
# node symb labeled
else:
kn = node_kernels['symb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
n2[1][node_label])
else:
# node non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['nsymb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
n2[1]['attributes'])
# node unlabeled
else:
pass
return vk_dict


def computeW(g1, g2, vk_dict, ds_attrs, edge_kernels, edge_label):
'''Compute weight matrix of the direct product graph.
'''
w_dim = nx.number_of_nodes(g1) * nx.number_of_nodes(g2)
w_times = np.zeros((w_dim, w_dim))
if vk_dict: # node labeled
if ds_attrs['is_directed']:
if ds_attrs['edge_labeled']:
# edge symb and non-symb labeled
if ds_attrs['edge_attr_dim'] > 0:
ke = edge_kernels['mix']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
e1[2]['attributes'], e2[2]['attributes'])
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
* ek_temp * vk_dict[(e1[1], e2[1])]
# edge symb labeled
else:
ke = edge_kernels['symb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
* ek_temp * vk_dict[(e1[1], e2[1])]
else:
# edge non-symb labeled
if ds_attrs['edge_attr_dim'] > 0:
ke = edge_kernels['nsymb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
* ek_temp * vk_dict[(e1[1], e2[1])]
# edge unlabeled
else:
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
* vk_dict[(e1[1], e2[1])]
else: # undirected
if ds_attrs['edge_labeled']:
# edge symb and non-symb labeled
if ds_attrs['edge_attr_dim'] > 0:
ke = edge_kernels['mix']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
e1[2]['attributes'], e2[2]['attributes'])
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
* ek_temp * vk_dict[(e1[1], e2[1])] \
+ vk_dict[(e1[0], e2[1])] \
* ek_temp * vk_dict[(e1[1], e2[0])]
w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
e1[1] * nx.number_of_nodes(g2) + e2[0])
w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
# edge symb labeled
else:
ke = edge_kernels['symb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
* ek_temp * vk_dict[(e1[1], e2[1])] \
+ vk_dict[(e1[0], e2[1])] \
* ek_temp * vk_dict[(e1[1], e2[0])]
w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
e1[1] * nx.number_of_nodes(g2) + e2[0])
w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
else:
# edge non-symb labeled
if ds_attrs['edge_attr_dim'] > 0:
ke = edge_kernels['nsymb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
* ek_temp * vk_dict[(e1[1], e2[1])] \
+ vk_dict[(e1[0], e2[1])] \
* ek_temp * vk_dict[(e1[1], e2[0])]
w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
e1[1] * nx.number_of_nodes(g2) + e2[0])
w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
# edge unlabeled
else:
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
* vk_dict[(e1[1], e2[1])] \
+ vk_dict[(e1[0], e2[1])] \
* vk_dict[(e1[1], e2[0])]
w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
e1[1] * nx.number_of_nodes(g2) + e2[0])
w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
else: # node unlabeled
if ds_attrs['is_directed']:
if ds_attrs['edge_labeled']:
# edge symb and non-symb labeled
if ds_attrs['edge_attr_dim'] > 0:
ke = edge_kernels['mix']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
e1[2]['attributes'], e2[2]['attributes'])
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = ek_temp
# edge symb labeled
else:
ke = edge_kernels['symb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = ek_temp
else:
# edge non-symb labeled
if ds_attrs['edge_attr_dim'] > 0:
ke = edge_kernels['nsymb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = ek_temp
# edge unlabeled
else:
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = 1
else: # undirected
if ds_attrs['edge_labeled']:
# edge symb and non-symb labeled
if ds_attrs['edge_attr_dim'] > 0:
ke = edge_kernels['mix']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
e1[2]['attributes'], e2[2]['attributes'])
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = ek_temp
w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
e1[1] * nx.number_of_nodes(g2) + e2[0])
w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
# edge symb labeled
else:
ke = edge_kernels['symb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = ek_temp
w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
e1[1] * nx.number_of_nodes(g2) + e2[0])
w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
else:
# edge non-symb labeled
if ds_attrs['edge_attr_dim'] > 0:
ke = edge_kernels['nsymb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = ek_temp
w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
e1[1] * nx.number_of_nodes(g2) + e2[0])
w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
# edge unlabeled
else:
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = 1
w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
e1[1] * nx.number_of_nodes(g2) + e2[0])
w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
return w_times, w_dim
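
Throughout computeW, the product-graph vertex pair (u, v) is flattened to the index u * |V2| + v, which matches numpy's Kronecker block layout. A quick check of that correspondence (not from the commit):

import numpy as np

# kron(A1, A2)[u1*n2 + u2, v1*n2 + v2] == A1[u1, v1] * A2[u2, v2]
rng = np.random.default_rng(3)
A1, A2 = rng.random((3, 3)), rng.random((4, 4))
n2 = A2.shape[0]
K = np.kron(A1, A2)
u1, v1, u2, v2 = 2, 1, 3, 0
assert np.isclose(K[u1 * n2 + u2, v1 * n2 + v2], A1[u1, v1] * A2[u2, v2])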

pygraph/kernels/else/sp_sym.py (+200 -0)

@@ -0,0 +1,200 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 21 18:02:00 2018

@author: ljia
"""

import sys
import time
from itertools import product
from functools import partial
from multiprocessing import Pool
from tqdm import tqdm

import networkx as nx
import numpy as np

from pygraph.utils.utils import getSPGraph
from pygraph.utils.graphdataset import get_dataset_attributes
from pygraph.utils.parallel import parallel_gm
sys.path.insert(0, "../")

def spkernel(*args,
node_label='atom',
edge_weight=None,
node_kernels=None,
n_jobs=None):
"""Calculate shortest-path kernels between graphs.

Parameters
----------
Gn : List of NetworkX graph
List of graphs between which the kernels are calculated.
/
G1, G2 : NetworkX graphs
2 graphs between which the kernel is calculated.
node_label : string
node attribute used as label. The default node label is atom.
edge_weight : string
Edge attribute name corresponding to the edge weight.
node_kernels: dict
A dictionary of kernel functions for nodes, including 3 items: 'symb'
for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
for both labels. The first 2 functions take two node labels as
parameters, and the 'mix' function takes 4 parameters, a symbolic and a
non-symbolic label for each of the two nodes. Each label is a 2-D
array of shape (n_samples, n_features). Each function returns a
number as the kernel value. Ignored when nodes are unlabeled.

Return
------
Kmatrix : Numpy matrix
Kernel matrix, each element of which is the sp kernel between 2 graphs.
"""
# pre-process
Gn = args[0] if len(args) == 1 else [args[0], args[1]]
weight = None
if edge_weight is None:
print('\n None edge weight specified. Set all weight to 1.\n')
else:
try:
some_weight = list(
nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
if isinstance(some_weight, (float, int)):
weight = edge_weight
else:
print(
'\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
% edge_weight)
except:
print(
'\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
% edge_weight)
ds_attrs = get_dataset_attributes(
Gn,
attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
node_label=node_label)
ds_attrs['node_attr_dim'] = 0

# remove graphs with no edges, as no sp can be found in their structures,
# so the kernel between such a graph and itself will be zero.
len_gn = len(Gn)
Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
idx = [G[0] for G in Gn]
Gn = [G[1] for G in Gn]
if len(Gn) != len_gn:
print('\n %d graphs are removed as they don\'t contain edges.\n' %
(len_gn - len(Gn)))

start_time = time.time()

pool = Pool(n_jobs)
# get shortest path graphs of Gn
getsp_partial = partial(wrapper_getSPGraph, weight)
itr = zip(Gn, range(0, len(Gn)))
if len(Gn) < 100 * n_jobs:
# # use default chunksize as pool.map when iterable is less than 100
# chunksize, extra = divmod(len(Gn), n_jobs * 4)
# if extra:
# chunksize += 1
chunksize = int(len(Gn) / n_jobs) + 1
else:
chunksize = 100
for i, g in tqdm(
pool.imap_unordered(getsp_partial, itr, chunksize),
desc='getting sp graphs', file=sys.stdout):
Gn[i] = g
pool.close()
pool.join()

Kmatrix = np.zeros((len(Gn), len(Gn)))

# ---- use pool.imap_unordered to parallel and track progress. ----
def init_worker(gn_toshare):
global G_gn
G_gn = gn_toshare
do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(Gn,), n_jobs=n_jobs)

run_time = time.time() - start_time
print(
"\n --- shortest path kernel matrix of size %d built in %s seconds ---"
% (len(Gn), run_time))

return Kmatrix, run_time, idx


def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels):
kernel = 0

# compute vertex kernels first, a method borrowed from FCSP.
vk_dict = {} # vertex kernel dict
if ds_attrs['node_labeled']:
# node symb and non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['mix']
for n1, n2 in product(
g1.nodes(data=True), g2.nodes(data=True)):
vk_dict[(n1[0], n2[0])] = kn(
n1[1][node_label], n2[1][node_label],
n1[1]['attributes'], n2[1]['attributes'])
# node symb labeled
else:
kn = node_kernels['symb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
n2[1][node_label])
else:
# node non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['nsymb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
n2[1]['attributes'])
# node unlabeled
else:
for e1, e2 in product(
g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kernel += 1
return kernel

# compute graph kernels
if ds_attrs['is_directed']:
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1],
e2[1])]
kn1 = nk11 * nk22
kernel += kn1
else:
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
# each edge walk is counted twice, starting from both its extreme nodes.
nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(
e1[0], e2[1])], vk_dict[(e1[1],
e2[0])], vk_dict[(e1[1],
e2[1])]
kn1 = nk11 * nk22
kn2 = nk12 * nk21
kernel += kn1 + kn2

return kernel


def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr):
i = itr[0]
j = itr[1]
return i, j, spkernel_do(G_gn[i], G_gn[j], ds_attrs, node_label, node_kernels)


def wrapper_getSPGraph(weight, itr_item):
g = itr_item[0]
i = itr_item[1]
return i, getSPGraph(g, edge_weight=weight)
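
spkernel above operates on the shortest-path graphs returned by getSPGraph and compares edges through their 'cost' attribute. A rough stand-in for that transformation, under the assumption that getSPGraph connects every reachable node pair by an edge carrying their shortest-path distance as 'cost' (the real helper lives in pygraph.utils.utils):

import networkx as nx

def sp_graph(g):
    # Assumed behaviour of getSPGraph for unweighted graphs.
    S = nx.Graph()
    S.add_nodes_from(g.nodes(data=True))
    for u, dists in nx.all_pairs_shortest_path_length(g):
        for v, d in dists.items():
            if u != v:
                S.add_edge(u, v, cost=d)
    return S

g = nx.path_graph(4)
print(sorted(sp_graph(g).edges(data='cost')))
# [(0, 1, 1), (0, 2, 2), (0, 3, 3), (1, 2, 1), (1, 3, 2), (2, 3, 1)]

spkernel_do then counts pairs of such edges with equal 'cost', weighted by the vertex kernels of their endpoints.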

pygraph/kernels/else/ssp_sym.py (+464 -0)

@@ -0,0 +1,464 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 23 16:42:48 2018

@author: ljia
"""

import sys
import time
from itertools import combinations, product
from functools import partial
from multiprocessing import Pool
from tqdm import tqdm

import networkx as nx
import numpy as np

from pygraph.utils.graphdataset import get_dataset_attributes
from pygraph.utils.parallel import parallel_gm

sys.path.insert(0, "../")


def structuralspkernel(*args,
node_label='atom',
edge_weight=None,
edge_label='bond_type',
node_kernels=None,
edge_kernels=None,
n_jobs=None):
"""Calculate mean average structural shortest path kernels between graphs.

Parameters
----------
Gn : List of NetworkX graph
List of graphs between which the kernels are calculated.
/
G1, G2 : NetworkX graphs
2 graphs between which the kernel is calculated.
node_label : string
node attribute used as label. The default node label is atom.
edge_weight : string
Edge attribute name corresponding to the edge weight.
edge_label : string
edge attribute used as label. The default edge label is bond_type.
node_kernels: dict
A dictionary of kernel functions for nodes, including 3 items: 'symb'
for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
for both labels. The first 2 functions take two node labels as
parameters, and the 'mix' function takes 4 parameters, a symbolic and a
non-symbolic label for each of the two nodes. Each label is a 2-D
array of shape (n_samples, n_features). Each function returns a number
as the kernel value. Ignored when nodes are unlabeled.
edge_kernels: dict
A dictionary of kernel functions for edges, including 3 items: 'symb'
for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix'
for both labels. The first 2 functions take two edge labels as
parameters, and the 'mix' function takes 4 parameters, a symbolic and a
non-symbolic label for each of the two edges. Each label is a 2-D
array of shape (n_samples, n_features). Each function returns a number
as the kernel value. Ignored when edges are unlabeled.

Return
------
Kmatrix : Numpy matrix
Kernel matrix, each element of which is the mean average structural
shortest path kernel between 2 graphs.
"""
# pre-process
Gn = args[0] if len(args) == 1 else [args[0], args[1]]
weight = None
if edge_weight is None:
print('\n None edge weight specified. Set all weight to 1.\n')
else:
try:
some_weight = list(
nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
if isinstance(some_weight, (float, int)):
weight = edge_weight
else:
print(
'\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
% edge_weight)
except:
print(
'\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
% edge_weight)
ds_attrs = get_dataset_attributes(
Gn,
attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
'edge_attr_dim', 'is_directed'],
node_label=node_label, edge_label=edge_label)
ds_attrs['node_attr_dim'] = 0
ds_attrs['edge_attr_dim'] = 0

start_time = time.time()

# get shortest paths of each graph in Gn
splist = [None] * len(Gn)
pool = Pool(n_jobs)
# get shortest path graphs of Gn
getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed'])
itr = zip(Gn, range(0, len(Gn)))
if len(Gn) < 100 * n_jobs:
chunksize = int(len(Gn) / n_jobs) + 1
else:
chunksize = 100
# chunksize = 300 # int(len(list(itr)) / n_jobs)
for i, sp in tqdm(
pool.imap_unordered(getsp_partial, itr, chunksize),
desc='getting shortest paths',
file=sys.stdout):
splist[i] = sp
# time.sleep(10)
pool.close()
pool.join()
# # get shortest paths of each graph in Gn
# splist = [[] for _ in range(len(Gn))]
# # get shortest path graphs of Gn
# getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed'])
# itr = zip(Gn, range(0, len(Gn)))
# if len(Gn) < 1000 * n_jobs:
# chunksize = int(len(Gn) / n_jobs) + 1
# else:
# chunksize = 1000
# # chunksize = 300 # int(len(list(itr)) / n_jobs)
# from contextlib import closing
# with closing(Pool(n_jobs)) as pool:
## for i, sp in tqdm(
# res = pool.imap_unordered(getsp_partial, itr, 10)
## desc='getting shortest paths',
## file=sys.stdout):
## splist[i] = sp
## time.sleep(10)
# pool.close()
# pool.join()
# ss = 0
# ss += sys.getsizeof(splist)
# for spss in splist:
# ss += sys.getsizeof(spss)
# for spp in spss:
# ss += sys.getsizeof(spp)
# time.sleep(20)
# # ---- direct running, normally use single CPU core. ----
# splist = []
# for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout):
# splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed']))

# # ---- only for the Fast Computation of Shortest Path Kernel (FCSP)
# sp_ml = [0] * len(Gn) # shortest path matrices
# for i in result_sp:
# sp_ml[i[0]] = i[1]
# edge_x_g = [[] for i in range(len(sp_ml))]
# edge_y_g = [[] for i in range(len(sp_ml))]
# edge_w_g = [[] for i in range(len(sp_ml))]
# for idx, item in enumerate(sp_ml):
# for i1 in range(len(item)):
# for i2 in range(i1 + 1, len(item)):
# if item[i1, i2] != np.inf:
# edge_x_g[idx].append(i1)
# edge_y_g[idx].append(i2)
# edge_w_g[idx].append(item[i1, i2])
# print(len(edge_x_g[0]))
# print(len(edge_y_g[0]))
# print(len(edge_w_g[0]))

Kmatrix = np.zeros((len(Gn), len(Gn)))
# ---- use pool.imap_unordered to parallel and track progress. ----
def init_worker(spl_toshare, gs_toshare):
global G_spl, G_gs
G_spl = spl_toshare
G_gs = gs_toshare
do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
node_kernels, edge_kernels)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(splist, Gn), n_jobs=n_jobs)

# # ---- use pool.imap_unordered to parallel and track progress. ----
# pool = Pool(n_jobs)
# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
# node_kernels, edge_kernels)
# itr = zip(combinations_with_replacement(Gn, 2),
# combinations_with_replacement(splist, 2),
# combinations_with_replacement(range(0, len(Gn)), 2))
# len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
# if len_itr < 1000 * n_jobs:
# chunksize = int(len_itr / n_jobs) + 1
# else:
# chunksize = 1000
# for i, j, kernel in tqdm(
# pool.imap_unordered(do_partial, itr, chunksize),
# desc='calculating kernels',
# file=sys.stdout):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel
# pool.close()
# pool.join()
# # ---- use pool.map to parallel. ----
# pool = Pool(n_jobs)
# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
# node_kernels, edge_kernels)
# itr = zip(combinations_with_replacement(Gn, 2),
# combinations_with_replacement(splist, 2),
# combinations_with_replacement(range(0, len(Gn)), 2))
# for i, j, kernel in tqdm(
# pool.map(do_partial, itr), desc='calculating kernels',
# file=sys.stdout):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel
# pool.close()
# pool.join()

# # ---- use pool.imap_unordered to parallel and track progress. ----
# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
# node_kernels, edge_kernels)
# itr = zip(combinations_with_replacement(Gn, 2),
# combinations_with_replacement(splist, 2),
# combinations_with_replacement(range(0, len(Gn)), 2))
# len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
# if len_itr < 1000 * n_jobs:
# chunksize = int(len_itr / n_jobs) + 1
# else:
# chunksize = 1000
# from contextlib import closing
# with closing(Pool(n_jobs)) as pool:
# for i, j, kernel in tqdm(
# pool.imap_unordered(do_partial, itr, 1000),
# desc='calculating kernels',
# file=sys.stdout):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel
# pool.close()
# pool.join()


# # ---- direct running, normally use single CPU core. ----
# from itertools import combinations_with_replacement
# itr = combinations_with_replacement(range(0, len(Gn)), 2)
# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
# kernel = structuralspkernel_do(Gn[i], Gn[j], splist[i], splist[j],
# ds_attrs, node_label, edge_label, node_kernels, edge_kernels)
## if(kernel > 1):
## print("error here ")
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel

run_time = time.time() - start_time
print(
"\n --- shortest path kernel matrix of size %d built in %s seconds ---"
% (len(Gn), run_time))

return Kmatrix, run_time


def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label,
node_kernels, edge_kernels):
kernel = 0

# First, compute vertex kernels, a method borrowed from FCSP.
vk_dict = {} # vertex kernel dict
if ds_attrs['node_labeled']:
# node symb and non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['mix']
for n1, n2 in product(
g1.nodes(data=True), g2.nodes(data=True)):
vk_dict[(n1[0], n2[0])] = kn(
n1[1][node_label], n2[1][node_label],
n1[1]['attributes'], n2[1]['attributes'])
# node symb labeled
else:
kn = node_kernels['symb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
n2[1][node_label])
else:
# node non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['nsymb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
n2[1]['attributes'])
# node unlabeled
else:
pass

# Then, compute kernels between all pairs of edges, an idea extended
# from FCSP. It suits sparse graphs, which covers most of the cases we
# encountered. For dense graphs, this would be slow.
ek_dict = {} # dict of edge kernels
if ds_attrs['edge_labeled']:
# edge symb and non-symb labeled
if ds_attrs['edge_attr_dim'] > 0:
ke = edge_kernels['mix']
for e1, e2 in product(
g1.edges(data=True), g2.edges(data=True)):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
e1[2]['attributes'], e2[2]['attributes'])
ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
# edge symb labeled
else:
ke = edge_kernels['symb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
else:
# edge non-symb labeled
if ds_attrs['edge_attr_dim'] > 0:
ke = edge_kernels['nsymb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
# edge unlabeled
else:
pass

# compute graph kernels
if vk_dict:
if ek_dict:
for p1, p2 in product(spl1, spl2):
if len(p1) == len(p2):
kpath = vk_dict[(p1[0], p2[0])]
if kpath:
for idx in range(1, len(p1)):
kpath *= vk_dict[(p1[idx], p2[idx])] * \
ek_dict[((p1[idx-1], p1[idx]),
(p2[idx-1], p2[idx]))]
if not kpath:
break
kernel += kpath # add up kernels of all paths
else:
for p1, p2 in product(spl1, spl2):
if len(p1) == len(p2):
kpath = vk_dict[(p1[0], p2[0])]
if kpath:
for idx in range(1, len(p1)):
kpath *= vk_dict[(p1[idx], p2[idx])]
if not kpath:
break
kernel += kpath # add up kernels of all paths
else:
if ek_dict:
for p1, p2 in product(spl1, spl2):
if len(p1) == len(p2):
if len(p1) == 0:
kernel += 1
else:
kpath = 1
for idx in range(0, len(p1) - 1):
kpath *= ek_dict[((p1[idx], p1[idx+1]),
(p2[idx], p2[idx+1]))]
if not kpath:
break
kernel += kpath # add up kernels of all paths
else:
for p1, p2 in product(spl1, spl2):
if len(p1) == len(p2):
kernel += 1

kernel = kernel / (len(spl1) * len(spl2)) # calculate mean average

# # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2], sadly it is slower than the current implementation
# # compute vertex kernel matrix
# try:
# vk_mat = np.zeros((nx.number_of_nodes(g1),
# nx.number_of_nodes(g2)))
# g1nl = enumerate(g1.nodes(data=True))
# g2nl = enumerate(g2.nodes(data=True))
# for i1, n1 in g1nl:
# for i2, n2 in g2nl:
# vk_mat[i1][i2] = kn(
# n1[1][node_label], n2[1][node_label],
# [n1[1]['attributes']], [n2[1]['attributes']])

# range1 = range(0, len(edge_w_g[i]))
# range2 = range(0, len(edge_w_g[j]))
# for i1 in range1:
# x1 = edge_x_g[i][i1]
# y1 = edge_y_g[i][i1]
# w1 = edge_w_g[i][i1]
# for i2 in range2:
# x2 = edge_x_g[j][i2]
# y2 = edge_y_g[j][i2]
# w2 = edge_w_g[j][i2]
# ke = (w1 == w2)
# if ke > 0:
# kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
# kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
# Kmatrix += kn1 + kn2
return kernel


def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels,
edge_kernels, itr):
i = itr[0]
j = itr[1]
return i, j, structuralspkernel_do(G_gs[i], G_gs[j], G_spl[i], G_spl[j],
ds_attrs, node_label, edge_label,
node_kernels, edge_kernels)


def get_shortest_paths(G, weight, directed):
"""Get all shortest paths of a graph.

Parameters
----------
G : NetworkX graphs
The graphs whose paths are calculated.
weight : string/None
edge attribute used as weight to calculate the shortest path.
directed: boolean
Whether graph is directed.

Return
------
sp : list of list
List of shortest paths of the graph, where each path is represented by a list of nodes.
"""
sp = []
for n1, n2 in combinations(G.nodes(), 2):
try:
spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
except nx.NetworkXNoPath: # nodes not connected
# sp.append([])
pass
else:
sp += spltemp
# each path is counted twice, traversed from both of its end nodes.
if not directed:
sp += [sptemp[::-1] for sptemp in spltemp]
# add single nodes as length 0 paths.
sp += [[n] for n in G.nodes()]
return sp


def wrapper_getSP(weight, directed, itr_item):
g = itr_item[0]
i = itr_item[1]
return i, get_shortest_paths(g, weight, directed)
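
get_shortest_paths feeds structuralspkernel_do with every shortest path between every node pair, in both orientations for undirected graphs, plus each single node as a trivial path. What that enumeration yields on a small cycle (a standalone rerun of the same logic, minus the weight and exception handling):

import networkx as nx
from itertools import combinations

g = nx.cycle_graph(4)
sp = []
for n1, n2 in combinations(g.nodes(), 2):
    paths = list(nx.all_shortest_paths(g, n1, n2))
    sp += paths + [p[::-1] for p in paths]  # both directions
sp += [[n] for n in g.nodes()]              # single-node paths
print(len(sp))  # 20: 8 shortest paths, doubled, plus 4 single nodes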

pygraph/kernels/marginalizedKernel.py (+18 -15)

@@ -33,8 +33,9 @@ def marginalizedkernel(*args,
edge_label='bond_type',
p_quit=0.5,
n_iteration=20,
remove_totters=True,
n_jobs=None):
remove_totters=False,
n_jobs=None,
verbose=True):
"""Calculate marginalized graph kernels between graphs.

Parameters
@@ -63,16 +64,18 @@ def marginalizedkernel(*args,
"""
# pre-process
n_iteration = int(n_iteration)
Gn = args[0] if len(args) == 1 else [args[0], args[1]]
Gn = args[0][:] if len(args) == 1 else [args[0].copy(), args[1].copy()]
ds_attrs = get_dataset_attributes(
Gn,
attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
node_label=node_label, edge_label=edge_label)
if not ds_attrs['node_labeled']:
if not ds_attrs['node_labeled'] or node_label == None:
node_label = 'atom'
for G in Gn:
nx.set_node_attributes(G, '0', 'atom')
if not ds_attrs['edge_labeled']:
if not ds_attrs['edge_labeled'] or edge_label == None:
edge_label = 'bond_type'
for G in Gn:
nx.set_edge_attributes(G, '0', 'bond_type')

@@ -110,26 +113,26 @@ def marginalizedkernel(*args,
do_partial = partial(wrapper_marg_do, node_label, edge_label,
p_quit, n_iteration)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(Gn,), n_jobs=n_jobs)
glbv=(Gn,), n_jobs=n_jobs, verbose=verbose)


# # ---- direct running, normally use single CPU core. ----
# pbar = tqdm(
# total=(1 + len(Gn)) * len(Gn) / 2,
# desc='calculating kernels',
# file=sys.stdout)
## pbar = tqdm(
## total=(1 + len(Gn)) * len(Gn) / 2,
## desc='calculating kernels',
## file=sys.stdout)
# for i in range(0, len(Gn)):
# for j in range(i, len(Gn)):
# print(i, j)
## print(i, j)
# Kmatrix[i][j] = _marginalizedkernel_do(Gn[i], Gn[j], node_label,
# edge_label, p_quit, n_iteration)
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)
## pbar.update(1)

run_time = time.time() - start_time
print(
"\n --- marginalized kernel matrix of size %d built in %s seconds ---"
% (len(Gn), run_time))
if verbose:
print("\n --- marginalized kernel matrix of size %d built in %s seconds ---"
% (len(Gn), run_time))

return Kmatrix, run_time
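
The new verbose flag together with the None-tolerant label handling is what preimage.py relies on to call this kernel silently inside its inner loop. A call in that style (a sketch; any list of NetworkX graphs will do):

import multiprocessing
import networkx as nx
from pygraph.kernels.marginalizedKernel import marginalizedkernel

graphs = [nx.path_graph(3), nx.cycle_graph(4)]   # placeholder input graphs
K, run_time = marginalizedkernel(graphs, node_label='atom', edge_label=None,
                                 p_quit=0.03, n_iteration=20,
                                 remove_totters=False,
                                 n_jobs=multiprocessing.cpu_count(),
                                 verbose=False)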



pygraph/kernels/spKernel.py (+3 -2)

@@ -23,7 +23,8 @@ def spkernel(*args,
node_label='atom',
edge_weight=None,
node_kernels=None,
n_jobs=None):
n_jobs=None,
verbose=True):
"""Calculate shortest-path kernels between graphs.

Parameters
@@ -148,7 +149,7 @@ def spkernel(*args,
G_gn = gn_toshare
do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(Gn,), n_jobs=n_jobs)
glbv=(Gn,), n_jobs=n_jobs, verbose=verbose)


# # ---- use pool.map to parallel. ----


pygraph/kernels/structuralspKernel.py (+1 -1)

@@ -31,7 +31,7 @@ def structuralspkernel(*args,
edge_label='bond_type',
node_kernels=None,
edge_kernels=None,
compute_method='trie',
compute_method='naive',
n_jobs=None):
"""Calculate mean average structural shortest path kernels between graphs.



pygraph/utils/model_selection_precomputed.py (+10 -4)

@@ -242,7 +242,7 @@ def model_selection_for_precomputed_kernel(datafile,
# gms_array = Array('d', np.reshape(gram_matrices.copy(), -1, order='C'))
# pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gms_array, gms_shape))
pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gram_matrices,))
trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, y, model_type)
trial_do_partial = partial(parallel_trial_do, param_list_pre_revised, param_list, y, model_type)
train_pref = []
val_pref = []
test_pref = []
@@ -473,7 +473,7 @@ def model_selection_for_precomputed_kernel(datafile,
G_gms = gms_toshare

pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gram_matrices,))
trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, y, model_type)
trial_do_partial = partial(parallel_trial_do, param_list_pre_revised, param_list, y, model_type)
train_pref = []
val_pref = []
test_pref = []
@@ -656,7 +656,7 @@ def model_selection_for_precomputed_kernel(datafile,
f.write(str_fw + '\n\n\n' + content)


def trial_do(param_list_pre_revised, param_list, y, model_type, trial): # Test set level
def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, trial): # Test set level

# # get gram matrices from global variables.
# gram_matrices = np.reshape(G_gms.copy(), G_gms_shape, order='C')
@@ -679,7 +679,7 @@ def trial_do(param_list_pre_revised, param_list, y, model_type, trial): # Test s
# get gram matrices from global variables.
# gm_now = G_gms[index_out * G_gms_shape[1] * G_gms_shape[2]:(index_out + 1) * G_gms_shape[1] * G_gms_shape[2]]
# gm_now = np.reshape(gm_now.copy(), (G_gms_shape[1], G_gms_shape[2]), order='C')
gm_now = G_gms[index_out].copy()
gm_now = gram_matrices[index_out].copy()
# split gram matrix and y to app and test sets.
indices = range(len(y))
@@ -822,6 +822,12 @@ def trial_do(param_list_pre_revised, param_list, y, model_type, trial): # Test s

return train_pref, val_pref, test_pref

def parallel_trial_do(param_list_pre_revised, param_list, y, model_type, trial):
train_pref, val_pref, test_pref = trial_do(param_list_pre_revised,
param_list, G_gms, y,
model_type, trial)
return train_pref, val_pref, test_pref


def compute_gram_matrices(dataset, y, estimator, param_list_precomputed,
results_dir, ds_name,
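
parallel_trial_do above exists because trial_do now takes the gram matrices as an explicit argument, which would be far too large to pickle once per task; instead they are installed once per worker through the Pool initializer, and the thin wrapper forwards the worker-global G_gms. The pattern in isolation (hypothetical names, a sketch rather than the project's code):

from multiprocessing import Pool

def init_worker(big_toshare):
    global G_big                 # sent once per worker, not once per task
    G_big = big_toshare

def work(i):
    return i, len(G_big)         # worker reads the shared object

if __name__ == '__main__':
    big = list(range(10**6))
    with Pool(2, initializer=init_worker, initargs=(big,)) as pool:
        print(pool.map(work, range(4)))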


pygraph/utils/parallel.py (+11 -7)

@@ -11,7 +11,8 @@ from tqdm import tqdm
import sys

def parallel_me(func, func_assign, var_to_assign, itr, len_itr=None, init_worker=None,
glbv=None, method=None, n_jobs=None, chunksize=None, itr_desc=''):
glbv=None, method=None, n_jobs=None, chunksize=None, itr_desc='',
verbose=True):
'''
'''
if method == 'imap_unordered':
@@ -29,8 +30,9 @@ def parallel_me(func, func_assign, var_to_assign, itr, len_itr=None, init_worker
chunksize = int(len_itr / n_jobs) + 1
else:
chunksize = 100
for result in tqdm(pool.imap_unordered(func, itr, chunksize),
desc=itr_desc, file=sys.stdout):
for result in (tqdm(pool.imap_unordered(func, itr, chunksize),
desc=itr_desc, file=sys.stdout) if verbose else
pool.imap_unordered(func, itr, chunksize)):
func_assign(result, var_to_assign)
else:
with Pool(processes=n_jobs) as pool:
@@ -41,14 +43,16 @@ def parallel_me(func, func_assign, var_to_assign, itr, len_itr=None, init_worker
chunksize = int(len_itr / n_jobs) + 1
else:
chunksize = 100
for result in tqdm(pool.imap_unordered(func, itr, chunksize),
desc=itr_desc, file=sys.stdout):
for result in (tqdm(pool.imap_unordered(func, itr, chunksize),
desc=itr_desc, file=sys.stdout) if verbose else
pool.imap_unordered(func, itr, chunksize)):
func_assign(result, var_to_assign)

def parallel_gm(func, Kmatrix, Gn, init_worker=None, glbv=None,
method='imap_unordered', n_jobs=None, chunksize=None):
method='imap_unordered', n_jobs=None, chunksize=None,
verbose=True):
from itertools import combinations_with_replacement
def func_assign(result, var_to_assign):
var_to_assign[result[0]][result[1]] = result[2]
@@ -57,4 +61,4 @@ def parallel_gm(func, Kmatrix, Gn, init_worker=None, glbv=None,
len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
parallel_me(func, func_assign, Kmatrix, itr, len_itr=len_itr,
init_worker=init_worker, glbv=glbv, method=method, n_jobs=n_jobs,
chunksize=chunksize, itr_desc='calculating kernels')
chunksize=chunksize, itr_desc='calculating kernels', verbose=verbose)
