
1. Apply the multiprocessing.Pool.imap_unordered method instead of the map method, so that tqdm can be used to track progress (a minimal sketch of this pattern follows below).

2. Apply part of the Fast Computation of Shortest Path Kernel (FCSP) method to speed up the shortest-path (sp) kernel (a second sketch below illustrates the idea).
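Below is a minimal, self-contained sketch of the pattern from change 1 (an illustration, not the repository code): pool.imap_unordered is wrapped in tqdm so the bar advances as each chunk finishes, and the chunksize is picked with the same divmod heuristic this commit uses. The worker square and the item count are placeholders.

import sys
from multiprocessing import Pool

from tqdm import tqdm


def square(i):
    # stand-in for a real worker such as wrap_getSPGraph or spkernel_do;
    # returning the index with the result lets us reorder unordered output
    return i, i * i


if __name__ == '__main__':
    n_jobs = 4
    items = range(1000)
    len_itr = len(items)
    # same heuristic as in the commit: emulate pool.map's default chunking
    # (about 4 chunks per worker) for small inputs, cap the chunk at 100 otherwise
    if len_itr < 100:
        chunksize, extra = divmod(len_itr, n_jobs * 4)
        if extra:
            chunksize += 1
    else:
        chunksize = 100
    results = [0] * len_itr
    with Pool(n_jobs) as pool:
        # imap_unordered yields results as soon as workers finish them, so tqdm
        # can show real progress; pool.map would block until all work is done
        for i, sq in tqdm(
                pool.imap_unordered(square, items, chunksize),
                total=len_itr,
                desc='squaring',
                file=sys.stdout):
            results[i] = sq

Returning the index with each result is what makes the unordered variant safe here: results arrive in completion order, and the index restores them to position.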
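Change 2 replaces per-edge node-kernel evaluations with a precomputed lookup table, the part borrowed from FCSP: the node kernel between every vertex of g1 and every vertex of g2 is computed once into vk_dict, and the loop over edge pairs then only performs dictionary lookups and multiplications. Here is a minimal sketch of that idea with a toy Dirac node kernel and toy graphs (hypothetical names, not the library's spkernel_do):

from itertools import product

import networkx as nx


def node_kernel(l1, l2):
    # hypothetical symbolic node kernel: a Dirac/delta kernel on labels
    return 1.0 if l1 == l2 else 0.0


def sp_kernel_pair(g1, g2, node_label='atom'):
    # FCSP-style step 1: precompute all vertex-kernel values once
    vk_dict = {}
    for n1, n2 in product(g1.nodes(data=True), g2.nodes(data=True)):
        vk_dict[(n1[0], n2[0])] = node_kernel(n1[1][node_label],
                                              n2[1][node_label])
    # step 2: the edge-pair loop is now pure lookups; each undirected edge
    # walk is counted twice, once from each of its end nodes
    kernel = 0.0
    for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
        if e1[2]['cost'] == e2[2]['cost']:
            nk11 = vk_dict[(e1[0], e2[0])]
            nk12 = vk_dict[(e1[0], e2[1])]
            nk21 = vk_dict[(e1[1], e2[0])]
            nk22 = vk_dict[(e1[1], e2[1])]
            kernel += nk11 * nk22 + nk12 * nk21
    return kernel


g1 = nx.Graph()
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
g1.add_edge(0, 1, cost=1.0)
g2 = nx.Graph()
g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
g2.add_edge(0, 1, cost=1.0)
print(sp_kernel_pair(g1, g2))  # 1.0: only the aligned orientation matches

This trades kernel calls on every edge pair for calls on every vertex pair plus cheap lookups, which pays off because shortest-path graphs are dense in edges relative to their vertex count.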
v0.1
jajupmochi, 7 years ago
commit 06b32cdf8a
3 changed files with 514 additions and 1030 deletions
  1. notebooks/run_spkernel.ipynb (+149 -727)
  2. pygraph/kernels/spKernel.py (+335 -293)
  3. pygraph/utils/model_selection_precomputed.py (+30 -10)

notebooks/run_spkernel.ipynb (+149 -727)
File diff suppressed because it is too large.


pygraph/kernels/spKernel.py (+335 -293)

@@ -8,7 +8,7 @@ import pathlib
sys.path.insert(0, "../")
from tqdm import tqdm
import time
from itertools import combinations_with_replacement, product
from itertools import combinations, combinations_with_replacement, product
from functools import partial
from joblib import Parallel, delayed
from multiprocessing import Pool
@@ -77,207 +77,108 @@ def spkernel(*args,
if len(Gn) != len_gn:
print('\n %d graphs are removed as they don\'t contain edges.\n' %
(len_gn - len(Gn)))

start_time = time.time()
pool = Pool(n_jobs)
# get shortest path graphs of Gn
getsp_partial = partial(wrap_getSPGraph, Gn, edge_weight)
result_sp = pool.map(getsp_partial, range(0, len(Gn)))
for i in result_sp:
Gn[i[0]] = i[1]
if len(Gn) < 100:
# use a pool.map-style default chunksize when the iterable has fewer than 100 items
chunksize, extra = divmod(len(Gn), n_jobs * 4)
if extra:
chunksize += 1
else:
chunksize = 100
# chunksize = 300 # int(len(list(itr)) / n_jobs)
for i, g in tqdm(
pool.imap_unordered(getsp_partial, range(0, len(Gn)), chunksize),
desc='getting sp graphs',
file=sys.stdout):
Gn[i] = g

# Gn = [
# getSPGraph(G, edge_weight=edge_weight)
# for G in tqdm(Gn, desc='getting sp graphs', file=sys.stdout)
# ]
# # ---- use pool.map to parallelize ----
# result_sp = pool.map(getsp_partial, range(0, len(Gn)))
# for i in result_sp:
# Gn[i[0]] = i[1]
# or
# getsp_partial = partial(wrap_getSPGraph, Gn, edge_weight)
# for i, g in tqdm(
# pool.map(getsp_partial, range(0, len(Gn))),
# desc='getting sp graphs',
# file=sys.stdout):
# Gn[i] = g

# # ---- only for the Fast Computation of Shortest Path Kernel (FCSP)
# sp_ml = [0] * len(Gn) # shortest path matrices
# for i in result_sp:
# sp_ml[i[0]] = i[1]
# edge_x_g = [[] for i in range(len(sp_ml))]
# edge_y_g = [[] for i in range(len(sp_ml))]
# edge_w_g = [[] for i in range(len(sp_ml))]
# for idx, item in enumerate(sp_ml):
# for i1 in range(len(item)):
# for i2 in range(i1 + 1, len(item)):
# if item[i1, i2] != np.inf:
# edge_x_g[idx].append(i1)
# edge_y_g[idx].append(i2)
# edge_w_g[idx].append(item[i1, i2])
# print(len(edge_x_g[0]))
# print(len(edge_y_g[0]))
# print(len(edge_w_g[0]))

Kmatrix = np.zeros((len(Gn), len(Gn)))

# ---- use pool.imap_unordered to parallelize and track progress. ----
do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
itr = combinations_with_replacement(range(0, len(Gn)), 2)
# chunksize = 2000 # int(len(list(itr)) / n_jobs)
# for i, j, kernel in tqdm(pool.imap_unordered(do_partial, itr, chunksize)):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel

result_perf = pool.map(do_partial, itr)
len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
if len_itr < 100:
chunksize, extra = divmod(len_itr, n_jobs * 4)
if extra:
chunksize += 1
else:
chunksize = 100
for i, j, kernel in tqdm(
pool.imap_unordered(do_partial, itr, chunksize),
desc='calculating kernels',
file=sys.stdout):
Kmatrix[i][j] = kernel
Kmatrix[j][i] = kernel
pool.close()
pool.join()

# # ---- use pool.map to parallelize. ----
# # result_perf = pool.map(do_partial, itr)
# do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
# itr = combinations_with_replacement(range(0, len(Gn)), 2)
# for i, j, kernel in tqdm(
# pool.map(do_partial, itr), desc='calculating kernels',
# file=sys.stdout):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel
# pool.close()
# pool.join()

# # ---- use joblib.Parallel to parallelize and track progress. ----
# result_perf = Parallel(
# n_jobs=n_jobs, verbose=10)(
# delayed(do_partial)(ij)
# for ij in combinations_with_replacement(range(0, len(Gn)), 2))

# result_perf = [
# do_partial(ij)
# for ij in combinations_with_replacement(range(0, len(Gn)), 2)
# ]
# for i in result_perf:
# Kmatrix[i[0]][i[1]] = i[2]
# Kmatrix[i[1]][i[0]] = i[2]

for i in result_perf:
Kmatrix[i[0]][i[1]] = i[2]
Kmatrix[i[1]][i[0]] = i[2]

# pbar = tqdm(
# total=((len(Gn) + 1) * len(Gn) / 2),
# desc='calculating kernels',
# file=sys.stdout)
# if ds_attrs['node_labeled']:
# # node symb and non-symb labeled
# if ds_attrs['node_attr_dim'] > 0:
# if ds_attrs['is_directed']:
# for i, j in combinations_with_replacement(
# range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['mix']
# try:
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn(n11[node_label], n21[node_label], [
# n11['attributes']
# ], [n21['attributes']]) * kn(
# n12[node_label], n22[node_label],
# [n12['attributes']], [n22['attributes']])
# Kmatrix[i][j] += kn1
# except KeyError: # missing labels or attributes
# pass
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)

# else:
# for i, j in combinations_with_replacement(
# range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['mix']
# try:
# # each edge walk is counted twice, starting from both its extreme nodes.
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn(n11[node_label], n21[node_label], [
# n11['attributes']
# ], [n21['attributes']]) * kn(
# n12[node_label], n22[node_label],
# [n12['attributes']], [n22['attributes']])
# kn2 = kn(n11[node_label], n22[node_label], [
# n11['attributes']
# ], [n22['attributes']]) * kn(
# n12[node_label], n21[node_label],
# [n12['attributes']], [n21['attributes']])
# Kmatrix[i][j] += kn1 + kn2
# except KeyError: # missing labels or attributes
# pass
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)
# # node symb labeled
# else:
# if ds_attrs['is_directed']:
# for i, j in combinations_with_replacement(
# range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['symb']
# try:
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn(n11[node_label],
# n21[node_label]) * kn(
# n12[node_label], n22[node_label])
# Kmatrix[i][j] += kn1
# except KeyError: # missing labels
# pass
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)

# else:
# for i, j in combinations_with_replacement(
# range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['symb']
# try:
# # each edge walk is counted twice, starting from both its extreme nodes.
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn(n11[node_label],
# n21[node_label]) * kn(
# n12[node_label], n22[node_label])
# kn2 = kn(n11[node_label],
# n22[node_label]) * kn(
# n12[node_label], n21[node_label])
# Kmatrix[i][j] += kn1 + kn2
# except KeyError: # missing labels
# pass
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)
# else:
# # node non-symb labeled
# if ds_attrs['node_attr_dim'] > 0:
# if ds_attrs['is_directed']:
# for i, j in combinations_with_replacement(
# range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['nsymb']
# try:
# # each edge walk is counted twice, starting from both its extreme nodes.
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn([n11['attributes']],
# [n21['attributes']]) * kn(
# [n12['attributes']],
# [n22['attributes']])
# Kmatrix[i][j] += kn1
# except KeyError: # missing attributes
# pass
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)
# else:
# for i, j in combinations_with_replacement(
# range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['nsymb']
# try:
# # each edge walk is counted twice, starting from both its extreme nodes.
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn([n11['attributes']],
# [n21['attributes']]) * kn(
# [n12['attributes']],
# [n22['attributes']])
# kn2 = kn([n11['attributes']],
# [n22['attributes']]) * kn(
# [n12['attributes']],
# [n21['attributes']])
# Kmatrix[i][j] += kn1 + kn2
# except KeyError: # missing attributes
# pass
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)

# # node unlabeled
# else:
# for i, j in combinations_with_replacement(range(0, len(Gn)), 2):
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# Kmatrix[i][j] += 1
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)
# # ---- direct running, normally uses a single CPU core. ----
# itr = combinations_with_replacement(range(0, len(Gn)), 2)
# for gs in tqdm(itr, desc='calculating kernels', file=sys.stdout):
# i, j, kernel = spkernel_do(Gn, ds_attrs, node_label, node_kernels, gs)
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel

run_time = time.time() - start_time
print(
@@ -291,130 +192,271 @@ def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij):

i = ij[0]
j = ij[1]
g1 = Gn[i]
g2 = Gn[j]
Kmatrix = 0
if ds_attrs['node_labeled']:
# node symb and non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
if ds_attrs['is_directed']:
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['mix']
try:
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(
n11[node_label], n21[node_label],
[n11['attributes']], [n21['attributes']]) * kn(
n12[node_label], n22[node_label],
[n12['attributes']], [n22['attributes']])
Kmatrix += kn1
except KeyError: # missing labels or attributes
pass

try:
# compute vertex kernel matrices first, a method borrowed from FCSP.
if ds_attrs['node_labeled']:
# node symb and non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['mix']
vk_dict = {} # vertex kernel values dict
for n1, n2 in product(
g1.nodes(data=True), g2.nodes(data=True)):
vk_dict[(n1[0], n2[0])] = kn(
n1[1][node_label], n2[1][node_label],
[n1[1]['attributes']], [n2[1]['attributes']])
# node symb labeled
else:
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['mix']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(
n11[node_label], n21[node_label],
[n11['attributes']], [n21['attributes']]) * kn(
n12[node_label], n22[node_label],
[n12['attributes']], [n22['attributes']])
kn2 = kn(
n11[node_label], n22[node_label],
[n11['attributes']], [n22['attributes']]) * kn(
n12[node_label], n21[node_label],
[n12['attributes']], [n21['attributes']])
Kmatrix += kn1 + kn2
except KeyError: # missing labels or attributes
pass
# node symb labeled
kn = node_kernels['symb']
vk_dict = {} # vertex kernel values dict
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
n2[1][node_label])
else:
if ds_attrs['is_directed']:
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['symb']
try:
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(n11[node_label], n21[node_label]) * kn(
n12[node_label], n22[node_label])
Kmatrix += kn1
except KeyError: # missing labels
pass
else:
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['symb']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(n11[node_label], n21[node_label]) * kn(
n12[node_label], n22[node_label])
kn2 = kn(n11[node_label], n22[node_label]) * kn(
n12[node_label], n21[node_label])
Kmatrix += kn1 + kn2
except KeyError: # missing labels
pass
else:
# node non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
if ds_attrs['is_directed']:
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['nsymb']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(
[n11['attributes']], [n21['attributes']]) * kn(
[n12['attributes']], [n22['attributes']])
Kmatrix += kn1
except KeyError: # missing attributes
pass
# node non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['nsymb']
vk_dict = {} # vertex kernel values dict
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn([n1[1]['attributes']],
[n2[1]['attributes']])
# node unlabeled
else:
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['nsymb']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
j].nodes[e2[1]]
kn1 = kn(
[n11['attributes']], [n21['attributes']]) * kn(
[n12['attributes']], [n22['attributes']])
kn2 = kn(
[n11['attributes']], [n22['attributes']]) * kn(
[n12['attributes']], [n21['attributes']])
Kmatrix += kn1 + kn2
except KeyError: # missing attributes
pass
# node unlabeled
Kmatrix += 1
return i, j, Kmatrix

# compute graph kernels
if ds_attrs['is_directed']:
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
# in a directed graph each edge walk is counted once, following edge direction.
nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1], e2[1])]
kn1 = nk11 * nk22
Kmatrix += kn1
else:
for e1, e2 in product(
Gn[i].edges(data=True), Gn[j].edges(data=True)):
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
Kmatrix += 1
# each edge walk is counted twice, starting from both its extreme nodes.
nk11 = vk_dict[(e1[0], e2[0])]
nk12 = vk_dict[(e1[0], e2[1])]
nk21 = vk_dict[(e1[1], e2[0])]
nk22 = vk_dict[(e1[1], e2[1])]
kn1 = nk11 * nk22
kn2 = nk12 * nk21
Kmatrix += kn1 + kn2

# # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2], sadly it is slower than the current implementation
# # compute vertex kernel matrix
# try:
# vk_mat = np.zeros((nx.number_of_nodes(g1),
# nx.number_of_nodes(g2)))
# g1nl = enumerate(g1.nodes(data=True))
# g2nl = enumerate(g2.nodes(data=True))
# for i1, n1 in g1nl:
# for i2, n2 in g2nl:
# vk_mat[i1][i2] = kn(
# n1[1][node_label], n2[1][node_label],
# [n1[1]['attributes']], [n2[1]['attributes']])

# range1 = range(0, len(edge_w_g[i]))
# range2 = range(0, len(edge_w_g[j]))
# for i1 in range1:
# x1 = edge_x_g[i][i1]
# y1 = edge_y_g[i][i1]
# w1 = edge_w_g[i][i1]
# for i2 in range2:
# x2 = edge_x_g[j][i2]
# y2 = edge_y_g[j][i2]
# w2 = edge_w_g[j][i2]
# ke = (w1 == w2)
# if ke > 0:
# kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
# kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
# Kmatrix += kn1 + kn2
except KeyError: # missing labels or attributes
pass

return i, j, Kmatrix


def wrap_getSPGraph(Gn, weight, i):
return i, getSPGraph(Gn[i], edge_weight=weight)
# return i, nx.floyd_warshall_numpy(Gn[i], weight=weight)


# def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij):

# i = ij[0]
# j = ij[1]
# g1 = Gn[i]
# g2 = Gn[j]
# Kmatrix = 0
# if ds_attrs['node_labeled']:
# # node symb and non-symb labeled
# if ds_attrs['node_attr_dim'] > 0:
# if ds_attrs['is_directed']:
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['mix']
# try:
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn(
# n11[node_label], n21[node_label],
# [n11['attributes']], [n21['attributes']]) * kn(
# n12[node_label], n22[node_label],
# [n12['attributes']], [n22['attributes']])
# Kmatrix += kn1
# except KeyError: # missing labels or attributes
# pass
# else:
# kn = node_kernels['mix']
# try:
# # compute shortest path matrices first, method borrowed from FCSP.
# vk_dict = {} # shortest path matrices dict
# for n1 in g1.nodes(data=True):
# for n2 in g2.nodes(data=True):
# vk_dict[(n1[0], n2[0])] = kn(
# n1[1][node_label], n2[1][node_label],
# [n1[1]['attributes']], [n2[1]['attributes']])

# for e1, e2 in product(
# g1.edges(data=True), g2.edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# # each edge walk is counted twice, starting from both its extreme nodes.
# nk11, nk12, nk21, nk22 = vk_dict[(
# e1[0],
# e2[0])], vk_dict[(e1[0], e2[1])], vk_dict[(
# e1[1], e2[0])], vk_dict[(e1[1], e2[1])]
# kn1 = nk11 * nk22
# kn2 = nk12 * nk21
# Kmatrix += kn1 + kn2

# # # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2], sadly it is slower than the current implementation
# # # compute vertex kernel matrix
# # try:
# # vk_mat = np.zeros((nx.number_of_nodes(g1),
# # nx.number_of_nodes(g2)))
# # g1nl = enumerate(g1.nodes(data=True))
# # g2nl = enumerate(g2.nodes(data=True))
# # for i1, n1 in g1nl:
# # for i2, n2 in g2nl:
# # vk_mat[i1][i2] = kn(
# # n1[1][node_label], n2[1][node_label],
# # [n1[1]['attributes']], [n2[1]['attributes']])

# # range1 = range(0, len(edge_w_g[i]))
# # range2 = range(0, len(edge_w_g[j]))
# # for i1 in range1:
# # x1 = edge_x_g[i][i1]
# # y1 = edge_y_g[i][i1]
# # w1 = edge_w_g[i][i1]
# # for i2 in range2:
# # x2 = edge_x_g[j][i2]
# # y2 = edge_y_g[j][i2]
# # w2 = edge_w_g[j][i2]
# # ke = (w1 == w2)
# # if ke > 0:
# # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
# # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
# # Kmatrix += kn1 + kn2

# except KeyError: # missing labels or attributes
# pass

# # node symb labeled
# else:
# if ds_attrs['is_directed']:
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['symb']
# try:
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn(n11[node_label], n21[node_label]) * kn(
# n12[node_label], n22[node_label])
# Kmatrix += kn1
# except KeyError: # missing labels
# pass
# else:
# kn = node_kernels['symb']
# try:
# # compute shortest path matrices first, method borrowed from FCSP.
# vk_dict = {} # shortest path matrices dict
# for n1 in g1.nodes(data=True):
# for n2 in g2.nodes(data=True):
# vk_dict[(n1[0], n2[0])] = kn(
# n1[1][node_label], n2[1][node_label])

# for e1, e2 in product(
# g1.edges(data=True), g2.edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# # each edge walk is counted twice, starting from both its extreme nodes.
# nk11, nk12, nk21, nk22 = vk_dict[(
# e1[0],
# e2[0])], vk_dict[(e1[0], e2[1])], vk_dict[(
# e1[1], e2[0])], vk_dict[(e1[1], e2[1])]
# kn1 = nk11 * nk22
# kn2 = nk12 * nk21
# Kmatrix += kn1 + kn2
# except KeyError: # missing labels
# pass
# else:
# # node non-symb labeled
# if ds_attrs['node_attr_dim'] > 0:
# if ds_attrs['is_directed']:
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['nsymb']
# try:
# # each edge walk is counted twice, starting from both its extreme nodes.
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn(
# [n11['attributes']], [n21['attributes']]) * kn(
# [n12['attributes']], [n22['attributes']])
# Kmatrix += kn1
# except KeyError: # missing attributes
# pass
# else:
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kn = node_kernels['nsymb']
# try:
# # each edge walk is counted twice, starting from both its extreme nodes.
# n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[
# i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[
# j].nodes[e2[1]]
# kn1 = kn(
# [n11['attributes']], [n21['attributes']]) * kn(
# [n12['attributes']], [n22['attributes']])
# kn2 = kn(
# [n11['attributes']], [n22['attributes']]) * kn(
# [n12['attributes']], [n21['attributes']])
# Kmatrix += kn1 + kn2
# except KeyError: # missing attributes
# pass
# # node unlabeled
# else:
# for e1, e2 in product(
# Gn[i].edges(data=True), Gn[j].edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# Kmatrix += 1

# return i, j, Kmatrix

pygraph/utils/model_selection_precomputed.py (+30 -10)

@@ -190,24 +190,44 @@ def model_selection_for_precomputed_kernel(datafile,
)
pool = Pool(n_jobs)
trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
train_pref = [item[0] for item in result_perf]
val_pref = [item[1] for item in result_perf]
test_pref = [item[2] for item in result_perf]
train_pref = []
val_pref = []
test_pref = []
if NUM_TRIALS < 100:
chunksize, extra = divmod(NUM_TRIALS, n_jobs * 4)
if extra:
chunksize += 1
else:
chunksize = 100
for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout):
train_pref.append(o1)
val_pref.append(o2)
test_pref.append(o3)
pool.close()
pool.join()

# # ---- use pool.map to parallelize. ----
# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
# train_pref = [item[0] for item in result_perf]
# val_pref = [item[1] for item in result_perf]
# test_pref = [item[2] for item in result_perf]

# # ---- use joblib.Parallel to parallelize and track progress. ----
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
# result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS))
# train_pref = [item[0] for item in result_perf]
# val_pref = [item[1] for item in result_perf]
# test_pref = [item[2] for item in result_perf]


# pbar.clear()
# np.save(results_name_pre + 'train_pref.dt', train_pref)
# np.save(results_name_pre + 'val_pref.dt', val_pref)
# np.save(results_name_pre + 'test_pref.dt', test_pref)
# # ---- direct running, normally uses a single CPU core. ----
# train_pref = []
# val_pref = []
# test_pref = []
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)

print()
print('4. Getting final performance...')
@@ -479,4 +499,4 @@ def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, t
test_pref[index_out][index_in] = np.mean(
current_test_perf)

return train_pref, val_pref, test_pref
