
add comments in run_spkernel.py.

v0.1
jajupmochi, 6 years ago
parent commit 123391d3a1
8 changed files with 530 additions and 157 deletions
  1. +12   -0    .gitignore
  2. +3    -2    notebooks/run_degree_differs_uhp.py
  3. +24   -21   notebooks/run_spkernel.py
  4. +2    -1    notebooks/run_structuralspkernel.py
  5. +2    -1    notebooks/run_vertex_differs_uhp.py
  6. +2    -2    pygraph/kernels/randomWalkKernel.py
  7. +484  -129  pygraph/kernels/structuralspKernel.py
  8. +1    -1    pygraph/kernels/untilHPathKernel.py

+12  -0    .gitignore

@@ -2,11 +2,23 @@
.ipynb_checkpoints
datasets/*
!datasets/ds.py
!datasets/Alkane/*
!datasets/acyclic/*
!datasets/MAO/*
!datasets/PAH/*
!datasets/MUTAG/*
!datasets/Letter-med/*
!datasets/ENZYMES_txt/*
notebooks/results/*
notebooks/check_gm/*
notebooks/test_parallel/*
requirements/*
pygraph/model.py
pygraph/kernels/*_sym.py
*.npy
*.eps
*.dat
*.pyc

__pycache__
##*#

+3  -2    notebooks/run_degree_differs_uhp.py

@@ -27,7 +27,8 @@ def run_ms(dataset, y, ds):
from pygraph.kernels.untilHPathKernel import untilhpathkernel
estimator = untilhpathkernel
param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2],
'k_func': ['MinMax', 'tanimoto']} # ['MinMax']}
'k_func': ['MinMax', 'tanimoto'],
'compute_method': ['trie']} # ['MinMax']}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]

@@ -58,7 +59,7 @@ for ds in dslist:
ave_time = []
std_time = []
ave_degree = []
for piece in range(1, 5):
for piece in range(0, 5):
print('piece', str(piece), ':')
Gn_p = Gn[len_1piece * piece:len_1piece * (piece + 1)]
y_all_p = y_all[len_1piece * piece:len_1piece * (piece + 1)]


+24  -21    notebooks/run_spkernel.py

@@ -6,6 +6,7 @@ from pygraph.kernels.spKernel import spkernel
from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct
#from pygraph.utils.model_selection_precomputed import trial_do

# datasets
dslist = [
{'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
'task': 'regression'}, # node symb
@@ -20,31 +21,31 @@ dslist = [
# node nsymb
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# node symb/nsymb

# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb
# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb

# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
# # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
# # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
#
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb

# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb
#
# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
# {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
# {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
# {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
# {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
# {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
# {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
# {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb
#
# {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
# {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
# {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb

# # not working below
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
@@ -53,12 +54,14 @@ dslist = [
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
estimator = spkernel
# hyper-parameters
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
param_grid_precomputed = {'node_kernels': [
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]

# for each dataset, do model selection.
for ds in dslist:
print()
print(ds['name'])
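
For orientation, below is a minimal sketch of what the 'mix' node kernel constructed above is expected to compute. The names delta_kernel, gaussian_kernel and kernel_product are hypothetical stand-ins for deltakernel, gaussiankernel and kernelproduct from pygraph.utils.kernels, whose implementations are not part of this diff; the kn(symb1, symb2, attrs1, attrs2) calling convention is taken from how the 'mix' kernel is invoked in structuralspKernel.py.

import functools
import numpy as np

def delta_kernel(x, y):
    # hypothetical stand-in: 1 if the two symbolic labels are equal, else 0
    return 1.0 if x == y else 0.0

def gaussian_kernel(v1, v2, gamma=1.0):
    # hypothetical stand-in: RBF kernel on two attribute vectors
    d = np.asarray(v1, dtype=float) - np.asarray(v2, dtype=float)
    return float(np.exp(-gamma * np.dot(d, d)))

def kernel_product(k_symb, k_nsymb, x1, x2, v1, v2):
    # product of a symbolic and a non-symbolic kernel; matches the
    # kn(symb1, symb2, attrs1, attrs2) call made for the 'mix' entry
    return k_symb(x1, x2) * k_nsymb(v1, v2)

mix_kernel = functools.partial(kernel_product, delta_kernel, gaussian_kernel)
print(mix_kernel('C', 'C', [0.1, 0.2], [0.1, 0.3]))  # 1 * exp(-0.01), about 0.99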


+2  -1    notebooks/run_structuralspkernel.py

@@ -64,7 +64,8 @@ mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
param_grid_precomputed = {'node_kernels':
[{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}],
'edge_kernels':
[{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}
[{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}],
'compute_method': ['trie']}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]



+2  -1    notebooks/run_vertex_differs_uhp.py

@@ -38,7 +38,8 @@ def run_ms(dataset, y, ds):
from pygraph.kernels.untilHPathKernel import untilhpathkernel
estimator = untilhpathkernel
param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2],
'k_func': ['MinMax', 'tanimoto']} # ['MinMax']}
'k_func': ['MinMax', 'tanimoto'],
'compute_method': ['trie']} # ['MinMax']}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]



+2  -2    pygraph/kernels/randomWalkKernel.py

@@ -319,7 +319,7 @@ def wrapper_cg_labled_do(ds_attrs, node_kernels, node_label, edge_kernels,

def _cg_labled_do(g1, g2, ds_attrs, node_kernels, node_label,
edge_kernels, edge_label, lmda):
# Frist, ompute kernels between all pairs of nodes, method borrowed
# Frist, compute kernels between all pairs of nodes, method borrowed
# from FCSP. It is faster than directly computing all edge kernels
# when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the
# graphs compared, which is the most case we went though. For very
@@ -417,7 +417,7 @@ def wrapper_fp_labled_do(ds_attrs, node_kernels, node_label, edge_kernels,

def _fp_labled_do(g1, g2, ds_attrs, node_kernels, node_label,
edge_kernels, edge_label, lmda):
# Frist, ompute kernels between all pairs of nodes, method borrowed
# Frist, compute kernels between all pairs of nodes, method borrowed
# from FCSP. It is faster than directly computing all edge kernels
# when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the
# graphs compared, which is the most case we went though. For very


+484  -129    pygraph/kernels/structuralspKernel.py

@@ -20,6 +20,7 @@ import numpy as np

from pygraph.utils.graphdataset import get_dataset_attributes
from pygraph.utils.parallel import parallel_gm
from pygraph.utils.trie import Trie

sys.path.insert(0, "../")

@@ -30,6 +31,7 @@ def structuralspkernel(*args,
edge_label='bond_type',
node_kernels=None,
edge_kernels=None,
compute_method='trie',
n_jobs=None):
"""Calculate mean average structural shortest path kernels between graphs.

@@ -99,14 +101,16 @@ def structuralspkernel(*args,
# get shortest paths of each graph in Gn
splist = [None] * len(Gn)
pool = Pool(n_jobs)
# get shortest path graphs of Gn
getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed'])
itr = zip(Gn, range(0, len(Gn)))
if len(Gn) < 100 * n_jobs:
chunksize = int(len(Gn) / n_jobs) + 1
else:
chunksize = 100
# chunksize = 300 # int(len(list(itr)) / n_jobs)
# get shortest path graphs of Gn
if compute_method == 'trie':
getsp_partial = partial(wrapper_getSP_trie, weight, ds_attrs['is_directed'])
else:
getsp_partial = partial(wrapper_getSP_naive, weight, ds_attrs['is_directed'])
for i, sp in tqdm(
pool.imap_unordered(getsp_partial, itr, chunksize),
desc='getting shortest paths',
@@ -117,27 +121,6 @@ def structuralspkernel(*args,
pool.join()
# # get shortest paths of each graph in Gn
# splist = [[] for _ in range(len(Gn))]
# # get shortest path graphs of Gn
# getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed'])
# itr = zip(Gn, range(0, len(Gn)))
# if len(Gn) < 1000 * n_jobs:
# chunksize = int(len(Gn) / n_jobs) + 1
# else:
# chunksize = 1000
# # chunksize = 300 # int(len(list(itr)) / n_jobs)
# from contextlib import closing
# with closing(Pool(n_jobs)) as pool:
## for i, sp in tqdm(
# res = pool.imap_unordered(getsp_partial, itr, 10)
## desc='getting shortest paths',
## file=sys.stdout):
## splist[i] = sp
## time.sleep(10)
# pool.close()
# pool.join()
# ss = 0
# ss += sys.getsizeof(splist)
# for spss in splist:
@@ -150,8 +133,12 @@ def structuralspkernel(*args,
# # ---- direct running, normally use single CPU core. ----
# splist = []
# for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout):
# splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed']))
# if compute_method == 'trie':
# for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout):
# splist.append(get_sps_as_trie(g, weight, ds_attrs['is_directed']))
# else:
# for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout):
# splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed']))

# # ---- only for the Fast Computation of Shortest Path Kernel (FCSP)
# sp_ml = [0] * len(Gn) # shortest path matrices
@@ -177,33 +164,17 @@ def structuralspkernel(*args,
def init_worker(spl_toshare, gs_toshare):
global G_spl, G_gs
G_spl = spl_toshare
G_gs = gs_toshare
do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
node_kernels, edge_kernels)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(splist, Gn), n_jobs=n_jobs)

# # ---- use pool.imap_unordered to parallel and track progress. ----
# pool = Pool(n_jobs)
# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
# node_kernels, edge_kernels)
# itr = zip(combinations_with_replacement(Gn, 2),
# combinations_with_replacement(splist, 2),
# combinations_with_replacement(range(0, len(Gn)), 2))
# len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
# if len_itr < 1000 * n_jobs:
# chunksize = int(len_itr / n_jobs) + 1
# else:
# chunksize = 1000
# for i, j, kernel in tqdm(
# pool.imap_unordered(do_partial, itr, chunksize),
# desc='calculating kernels',
# file=sys.stdout):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel
# pool.close()
# pool.join()
G_gs = gs_toshare
if compute_method == 'trie':
do_partial = partial(wrapper_ssp_do_trie, ds_attrs, node_label, edge_label,
node_kernels, edge_kernels)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(splist, Gn), n_jobs=n_jobs)
else:
do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
node_kernels, edge_kernels)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(splist, Gn), n_jobs=n_jobs)
# # ---- use pool.map to parallel. ----
# pool = Pool(n_jobs)
@@ -244,14 +215,22 @@ def structuralspkernel(*args,


# # ---- direct running, normally use single CPU core. ----
# from itertools import combinations_with_replacement
# itr = combinations_with_replacement(range(0, len(Gn)), 2)
# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
# kernel = structuralspkernel_do(Gn[i], Gn[j], splist[i], splist[j],
# ds_attrs, node_label, edge_label, node_kernels, edge_kernels)
## if(kernel > 1):
## print("error here ")
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel
# if compute_method == 'trie':
# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
# kernel = ssp_do_trie(Gn[i], Gn[j], splist[i], splist[j],
# ds_attrs, node_label, edge_label, node_kernels, edge_kernels)
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel
# else:
# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
# kernel = structuralspkernel_do(Gn[i], Gn[j], splist[i], splist[j],
# ds_attrs, node_label, edge_label, node_kernels, edge_kernels)
# # if(kernel > 1):
# # print("error here ")
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel

run_time = time.time() - start_time
print(
@@ -267,75 +246,11 @@ def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label,
kernel = 0

# First, compute shortest path matrices, method borrowed from FCSP.
vk_dict = {} # shortest path matrices dict
if ds_attrs['node_labeled']:
# node symb and non-synb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['mix']
for n1, n2 in product(
g1.nodes(data=True), g2.nodes(data=True)):
vk_dict[(n1[0], n2[0])] = kn(
n1[1][node_label], n2[1][node_label],
n1[1]['attributes'], n2[1]['attributes'])
# node symb labeled
else:
kn = node_kernels['symb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
n2[1][node_label])
else:
# node non-synb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['nsymb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
n2[1]['attributes'])
# node unlabeled
else:
pass

# Then, compute kernels between all pairs of edges, which idea is an
vk_dict = getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs)
# Then, compute kernels between all pairs of edges, which is an idea of
# extension of FCSP. It suits sparse graphs, which is the most case we
# went though. For dense graphs, this would be slow.
ek_dict = {} # dict of edge kernels
if ds_attrs['edge_labeled']:
# edge symb and non-synb labeled
if ds_attrs['edge_attr_dim'] > 0:
ke = edge_kernels['mix']
for e1, e2 in product(
g1.edges(data=True), g2.edges(data=True)):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
e1[2]['attributes'], e2[2]['attributes'])
ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
# edge symb labeled
else:
ke = edge_kernels['symb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
else:
# edge non-synb labeled
if ds_attrs['edge_attr_dim'] > 0:
ke = edge_kernels['nsymb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = kn(e1[2]['attributes'], e2[2]['attributes'])
ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
# edge unlabeled
else:
pass
ek_dict = getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs)

# compute graph kernels
if vk_dict:
@@ -420,6 +335,399 @@ def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels,
return i, j, structuralspkernel_do(G_gs[i], G_gs[j], G_spl[i], G_spl[j],
ds_attrs, node_label, edge_label,
node_kernels, edge_kernels)
def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label,
node_kernels, edge_kernels):
# # traverse all paths in graph1. Deep-first search is applied.
# def traverseBothTrie(root, trie2, kernel, pcurrent=[]):
# for key, node in root['children'].items():
# pcurrent.append(key)
# if node['isEndOfWord']:
# # print(node['count'])
# traverseTrie2(trie2.root, pcurrent, kernel,
# pcurrent=[])
# if node['children'] != {}:
# traverseBothTrie(node, trie2, kernel, pcurrent)
# else:
# del pcurrent[-1]
# if pcurrent != []:
# del pcurrent[-1]
#
#
# # traverse all paths in graph2 and find out those that are not in
# # graph1. Deep-first search is applied.
# def traverseTrie2(root, p1, kernel, pcurrent=[]):
# for key, node in root['children'].items():
# pcurrent.append(key)
# if node['isEndOfWord']:
# # print(node['count'])
# kernel[0] += computePathKernel(p1, pcurrent, vk_dict, ek_dict)
# if node['children'] != {}:
# traverseTrie2(node, p1, kernel, pcurrent)
# else:
# del pcurrent[-1]
# if pcurrent != []:
# del pcurrent[-1]
#
#
# kernel = [0]
#
# # First, compute shortest path matrices, method borrowed from FCSP.
# vk_dict = getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs)
# # Then, compute kernels between all pairs of edges, which is an idea of
# # extension of FCSP. It suits sparse graphs, which is the most case we
# # went though. For dense graphs, this would be slow.
# ek_dict = getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs)
#
# # compute graph kernels
# traverseBothTrie(trie1[0].root, trie2[0], kernel)
#
# kernel = kernel[0] / (trie1[1] * trie2[1]) # calculate mean average

# # traverse all paths in graph1. Deep-first search is applied.
# def traverseBothTrie(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
# for key, node in root['children'].items():
# pcurrent.append(key)
# if node['isEndOfWord']:
# # print(node['count'])
# traverseTrie2(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
# pcurrent=[])
# if node['children'] != {}:
# traverseBothTrie(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
# else:
# del pcurrent[-1]
# if pcurrent != []:
# del pcurrent[-1]
#
#
# # traverse all paths in graph2 and find out those that are not in
# # graph1. Deep-first search is applied.
# def traverseTrie2(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
# for key, node in root['children'].items():
# pcurrent.append(key)
# if node['isEndOfWord']:
# # print(node['count'])
# kernel[0] += computePathKernel(p1, pcurrent, vk_dict, ek_dict)
# if node['children'] != {}:
# traverseTrie2(node, p1, kernel, vk_dict, ek_dict, pcurrent)
# else:
# del pcurrent[-1]
# if pcurrent != []:
# del pcurrent[-1]
kernel = [0]

# First, compute shortest path matrices, method borrowed from FCSP.
vk_dict = getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs)
# Then, compute kernels between all pairs of edges, which is an idea of
# extension of FCSP. It suits sparse graphs, which is the most case we
# went though. For dense graphs, this would be slow.
ek_dict = getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs)

# compute graph kernels
# traverseBothTrie(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)
if vk_dict:
if ek_dict:
traverseBothTriem(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)
else:
traverseBothTriev(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)
else:
if ek_dict:
traverseBothTriee(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)
else:
traverseBothTrieu(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)

kernel = kernel[0] / (trie1[1] * trie2[1]) # calculate mean average

return kernel


def wrapper_ssp_do_trie(ds_attrs, node_label, edge_label, node_kernels,
edge_kernels, itr):
i = itr[0]
j = itr[1]
return i, j, ssp_do_trie(G_gs[i], G_gs[j], G_spl[i], G_spl[j], ds_attrs,
node_label, edge_label, node_kernels, edge_kernels)

def getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs):
# compute shortest path matrices, method borrowed from FCSP.
vk_dict = {} # shortest path matrices dict
if ds_attrs['node_labeled']:
# node symb and non-synb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['mix']
for n1, n2 in product(
g1.nodes(data=True), g2.nodes(data=True)):
vk_dict[(n1[0], n2[0])] = kn(
n1[1][node_label], n2[1][node_label],
n1[1]['attributes'], n2[1]['attributes'])
# node symb labeled
else:
kn = node_kernels['symb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
n2[1][node_label])
else:
# node non-synb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['nsymb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
n2[1]['attributes'])
# node unlabeled
else:
pass
return vk_dict


def getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs):
# compute kernels between all pairs of edges, which is an idea of
# extension of FCSP. It suits sparse graphs, which is the most case we
# went though. For dense graphs, this would be slow.
ek_dict = {} # dict of edge kernels
if ds_attrs['edge_labeled']:
# edge symb and non-synb labeled
if ds_attrs['edge_attr_dim'] > 0:
ke = edge_kernels['mix']
for e1, e2 in product(
g1.edges(data=True), g2.edges(data=True)):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
e1[2]['attributes'], e2[2]['attributes'])
ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
# edge symb labeled
else:
ke = edge_kernels['symb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
else:
# edge non-synb labeled
if ds_attrs['edge_attr_dim'] > 0:
ke = edge_kernels['nsymb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
# edge unlabeled
else:
pass
return ek_dict
# traverse all paths in graph1. Deep-first search is applied.
def traverseBothTriem(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
for key, node in root['children'].items():
pcurrent.append(key)
if node['isEndOfWord']:
# print(node['count'])
traverseTrie2m(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
pcurrent=[])
if node['children'] != {}:
traverseBothTriem(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
else:
del pcurrent[-1]
if pcurrent != []:
del pcurrent[-1]
# traverse all paths in graph2 and find out those that are not in
# graph1. Deep-first search is applied.
def traverseTrie2m(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
for key, node in root['children'].items():
pcurrent.append(key)
if node['isEndOfWord']:
# print(node['count'])
if len(p1) == len(pcurrent):
kpath = vk_dict[(p1[0], pcurrent[0])]
if kpath:
for idx in range(1, len(p1)):
kpath *= vk_dict[(p1[idx], pcurrent[idx])] * \
ek_dict[((p1[idx-1], p1[idx]),
(pcurrent[idx-1], pcurrent[idx]))]
if not kpath:
break
kernel[0] += kpath # add up kernels of all paths
if node['children'] != {}:
traverseTrie2m(node, p1, kernel, vk_dict, ek_dict, pcurrent)
else:
del pcurrent[-1]
if pcurrent != []:
del pcurrent[-1]

# traverse all paths in graph1. Deep-first search is applied.
def traverseBothTriev(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
for key, node in root['children'].items():
pcurrent.append(key)
if node['isEndOfWord']:
# print(node['count'])
traverseTrie2v(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
pcurrent=[])
if node['children'] != {}:
traverseBothTriev(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
else:
del pcurrent[-1]
if pcurrent != []:
del pcurrent[-1]
# traverse all paths in graph2 and find out those that are not in
# graph1. Deep-first search is applied.
def traverseTrie2v(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
for key, node in root['children'].items():
pcurrent.append(key)
if node['isEndOfWord']:
# print(node['count'])
if len(p1) == len(pcurrent):
kpath = vk_dict[(p1[0], pcurrent[0])]
if kpath:
for idx in range(1, len(p1)):
kpath *= vk_dict[(p1[idx], pcurrent[idx])]
if not kpath:
break
kernel[0] += kpath # add up kernels of all paths
if node['children'] != {}:
traverseTrie2v(node, p1, kernel, vk_dict, ek_dict, pcurrent)
else:
del pcurrent[-1]
if pcurrent != []:
del pcurrent[-1]
# traverse all paths in graph1. Deep-first search is applied.
def traverseBothTriee(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
for key, node in root['children'].items():
pcurrent.append(key)
if node['isEndOfWord']:
# print(node['count'])
traverseTrie2e(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
pcurrent=[])
if node['children'] != {}:
traverseBothTriee(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
else:
del pcurrent[-1]
if pcurrent != []:
del pcurrent[-1]
# traverse all paths in graph2 and find out those that are not in
# graph1. Deep-first search is applied.
def traverseTrie2e(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
for key, node in root['children'].items():
pcurrent.append(key)
if node['isEndOfWord']:
# print(node['count'])
if len(p1) == len(pcurrent):
if len(p1) == 0:
kernel += 1
else:
kpath = 1
for idx in range(0, len(p1) - 1):
kpath *= ek_dict[((p1[idx], p1[idx+1]),
(pcurrent[idx], pcurrent[idx+1]))]
if not kpath:
break
kernel[0] += kpath # add up kernels of all paths
if node['children'] != {}:
traverseTrie2e(node, p1, kernel, vk_dict, ek_dict, pcurrent)
else:
del pcurrent[-1]
if pcurrent != []:
del pcurrent[-1]
# traverse all paths in graph1. Deep-first search is applied.
def traverseBothTrieu(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
for key, node in root['children'].items():
pcurrent.append(key)
if node['isEndOfWord']:
# print(node['count'])
traverseTrie2u(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
pcurrent=[])
if node['children'] != {}:
traverseBothTrieu(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
else:
del pcurrent[-1]
if pcurrent != []:
del pcurrent[-1]
# traverse all paths in graph2 and find out those that are not in
# graph1. Deep-first search is applied.
def traverseTrie2u(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
for key, node in root['children'].items():
pcurrent.append(key)
if node['isEndOfWord']:
# print(node['count'])
if len(p1) == len(pcurrent):
kernel[0] += 1
if node['children'] != {}:
traverseTrie2u(node, p1, kernel, vk_dict, ek_dict, pcurrent)
else:
del pcurrent[-1]
if pcurrent != []:
del pcurrent[-1]
#def computePathKernel(p1, p2, vk_dict, ek_dict):
# kernel = 0
# if vk_dict:
# if ek_dict:
# if len(p1) == len(p2):
# kpath = vk_dict[(p1[0], p2[0])]
# if kpath:
# for idx in range(1, len(p1)):
# kpath *= vk_dict[(p1[idx], p2[idx])] * \
# ek_dict[((p1[idx-1], p1[idx]),
# (p2[idx-1], p2[idx]))]
# if not kpath:
# break
# kernel += kpath # add up kernels of all paths
# else:
# if len(p1) == len(p2):
# kpath = vk_dict[(p1[0], p2[0])]
# if kpath:
# for idx in range(1, len(p1)):
# kpath *= vk_dict[(p1[idx], p2[idx])]
# if not kpath:
# break
# kernel += kpath # add up kernels of all paths
# else:
# if ek_dict:
# if len(p1) == len(p2):
# if len(p1) == 0:
# kernel += 1
# else:
# kpath = 1
# for idx in range(0, len(p1) - 1):
# kpath *= ek_dict[((p1[idx], p1[idx+1]),
# (p2[idx], p2[idx+1]))]
# if not kpath:
# break
# kernel += kpath # add up kernels of all paths
# else:
# if len(p1) == len(p2):
# kernel += 1
#
# return kernel


def get_shortest_paths(G, weight, directed):
@@ -457,7 +765,54 @@ def get_shortest_paths(G, weight, directed):
return sp


def wrapper_getSP(weight, directed, itr_item):
def wrapper_getSP_naive(weight, directed, itr_item):
g = itr_item[0]
i = itr_item[1]
return i, get_shortest_paths(g, weight, directed)


def get_sps_as_trie(G, weight, directed):
"""Get all shortest paths of a graph and insert them into a trie.

Parameters
----------
G : NetworkX graph
The graph whose shortest paths are calculated.
weight : string/None
edge attribute used as weight to calculate the shortest path.
directed: boolean
Whether graph is directed.

Return
------
sptrie, num_sp : Trie, int
The trie containing all shortest paths of the graph, where each path is stored as a list of nodes, and the total number of such paths (each single node counts as a path of length 0).
sptrie = Trie()
lensp = 0
for n1, n2 in combinations(G.nodes(), 2):
try:
spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
except nx.NetworkXNoPath: # nodes not connected
pass
else:
lensp += len(spltemp)
if not directed:
lensp += len(spltemp)
for sp in spltemp:
sptrie.insertWord(sp)
# each edge walk is counted twice, starting from both its extreme nodes.
if not directed:
sptrie.insertWord(sp[::-1])
# add single nodes as length 0 paths.
for n in G.nodes():
sptrie.insertWord([n])

return sptrie, lensp + nx.number_of_nodes(G)


def wrapper_getSP_trie(weight, directed, itr_item):
g = itr_item[0]
i = itr_item[1]
return i, get_shortest_paths(g, weight, directed)
return i, get_sps_as_trie(g, weight, directed)
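
The new trie-based code path relies on pygraph.utils.trie.Trie, whose implementation is not part of this diff. Below is a minimal sketch of the interface that get_sps_as_trie and the traverseBothTrie*/traverseTrie2* helpers assume, inferred only from the calls above (dict nodes with 'children', 'isEndOfWord' and 'count', plus an insertWord method); it is an illustration, not the library's actual code.

class Trie:
    """Minimal prefix-tree sketch matching the interface used above (assumption)."""
    def __init__(self):
        self.root = {'children': {}, 'isEndOfWord': False, 'count': 0}

    def insertWord(self, word):
        # insert one shortest path (a list of node ids) as a word in the trie
        node = self.root
        for key in word:
            node = node['children'].setdefault(
                key, {'children': {}, 'isEndOfWord': False, 'count': 0})
        node['isEndOfWord'] = True
        node['count'] += 1

t = Trie()
t.insertWord([0, 1, 2])   # a shortest path 0 -> 1 -> 2
t.insertWord([2, 1, 0])   # its reverse, as inserted for undirected graphs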

+1  -1    pygraph/kernels/untilHPathKernel.py

@@ -27,7 +27,7 @@ def untilhpathkernel(*args,
edge_label='bond_type',
depth=10,
k_func='tanimoto',
compute_method='naive',
compute_method='trie',
n_jobs=None):
"""Calculate path graph kernels up to depth/hight h between graphs.
Parameters
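
To tie the pieces together, here is a hedged usage sketch of the new compute_method option for structuralspkernel, using the same sub-kernel dictionaries as the notebooks above. It assumes, as for other kernels in this repository, that the first positional argument may be a list of graphs and that the return value includes the Gram matrix; neither is shown explicitly in this diff, and the toy graphs are illustrative data only.

import functools
import networkx as nx
from pygraph.kernels.structuralspKernel import structuralspkernel
from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct

# two tiny labelled graphs (illustrative data only)
g1 = nx.Graph()
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})])
g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'})])
g2 = nx.Graph()
g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'})])
g2.add_edge(0, 1, bond_type='1')

mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}

# compute_method='trie' selects the new trie-based shortest-path storage;
# 'naive' keeps the previous list-of-paths behaviour.
result = structuralspkernel([g1, g2],
                            node_label='atom',
                            edge_label='bond_type',
                            node_kernels=sub_kernels,
                            edge_kernels=sub_kernels,
                            compute_method='trie',
                            n_jobs=2)
# result is assumed to contain the Gram matrix (and possibly the run time);
# its exact structure is not shown in this diff.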

