
1. Upgrade spkernel: it now supports symbolic and non-symbolic node labels and directed and undirected graphs, and allows user-defined node kernels (a usage sketch follows this list).

2. model_selection_precomputed can now save all results as human-readable text.
3. Modify pygraph.utils.utils.floydTransformation and pygraph.utils.graphdataset.get_dataset_attributes.
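
As a quick illustration of item 1, here is a minimal usage sketch of the upgraded spkernel (not part of the commit itself). It relies on the new node_kernels argument described in the pygraph/kernels/spKernel.py diff below, uses deltakernel and kernelproduct from the new pygraph/utils/kernels.py, and takes sklearn's rbf_kernel as the non-symbolic kernel, as the gaussiankernel docstring suggests; the mix_kernel helper and the dataset choice are illustrative only.

import sys
sys.path.insert(0, "../")  # as in notebooks/libs.py, so that pygraph is importable
from sklearn.metrics.pairwise import rbf_kernel

from pygraph.utils.graphfiles import loadDataset
from pygraph.utils.kernels import deltakernel, kernelproduct
from pygraph.kernels.spKernel import spkernel

# node_kernels: 'symb' compares two symbolic node labels, 'nsymb' compares two
# non-symbolic attribute arrays, 'mix' receives both kinds of labels for a node pair.
def mix_kernel(s1, s2, a1, a2):  # hypothetical helper, not part of the commit
    return kernelproduct(deltakernel, rbf_kernel, s1, s2, a1, a2)

node_kernels = {'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mix_kernel}

Gn, y = loadDataset('../datasets/MAO/dataset.ds')  # any entry from datasets/ds.py would do
Kmatrix, run_time, idx = spkernel(Gn, node_label='atom', node_kernels=node_kernels)
y = [y[i] for i in idx]  # edgeless graphs are removed; idx lists the graphs that were kept
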
v0.1
jajupmochi 7 years ago
parent
commit
efca08fb30
24 changed files with 1330 additions and 18992 deletions
  1. +4 -2      .gitignore
  2. +112 -0    datasets/ds.py
  3. +8 -0      notebooks/libs.py
  4. +0 -3      notebooks/paths.py
  5. +17 -138   notebooks/run_commonwalkkernel.ipynb
  6. +0 -1555   notebooks/run_marginalizedkernel_acyclic.ipynb
  7. +159 -9    notebooks/run_pathkernel.ipynb
  8. +0 -822    notebooks/run_pathkernel_acyclic.ipynb
  9. +56 -94    notebooks/run_spkernel.ipynb
  10. +56 -0    notebooks/run_spkernel.py
  11. +0 -599   notebooks/run_spkernel_acyclic.ipynb
  12. +0 -3627  notebooks/run_untildpathkernel_acyclic.ipynb
  13. +0 -7476  notebooks/run_untilnwalkkernel.ipynb
  14. +0 -3743  notebooks/run_weisfeilerLehmankernel_acyclic.ipynb
  15. +0 -18    pygraph/kernels/deltaKernel.py
  16. +0 -95    pygraph/kernels/randomwalkKernel.py
  17. +211 -42  pygraph/kernels/spKernel.py
  18. +0 -219   pygraph/kernels/untildPathKernel.py
  19. +0 -262   pygraph/kernels/untilnWalkKernel.py
  20. +3 -3     pygraph/utils/graphdataset.py
  21. +83 -0    pygraph/utils/kernels.py
  22. +298 -283 pygraph/utils/model_selection_precomputed.py
  23. +320 -0   pygraph/utils/suffix_tree.py
  24. +3 -2     pygraph/utils/utils.py

+ 4 - 2   .gitignore

@@ -1,7 +1,9 @@
# Jupyter Notebook
.ipynb_checkpoints
datasets
notebooks/results
datasets/*
!datasets/ds.py
notebooks/results/*
requirements/*

__pycache__
##*#

+ 112 - 0   datasets/ds.py

@@ -0,0 +1,112 @@
dslist = [
{
'name': 'Acyclic',
'dataset': '../datasets/acyclic/dataset_bps.ds',
'task': 'regression'
}, # node_labeled
{
'name': 'COIL-DEL',
'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'
}, # edge_labeled
{
'name': 'PAH',
'dataset': '../datasets/PAH/dataset.ds',
}, # unlabeled
{
'name': 'Mutagenicity',
'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'
}, # fully_labeled
{
'name': 'MAO',
'dataset': '../datasets/MAO/dataset.ds',
},
{
'name': 'MUTAG',
'dataset': '../datasets/MUTAG/MUTAG.mat',
'extra_params': {
'am_sp_al_nl_el': [0, 0, 3, 1, 2]
}
},
{
'name': 'Alkane',
'dataset': '../datasets/Alkane/dataset.ds',
'task': 'regression',
'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',
},
{
'name': 'BZR',
'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'
},
{
'name': 'COX2',
'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'
},
{
'name': 'ENZYMES',
'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'
},
{
'name': 'DHFR',
'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'
},
{
'name': 'SYNTHETIC',
'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'
},
{
'name': 'MSRC9',
'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'
},
{
'name': 'MSRC21',
'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'
},
{
'name': 'FIRSTMM_DB',
'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'
},
{
'name': 'PROTEINS',
'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'
},
{
'name': 'PROTEINS_full',
'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'
},
{
'name': 'D&D',
'dataset': '../datasets/D&D/DD.mat',
'extra_params': {
'am_sp_al_nl_el': [0, 1, 2, 1, -1]
}
},
{
'name': 'AIDS',
'dataset': '../datasets/AIDS/AIDS_A.txt'
},
{
'name': 'NCI1',
'dataset': '../datasets/NCI1/NCI1.mat',
'extra_params': {
'am_sp_al_nl_el': [1, 1, 2, 0, -1]
}
},
{
'name': 'NCI109',
'dataset': '../datasets/NCI109/NCI109.mat',
'extra_params': {
'am_sp_al_nl_el': [1, 1, 2, 0, -1]
}
},
{
'name': 'NCI-HIV',
'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',
},

# # not working below
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]

+ 8 - 0   notebooks/libs.py

@@ -0,0 +1,8 @@
import sys
import pathlib
sys.path.insert(0, "../")

import numpy as np

from pygraph.utils.model_selection_precomputed import model_selection_for_precomputed_kernel
from datasets.ds import dslist

+ 0 - 3   notebooks/paths.py

@@ -1,3 +0,0 @@
import sys
import pathlib
sys.path.insert(0, "../")

+ 17 - 138   notebooks/run_commonwalkkernel.ipynb   (file diff suppressed because it is too large)


+ 0 - 1555   notebooks/run_marginalizedkernel_acyclic.ipynb   (file diff suppressed because it is too large)


+ 159 - 9   notebooks/run_pathkernel.ipynb   (file diff suppressed because it is too large)


+ 0 - 822   notebooks/run_pathkernel_acyclic.ipynb   (file diff suppressed because it is too large)


+ 56 - 94   notebooks/run_spkernel.ipynb   (file diff suppressed because it is too large)


+ 56 - 0   notebooks/run_spkernel.py

@@ -0,0 +1,56 @@
from libs import *
from pygraph.kernels.spKernel import spkernel

dslist = [
# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node_labeled
# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge_labeled
# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',}, # unlabeled
{'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # fully_labeled
# {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',},

# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}},
# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',},
# {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'},
# {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'},
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'},
# {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'},
# {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'},
# {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'},
# {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'},

# {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'},
# {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'},
# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}},
# {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'},
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}},
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}},
# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',},
# # not working below
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
estimator = spkernel
param_grid_precomputed = {}
param_grid = [{'C': np.logspace(-10, 10, num = 41, base = 10)},
{'alpha': np.logspace(-10, 10, num = 41, base = 10)}]

for ds in dslist:
print()
print(ds['name'])
model_selection_for_precomputed_kernel(
ds['dataset'], estimator, param_grid_precomputed,
(param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]),
(ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30,
datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
extra_params=(ds['extra_params'] if 'extra_params' in ds else None))
print()
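
To also exercise the new features from the commit message, the call above could presumably pass the ds_name argument added to model_selection_for_precomputed_kernel in this commit (so the human-readable report lands in ../notebooks/results/spkernel/<name>.txt), and feed user-defined node kernels through param_grid_precomputed, since the estimator is invoked as estimator(dataset, **params). A sketch, reusing the hypothetical kernels from the example near the top:

param_grid_precomputed = {'node_kernels': [  # single setting; deltakernel, rbf_kernel and
    {'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mix_kernel}]}  # mix_kernel as sketched above

for ds in dslist:
    model_selection_for_precomputed_kernel(
        ds['dataset'], estimator, param_grid_precomputed,
        (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]),
        (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30,
        datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
        extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
        ds_name=ds['name'])
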

+ 0 - 599   notebooks/run_spkernel_acyclic.ipynb   (file diff suppressed because it is too large)


+ 0 - 3627   notebooks/run_untildpathkernel_acyclic.ipynb   (file diff suppressed because it is too large)


+ 0 - 7476   notebooks/run_untilnwalkkernel.ipynb   (file diff suppressed because it is too large)


+ 0 - 3743   notebooks/run_weisfeilerLehmankernel_acyclic.ipynb   (file diff suppressed because it is too large)


+ 0 - 18   pygraph/kernels/deltaKernel.py

@@ -1,18 +0,0 @@
def deltakernel(condition):
"""Return 1 if condition holds, 0 otherwise.

Parameters
----------
condition : Boolean
A condition, according to which the kernel is set to 1 or 0.

Return
------
kernel : integer
Delta kernel.

References
----------
[1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between labeled graphs. In Proceedings of the 20th International Conference on Machine Learning, Washington, DC, United States, 2003.
"""
return condition #(1 if condition else 0)

+ 0 - 95   pygraph/kernels/randomwalkKernel.py

@@ -1,95 +0,0 @@
"""
@author: linlin
@references: S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, 11(Apr):1201–1242, 2010.
"""

import sys
import pathlib
sys.path.insert(0, "../")
import time

# from collections import Counter

import networkx as nx
import numpy as np


def randomwalkkernel(*args, node_label='atom', edge_label='bond_type', labeled=True, n=10, method=''):
"""Calculate random walk graph kernels.
Parameters
----------
Gn : List of NetworkX graph
List of graphs between which the kernels are calculated.
/
G1, G2 : NetworkX graphs
2 graphs between which the kernel is calculated.
node_label : string
node attribute used as label. The default node label is atom.
edge_label : string
edge attribute used as label. The default edge label is bond_type.
labeled : boolean
Whether the graphs are labeled. The default is True.
n : integer
Longest length of walks.
method : string
Method used to compute the random walk kernel. Available methods are 'sylvester', 'conjugate', 'fp', 'spectral' and 'kron'.

Return
------
Kmatrix : Numpy matrix
Kernel matrix, each element of which is the path kernel up to d between 2 praphs.
"""
method = method.lower()
Gn = args[0] if len(args) == 1 else [args[0], args[1]] # arrange all graphs in a list
Kmatrix = np.zeros((len(Gn), len(Gn)))
n = int(n)

start_time = time.time()

# get all paths of all graphs before calculating kernels to save time, but this may cost a lot of memory for large dataset.
all_walks = [ find_all_walks_until_length(Gn[i], n, node_label = node_label, edge_label = edge_label, labeled = labeled) for i in range(0, len(Gn)) ]

for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
Kmatrix[i][j] = _randomwalkkernel_do(all_walks[i], all_walks[j], node_label = node_label, edge_label = edge_label, labeled = labeled)
Kmatrix[j][i] = Kmatrix[i][j]

run_time = time.time() - start_time
print("\n --- kernel matrix of walk kernel up to %d of size %d built in %s seconds ---" % (n, len(Gn), run_time))

return Kmatrix, run_time


def _randomwalkkernel_do(walks1, walks2, node_label = 'atom', edge_label = 'bond_type', labeled = True, method=''):
"""Calculate walk graph kernels up to n between 2 graphs.

Parameters
----------
walks1, walks2 : list
List of walks in 2 graphs, where for unlabeled graphs, each walk is represented by a list of nodes; while for labeled graphs, each walk is represented by a string consists of labels of nodes and edges on that walk.
node_label : string
node attribute used as label. The default node label is atom.
edge_label : string
edge attribute used as label. The default edge label is bond_type.
labeled : boolean
Whether the graphs are labeled. The default is True.

Return
------
kernel : float
Treelet Kernel between 2 graphs.
"""

if method == 'sylvester':
import warnings
warnings.warn('The Sylvester equation (rather than generalized Sylvester equation) is used; only walks of length 1 is considered.')
from control import dlyap
dpg = nx.tensor_product(G1, G2) # direct product graph
X = dlyap(A, Q, C)
pass

else:
raise Exception('No computation method specified.')

return kernel


+ 211 - 42   pygraph/kernels/spKernel.py

@@ -16,7 +16,7 @@ from pygraph.utils.utils import getSPGraph
from pygraph.utils.graphdataset import get_dataset_attributes


def spkernel(*args, node_label='atom', edge_weight=None):
def spkernel(*args, node_label='atom', edge_weight=None, node_kernels=None):
"""Calculate shortest-path kernels between graphs.

Parameters
@@ -27,24 +27,50 @@ def spkernel(*args, node_label='atom', edge_weight=None):
G1, G2 : NetworkX graphs
2 graphs between which the kernel is calculated.
edge_weight : string
Edge attribute corresponding to the edge weight.
Edge attribute name corresponding to the edge weight.
node_kernels: dict
A dictionary of kernel functions for nodes, including 3 items: 'symb' for symbolic node labels, 'nsymb' for non-symbolic node labels, and 'mix' for both. The first 2 functions take two node labels as parameters, and the 'mix' function takes 4 parameters: a symbolic and a non-symbolic label for each of the two nodes. Each non-symbolic label is given as a 2-D array of shape (n_samples, n_features). Each function returns a number as the kernel value. Ignored when nodes are unlabeled.

Return
------
Kmatrix : Numpy matrix
Kernel matrix, each element of which is the sp kernel between 2 graphs.
"""
# pre-process
Gn = args[0] if len(args) == 1 else [args[0], args[1]]
Kmatrix = np.zeros((len(Gn), len(Gn)))
try:
some_weight = list(
nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
weight = edge_label if isinstance(some_weight, float) or isinstance(
some_weight, int) else None
except:
weight = None

Gn = [nx.to_directed(G) for G in Gn]

weight = None
if edge_weight == None:
print('\n No edge weight specified. Set all weights to 1.\n')
else:
try:
some_weight = list(
nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
if isinstance(some_weight, float) or isinstance(some_weight, int):
weight = edge_weight
else:
print(
'\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
% edge_weight)
except:
print(
'\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
% edge_weight)
ds_attrs = get_dataset_attributes(
Gn, attr_names=['node_labeled'], node_label=node_label)
Gn,
attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
node_label=node_label)

# remove graphs with no edges, as no sp can be found in their structures, so the kernel between such a graph and itself will be zero.
len_gn = len(Gn)
Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
idx = [G[0] for G in Gn]
Gn = [G[1] for G in Gn]
if len(Gn) != len_gn:
print('\n %d graphs are removed as they don\'t contain edges.\n' %
(len_gn - len(Gn)))

start_time = time.time()

@@ -54,44 +80,187 @@ def spkernel(*args, node_label='atom', edge_weight=None):
for G in tqdm(Gn, desc='getting sp graphs', file=sys.stdout)
]

Kmatrix = np.zeros((len(Gn), len(Gn)))
pbar = tqdm(
total=((len(Gn) + 1) * len(Gn) / 2),
desc='calculating kernels',
file=sys.stdout)
if ds_attrs['node_labeled']:
for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
for e1 in Gn[i].edges(data=True):
for e2 in Gn[j].edges(data=True):
# cost of a node to itself equals to 0, cost between two disconnected nodes is Inf.
if e1[2]['cost'] != 0 and e1[2] != np.Inf and e1[2]['cost'] == e2[2]['cost'] and {
Gn[i].nodes[e1[0]][node_label],
Gn[i].nodes[e1[1]][node_label]
} == {
Gn[j].nodes[e2[0]][node_label],
Gn[j].nodes[e2[1]][node_label]
}:
Kmatrix[i][j] += 1
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)
# node symb and non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
if ds_attrs['is_directed']:
for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
for e1 in Gn[i].edges(data=True):
for e2 in Gn[j].edges(data=True):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['mix']
try:
n11, n12, n21, n22 = Gn[i].nodes[e1[
0]], Gn[i].nodes[e1[1]], Gn[
j].nodes[e2[0]], Gn[j].nodes[
e2[1]]
kn1 = kn(n11[node_label], n21[
node_label], [n11['attributes']],
[n21['attributes']]) * kn(
n12[node_label],
n22[node_label],
[n12['attributes']],
[n22['attributes']])
Kmatrix[i][j] += kn1
except KeyError: # missing labels or attributes
pass
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)

else:
for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
for e1 in Gn[i].edges(data=True):
for e2 in Gn[j].edges(data=True):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['mix']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[
0]], Gn[i].nodes[e1[1]], Gn[
j].nodes[e2[0]], Gn[j].nodes[
e2[1]]
kn1 = kn(n11[node_label], n21[
node_label], [n11['attributes']],
[n21['attributes']]) * kn(
n12[node_label],
n22[node_label],
[n12['attributes']],
[n22['attributes']])
kn2 = kn(n11[node_label], n22[
node_label], [n11['attributes']],
[n22['attributes']]) * kn(
n12[node_label],
n21[node_label],
[n12['attributes']],
[n21['attributes']])
Kmatrix[i][j] += kn1 + kn2
except KeyError: # missing labels or attributes
pass
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)
# node symb labeled
else:
if ds_attrs['is_directed']:
for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
for e1 in Gn[i].edges(data=True):
for e2 in Gn[j].edges(data=True):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['symb']
try:
n11, n12, n21, n22 = Gn[i].nodes[e1[
0]], Gn[i].nodes[e1[1]], Gn[
j].nodes[e2[0]], Gn[j].nodes[
e2[1]]
kn1 = kn(n11[node_label],
n21[node_label]) * kn(
n12[node_label],
n22[node_label])
Kmatrix[i][j] += kn1
except KeyError: # missing labels
pass
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)

else:
for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
for e1 in Gn[i].edges(data=True):
for e2 in Gn[j].edges(data=True):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['symb']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[
0]], Gn[i].nodes[e1[1]], Gn[
j].nodes[e2[0]], Gn[j].nodes[
e2[1]]
kn1 = kn(n11[node_label],
n21[node_label]) * kn(
n12[node_label],
n22[node_label])
kn2 = kn(n11[node_label],
n22[node_label]) * kn(
n12[node_label],
n21[node_label])
Kmatrix[i][j] += kn1 + kn2
except KeyError: # missing labels
pass
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)
else:
for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
# kernel_t = [ e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])) \
# for e1 in Sn[i].edges(data = True) for e2 in Sn[j].edges(data = True) ]
# Kmatrix[i][j] = np.sum(kernel_t)
# Kmatrix[j][i] = Kmatrix[i][j]

for e1 in Gn[i].edges(data=True):
for e2 in Gn[j].edges(data=True):
if e1[2]['cost'] != 0 and e1[2] != np.Inf and e1[2]['cost'] == e2[2]['cost']:
Kmatrix[i][j] += 1
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)
# node non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
if ds_attrs['is_directed']:
for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
for e1 in Gn[i].edges(data=True):
for e2 in Gn[j].edges(data=True):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['nsymb']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[
0]], Gn[i].nodes[e1[1]], Gn[
j].nodes[e2[0]], Gn[j].nodes[
e2[1]]
kn1 = kn([n11['attributes']],
[n21['attributes']]) * kn(
[n12['attributes']],
[n22['attributes']])
Kmatrix[i][j] += kn1
except KeyError: # missing attributes
pass
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)
else:
for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
for e1 in Gn[i].edges(data=True):
for e2 in Gn[j].edges(data=True):
if e1[2]['cost'] == e2[2]['cost']:
kn = node_kernels['nsymb']
try:
# each edge walk is counted twice, starting from both its extreme nodes.
n11, n12, n21, n22 = Gn[i].nodes[e1[
0]], Gn[i].nodes[e1[1]], Gn[
j].nodes[e2[0]], Gn[j].nodes[
e2[1]]
kn1 = kn([n11['attributes']],
[n21['attributes']]) * kn(
[n12['attributes']],
[n22['attributes']])
kn2 = kn([n11['attributes']],
[n22['attributes']]) * kn(
[n12['attributes']],
[n21['attributes']])
Kmatrix[i][j] += kn1 + kn2
except KeyError: # missing attributes
pass
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)

# node unlabeled
else:
for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
for e1 in Gn[i].edges(data=True):
for e2 in Gn[j].edges(data=True):
if e1[2]['cost'] == e2[2]['cost']:
Kmatrix[i][j] += 1
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)

run_time = time.time() - start_time
print(
"--- shortest path kernel matrix of size %d built in %s seconds ---" %
(len(Gn), run_time))
"\n --- shortest path kernel matrix of size %d built in %s seconds ---"
% (len(Gn), run_time))

return Kmatrix, run_time
return Kmatrix, run_time, idx

+ 0 - 219   pygraph/kernels/untildPathKernel.py

@@ -1,219 +0,0 @@
"""
@author: linlin
@references: Liva Ralaivola, Sanjay J Swamidass, Hiroto Saigo, and Pierre Baldi. Graph kernels for chemical informatics. Neural networks, 18(8):1093–1110, 2005.
"""

import sys
import pathlib
sys.path.insert(0, "../")
import time
from collections import Counter

import networkx as nx
import numpy as np


def untildpathkernel(*args, node_label='atom', edge_label='bond_type', labeled=True, depth=10, k_func='tanimoto'):
"""Calculate path graph kernels up to depth d between graphs.
Parameters
----------
Gn : List of NetworkX graph
List of graphs between which the kernels are calculated.
/
G1, G2 : NetworkX graphs
2 graphs between which the kernel is calculated.
node_label : string
node attribute used as label. The default node label is atom.
edge_label : string
edge attribute used as label. The default edge label is bond_type.
labeled : boolean
Whether the graphs are labeled. The default is True.
depth : integer
Depth of search. Longest length of paths.
k_func : function
A kernel function used using different notions of fingerprint similarity.

Return
------
Kmatrix : Numpy matrix
Kernel matrix, each element of which is the path kernel up to d between 2 praphs.
"""
depth = int(depth)
if len(args) == 1: # for a list of graphs
Gn = args[0]
Kmatrix = np.zeros((len(Gn), len(Gn)))

start_time = time.time()

# get all paths of all graphs before calculating kernels to save time, but this may cost a lot of memory for large dataset.
all_paths = [find_all_paths_until_length(
Gn[i], depth, node_label=node_label, edge_label=edge_label, labeled=labeled) for i in range(0, len(Gn))]

for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
Kmatrix[i][j] = _untildpathkernel_do(
all_paths[i], all_paths[j], k_func, node_label=node_label, edge_label=edge_label, labeled=labeled)
Kmatrix[j][i] = Kmatrix[i][j]

run_time = time.time() - start_time
print("\n --- kernel matrix of path kernel up to %d of size %d built in %s seconds ---" %
(depth, len(Gn), run_time))

return Kmatrix, run_time

else: # for only 2 graphs

start_time = time.time()

all_paths1 = find_all_paths_until_length(
args[0], depth, node_label=node_label, edge_label=edge_label, labeled=labeled)
all_paths2 = find_all_paths_until_length(
args[1], depth, node_label=node_label, edge_label=edge_label, labeled=labeled)

kernel = _untildpathkernel_do(
all_paths1, all_paths2, k_func, node_label=node_label, edge_label=edge_label, labeled=labeled)

run_time = time.time() - start_time
print("\n --- path kernel up to %d built in %s seconds ---" %
(depth, run_time))

return kernel, run_time


def _untildpathkernel_do(paths1, paths2, k_func, node_label='atom', edge_label='bond_type', labeled=True):
"""Calculate path graph kernels up to depth d between 2 graphs.

Parameters
----------
paths1, paths2 : list
List of paths in 2 graphs, where for unlabeled graphs, each path is represented by a list of nodes; while for labeled graphs, each path is represented by a string consists of labels of nodes and edges on that path.
k_func : function
A kernel function used using different notions of fingerprint similarity.
node_label : string
node attribute used as label. The default node label is atom.
edge_label : string
edge attribute used as label. The default edge label is bond_type.
labeled : boolean
Whether the graphs are labeled. The default is True.

Return
------
kernel : float
Treelet Kernel between 2 graphs.
"""
all_paths = list(set(paths1 + paths2))

if k_func == 'tanimoto':
vector1 = [(1 if path in paths1 else 0) for path in all_paths]
vector2 = [(1 if path in paths2 else 0) for path in all_paths]
kernel_uv = np.dot(vector1, vector2)
kernel = kernel_uv / (len(set(paths1)) + len(set(paths2)) - kernel_uv)

else: # MinMax kernel
path_count1 = Counter(paths1)
path_count2 = Counter(paths2)
vector1 = [(path_count1[key] if (key in path_count1.keys()) else 0)
for key in all_paths]
vector2 = [(path_count2[key] if (key in path_count2.keys()) else 0)
for key in all_paths]
kernel = np.sum(np.minimum(vector1, vector2)) / \
np.sum(np.maximum(vector1, vector2))

return kernel

# this method find paths repetively, it could be faster.


def find_all_paths_until_length(G, length, node_label='atom', edge_label='bond_type', labeled=True):
"""Find all paths with a certain maximum length in a graph. A recursive depth first search is applied.

Parameters
----------
G : NetworkX graphs
The graph in which paths are searched.
length : integer
The maximum length of paths.
node_label : string
node attribute used as label. The default node label is atom.
edge_label : string
edge attribute used as label. The default edge label is bond_type.
labeled : boolean
Whether the graphs are labeled. The default is True.

Return
------
path : list
List of paths retrieved, where for unlabeled graphs, each path is represented by a list of nodes; while for labeled graphs, each path is represented by a string consists of labels of nodes and edges on that path.
"""
all_paths = []
for i in range(0, length + 1):
new_paths = find_all_paths(G, i)
if new_paths == []:
break
all_paths.extend(new_paths)

if labeled == True: # convert paths to strings
path_strs = []
for path in all_paths:
strlist = [G.node[node][node_label] + G[node]
[path[path.index(node) + 1]][edge_label] for node in path[:-1]]
path_strs.append(''.join(strlist) + G.node[path[-1]][node_label])

return path_strs

return all_paths


def find_paths(G, source_node, length):
"""Find all paths with a certain length those start from a source node. A recursive depth first search is applied.

Parameters
----------
G : NetworkX graphs
The graph in which paths are searched.
source_node : integer
The number of the node from where all paths start.
length : integer
The length of paths.

Return
------
path : list of list
List of paths retrieved, where each path is represented by a list of nodes.
"""
return [[source_node]] if length == 0 else \
[[source_node] + path for neighbor in G[source_node]
for path in find_paths(G, neighbor, length - 1) if source_node not in path]


def find_all_paths(G, length):
"""Find all paths with a certain length in a graph. A recursive depth first search is applied.

Parameters
----------
G : NetworkX graphs
The graph in which paths are searched.
length : integer
The length of paths.

Return
------
path : list of list
List of paths retrieved, where each path is represented by a list of nodes.
"""
all_paths = []
for node in G:
all_paths.extend(find_paths(G, node, length))

# The following process is not carried out according to the original article
# all_paths_r = [ path[::-1] for path in all_paths ]

# # For each path, two presentation are retrieved from its two extremities. Remove one of them.
# for idx, path in enumerate(all_paths[:-1]):
# for path2 in all_paths_r[idx+1::]:
# if path == path2:
# all_paths[idx] = []
# break

# return list(filter(lambda a: a != [], all_paths))
return all_paths

+ 0 - 262   pygraph/kernels/untilnWalkKernel.py

@@ -1,262 +0,0 @@
"""
@author: linlin
@references:
[1] Thomas Gärtner, Peter Flach, and Stefan Wrobel. On graph kernels: Hardness results and efficient alternatives. Learning Theory and Kernel Machines, pages 129–143, 2003.
"""

import sys
import pathlib
sys.path.insert(0, "../")
import time

from collections import Counter

import networkx as nx
import numpy as np


def untilnwalkkernel(*args,
node_label='atom',
edge_label='bond_type',
labeled=True,
n=None,
compute_method='direct'):
"""Calculate common walk graph kernels up to depth d between graphs.
Parameters
----------
Gn : List of NetworkX graph
List of graphs between which the kernels are calculated.
/
G1, G2 : NetworkX graphs
2 graphs between which the kernel is calculated.
node_label : string
node attribute used as label. The default node label is atom.
edge_label : string
edge attribute used as label. The default edge label is bond_type.
labeled : boolean
Whether the graphs are labeled. The default is True.
n : integer
Longest length of walks.
compute_method : string
Method used to compute walk kernel. The Following choices are available:
'direct' : direct product graph method, as shown in reference [1]. The time complexity is O(n^6) for unlabeled graphs with n vertices.
'brute' : brute force, simply search for all walks and compare them.

Return
------
Kmatrix : Numpy matrix
Kernel matrix, each element of which is the path kernel up to d between 2 graphs.
"""
# arrange all graphs in a list
Gn = args[0] if len(args) == 1 else [args[0], args[1]]
Kmatrix = np.zeros((len(Gn), len(Gn)))
n = int(n)

start_time = time.time()

# direct product graph method
if compute_method == 'direct':
for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
Kmatrix[i][j] = _untilnwalkkernel_direct(
Gn[i], Gn[j], node_label, edge_label, labeled)
Kmatrix[j][i] = Kmatrix[i][j]

# search all paths use brute force.
elif compute_method == 'brute':
# get all paths of all graphs before calculating kernels to save time, but this may cost a lot of memory for large dataset.
all_walks = [
find_all_walks_until_length(Gn[i], n, node_label, edge_label,
labeled) for i in range(0, len(Gn))
]

for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
Kmatrix[i][j] = _untilnwalkkernel_brute(
all_walks[i],
all_walks[j],
node_label=node_label,
edge_label=edge_label,
labeled=labeled)
Kmatrix[j][i] = Kmatrix[i][j]

run_time = time.time() - start_time
print(
"\n --- kernel matrix of walk kernel up to %d of size %d built in %s seconds ---"
% (n, len(Gn), run_time))

return Kmatrix, run_time


def _untilnwalkkernel_direct(G1, G2, node_label, edge_label, labeled):
"""Calculate walk graph kernels up to n between 2 graphs using direct product graphs.

Parameters
----------
G1, G2 : NetworkX graph
Graphs between which the kernel is calculated.
node_label : string
node attribute used as label.
edge_label : string
edge attribute used as label.
labeled : boolean
Whether the graphs are labeled.

Return
------
kernel : float
Treelet Kernel between 2 graphs.
"""

# get tensor product / direct product
gp = nx.tensor_product(G1, G2)
from matplotlib import pyplot as plt
nx.draw_networkx(G1)
plt.show()
nx.draw_networkx(G2)
plt.show()
kernel = 0
nx.draw_networkx(gp)

plt.show()
return kernel


def _untilnwalkkernel_brute(walks1,
walks2,
node_label='atom',
edge_label='bond_type',
labeled=True):
"""Calculate walk graph kernels up to n between 2 graphs.

Parameters
----------
walks1, walks2 : list
List of walks in 2 graphs, where for unlabeled graphs, each walk is represented by a list of nodes; while for labeled graphs, each walk is represented by a string consists of labels of nodes and edges on that walk.
node_label : string
node attribute used as label. The default node label is atom.
edge_label : string
edge attribute used as label. The default edge label is bond_type.
labeled : boolean
Whether the graphs are labeled. The default is True.

Return
------
kernel : float
Treelet Kernel between 2 graphs.
"""
counts_walks1 = dict(Counter(walks1))
counts_walks2 = dict(Counter(walks2))
all_walks = list(set(walks1 + walks2))

vector1 = [(counts_walks1[walk] if walk in walks1 else 0)
for walk in all_walks]
vector2 = [(counts_walks2[walk] if walk in walks2 else 0)
for walk in all_walks]
kernel = np.dot(vector1, vector2)

return kernel


# this method find walks repetively, it could be faster.
def find_all_walks_until_length(G,
length,
node_label='atom',
edge_label='bond_type',
labeled=True):
"""Find all walks with a certain maximum length in a graph. A recursive depth first search is applied.

Parameters
----------
G : NetworkX graphs
The graph in which walks are searched.
length : integer
The maximum length of walks.
node_label : string
node attribute used as label. The default node label is atom.
edge_label : string
edge attribute used as label. The default edge label is bond_type.
labeled : boolean
Whether the graphs are labeled. The default is True.

Return
------
walk : list
List of walks retrieved, where for unlabeled graphs, each walk is represented by a list of nodes; while for labeled graphs, each walk is represented by a string consists of labels of nodes and edges on that walk.
"""
all_walks = []
# @todo: in this way, the time complexity is close to N(d^n+d^(n+1)+...+1), which could be optimized to O(Nd^n)
for i in range(0, length + 1):
new_walks = find_all_walks(G, i)
if new_walks == []:
break
all_walks.extend(new_walks)

if labeled == True: # convert paths to strings
walk_strs = []
for walk in all_walks:
strlist = [
G.node[node][node_label] +
G[node][walk[walk.index(node) + 1]][edge_label]
for node in walk[:-1]
]
walk_strs.append(''.join(strlist) + G.node[walk[-1]][node_label])

return walk_strs

return all_walks


def find_walks(G, source_node, length):
"""Find all walks with a certain length those start from a source node. A recursive depth first search is applied.

Parameters
----------
G : NetworkX graphs
The graph in which walks are searched.
source_node : integer
The number of the node from where all walks start.
length : integer
The length of walks.

Return
------
walk : list of list
List of walks retrieved, where each walk is represented by a list of nodes.
"""
return [[source_node]] if length == 0 else \
[ [source_node] + walk for neighbor in G[source_node] \
for walk in find_walks(G, neighbor, length - 1) ]


def find_all_walks(G, length):
"""Find all walks with a certain length in a graph. A recursive depth first search is applied.

Parameters
----------
G : NetworkX graphs
The graph in which walks are searched.
length : integer
The length of walks.

Return
------
walk : list of list
List of walks retrieved, where each walk is represented by a list of nodes.
"""
all_walks = []
for node in G:
all_walks.extend(find_walks(G, node, length))

### The following process is not carried out according to the original article
# all_paths_r = [ path[::-1] for path in all_paths ]

# # For each path, two presentation are retrieved from its two extremities. Remove one of them.
# for idx, path in enumerate(all_paths[:-1]):
# for path2 in all_paths_r[idx+1::]:
# if path == path2:
# all_paths[idx] = []
# break

# return list(filter(lambda a: a != [], all_paths))
return all_walks

+ 3 - 3   pygraph/utils/graphdataset.py

@@ -111,7 +111,7 @@ def get_dataset_attributes(Gn,
if 'attributes' in attrs:
return len(attrs['attributes'])
else:
return False
return 0

def get_edge_attr_dim(Gn):
for G in Gn:
@@ -120,8 +120,8 @@ def get_dataset_attributes(Gn,
if 'attributes' in e[2]:
return len(e[2]['attributes'])
else:
return False
return False
return 0
return 0

if attr_names == []:
attr_names = [


+ 83 - 0   pygraph/utils/kernels.py

@@ -0,0 +1,83 @@
"""Those who are not graph kernels. We can be kernels for nodes or edges!
"""


def deltakernel(x, y):
"""Delta kernel. Return 1 if x == y, 0 otherwise.

Parameters
----------
x, y : any
Two parts to compare.

Return
------
kernel : integer
Delta kernel.

References
----------
[1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between labeled graphs. In Proceedings of the 20th International Conference on Machine Learning, Washington, DC, United States, 2003.
"""
return x == y #(1 if condition else 0)


def gaussiankernel(x, y):
"""Gaussian kernel. Use sklearn.metrics.pairwise.rbf_kernel instead.
"""
pass


def kernelsum(k1, k2, d11, d12, d21=None, d22=None, lamda1=1, lamda2=1):
"""Sum of a pair of kernels.

k = lamda1 * k1(d11, d12) + lamda2 * k2(d21, d22)

Parameters
----------
k1, k2 : function
A pair of kernel functions.
d11, d12:
Inputs of k1. If d21 or d22 is None, apply d11, d12 to both k1 and k2.
d21, d22:
Inputs of k2.
lamda1, lamda2: float
Coefficients of the two kernels in the sum.

Return
------
kernel : integer

"""
if d21 == None or d22 == None:
kernel = lamda1 * k1(d11, d12) + lamda2 * k2(d11, d12)
else:
kernel = lamda1 * k1(d11, d12) + lamda2 * k2(d21, d22)
return kernel


def kernelproduct(k1, k2, d11, d12, d21=None, d22=None, lamda=1):
"""Product of a pair of kernels.

k = lamda * k1(d11, d12) * k2(d21, d22)

Parameters
----------
k1, k2 : function
A pair of kernel functions.
d11, d12:
Inputs of k1. If d21 or d22 is None, apply d11, d12 to both k1 and k2.
d21, d22:
Inputs of k2.
lamda: float
Coefficient of the product.

Return
------
kernel : integer
"""
if d21 == None or d22 == None:
kernel = lamda * k1(d11, d12) * k2(d11, d12)
else:
kernel = lamda * k1(d11, d12) * k2(d21, d22)
return kernel
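
A tiny usage illustration of the helpers above (not part of the commit; the labels and attribute values are made up). When d21/d22 are omitted, both kernels are applied to the same pair of inputs; here they get separate inputs:

from sklearn.metrics.pairwise import rbf_kernel  # the Gaussian kernel suggested above
from pygraph.utils.kernels import deltakernel, kernelsum, kernelproduct

s1, s2 = 'C', 'N'                    # symbolic node labels
a1, a2 = [[1.0, 0.5]], [[0.9, 0.4]]  # non-symbolic attributes as 2-D arrays

k_sum = kernelsum(deltakernel, rbf_kernel, s1, s2, a1, a2, lamda1=0.5, lamda2=0.5)
k_prod = kernelproduct(deltakernel, rbf_kernel, s1, s2, a1, a2)
# deltakernel('C', 'N') is False, so k_prod is 0; with rbf_kernel involved, both
# results come back as 1x1 numpy arrays rather than plain floats.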

+ 298 - 283   pygraph/utils/model_selection_precomputed.py

@@ -4,7 +4,8 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
param_grid_precomputed, param_grid,
model_type, NUM_TRIALS=30,
datafile_y=None,
extra_params=None):
extra_params=None,
ds_name='ds-unknown'):
"""Perform model selection, fitting and testing for precomputed kernels using nested cv. Print out neccessary data during the process then finally the results.

Parameters
@@ -14,9 +15,9 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
estimator : function
kernel function used to estimate. This function needs to return a gram matrix.
param_grid_precomputed : dictionary
Dictionary with names (string) of parameters used to calculate gram matrices as keys and lists of parameter settings to try as values. This enables searching over any sequence of parameter settings.
Dictionary with names (string) of parameters used to calculate gram matrices as keys and lists of parameter settings to try as values. This enables searching over any sequence of parameter settings. Params with length 1 will be omitted.
param_grid : dictionary
Dictionary with names (string) of parameters used as penalties as keys and lists of parameter settings to try as values. This enables searching over any sequence of parameter settings.
Dictionary with names (string) of parameters used as penalties as keys and lists of parameter settings to try as values. This enables searching over any sequence of parameter settings. Params with length 1 will be omitted.
model_type : string
Type of the problem; can be regression or classification.
NUM_TRIALS : integer
@@ -49,7 +50,7 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
import sys
sys.path.insert(0, "../")
import os
from os.path import basename
from os.path import basename, splitext
from pygraph.utils.graphfiles import loadDataset
from tqdm import tqdm
tqdm.monitor_interval = 0
@@ -57,291 +58,305 @@ def model_selection_for_precomputed_kernel(datafile, estimator,
results_dir = '../notebooks/results/' + estimator.__name__
if not os.path.exists(results_dir):
os.makedirs(results_dir)
results_name_pre = results_dir + '/' + basename(datafile) + '_'

# setup the model type
model_type = model_type.lower()
if model_type != 'regression' and model_type != 'classification':
raise Exception(
'The model type is incorrect! Please choose from regression or classification.')
print()
print('--- This is a %s problem ---' % model_type)
# open file to save all results for this dataset.
with open(results_dir + '/' + ds_name + '.txt', 'w') as fresults:
fresults.write('# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n')

# Load the dataset
print()
print('1. Loading dataset from file...')
dataset, y = loadDataset(datafile, filename_y=datafile_y, extra_params=extra_params)
# import matplotlib.pyplot as plt
# import networkx as nx
# nx.draw_networkx(dataset[30])
# plt.show()

# Grid of parameters with a discrete number of values for each.
param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
param_list = list(ParameterGrid(param_grid))
# np.savetxt(results_name_pre + 'param_grid_precomputed.dt',
# [[key, value] for key, value in sorted(param_grid_precomputed)])
# np.savetxt(results_name_pre + 'param_grid.dt',
# [[key, value] for key, value in sorted(param_grid)])

gram_matrices = [] # a list to store gram matrices for all param_grid_precomputed
gram_matrix_time = [] # a list to store time to calculate gram matrices
param_list_pre_revised = [] # list to store param grids precomputed ignoring the useless ones

# calculate all gram matrices
print()
print('2. Calculating gram matrices. This could take a while...')
nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN)
for params_out in param_list_precomputed:
# setup the model type
model_type = model_type.lower()
if model_type != 'regression' and model_type != 'classification':
raise Exception(
'The model type is incorrect! Please choose from regression or classification.')
print()
if params_out != {}:
print('gram matrix with parameters', params_out, 'is: ')
print('--- This is a %s problem ---' % model_type)
fresults.write('This is a %s problem.\n\n' % model_type)

Kmatrix, current_run_time = estimator(dataset, **params_out)
Kmatrix_diag = Kmatrix.diagonal().copy()
# Load the dataset
print()
print('\nI. Loading dataset from file...')
dataset, y = loadDataset(datafile, filename_y=datafile_y, extra_params=extra_params)

# import matplotlib.pyplot as plt
# import networkx as nx
# nx.draw_networkx(dataset[30])
# plt.show()

# Grid of parameters with a discrete number of values for each.
param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
param_list = list(ParameterGrid(param_grid))
# np.savetxt(results_name_pre + 'param_grid_precomputed.dt',
# [[key, value] for key, value in sorted(param_grid_precomputed)])
# np.savetxt(results_name_pre + 'param_grid.dt',
# [[key, value] for key, value in sorted(param_grid)])

gram_matrices = [] # a list to store gram matrices for all param_grid_precomputed
gram_matrix_time = [] # a list to store time to calculate gram matrices
param_list_pre_revised = [] # list to store param grids precomputed ignoring the useless ones

# calculate all gram matrices
print()
print('2. Calculating gram matrices. This could take a while...')
fresults.write('\nI. Gram matrices.\n\n')
nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN)
for idx, params_out in enumerate(param_list_precomputed):
rtn_data = estimator(dataset, **params_out)
Kmatrix = rtn_data[0]
current_run_time = rtn_data[1]
if len(rtn_data) == 3:
idx_trim = rtn_data[2] # the index of trimmed graph list
y = [y[idx] for idx in idx_trim]
Kmatrix_diag = Kmatrix.diagonal().copy()
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):
# if Kmatrix_diag[i] != 0 and Kmatrix_diag[j] != 0:
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
Kmatrix[j][i] = Kmatrix[i][j]

print()
if params_out == {}:
print('the gram matrix is: ')
fresults.write('the gram matrix is:\n\n')
else:
print('the gram matrix with parameters', params_out, 'is: ')
fresults.write('the gram matrix with parameters %s is:\n\n' % params_out)
if np.isnan(Kmatrix).any(): # if the matrix contains elements that are not numbers
nb_gm_ignore += 1
print('ignored, as it contains elements that are not numbers.')
fresults.write('ignored, as it contains elements that are not numbers.\n\n')
else:
print(Kmatrix)
fresults.write(np.array2string(Kmatrix, separator=',', threshold=np.inf, floatmode='unique') + '\n\n')
plt.matshow(Kmatrix)
plt.colorbar()
fig_file_name = results_dir + '/GM[ds]' + ds_name
if params_out != {}:
fig_file_name += '[params]' + str(idx)
plt.savefig(fig_file_name + '.eps', format='eps', dpi=300)
plt.show()
gram_matrices.append(Kmatrix)
gram_matrix_time.append(current_run_time)
param_list_pre_revised.append(params_out)
print()
print('{} gram matrices are calculated, {} of which are ignored.'.format(len(param_list_precomputed), nb_gm_ignore))
fresults.write('{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore))
fresults.write('serial numbers of gram matrix figure and their corresponding parameters settings:\n\n')
fresults.write(''.join(['{}: {}\n'.format(idx, params_out)
for idx, params_out in enumerate(param_list_precomputed)]))

for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):
# print(Kmatrix[i][j])
# if Kmatrix_diag[i] != 0 and Kmatrix_diag[j] != 0:
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
# print(i, j, Kmatrix[i][j], Kmatrix_diag[i], Kmatrix_diag[j])
Kmatrix[j][i] = Kmatrix[i][j]
print()
print('3. Fitting and predicting using nested cross validation. This could really take a while...')
# Arrays to store scores
train_pref = np.zeros(
(NUM_TRIALS, len(param_list_pre_revised), len(param_list)))
val_pref = np.zeros(
(NUM_TRIALS, len(param_list_pre_revised), len(param_list)))
test_pref = np.zeros(
(NUM_TRIALS, len(param_list_pre_revised), len(param_list)))

# Loop for each trial
pbar = tqdm(total=NUM_TRIALS * len(param_list_pre_revised) * len(param_list),
desc='calculate performance', file=sys.stdout)
for trial in range(NUM_TRIALS): # Test set level
# loop for each outer param tuple
for index_out, params_out in enumerate(param_list_pre_revised):
# split gram matrix and y to app and test sets.
X_app, X_test, y_app, y_test = train_test_split(
gram_matrices[index_out], y, test_size=0.1)
split_index_app = [y.index(y_i) for y_i in y_app if y_i in y]
# split_index_test = [y.index(y_i) for y_i in y_test if y_i in y]
X_app = X_app[:, split_index_app]
X_test = X_test[:, split_index_app]
y_app = np.array(y_app)
y_test = np.array(y_test)

# loop for each inner param tuple
for index_in, params_in in enumerate(param_list):
inner_cv = KFold(n_splits=10, shuffle=True, random_state=trial)
current_train_perf = []
current_valid_perf = []
current_test_perf = []

# For regression use the Kernel Ridge method
try:
if model_type == 'regression':
KR = KernelRidge(kernel='precomputed', **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
KR.fit(X_app[train_index, :]
[:, train_index], y_app[train_index])
# predict on the train, validation and test set
y_pred_train = KR.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = KR.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = KR.predict(X_test[:, train_index])
# root mean squared errors
current_train_perf.append(
np.sqrt(mean_squared_error(y_app[train_index], y_pred_train)))
current_valid_perf.append(
np.sqrt(mean_squared_error(y_app[valid_index], y_pred_valid)))
current_test_perf.append(
np.sqrt(mean_squared_error(y_test, y_pred_test)))
# For classification use SVM
else:
KR = SVC(kernel='precomputed', **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
KR.fit(X_app[train_index, :]
[:, train_index], y_app[train_index])
# predict on the train, validation and test set
y_pred_train = KR.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = KR.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = KR.predict(
X_test[:, train_index])
# root mean squared errors
current_train_perf.append(accuracy_score(
y_app[train_index], y_pred_train))
current_valid_perf.append(accuracy_score(
y_app[valid_index], y_pred_valid))
current_test_perf.append(
accuracy_score(y_test, y_pred_test))
except ValueError:
print(sys.exc_info()[0])
print(params_out, params_in)
# average performance on inner splits
train_pref[trial][index_out][index_in] = np.mean(
current_train_perf)
val_pref[trial][index_out][index_in] = np.mean(
current_valid_perf)
test_pref[trial][index_out][index_in] = np.mean(
current_test_perf)
pbar.update(1)
pbar.clear()
# np.save(results_name_pre + 'train_pref.dt', train_pref)
# np.save(results_name_pre + 'val_pref.dt', val_pref)
# np.save(results_name_pre + 'test_pref.dt', test_pref)

if np.isnan(Kmatrix).any(): # if the matrix contains elements that are not numbers
nb_gm_ignore += 1
print('ignored, as it contains elements that are not numbers.')
print()
print('4. Getting final performance...')
fresults.write('\nII. Performance.\n\n')
# averages and confidences of performances on outer trials for each combination of parameters
average_train_scores = np.mean(train_pref, axis=0)
average_val_scores = np.mean(val_pref, axis=0)
average_perf_scores = np.mean(test_pref, axis=0)
# sample std is used here
std_train_scores = np.std(train_pref, axis=0, ddof=1)
std_val_scores = np.std(val_pref, axis=0, ddof=1)
std_perf_scores = np.std(test_pref, axis=0, ddof=1)
if model_type == 'regression':
best_val_perf = np.amin(average_val_scores)
else:
print(Kmatrix)
plt.matshow(Kmatrix)
plt.colorbar()
fig_name_suffix = '_'.join(['{}-{}'.format(key, val)
for key, val in sorted(params_out.items())])

plt.savefig(
results_name_pre + 'gram_matrix_{}.png'.format(fig_name_suffix))
plt.show()
gram_matrices.append(Kmatrix)
gram_matrix_time.append(current_run_time)
param_list_pre_revised.append(params_out)
np.save(results_name_pre + 'gram_matrices.dt', gram_matrices)
np.save(results_name_pre + 'param_list_precomputed.dt', param_list_pre_revised)
np.save(results_name_pre + 'param_list.dt', param_list)
print()
print('{} gram matrices are calculated, {} of which are ignored.'.format(len(param_list_precomputed), nb_gm_ignore))

print()
print('3. Fitting and predicting using nested cross validation. This could really take a while...')
# Arrays to store scores
train_pref = np.zeros(
(NUM_TRIALS, len(param_list_pre_revised), len(param_list)))
val_pref = np.zeros(
(NUM_TRIALS, len(param_list_pre_revised), len(param_list)))
test_pref = np.zeros(
(NUM_TRIALS, len(param_list_pre_revised), len(param_list)))

# Loop for each trial
pbar = tqdm(total=NUM_TRIALS * len(param_list_pre_revised) * len(param_list),
desc='calculate performance', file=sys.stdout)
for trial in range(NUM_TRIALS): # Test set level
# loop for each outer param tuple
for index_out, params_out in enumerate(param_list_pre_revised):
# split gram matrix and y to app and test sets.
X_app, X_test, y_app, y_test = train_test_split(
gram_matrices[index_out], y, test_size=0.1)
split_index_app = [y.index(y_i) for y_i in y_app if y_i in y]
# split_index_test = [y.index(y_i) for y_i in y_test if y_i in y]
X_app = X_app[:, split_index_app]
X_test = X_test[:, split_index_app]
y_app = np.array(y_app)
y_test = np.array(y_test)

# loop for each inner param tuple
for index_in, params_in in enumerate(param_list):
inner_cv = KFold(n_splits=10, shuffle=True, random_state=trial)
current_train_perf = []
current_valid_perf = []
current_test_perf = []

# For regression use the Kernel Ridge method
try:
if model_type == 'regression':
KR = KernelRidge(kernel='precomputed', **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
KR.fit(X_app[train_index, :]
[:, train_index], y_app[train_index])

# predict on the train, validation and test set
y_pred_train = KR.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = KR.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = KR.predict(X_test[:, train_index])

# root mean squared errors
current_train_perf.append(
np.sqrt(mean_squared_error(y_app[train_index], y_pred_train)))
current_valid_perf.append(
np.sqrt(mean_squared_error(y_app[valid_index], y_pred_valid)))
current_test_perf.append(
np.sqrt(mean_squared_error(y_test, y_pred_test)))
# For classification use SVM
else:
KR = SVC(kernel='precomputed', **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
KR.fit(X_app[train_index, :]
[:, train_index], y_app[train_index])

# predict on the train, validation and test set
y_pred_train = KR.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = KR.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = KR.predict(
X_test[:, train_index])

# root mean squared errors
current_train_perf.append(accuracy_score(
y_app[train_index], y_pred_train))
current_valid_perf.append(accuracy_score(
y_app[valid_index], y_pred_valid))
current_test_perf.append(
accuracy_score(y_test, y_pred_test))
except ValueError:
print(sys.exc_info()[0])
print(params_out, params_in)

# average performance on inner splits
train_pref[trial][index_out][index_in] = np.mean(
current_train_perf)
val_pref[trial][index_out][index_in] = np.mean(
current_valid_perf)
test_pref[trial][index_out][index_in] = np.mean(
current_test_perf)

pbar.update(1)
pbar.clear()
np.save(results_name_pre + 'train_pref.dt', train_pref)
np.save(results_name_pre + 'val_pref.dt', val_pref)
np.save(results_name_pre + 'test_pref.dt', test_pref)
# print('val_pref: ', val_pref) #####
# print(val_pref.shape)

print()
print('4. Getting final performances...')
# averages and confidences of performances on outer trials for each combination of parameters
average_train_scores = np.mean(train_pref, axis=0)
average_val_scores = np.mean(val_pref, axis=0)
# print('average_val_scores: ', average_val_scores) #####
# print(average_val_scores.shape)
average_perf_scores = np.mean(test_pref, axis=0)
# sample std is used here
std_train_scores = np.std(train_pref, axis=0, ddof=1)
std_val_scores = np.std(val_pref, axis=0, ddof=1)
std_perf_scores = np.std(test_pref, axis=0, ddof=1)

if model_type == 'regression':
best_val_perf = np.amin(average_val_scores)
else:
best_val_perf = np.amax(average_val_scores)
# print()
# print('best_val_perf: ', best_val_perf) #####
# print(best_val_perf.shape)
best_params_index = np.where(average_val_scores == best_val_perf)
# print('best_params_index: ', best_params_index) #####
#print(best_params_index[0])
#print(best_params_index[1])
# print(best_params_index.shape)
best_params_out = [param_list_pre_revised[i] for i in best_params_index[0]]
best_params_in = [param_list[i] for i in best_params_index[1]]
# print('best_params_index: ', best_params_index)
print('best_params_out: ', best_params_out)
print('best_params_in: ', best_params_in)
print()
print('best_val_perf: ', best_val_perf)

# below: only find one performance; multiple pref might exist
best_val_std = std_val_scores[best_params_index[0]
[0]][best_params_index[1][0]]
print('best_val_std: ', best_val_std)

final_performance = average_perf_scores[best_params_index[0]
[0]][best_params_index[1][0]]
final_confidence = std_perf_scores[best_params_index[0]
[0]][best_params_index[1][0]]
print('final_performance: ', final_performance)
print('final_confidence: ', final_confidence)
train_performance = average_train_scores[best_params_index[0]
[0]][best_params_index[1][0]]
train_std = std_train_scores[best_params_index[0]
[0]][best_params_index[1][0]]
print('train_performance: ', train_performance)
print('train_std: ', train_std)

print()
average_gram_matrix_time = np.mean(gram_matrix_time)
std_gram_matrix_time = np.std(gram_matrix_time, ddof=1)
best_gram_matrix_time = gram_matrix_time[best_params_index[0][0]]
print('time to calculate gram matrix with different hyperpapams: {:.2f}±{:.2f}'
.format(average_gram_matrix_time, std_gram_matrix_time))
print('time to calculate best gram matrix: ', best_gram_matrix_time, 's')

# save results to file
np.savetxt(results_name_pre + 'average_train_scores.dt',
average_train_scores)
np.savetxt(results_name_pre + 'average_val_scores', average_val_scores)
np.savetxt(results_name_pre + 'average_perf_scores.dt',
average_perf_scores)
np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores)
np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores)
np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores)

np.save(results_name_pre + 'best_params_index', best_params_index)
np.save(results_name_pre + 'best_params_pre.dt', best_params_out)
np.save(results_name_pre + 'best_params_in.dt', best_params_in)
np.save(results_name_pre + 'best_val_perf.dt', best_val_perf)
np.save(results_name_pre + 'best_val_std.dt', best_val_std)
np.save(results_name_pre + 'final_performance.dt', final_performance)
np.save(results_name_pre + 'final_confidence.dt', final_confidence)
np.save(results_name_pre + 'train_performance.dt', train_performance)
np.save(results_name_pre + 'train_std.dt', train_std)
best_val_perf = np.amax(average_val_scores)
best_params_index = np.where(average_val_scores == best_val_perf)
# find smallest val std with best val perf.
best_val_stds = [std_val_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
min_val_std = np.amin(best_val_stds)
best_params_index = np.where(std_val_scores == min_val_std)
best_params_out = [param_list_pre_revised[i] for i in best_params_index[0]]
best_params_in = [param_list[i] for i in best_params_index[1]]
print('best_params_out: ', best_params_out)
print('best_params_in: ', best_params_in)
print()
print('best_val_perf: ', best_val_perf)
print('best_val_std: ', min_val_std)
fresults.write('best settings of hyper-params to build gram matrix: %s\n' % best_params_out)
fresults.write('best settings of other hyper-params: %s\n\n' % best_params_in)
fresults.write('best_val_perf: %s\n' % best_val_perf)
fresults.write('best_val_std: %s\n' % min_val_std)

final_performance = [average_perf_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
final_confidence = [std_perf_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
print('final_performance: ', final_performance)
print('final_confidence: ', final_confidence)
fresults.write('final_performance: %s\n' % final_performance)
fresults.write('final_confidence: %s\n' % final_confidence)
train_performance = [average_train_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
train_std = [std_train_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])]
print('train_performance: %s' % train_performance)
print('train_std: ', train_std)
fresults.write('train_performance: %s\n' % train_performance)
fresults.write('train_std: %s\n\n' % train_std)
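# Editor's sketch of the selection above, with hypothetical numbers:
#   average_val_scores = [[0.90, 0.85],    std_val_scores = [[0.05, 0.02],
#                         [0.90, 0.80]]                      [0.01, 0.03]]
# np.amax gives 0.90; np.where returns the tied indices (array([0, 1]), array([0, 0]));
# their stds are [0.05, 0.01], so the pair (1, 0) with the smallest validation
# std is reported. (Note that np.where(std_val_scores == min_val_std) scans the
# whole std matrix, so an equal std outside the tied set would also be matched.)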

np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time)
np.save(results_name_pre + 'average_gram_matrix_time.dt',
average_gram_matrix_time)
np.save(results_name_pre + 'std_gram_matrix_time.dt',
std_gram_matrix_time)
np.save(results_name_pre + 'best_gram_matrix_time.dt',
best_gram_matrix_time)
print()
average_gram_matrix_time = np.mean(gram_matrix_time)
std_gram_matrix_time = np.std(gram_matrix_time, ddof=1)
best_gram_matrix_time = [gram_matrix_time[i] for i in best_params_index[0]]
ave_bgmt = np.mean(best_gram_matrix_time)
std_bgmt = np.std(best_gram_matrix_time, ddof=1)
print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
.format(average_gram_matrix_time, std_gram_matrix_time))
print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(ave_bgmt, std_bgmt))
fresults.write('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'
.format(average_gram_matrix_time, std_gram_matrix_time))
fresults.write('time to calculate best gram matrix: {:.2f}±{:.2f}s\n\n'.format(ave_bgmt, std_bgmt))

# # save results to file
# np.savetxt(results_name_pre + 'average_train_scores.dt',
# average_train_scores)
# np.savetxt(results_name_pre + 'average_val_scores', average_val_scores)
# np.savetxt(results_name_pre + 'average_perf_scores.dt',
# average_perf_scores)
# np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores)
# np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores)
# np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores)
# np.save(results_name_pre + 'best_params_index', best_params_index)
# np.save(results_name_pre + 'best_params_pre.dt', best_params_out)
# np.save(results_name_pre + 'best_params_in.dt', best_params_in)
# np.save(results_name_pre + 'best_val_perf.dt', best_val_perf)
# np.save(results_name_pre + 'best_val_std.dt', best_val_std)
# np.save(results_name_pre + 'final_performance.dt', final_performance)
# np.save(results_name_pre + 'final_confidence.dt', final_confidence)
# np.save(results_name_pre + 'train_performance.dt', train_performance)
# np.save(results_name_pre + 'train_std.dt', train_std)
# np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time)
# np.save(results_name_pre + 'average_gram_matrix_time.dt',
# average_gram_matrix_time)
# np.save(results_name_pre + 'std_gram_matrix_time.dt',
# std_gram_matrix_time)
# np.save(results_name_pre + 'best_gram_matrix_time.dt',
# best_gram_matrix_time)
# print out as table.
from collections import OrderedDict
from tabulate import tabulate
table_dict = {}
if model_type == 'regression':
for param_in in param_list:
param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
else:
for param_in in param_list:
param_in['C'] = '{:.2e}'.format(param_in['C'])
table_dict['params'] = [{**param_out, **param_in}
for param_in in param_list for param_out in param_list_pre_revised]
table_dict['gram_matrix_time'] = ['{:.2f}'.format(gram_matrix_time[index_out])
for param_in in param_list for index_out, _ in enumerate(param_list_pre_revised)]
table_dict['valid_perf'] = ['{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], std_val_scores[index_out][index_in])
for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)]
table_dict['test_perf'] = ['{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in], std_perf_scores[index_out][index_in])
for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)]
table_dict['train_perf'] = ['{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in], std_train_scores[index_out][index_in])
for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)]
keyorder = ['params', 'train_perf', 'valid_perf',
'test_perf', 'gram_matrix_time']
print()
tb_print = tabulate(OrderedDict(sorted(table_dict.items(),
key=lambda i: keyorder.index(i[0]))), headers='keys')
print(tb_print)
fresults.write('table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print)
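# Editor's note: keyorder only fixes the column order for display; sorting the
# dict items by keyorder.index before handing them to tabulate (headers='keys')
# is what makes the printed table read
# params | train_perf | valid_perf | test_perf | gram_matrix_time,
# regardless of the insertion order of table_dict.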

# print out as table.
from collections import OrderedDict
from tabulate import tabulate
table_dict = {}
if model_type == 'regression':
for param_in in param_list:
param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
else:
for param_in in param_list:
param_in['C'] = '{:.2e}'.format(param_in['C'])
table_dict['params'] = [{**param_out, **param_in}
for param_in in param_list for param_out in param_list_pre_revised]
table_dict['gram_matrix_time'] = ['{:.2f}'.format(gram_matrix_time[index_out])
for param_in in param_list for index_out, _ in enumerate(param_list_pre_revised)]
table_dict['valid_perf'] = ['{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], std_val_scores[index_out][index_in])
for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)]
table_dict['test_perf'] = ['{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in], std_perf_scores[index_out][index_in])
for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)]
table_dict['train_perf'] = ['{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in], std_train_scores[index_out][index_in])
for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)]
keyorder = ['params', 'train_perf', 'valid_perf',
'test_perf', 'gram_matrix_time']
print()
print(tabulate(OrderedDict(sorted(table_dict.items(),
key=lambda i: keyorder.index(i[0]))), headers='keys'))
np.save(results_name_pre + 'results_vs_params.dt', table_dict)
fresults.close()

+ 320
- 0
pygraph/utils/suffix_tree.py View File

@@ -0,0 +1,320 @@
"""
@author: linlin
@references:
[1] https://github.com/ptrus/suffix-trees/blob/master/suffix_trees/STree.py, 2018.6
"""

import sys


class STree():
"""Class representing the suffix tree. The generalized suffix tree is supported."""

def __init__(self, input=''):
self.root = _SNode()
self.root.depth = 0
self.root.idx = 0
self.root.parent = self.root
self.root._add_suffix_link(self.root)

if not input == '':
self.build(input)

def _check_input(self, input):
"""Checks the validity of the input.
In case of an invalid input throws ValueError.
"""
if isinstance(input, str):
return 'st'
elif isinstance(input, list):
if all(isinstance(item, str) for item in input):
return 'gst'

raise ValueError("String argument should be of type String or"
" a list of strings")

def build(self, x):
"""Builds the Suffix tree on the given input.
If the input is of type List of Strings:
Generalized Suffix Tree is built.
:param x: String or List of Strings
"""
input_type = self._check_input(x)

if input_type == 'st':
x += next(self._terminalSymbolsGenerator())
self._build(x)
if input_type == 'gst':
self._build_generalized(x)

def _build(self, x):
"""Builds a Suffix tree."""
self.word = x
self._build_McCreight(x)

def _build_McCreight(self, x):
"""Builds a Suffix tree using McCreight O(n) algorithm.
Algorithm based on:
McCreight, Edward M. "A space-economical suffix tree construction algorithm." - ACM, 1976.
Implementation based on:
UH CS - 58093 String Processing Algorithms Lecture Notes
"""
u = self.root
d = 0
for i in range(len(x)):
while u.depth == d and u._has_transition(x[d + i]):
u = u._get_transition_link(x[d + i])
d = d + 1
while d < u.depth and x[u.idx + d] == x[i + d]:
d = d + 1
if d < u.depth:
u = self._create_node(x, u, d)
self._create_leaf(x, i, u, d)
if not u._get_suffix_link():
self._compute_slink(x, u)
u = u._get_suffix_link()
d = d - 1
if d < 0:
d = 0

def _create_node(self, x, u, d):
i = u.idx
p = u.parent
v = _SNode(idx=i, depth=d)
v._add_transition_link(u, x[i + d])
u.parent = v
p._add_transition_link(v, x[i + p.depth])
v.parent = p
return v

def _create_leaf(self, x, i, u, d):
w = _SNode()
w.idx = i
w.depth = len(x) - i
u._add_transition_link(w, x[i + d])
w.parent = u
return w

def _compute_slink(self, x, u):
d = u.depth
v = u.parent._get_suffix_link()
while v.depth < d - 1:
v = v._get_transition_link(x[u.idx + v.depth + 1])
if v.depth > d - 1:
v = self._create_node(x, v, d - 1)
u._add_suffix_link(v)

def _build_Ukkonen(self, x):
"""Builds a Suffix tree using Ukkonen's online O(n) algorithm.
Algorithm based on:
Ukkonen, Esko. "On-line construction of suffix trees." - Algorithmica, 1995.
"""
# TODO.
raise NotImplementedError()

def _build_generalized(self, xs):
"""Builds a Generalized Suffix Tree (GST) from the array of strings provided.
"""
terminal_gen = self._terminalSymbolsGenerator()

_xs = ''.join([x + next(terminal_gen) for x in xs])
self.word = _xs
self._generalized_word_starts(xs)
self._build(_xs)
self.root._traverse(self._label_generalized)

def _label_generalized(self, node):
"""Helper method that labels the nodes of GST with indexes of strings
found in their descendants.
"""
if node.is_leaf():
x = {self._get_word_start_index(node.idx)}
else:
x = {
n
for ns in node.transition_links for n in ns[0].generalized_idxs
}
node.generalized_idxs = x

def _get_word_start_index(self, idx):
"""Helper method that returns the index of the string based on node's
starting index"""
i = 0
for _idx in self.word_starts[1:]:
if idx < _idx:
return i
else:
i += 1
return i

def lcs(self, stringIdxs=-1):
"""Returns the Largest Common Substring of Strings provided in stringIdxs.
If stringIdxs is not provided, the LCS of all strings is returned.
::param stringIdxs: Optional: List of indexes of strings.
"""
if stringIdxs == -1 or not isinstance(stringIdxs, list):
stringIdxs = set(range(len(self.word_starts)))
else:
stringIdxs = set(stringIdxs)

deepestNode = self._find_lcs(self.root, stringIdxs)
start = deepestNode.idx
end = deepestNode.idx + deepestNode.depth
return self.word[start:end]

def _find_lcs(self, node, stringIdxs):
"""Helper method that finds LCS by traversing the labeled GSD."""
nodes = [
self._find_lcs(n, stringIdxs) for (n, _) in node.transition_links
if n.generalized_idxs.issuperset(stringIdxs)
]

if nodes == []:
return node

deepestNode = max(nodes, key=lambda n: n.depth)
return deepestNode

def _generalized_word_starts(self, xs):
"""Helper method returns the starting indexes of strings in GST"""
self.word_starts = []
i = 0
for n in range(len(xs)):
self.word_starts.append(i)
i += len(xs[n]) + 1

def find(self, y):
"""Returns starting position of the substring y in the string used for
building the Suffix tree.
:param y: String
:return: Index of the starting position of string y in the string used for building the Suffix tree
-1 if y is not a substring.
"""
node = self.root
while True:
edge = self._edgeLabel(node, node.parent)
if edge.startswith(y):
return node.idx

i = 0
while (i < len(edge) and edge[i] == y[0]):
y = y[1:]
i += 1

if i != 0:
if i == len(edge) and y != '':
pass
else:
return -1

node = node._get_transition_link(y[0])
if not node:
return -1

def find_all(self, y):
y_input = y
node = self.root
while True:
edge = self._edgeLabel(node, node.parent)
if edge.startswith(y):
break

i = 0
while (i < len(edge) and edge[i] == y[0]):
y = y[1:]
i += 1

if i != 0:
if i == len(edge) and y != '':
pass
else:
return []

node = node._get_transition_link(y[0])
if not node:
return []

leaves = node._get_leaves()
return [n.idx for n in leaves]

def _edgeLabel(self, node, parent):
"""Helper method, returns the edge label between a node and it's parent"""
return self.word[node.idx + parent.depth:node.idx + node.depth]

def _terminalSymbolsGenerator(self):
"""Generator of unique terminal symbols used for building the Generalized Suffix Tree.
Code points from the Unicode Private Use Areas (U+E000..U+F8FF, U+F0000..U+FFFFD,
U+100000..U+10FFFD) are used to ensure that terminal symbols are not part of the input string.
"""
py2 = sys.version[0] < '3'
UPPAs = list(
list(range(0xE000, 0xF8FF + 1)) +
list(range(0xF0000, 0xFFFFD + 1)) +
list(range(0x100000, 0x10FFFD + 1)))
for i in UPPAs:
if py2:
yield (unichr(i))
else:
yield (chr(i))
raise ValueError("To many input strings.")


class _SNode():
"""Class representing a Node in the Suffix tree."""

def __init__(self, idx=-1, parentNode=None, depth=-1):
# Links
self._suffix_link = None
self.transition_links = []
# Properties
self.idx = idx
self.depth = depth
self.parent = parentNode
self.generalized_idxs = {}

def __str__(self):
return ("SNode: idx:" + str(self.idx) + " depth:" + str(self.depth) +
" transitons:" + str(self.transition_links))

def _add_suffix_link(self, snode):
self._suffix_link = snode

def _get_suffix_link(self):
if self._suffix_link != None:
return self._suffix_link
else:
return False

def _get_transition_link(self, suffix):
for node, _suffix in self.transition_links:
if _suffix == '__@__' or suffix == _suffix:
return node
return False

def _add_transition_link(self, snode, suffix=''):
tl = self._get_transition_link(suffix)
if tl:  # TODO: improve this.
self.transition_links.remove((tl, suffix))
self.transition_links.append((snode, suffix))

def _has_transition(self, suffix):
for node, _suffix in self.transition_links:
if _suffix == '__@__' or suffix == _suffix:
return True
return False

def is_leaf(self):
return self.transition_links == []

def _traverse(self, f):
for (node, _) in self.transition_links:
node._traverse(f)
f(self)

def _get_leaves(self):
if self.is_leaf():
return [self]
else:
return [
x for (n, _) in self.transition_links for x in n._get_leaves()
]
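
For reference, a minimal usage sketch of the class above (the sample strings are illustrative, in the spirit of the referenced implementation [1], and assume the package is importable from the working directory):

from pygraph.utils.suffix_tree import STree

st = STree('abcabxabcd')                 # plain suffix tree
print(st.find('abx'))                    # 3 -> the single occurrence of 'abx'
print(sorted(st.find_all('ab')))         # [0, 3, 6] -> all occurrences of 'ab'

gst = STree(['abeceda', 'abecednik', 'abeabecedabeabeced'])
print(gst.lcs())                         # 'abeced' -> longest common substring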

+ 3
- 2
pygraph/utils/utils.py View File

@@ -62,8 +62,9 @@ def floydTransformation(G, edge_weight=None):
S = nx.Graph()
S.add_nodes_from(G.nodes(data=True))
for i in range(0, G.number_of_nodes()):
for j in range(i, G.number_of_nodes()):
S.add_edge(i, j, cost=spMatrix[i, j])
for j in range(i + 1, G.number_of_nodes()):
if spMatrix[i, j] != np.inf:
S.add_edge(i, j, cost=spMatrix[i, j])
return S
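
The change above stops the transformed graph from gaining zero-cost self-loops and edges between unreachable node pairs. A minimal sketch of the effect, assuming pygraph is importable and that spMatrix holds the all-pairs shortest-path distances computed earlier in the function (not shown in this hunk):

import networkx as nx
from pygraph.utils.utils import floydTransformation

G = nx.Graph()
G.add_nodes_from(range(4), label='C')    # illustrative node label
G.add_edges_from([(0, 1), (1, 2)])       # node 3 stays disconnected

S = floydTransformation(G)
print(S.number_of_edges())               # 3: (0,1), (1,2), (0,2); no edge touches node 3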



